Update README and core files, remove test/debug scripts, improve documentation and robustness

2025-07-03 19:27:16 +01:00
parent a202abf5c0
commit 00b42f2c0f
8 changed files with 794 additions and 875 deletions
@@ -8,6 +8,9 @@ import config
 import os
 import aiofiles
 from datetime import datetime
+import logging
+
+logger = logging.getLogger(__name__)

 class DocumentProcessor:
    def __init__(self):
@@ -160,27 +163,127 @@ class DocumentProcessor:
            import json
            import re
            
-            # Find JSON in response
+            # Find JSON in response - try multiple patterns
            json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
            if json_match:
                json_str = json_match.group()
-                data = json.loads(json_str)
+                
+                # Clean up common JSON issues
+                json_str = re.sub(r',\s*([}\]])', r'\1', json_str)  # Remove trailing commas
+                json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str)  # Quote unquoted keys
+                
+                try:
+                    data = json.loads(json_str)
+                except json.JSONDecodeError as e:
+                    # Try to fix common JSON issues
+                    logger.warning(f"Initial JSON parsing failed: {e}")
+                    
+                    # Try to extract individual fields using regex
+                    vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
+                    total_amount_match = re.search(r'"total_amount"\s*:\s*([0-9.]+)', json_str)
+                    tax_amount_match = re.search(r'"tax_amount"\s*:\s*([0-9.]+)', json_str)
+                    date_match = re.search(r'"date"\s*:\s*"([^"]*)"', json_str)
+                    category_match = re.search(r'"category"\s*:\s*"([^"]*)"', json_str)
+                    confidence_match = re.search(r'"confidence"\s*:\s*([0-9.]+)', json_str)
+                    
+                    data = {
+                        "vendor": vendor_match.group(1) if vendor_match else "",
+                        "total_amount": float(total_amount_match.group(1)) if total_amount_match else 0.0,
+                        "tax_amount": float(tax_amount_match.group(1)) if tax_amount_match else 0.0,
+                        "date": date_match.group(1) if date_match else "",
+                        "category": category_match.group(1) if category_match else "Other",
+                        "confidence": float(confidence_match.group(1)) if confidence_match else 0.5
+                    }
                
                # Validate and clean data
                return {
-                    "vendor": data.get("vendor", "").strip(),
+                    "vendor": str(data.get("vendor", "")).strip(),
                    "total_amount": float(data.get("total_amount", 0)),
                    "tax_amount": float(data.get("tax_amount", 0)),
-                    "date": data.get("date", ""),
-                    "category": data.get("category", "Other"),
+                    "date": str(data.get("date", "")).strip(),
+                    "category": str(data.get("category", "Other")).strip(),
                    "confidence": float(data.get("confidence", 0.5)),
                    "extraction_success": True
                }
            else:
-                return {"error": "Could not parse JSON from AI response"}
+                # Try to extract fields from plain text
+                logger.warning("No JSON found in response, attempting text extraction")
+                return self._extract_from_plain_text(result_text)
                
        except Exception as e:
-            return {"error": f"JSON parsing error: {str(e)}"}
+            logger.error(f"JSON parsing error: {str(e)}")
+            return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}
+    
+    def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
+        """Best-effort extraction of receipt fields from non-JSON text.
+
+        Args:
+            text: Response text in which no JSON object was found.
+
+        Returns:
+            Dict with "vendor", "total_amount", "tax_amount", "date",
+            "category", "confidence" and "extraction_success". If the
+            extraction itself raises, "extraction_success" is False and
+            an "error" message is included.
+        """
+        try:
+            import re
+
+            def first_match(patterns, group=1):
+                """Return the stripped match of the first pattern that hits."""
+                for pattern in patterns:
+                    match = re.search(pattern, text, re.IGNORECASE)
+                    if match:
+                        return match.group(group).strip()
+                return ""
+
+            # Space/tab instead of \s keeps a vendor name on one line, so it
+            # cannot run across a line break into the next field.
+            vendor = first_match([
+                r'\b(?:vendor|store|merchant|company)\b\s*[:\-]?\s*([A-Za-z0-9 \t&.,]+)',
+                # Fallback: first letter-led run of name-like characters.
+                r'([A-Z][A-Za-z0-9 \t&.,]{3,30})',
+            ])
+
+            # The labelled total is tried first: the first bare number in a
+            # receipt is usually a quantity or item price, not the total.
+            # \b stops "Subtotal" from being read as the "total" label.
+            amount_text = first_match([
+                r'\b(?:total|amount|sum)\b\s*[:\-]?\s*\$?\s*([0-9][0-9,]*(?:\.[0-9]+)?)',
+                r'\$?\s*([0-9][0-9,]*(?:\.[0-9]+)?)',
+            ])
+            # The patterns guarantee digits plus at most one decimal point.
+            total_amount = float(amount_text.replace(',', '')) if amount_text else 0.0
+
+            # group=0: the month-name pattern captures only the month.
+            date = first_match([
+                r'(\d{4}-\d{2}-\d{2})',
+                r'(\d{1,2}/\d{1,2}/\d{2,4})',
+                r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}',
+            ], group=0)
+
+            return {
+                "vendor": vendor or "Unknown",
+                "total_amount": total_amount,
+                "tax_amount": 0.0,
+                "date": date,
+                "category": "Other",
+                "confidence": 0.3,  # Low confidence for text extraction
+                "extraction_success": True
+            }
+
+        except Exception as e:
+            logger.error(f"Text extraction error: {str(e)}")
+            return {
+                "vendor": "Unknown",
+                "total_amount": 0.0,
+                "tax_amount": 0.0,
+                "date": "",
+                "category": "Other",
+                "confidence": 0.1,
+                "extraction_success": False,
+                "error": f"Text extraction failed: {str(e)}"
+            }
    
    async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
        """Save uploaded file to temporary storage"""
@@ -286,44 +389,55 @@ class DocumentProcessor:
        try:
            import json
            import re
-            
-            # Find JSON in response
-            json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
-            if json_match:
-                json_str = json_match.group()
-                data = json.loads(json_str)
-                
-                # Validate and clean data
-                transactions = data.get("transactions", [])
-                cleaned_transactions = []
-                
-                for txn in transactions:
-                    try:
-                        # Clean and validate each transaction
-                        cleaned_txn = {
-                            "date": str(txn.get("date", "")).strip(),
-                            "amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
-                            "vendor": str(txn.get("vendor", "")).strip(),
-                            "memo": str(txn.get("memo", "")).strip()
-                        }
-                        cleaned_transactions.append(cleaned_txn)
-                    except Exception as e:
-                        # Skip invalid transactions
-                        continue
-                
-                return {
-                    "extraction_success": data.get("extraction_success", True),
-                    "transactions": cleaned_transactions,
-                    "total_transactions": len(cleaned_transactions)
-                }
-            else:
+
+            # Find the first '{' and last '}'
+            start = result_text.find('{')
+            end = result_text.rfind('}')
+            if start == -1 or end == -1 or end <= start:
                return {
                    "extraction_success": False,
-                    "error": "Could not parse JSON from AI response",
+                    "error": "Could not find JSON object in AI response",
                    "transactions": []
                }
-                
+            json_str = result_text[start:end+1]
+
+            # Remove trailing commas before } or ]
+            json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
+
+            try:
+                data = json.loads(json_str)
+            except Exception as e:
+                import logging
+                logging.error(f"JSON parsing error: {str(e)}")
+                logging.error(f"Offending JSON string:\n{json_str}")
+                return {
+                    "extraction_success": False,
+                    "error": f"JSON parsing error: {str(e)}",
+                    "transactions": []
+                }
+
+            # Validate and clean data
+            transactions = data.get("transactions", [])
+            cleaned_transactions = []
+            for txn in transactions:
+                try:
+                    cleaned_txn = {
+                        "date": str(txn.get("date", "")).strip(),
+                        "amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
+                        "vendor": str(txn.get("vendor", "")).strip(),
+                        "memo": str(txn.get("memo", "")).strip()
+                    }
+                    cleaned_transactions.append(cleaned_txn)
+                except Exception as e:
+                    continue
+            return {
+                "extraction_success": data.get("extraction_success", True),
+                "transactions": cleaned_transactions,
+                "total_transactions": len(cleaned_transactions)
+            }
        except Exception as e:
+            import logging
+            logging.error(f"JSON parsing error (outer): {str(e)}")
            return {
                "extraction_success": False,
                "error": f"JSON parsing error: {str(e)}",