Update README and core files, remove test/debug scripts, improve documentation and robustness

This commit is contained in:
Iyeoluwa Akinrinola
2025-07-03 19:27:16 +01:00
parent a202abf5c0
commit 00b42f2c0f
8 changed files with 794 additions and 875 deletions
+154 -40
View File
@@ -8,6 +8,9 @@ import config
import os
import aiofiles
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
class DocumentProcessor:
def __init__(self):
@@ -160,27 +163,127 @@ class DocumentProcessor:
import json
import re
# Find JSON in response
# Find JSON in response - try multiple patterns
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
if json_match:
json_str = json_match.group()
data = json.loads(json_str)
# Clean up common JSON issues
json_str = re.sub(r',\s*([}\]])', r'\1', json_str) # Remove trailing commas
json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str) # Quote unquoted keys
try:
data = json.loads(json_str)
except json.JSONDecodeError as e:
# Try to fix common JSON issues
logger.warning(f"Initial JSON parsing failed: {e}")
# Try to extract individual fields using regex
vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
total_amount_match = re.search(r'"total_amount"\s*:\s*([0-9.]+)', json_str)
tax_amount_match = re.search(r'"tax_amount"\s*:\s*([0-9.]+)', json_str)
date_match = re.search(r'"date"\s*:\s*"([^"]*)"', json_str)
category_match = re.search(r'"category"\s*:\s*"([^"]*)"', json_str)
confidence_match = re.search(r'"confidence"\s*:\s*([0-9.]+)', json_str)
data = {
"vendor": vendor_match.group(1) if vendor_match else "",
"total_amount": float(total_amount_match.group(1)) if total_amount_match else 0.0,
"tax_amount": float(tax_amount_match.group(1)) if tax_amount_match else 0.0,
"date": date_match.group(1) if date_match else "",
"category": category_match.group(1) if category_match else "Other",
"confidence": float(confidence_match.group(1)) if confidence_match else 0.5
}
# Validate and clean data
return {
"vendor": data.get("vendor", "").strip(),
"vendor": str(data.get("vendor", "")).strip(),
"total_amount": float(data.get("total_amount", 0)),
"tax_amount": float(data.get("tax_amount", 0)),
"date": data.get("date", ""),
"category": data.get("category", "Other"),
"date": str(data.get("date", "")).strip(),
"category": str(data.get("category", "Other")).strip(),
"confidence": float(data.get("confidence", 0.5)),
"extraction_success": True
}
else:
return {"error": "Could not parse JSON from AI response"}
# Try to extract fields from plain text
logger.warning("No JSON found in response, attempting text extraction")
return self._extract_from_plain_text(result_text)
except Exception as e:
return {"error": f"JSON parsing error: {str(e)}"}
logger.error(f"JSON parsing error: {str(e)}")
return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
    """Extract receipt data from plain text when JSON parsing fails.

    This is a best-effort, regex-based fallback. Each field is searched for
    with a list of patterns ordered from most to least specific; the first
    pattern that matches wins.

    Args:
        text: Raw (non-JSON) response text to scan.

    Returns:
        A dict with the keys ``vendor``, ``total_amount``, ``tax_amount``,
        ``date``, ``category``, ``confidence`` and ``extraction_success``.
        Fields that cannot be found fall back to ``"Unknown"``, ``0.0`` or
        ``""``. If an unexpected error occurs (e.g. ``text`` is not a
        string) the dict carries ``extraction_success=False`` and an
        ``error`` message instead of raising.
    """
    import re

    # A number must start with a digit so that a lone "," is never captured;
    # thousands separators are stripped before conversion.
    number = r'([0-9][0-9,]*(?:\.[0-9]+)?)'

    try:
        # Vendor: (pattern, flags) pairs. The value is confined to a single
        # line ([ \t] instead of \s) so it cannot swallow the next receipt
        # line, and the capitalized-words fallback is deliberately
        # case-sensitive -- under IGNORECASE it would match any text.
        vendor_patterns = [
            (
                r'\b(?:vendor|store|merchant|company)\s*[:\-]?\s*'
                r'([A-Za-z0-9&][A-Za-z0-9 \t&.,]*)',
                re.IGNORECASE,
            ),
            (r'([A-Z][A-Za-z0-9 \t&.,]{3,30})', 0),  # Capitalized words
        ]
        vendor = ""
        for pattern, flags in vendor_patterns:
            match = re.search(pattern, text, flags)
            if match:
                candidate = match.group(1).strip()
                if candidate:
                    vendor = candidate
                    break

        # Amount: labelled totals first, then currency-prefixed values, and
        # only then any bare number. Trying the bare-number pattern first
        # would return the first digits in the text (often part of a date).
        # The \b keeps "Subtotal" from being read as "total".
        amount_patterns = [
            r'\b(?:total|amount|sum)\s*[:\-]?\s*\$?\s*' + number,
            r'\$\s*' + number,
            number,
        ]
        total_amount = 0.0
        for pattern in amount_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # `number` only admits digits, commas and one decimal point,
                # so the conversion cannot fail once commas are removed.
                total_amount = float(match.group(1).replace(',', ''))
                break

        # Date: ISO, then slash-separated, then "Mon D, YYYY". The month
        # pattern also accepts full names ("January") and an optional
        # trailing dot ("Jan.").
        date_patterns = [
            r'(\d{4}-\d{2}-\d{2})',
            r'(\d{1,2}/\d{1,2}/\d{2,4})',
            r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?'
            r'\s+\d{1,2},?\s+\d{4}',
        ]
        date = ""
        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                date = match.group(0)
                break

        return {
            "vendor": vendor or "Unknown",
            "total_amount": total_amount,
            "tax_amount": 0.0,
            "date": date or "",
            "category": "Other",
            "confidence": 0.3,  # Low confidence for text extraction
            "extraction_success": True
        }

    except Exception as e:
        # Boundary handler: callers expect a result dict, never an exception.
        logger.error("Text extraction error: %s", e)
        return {
            "vendor": "Unknown",
            "total_amount": 0.0,
            "tax_amount": 0.0,
            "date": "",
            "category": "Other",
            "confidence": 0.1,
            "extraction_success": False,
            "error": f"Text extraction failed: {str(e)}"
        }
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
"""Save uploaded file to temporary storage"""
@@ -286,44 +389,55 @@ class DocumentProcessor:
try:
import json
import re
# Find JSON in response
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
if json_match:
json_str = json_match.group()
data = json.loads(json_str)
# Validate and clean data
transactions = data.get("transactions", [])
cleaned_transactions = []
for txn in transactions:
try:
# Clean and validate each transaction
cleaned_txn = {
"date": str(txn.get("date", "")).strip(),
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
"vendor": str(txn.get("vendor", "")).strip(),
"memo": str(txn.get("memo", "")).strip()
}
cleaned_transactions.append(cleaned_txn)
except Exception as e:
# Skip invalid transactions
continue
return {
"extraction_success": data.get("extraction_success", True),
"transactions": cleaned_transactions,
"total_transactions": len(cleaned_transactions)
}
else:
# Find the first '{' and last '}'
start = result_text.find('{')
end = result_text.rfind('}')
if start == -1 or end == -1 or end <= start:
return {
"extraction_success": False,
"error": "Could not parse JSON from AI response",
"error": "Could not find JSON object in AI response",
"transactions": []
}
json_str = result_text[start:end+1]
# Remove trailing commas before } or ]
json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
try:
data = json.loads(json_str)
except Exception as e:
import logging
logging.error(f"JSON parsing error: {str(e)}")
logging.error(f"Offending JSON string:\n{json_str}")
return {
"extraction_success": False,
"error": f"JSON parsing error: {str(e)}",
"transactions": []
}
# Validate and clean data
transactions = data.get("transactions", [])
cleaned_transactions = []
for txn in transactions:
try:
cleaned_txn = {
"date": str(txn.get("date", "")).strip(),
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
"vendor": str(txn.get("vendor", "")).strip(),
"memo": str(txn.get("memo", "")).strip()
}
cleaned_transactions.append(cleaned_txn)
except Exception as e:
continue
return {
"extraction_success": data.get("extraction_success", True),
"transactions": cleaned_transactions,
"total_transactions": len(cleaned_transactions)
}
except Exception as e:
import logging
logging.error(f"JSON parsing error (outer): {str(e)}")
return {
"extraction_success": False,
"error": f"JSON parsing error: {str(e)}",