Update README and core files, remove test/debug scripts, improve documentation and robustness
This commit is contained in:
+154
-40
@@ -8,6 +8,9 @@ import config
|
||||
import os
|
||||
import aiofiles
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor:
|
||||
def __init__(self):
|
||||
@@ -160,27 +163,127 @@ class DocumentProcessor:
|
||||
import json
|
||||
import re
|
||||
|
||||
# Find JSON in response
|
||||
# Find JSON in response - try multiple patterns
|
||||
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_str = json_match.group()
|
||||
data = json.loads(json_str)
|
||||
|
||||
# Clean up common JSON issues
|
||||
json_str = re.sub(r',\s*([}\]])', r'\1', json_str) # Remove trailing commas
|
||||
json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str) # Quote unquoted keys
|
||||
|
||||
try:
|
||||
data = json.loads(json_str)
|
||||
except json.JSONDecodeError as e:
|
||||
# Try to fix common JSON issues
|
||||
logger.warning(f"Initial JSON parsing failed: {e}")
|
||||
|
||||
# Try to extract individual fields using regex
|
||||
vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
|
||||
total_amount_match = re.search(r'"total_amount"\s*:\s*([0-9.]+)', json_str)
|
||||
tax_amount_match = re.search(r'"tax_amount"\s*:\s*([0-9.]+)', json_str)
|
||||
date_match = re.search(r'"date"\s*:\s*"([^"]*)"', json_str)
|
||||
category_match = re.search(r'"category"\s*:\s*"([^"]*)"', json_str)
|
||||
confidence_match = re.search(r'"confidence"\s*:\s*([0-9.]+)', json_str)
|
||||
|
||||
data = {
|
||||
"vendor": vendor_match.group(1) if vendor_match else "",
|
||||
"total_amount": float(total_amount_match.group(1)) if total_amount_match else 0.0,
|
||||
"tax_amount": float(tax_amount_match.group(1)) if tax_amount_match else 0.0,
|
||||
"date": date_match.group(1) if date_match else "",
|
||||
"category": category_match.group(1) if category_match else "Other",
|
||||
"confidence": float(confidence_match.group(1)) if confidence_match else 0.5
|
||||
}
|
||||
|
||||
# Validate and clean data
|
||||
return {
|
||||
"vendor": data.get("vendor", "").strip(),
|
||||
"vendor": str(data.get("vendor", "")).strip(),
|
||||
"total_amount": float(data.get("total_amount", 0)),
|
||||
"tax_amount": float(data.get("tax_amount", 0)),
|
||||
"date": data.get("date", ""),
|
||||
"category": data.get("category", "Other"),
|
||||
"date": str(data.get("date", "")).strip(),
|
||||
"category": str(data.get("category", "Other")).strip(),
|
||||
"confidence": float(data.get("confidence", 0.5)),
|
||||
"extraction_success": True
|
||||
}
|
||||
else:
|
||||
return {"error": "Could not parse JSON from AI response"}
|
||||
# Try to extract fields from plain text
|
||||
logger.warning("No JSON found in response, attempting text extraction")
|
||||
return self._extract_from_plain_text(result_text)
|
||||
|
||||
except Exception as e:
|
||||
return {"error": f"JSON parsing error: {str(e)}"}
|
||||
logger.error(f"JSON parsing error: {str(e)}")
|
||||
return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}
|
||||
|
||||
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
|
||||
"""Extract receipt data from plain text when JSON parsing fails"""
|
||||
try:
|
||||
import re
|
||||
|
||||
# Extract vendor (look for common patterns)
|
||||
vendor_patterns = [
|
||||
r'(?:vendor|store|merchant|company)\s*[:\-]?\s*([A-Za-z0-9\s&.,]+)',
|
||||
r'([A-Z][A-Za-z0-9\s&.,]{3,30})', # Capitalized words
|
||||
]
|
||||
|
||||
vendor = ""
|
||||
for pattern in vendor_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
vendor = match.group(1).strip()
|
||||
break
|
||||
|
||||
# Extract amount (look for currency patterns)
|
||||
amount_patterns = [
|
||||
r'\$?\s*([0-9,]+\.?[0-9]*)',
|
||||
r'(?:total|amount|sum)\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]*)',
|
||||
]
|
||||
|
||||
total_amount = 0.0
|
||||
for pattern in amount_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
total_amount = float(match.group(1).replace(',', ''))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Extract date
|
||||
date_patterns = [
|
||||
r'(\d{4}-\d{2}-\d{2})',
|
||||
r'(\d{1,2}/\d{1,2}/\d{2,4})',
|
||||
r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}',
|
||||
]
|
||||
|
||||
date = ""
|
||||
for pattern in date_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
date = match.group(0)
|
||||
break
|
||||
|
||||
return {
|
||||
"vendor": vendor or "Unknown",
|
||||
"total_amount": total_amount,
|
||||
"tax_amount": 0.0,
|
||||
"date": date or "",
|
||||
"category": "Other",
|
||||
"confidence": 0.3, # Low confidence for text extraction
|
||||
"extraction_success": True
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Text extraction error: {str(e)}")
|
||||
return {
|
||||
"vendor": "Unknown",
|
||||
"total_amount": 0.0,
|
||||
"tax_amount": 0.0,
|
||||
"date": "",
|
||||
"category": "Other",
|
||||
"confidence": 0.1,
|
||||
"extraction_success": False,
|
||||
"error": f"Text extraction failed: {str(e)}"
|
||||
}
|
||||
|
||||
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
|
||||
"""Save uploaded file to temporary storage"""
|
||||
@@ -286,44 +389,55 @@ class DocumentProcessor:
|
||||
try:
|
||||
import json
|
||||
import re
|
||||
|
||||
# Find JSON in response
|
||||
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_str = json_match.group()
|
||||
data = json.loads(json_str)
|
||||
|
||||
# Validate and clean data
|
||||
transactions = data.get("transactions", [])
|
||||
cleaned_transactions = []
|
||||
|
||||
for txn in transactions:
|
||||
try:
|
||||
# Clean and validate each transaction
|
||||
cleaned_txn = {
|
||||
"date": str(txn.get("date", "")).strip(),
|
||||
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
|
||||
"vendor": str(txn.get("vendor", "")).strip(),
|
||||
"memo": str(txn.get("memo", "")).strip()
|
||||
}
|
||||
cleaned_transactions.append(cleaned_txn)
|
||||
except Exception as e:
|
||||
# Skip invalid transactions
|
||||
continue
|
||||
|
||||
return {
|
||||
"extraction_success": data.get("extraction_success", True),
|
||||
"transactions": cleaned_transactions,
|
||||
"total_transactions": len(cleaned_transactions)
|
||||
}
|
||||
else:
|
||||
|
||||
# Find the first '{' and last '}'
|
||||
start = result_text.find('{')
|
||||
end = result_text.rfind('}')
|
||||
if start == -1 or end == -1 or end <= start:
|
||||
return {
|
||||
"extraction_success": False,
|
||||
"error": "Could not parse JSON from AI response",
|
||||
"error": "Could not find JSON object in AI response",
|
||||
"transactions": []
|
||||
}
|
||||
|
||||
json_str = result_text[start:end+1]
|
||||
|
||||
# Remove trailing commas before } or ]
|
||||
json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
|
||||
|
||||
try:
|
||||
data = json.loads(json_str)
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"JSON parsing error: {str(e)}")
|
||||
logging.error(f"Offending JSON string:\n{json_str}")
|
||||
return {
|
||||
"extraction_success": False,
|
||||
"error": f"JSON parsing error: {str(e)}",
|
||||
"transactions": []
|
||||
}
|
||||
|
||||
# Validate and clean data
|
||||
transactions = data.get("transactions", [])
|
||||
cleaned_transactions = []
|
||||
for txn in transactions:
|
||||
try:
|
||||
cleaned_txn = {
|
||||
"date": str(txn.get("date", "")).strip(),
|
||||
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
|
||||
"vendor": str(txn.get("vendor", "")).strip(),
|
||||
"memo": str(txn.get("memo", "")).strip()
|
||||
}
|
||||
cleaned_transactions.append(cleaned_txn)
|
||||
except Exception as e:
|
||||
continue
|
||||
return {
|
||||
"extraction_success": data.get("extraction_success", True),
|
||||
"transactions": cleaned_transactions,
|
||||
"total_transactions": len(cleaned_transactions)
|
||||
}
|
||||
except Exception as e:
|
||||
import logging
|
||||
logging.error(f"JSON parsing error (outer): {str(e)}")
|
||||
return {
|
||||
"extraction_success": False,
|
||||
"error": f"JSON parsing error: {str(e)}",
|
||||
|
||||
Reference in New Issue
Block a user