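Enhance receipt matching and extraction prompts: return AI match results as JSON (with a pipe-delimited fallback parser), make amount the primary scoring factor, and move custom categorization rules to the top of the receipt extraction prompts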

This commit is contained in:
2025-10-10 17:18:52 +00:00
parent 3559cbe19d
commit c8da3c61ca
4 changed files with 1632 additions and 749 deletions
+1 -1
View File
@@ -416,7 +416,7 @@ async def process_document(
user_location=request.user_location,
ai_rules=ai_rules_list,
)
logger.info(f"Extracted receipt data: {receipt_data}")
# Parse date for database storage
receipt_date = None
if receipt_data.get("date"):
+87 -35
View File
@@ -130,8 +130,7 @@ Candidate {i + 1}:
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%)
"""
prompt = f"""
You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
prompt = f"""You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
RECEIPT TO MATCH:
- Vendor: {receipt.vendor}
@@ -143,27 +142,39 @@ RECEIPT TO MATCH:
CANDIDATE TRANSACTIONS:
{candidates_text}
SCORING CRITERIA:
- Perfect matches (same vendor, amount, date): 0.95-1.0
- High confidence (minor differences): 0.8-0.94
- Medium confidence (moderate differences): 0.6-0.79
- Low confidence (significant differences): 0.4-0.59
- Very low confidence (major differences): 0.2-0.39
- Minimal similarity: 0.1-0.19
- No meaningful similarity: 0.0-0.09
SCORING CRITERIA (Amount is the PRIMARY factor):
The most important factor to consider is the Amount for both the transaction and the receipt. The closer the amounts, the higher the score. If the amounts are different or not close return a low score (0-0.1) based on other factors.
Consider vendor name similarity, amount accuracy, date proximity, and description/notes relevance.
Amount Similarity (MOST IMPORTANT - 60% weight):
- Exact match or within 1%: Start at 0.9-1.0
- Within 5%: Start at 0.75-0.89
- Within 10%: Start at 0.5-0.74
- Within 20%: Start at 0.3-0.49
- More than 20% difference: Start at 0.0-0.29
IMPORTANT:
You MUST return the candidate with the highest match score, even if it's very low. Never return NONE.
Return ONLY the best match in this exact format:
CANDIDATE_NUMBER|CONFIDENCE_SCORE|REASON
Then adjust UP or DOWN based on:
- Vendor similarity (20% weight): Exact or similar name increases score
- Date proximity (15% weight): Within 7 days increases score, within 30 days moderate increase
- Description/notes match (5% weight): Relevant keywords increase score
Example: 3|0.87|Same vendor name, exact amount match, 1 day apart
Example of low match: 5|0.15|Best available option despite significant differences in vendor and amount
"""
EXAMPLES:
- Amount match + vendor match + close date = 0.95-1.0 (Perfect match)
- Amount match + different vendor + close date = 0.85-0.94 (High confidence)
- Amount match + different vendor + far date = 0.70-0.84 (Medium-high confidence)
- Amount similar (5%) + vendor match = 0.75-0.85 (Medium-high confidence)
- Amount similar (10%) + some matches = 0.50-0.69 (Medium confidence)
- Amount very different (>20%) = 0.0-0.29 regardless of other factors
CRITICAL: You MUST return valid JSON only. No explanations, no text before or after.
Return format:
{{"candidate_number": 1, "confidence_score": 0.87, "reason": "Exact amount match with similar vendor"}}
Another example:
{{"candidate_number": 3, "confidence_score": 0.15, "reason": "Poor match but best available"}}
Return ONLY JSON for the best candidate:"""
logger.info(f"This is the prompt: {prompt}")
for attempt in range(self.max_retries):
try:
result = self._call_groq_api_with_timeout(
@@ -206,18 +217,59 @@ Example of low match: 5|0.15|Best available option despite significant differenc
return None
def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]:
"""Parse AI response for single best match"""
"""Parse AI response for single best match (JSON format)"""
import json
import re
result = result.strip()
logger.debug(f"Parsing single match response: {result}")
try:
if result.upper().startswith("NONE"):
# This should not happen with new prompt, but handle as parsing error
logger.warning(
"AI returned NONE despite being instructed to always return best match"
)
return -1, 0.0, "AI returned NONE unexpectedly"
# First, try to parse the entire result as JSON
try:
data = json.loads(result)
candidate_num = int(data.get("candidate_number", -1)) - 1
score = float(data.get("confidence_score", 0.0))
reason = str(data.get("reason", "No reason provided"))
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed JSON: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
except json.JSONDecodeError:
pass
# Try to extract JSON object from the response using improved regex
# This handles nested braces better
json_pattern = r'\{[^{}]*"candidate_number"[^{}]*"confidence_score"[^{}]*"reason"[^{}]*\}'
json_match = re.search(json_pattern, result)
if json_match:
json_str = json_match.group()
data = json.loads(json_str)
candidate_num = int(data.get("candidate_number", -1)) - 1
score = float(data.get("confidence_score", 0.0))
reason = str(data.get("reason", "No reason provided"))
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed extracted JSON: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
# Try to find any JSON-like structure with the required fields
candidate_match = re.search(r'"candidate_number"\s*:\s*(\d+)', result)
score_match = re.search(r'"confidence_score"\s*:\s*([\d.]+)', result)
reason_match = re.search(r'"reason"\s*:\s*"([^"]*)"', result)
if candidate_match and score_match and reason_match:
candidate_num = int(candidate_match.group(1)) - 1
score = float(score_match.group(1))
reason = reason_match.group(1)
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed fields individually: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
except (json.JSONDecodeError, ValueError, KeyError) as e:
logger.warning(f"Error parsing JSON response: {e}")
# Fallback to old pipe-delimited format for backwards compatibility
try:
if "|" in result:
parts = result.split("|")
if len(parts) >= 3:
@@ -226,8 +278,6 @@ Example of low match: 5|0.15|Best available option despite significant differenc
reason = "|".join(parts[2:]).strip()
# Extract candidate number
import re
candidate_match = re.search(r"\d+", candidate_str)
if candidate_match:
candidate_num = (
@@ -246,14 +296,13 @@ Example of low match: 5|0.15|Best available option despite significant differenc
score = max(0.0, min(1.0, score))
logger.debug(
f"Parsed: candidate={candidate_num}, score={score}, reason={reason}"
f"Parsed (fallback): candidate={candidate_num}, score={score}, reason={reason}"
)
return candidate_num, score, reason
except Exception as fallback_error:
logger.warning(f"Fallback parsing also failed: {fallback_error}")
except Exception as e:
logger.warning(f"Error parsing single match response: {e}")
# Fallback
# Final fallback
logger.warning(f"Could not parse single match response: {result}")
return -1, 0.0, f"Parse error: {result[:50]}..."
@@ -455,8 +504,11 @@ Example of low match: 5|0.15|Best available option despite significant differenc
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
max_tokens=200,
messages=[
{"role": "system", "content": "You are a JSON-only response assistant. Return only valid JSON, no explanations."},
{"role": "user", "content": prompt}
],
max_tokens=150,
temperature=0.1,
)
return response.choices[0].message.content.strip()
+44 -20
View File
@@ -2,6 +2,7 @@ import base64
import json
import logging
import os
import re
from datetime import datetime
from typing import Any, Dict
@@ -125,23 +126,36 @@ class DocumentProcessor:
# Build AI rules context for categorization
ai_rules_context = ""
if ai_rules and len(ai_rules) > 0:
ai_rules_context = "\n CATEGORIZATION RULES (IMPORTANT - Apply these first):"
# Create a simple, direct instruction for each rule
ai_rules_context = "\n "
for idx, rule in enumerate(ai_rules, 1):
condition = rule.get("condition", "")
action = rule.get("action", "")
ai_rules_context += f"\n {idx}. If {condition} → set category to '{action}'"
ai_rules_context += "\n - Apply these custom rules before using default categorization logic\n - If multiple rules match, use the first matching rule\n - If no rules match, use default categorization based on vendor type"
# Extract the keyword and category from the rule
keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
if keyword_match and category_match:
keyword = keyword_match.group(1)
category = category_match.group(1).strip()
# Create one simple instruction per line
ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
ai_rules_context += "\n"
# Create Groq vision prompt
prompt = f"""
Analyze this receipt image and extract the following information in JSON format:
Analyze this receipt image and extract the following information in JSON format.
{ai_rules_context}
JSON Format:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
@@ -150,10 +164,11 @@ class DocumentProcessor:
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null
"residual_value": null,
"extraction_success": True
}}
Rules:
EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
@@ -161,7 +176,7 @@ class DocumentProcessor:
- Date should be the date on the receipt
- Confidence score 0-1 based on how clear the receipt is
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{ai_rules_context}
{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
@@ -199,11 +214,9 @@ class DocumentProcessor:
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
CATEGORY RULES:
- Assign the category based on all the details in the receipt
Return only valid JSON.
"""
logger.info(f"This is the prompt: {prompt}")
# Call Groq vision API with correct format
response = self.client.chat.completions.create(
messages=[
@@ -293,16 +306,27 @@ class DocumentProcessor:
# Build AI rules context for categorization
ai_rules_context = ""
if ai_rules and len(ai_rules) > 0:
ai_rules_context = "\n CATEGORIZATION RULES (IMPORTANT - Apply these first):"
# Create a simple, direct instruction for each rule
ai_rules_context = "\n "
for idx, rule in enumerate(ai_rules, 1):
condition = rule.get("condition", "")
action = rule.get("action", "")
ai_rules_context += f"\n {idx}. If {condition} → set category to '{action}'"
ai_rules_context += "\n - Apply these custom rules before using default categorization logic\n - If multiple rules match, use the first matching rule\n - If no rules match, use default categorization based on vendor type"
# Extract the keyword and category from the rule
keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
if keyword_match and category_match:
keyword = keyword_match.group(1)
category = category_match.group(1).strip()
# Create one simple instruction per line
ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
ai_rules_context += "\n"
prompt = f"""
Analyze this receipt text and extract the following information in JSON format:
Analyze this receipt text and extract the following information in JSON format.
{ai_rules_context}
Receipt Text:
{text_content}
@@ -313,7 +337,7 @@ class DocumentProcessor:
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
@@ -322,10 +346,11 @@ class DocumentProcessor:
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null
"residual_value": null,
"extraction_success": True
}}
Rules:
EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
@@ -333,7 +358,6 @@ class DocumentProcessor:
- Date should be the date on the receipt
- Confidence score 0-1 based on clarity
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{ai_rules_context}
{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
+1476 -669
View File
File diff suppressed because it is too large. (Load Diff)