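Implement code changes to enhance functionality and improve performance: log extracted receipt data, return receipt-to-transaction match results as JSON with amount-weighted scoring, and move AI categorization rules to the top of the extraction prompts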

This commit is contained in:
2025-10-10 17:18:52 +00:00
parent 3559cbe19d
commit c8da3c61ca
4 changed files with 1632 additions and 749 deletions
+1 -1
View File
@@ -416,7 +416,7 @@ async def process_document(
user_location=request.user_location, user_location=request.user_location,
ai_rules=ai_rules_list, ai_rules=ai_rules_list,
) )
logger.info(f"Extracted receipt data: {receipt_data}")
# Parse date for database storage # Parse date for database storage
receipt_date = None receipt_date = None
if receipt_data.get("date"): if receipt_data.get("date"):
+87 -35
View File
@@ -130,8 +130,7 @@ Candidate {i + 1}:
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%) - Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%)
""" """
prompt = f""" prompt = f"""You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
RECEIPT TO MATCH: RECEIPT TO MATCH:
- Vendor: {receipt.vendor} - Vendor: {receipt.vendor}
@@ -143,27 +142,39 @@ RECEIPT TO MATCH:
CANDIDATE TRANSACTIONS: CANDIDATE TRANSACTIONS:
{candidates_text} {candidates_text}
SCORING CRITERIA: SCORING CRITERIA (Amount is the PRIMARY factor):
- Perfect matches (same vendor, amount, date): 0.95-1.0
- High confidence (minor differences): 0.8-0.94
- Medium confidence (moderate differences): 0.6-0.79
- Low confidence (significant differences): 0.4-0.59
- Very low confidence (major differences): 0.2-0.39
- Minimal similarity: 0.1-0.19
- No meaningful similarity: 0.0-0.09
The most important factor to consider is the Amount for both the transaction and the receipt. The closer the amounts, the higher the score. If the amounts are different or not close return a low score (0-0.1) based on other factors. Amount Similarity (MOST IMPORTANT - 60% weight):
Consider vendor name similarity, amount accuracy, date proximity, and description/notes relevance. - Exact match or within 1%: Start at 0.9-1.0
- Within 5%: Start at 0.75-0.89
- Within 10%: Start at 0.5-0.74
- Within 20%: Start at 0.3-0.49
- More than 20% difference: Start at 0.0-0.29
IMPORTANT: Then adjust UP or DOWN based on:
You MUST return the candidate with the highest match score, even if it's very low. Never return NONE. - Vendor similarity (20% weight): Exact or similar name increases score
Return ONLY the best match in this exact format: - Date proximity (15% weight): Within 7 days increases score, within 30 days moderate increase
CANDIDATE_NUMBER|CONFIDENCE_SCORE|REASON - Description/notes match (5% weight): Relevant keywords increase score
Example: 3|0.87|Same vendor name, exact amount match, 1 day apart EXAMPLES:
Example of low match: 5|0.15|Best available option despite significant differences in vendor and amount - Amount match + vendor match + close date = 0.95-1.0 (Perfect match)
""" - Amount match + different vendor + close date = 0.85-0.94 (High confidence)
- Amount match + different vendor + far date = 0.70-0.84 (Medium-high confidence)
- Amount similar (5%) + vendor match = 0.75-0.85 (Medium-high confidence)
- Amount similar (10%) + some matches = 0.50-0.69 (Medium confidence)
- Amount very different (>20%) = 0.0-0.29 regardless of other factors
CRITICAL: You MUST return valid JSON only. No explanations, no text before or after.
Return format:
{{"candidate_number": 1, "confidence_score": 0.87, "reason": "Exact amount match with similar vendor"}}
Another example:
{{"candidate_number": 3, "confidence_score": 0.15, "reason": "Poor match but best available"}}
Return ONLY JSON for the best candidate:"""
logger.info(f"This is the prompt: {prompt}")
for attempt in range(self.max_retries): for attempt in range(self.max_retries):
try: try:
result = self._call_groq_api_with_timeout( result = self._call_groq_api_with_timeout(
@@ -206,18 +217,59 @@ Example of low match: 5|0.15|Best available option despite significant differenc
return None return None
def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]: def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]:
"""Parse AI response for single best match""" """Parse AI response for single best match (JSON format)"""
import json
import re
result = result.strip() result = result.strip()
logger.debug(f"Parsing single match response: {result}") logger.debug(f"Parsing single match response: {result}")
try: try:
if result.upper().startswith("NONE"): # First, try to parse the entire result as JSON
# This should not happen with new prompt, but handle as parsing error try:
logger.warning( data = json.loads(result)
"AI returned NONE despite being instructed to always return best match" candidate_num = int(data.get("candidate_number", -1)) - 1
) score = float(data.get("confidence_score", 0.0))
return -1, 0.0, "AI returned NONE unexpectedly" reason = str(data.get("reason", "No reason provided"))
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed JSON: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
except json.JSONDecodeError:
pass
# Try to extract JSON object from the response using improved regex
# Note: [^{}]* cannot cross a brace, so this only matches a flat (non-nested) object whose keys appear in this order
json_pattern = r'\{[^{}]*"candidate_number"[^{}]*"confidence_score"[^{}]*"reason"[^{}]*\}'
json_match = re.search(json_pattern, result)
if json_match:
json_str = json_match.group()
data = json.loads(json_str)
candidate_num = int(data.get("candidate_number", -1)) - 1
score = float(data.get("confidence_score", 0.0))
reason = str(data.get("reason", "No reason provided"))
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed extracted JSON: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
# Try to find any JSON-like structure with the required fields
candidate_match = re.search(r'"candidate_number"\s*:\s*(\d+)', result)
score_match = re.search(r'"confidence_score"\s*:\s*([\d.]+)', result)
reason_match = re.search(r'"reason"\s*:\s*"([^"]*)"', result)
if candidate_match and score_match and reason_match:
candidate_num = int(candidate_match.group(1)) - 1
score = float(score_match.group(1))
reason = reason_match.group(1)
score = max(0.0, min(1.0, score))
logger.debug(f"Parsed fields individually: candidate={candidate_num}, score={score}, reason={reason}")
return candidate_num, score, reason
except (json.JSONDecodeError, ValueError, KeyError) as e:
logger.warning(f"Error parsing JSON response: {e}")
# Fallback to old pipe-delimited format for backwards compatibility
try:
if "|" in result: if "|" in result:
parts = result.split("|") parts = result.split("|")
if len(parts) >= 3: if len(parts) >= 3:
@@ -226,8 +278,6 @@ Example of low match: 5|0.15|Best available option despite significant differenc
reason = "|".join(parts[2:]).strip() reason = "|".join(parts[2:]).strip()
# Extract candidate number # Extract candidate number
import re
candidate_match = re.search(r"\d+", candidate_str) candidate_match = re.search(r"\d+", candidate_str)
if candidate_match: if candidate_match:
candidate_num = ( candidate_num = (
@@ -246,14 +296,13 @@ Example of low match: 5|0.15|Best available option despite significant differenc
score = max(0.0, min(1.0, score)) score = max(0.0, min(1.0, score))
logger.debug( logger.debug(
f"Parsed: candidate={candidate_num}, score={score}, reason={reason}" f"Parsed (fallback): candidate={candidate_num}, score={score}, reason={reason}"
) )
return candidate_num, score, reason return candidate_num, score, reason
except Exception as fallback_error:
logger.warning(f"Fallback parsing also failed: {fallback_error}")
except Exception as e: # Final fallback
logger.warning(f"Error parsing single match response: {e}")
# Fallback
logger.warning(f"Could not parse single match response: {result}") logger.warning(f"Could not parse single match response: {result}")
return -1, 0.0, f"Parse error: {result[:50]}..." return -1, 0.0, f"Parse error: {result[:50]}..."
@@ -455,8 +504,11 @@ Example of low match: 5|0.15|Best available option despite significant differenc
try: try:
response = self.client.chat.completions.create( response = self.client.chat.completions.create(
model=self.model, model=self.model,
messages=[{"role": "user", "content": prompt}], messages=[
max_tokens=200, {"role": "system", "content": "You are a JSON-only response assistant. Return only valid JSON, no explanations."},
{"role": "user", "content": prompt}
],
max_tokens=150,
temperature=0.1, temperature=0.1,
) )
return response.choices[0].message.content.strip() return response.choices[0].message.content.strip()
+44 -20
View File
@@ -2,6 +2,7 @@ import base64
import json import json
import logging import logging
import os import os
import re
from datetime import datetime from datetime import datetime
from typing import Any, Dict from typing import Any, Dict
@@ -125,23 +126,36 @@ class DocumentProcessor:
# Build AI rules context for categorization # Build AI rules context for categorization
ai_rules_context = "" ai_rules_context = ""
if ai_rules and len(ai_rules) > 0: if ai_rules and len(ai_rules) > 0:
ai_rules_context = "\n CATEGORIZATION RULES (IMPORTANT - Apply these first):" # Create a simple, direct instruction for each rule
ai_rules_context = "\n "
for idx, rule in enumerate(ai_rules, 1): for idx, rule in enumerate(ai_rules, 1):
condition = rule.get("condition", "") condition = rule.get("condition", "")
action = rule.get("action", "") action = rule.get("action", "")
ai_rules_context += f"\n {idx}. If {condition} → set category to '{action}'"
ai_rules_context += "\n - Apply these custom rules before using default categorization logic\n - If multiple rules match, use the first matching rule\n - If no rules match, use default categorization based on vendor type" # Extract the keyword and category from the rule
keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
if keyword_match and category_match:
keyword = keyword_match.group(1)
category = category_match.group(1).strip()
# Create one simple instruction per line
ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
ai_rules_context += "\n"
# Create Groq vision prompt # Create Groq vision prompt
prompt = f""" prompt = f"""
Analyze this receipt image and extract the following information in JSON format: Analyze this receipt image and extract the following information in JSON format.
{ai_rules_context}
JSON Format:
{{ {{
"vendor": "Store/company name", "vendor": "Store/company name",
"description": "Detailed description of items/services purchased", "description": "Detailed description of items/services purchased",
"total_amount": 0.00, "total_amount": 0.00,
"tax_amount": 0.00, "tax_amount": 0.00,
"date": "YYYY-MM-DD", "date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other", "category": "Check rules above first",
"confidence": 0.95, "confidence": 0.95,
"currency": "USD", "currency": "USD",
"location": "Province/State, Country", "location": "Province/State, Country",
@@ -150,10 +164,11 @@ class DocumentProcessor:
"name_of_asset": null, "name_of_asset": null,
"cca_rate": null, "cca_rate": null,
"useful_life": null, "useful_life": null,
"residual_value": null "residual_value": null,
"extraction_success": true
}} }}
Rules: EXTRACTION Rules:
- Extract vendor name as it appears on receipt - Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies") - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax - Total amount should be the final total including tax
@@ -161,7 +176,7 @@ class DocumentProcessor:
- Date should be the date on the receipt - Date should be the date on the receipt
- Confidence score 0-1 based on how clear the receipt is - Confidence score 0-1 based on how clear the receipt is
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD") - Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{ai_rules_context}
{user_location_context} {user_location_context}
LOCATION & TAX RULES: LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country) - Extract location from receipt (look for store address, province/state, country)
@@ -199,11 +214,9 @@ class DocumentProcessor:
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles) * residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null - If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
CATEGORY RULES:
- Assign the category based on all the details in the receipt
Return only valid JSON. Return only valid JSON.
""" """
logger.info(f"This is the prompt: {prompt}")
# Call Groq vision API with correct format # Call Groq vision API with correct format
response = self.client.chat.completions.create( response = self.client.chat.completions.create(
messages=[ messages=[
@@ -293,16 +306,27 @@ class DocumentProcessor:
# Build AI rules context for categorization # Build AI rules context for categorization
ai_rules_context = "" ai_rules_context = ""
if ai_rules and len(ai_rules) > 0: if ai_rules and len(ai_rules) > 0:
ai_rules_context = "\n CATEGORIZATION RULES (IMPORTANT - Apply these first):" # Create a simple, direct instruction for each rule
ai_rules_context = "\n "
for idx, rule in enumerate(ai_rules, 1): for idx, rule in enumerate(ai_rules, 1):
condition = rule.get("condition", "") condition = rule.get("condition", "")
action = rule.get("action", "") action = rule.get("action", "")
ai_rules_context += f"\n {idx}. If {condition} → set category to '{action}'"
ai_rules_context += "\n - Apply these custom rules before using default categorization logic\n - If multiple rules match, use the first matching rule\n - If no rules match, use default categorization based on vendor type" # Extract the keyword and category from the rule
keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
if keyword_match and category_match:
keyword = keyword_match.group(1)
category = category_match.group(1).strip()
# Create one simple instruction per line
ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
ai_rules_context += "\n"
prompt = f""" prompt = f"""
Analyze this receipt text and extract the following information in JSON format: Analyze this receipt text and extract the following information in JSON format.
{ai_rules_context}
Receipt Text: Receipt Text:
{text_content} {text_content}
@@ -313,7 +337,7 @@ class DocumentProcessor:
"total_amount": 0.00, "total_amount": 0.00,
"tax_amount": 0.00, "tax_amount": 0.00,
"date": "YYYY-MM-DD", "date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other", "category": "Check rules above first",
"confidence": 0.95, "confidence": 0.95,
"currency": "USD", "currency": "USD",
"location": "Province/State, Country", "location": "Province/State, Country",
@@ -322,10 +346,11 @@ class DocumentProcessor:
"name_of_asset": null, "name_of_asset": null,
"cca_rate": null, "cca_rate": null,
"useful_life": null, "useful_life": null,
"residual_value": null "residual_value": null,
"extraction_success": true
}} }}
Rules: EXTRACTION Rules:
- Extract vendor name as it appears on receipt - Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies") - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax - Total amount should be the final total including tax
@@ -333,7 +358,6 @@ class DocumentProcessor:
- Date should be the date on the receipt - Date should be the date on the receipt
- Confidence score 0-1 based on clarity - Confidence score 0-1 based on clarity
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD") - Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{ai_rules_context}
{user_location_context} {user_location_context}
LOCATION & TAX RULES: LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country) - Extract location from receipt (look for store address, province/state, country)
+1476 -669
View File
File diff suppressed because it is too large Load Diff