Switch receipt-to-transaction matching to JSON responses, rework AI categorization rules in the extraction prompts, and add prompt/result logging

2025-10-10 17:18:52 +00:00
parent 3559cbe19d
commit c8da3c61ca
4 changed files with 1632 additions and 749 deletions
@@ -416,7 +416,7 @@ async def process_document(
            user_location=request.user_location,
            ai_rules=ai_rules_list,
        )
-
+        logger.info(f"Extracted receipt data: {receipt_data}")
        # Parse date for database storage
        receipt_date = None
        if receipt_data.get("date"):
@@ -130,8 +130,7 @@ Candidate {i + 1}:
 - Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%)
 """

-        prompt = f"""
-You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
+        prompt = f"""You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.

 RECEIPT TO MATCH:
 - Vendor: {receipt.vendor}
@@ -143,27 +142,39 @@ RECEIPT TO MATCH:
 CANDIDATE TRANSACTIONS:
 {candidates_text}

-SCORING CRITERIA:
- Perfect matches (same vendor, amount, date): 0.95-1.0
- High confidence (minor differences): 0.8-0.94
- Medium confidence (moderate differences): 0.6-0.79
- Low confidence (significant differences): 0.4-0.59
- Very low confidence (major differences): 0.2-0.39
- Minimal similarity: 0.1-0.19
- No meaningful similarity: 0.0-0.09
+SCORING CRITERIA (Amount is the PRIMARY factor):

-The most important factor to consider is the Amount for both the transaction and the receipt. The closer the amounts, the higher the score. If the amounts are different or not close return a low score (0-0.1) based on other factors.
-Consider vendor name similarity, amount accuracy, date proximity, and description/notes relevance.
+Amount Similarity (MOST IMPORTANT - 60% weight):
+- Exact match or within 1%: Start at 0.9-1.0
+- Within 5%: Start at 0.75-0.89
+- Within 10%: Start at 0.5-0.74
+- Within 20%: Start at 0.3-0.49
+- More than 20% difference: Start at 0.0-0.29

-IMPORTANT: 
-You MUST return the candidate with the highest match score, even if it's very low. Never return NONE.
-Return ONLY the best match in this exact format:
-CANDIDATE_NUMBER|CONFIDENCE_SCORE|REASON
+Then adjust UP or DOWN based on:
+- Vendor similarity (20% weight): Exact or similar name increases score
+- Date proximity (15% weight): Within 7 days increases score, within 30 days moderate increase
+- Description/notes match (5% weight): Relevant keywords increase score

-Example: 3|0.87|Same vendor name, exact amount match, 1 day apart
-Example of low match: 5|0.15|Best available option despite significant differences in vendor and amount
-"""
+EXAMPLES:
+- Amount match + vendor match + close date = 0.95-1.0 (Perfect match)
+- Amount match + different vendor + close date = 0.85-0.94 (High confidence)
+- Amount match + different vendor + far date = 0.70-0.84 (Medium-high confidence)
+- Amount similar (5%) + vendor match = 0.75-0.85 (Medium-high confidence)
+- Amount similar (10%) + some matches = 0.50-0.69 (Medium confidence)
+- Amount very different (>20%) = 0.0-0.29 regardless of other factors

+CRITICAL: You MUST return valid JSON only. No explanations, no text before or after.
+
+Return format:
+{{"candidate_number": 1, "confidence_score": 0.87, "reason": "Exact amount match with similar vendor"}}
+
+Another example:
+{{"candidate_number": 3, "confidence_score": 0.15, "reason": "Poor match but best available"}}
+
+Return ONLY JSON for the best candidate:"""
+
+        logger.info(f"This is the prompt: {prompt}")
        for attempt in range(self.max_retries):
            try:
                result = self._call_groq_api_with_timeout(
@@ -206,18 +217,59 @@ Example of low match: 5|0.15|Best available option despite significant differenc
        return None

    def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]:
-        """Parse AI response for single best match"""
+        """Parse AI response for single best match (JSON format)"""
+        import json
+        import re
+        
        result = result.strip()
        logger.debug(f"Parsing single match response: {result}")

        try:
-            if result.upper().startswith("NONE"):
-                # This should not happen with new prompt, but handle as parsing error
-                logger.warning(
-                    "AI returned NONE despite being instructed to always return best match"
-                )
-                return -1, 0.0, "AI returned NONE unexpectedly"
+            # First, try to parse the entire result as JSON
+            try:
+                data = json.loads(result)
+                candidate_num = int(data.get("candidate_number", -1)) - 1
+                score = float(data.get("confidence_score", 0.0))
+                reason = str(data.get("reason", "No reason provided"))
+                score = max(0.0, min(1.0, score))
+                logger.debug(f"Parsed JSON: candidate={candidate_num}, score={score}, reason={reason}")
+                return candidate_num, score, reason
+            except json.JSONDecodeError:
+                pass
            
+            # Try to extract a JSON object embedded in surrounding text.
+            # The pattern only matches a flat object (no nested braces) with the three keys in this order.
+            json_pattern = r'\{[^{}]*"candidate_number"[^{}]*"confidence_score"[^{}]*"reason"[^{}]*\}'
+            json_match = re.search(json_pattern, result)
+            
+            if json_match:
+                json_str = json_match.group()
+                data = json.loads(json_str)
+                candidate_num = int(data.get("candidate_number", -1)) - 1
+                score = float(data.get("confidence_score", 0.0))
+                reason = str(data.get("reason", "No reason provided"))
+                score = max(0.0, min(1.0, score))
+                logger.debug(f"Parsed extracted JSON: candidate={candidate_num}, score={score}, reason={reason}")
+                return candidate_num, score, reason
+            
+            # Try to find any JSON-like structure with the required fields
+            candidate_match = re.search(r'"candidate_number"\s*:\s*(\d+)', result)
+            score_match = re.search(r'"confidence_score"\s*:\s*([\d.]+)', result)
+            reason_match = re.search(r'"reason"\s*:\s*"([^"]*)"', result)
+            
+            if candidate_match and score_match and reason_match:
+                candidate_num = int(candidate_match.group(1)) - 1
+                score = float(score_match.group(1))
+                reason = reason_match.group(1)
+                score = max(0.0, min(1.0, score))
+                logger.debug(f"Parsed fields individually: candidate={candidate_num}, score={score}, reason={reason}")
+                return candidate_num, score, reason
+
+        except (json.JSONDecodeError, ValueError, KeyError) as e:
+            logger.warning(f"Error parsing JSON response: {e}")
+            
+            # Fallback to old pipe-delimited format for backwards compatibility
+            try:
                if "|" in result:
                    parts = result.split("|")
                    if len(parts) >= 3:
@@ -226,8 +278,6 @@ Example of low match: 5|0.15|Best available option despite significant differenc
                        reason = "|".join(parts[2:]).strip()

                        # Extract candidate number
-                    import re
-
                        candidate_match = re.search(r"\d+", candidate_str)
                        if candidate_match:
                            candidate_num = (
@@ -246,14 +296,13 @@ Example of low match: 5|0.15|Best available option despite significant differenc
                        score = max(0.0, min(1.0, score))

                        logger.debug(
-                        f"Parsed: candidate={candidate_num}, score={score}, reason={reason}"
+                            f"Parsed (fallback): candidate={candidate_num}, score={score}, reason={reason}"
                        )
                        return candidate_num, score, reason
+            except Exception as fallback_error:
+                logger.warning(f"Fallback parsing also failed: {fallback_error}")

-        except Exception as e:
-            logger.warning(f"Error parsing single match response: {e}")
-
-        # Fallback
+        # Final fallback
        logger.warning(f"Could not parse single match response: {result}")
        return -1, 0.0, f"Parse error: {result[:50]}..."

@@ -455,8 +504,11 @@ Example of low match: 5|0.15|Best available option despite significant differenc
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
-                    messages=[{"role": "user", "content": prompt}],
-                    max_tokens=200,
+                    messages=[
+                        {"role": "system", "content": "You are a JSON-only response assistant. Return only valid JSON, no explanations."},
+                        {"role": "user", "content": prompt}
+                    ],
+                    max_tokens=150,
                    temperature=0.1,
                )
                return response.choices[0].message.content.strip()
@@ -2,6 +2,7 @@ import base64
 import json
 import logging
 import os
+import re
 from datetime import datetime
 from typing import Any, Dict

@@ -125,23 +126,36 @@ class DocumentProcessor:
            # Build AI rules context for categorization
            ai_rules_context = ""
            if ai_rules and len(ai_rules) > 0:
-                ai_rules_context = "\n            CATEGORIZATION RULES (IMPORTANT - Apply these first):"
+                # Create a simple, direct instruction for each rule
+                ai_rules_context = "\n            "
                for idx, rule in enumerate(ai_rules, 1):
                    condition = rule.get("condition", "")
                    action = rule.get("action", "")
-                    ai_rules_context += f"\n            {idx}. If {condition} → set category to '{action}'"
-                ai_rules_context += "\n            - Apply these custom rules before using default categorization logic\n            - If multiple rules match, use the first matching rule\n            - If no rules match, use default categorization based on vendor type"
+                    
+                    # Extract the keyword and category from the rule
+                    keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
+                    category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
+                    
+                    if keyword_match and category_match:
+                        keyword = keyword_match.group(1)
+                        category = category_match.group(1).strip()
+                        # Create one simple instruction per line
+                        ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n            '
+                
+                ai_rules_context += "\n"

            # Create Groq vision prompt
            prompt = f"""
-            Analyze this receipt image and extract the following information in JSON format:
+            Analyze this receipt image and extract the following information in JSON format.
+            {ai_rules_context}
+            JSON Format:
            {{
                "vendor": "Store/company name",
                "description": "Detailed description of items/services purchased",
                "total_amount": 0.00,
                "tax_amount": 0.00,
                "date": "YYYY-MM-DD",
-                "category": "Food/Transport/Office/Other",
+                "category": "Check rules above first",
                "confidence": 0.95,
                "currency": "USD",
                "location": "Province/State, Country",
@@ -150,10 +164,11 @@ class DocumentProcessor:
                "name_of_asset": null,
                "cca_rate": null,
                "useful_life": null,
-                "residual_value": null
+                "residual_value": null,
+                "extraction_success": True
            }}
            
-            Rules:
+            EXTRACTION Rules:
            - Extract vendor name as it appears on receipt
            - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
            - Total amount should be the final total including tax
@@ -161,7 +176,7 @@ class DocumentProcessor:
            - Date should be the date on the receipt
            - Confidence score 0-1 based on how clear the receipt is
            - Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
-            {ai_rules_context}
+            
            {user_location_context}
            LOCATION & TAX RULES:
            - Extract location from receipt (look for store address, province/state, country)
@@ -199,11 +214,9 @@ class DocumentProcessor:
              * residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
            - If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null

-            CATEGORY RULES:
-            - Assign the category based on all the details in the receipt
            Return only valid JSON.
            """
-
+            logger.info(f"This is the prompt: {prompt}")
            # Call Groq vision API with correct format
            response = self.client.chat.completions.create(
                messages=[
@@ -293,16 +306,27 @@ class DocumentProcessor:
            # Build AI rules context for categorization
            ai_rules_context = ""
            if ai_rules and len(ai_rules) > 0:
-                ai_rules_context = "\n            CATEGORIZATION RULES (IMPORTANT - Apply these first):"
+                # Create a simple, direct instruction for each rule
+                ai_rules_context = "\n            "
                for idx, rule in enumerate(ai_rules, 1):
                    condition = rule.get("condition", "")
                    action = rule.get("action", "")
-                    ai_rules_context += f"\n            {idx}. If {condition} → set category to '{action}'"
-                ai_rules_context += "\n            - Apply these custom rules before using default categorization logic\n            - If multiple rules match, use the first matching rule\n            - If no rules match, use default categorization based on vendor type"
+                    
+                    # Extract the keyword and category from the rule
+                    keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
+                    category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
+                    
+                    if keyword_match and category_match:
+                        keyword = keyword_match.group(1)
+                        category = category_match.group(1).strip()
+                        # Create one simple instruction per line
+                        ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n            '
+                
+                ai_rules_context += "\n"

            prompt = f"""
-            Analyze this receipt text and extract the following information in JSON format:
-            
+            Analyze this receipt text and extract the following information in JSON format.
+            {ai_rules_context}
            Receipt Text:
            {text_content}
            
@@ -313,7 +337,7 @@ class DocumentProcessor:
                "total_amount": 0.00,
                "tax_amount": 0.00,
                "date": "YYYY-MM-DD",
-                "category": "Food/Transport/Office/Other",
+                "category": "Check rules above first",
                "confidence": 0.95,
                "currency": "USD",
                "location": "Province/State, Country",
@@ -322,10 +346,11 @@ class DocumentProcessor:
                "name_of_asset": null,
                "cca_rate": null,
                "useful_life": null,
-                "residual_value": null
+                "residual_value": null,
+                "extraction_success": True
            }}
            
-            Rules:
+            EXTRACTION Rules:
            - Extract vendor name as it appears on receipt
            - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
            - Total amount should be the final total including tax
@@ -333,7 +358,6 @@ class DocumentProcessor:
            - Date should be the date on the receipt
            - Confidence score 0-1 based on clarity
            - Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
-            {ai_rules_context}
            {user_location_context}
            LOCATION & TAX RULES:
            - Extract location from receipt (look for store address, province/state, country)