Implement batch processing for LLM-based tax analysis and enhance match confidence scoring

2025-10-05 19:38:34 +01:00
parent c45e3fa791
commit ae200bd30f
2 changed files with 367 additions and 4 deletions
@@ -51,6 +51,50 @@ class LLMTaxAnalyzer:
        self.model = "llama-3.1-8b-instant"
        self.max_retries = 3

+    def analyze_and_apply_tax_rules_batch(
+        self,
+        matches: list,  # List of Match objects
+        user_location: str = "ON",
+    ) -> list:
+        """
+        Run tax analysis for every match with a SINGLE LLM call.
+
+        Builds one prompt context covering all receipt-transaction pairs,
+        requests one batched analysis, then applies the per-match result
+        (looked up under the key ``match_<index>``) to each match.
+
+        Args:
+            matches: Match objects to analyze; each one is updated in place.
+            user_location: User location forwarded to the context builder.
+
+        Returns:
+            One entry per input match, in input order. A match with no usable
+            analysis gets " (Tax analysis incomplete)" appended to its
+            ``match_reason``; a match whose analysis could not be applied
+            gets " (Tax analysis error)".
+        """
+        if not matches:
+            return matches
+
+        # Build batch context for all matches
+        batch_context = self._build_batch_analysis_context(matches, user_location)
+
+        # Get LLM analysis for ALL matches at once
+        llm_batch_analysis = self._get_llm_tax_analysis_batch(
+            batch_context, len(matches)
+        )
+
+        # The payload is model output. If its top level is not a JSON object
+        # (e.g. the model answered with an array), discard it once here rather
+        # than failing on .get() and logging an error for every single match.
+        if not isinstance(llm_batch_analysis, dict):
+            logger.error(
+                f"Batch tax analysis returned {type(llm_batch_analysis).__name__} instead of a JSON object; skipping tax analysis"
+            )
+            llm_batch_analysis = {}
+
+        # Apply results to each match
+        enhanced_matches = []
+        for i, match in enumerate(matches):
+            match_analysis = llm_batch_analysis.get(f"match_{i}")
+
+            # A missing, empty or non-object entry cannot be applied; keep the
+            # match as-is and say so in its reason.
+            if not match_analysis or not isinstance(match_analysis, dict):
+                match.match_reason += " (Tax analysis incomplete)"
+                enhanced_matches.append(match)
+                continue
+
+            try:
+                enhanced_matches.append(
+                    self._apply_tax_analysis_to_match(match, match_analysis)
+                )
+            except Exception as e:
+                # One bad analysis must not drop the match or abort the batch.
+                logger.error(f"Error applying tax analysis to match {i}: {str(e)}")
+                match.match_reason += " (Tax analysis error)"
+                enhanced_matches.append(match)
+
+        return enhanced_matches
+
    def analyze_and_apply_tax_rules(
        self,
        receipt: Receipt,
@@ -58,6 +102,9 @@ class LLMTaxAnalyzer:
        user_location: str = "ON",  # Default to Ontario
    ) -> Dict[str, Any]:
        """
+        Legacy single-match analysis method (kept for backward compatibility).
+        Use analyze_and_apply_tax_rules_batch() for better performance.
+
        Use LLM to intelligently analyze and apply all tax rules:
        1. Sales tax based on receipt location (shipping/billing address priority)
        2. Foreign exchange rules for currency mismatches
@@ -496,3 +543,294 @@ Provide a structured JSON response with the following format:
                    "analysis_method": "fallback",
                },
            }
+
+    def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
+        """
+        Assemble the prompt context shared by a batched tax analysis.
+
+        The text contains the user's province summary, the provincial tax
+        rate and CCA rate tables serialized as JSON, and one section per
+        match labelled ``match_<index>``.
+
+        Args:
+            matches: Match objects exposing ``receipt`` and ``transaction``.
+            user_location: User location, normalized to a province code.
+
+        Returns:
+            The full context string to embed in the LLM prompt.
+        """
+
+        province_code = self._normalize_location_to_province(user_location)
+
+        logger.info(
+            f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {province_code}"
+        )
+
+        # Reference tables are identical for every match, so serialize once.
+        provincial_rates_json = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
+        cca_rates_json = json.dumps(self.CCA_RATES, indent=2)
+
+        def describe(idx, entry) -> str:
+            """Render the prompt section for a single match."""
+            rcpt = entry.receipt
+            txn = entry.transaction
+            detected_location = self._extract_receipt_location(rcpt)
+            return f"""
+MATCH {idx} (ID: match_{idx}):
+Receipt Details:
+  - Vendor: {rcpt.vendor}
+  - Amount: ${rcpt.amount:.2f}
+  - Tax: ${rcpt.tax:.2f}
+  - Currency: {rcpt.currency}
+  - Date: {rcpt.receipt_date.strftime("%Y-%m-%d")}
+  - Category: {rcpt.category}
+  - Description: {rcpt.description}
+  - Billing Address: {self._format_address(rcpt.billing_address)}
+  - Shipping Address: {self._format_address(rcpt.shipping_address)}
+  - Is Meals & Entertainment: {rcpt.is_meals_entertainment}
+
+Transaction Details:
+  - Vendor: {txn.vendor}
+  - Amount: ${txn.amount:.2f}
+  - Currency: {txn.currency}
+  - Date: {txn.transaction_date.strftime("%Y-%m-%d")}
+  - Notes: {txn.notes}
+  - FX Rate: {txn.fx_rate if txn.fx_rate else "N/A"}
+
+Receipt Location Detected:
+{detected_location}
+"""
+
+        match_sections = "\n".join(
+            describe(idx, entry) for idx, entry in enumerate(matches)
+        )
+
+        # Unknown provinces fall back to a 0.13 rate labelled "HST".
+        province_entry = self.PROVINCIAL_TAX_RATES.get(province_code, {})
+        rate_percent = province_entry.get("rate", 0.13) * 100
+        tax_label = province_entry.get("name", "HST")
+
+        return f"""
+USER CONTEXT:
+- User Location (Province): {province_code}
+- User Province Tax Rate: {rate_percent}%
+- User Tax Type: {tax_label}
+
+PROVINCIAL TAX RATES REFERENCE:
+{provincial_rates_json}
+
+CCA DEPRECIATION RATES BY ASSET CLASS:
+{cca_rates_json}
+
+=== MATCHES TO ANALYZE ({len(matches)} total) ===
+{match_sections}
+"""
+
+    def _get_llm_tax_analysis_batch(
+        self, context: str, num_matches: int
+    ) -> Dict[str, Any]:
+        """
+        Get tax rule analysis from LLM for ALL matches in a single call.
+
+        Args:
+            context: Batch context text that is embedded in the prompt.
+            num_matches: Number of matches described in ``context``; used in
+                the prompt and in the log line.
+
+        Returns:
+            The parsed JSON object, expected to be keyed by match ID
+            ("match_0", "match_1", ...). An empty dict is returned when the
+            call fails, the reply is not parseable JSON, or the reply is not
+            a JSON object, so that callers can fall back per match.
+        """
+
+        prompt = f"""
+You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches. Apply the following tax rules intelligently to EACH match.
+
+{context}
+
+=== FOUR CORE TAX RULES ===
+
+### 1. SALES TAX RULE
+**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
+
+**Key Principles**:
+- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
+- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
+- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
+
+**Scenario Examples**:
+a) User in Ontario, Receipt from Quebec:
+   - Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
+   
+b) User in Ontario, Receipt from USA (New York):
+   - DO NOT apply Canadian sales tax
+   - This is an international transaction
+   - Flag for FX review instead
+   
+c) User in Ontario, Receipt has NO address information:
+   - DEFAULT to user's location (Ontario 13% HST)
+
+**Tax Calculation**:
+- Compare calculated tax vs stated tax on receipt
+- Flag discrepancies for review
+
+### 2. FOREIGN EXCHANGE (FX) RULE
+**Purpose**: Handle currency mismatches between receipts and transactions.
+
+**Actions**:
+- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
+- Calculate expected transaction amount using FX rate if available
+- Flag discrepancies > $5 or 5% for manual review
+- If FX rate missing but currencies differ, flag for review
+
+### 3. DEPRECIATION RULE
+**Purpose**: Identify capital assets requiring depreciation based on USER'S location.
+
+**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.
+
+**Capital Asset Criteria**:
+- Cost > $500 typically
+- Useful life > 1 year
+- Examples: computers, vehicles, furniture, machinery, buildings
+
+**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction
+
+### 4. MEALS & ENTERTAINMENT RULE
+**Purpose**: Apply 50% tax deduction limit for M&E expenses.
+
+**Actions**:
+- Identify M&E expenses (meals, entertainment, client dinners, etc.)
+- Tax Deduction: 50% of total amount (including tax)
+- Accounting Deduction: 100% of total amount (including tax)
+- Always include sales tax in both calculations
+
+=== YOUR TASK ===
+
+Analyze EACH match (match_0, match_1, match_2, etc.) and return a JSON object where each key is the match ID (e.g., "match_0") and the value is the complete tax analysis for that match.
+
+Return your response as a SINGLE JSON object in this format:
+
+{{
+  "match_0": {{
+    "final_tax_amount": XX.XX,
+    "sales_tax": {{
+      "applicable_province": "XX",
+      "applicable_rate": 0.XX,
+      "tax_name": "HST/GST/PST",
+      "calculated_tax": XX.XX,
+      "stated_tax": XX.XX,
+      "discrepancy": XX.XX,
+      "reason": "Detailed explanation",
+      "requires_review": true/false
+    }},
+    "foreign_exchange": {{
+      "currency_mismatch": true/false,
+      "receipt_currency": "XXX",
+      "transaction_currency": "XXX",
+      "expected_transaction_amount": XX.XX,
+      "actual_transaction_amount": XX.XX,
+      "discrepancy": XX.XX,
+      "requires_manual_review": true/false,
+      "reason": "Explanation"
+    }},
+    "depreciation": {{
+      "is_capital_asset": true/false,
+      "asset_class": "class_XX",
+      "cca_rate": 0.XX,
+      "applicable_jurisdiction": "XX",
+      "reason": "Explanation"
+    }},
+    "meals_entertainment": {{
+      "is_meals_entertainment": true/false,
+      "tax_deduction_amount": XX.XX,
+      "accounting_deduction_amount": XX.XX,
+      "sales_tax_included": XX.XX,
+      "reason": "Explanation"
+    }},
+    "confidence_adjustment": {{
+      "boost": 0.XX,
+      "reduce": 0.XX,
+      "reason": "Why confidence should be adjusted"
+    }},
+    "overall_assessment": "Summary for this match"
+  }},
+  "match_1": {{
+    ... same structure ...
+  }},
+  ... for all {num_matches} matches ...
+}}
+
+**Critical Reminders**:
+- Sales tax uses RECEIPT location (or user location if receipt has none)
+- Depreciation ALWAYS uses USER location
+- For different addresses, use SHIPPING address for sales tax
+- International transactions: no Canadian tax + FX flag
+- Be precise with all calculations
+- Always explain your reasoning clearly
+- Return analysis for ALL {num_matches} matches
+"""
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Always return valid JSON with all requested matches.",
+                    },
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0.1,  # Low temperature for consistent, factual responses
+                max_tokens=8000,  # Higher limit for batch processing
+            )
+
+            # The message content may be None; treat that as an empty reply.
+            content = (response.choices[0].message.content or "").strip()
+            logger.info(
+                f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
+            )
+
+            # Prefer the contents of a Markdown code fence when one is present
+            json_str = content
+            if "```json" in content:
+                json_str = content.split("```json")[1].split("```")[0].strip()
+            elif "```" in content:
+                json_str = content.split("```")[1].split("```")[0].strip()
+
+            try:
+                batch_analysis = json.loads(json_str)
+            except json.JSONDecodeError:
+                # Models often wrap the object in prose or use a fence tag
+                # other than "json". Retry on the outermost {...} span before
+                # giving up on the whole batch.
+                start = content.find("{")
+                end = content.rfind("}")
+                if start == -1 or end <= start:
+                    raise
+                batch_analysis = json.loads(content[start : end + 1])
+
+            # Callers index the result by match ID, so anything other than a
+            # JSON object is unusable.
+            if not isinstance(batch_analysis, dict):
+                logger.error(
+                    f"Batch LLM tax analysis returned {type(batch_analysis).__name__} instead of a JSON object"
+                )
+                return {}
+
+            return batch_analysis
+
+        except Exception as e:
+            logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
+            # Return empty dict so each match can handle fallback individually
+            return {}
+
+    def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
+        """
+        Apply tax analysis results to a match object.
+
+        Stores the analysis on ``match.tax_analysis``, adjusts
+        ``match.confidence_score`` (kept within 0.0-1.0) by the analysis'
+        ``confidence_adjustment`` values, and appends explanatory notes and
+        review flags to ``match.match_reason``.
+
+        The analysis is model output, so it is read defensively: a section
+        that is missing or not an object is treated as empty, and a numeric
+        field that is missing, null or not parseable as a number is treated
+        as 0. Everything is computed before the match is touched, so a
+        rejected analysis never leaves the match half-updated.
+
+        Args:
+            match: Match object to update in place.
+            tax_analysis: Analysis dict for this single match.
+
+        Returns:
+            The same ``match`` object.
+
+        Raises:
+            TypeError: If ``tax_analysis`` is not a dict.
+        """
+        if not isinstance(tax_analysis, dict):
+            raise TypeError(
+                f"tax_analysis must be a dict, got {type(tax_analysis).__name__}"
+            )
+
+        def section(key: str) -> Dict[str, Any]:
+            """Return the named sub-object, or {} when absent or malformed."""
+            value = tax_analysis.get(key)
+            return value if isinstance(value, dict) else {}
+
+        def number(value: Any) -> float:
+            """Coerce a model-provided value to float, defaulting to 0.0."""
+            try:
+                return float(value)
+            except (TypeError, ValueError):
+                return 0.0
+
+        confidence_adj = section("confidence_adjustment")
+        boost = number(confidence_adj.get("boost"))
+        reduction = number(confidence_adj.get("reduce"))
+
+        # Collect flags for manual review
+        review_flags = []
+
+        # Check sales tax issues
+        if section("sales_tax").get("requires_review", False):
+            review_flags.append("Sales Tax Review Required")
+
+        # Check FX issues
+        fx_analysis = section("foreign_exchange")
+        if fx_analysis.get("requires_manual_review", False):
+            review_flags.append(
+                f"FX Review Required (Discrepancy: ${number(fx_analysis.get('discrepancy')):.2f})"
+            )
+
+        # Check depreciation
+        depreciation = section("depreciation")
+        if depreciation.get("is_capital_asset", False):
+            review_flags.append(
+                f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class') or 'Unknown'})"
+            )
+
+        # Check meals & entertainment
+        meals_ent = section("meals_entertainment")
+        if meals_ent.get("is_meals_entertainment", False):
+            tax_deduction = number(meals_ent.get("tax_deduction_amount"))
+            accounting_deduction = number(meals_ent.get("accounting_deduction_amount"))
+            review_flags.append(
+                f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
+            )
+
+        # Everything below mutates the match; nothing above can leave it
+        # partially updated.
+        match.tax_analysis = tax_analysis
+
+        # Boost confidence if tax rules validate the match
+        if boost > 0:
+            match.confidence_score = min(1.0, match.confidence_score + boost)
+            match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
+
+        # Reduce confidence if tax issues detected
+        if reduction > 0:
+            match.confidence_score = max(0.0, match.confidence_score - reduction)
+            match.match_reason += f" (Tax issues detected: -{reduction:.2f})"
+
+        # Add review flags to match reason
+        if review_flags:
+            match.match_reason += " | REVIEW: " + "; ".join(review_flags)
+
+        return match
@@ -25,11 +25,36 @@ class MatchingEngine:
            receipts, transactions
        )

-        # Apply rules and enhance matches
-        enhanced_matches = []
+        # Apply traditional rules first (lightweight, no API calls)
        for match in ai_matches:
-            enhanced_match = self._enhance_match_with_rules(match, user_location)
-            enhanced_matches.append(enhanced_match)
+            rule_results = self.rules_engine.apply_rules(
+                match.receipt, match.transaction
+            )
+
+            # Apply confidence boost from traditional rules
+            if rule_results["confidence_boost"] > 0:
+                match.confidence_score = min(
+                    1.0, match.confidence_score + rule_results["confidence_boost"]
+                )
+
+            # Auto-approve if rules say so
+            if rule_results["auto_approve"]:
+                match.confidence_score = 1.0
+                match.match_reason += " (Auto-approved by rules)"
+
+        # Now apply LLM-based tax analysis in a SINGLE batch call
+        try:
+            enhanced_matches = self.llm_tax_analyzer.analyze_and_apply_tax_rules_batch(
+                ai_matches, user_location
+            )
+        except Exception as e:
+            # If batch LLM analysis fails, log it and continue with matches as-is
+            import logging
+
+            logging.error(f"Batch LLM tax analysis failed: {str(e)}")
+            for match in ai_matches:
+                match.match_reason += " (Note: Advanced tax analysis unavailable)"
+            enhanced_matches = ai_matches

        return enhanced_matches