Implement batch processing for LLM-based tax analysis and enhance match confidence scoring

2025-10-05 19:38:34 +01:00
parent c45e3fa791
commit ae200bd30f
2 changed files with 367 additions and 4 deletions
@@ -51,6 +51,50 @@ class LLMTaxAnalyzer:
        self.model = "llama-3.1-8b-instant"
        self.max_retries = 3
    def analyze_and_apply_tax_rules_batch(
        self,
        matches: list,  # List of Match objects
        user_location: str = "ON",
    ) -> list:
        """
        Run the LLM tax analysis for every match with one request.

        A single context is built for the whole list, sent to the LLM once, and
        the per-match answers (looked up under the keys ``match_0``,
        ``match_1``, ...) are applied to the corresponding match objects.

        Args:
            matches: Match objects to analyze. A falsy value is returned as-is
                without contacting the LLM.
            user_location: User location forwarded to the context builder.

        Returns:
            A new list holding one entry per input match, in input order.
            Matches with no analysis get " (Tax analysis incomplete)" appended
            to ``match_reason``; matches whose analysis could not be applied get
            " (Tax analysis error)" appended and the failure is logged.
        """
        if not matches:
            return matches

        # One context and one LLM round-trip cover the entire batch.
        context = self._build_batch_analysis_context(matches, user_location)
        analysis_by_id = self._get_llm_tax_analysis_batch(context, len(matches))

        results = []
        for index, current in enumerate(matches):
            # Falls back to the untouched match unless the analysis applies cleanly.
            outcome = current
            try:
                analysis = analysis_by_id.get(f"match_{index}", {})
                if analysis:
                    outcome = self._apply_tax_analysis_to_match(current, analysis)
                else:
                    # The LLM returned nothing usable for this position.
                    current.match_reason += " (Tax analysis incomplete)"
            except Exception as exc:
                logger.error(
                    f"Error applying tax analysis to match {index}: {str(exc)}"
                )
                current.match_reason += " (Tax analysis error)"
                outcome = current
            results.append(outcome)

        return results
    def analyze_and_apply_tax_rules(
        self,
        receipt: Receipt,
@@ -58,6 +102,9 @@ class LLMTaxAnalyzer:
        user_location: str = "ON",  # Default to Ontario
    ) -> Dict[str, Any]:
        """
        Legacy single-match analysis method (kept for backward compatibility).
        Use analyze_and_apply_tax_rules_batch() for better performance.
        Use LLM to intelligently analyze and apply all tax rules:
        1. Sales tax based on receipt location (shipping/billing address priority)
        2. Foreign exchange rules for currency mismatches
@@ -496,3 +543,294 @@ Provide a structured JSON response with the following format:
                    "analysis_method": "fallback",
                },
            }
    def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
        """Build comprehensive context for batch LLM analysis of all matches"""
        # Normalize user_location to province code
        user_province = self._normalize_location_to_province(user_location)
        logger.info(
            f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {user_province}"
        )
        # Build tax rates and CCA references once
        tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
        cca_rates_info = json.dumps(self.CCA_RATES, indent=2)
        # Build match entries
        matches_info = []
        for i, match in enumerate(matches):
            receipt = match.receipt
            transaction = match.transaction
            receipt_location = self._extract_receipt_location(receipt)
            match_info = f"""
 MATCH {i} (ID: match_{i}):
 Receipt Details:
  - Vendor: {receipt.vendor}
  - Amount: ${receipt.amount:.2f}
  - Tax: ${receipt.tax:.2f}
  - Currency: {receipt.currency}
  - Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
  - Category: {receipt.category}
  - Description: {receipt.description}
  - Billing Address: {self._format_address(receipt.billing_address)}
  - Shipping Address: {self._format_address(receipt.shipping_address)}
  - Is Meals & Entertainment: {receipt.is_meals_entertainment}
 Transaction Details:
  - Vendor: {transaction.vendor}
  - Amount: ${transaction.amount:.2f}
  - Currency: {transaction.currency}
  - Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
  - Notes: {transaction.notes}
  - FX Rate: {transaction.fx_rate if transaction.fx_rate else "N/A"}
 Receipt Location Detected:
 {receipt_location}
 """
            matches_info.append(match_info)
        matches_section = "\n".join(matches_info)
        context = f"""
 USER CONTEXT:
 - User Location (Province): {user_province}
 - User Province Tax Rate: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("rate", 0.13) * 100}%
 - User Tax Type: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("name", "HST")}
 PROVINCIAL TAX RATES REFERENCE:
 {tax_rates_info}
 CCA DEPRECIATION RATES BY ASSET CLASS:
 {cca_rates_info}
 === MATCHES TO ANALYZE ({len(matches)} total) ===
 {matches_section}
 """
        return context
    def _get_llm_tax_analysis_batch(
        self, context: str, num_matches: int
    ) -> Dict[str, Any]:
        """Get tax rule analysis from LLM for ALL matches in a single call"""
        prompt = f"""
 You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches. Apply the following tax rules intelligently to EACH match.
 {context}
 === FOUR CORE TAX RULES ===
 ### 1. SALES TAX RULE
 **Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
 **Key Principles**:
 - When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
 - When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
 - Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
 **Scenario Examples**:
 a) User in Ontario, Receipt from Quebec:
   - Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
 b) User in Ontario, Receipt from USA (New York):
   - DO NOT apply Canadian sales tax
   - This is an international transaction
   - Flag for FX review instead
 c) User in Ontario, Receipt has NO address information:
   - DEFAULT to user's location (Ontario 13% HST)
 **Tax Calculation**:
 - Compare calculated tax vs stated tax on receipt
 - Flag discrepancies for review
 ### 2. FOREIGN EXCHANGE (FX) RULE
 **Purpose**: Handle currency mismatches between receipts and transactions.
 **Actions**:
 - Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
 - Calculate expected transaction amount using FX rate if available
 - Flag discrepancies > $5 or 5% for manual review
 - If FX rate missing but currencies differ, flag for review
 ### 3. DEPRECIATION RULE
 **Purpose**: Identify capital assets requiring depreciation based on USER'S location.
 **Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.
 **Capital Asset Criteria**:
 - Cost > $500 typically
 - Useful life > 1 year
 - Examples: computers, vehicles, furniture, machinery, buildings
 **CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction
 ### 4. MEALS & ENTERTAINMENT RULE
 **Purpose**: Apply 50% tax deduction limit for M&E expenses.
 **Actions**:
 - Identify M&E expenses (meals, entertainment, client dinners, etc.)
 - Tax Deduction: 50% of total amount (including tax)
 - Accounting Deduction: 100% of total amount (including tax)
 - Always include sales tax in both calculations
 === YOUR TASK ===
 Analyze EACH match (match_0, match_1, match_2, etc.) and return a JSON object where each key is the match ID (e.g., "match_0") and the value is the complete tax analysis for that match.
 Return your response as a SINGLE JSON object in this format:
 {{
  "match_0": {{
    "final_tax_amount": XX.XX,
    "sales_tax": {{
      "applicable_province": "XX",
      "applicable_rate": 0.XX,
      "tax_name": "HST/GST/PST",
      "calculated_tax": XX.XX,
      "stated_tax": XX.XX,
      "discrepancy": XX.XX,
      "reason": "Detailed explanation",
      "requires_review": true/false
    }},
    "foreign_exchange": {{
      "currency_mismatch": true/false,
      "receipt_currency": "XXX",
      "transaction_currency": "XXX",
      "expected_transaction_amount": XX.XX,
      "actual_transaction_amount": XX.XX,
      "discrepancy": XX.XX,
      "requires_manual_review": true/false,
      "reason": "Explanation"
    }},
    "depreciation": {{
      "is_capital_asset": true/false,
      "asset_class": "class_XX",
      "cca_rate": 0.XX,
      "applicable_jurisdiction": "XX",
      "reason": "Explanation"
    }},
    "meals_entertainment": {{
      "is_meals_entertainment": true/false,
      "tax_deduction_amount": XX.XX,
      "accounting_deduction_amount": XX.XX,
      "sales_tax_included": XX.XX,
      "reason": "Explanation"
    }},
    "confidence_adjustment": {{
      "boost": 0.XX,
      "reduce": 0.XX,
      "reason": "Why confidence should be adjusted"
    }},
    "overall_assessment": "Summary for this match"
  }},
  "match_1": {{
    ... same structure ...
  }},
  ... for all {num_matches} matches ...
 }}
 **Critical Reminders**:
 - Sales tax uses RECEIPT location (or user location if receipt has none)
 - Depreciation ALWAYS uses USER location
 - For different addresses, use SHIPPING address for sales tax
 - International transactions: no Canadian tax + FX flag
 - Be precise with all calculations
 - Always explain your reasoning clearly
 - Return analysis for ALL {num_matches} matches
 """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Always return valid JSON with all requested matches.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,  # Low temperature for consistent, factual responses
                max_tokens=8000,  # Higher limit for batch processing
            )
            content = response.choices[0].message.content.strip()
            logger.info(
                f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
            )
            # Parse the JSON response
            json_str = content
            if "```json" in content:
                json_str = content.split("```json")[1].split("```")[0].strip()
            elif "```" in content:
                json_str = content.split("```")[1].split("```")[0].strip()
            batch_analysis = json.loads(json_str)
            return batch_analysis
        except Exception as e:
            logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
            # Return empty dict so each match can handle fallback individually
            return {}
    def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
        """Apply one match's tax analysis results to the match object.

        The analysis comes from an LLM and is therefore treated as untrusted:
        a section that is missing, ``null`` or not an object is read as empty,
        and a numeric field that is missing, ``null`` or not convertible to a
        number is read as ``0``. This keeps a single malformed field from
        raising after the match has already been partially modified.

        Args:
            match: Object exposing ``confidence_score`` and ``match_reason``;
                it is mutated in place and gains a ``tax_analysis`` attribute.
            tax_analysis: Analysis dict for this match (sections
                ``confidence_adjustment``, ``sales_tax``, ``foreign_exchange``,
                ``depreciation``, ``meals_entertainment``).

        Returns:
            The same ``match`` object, with its confidence score clamped to
            ``[0.0, 1.0]`` after any adjustment and review flags appended to
            ``match_reason``.

        Raises:
            TypeError: If ``tax_analysis`` is not a dict; raised before the
                match is touched.
        """
        if not isinstance(tax_analysis, dict):
            raise TypeError(
                f"tax_analysis must be a dict, got {type(tax_analysis).__name__}"
            )

        def section(key):
            # Sections may come back as null or a non-object; read those as empty.
            value = tax_analysis.get(key)
            return value if isinstance(value, dict) else {}

        def number(value, default=0.0):
            # Numeric fields may come back as null or free text.
            try:
                return float(value)
            except (TypeError, ValueError):
                return default

        # Store the complete tax analysis
        match.tax_analysis = tax_analysis

        # Apply confidence adjustments based on tax analysis
        confidence_adj = section("confidence_adjustment")
        # Boost confidence if tax rules validate the match
        boost = number(confidence_adj.get("boost"))
        if boost > 0:
            match.confidence_score = min(1.0, match.confidence_score + boost)
            match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
        # Reduce confidence if tax issues detected
        reduce = number(confidence_adj.get("reduce"))
        if reduce > 0:
            match.confidence_score = max(0.0, match.confidence_score - reduce)
            match.match_reason += f" (Tax issues detected: -{reduce:.2f})"

        # Add flags for manual review if needed
        review_flags = []
        # Check sales tax issues
        if section("sales_tax").get("requires_review", False):
            review_flags.append("Sales Tax Review Required")
        # Check FX issues
        fx_analysis = section("foreign_exchange")
        if fx_analysis.get("requires_manual_review", False):
            fx_discrepancy = number(fx_analysis.get("discrepancy"))
            review_flags.append(
                f"FX Review Required (Discrepancy: ${fx_discrepancy:.2f})"
            )
        # Check depreciation
        depreciation = section("depreciation")
        if depreciation.get("is_capital_asset", False):
            asset_class = depreciation.get("asset_class") or "Unknown"
            review_flags.append(
                f"Capital Asset - Depreciation Applicable ({asset_class})"
            )
        # Check meals & entertainment
        meals_ent = section("meals_entertainment")
        if meals_ent.get("is_meals_entertainment", False):
            tax_deduction = number(meals_ent.get("tax_deduction_amount"))
            accounting_deduction = number(meals_ent.get("accounting_deduction_amount"))
            review_flags.append(
                f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
            )
        # Add review flags to match reason
        if review_flags:
            match.match_reason += " | REVIEW: " + "; ".join(review_flags)
        return match
@@ -25,11 +25,36 @@ class MatchingEngine:
            receipts, transactions
        )
-        # Apply rules and enhance matches
+        # Apply traditional rules first (lightweight, no API calls)
        enhanced_matches = []
        for match in ai_matches:
-            enhanced_match = self._enhance_match_with_rules(match, user_location)
+            rule_results = self.rules_engine.apply_rules(
-            enhanced_matches.append(enhanced_match)
+                match.receipt, match.transaction
            )
            # Apply confidence boost from traditional rules
            if rule_results["confidence_boost"] > 0:
                match.confidence_score = min(
                    1.0, match.confidence_score + rule_results["confidence_boost"]
                )
            # Auto-approve if rules say so
            if rule_results["auto_approve"]:
                match.confidence_score = 1.0
                match.match_reason += " (Auto-approved by rules)"
        # Now apply LLM-based tax analysis in a SINGLE batch call
        try:
            enhanced_matches = self.llm_tax_analyzer.analyze_and_apply_tax_rules_batch(
                ai_matches, user_location
            )
        except Exception as e:
            # If batch LLM analysis fails, log it and continue with matches as-is
            import logging
            logging.error(f"Batch LLM tax analysis failed: {str(e)}")
            for match in ai_matches:
                match.match_reason += " (Note: Advanced tax analysis unavailable)"
            enhanced_matches = ai_matches
        return enhanced_matches