From ae200bd30f7f33963b8a1593c80f85f5057ec65b Mon Sep 17 00:00:00 2001
From: bolade
Date: Sun, 5 Oct 2025 19:38:34 +0100
Subject: [PATCH] Implement batch processing for LLM-based tax analysis and enhance match confidence scoring

---
Review amendment: untrusted LLM output is now type-checked before it is
applied to matches (_get_llm_tax_analysis_batch, _apply_tax_analysis_to_match).
The blob id after ".." on the llm_tax_analyzer.py index line predates this
amendment.

 app/services/llm_tax_analyzer.py | 387 +++++++++++++++++++++++++++++++
 app/services/matching_engine.py  |  33 ++-
 2 files changed, 416 insertions(+), 4 deletions(-)

diff --git a/app/services/llm_tax_analyzer.py b/app/services/llm_tax_analyzer.py
index 2c2d37f..a4e9280 100644
--- a/app/services/llm_tax_analyzer.py
+++ b/app/services/llm_tax_analyzer.py
@@ -51,6 +51,50 @@ class LLMTaxAnalyzer:
         self.model = "llama-3.1-8b-instant"
         self.max_retries = 3
 
+    def analyze_and_apply_tax_rules_batch(
+        self,
+        matches: list,  # List of Match objects
+        user_location: str = "ON",
+    ) -> list:
+        """
+        Batch process all matches in a SINGLE LLM call to reduce costs.
+        Analyzes all receipt-transaction pairs together and applies tax rules.
+        """
+        if not matches:
+            return matches
+
+        # Build batch context for all matches
+        batch_context = self._build_batch_analysis_context(matches, user_location)
+
+        # Get LLM analysis for ALL matches at once
+        llm_batch_analysis = self._get_llm_tax_analysis_batch(
+            batch_context, len(matches)
+        )
+
+        # Apply results to each match
+        enhanced_matches = []
+        for i, match in enumerate(matches):
+            try:
+                # Get the analysis for this specific match from the batch results
+                match_analysis = llm_batch_analysis.get(f"match_{i}", {})
+
+                if match_analysis:
+                    # Apply the tax analysis to this match
+                    enhanced_match = self._apply_tax_analysis_to_match(
+                        match, match_analysis
+                    )
+                    enhanced_matches.append(enhanced_match)
+                else:
+                    # No analysis available for this match, use as-is
+                    match.match_reason += " (Tax analysis incomplete)"
+                    enhanced_matches.append(match)
+            except Exception as e:
+                logger.error(f"Error applying tax analysis to match {i}: {str(e)}")
+                match.match_reason += " (Tax analysis error)"
+                enhanced_matches.append(match)
+
+        return enhanced_matches
+
     def analyze_and_apply_tax_rules(
         self,
         receipt: Receipt,
@@ -58,6 +102,9 @@ class LLMTaxAnalyzer:
         user_location: str = "ON",  # Default to Ontario
     ) -> Dict[str, Any]:
         """
+        Legacy single-match analysis method (kept for backward compatibility).
+        Use analyze_and_apply_tax_rules_batch() for better performance.
+
         Use LLM to intelligently analyze and apply all tax rules:
         1. Sales tax based on receipt location (shipping/billing address priority)
         2. Foreign exchange rules for currency mismatches
@@ -496,3 +543,343 @@ Provide a structured JSON response with the following format:
                 "analysis_method": "fallback",
             },
         }
+
+    def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
+        """Build comprehensive context for batch LLM analysis of all matches"""
+
+        # Normalize user_location to province code
+        user_province = self._normalize_location_to_province(user_location)
+
+        logger.info(
+            f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {user_province}"
+        )
+
+        # Build tax rates and CCA references once
+        tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
+        cca_rates_info = json.dumps(self.CCA_RATES, indent=2)
+
+        # Build match entries
+        matches_info = []
+        for i, match in enumerate(matches):
+            receipt = match.receipt
+            transaction = match.transaction
+            receipt_location = self._extract_receipt_location(receipt)
+
+            match_info = f"""
+MATCH {i} (ID: match_{i}):
+Receipt Details:
+  - Vendor: {receipt.vendor}
+  - Amount: ${receipt.amount:.2f}
+  - Tax: ${receipt.tax:.2f}
+  - Currency: {receipt.currency}
+  - Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
+  - Category: {receipt.category}
+  - Description: {receipt.description}
+  - Billing Address: {self._format_address(receipt.billing_address)}
+  - Shipping Address: {self._format_address(receipt.shipping_address)}
+  - Is Meals & Entertainment: {receipt.is_meals_entertainment}
+
+Transaction Details:
+  - Vendor: {transaction.vendor}
+  - Amount: ${transaction.amount:.2f}
+  - Currency: {transaction.currency}
+  - Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
+  - Notes: {transaction.notes}
+  - FX Rate: {transaction.fx_rate if transaction.fx_rate else "N/A"}
+
+Receipt Location Detected:
+{receipt_location}
+"""
+            matches_info.append(match_info)
+
+        matches_section = "\n".join(matches_info)
+
+        context = f"""
+USER CONTEXT:
+- User Location (Province): {user_province}
+- User Province Tax Rate: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("rate", 0.13) * 100}%
+- User Tax Type: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("name", "HST")}
+
+PROVINCIAL TAX RATES REFERENCE:
+{tax_rates_info}
+
+CCA DEPRECIATION RATES BY ASSET CLASS:
+{cca_rates_info}
+
+=== MATCHES TO ANALYZE ({len(matches)} total) ===
+{matches_section}
+"""
+        return context
+
+    def _get_llm_tax_analysis_batch(
+        self, context: str, num_matches: int
+    ) -> Dict[str, Any]:
+        """
+        Get tax rule analysis from LLM for ALL matches in a single call.
+
+        Returns a dict keyed by "match_<i>"; returns an empty dict when the
+        request fails or the response is not a JSON object.
+        """
+
+        prompt = f"""
+You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches. Apply the following tax rules intelligently to EACH match.
+
+{context}
+
+=== FOUR CORE TAX RULES ===
+
+### 1. SALES TAX RULE
+**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
+
+**Key Principles**:
+- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
+- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
+- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
+
+**Scenario Examples**:
+a) User in Ontario, Receipt from Quebec:
+   - Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
+
+b) User in Ontario, Receipt from USA (New York):
+   - DO NOT apply Canadian sales tax
+   - This is an international transaction
+   - Flag for FX review instead
+
+c) User in Ontario, Receipt has NO address information:
+   - DEFAULT to user's location (Ontario 13% HST)
+
+**Tax Calculation**:
+- Compare calculated tax vs stated tax on receipt
+- Flag discrepancies for review
+
+### 2. FOREIGN EXCHANGE (FX) RULE
+**Purpose**: Handle currency mismatches between receipts and transactions.
+
+**Actions**:
+- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
+- Calculate expected transaction amount using FX rate if available
+- Flag discrepancies > $5 or 5% for manual review
+- If FX rate missing but currencies differ, flag for review
+
+### 3. DEPRECIATION RULE
+**Purpose**: Identify capital assets requiring depreciation based on USER'S location.
+
+**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.
+
+**Capital Asset Criteria**:
+- Cost > $500 typically
+- Useful life > 1 year
+- Examples: computers, vehicles, furniture, machinery, buildings
+
+**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction
+
+### 4. MEALS & ENTERTAINMENT RULE
+**Purpose**: Apply 50% tax deduction limit for M&E expenses.
+
+**Actions**:
+- Identify M&E expenses (meals, entertainment, client dinners, etc.)
+- Tax Deduction: 50% of total amount (including tax)
+- Accounting Deduction: 100% of total amount (including tax)
+- Always include sales tax in both calculations
+
+=== YOUR TASK ===
+
+Analyze EACH match (match_0, match_1, match_2, etc.) and return a JSON object where each key is the match ID (e.g., "match_0") and the value is the complete tax analysis for that match.
+
+Return your response as a SINGLE JSON object in this format:
+
+{{
+  "match_0": {{
+    "final_tax_amount": XX.XX,
+    "sales_tax": {{
+      "applicable_province": "XX",
+      "applicable_rate": 0.XX,
+      "tax_name": "HST/GST/PST",
+      "calculated_tax": XX.XX,
+      "stated_tax": XX.XX,
+      "discrepancy": XX.XX,
+      "reason": "Detailed explanation",
+      "requires_review": true/false
+    }},
+    "foreign_exchange": {{
+      "currency_mismatch": true/false,
+      "receipt_currency": "XXX",
+      "transaction_currency": "XXX",
+      "expected_transaction_amount": XX.XX,
+      "actual_transaction_amount": XX.XX,
+      "discrepancy": XX.XX,
+      "requires_manual_review": true/false,
+      "reason": "Explanation"
+    }},
+    "depreciation": {{
+      "is_capital_asset": true/false,
+      "asset_class": "class_XX",
+      "cca_rate": 0.XX,
+      "applicable_jurisdiction": "XX",
+      "reason": "Explanation"
+    }},
+    "meals_entertainment": {{
+      "is_meals_entertainment": true/false,
+      "tax_deduction_amount": XX.XX,
+      "accounting_deduction_amount": XX.XX,
+      "sales_tax_included": XX.XX,
+      "reason": "Explanation"
+    }},
+    "confidence_adjustment": {{
+      "boost": 0.XX,
+      "reduce": 0.XX,
+      "reason": "Why confidence should be adjusted"
+    }},
+    "overall_assessment": "Summary for this match"
+  }},
+  "match_1": {{
+    ... same structure ...
+  }},
+  ... for all {num_matches} matches ...
+}}
+
+**Critical Reminders**:
+- Sales tax uses RECEIPT location (or user location if receipt has none)
+- Depreciation ALWAYS uses USER location
+- For different addresses, use SHIPPING address for sales tax
+- International transactions: no Canadian tax + FX flag
+- Be precise with all calculations
+- Always explain your reasoning clearly
+- Return analysis for ALL {num_matches} matches
+"""
+
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Always return valid JSON with all requested matches.",
+                    },
+                    {"role": "user", "content": prompt},
+                ],
+                temperature=0.1,  # Low temperature for consistent, factual responses
+                max_tokens=8000,  # Higher limit for batch processing
+            )
+
+            content = response.choices[0].message.content.strip()
+            logger.info(
+                f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
+            )
+
+            # Parse the JSON response
+            json_str = content
+            if "```json" in content:
+                json_str = content.split("```json")[1].split("```")[0].strip()
+            elif "```" in content:
+                json_str = content.split("```")[1].split("```")[0].strip()
+
+            batch_analysis = json.loads(json_str)
+
+            # Callers look results up by "match_<i>" key, so anything other
+            # than a JSON object (e.g. a bare list) is unusable.
+            if not isinstance(batch_analysis, dict):
+                logger.error(
+                    f"Batch LLM tax analysis returned {type(batch_analysis).__name__}; expected a JSON object"
+                )
+                return {}
+            return batch_analysis
+
+        except Exception as e:
+            logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
+            # Return empty dict so each match can handle fallback individually
+            return {}
+
+    @staticmethod
+    def _as_float(value: Any, default: float = 0.0) -> float:
+        """Coerce an LLM-supplied value to float; return ``default`` for null or non-numeric input"""
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return default
+
+    @staticmethod
+    def _as_section(value: Any) -> Dict[str, Any]:
+        """Return ``value`` when it is a dict, otherwise an empty dict"""
+        return value if isinstance(value, dict) else {}
+
+    def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
+        """
+        Apply one match's tax analysis to the match object, in place.
+
+        Stores the analysis on ``match.tax_analysis``, moves
+        ``match.confidence_score`` by the reported boost/reduce (clamped to
+        [0.0, 1.0]) and appends adjustment notes and review flags to
+        ``match.match_reason``.
+
+        ``tax_analysis`` is model output: null, non-numeric or wrongly-typed
+        fields are treated as absent.
+
+        Raises:
+            TypeError: if ``tax_analysis`` is not a dict (raised before the
+                match is modified).
+
+        Returns the same ``match`` object.
+        """
+        if not isinstance(tax_analysis, dict):
+            raise TypeError(
+                f"tax_analysis must be a dict, got {type(tax_analysis).__name__}"
+            )
+
+        # Store the complete tax analysis
+        match.tax_analysis = tax_analysis
+
+        # Apply confidence adjustments based on tax analysis
+        confidence_adj = self._as_section(tax_analysis.get("confidence_adjustment"))
+
+        # Boost confidence if tax rules validate the match
+        boost = self._as_float(confidence_adj.get("boost"))
+        if boost > 0:
+            match.confidence_score = min(1.0, match.confidence_score + boost)
+            match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
+
+        # Reduce confidence if tax issues detected
+        reduce = self._as_float(confidence_adj.get("reduce"))
+        if reduce > 0:
+            match.confidence_score = max(0.0, match.confidence_score - reduce)
+            match.match_reason += f" (Tax issues detected: -{reduce:.2f})"
+
+        # Add flags for manual review if needed
+        review_flags = []
+
+        # Check sales tax issues
+        sales_tax = self._as_section(tax_analysis.get("sales_tax"))
+        if sales_tax.get("requires_review", False):
+            review_flags.append("Sales Tax Review Required")
+
+        # Check FX issues
+        fx_analysis = self._as_section(tax_analysis.get("foreign_exchange"))
+        if fx_analysis.get("requires_manual_review", False):
+            fx_discrepancy = self._as_float(fx_analysis.get("discrepancy"))
+            review_flags.append(
+                f"FX Review Required (Discrepancy: ${fx_discrepancy:.2f})"
+            )
+
+        # Check depreciation
+        depreciation = self._as_section(tax_analysis.get("depreciation"))
+        if depreciation.get("is_capital_asset", False):
+            review_flags.append(
+                f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
+            )
+
+        # Check meals & entertainment
+        meals_ent = self._as_section(tax_analysis.get("meals_entertainment"))
+        if meals_ent.get("is_meals_entertainment", False):
+            tax_deduction = self._as_float(meals_ent.get("tax_deduction_amount"))
+            accounting_deduction = self._as_float(
+                meals_ent.get("accounting_deduction_amount")
+            )
+            review_flags.append(
+                f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
+            )
+
+        # Add review flags to match reason
+        if review_flags:
+            match.match_reason += " | REVIEW: " + "; ".join(review_flags)
+
+        return match
diff --git a/app/services/matching_engine.py b/app/services/matching_engine.py
index ac42118..d7b1fd8 100644
--- a/app/services/matching_engine.py
+++ b/app/services/matching_engine.py
@@ -25,11 +25,36 @@ class MatchingEngine:
             receipts, transactions
         )
 
-        # Apply rules and enhance matches
-        enhanced_matches = []
+        # Apply traditional rules first (lightweight, no API calls)
         for match in ai_matches:
-            enhanced_match = self._enhance_match_with_rules(match, user_location)
-            enhanced_matches.append(enhanced_match)
+            rule_results = self.rules_engine.apply_rules(
+                match.receipt, match.transaction
+            )
+
+            # Apply confidence boost from traditional rules
+            if rule_results["confidence_boost"] > 0:
+                match.confidence_score = min(
+                    1.0, match.confidence_score + rule_results["confidence_boost"]
+                )
+
+            # Auto-approve if rules say so
+            if rule_results["auto_approve"]:
+                match.confidence_score = 1.0
+                match.match_reason += " (Auto-approved by rules)"
+
+        # Now apply LLM-based tax analysis in a SINGLE batch call
+        try:
+            enhanced_matches = self.llm_tax_analyzer.analyze_and_apply_tax_rules_batch(
+                ai_matches, user_location
+            )
+        except Exception as e:
+            # If batch LLM analysis fails, log it and continue with matches as-is
+            import logging
+
+            logging.error(f"Batch LLM tax analysis failed: {str(e)}")
+            for match in ai_matches:
+                match.match_reason += " (Note: Advanced tax analysis unavailable)"
+            enhanced_matches = ai_matches
 
         return enhanced_matches
 