Implement batch processing for LLM-based tax analysis and enhance match confidence scoring

This commit is contained in:
bolade
2025-10-05 19:38:34 +01:00
parent c45e3fa791
commit ae200bd30f
2 changed files with 367 additions and 4 deletions
+338
View File
@@ -51,6 +51,50 @@ class LLMTaxAnalyzer:
self.model = "llama-3.1-8b-instant"
self.max_retries = 3
def analyze_and_apply_tax_rules_batch(
    self,
    matches: list,  # List of Match objects
    user_location: str = "ON",
) -> list:
    """
    Run the LLM tax analysis for every match with one request.

    A single prompt context is built for the whole list, sent to the LLM
    once, and the per-match results (looked up under the keys "match_0",
    "match_1", ...) are then applied to the corresponding match.

    Matches are updated in place: a match with no usable analysis gets a
    note appended to ``match_reason`` and is returned otherwise unchanged.

    Args:
        matches: Match objects to analyze; an empty list is returned as-is.
        user_location: Location of the user, forwarded to the context
            builder.

    Returns:
        A list holding one entry per input match, in input order.
    """
    if not matches:
        return matches

    # One context and one LLM round-trip for the whole batch.
    prompt_context = self._build_batch_analysis_context(matches, user_location)
    batch_results = self._get_llm_tax_analysis_batch(prompt_context, len(matches))

    processed = []
    for index, candidate in enumerate(matches):
        try:
            per_match = batch_results.get(f"match_{index}", {})
            if not per_match:
                # Nothing came back for this match: keep it, but say so.
                candidate.match_reason += " (Tax analysis incomplete)"
                outcome = candidate
            else:
                outcome = self._apply_tax_analysis_to_match(candidate, per_match)
            processed.append(outcome)
        except Exception as e:
            # One bad entry must not discard the rest of the batch.
            logger.error(f"Error applying tax analysis to match {index}: {str(e)}")
            candidate.match_reason += " (Tax analysis error)"
            processed.append(candidate)

    return processed
def analyze_and_apply_tax_rules(
self,
receipt: Receipt,
@@ -58,6 +102,9 @@ class LLMTaxAnalyzer:
user_location: str = "ON", # Default to Ontario
) -> Dict[str, Any]:
"""
Legacy single-match analysis method (kept for backward compatibility).
Use analyze_and_apply_tax_rules_batch() for better performance.
Use LLM to intelligently analyze and apply all tax rules:
1. Sales tax based on receipt location (shipping/billing address priority)
2. Foreign exchange rules for currency mismatches
@@ -496,3 +543,294 @@ Provide a structured JSON response with the following format:
"analysis_method": "fallback",
},
}
def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
    """Build comprehensive context for batch LLM analysis of all matches.

    The returned text contains, in order: the user's province with its tax
    rate and tax name, the provincial tax-rate table and the CCA rate table
    (both dumped as JSON), and one "MATCH i" entry per match describing its
    receipt, its transaction and the detected receipt location.

    Args:
        matches: Match objects; each must expose ``receipt`` and
            ``transaction``.
        user_location: User location, normalized to a province code via
            ``_normalize_location_to_province``.

    Returns:
        The context string to embed in the batch prompt.
    """
    # Normalize user_location to province code
    user_province = self._normalize_location_to_province(user_location)
    logger.info(
        f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {user_province}"
    )

    # Build tax rates and CCA references once
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)

    # Resolve the user's own tax entry once. The percentage is rendered with
    # ":g" so binary float noise (0.07 * 100 -> 7.000000000000001) does not
    # end up in the prompt as a bogus tax rate.
    user_tax = self.PROVINCIAL_TAX_RATES.get(user_province, {})
    user_rate_percent = f"{user_tax.get('rate', 0.13) * 100:g}"
    user_tax_name = user_tax.get("name", "HST")

    # Build match entries
    matches_info = []
    for i, match in enumerate(matches):
        receipt = match.receipt
        transaction = match.transaction
        receipt_location = self._extract_receipt_location(receipt)
        # NOTE(review): the ":.2f" / strftime fields below assume amount, tax
        # and the dates are always populated - confirm against the models.
        match_info = f"""
MATCH {i} (ID: match_{i}):
Receipt Details:
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Tax: ${receipt.tax:.2f}
- Currency: {receipt.currency}
- Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
- Category: {receipt.category}
- Description: {receipt.description}
- Billing Address: {self._format_address(receipt.billing_address)}
- Shipping Address: {self._format_address(receipt.shipping_address)}
- Is Meals & Entertainment: {receipt.is_meals_entertainment}
Transaction Details:
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Currency: {transaction.currency}
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
- Notes: {transaction.notes}
- FX Rate: {transaction.fx_rate if transaction.fx_rate else "N/A"}
Receipt Location Detected:
{receipt_location}
"""
        matches_info.append(match_info)

    matches_section = "\n".join(matches_info)
    context = f"""
USER CONTEXT:
- User Location (Province): {user_province}
- User Province Tax Rate: {user_rate_percent}%
- User Tax Type: {user_tax_name}
PROVINCIAL TAX RATES REFERENCE:
{tax_rates_info}
CCA DEPRECIATION RATES BY ASSET CLASS:
{cca_rates_info}
=== MATCHES TO ANALYZE ({len(matches)} total) ===
{matches_section}
"""
    return context
def _get_llm_tax_analysis_batch(
    self, context: str, num_matches: int
) -> Dict[str, Any]:
    """Get tax rule analysis from LLM for ALL matches in a single call.

    Sends one chat-completion request whose prompt asks for a single JSON
    object keyed "match_0", "match_1", ... and parses the reply.

    Args:
        context: Batch context text embedded verbatim in the prompt.
        num_matches: Number of matches described by ``context``; used in
            the prompt and in log messages.

    Returns:
        The parsed JSON object. An empty dict is returned when the request
        fails, the reply is empty, the reply is not valid JSON, or the
        parsed JSON is not an object, so callers can always use ``.get``.
    """
    prompt = f"""
You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches. Apply the following tax rules intelligently to EACH match.
{context}
=== FOUR CORE TAX RULES ===
### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead
c) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)
**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review
### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.
**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate expected transaction amount using FX rate if available
- Flag discrepancies > $5 or 5% for manual review
- If FX rate missing but currencies differ, flag for review
### 3. DEPRECIATION RULE
**Purpose**: Identify capital assets requiring depreciation based on USER'S location.
**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.
**Capital Asset Criteria**:
- Cost > $500 typically
- Useful life > 1 year
- Examples: computers, vehicles, furniture, machinery, buildings
**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction
### 4. MEALS & ENTERTAINMENT RULE
**Purpose**: Apply 50% tax deduction limit for M&E expenses.
**Actions**:
- Identify M&E expenses (meals, entertainment, client dinners, etc.)
- Tax Deduction: 50% of total amount (including tax)
- Accounting Deduction: 100% of total amount (including tax)
- Always include sales tax in both calculations
=== YOUR TASK ===
Analyze EACH match (match_0, match_1, match_2, etc.) and return a JSON object where each key is the match ID (e.g., "match_0") and the value is the complete tax analysis for that match.
Return your response as a SINGLE JSON object in this format:
{{
"match_0": {{
"final_tax_amount": XX.XX,
"sales_tax": {{
"applicable_province": "XX",
"applicable_rate": 0.XX,
"tax_name": "HST/GST/PST",
"calculated_tax": XX.XX,
"stated_tax": XX.XX,
"discrepancy": XX.XX,
"reason": "Detailed explanation",
"requires_review": true/false
}},
"foreign_exchange": {{
"currency_mismatch": true/false,
"receipt_currency": "XXX",
"transaction_currency": "XXX",
"expected_transaction_amount": XX.XX,
"actual_transaction_amount": XX.XX,
"discrepancy": XX.XX,
"requires_manual_review": true/false,
"reason": "Explanation"
}},
"depreciation": {{
"is_capital_asset": true/false,
"asset_class": "class_XX",
"cca_rate": 0.XX,
"applicable_jurisdiction": "XX",
"reason": "Explanation"
}},
"meals_entertainment": {{
"is_meals_entertainment": true/false,
"tax_deduction_amount": XX.XX,
"accounting_deduction_amount": XX.XX,
"sales_tax_included": XX.XX,
"reason": "Explanation"
}},
"confidence_adjustment": {{
"boost": 0.XX,
"reduce": 0.XX,
"reason": "Why confidence should be adjusted"
}},
"overall_assessment": "Summary for this match"
}},
"match_1": {{
... same structure ...
}},
... for all {num_matches} matches ...
}}
**Critical Reminders**:
- Sales tax uses RECEIPT location (or user location if receipt has none)
- Depreciation ALWAYS uses USER location
- For different addresses, use SHIPPING address for sales tax
- International transactions: no Canadian tax + FX flag
- Be precise with all calculations
- Always explain your reasoning clearly
- Return analysis for ALL {num_matches} matches
"""
    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Always return valid JSON with all requested matches.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=8000,  # Higher limit for batch processing
        )

        # The message content may be missing; report that explicitly rather
        # than failing on None.strip() with an opaque AttributeError.
        raw_content = response.choices[0].message.content
        if not raw_content:
            logger.error(
                "Error getting batch LLM tax analysis: empty response content"
            )
            return {}
        content = raw_content.strip()
        logger.info(
            f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
        )

        # Parse the JSON response, unwrapping a Markdown code fence if present
        json_str = content
        if "```json" in content:
            json_str = content.split("```json")[1].split("```")[0].strip()
        elif "```" in content:
            json_str = content.split("```")[1].split("```")[0].strip()

        try:
            batch_analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # Models sometimes wrap the object in un-fenced prose; retry on
            # the outermost {...} span before giving up.
            start, end = json_str.find("{"), json_str.rfind("}")
            if start == -1 or end <= start:
                raise
            batch_analysis = json.loads(json_str[start : end + 1])

        # Callers index the result with .get("match_<i>"), so anything other
        # than a JSON object is unusable.
        if not isinstance(batch_analysis, dict):
            logger.error(
                f"Error getting batch LLM tax analysis: expected a JSON object, got {type(batch_analysis).__name__}"
            )
            return {}
        return batch_analysis
    except Exception as e:
        logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
        # Return empty dict so each match can handle fallback individually
        return {}
def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
"""Apply tax analysis results to a match object"""
# Store the complete tax analysis
match.tax_analysis = tax_analysis
# Apply confidence adjustments based on tax analysis
confidence_adj = tax_analysis.get("confidence_adjustment", {})
# Boost confidence if tax rules validate the match
boost = confidence_adj.get("boost", 0.0)
if boost > 0:
match.confidence_score = min(1.0, match.confidence_score + boost)
match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
# Reduce confidence if tax issues detected
reduce = confidence_adj.get("reduce", 0.0)
if reduce > 0:
match.confidence_score = max(0.0, match.confidence_score - reduce)
match.match_reason += f" (Tax issues detected: -{reduce:.2f})"
# Add flags for manual review if needed
review_flags = []
# Check sales tax issues
sales_tax = tax_analysis.get("sales_tax", {})
if sales_tax.get("requires_review", False):
review_flags.append("Sales Tax Review Required")
# Check FX issues
fx_analysis = tax_analysis.get("foreign_exchange", {})
if fx_analysis.get("requires_manual_review", False):
review_flags.append(
f"FX Review Required (Discrepancy: ${fx_analysis.get('discrepancy', 0):.2f})"
)
# Check depreciation
depreciation = tax_analysis.get("depreciation", {})
if depreciation.get("is_capital_asset", False):
review_flags.append(
f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
)
# Check meals & entertainment
meals_ent = tax_analysis.get("meals_entertainment", {})
if meals_ent.get("is_meals_entertainment", False):
tax_deduction = meals_ent.get("tax_deduction_amount", 0)
accounting_deduction = meals_ent.get("accounting_deduction_amount", 0)
review_flags.append(
f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
)
# Add review flags to match reason
if review_flags:
match.match_reason += " | REVIEW: " + "; ".join(review_flags)
return match
+29 -4
View File
@@ -25,11 +25,36 @@ class MatchingEngine:
receipts, transactions
)
# Apply rules and enhance matches
enhanced_matches = []
# Apply traditional rules first (lightweight, no API calls)
for match in ai_matches:
enhanced_match = self._enhance_match_with_rules(match, user_location)
enhanced_matches.append(enhanced_match)
rule_results = self.rules_engine.apply_rules(
match.receipt, match.transaction
)
# Apply confidence boost from traditional rules
if rule_results["confidence_boost"] > 0:
match.confidence_score = min(
1.0, match.confidence_score + rule_results["confidence_boost"]
)
# Auto-approve if rules say so
if rule_results["auto_approve"]:
match.confidence_score = 1.0
match.match_reason += " (Auto-approved by rules)"
# Now apply LLM-based tax analysis in a SINGLE batch call
try:
enhanced_matches = self.llm_tax_analyzer.analyze_and_apply_tax_rules_batch(
ai_matches, user_location
)
except Exception as e:
# If batch LLM analysis fails, log it and continue with matches as-is
import logging
logging.error(f"Batch LLM tax analysis failed: {str(e)}")
for match in ai_matches:
match.match_reason += " (Note: Advanced tax analysis unavailable)"
enhanced_matches = ai_matches
return enhanced_matches