Implement batch processing for LLM-based tax analysis and enhance match confidence scoring
This commit is contained in:
@@ -51,6 +51,50 @@ class LLMTaxAnalyzer:
|
|||||||
self.model = "llama-3.1-8b-instant"
|
self.model = "llama-3.1-8b-instant"
|
||||||
self.max_retries = 3
|
self.max_retries = 3
|
||||||
|
|
||||||
|
def analyze_and_apply_tax_rules_batch(
    self,
    matches: list,  # List of Match objects
    user_location: str = "ON",
) -> list:
    """
    Batch process all matches in a SINGLE LLM call to reduce costs.

    Builds one prompt context for every receipt-transaction pair, asks the
    LLM for one JSON object keyed ``match_0``, ``match_1``, ... and applies
    each entry to the match at the same position.

    Args:
        matches: Match objects to analyze. Each is mutated in place; this
            method itself only appends to ``match_reason``.
        user_location: Location passed through to the context builder
            (defaults to "ON").

    Returns:
        A new list holding the same match objects in the same order. When
        ``matches`` is empty it is returned unchanged.
    """
    if not matches:
        return matches

    # Build batch context for all matches
    batch_context = self._build_batch_analysis_context(matches, user_location)

    # Get LLM analysis for ALL matches at once
    llm_batch_analysis = self._get_llm_tax_analysis_batch(
        batch_context, len(matches)
    )

    # The LLM may answer with valid JSON that is not an object (a list, a
    # string, ...). Treat that as "no analysis" once, instead of failing on
    # `.get` separately for every match.
    if not isinstance(llm_batch_analysis, dict):
        logger.warning(
            "Batch tax analysis is %s, expected a JSON object; ignoring it",
            type(llm_batch_analysis).__name__,
        )
        llm_batch_analysis = {}

    # Apply results to each match
    enhanced_matches = []
    for i, match in enumerate(matches):
        match_analysis = llm_batch_analysis.get(f"match_{i}")

        # Missing, empty or malformed (non-object) entries cannot be applied;
        # keep the match as-is and record why.
        if not isinstance(match_analysis, dict) or not match_analysis:
            match.match_reason += " (Tax analysis incomplete)"
            enhanced_matches.append(match)
            continue

        try:
            enhanced_matches.append(
                self._apply_tax_analysis_to_match(match, match_analysis)
            )
        except Exception as e:
            # Best effort: one bad entry must not discard the whole batch.
            logger.error("Error applying tax analysis to match %s: %s", i, e)
            match.match_reason += " (Tax analysis error)"
            enhanced_matches.append(match)

    return enhanced_matches
|
||||||
|
|
||||||
def analyze_and_apply_tax_rules(
|
def analyze_and_apply_tax_rules(
|
||||||
self,
|
self,
|
||||||
receipt: Receipt,
|
receipt: Receipt,
|
||||||
@@ -58,6 +102,9 @@ class LLMTaxAnalyzer:
|
|||||||
user_location: str = "ON", # Default to Ontario
|
user_location: str = "ON", # Default to Ontario
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
|
Legacy single-match analysis method (kept for backward compatibility).
|
||||||
|
Use analyze_and_apply_tax_rules_batch() for better performance.
|
||||||
|
|
||||||
Use LLM to intelligently analyze and apply all tax rules:
|
Use LLM to intelligently analyze and apply all tax rules:
|
||||||
1. Sales tax based on receipt location (shipping/billing address priority)
|
1. Sales tax based on receipt location (shipping/billing address priority)
|
||||||
2. Foreign exchange rules for currency mismatches
|
2. Foreign exchange rules for currency mismatches
|
||||||
@@ -496,3 +543,294 @@ Provide a structured JSON response with the following format:
|
|||||||
"analysis_method": "fallback",
|
"analysis_method": "fallback",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
    """Build comprehensive context for batch LLM analysis of all matches.

    The context has a user section (province, tax rate, tax type), the
    provincial tax-rate and CCA-rate tables serialised as JSON, and one
    entry per match labelled ``match_<index>`` so the LLM response can be
    mapped back by position.

    Args:
        matches: Objects exposing ``receipt`` and ``transaction`` attributes.
        user_location: Raw user location; normalised to a province code via
            ``_normalize_location_to_province``.

    Returns:
        The context text to embed in the batch prompt.
    """

    def money(value) -> str:
        # A missing amount is rendered as N/A: a single incomplete receipt
        # must not raise and abort the prompt for the whole batch.
        return "N/A" if value is None else f"${value:.2f}"

    def day(value) -> str:
        # Same reasoning as money(): tolerate a missing date.
        return "N/A" if value is None else value.strftime("%Y-%m-%d")

    # Normalize user_location to province code
    user_province = self._normalize_location_to_province(user_location)

    logger.info(
        "Building batch tax analysis context for %s matches - User Location: %s → Province Code: %s",
        len(matches),
        user_location,
        user_province,
    )

    # Build tax rates and CCA references once
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)

    # Build match entries
    matches_info = []
    for i, match in enumerate(matches):
        receipt = match.receipt
        transaction = match.transaction
        receipt_location = self._extract_receipt_location(receipt)

        matches_info.append(
            "\n".join(
                [
                    "",
                    f"MATCH {i} (ID: match_{i}):",
                    "Receipt Details:",
                    f"- Vendor: {receipt.vendor}",
                    f"- Amount: {money(receipt.amount)}",
                    f"- Tax: {money(receipt.tax)}",
                    f"- Currency: {receipt.currency}",
                    f"- Date: {day(receipt.receipt_date)}",
                    f"- Category: {receipt.category}",
                    f"- Description: {receipt.description}",
                    f"- Billing Address: {self._format_address(receipt.billing_address)}",
                    f"- Shipping Address: {self._format_address(receipt.shipping_address)}",
                    f"- Is Meals & Entertainment: {receipt.is_meals_entertainment}",
                    "",
                    "Transaction Details:",
                    f"- Vendor: {transaction.vendor}",
                    f"- Amount: {money(transaction.amount)}",
                    f"- Currency: {transaction.currency}",
                    f"- Date: {day(transaction.transaction_date)}",
                    f"- Notes: {transaction.notes}",
                    f"- FX Rate: {transaction.fx_rate or 'N/A'}",
                    "",
                    "Receipt Location Detected:",
                    f"{receipt_location}",
                    "",
                ]
            )
        )

    matches_section = "\n".join(matches_info)

    # Unknown provinces fall back to 13% / "HST", as before.
    province_tax = self.PROVINCIAL_TAX_RATES.get(user_province, {})
    # round() strips binary floating-point noise (0.07 * 100 would otherwise
    # print as 7.000000000000001) while keeping fractional rates intact.
    rate_percent = round(province_tax.get("rate", 0.13) * 100, 4)
    tax_name = province_tax.get("name", "HST")

    return "\n".join(
        [
            "",
            "USER CONTEXT:",
            f"- User Location (Province): {user_province}",
            f"- User Province Tax Rate: {rate_percent}%",
            f"- User Tax Type: {tax_name}",
            "",
            "PROVINCIAL TAX RATES REFERENCE:",
            tax_rates_info,
            "",
            "CCA DEPRECIATION RATES BY ASSET CLASS:",
            cca_rates_info,
            "",
            f"=== MATCHES TO ANALYZE ({len(matches)} total) ===",
            matches_section,
            "",
        ]
    )
|
||||||
|
|
||||||
|
def _get_llm_tax_analysis_batch(
    self, context: str, num_matches: int
) -> Dict[str, Any]:
    """Get tax rule analysis from LLM for ALL matches in a single call.

    Args:
        context: Pre-built context text describing the user and every match.
        num_matches: Number of matches in ``context``; only interpolated into
            the prompt and the log message.

    Returns:
        The parsed JSON object, expected to be keyed ``match_0``, ``match_1``,
        ... Returns an empty dict when the request fails or the reply is not
        a JSON object, so callers can fall back per match. Never raises.
    """

    # NOTE: prompt lines are kept flush-left on purpose so that no source
    # indentation leaks into the text sent to the model.
    prompt = f"""
You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches. Apply the following tax rules intelligently to EACH match.

{context}

=== FOUR CORE TAX RULES ===

### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.

**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).

**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST

b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead

c) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)

**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review

### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.

**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate expected transaction amount using FX rate if available
- Flag discrepancies > $5 or 5% for manual review
- If FX rate missing but currencies differ, flag for review

### 3. DEPRECIATION RULE
**Purpose**: Identify capital assets requiring depreciation based on USER'S location.

**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.

**Capital Asset Criteria**:
- Cost > $500 typically
- Useful life > 1 year
- Examples: computers, vehicles, furniture, machinery, buildings

**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction

### 4. MEALS & ENTERTAINMENT RULE
**Purpose**: Apply 50% tax deduction limit for M&E expenses.

**Actions**:
- Identify M&E expenses (meals, entertainment, client dinners, etc.)
- Tax Deduction: 50% of total amount (including tax)
- Accounting Deduction: 100% of total amount (including tax)
- Always include sales tax in both calculations

=== YOUR TASK ===

Analyze EACH match (match_0, match_1, match_2, etc.) and return a JSON object where each key is the match ID (e.g., "match_0") and the value is the complete tax analysis for that match.

Return your response as a SINGLE JSON object in this format:

{{
  "match_0": {{
    "final_tax_amount": XX.XX,
    "sales_tax": {{
      "applicable_province": "XX",
      "applicable_rate": 0.XX,
      "tax_name": "HST/GST/PST",
      "calculated_tax": XX.XX,
      "stated_tax": XX.XX,
      "discrepancy": XX.XX,
      "reason": "Detailed explanation",
      "requires_review": true/false
    }},
    "foreign_exchange": {{
      "currency_mismatch": true/false,
      "receipt_currency": "XXX",
      "transaction_currency": "XXX",
      "expected_transaction_amount": XX.XX,
      "actual_transaction_amount": XX.XX,
      "discrepancy": XX.XX,
      "requires_manual_review": true/false,
      "reason": "Explanation"
    }},
    "depreciation": {{
      "is_capital_asset": true/false,
      "asset_class": "class_XX",
      "cca_rate": 0.XX,
      "applicable_jurisdiction": "XX",
      "reason": "Explanation"
    }},
    "meals_entertainment": {{
      "is_meals_entertainment": true/false,
      "tax_deduction_amount": XX.XX,
      "accounting_deduction_amount": XX.XX,
      "sales_tax_included": XX.XX,
      "reason": "Explanation"
    }},
    "confidence_adjustment": {{
      "boost": 0.XX,
      "reduce": 0.XX,
      "reason": "Why confidence should be adjusted"
    }},
    "overall_assessment": "Summary for this match"
  }},
  "match_1": {{
    ... same structure ...
  }},
  ... for all {num_matches} matches ...
}}

**Critical Reminders**:
- Sales tax uses RECEIPT location (or user location if receipt has none)
- Depreciation ALWAYS uses USER location
- For different addresses, use SHIPPING address for sales tax
- International transactions: no Canadian tax + FX flag
- Be precise with all calculations
- Always explain your reasoning clearly
- Return analysis for ALL {num_matches} matches
"""

    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Always return valid JSON with all requested matches.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=8000,  # Higher limit for batch processing
        )

        raw_content = response.choices[0].message.content
        if not raw_content:
            # Fail with a clear message rather than an AttributeError on None.
            raise ValueError("LLM returned an empty response")

        content = raw_content.strip()
        logger.info(
            "LLM batch tax analysis received: %s characters for %s matches",
            len(content),
            num_matches,
        )

        # Parse the JSON response, unwrapping a Markdown code fence if present
        json_str = content
        if "```json" in content:
            json_str = content.split("```json")[1].split("```")[0].strip()
        elif "```" in content:
            json_str = content.split("```")[1].split("```")[0].strip()

        try:
            batch_analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # Models often wrap the object in prose ("Here is the analysis:
            # {...}"); retry on the outermost brace pair before giving up.
            start, end = json_str.find("{"), json_str.rfind("}")
            if start == -1 or end <= start:
                raise
            batch_analysis = json.loads(json_str[start : end + 1])

        # Callers index the result with .get("match_N"); anything other than
        # a JSON object would break them, so reject it here.
        if not isinstance(batch_analysis, dict):
            raise ValueError(
                f"expected a JSON object, got {type(batch_analysis).__name__}"
            )

        return batch_analysis

    except Exception as e:
        # Deliberate best-effort boundary: any API or parsing failure degrades
        # to "no analysis" instead of propagating.
        logger.error("Error getting batch LLM tax analysis: %s", e)
        # Return empty dict so each match can handle fallback individually
        return {}
|
||||||
|
|
||||||
|
def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
|
||||||
|
"""Apply tax analysis results to a match object"""
|
||||||
|
|
||||||
|
# Store the complete tax analysis
|
||||||
|
match.tax_analysis = tax_analysis
|
||||||
|
|
||||||
|
# Apply confidence adjustments based on tax analysis
|
||||||
|
confidence_adj = tax_analysis.get("confidence_adjustment", {})
|
||||||
|
|
||||||
|
# Boost confidence if tax rules validate the match
|
||||||
|
boost = confidence_adj.get("boost", 0.0)
|
||||||
|
if boost > 0:
|
||||||
|
match.confidence_score = min(1.0, match.confidence_score + boost)
|
||||||
|
match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
|
||||||
|
|
||||||
|
# Reduce confidence if tax issues detected
|
||||||
|
reduce = confidence_adj.get("reduce", 0.0)
|
||||||
|
if reduce > 0:
|
||||||
|
match.confidence_score = max(0.0, match.confidence_score - reduce)
|
||||||
|
match.match_reason += f" (Tax issues detected: -{reduce:.2f})"
|
||||||
|
|
||||||
|
# Add flags for manual review if needed
|
||||||
|
review_flags = []
|
||||||
|
|
||||||
|
# Check sales tax issues
|
||||||
|
sales_tax = tax_analysis.get("sales_tax", {})
|
||||||
|
if sales_tax.get("requires_review", False):
|
||||||
|
review_flags.append("Sales Tax Review Required")
|
||||||
|
|
||||||
|
# Check FX issues
|
||||||
|
fx_analysis = tax_analysis.get("foreign_exchange", {})
|
||||||
|
if fx_analysis.get("requires_manual_review", False):
|
||||||
|
review_flags.append(
|
||||||
|
f"FX Review Required (Discrepancy: ${fx_analysis.get('discrepancy', 0):.2f})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check depreciation
|
||||||
|
depreciation = tax_analysis.get("depreciation", {})
|
||||||
|
if depreciation.get("is_capital_asset", False):
|
||||||
|
review_flags.append(
|
||||||
|
f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check meals & entertainment
|
||||||
|
meals_ent = tax_analysis.get("meals_entertainment", {})
|
||||||
|
if meals_ent.get("is_meals_entertainment", False):
|
||||||
|
tax_deduction = meals_ent.get("tax_deduction_amount", 0)
|
||||||
|
accounting_deduction = meals_ent.get("accounting_deduction_amount", 0)
|
||||||
|
review_flags.append(
|
||||||
|
f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add review flags to match reason
|
||||||
|
if review_flags:
|
||||||
|
match.match_reason += " | REVIEW: " + "; ".join(review_flags)
|
||||||
|
|
||||||
|
return match
|
||||||
|
|||||||
@@ -25,11 +25,36 @@ class MatchingEngine:
|
|||||||
receipts, transactions
|
receipts, transactions
|
||||||
)
|
)
|
||||||
|
|
||||||
# Apply rules and enhance matches
|
# Apply traditional rules first (lightweight, no API calls)
|
||||||
enhanced_matches = []
|
|
||||||
for match in ai_matches:
|
for match in ai_matches:
|
||||||
enhanced_match = self._enhance_match_with_rules(match, user_location)
|
rule_results = self.rules_engine.apply_rules(
|
||||||
enhanced_matches.append(enhanced_match)
|
match.receipt, match.transaction
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply confidence boost from traditional rules
|
||||||
|
if rule_results["confidence_boost"] > 0:
|
||||||
|
match.confidence_score = min(
|
||||||
|
1.0, match.confidence_score + rule_results["confidence_boost"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Auto-approve if rules say so
|
||||||
|
if rule_results["auto_approve"]:
|
||||||
|
match.confidence_score = 1.0
|
||||||
|
match.match_reason += " (Auto-approved by rules)"
|
||||||
|
|
||||||
|
# Now apply LLM-based tax analysis in a SINGLE batch call
|
||||||
|
try:
|
||||||
|
enhanced_matches = self.llm_tax_analyzer.analyze_and_apply_tax_rules_batch(
|
||||||
|
ai_matches, user_location
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
# If batch LLM analysis fails, log it and continue with matches as-is
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logging.error(f"Batch LLM tax analysis failed: {str(e)}")
|
||||||
|
for match in ai_matches:
|
||||||
|
match.match_reason += " (Note: Advanced tax analysis unavailable)"
|
||||||
|
enhanced_matches = ai_matches
|
||||||
|
|
||||||
return enhanced_matches
|
return enhanced_matches
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user