Compare commits

24 Commits

Author SHA1 Message Date
michael f6535908fe added unmatched transactions 2025-11-17 14:43:54 +00:00
michael 8d745c1f8e refactor: update model initialization to use settings.model across services 2025-11-11 12:02:08 +00:00
michael 2b83ffe00c removed nohup output 2025-10-29 14:31:28 +00:00
michael 85fafae311 added a server manager 2025-10-29 14:27:44 +00:00
bolade fa25f7bafd removed dollar sign 2025-10-23 23:13:10 +01:00
bolade 2f917ec085 Added source column 2025-10-23 23:11:13 +01:00
bolade 7296d09319 Added quickbooks data 2025-10-23 19:38:59 +01:00
michael 01aa2efa43 Implement code changes to enhance functionality and improve performance 2025-10-13 17:06:03 +00:00
michael c8da3c61ca Implement code changes to enhance functionality and improve performance 2025-10-10 17:18:52 +00:00
michael 3559cbe19d Add test script for JSON extraction functionality
This commit introduces a new test script, `test_json_extraction.py`, which verifies the correctness of the JSON extraction logic. The script includes a function to extract the first valid JSON object from raw input and a series of test cases covering various scenarios, such as clean JSON, JSON with extra text, nested JSON, and escaped quotes. The tests ensure that the extraction function behaves as expected and handles edge cases appropriately.
2025-10-09 19:56:22 +00:00
bolade 2e020437a8 Add AI rules support for document processing and matching; enhance tax analysis with flag_for_review and auto_approve fields 2025-10-08 00:12:09 +01:00
bolade f582110674 Add name_of_asset field to receipt processing and update related logic in DocumentProcessor 2025-10-07 20:35:43 +01:00
bolade 5116fb5efb Enhance tax extraction rules in DocumentProcessor to clarify handling of explicit and unclear tax amounts on receipts 2025-10-07 12:44:27 +01:00
bolade b2bf631448 Refactor document processing endpoint to accept file_id as a path parameter and update related logic; modify DocumentProcessRequest to make file_id optional; add clarification to tax processing rules in DocumentProcessor. 2025-10-07 12:36:04 +01:00
bolade 659ca4ff15 Add user location support for tax calculations in document processing 2025-10-07 12:03:26 +01:00
bolade d8315f13ac Add new fields for tax and depreciation in receipt processing
- Introduced fields: receipt_location, calculated_tax, is_depreciable, cca_rate, useful_life, and residual_value in DBReceipt model.
- Updated process_document function to handle new receipt data attributes.
- Enhanced DocumentProcessResponse schema to include new fields.
- Updated document processing rules to incorporate tax calculation based on location and depreciation rules.
2025-10-07 11:15:26 +01:00
michael 823c05f78d Implement code changes to enhance functionality and improve performance 2025-10-05 23:38:03 +00:00
bolade c2a7c5a087 Add manual tax calculator for rule-based tax analysis and integrate with matching engine 2025-10-05 20:48:05 +01:00
bolade e3f610e01a Refine JSON response handling in batch analysis to exclude markdown code blocks and improve extraction logic 2025-10-05 20:36:47 +01:00
bolade 7c412bcf9e Enhance batch processing in LLMTaxAnalyzer with fallback to individual analysis on failure 2025-10-05 20:03:46 +01:00
bolade ae200bd30f Implement batch processing for LLM-based tax analysis and enhance match confidence scoring 2025-10-05 19:38:34 +01:00
bolade c45e3fa791 Add user location support and tax analysis enhancements
- Introduced user location extraction from user tax info for improved matching.
- Normalized user location to province codes for tax calculations.
- Updated MatchResponse schema to include tax analysis data.
- Enhanced LLMTaxAnalyzer to handle various location formats and provide fallback logic.
2025-10-05 18:34:35 +01:00
bolade c78c4c6fe9 Enhance receipt matching by adding user location support and implementing LLM-based tax analysis rules 2025-10-05 13:25:55 +01:00
michael 3d48cf0385 Add requirements.txt with essential dependencies for the project 2025-10-05 11:29:45 +00:00
26 changed files with 4263 additions and 1996 deletions
+10 -225
View File
@@ -1,229 +1,14 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be added to the global gitignore or merged into this project gitignore. For a PyCharm
# project, it is recommended to include the following files:
# .idea/
# *.iml
# *.ipr
# *.iws
# VS Code
.vscode/
# macOS
.DS_Store
.AppleDouble
.LSOverride
# Windows
Thumbs.db
ehthumbs.db
Desktop.ini
# Linux
*~
# Temporary files
*.tmp
*.temp
*.swp
*.swo
*~
# Log files
*.log
# Python bytecode and database files
__pycache__/
*.pyc
*.pyo
*.pyd
*.db
*.sqlite
*.sqlite3
# Configuration files with sensitive data
config.ini
secrets.json
.env.local
.env.production
# Test files
test_*.py
*_test.py
tests/
# Documentation
docs/
*.md
!README.md
# IDE files
.idea/
.vscode/
*.sublime-*
.atom/
# OS generated files
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
uploads/
chequing statement.csv
test_images/
.cursorrules.md
.env
*.log
/uploads
server_manager.sh
server.log
server.pid
-262
View File
@@ -1,262 +0,0 @@
# AI Bookkeeper - Data Science Engine
AI-powered receipt-to-transaction matching engine using Groq LLM. This is a **Data Science Engine** that provides intelligent matching capabilities for backend applications.
## 🎯 Purpose
This Data Science Engine receives QuickBooks transaction data from backend applications and provides:
- **AI-powered receipt processing** (OCR and data extraction)
- **Intelligent receipt-transaction matching** with confidence scores
- **Configurable AI rules** for business logic
- **Feedback logging** for continuous improvement
- **RESTful API** for easy integration
## 🚀 Quick Start
### 1. Install Dependencies
```bash
pip install -r requirements.txt
```
### 2. Configure API Keys
Create a `.env` file in the project root with your Groq API key:
```bash
# Create .env file
echo "GROQ_API_KEY=your_actual_groq_api_key_here" > .env
```
**Important**: Get your API key from [Groq Console](https://console.groq.com/)
### 3. Start the Server
```bash
# Option 1: Using the main script
python main.py
# Option 2: Using uvicorn directly
uvicorn main:app --host 0.0.0.0 --port 8343 --reload
```
### 4. Access API Documentation
- **Swagger UI**: http://localhost:8343/docs
- **ReDoc**: http://localhost:8343/redoc
## 📋 API Endpoints
### Transaction Import
- `POST /transactions/import/csv` - Import transactions from CSV file
- `POST /transactions/import/image` - Import transactions from image/PDF
### Receipt Processing
- `POST /upload-multiple` - Upload multiple receipt documents
- `POST /process/{file_id}` - Extract data from uploaded documents
### AI Matching Engine
- `POST /match-specific` - Match specific receipts to transactions using AI
### AI Rules Management
- `POST /rules` - Add new AI rules
- `GET /rules` - List all active rules
- `DELETE /rules/{rule_name}` - Delete rules
### System Monitoring
- `GET /stats` - Get system statistics and performance metrics
- `GET /` - Health check endpoint
## 🔧 Core Components
### **AIMatcher** (`ai_matcher.py`)
- Uses Groq LLM to compare receipts and transactions
- Provides confidence scores and reasoning
- Configurable matching criteria (amount, date, vendor)
- Rate limiting to prevent API quota exhaustion
### **AIRulesEngine** (`ai_rules.py`)
- Applies business rules for auto-approval and categorization
- Configurable rule conditions and actions
- Supports system and user-generated rules
- Safe condition evaluation with proper error handling
### **DocumentProcessor** (`document_processor.py`)
- AI-powered receipt data extraction using Groq vision model
- Supports PDF and image formats
- Robust JSON parsing with error handling
- Extracts vendor, amount, date, tax, and category information
### **MatchingEngine** (`matching_engine.py`)
- Main orchestrator combining all components
- Handles the complete matching workflow
- Provides statistics and feedback logging
- Configurable confidence thresholds
### **FeedbackLogger** (`feedback_logger.py`)
- Tracks manual overrides for AI training
- Maintains audit trail of user decisions
- Enables continuous model improvement
## 📊 Configuration
Edit `config.py` to adjust:
- **Confidence threshold** (default: 0.3)
- **Date tolerance days** (default: 7)
- **Amount tolerance percent** (default: 5%)
- **Groq API key** (from environment variable)
## 🔄 Integration Workflow
### 1. Import Transactions
```bash
# Import from CSV
curl -X POST -F "file=@transactions.csv" http://localhost:8343/transactions/import/csv
# Import from image
curl -X POST -F "file=@statement.jpg" http://localhost:8343/transactions/import/image
```
### 2. Upload and Process Receipts
```bash
# Upload receipts
curl -X POST -F "files=@receipt1.jpg" -F "files=@receipt2.jpg" http://localhost:8343/upload-multiple
# Process a specific receipt
curl -X POST http://localhost:8343/process/{file_id}
```
### 3. AI Matching
```bash
# Match specific receipts
curl -X POST -H "Content-Type: application/json" \
-d '["file_id_1", "file_id_2"]' \
http://localhost:8343/match-specific
```
### 4. Check Results
```bash
# Get system stats
curl http://localhost:8343/stats
# View AI rules
curl http://localhost:8343/rules
```
## 🎯 Key Features
- **AI-powered matching** with confidence scores
- **Rule-based auto-approval** and categorization
- **Feedback logging** for continuous improvement
- **Configurable matching parameters**
- **RESTful JSON API** for easy backend integration
- **Comprehensive error handling**
- **Rate limiting** to prevent API quota exhaustion
- **Robust JSON parsing** for AI responses
## 📝 Data Formats
### Transaction Input (CSV)
```csv
Date,Description,Amount,Category
2024-01-15,Starbucks Coffee,12.50,Food & Dining
2024-01-16,Office Supplies,45.99,Office
```
### Receipt Processing Output
```json
{
"vendor": "Starbucks",
"total_amount": 12.50,
"tax_amount": 1.25,
"date": "2024-01-15",
"category": "Food & Dining",
"confidence": 0.95,
"extraction_success": true
}
```
### Match Result Output
```json
{
"receipt_id": "uuid",
"transaction_id": "transaction_123",
"confidence_score": 0.95,
"match_reason": "Same vendor, minor date difference (Auto-approved by rules)",
"receipt_vendor": "Starbucks",
"receipt_amount": 12.50,
"transaction_vendor": "STARBUCKS",
"transaction_amount": 12.50
}
```
## 🔍 AI Matching Criteria
The engine uses multiple criteria for matching:
1. **Amount Similarity** - Compares receipt and transaction amounts (5% tolerance)
2. **Date Proximity** - Checks date closeness (7-day tolerance)
3. **Vendor Matching** - AI-powered vendor name comparison using Groq LLM
4. **Rule-based Auto-approval** - Automatic approval for exact matches and high-confidence matches
## 🛠️ Development
### Project Structure
```
├── main.py # FastAPI application entry point
├── ai_matcher.py # AI-powered matching logic
├── ai_rules.py # Business rules engine
├── document_processor.py # Receipt data extraction
├── matching_engine.py # Main matching orchestrator
├── feedback_logger.py # User feedback tracking
├── models.py # Pydantic data models
├── api_models.py # API request/response models
├── config.py # Configuration settings
├── requirements.txt # Python dependencies
└── test_images/ # Test image files
```
### Running Tests
```bash
# Test the server
curl http://localhost:8343/
# Test stats endpoint
curl http://localhost:8343/stats
# Test rules endpoint
curl http://localhost:8343/rules
```
## 🚀 Production Deployment
For production deployment:
- Replace in-memory storage with a database (PostgreSQL recommended)
- Configure proper authentication and authorization
- Set up monitoring and logging (ELK stack recommended)
- Use environment variables for all configuration
- Implement proper error handling and retries
- Set up rate limiting and API quotas
- Configure CORS for frontend integration
- Use HTTPS in production
## 📞 Support
This Data Science Engine is designed to be integrated with backend applications that handle:
- QuickBooks API connections
- User interface and workflows
- Data persistence and management
- External integrations
The engine focuses purely on AI/ML capabilities and provides a clean JSON API for backend integration.
## 🔧 Troubleshooting
### Common Issues
1. **API Key Error**: Ensure `GROQ_API_KEY` is set in your `.env` file
2. **Port Already in Use**: Kill existing process with `pkill -f "python main.py"`
3. **Import Errors**: Install dependencies with `pip install -r requirements.txt`
4. **Rate Limiting**: The system includes built-in rate limiting to prevent API quota exhaustion
### Logs
Check the application logs for detailed error information:
```bash
tail -f app.log
```
-142
View File
@@ -1,142 +0,0 @@
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel
class AddressRequest(BaseModel):
    """Address fields used for a receipt's billing or shipping address."""

    province: str
    city: str
    postal_code: str
    # Optional for callers; falls back to "Canada".
    country: str = "Canada"
class ReceiptRequest(BaseModel):
    """Receipt payload: file metadata, amounts, vendor and tax-rule fields."""

    # Identity and file metadata
    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime

    # Monetary values
    amount: float
    tax: float

    # Descriptive fields
    vendor: str
    category: str
    description: str

    # Tax rule fields (all optional, with defaults)
    billing_address: Optional[AddressRequest] = None
    shipping_address: Optional[AddressRequest] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
class TransactionRequest(BaseModel):
    """Transaction payload with optional currency / FX information."""

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str

    # Tax rule fields
    currency: str = "CAD"
    # No FX rate is assumed unless the caller supplies one.
    fx_rate: Optional[float] = None
class AssetRequest(BaseModel):
    """Asset description supplied inside a ``DepreciationRequest``."""

    id: str
    name: str

    # Purchase details
    purchase_date: datetime
    purchase_amount: float

    # Depreciation parameters
    useful_life_years: int
    residual_value: float
    cca_rate: float
    asset_class: str
class MatchingRequest(BaseModel):
    """Lists of receipt ids and transaction ids submitted for matching."""

    receipt_ids: List[str]
    transaction_ids: List[str]
class MatchResponse(BaseModel):
    """One receipt/transaction pairing with its score and reason."""

    # Which records were paired
    receipt_id: str
    transaction_id: str

    # Outcome of the match
    confidence_score: float
    match_reason: str

    # Receipt-side details
    receipt_vendor: str
    receipt_amount: float
    receipt_description: str
    receipt_category: str
    receipt_tax_amount: float

    # Transaction-side details
    transaction_vendor: str
    transaction_amount: float
class MatchingResponse(BaseModel):
    """Collection of ``MatchResponse`` items plus a free-form stats dict."""

    matches: List[MatchResponse]
    stats: dict
class ApprovalRequest(BaseModel):
    """Approve-or-reject decision for a match, with an optional reason."""

    match_id: str
    approved: bool
    reason: Optional[str] = None
class RuleRequest(BaseModel):
    """Definition of a rule: a name, a condition and an action."""

    name: str
    condition: str
    action: str
    # Origin of the rule; defaults to "user".
    source: str = "user"
class DocumentUploadResponse(BaseModel):
    """Metadata describing an uploaded document."""

    file_id: str
    filename: str
    upload_date: datetime
    status: str
class DocumentProcessResponse(BaseModel):
    """Extraction result for a document; only the id and success flag are required."""

    file_id: str
    extraction_success: bool

    # Extracted values (each may be absent)
    vendor: Optional[str] = None
    description: Optional[str] = None
    total_amount: Optional[float] = None
    tax_amount: Optional[float] = None
    date: Optional[str] = None
    category: Optional[str] = None
    confidence: Optional[float] = None

    # Error text, if any
    error: Optional[str] = None
# New tax-related models
class TaxCalculationRequest(BaseModel):
    """Identifies the receipt (and optionally a transaction) for a tax calculation."""

    receipt_id: str
    transaction_id: Optional[str] = None
class TaxCalculationResponse(BaseModel):
    """Tax calculation output for one receipt."""

    receipt_id: str
    rules_applied: List[str]
    sales_tax: dict
    # FX analysis is the only optional section.
    fx_analysis: Optional[dict] = None
    meals_entertainment: dict
class DepreciationRequest(BaseModel):
    """Depreciation request for one asset and one year."""

    asset: AssetRequest
    year: int
    # Either "straight_line" or "cca"
    method: str
class DepreciationResponse(BaseModel):
    """Depreciation figures for an asset in a given year, plus a status flag."""

    # Echo of the request
    asset_id: str
    year: int
    method: str

    # Computed figures
    depreciation: float
    book_value: float
    total_depreciation: Optional[float] = None

    # Status
    success: bool
    error: Optional[str] = None
class MatchSpecificRequest(BaseModel):
    """File ids to match, together with a categorization id."""

    file_ids: List[str]
    categorization_id: str
View File
+13
View File
@@ -0,0 +1,13 @@
from pydantic_settings import BaseSettings
from typing import Optional
class Settings(BaseSettings):
    """Application settings loaded through pydantic-settings.

    The nested ``Config`` names ``.env`` as the env file to read.
    """

    # Optional values; each defaults to None when not provided.
    database_url: Optional[str] = None
    secret_key: Optional[str] = None
    api_key: Optional[str] = None

    # Default model identifier.
    model: str = "openai/gpt-oss-120b"

    # Declared without a default, so a value must be supplied.
    GROQ_API_KEY: str

    class Config:
        env_file = ".env"
# Module-level Settings instance, constructed once when this module is imported.
settings = Settings()
+139
View File
@@ -0,0 +1,139 @@
from typing import Annotated
from fastapi import Depends
from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, sessionmaker
# SQLite database URL pointing at the relative path ./sql_app.db.
SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db"
# Engine shared by the whole module. check_same_thread=False is forwarded to the
# SQLite driver via connect_args.
# NOTE(review): presumably needed because sessions are used across threads - confirm.
engine = create_engine(
SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
# Session factory bound to the engine, with autocommit and autoflush turned off.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db():
    """Yield a new database session and close it when the consumer is finished."""
    session = SessionLocal()
    try:
        yield session
    finally:
        # Runs whether the consumer finished normally or raised.
        session.close()
# Annotated alias: a Session resolved through Depends(get_db).
db_dependency = Annotated[Session, Depends(get_db)]
# Declarative base class that the ORM model classes in this module inherit from.
Base = declarative_base()
def create_db_tables():
    """Create any ORM tables that are missing from the database.

    Tables registered on ``Base.metadata`` are compared with the tables the
    database already has, and only the missing ones are created. Returning
    early whenever *any* table exists would silently skip tables for models
    added after the database file was first created.

    Failures are logged (with traceback) and deliberately swallowed so that
    a schema problem does not stop the application from starting.
    ``KeyboardInterrupt`` is logged and re-raised.
    """
    import logging

    from sqlalchemy import inspect

    logger = logging.getLogger(__name__)

    try:
        existing_tables = inspect(engine).get_table_names()
        already_present = set(existing_tables)
        missing_tables = [
            name for name in Base.metadata.tables if name not in already_present
        ]

        if not missing_tables:
            logger.info(f"Database tables already exist: {existing_tables}")
            return

        logger.info("Creating database tables...")
        # checkfirst=True makes create_all skip tables that are already
        # present, so calling it on a partially created schema is safe.
        Base.metadata.create_all(bind=engine, checkfirst=True)
        logger.info("Database tables created successfully")

    except KeyboardInterrupt:
        logger.warning("Database creation interrupted by user")
        raise
    except Exception as e:
        # Best effort: keep the app running, but record the full traceback.
        logger.exception(f"Error creating database tables: {e}")
def clear_all_data():
    """Delete all rows from the transaction, receipt and uploaded-file tables.

    Useful for testing. All three deletes are committed together; the
    session is closed even if a delete or the commit fails.
    """
    session = SessionLocal()
    try:
        # Deletion order: transactions, receipts, uploaded files.
        for model in (DBTransaction, DBReceipt, DBUploadedFile):
            session.query(model).delete()
        session.commit()
    finally:
        session.close()
# Transactions table
class DBTransaction(Base):
    """ORM model for the ``transactions`` table."""

    __tablename__ = "transactions"

    # Surrogate primary key and the external transaction identifier
    id = Column(Integer, primary_key=True, index=True)
    transaction_id = Column(String, index=True)

    # Required core fields
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)

    # Optional descriptive fields
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    categorisation_id = Column(String, nullable=True)
    user_id = Column(String, nullable=True)
    # Where the row came from, e.g. "csv", "image", "manual", "api"
    source = Column(String, nullable=True)

    # Additional QuickBooks CSV columns, all stored as nullable strings
    TxnId = Column(String, nullable=True)
    AccountType = Column(String, nullable=True)
    AccountNumber = Column(String, nullable=True)
    TransactionDate = Column(String, nullable=True)
    TransactionType = Column(String, nullable=True)
    ChequeNumber = Column(String, nullable=True)
    Description1 = Column(String, nullable=True)
    Description2 = Column(String, nullable=True)
    VendorId = Column(String, nullable=True)
    VendorName = Column(String, nullable=True)
    AccountId = Column(String, nullable=True)
    AccountName = Column(String, nullable=True)
# Uploaded Files table
class DBUploadedFile(Base):
    """ORM model for the ``uploaded_files`` table."""

    __tablename__ = "uploaded_files"

    id = Column(Integer, primary_key=True, index=True)
    # External identifier; unique per uploaded file
    file_id = Column(String, unique=True, index=True)

    # File metadata (all required)
    filename = Column(String, nullable=False)
    file_path = Column(String, nullable=False)
    file_type = Column(String, nullable=False)
    upload_date = Column(DateTime, nullable=False)

    # New rows start with status "uploaded" unless another value is given
    status = Column(String, nullable=False, default="uploaded")
# Receipts table
class DBReceipt(Base):
    """ORM model for the ``receipts`` table."""

    __tablename__ = "receipts"

    id = Column(Integer, primary_key=True, index=True)
    # Both identifiers are unique: one receipt row per file
    receipt_id = Column(String, unique=True, index=True)
    file_id = Column(String, unique=True, index=True)

    # Required core fields
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)

    # Optional extracted details
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    confidence = Column(Float, nullable=True)
    # NOTE(review): string column; presumably "True"/"False" like is_depreciable - confirm
    extraction_success = Column(String, nullable=True)
    error_message = Column(String, nullable=True)

    # Tax and depreciation fields
    receipt_currency = Column(String, nullable=True)
    receipt_location = Column(String, nullable=True)
    calculated_tax = Column(Float, nullable=True)
    # Store as string "True"/"False"
    is_depreciable = Column(String, nullable=True)
    # Name/description of the asset
    name_of_asset = Column(String, nullable=True)
    cca_rate = Column(Float, nullable=True)
    useful_life = Column(Integer, nullable=True)
    residual_value = Column(Float, nullable=True)
+456 -397
View File
File diff suppressed because it is too large Load Diff
+351
View File
@@ -0,0 +1,351 @@
from dataclasses import dataclass
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel
@dataclass
class Address:
    """Postal address used by the tax calculations."""

    province: str
    city: str
    postal_code: str
    # Only field with a default
    country: str = "Canada"
@dataclass
class Receipt:
    """In-memory receipt record: file metadata, amounts and tax-rule fields."""

    # Identity and file metadata
    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime

    # Monetary values
    amount: float
    tax: float

    # Descriptive fields
    vendor: str
    category: str
    description: str

    # Tax rule fields (optional, with defaults)
    billing_address: Optional[Address] = None
    shipping_address: Optional[Address] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
@dataclass
class Transaction:
    """In-memory transaction record, including optional QuickBooks CSV fields."""

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str

    # Tax rule fields
    currency: str = "CAD"
    fx_rate: Optional[float] = None

    # Origin of the transaction, e.g. "csv", "image", "manual", "api"
    source: Optional[str] = None

    # QuickBooks CSV fields (all optional strings)
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
@dataclass
class Asset:
    """Asset record used as input to depreciation calculations."""

    id: str
    name: str

    # Purchase details
    purchase_date: datetime
    purchase_amount: float

    # Depreciation parameters
    useful_life_years: int
    residual_value: float
    # Capital Cost Allowance rate
    cca_rate: float
    asset_class: str
@dataclass
class Match:
    """A receipt paired with a transaction, plus score, reason and tax analysis."""

    receipt: Receipt
    transaction: Transaction
    confidence_score: float
    match_reason: str
    # Optional tax analysis payload; None when not provided
    tax_analysis: Optional[dict] = None
class AddressRequest(BaseModel):
    """Address fields used for a receipt's billing or shipping address."""

    province: str
    city: str
    postal_code: str
    # Optional for callers; falls back to "Canada".
    country: str = "Canada"
class ReceiptRequest(BaseModel):
    """Receipt payload: file metadata, amounts, vendor and tax-rule fields."""

    # Identity and file metadata
    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime

    # Monetary values
    amount: float
    tax: float

    # Descriptive fields
    vendor: str
    category: str
    description: str

    # Tax rule fields (all optional, with defaults)
    billing_address: Optional[AddressRequest] = None
    shipping_address: Optional[AddressRequest] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
class TransactionRequest(BaseModel):
    """Transaction payload, including optional QuickBooks CSV fields."""

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str

    # Tax rule fields
    currency: str = "CAD"
    fx_rate: Optional[float] = None

    # Origin of the transaction, e.g. "csv", "image", "manual", "api"
    source: Optional[str] = None

    # QuickBooks CSV fields (all optional strings)
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
class AssetRequest(BaseModel):
    """Asset description supplied inside a ``DepreciationRequest``."""

    id: str
    name: str

    # Purchase details
    purchase_date: datetime
    purchase_amount: float

    # Depreciation parameters
    useful_life_years: int
    residual_value: float
    cca_rate: float
    asset_class: str
class MatchingRequest(BaseModel):
    """Lists of receipt ids and transaction ids submitted for matching."""

    receipt_ids: List[str]
    transaction_ids: List[str]
class MatchResponse(BaseModel):
    """One receipt/transaction pairing with score, tax analysis and transaction metadata."""

    # Which records were paired
    receipt_id: str
    transaction_id: str

    # Outcome of the match
    confidence_score: float
    match_reason: str

    # Receipt-side details
    receipt_vendor: str
    receipt_amount: float
    receipt_description: str
    receipt_category: str
    receipt_tax_amount: float

    # Transaction-side details
    transaction_vendor: str
    transaction_amount: float

    # Optional analysis and review flags
    tax_analysis: Optional[dict] = None
    flag_for_review: Optional[bool] = None
    auto_approve: Optional[bool] = None

    # Transaction metadata: source of the transaction
    transaction_source: Optional[str] = None

    # QuickBooks CSV fields from transaction
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
    # NOTE(review): appears to overlap with transaction_source - confirm which one callers populate
    Source: Optional[str] = None
class MatchingResponse(BaseModel):
    """Collection of ``MatchResponse`` items plus a free-form stats dict."""

    matches: List[MatchResponse]
    stats: dict
class ApprovalRequest(BaseModel):
    """Approve-or-reject decision for a match, with an optional reason."""

    match_id: str
    approved: bool
    reason: Optional[str] = None
class RuleRequest(BaseModel):
    """Definition of a rule: a name, a condition and an action."""

    name: str
    condition: str
    action: str
    # Origin of the rule; defaults to "user".
    source: str = "user"
class DocumentUploadResponse(BaseModel):
    """Metadata describing an uploaded document, including its file type."""

    file_id: str
    filename: str
    file_type: str
    upload_date: datetime
    status: str
class AIRules(BaseModel):
    """A single AI rule expressed as a condition/action pair of strings."""

    condition: str
    action: str
class DocumentProcessRequest(BaseModel):
    """Optional inputs for processing a document; every field may be omitted."""

    file_id: Optional[str] = None
    # Format: "State/Province, Country" (e.g., "Ontario, Canada")
    user_location: Optional[str] = None
    ai_rules: Optional[List[AIRules]] = None
class DocumentProcessResponse(BaseModel):
    """Extraction result for a document, including tax and depreciation fields."""

    # Required identifiers and status
    file_id: str
    receipt_id: str
    extraction_success: bool

    # Extracted values (each may be absent)
    vendor: Optional[str] = None
    description: Optional[str] = None
    total_amount: Optional[float] = None
    tax_amount: Optional[float] = None
    date: Optional[str] = None
    category: Optional[str] = None
    confidence: Optional[float] = None
    error: Optional[str] = None

    receipt_currency: Optional[str] = "CAD"
    # Location from receipt (e.g., "Ontario, Canada" or "California, USA")
    receipt_location: Optional[str] = None
    # Calculated sales tax if not clearly shown
    calculated_tax: Optional[float] = None

    # Whether item is a depreciable asset
    is_depreciable: Optional[bool] = None
    # Name/description of the asset if depreciable
    name_of_asset: Optional[str] = None
    # CCA rate for tax depreciation (e.g., 0.30 for 30%)
    cca_rate: Optional[float] = None
    # Useful life in years for straight-line depreciation
    useful_life: Optional[int] = None
    # Residual value for straight-line depreciation
    residual_value: Optional[float] = None
# New tax-related models
class TaxCalculationRequest(BaseModel):
    """Identifies the receipt (and optionally a transaction) for a tax calculation."""

    receipt_id: str
    transaction_id: Optional[str] = None
class TaxCalculationResponse(BaseModel):
    """Tax calculation output for one receipt."""

    receipt_id: str
    rules_applied: List[str]
    sales_tax: dict
    # FX analysis is the only optional section.
    fx_analysis: Optional[dict] = None
    meals_entertainment: dict
class DepreciationRequest(BaseModel):
    """Depreciation request for one asset and one year."""

    asset: AssetRequest
    year: int
    # Either "straight_line" or "cca"
    method: str
class DepreciationResponse(BaseModel):
    """Depreciation figures for an asset in a given year, plus a status flag."""

    # Echo of the request
    asset_id: str
    year: int
    method: str

    # Computed figures
    depreciation: float
    book_value: float
    total_depreciation: Optional[float] = None

    # Status
    success: bool
    error: Optional[str] = None
class CityInfo(BaseModel):
    """City record nested inside ``UserTaxInfo``."""

    id: int
    name: str

    # Parent state and country references
    state_id: int
    state_code: str
    country_id: int
    country_code: str

    # Coordinates are optional and carried as strings
    latitude: Optional[str] = None
    longitude: Optional[str] = None
class StateInfo(BaseModel):
    """State/province record nested inside ``UserTaxInfo``."""

    id: int
    name: str

    # Parent country reference
    country_id: int
    country_code: str

    state_code: str
class CountryInfo(BaseModel):
    """Country record nested inside ``UserTaxInfo``."""

    # Required fields
    id: int
    name: str
    iso3: str
    iso2: str
    phone_code: str
    capital: str
    currency: str

    # Optional descriptive fields
    native: Optional[str] = None
    region: Optional[str] = None
    subregion: Optional[str] = None
    emoji: Optional[str] = None
    emojiU: Optional[str] = None
class UserTaxInfo(BaseModel):
    """A user's company and address details for location-based tax calculations."""

    id: int
    user_id: int
    company_name: str

    # Tax identifier; the type defaults to "EIN"
    tax_id: Optional[str] = ""
    tax_id_type: Optional[str] = "EIN"

    # Postal address; city, state and country are required nested records
    address_line_1: Optional[str] = ""
    address_line_2: Optional[str] = ""
    city: CityInfo
    state: StateInfo
    zip_postal_code: Optional[str] = ""
    country: CountryInfo

    include_on_invoices: Optional[int] = 1

    # Timestamps are carried as optional strings
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
class MatchSpecificRequest(BaseModel):
    """File ids to match, with a categorization id and optional location, tax info and AI rules."""

    file_ids: List[str]
    categorization_id: str
    # Kept for backward compatibility
    user_location: Optional[str] = "Canada"
    user_tax_info: Optional[UserTaxInfo] = None
    ai_rules: Optional[List[AIRules]] = None
View File
+173 -73
View File
@@ -4,8 +4,8 @@ from typing import List, Tuple
import groq
import config
from models import Match, Receipt, Transaction
from config import settings
from schemas import Match, Receipt, Transaction
# Set up logging
logging.basicConfig(level=logging.INFO)
@@ -14,8 +14,8 @@ logger = logging.getLogger(__name__)
class AIMatcher:
def __init__(self, use_batch_matching=True):
self.client = groq.Groq(api_key=config.GROQ_API_KEY)
self.model = "llama3-8b-8192"
self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
self.model = settings.model
self.max_retries = 3
self.retry_delay = 2 # seconds - increased for rate limiting
self.rate_limit_delay = 1.0 # seconds between API calls
@@ -116,7 +116,7 @@ class AIMatcher:
for i, transaction in enumerate(candidates):
transaction_amount_abs = abs(transaction.amount)
date_diff = abs((receipt.receipt_date - transaction.transaction_date).days)
amount_diff = abs(receipt.amount - transaction_amount_abs)
amount_diff = abs(receipt.amount - transaction_amount_abs - receipt.tax)
amount_percent_diff = (
(amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
)
@@ -127,11 +127,12 @@ Candidate {i + 1}:
- Amount: ${transaction.amount} (absolute: ${transaction_amount_abs})
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")} ({date_diff} days difference)
- Notes: {transaction.notes}
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%)
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%) Taking in account receipt tax
"""
logger.info(f"\nThis is the receipt: {receipt}\n")
logger.info(f"\nCandidate text: {candidates_text}\n")
prompt = f"""
You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
prompt = f"""You are an expert at matching receipts to bank transactions. Your PRIMARY goal is to find the candidate with the CLOSEST AMOUNT match.
RECEIPT TO MATCH:
- Vendor: {receipt.vendor}
@@ -143,25 +144,52 @@ RECEIPT TO MATCH:
CANDIDATE TRANSACTIONS:
{candidates_text}
SCORING CRITERIA:
- Perfect matches (same vendor, amount, date): 0.95-1.0
- High confidence (minor differences): 0.8-0.94
- Medium confidence (moderate differences): 0.6-0.79
- Low confidence (significant differences): 0.4-0.59
- Very low confidence (major differences): 0.2-0.39
- Minimal similarity: 0.1-0.19
- No meaningful similarity: 0.0-0.09
CRITICAL INSTRUCTIONS FOR SELECTION:
1. FIRST: Find the candidate(s) with the SMALLEST amount percentage difference
2. ONLY if multiple candidates have similar amounts (within 2% of each other), THEN consider vendor/date/notes
3. USE THE PERCENTAGE DIFFERENCE PROVIDED for each candidate - DO NOT calculate yourself
4. IGNORE vendor/description matches if amounts are far apart (>20% difference)
5. The candidate with the closest amount is almost always the correct match
Consider vendor name similarity, amount accuracy, date proximity, and description/notes relevance.
SCORING CRITERIA - AMOUNT DIFFERENCE IS 90% OF THE DECISION:
IMPORTANT: You MUST return the candidate with the highest match score, even if it's very low. Never return NONE.
Return ONLY the best match in this exact format:
CANDIDATE_NUMBER|CONFIDENCE_SCORE|REASON
Step 1: Calculate BASE SCORE using the provided amount percentage difference:
- 0-1% difference: Base score = 0.95
- 1-2% difference: Base score = 0.90
- 2-3% difference: Base score = 0.85
- 3-5% difference: Base score = 0.75
- 5-7% difference: Base score = 0.65
- 7-10% difference: Base score = 0.55
- 10-15% difference: Base score = 0.40
- 15-20% difference: Base score = 0.25
- 20-30% difference: Base score = 0.15
- 30-50% difference: Base score = 0.08
- 50-100% difference: Base score = 0.03
- >100% difference: Base score = 0.01
Example: 3|0.87|Same vendor name, exact amount match, 1 day apart
Example of low match: 5|0.15|Best available option despite significant differences in vendor and amount
"""
Step 2: ADJUST the base score (±0.10 maximum):
- Vendor exact match: +0.10
- Vendor similar/partial match: +0.05
- Date within 7 days: +0.05
- Date within 30 days: +0.02
- Description/notes keywords match: +0.02
- Vendor completely different: -0.05
- Date >90 days apart: -0.03
Step 3: Ensure final score is between 0.0 and 1.0
CRITICAL: You MUST return valid JSON only. No explanations, no text before or after.
Return format:
{{"candidate_number": 1, "confidence_score": 0.65, "reason": "5.8% amount difference with similar vendor"}}
Another example:
{{"candidate_number": 2, "confidence_score": 0.01, "reason": "9850% amount difference, extremely poor match"}}
Return ONLY JSON for the best candidate:"""
# logger.info(f"This is the prompt: {prompt}")
for attempt in range(self.max_retries):
try:
result = self._call_groq_api_with_timeout(
@@ -179,6 +207,22 @@ Example of low match: 5|0.15|Best available option despite significant differenc
if 0 <= candidate_num < len(candidates):
best_transaction = candidates[candidate_num]
# Validate the match - catch AI errors with extreme amount differences
transaction_amount_abs = abs(best_transaction.amount)
amount_diff = abs(receipt.amount - transaction_amount_abs)
amount_percent_diff = (
(amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
)
# If amount difference is >100%, force very low score
if amount_percent_diff > 100:
logger.warning(
f"Overriding AI score for extreme mismatch: {receipt.amount} vs {transaction_amount_abs} ({amount_percent_diff:.1f}% diff)"
)
score = min(0.05, score) # Cap at 0.05 for extreme mismatches
reason = f"{amount_percent_diff:.1f}% amount difference, extreme mismatch"
logger.info(
f"AI selected candidate {candidate_num + 1}: {best_transaction.vendor} (score: {score:.3f})"
)
@@ -204,55 +248,93 @@ Example of low match: 5|0.15|Best available option despite significant differenc
return None
def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]:
    """Parse the model's reply for the single best match.

    Formats are tried in order, most structured first:

    1. the whole reply as a JSON object;
    2. a flat JSON object embedded in surrounding text (any key order);
    3. the three fields picked out individually with regexes;
    4. the legacy ``CANDIDATE|SCORE|REASON`` pipe-delimited format.

    Args:
        result: Raw text returned by the model.

    Returns:
        ``(candidate_index, score, reason)``. The index is 0-based (the
        model numbers candidates from 1) and the score is clamped to
        ``[0.0, 1.0]``. When nothing can be parsed the tuple is
        ``(-1, 0.0, "Parse error: ...")``.
    """
    import json
    import re

    def _clamp(value) -> float:
        score = float(value)
        if score != score:  # NaN would otherwise clamp to 1.0 via min()/max()
            return 0.0
        return max(0.0, min(1.0, score))

    def _from_mapping(data):
        # Only a JSON object carries the fields; lists/scalars and null or
        # non-numeric values are treated as "not parseable" instead of raising.
        if not isinstance(data, dict):
            return None
        try:
            index = int(data.get("candidate_number", -1)) - 1
            score = _clamp(data.get("confidence_score", 0.0))
        except (TypeError, ValueError, OverflowError):
            return None
        return index, score, str(data.get("reason", "No reason provided"))

    result = result.strip()
    logger.debug(f"Parsing single match response: {result}")

    if result.upper().startswith("NONE"):
        # The prompt forbids this answer; report it as a parse failure.
        logger.warning(
            "AI returned NONE despite being instructed to always return best match"
        )
        return -1, 0.0, "AI returned NONE unexpectedly"

    # 1. Whole reply is JSON.
    try:
        parsed = _from_mapping(json.loads(result))
    except json.JSONDecodeError:
        parsed = None
    if parsed is not None:
        logger.debug(
            f"Parsed JSON: candidate={parsed[0]}, score={parsed[1]}, reason={parsed[2]}"
        )
        return parsed

    # 2. A flat {...} object somewhere inside the reply.
    for snippet in re.findall(r"\{[^{}]*\}", result):
        if '"candidate_number"' not in snippet:
            continue
        try:
            parsed = _from_mapping(json.loads(snippet))
        except json.JSONDecodeError:
            continue
        if parsed is not None:
            logger.debug(
                f"Parsed extracted JSON: candidate={parsed[0]}, score={parsed[1]}, reason={parsed[2]}"
            )
            return parsed

    # 3. JSON-like text that does not parse (truncated, braces inside strings).
    candidate_match = re.search(r'"candidate_number"\s*:\s*(\d+)', result)
    score_match = re.search(r'"confidence_score"\s*:\s*(-?\d+(?:\.\d+)?)', result)
    reason_match = re.search(r'"reason"\s*:\s*"([^"]*)"', result)
    if candidate_match and score_match and reason_match:
        candidate_num = int(candidate_match.group(1)) - 1
        score = _clamp(score_match.group(1))
        reason = reason_match.group(1)
        logger.debug(
            f"Parsed fields individually: candidate={candidate_num}, score={score}, reason={reason}"
        )
        return candidate_num, score, reason

    # 4. Old pipe-delimited format, kept for backwards compatibility.
    if result.count("|") >= 2:
        candidate_str, score_str, reason = (
            part.strip() for part in result.split("|", 2)
        )
        try:
            number_match = re.search(r"\d+", candidate_str)
            if number_match is None:
                raise ValueError("No candidate number found")
            candidate_num = int(number_match.group()) - 1  # to 0-based index
            score_clean = "".join(c for c in score_str if c.isdigit() or c == ".")
            score = _clamp(score_clean) if score_clean else 0.0
        except ValueError as fallback_error:
            logger.warning(f"Fallback parsing also failed: {fallback_error}")
        else:
            logger.debug(
                f"Parsed (fallback): candidate={candidate_num}, score={score}, reason={reason}"
            )
            return candidate_num, score, reason

    logger.warning(f"Could not parse single match response: {result[:200]}")
    return -1, 0.0, f"Parse error: {result[:50]}..."
def _filter_candidates(
@@ -260,18 +342,29 @@ Example of low match: 5|0.15|Best available option despite significant differenc
) -> List[Transaction]:
"""Filter transactions to create a reasonable candidate list"""
candidates = []
amount_threshold = receipt.amount * 2.0 # 200% threshold - very inclusive
for transaction in transactions:
# Use absolute value for transaction amount comparison
transaction_amount_abs = abs(transaction.amount)
# Only exclude transactions with obviously different amounts
if abs(receipt.amount - transaction_amount_abs) <= amount_threshold:
amount_diff = abs(receipt.amount - transaction_amount_abs)
# Calculate percentage difference
if receipt.amount > 0:
percent_diff = (amount_diff / receipt.amount) * 100
else:
percent_diff = 0
# Be more restrictive: exclude transactions with >300% difference
# This prevents extreme mismatches while still being generous
if percent_diff <= 300:
candidates.append(transaction)
else:
logger.debug(
f"Filtered out transaction ${transaction_amount_abs} for receipt ${receipt.amount} ({percent_diff:.1f}% difference)"
)
logger.debug(
f"Filtered {len(transactions)} transactions to {len(candidates)} candidates"
f"Filtered {len(transactions)} transactions to {len(candidates)} candidates for receipt ${receipt.amount}"
)
return candidates
@@ -338,6 +431,10 @@ Example of low match: 5|0.15|Best available option despite significant differenc
Consider description and category similarity in your scoring.
THINGS TO NOTE:
The most important factor to consider is the Amount for both the transaction and the receipt, the closer the amounts, the higher the score.
If the amounts are different or not close return a low score (0-0.1) based on other factors.
IMPORTANT: Return ONLY the score and reason separated by a pipe character.
Format: [score]|[reason]
Example: 0.85|Same vendor, same amount, 2 days apart
@@ -352,8 +449,8 @@ Example of low match: 5|0.15|Best available option despite significant differenc
# Parse the result - handle multiple formats
score, reason = self._parse_ai_response(result)
logger.debug(f"AI Response: {result}")
logger.debug(f"Parsed: score={score}, reason={reason}")
# logger.debug(f"AI Response: {result}")
# logger.debug(f"Parsed: score={score}, reason={reason}")
return score, reason
@@ -451,9 +548,12 @@ Example of low match: 5|0.15|Best available option despite significant differenc
try:
response = self.client.chat.completions.create(
model=self.model,
messages=[{"role": "user", "content": prompt}],
max_tokens=200,
temperature=0.1,
messages=[
{"role": "system", "content": "You are a JSON-only response assistant. Return only valid JSON, no explanations."},
{"role": "user", "content": prompt}
],
max_tokens=150,
temperature=0,
)
return response.choices[0].message.content.strip()
except Exception as e:
+2 -2
View File
@@ -1,8 +1,8 @@
from dataclasses import dataclass
from typing import Any, Dict, List
from models import Receipt, Transaction
from tax_rules_engine import TaxRulesEngine
from schemas import Receipt, Transaction
from services.tax_rules_engine import TaxRulesEngine
@dataclass
+273
View File
@@ -0,0 +1,273 @@
import json
import logging
from typing import Dict, List, Optional
import groq
from config import settings
from schemas import Match
logger = logging.getLogger(__name__)
class AIRulesMatcher:
    """
    AI-powered rules engine for post-matching evaluation.

    Uses LLM to intelligently apply custom rules and determine if matches should be:
    - Flagged for manual review (flag_for_review=True)
    - Auto-approved (auto_approve=True)

    Every failure mode (LLM error, unparseable reply, unexpected exception)
    resolves to "flag for review" so nothing is auto-approved by accident.
    """

    def __init__(self):
        """Create the Groq client and select the model named in settings."""
        self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
        self.model = settings.model

    def apply_rules_to_matches(
        self, matches: List["Match"], ai_rules: Optional[List[Dict]] = None
    ) -> List["Match"]:
        """
        Apply AI rules to all matches and add flag_for_review and auto_approve fields.

        Args:
            matches: List of Match objects from the matching engine
            ai_rules: Optional list of custom rules (format: [{"condition": str, "action": str}])

        Returns:
            The same Match objects, mutated in place: ``tax_analysis`` gains
            ``flag_for_review``, ``auto_approve``, ``rules_applied`` and
            ``rule_reasons``, and ``match_reason`` gets a marker appended when
            a flag is set.
        """
        if not matches:
            return matches

        logger.info(
            f"Applying AI rules to {len(matches)} matches with {len(ai_rules) if ai_rules else 0} custom rules"
        )

        # Built-in rule: currency mismatch should always flag for review
        builtin_rules = [
            {
                "condition": "receipt currency differs from transaction currency",
                "action": "flag_for_review",
            }
        ]

        # Combine built-in rules with user-provided rules
        all_rules = builtin_rules + list(ai_rules or [])

        for match in matches:
            try:
                rule_evaluation = self._evaluate_rules_for_match(match, all_rules)

                if match.tax_analysis is None:
                    match.tax_analysis = {}

                match.tax_analysis["flag_for_review"] = rule_evaluation[
                    "flag_for_review"
                ]
                match.tax_analysis["auto_approve"] = rule_evaluation["auto_approve"]
                match.tax_analysis["rules_applied"] = rule_evaluation["rules_applied"]
                match.tax_analysis["rule_reasons"] = rule_evaluation["reasons"]

                # Surface the outcome in the human-readable match reason
                if rule_evaluation["flag_for_review"]:
                    self._append_reason_marker(match, "🚩 FLAGGED FOR REVIEW")
                if rule_evaluation["auto_approve"]:
                    self._append_reason_marker(match, "✅ AUTO-APPROVED")

                logger.info(
                    f"Match receipt={match.receipt.id} transaction={match.transaction.id}: "
                    f"flag_for_review={rule_evaluation['flag_for_review']}, "
                    f"auto_approve={rule_evaluation['auto_approve']}"
                )

            except Exception as e:
                logger.error(f"Error applying rules to match: {str(e)}")
                # Fail safe: flag for review if rule processing fails
                if match.tax_analysis is None:
                    match.tax_analysis = {}
                match.tax_analysis["flag_for_review"] = True
                match.tax_analysis["auto_approve"] = False
                # Keep the same key set as the success path
                match.tax_analysis["rules_applied"] = []
                match.tax_analysis["rule_reasons"] = [
                    f"Rule evaluation error: {str(e)}"
                ]

        return matches

    @staticmethod
    def _append_reason_marker(match, marker: str) -> None:
        """Append ``marker`` to ``match.match_reason``, tolerating an empty/None reason."""
        if match.match_reason:
            match.match_reason = f"{match.match_reason} | {marker}"
        else:
            match.match_reason = marker

    def _evaluate_rules_for_match(self, match: "Match", rules: List[Dict]) -> Dict:
        """
        Use LLM to evaluate all rules for a single match.

        Returns:
            {
                "flag_for_review": bool,
                "auto_approve": bool,
                "rules_applied": List[str],
                "reasons": List[str]
            }
            If the LLM call fails, the fail-safe result (flagged for review,
            not auto-approved) is returned instead of raising.
        """
        # Build context about the match
        match_context = self._build_match_context(match)

        # Build rules context
        rules_context = self._build_rules_context(rules)

        # Create prompt for LLM
        prompt = f"""You are a financial matching rules engine. Analyze the following receipt-to-transaction match and apply the specified rules.
MATCH DETAILS:
{match_context}
RULES TO APPLY:
{rules_context}
INSTRUCTIONS:
1. Evaluate each rule's condition against the match details
2. If a rule's condition is TRUE, apply the action:
- If action is "flag_for_review" or "review" → set flag_for_review = true
- If action is "auto_approve" or "approve" → set auto_approve = true
- For other actions, determine if they imply review or approval
3. If BOTH flag_for_review and auto_approve are triggered, flag_for_review takes priority
4. If NO rules match, set both to false (default behavior)
IMPORTANT BUILT-IN RULE:
- If receipt currency differs from transaction currency → ALWAYS set flag_for_review = true
Return ONLY a valid JSON object with this exact format:
{{
"flag_for_review": boolean,
"auto_approve": boolean,
"rules_applied": ["list of rule conditions that matched"],
"reasons": ["list of reasons for the decisions"]
}}
"""

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a financial rules evaluation assistant. You analyze transaction matches and apply business rules. Always respond with valid JSON only.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=500,
            )

            result_text = response.choices[0].message.content.strip()
            result = self._parse_llm_response(result_text)

            # The prompt asks the model to resolve conflicts itself; enforce
            # it here as well so review always wins over approval.
            if result["flag_for_review"] and result["auto_approve"]:
                logger.warning(
                    "Both flag_for_review and auto_approve were true, prioritizing flag_for_review"
                )
                result["auto_approve"] = False
                result["reasons"].append(
                    "Conflicting rules: prioritized manual review over auto-approval"
                )

            return result

        except Exception as e:
            logger.error(f"LLM evaluation failed: {str(e)}")
            # Fail safe: flag for review
            return {
                "flag_for_review": True,
                "auto_approve": False,
                "rules_applied": [],
                "reasons": [f"Error evaluating rules: {str(e)}"],
            }

    def _build_match_context(self, match: "Match") -> str:
        """Build a text description of the match for the LLM"""
        receipt = match.receipt
        transaction = match.transaction

        context = f"""Receipt Information:
- ID: {receipt.id}
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Tax: ${receipt.tax:.2f}
- Category: {receipt.category}
- Description: {receipt.description}
- Date: {receipt.receipt_date}
- Currency: {receipt.currency}
Transaction Information:
- ID: {transaction.id}
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Date: {transaction.transaction_date}
- Notes: {transaction.notes}
- Currency: {transaction.currency}
Match Quality:
- Confidence Score: {match.confidence_score:.2%}
- Match Reason: {match.match_reason}
"""

        # Add tax analysis if available. default=str is deliberate: this text
        # is only read by the LLM, so a non-JSON value should be stringified
        # rather than abort the evaluation.
        if match.tax_analysis:
            context += f"\nTax Analysis:\n{json.dumps(match.tax_analysis, indent=2, default=str)}"

        return context

    def _build_rules_context(self, rules: List[Dict]) -> str:
        """Build a formatted, numbered list of rules for the LLM"""
        if not rules:
            return "No custom rules provided. Apply default evaluation."

        return "".join(
            f"{idx}. IF {rule.get('condition', '')} → THEN {rule.get('action', '')}\n"
            for idx, rule in enumerate(rules, 1)
        )

    @staticmethod
    def _as_bool(value) -> bool:
        """Coerce an LLM-supplied flag to bool; ``"false"``/``"no"`` strings are False."""
        if isinstance(value, str):
            return value.strip().lower() in {"true", "yes", "1"}
        return bool(value)

    @staticmethod
    def _as_list(value) -> list:
        """Return ``value`` as a list (None -> [], scalar -> [scalar])."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    def _parse_llm_response(self, response_text: str) -> Dict:
        """Parse and validate LLM JSON response

        Accepts bare JSON, JSON inside a markdown code fence, or a JSON object
        surrounded by extra prose. Anything that does not yield a JSON object
        produces the fail-safe result (flag for review).

        Returns:
            Dict with ``flag_for_review`` / ``auto_approve`` as real booleans
            and ``rules_applied`` / ``reasons`` as lists.
        """
        text = response_text
        # Remove markdown code blocks if present
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        text = text.strip()

        try:
            try:
                result = json.loads(text)
            except json.JSONDecodeError:
                # Retry on the outermost {...} span in case of extra prose
                start, end = text.find("{"), text.rfind("}")
                if start == -1 or end <= start:
                    raise
                result = json.loads(text[start : end + 1])
            if not isinstance(result, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            logger.error(f"Failed to parse LLM response as JSON: {str(e)}")
            logger.error(f"Response text: {response_text}")
            # Return safe defaults
            return {
                "flag_for_review": True,  # Fail safe to manual review
                "auto_approve": False,
                "rules_applied": [],
                "reasons": ["Failed to parse LLM response"],
            }

        # Fill in missing fields and normalise types. bool("false") is True,
        # so string flags need explicit handling.
        result["flag_for_review"] = self._as_bool(result.get("flag_for_review", False))
        result["auto_approve"] = self._as_bool(result.get("auto_approve", False))
        result["rules_applied"] = self._as_list(result.get("rules_applied"))
        result["reasons"] = self._as_list(result.get("reasons"))

        return result
+859
View File
@@ -0,0 +1,859 @@
import base64
import json
import logging
import os
import re
from datetime import datetime
from typing import Any, Dict
import aiofiles
import groq
import PyPDF2
from config import settings
logger = logging.getLogger(__name__)
class DocumentProcessor:
def __init__(self):
    """Set up the Groq client and the model used for receipt extraction."""
    # Vision model
    self.model = "meta-llama/llama-4-scout-17b-16e-instruct"
    self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
def _extract_first_json(self, raw: str) -> dict:
    """Extract the first valid JSON object from raw LLM output.

    Handles cases where LLM returns extra text after/before the JSON. The
    whole string is tried first; otherwise each top-level ``{...}`` span is
    located by brace matching (ignoring braces inside string literals) and
    the first span that parses is returned. A balanced span that is not
    valid JSON (e.g. ``{see below}`` in a preamble) is skipped rather than
    aborting the search.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded JSON value (whatever ``json.loads`` yields for the span).

    Raises:
        ValueError: No ``{`` is present, or the braces never balance.
        json.JSONDecodeError: Balanced spans were found but none is valid
            JSON (the error from the first span is raised).
    """
    try:
        # First try direct parsing (fastest path)
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    start = raw.find("{")
    if start == -1:
        raise ValueError("No JSON object found in LLM output")

    first_error = None
    while start != -1:
        depth = 0
        end = -1
        in_string = False
        escape_next = False
        for i in range(start, len(raw)):
            ch = raw[i]
            if in_string:
                # Backslash escapes only mean something inside a string
                if escape_next:
                    escape_next = False
                elif ch == "\\":
                    escape_next = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    end = i + 1
                    break

        if end == -1:
            # Ran off the end of the text: output was truncated or malformed
            if first_error is not None:
                raise first_error
            raise ValueError("Unbalanced JSON braces in LLM output")

        try:
            return json.loads(raw[start:end])
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
            # Resume after this span so its inner braces are not re-tried
            start = raw.find("{", end)

    raise first_error
async def process_file(
    self,
    file_path: str,
    file_type: str,
    user_location: str = None,
    ai_rules: list = None,
) -> Dict[str, Any]:
    """Route an uploaded file to the matching extractor and return its result.

    Args:
        file_path: Path to the file to process
        file_type: Type of file (jpg, pdf, etc.), compared case-insensitively
        user_location: User's location string in format "State/Province, Country" (e.g., "Ontario, Canada")
        ai_rules: List of AI rules for categorization (e.g., [{"condition": "vendor is Starbucks", "action": "Food"}])

    Returns:
        The extractor's result dict, or ``{"error": <message>}`` for an
        unsupported type or any exception raised along the way.
    """
    image_kinds = ("jpg", "jpeg", "png", "gif", "bmp")
    try:
        kind = file_type.lower()
        if kind == "pdf":
            return await self._process_pdf(file_path, user_location, ai_rules)
        if kind in image_kinds:
            return await self._process_image(file_path, user_location, ai_rules)
        raise ValueError(f"Unsupported file type: {file_type}")
    except Exception as e:
        return {"error": str(e)}
async def _process_image(
    self, image_path: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Extract data from image using Groq vision

    The image is sent as a base64 data URL together with a prompt that
    describes the JSON schema and the tax / depreciation rules; the reply is
    handed to ``_parse_extraction_result``.

    Args:
        image_path: Path to the image file
        user_location: User's location string in format "State/Province, Country" (e.g., "Ontario, Canada")
        ai_rules: List of AI rules for categorization. Only rules whose
            condition contains ``CONTAINS "<keyword>"`` and whose action
            contains ``SET_CATEGORY: <category>`` are turned into prompt
            instructions; others are ignored.

    Returns:
        The parsed extraction dict, or ``{"error": "Image processing error: ..."}``
        if anything raises.
    """
    import mimetypes

    try:
        # Encode image to base64
        base64_image = self._encode_image(image_path)

        # Label the data URL with the real media type (png, gif, ...) rather
        # than always claiming JPEG; fall back to JPEG when it is unknown.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Build user location context
        user_location_context = ""
        if user_location:
            user_location_context = f"""
USER LOCATION CONTEXT:
The user is located in {user_location}.
- If the receipt location is MISSING or UNCLEAR, use the user's location ({user_location}) for tax calculations.
- If the receipt clearly shows a different location, use the receipt's location instead.
- Apply depreciation rules based on the user's location.
"""

        # Build AI rules context for categorization
        ai_rules_context = ""
        if ai_rules:
            # Create a simple, direct instruction for each rule
            ai_rules_context = "\n "
            for rule in ai_rules:
                condition = rule.get("condition", "")
                action = rule.get("action", "")
                # Extract the keyword and category from the rule
                keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
                category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
                if keyword_match and category_match:
                    keyword = keyword_match.group(1)
                    category = category_match.group(1).strip()
                    # Create one simple instruction per line
                    ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
            ai_rules_context += "\n"

        # Create Groq vision prompt. The template uses JSON literals
        # (true/false/null) so the model is not shown invalid JSON to copy.
        prompt = f"""
Analyze this receipt image and extract the following information in JSON format.
{ai_rules_context}
JSON Format:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
"calculated_tax": 0.00,
"is_depreciable": false,
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null,
"extraction_success": true
}}
EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
- Date should be the date on the receipt
- Confidence score 0-1 based on how clear the receipt is
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as "Province/State, Country" (e.g., "Ontario, Canada" or "California, USA")
- If location not shown on receipt, return null for location (system will use user location as fallback)
TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows "Tax: $0", "Tax: $0.00", "Tax (0%)", or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null
- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
* Store calculated tax in "calculated_tax" field and set tax_amount to the calculated value
DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
* name_of_asset: Specific name/model of the asset (e.g., "2024 Honda Accord", "Dell Laptop XPS 15", "Office Desk")
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
Return only valid JSON.
"""
        logger.info(f"This is the prompt: {prompt}")

        # Call Groq vision API with correct format
        response = self.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=800,
            temperature=0.1,
        )

        # Parse response
        result_text = response.choices[0].message.content.strip()
        return self._parse_extraction_result(result_text)
    except Exception as e:
        return {"error": f"Image processing error: {str(e)}"}
def _encode_image(self, image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
async def _process_pdf(
    self, pdf_path: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Extract receipt data from a PDF via its embedded text.

    The PDF is not rendered to an image: its text layer is extracted and
    passed to ``_process_text_content``. When no text can be extracted (the
    extractor returns an empty string on any failure) an error is returned
    instead of asking the model to analyse an empty receipt.

    Args:
        pdf_path: Path to the PDF file
        user_location: User's location string in format "State/Province, Country" (e.g., "Ontario, Canada")
        ai_rules: List of AI rules for categorization

    Returns:
        The extraction dict, or ``{"error": "PDF processing error: ..."}``.
    """
    try:
        text_content = self._extract_text_from_pdf(pdf_path)
        if not text_content or not text_content.strip():
            return {
                "error": "PDF processing error: no extractable text found in PDF"
            }
        return self._process_text_content(text_content, user_location, ai_rules)
    except Exception as e:
        return {"error": f"PDF processing error: {str(e)}"}
def _extract_text_from_pdf(self, pdf_path: str) -> str:
    """Extract the text of every page of a PDF, one page per line block.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The pages' text, each followed by a newline, or ``""`` if the file
        cannot be opened or read as a PDF (the failure is logged).
    """
    try:
        with open(pdf_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding None instead of a string,
            # which would otherwise discard the text of the whole document.
            return "".join(
                f"{page.extract_text() or ''}\n" for page in pdf_reader.pages
            )
    except Exception as e:
        # Best-effort by design: the caller treats "" as "no text available".
        logger.warning(f"Could not extract text from PDF {pdf_path}: {e}")
        return ""
def _process_text_content(
    self, text_content: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Process text content using Groq (fallback for PDFs)

    Builds the same schema / tax / depreciation prompt used for images, with
    the receipt text embedded, and hands the reply to
    ``_parse_extraction_result``.

    Args:
        text_content: Extracted text from PDF
        user_location: User's location string in format "State/Province, Country" (e.g., "Ontario, Canada")
        ai_rules: List of AI rules for categorization. Only rules whose
            condition contains ``CONTAINS "<keyword>"`` and whose action
            contains ``SET_CATEGORY: <category>`` are turned into prompt
            instructions; others are ignored.

    Returns:
        The parsed extraction dict, or ``{"error": "Text processing error: ..."}``
        if anything raises.
    """
    try:
        # Build user location context
        user_location_context = ""
        if user_location:
            user_location_context = f"""
USER LOCATION CONTEXT:
The user is located in {user_location}.
- If the receipt location is MISSING or UNCLEAR, use the user's location ({user_location}) for tax calculations.
- If the receipt clearly shows a different location, use the receipt's location instead.
- Apply depreciation rules based on the user's location.
"""

        # Build AI rules context for categorization
        ai_rules_context = ""
        if ai_rules:
            # Create a simple, direct instruction for each rule
            ai_rules_context = "\n "
            for rule in ai_rules:
                condition = rule.get("condition", "")
                action = rule.get("action", "")
                # Extract the keyword and category from the rule
                keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
                category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)
                if keyword_match and category_match:
                    keyword = keyword_match.group(1)
                    category = category_match.group(1).strip()
                    # Create one simple instruction per line
                    ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '
            ai_rules_context += "\n"

        # The template uses JSON literals (true/false/null) so the model is
        # not shown invalid JSON to copy.
        prompt = f"""
Analyze this receipt text and extract the following information in JSON format.
{ai_rules_context}
Receipt Text:
{text_content}
Extract:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
"calculated_tax": 0.00,
"is_depreciable": false,
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null,
"extraction_success": true
}}
EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
- Date should be the date on the receipt
- Confidence score 0-1 based on clarity
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as "Province/State, Country" (e.g., "Ontario, Canada" or "California, USA")
- If location not shown on receipt, return null for location (system will use user location as fallback)
TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows "Tax: $0", "Tax: $0.00", "Tax (0%)", or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null
- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
* Store calculated tax in "calculated_tax" field and set tax_amount to the calculated value
DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
* name_of_asset: Specific name/model of the asset (e.g., "2024 Honda Accord", "Dell Laptop XPS 15", "Office Desk")
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
Return only valid JSON.
"""
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=800,
            temperature=0.1,
        )
        result_text = response.choices[0].message.content.strip()
        return self._parse_extraction_result(result_text)
    except Exception as e:
        return {"error": f"Text processing error: {str(e)}"}
def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse the model's receipt-extraction reply into a result dict.

    Strategies are tried in order:

    1. ``self._extract_first_json`` -- whatever it returns is handed back
       unchanged (no cleaning, no ``extraction_success`` key is added).
    2. A greedy ``{...}`` regex match that is repaired (trailing commas
       removed, bare keys quoted) and parsed with ``json.loads``.
    3. Field-by-field regex scraping of that same span when it still does
       not parse.
    4. ``self._extract_from_plain_text`` when the reply contains no braces.

    Results of steps 2 and 3 are normalised: text fields are stripped,
    numeric fields are floats, and JSON ``null`` falls back to the same
    default a missing key would get instead of raising.

    Args:
        result_text: Raw text returned by the model.

    Returns:
        The extracted fields, or ``{"error": ..., "extraction_success":
        False}`` if an unexpected exception occurs.
    """
    import re

    # Signed decimal: "12", "12.", "12.50", ".5", "-3.2".
    number = r"-?(?:\d+(?:\.\d*)?|\.\d+)"

    def as_text(value, default=""):
        """Stripped string form of ``value``; ``default`` when it is None."""
        return default if value is None else str(value).strip()

    def as_float(value, default):
        """Float form of ``value``; ``default`` when it is None or blank."""
        if value is None:
            return default
        if isinstance(value, str):
            # Tolerate "$1,234.50"-style strings, as the transaction parser does.
            value = value.replace("$", "").replace(",", "").strip()
            if not value:
                return default
        return float(value)

    def scrape_fields(blob):
        """Pull known fields out of unparseable JSON-like text with regexes."""

        def text_of(key, default):
            found = re.search(rf'"{key}"\s*:\s*"([^"]*)"', blob)
            return found.group(1) if found else default

        def number_of(key, default, cast=float, pattern=number):
            # A literal ``null`` does not match ``pattern`` and so yields default.
            found = re.search(rf'"{key}"\s*:\s*({pattern})', blob)
            return cast(found.group(1)) if found else default

        depreciable = re.search(r'"is_depreciable"\s*:\s*(true|false)', blob)
        return {
            "vendor": text_of("vendor", ""),
            "description": text_of("description", ""),
            "total_amount": number_of("total_amount", 0.0),
            "tax_amount": number_of("tax_amount", 0.0),
            "date": text_of("date", ""),
            "category": text_of("category", "Other"),
            "confidence": number_of("confidence", 0.5),
            "currency": text_of("currency", "CAD"),
            "location": text_of("location", None),
            "calculated_tax": number_of("calculated_tax", None),
            "is_depreciable": depreciable.group(1) == "true"
            if depreciable
            else None,
            "name_of_asset": text_of("name_of_asset", None),
            "cca_rate": number_of("cca_rate", None),
            "useful_life": number_of("useful_life", None, int, r"[0-9]+"),
            "residual_value": number_of("residual_value", None),
        }

    try:
        # Try robust JSON extraction first (handles extra text)
        try:
            return self._extract_first_json(result_text)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning(
                f"Robust JSON extraction failed: {e}. Trying fallback methods..."
            )

        # Fallback: take everything from the first "{" to the last "}"
        json_match = re.search(r"\{.*\}", result_text, re.DOTALL)
        if not json_match:
            logger.warning("No JSON found in response, attempting text extraction")
            return self._extract_from_plain_text(result_text)

        # Clean up common JSON issues
        json_str = re.sub(r",\s*([}\]])", r"\1", json_match.group())  # trailing commas
        json_str = re.sub(
            r"([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:", r'\1"\2":', json_str
        )  # quote unquoted keys

        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning(f"Fallback JSON parsing also failed: {e}")
            data = scrape_fields(json_str)

        # Validate and clean data (null-safe)
        return {
            "vendor": as_text(data.get("vendor")),
            "description": as_text(data.get("description")),
            "total_amount": as_float(data.get("total_amount"), 0.0),
            "tax_amount": as_float(data.get("tax_amount"), 0.0),
            "date": as_text(data.get("date")),
            "category": as_text(data.get("category"), "Other"),
            "confidence": as_float(data.get("confidence"), 0.5),
            "extraction_success": True,
            "currency": as_text(data.get("currency"), "CAD"),
            "location": data.get("location"),
            "calculated_tax": data.get("calculated_tax"),
            "is_depreciable": data.get("is_depreciable"),
            "name_of_asset": data.get("name_of_asset"),
            "cca_rate": data.get("cca_rate"),
            "useful_life": data.get("useful_life"),
            "residual_value": data.get("residual_value"),
        }
    except Exception as e:
        logger.error(f"JSON parsing error: {str(e)}")
        return {
            "error": f"JSON parsing error: {str(e)}",
            "extraction_success": False,
        }
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
    """Best-effort receipt extraction from free text (no JSON available).

    Heuristics, each taking the first hit:

    * vendor -- text after a ``vendor/store/merchant/company`` label,
      otherwise the first run starting with a capital letter. The captured
      name never crosses a line break.
    * total  -- a number after a ``total/amount/sum`` label; only when no
      labelled amount exists is the first number in the text used.
    * date   -- ``YYYY-MM-DD``, ``M/D/YY(YY)`` or ``Mon D, YYYY``.

    Args:
        text: Raw model reply or OCR text.

    Returns:
        A receipt dict with ``confidence`` 0.3 and ``extraction_success``
        True; on an unexpected error, a default dict with
        ``extraction_success`` False and an ``error`` message.
    """
    try:
        import re

        # (pattern, flags): labels are case-insensitive, the "capitalized
        # words" fallback must stay case-sensitive to mean anything.
        vendor_patterns = [
            (
                r"\b(?:vendor|store|merchant|company)\b\s*[:\-]?\s*"
                r"([A-Za-z0-9&.,][A-Za-z0-9 \t&.,]*)",
                re.IGNORECASE,
            ),
            (r"([A-Z][A-Za-z0-9 \t&.,]{3,30})", 0),
        ]
        vendor = ""
        for pattern, flags in vendor_patterns:
            match = re.search(pattern, text, flags)
            if match:
                vendor = match.group(1).strip()
                break

        # Labelled totals first; the bare-number pattern would otherwise
        # grab receipt numbers, dates, etc. Amounts must start with a digit
        # so a lone comma can never be captured.
        amount = r"([0-9][0-9,]*\.?[0-9]*)"
        amount_patterns = [
            r"\b(?:total|amount|sum)\b\s*[:\-]?\s*\$?\s*" + amount,
            r"\$?\s*" + amount,
        ]
        total_amount = 0.0
        for pattern in amount_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    total_amount = float(match.group(1).replace(",", ""))
                    break
                except ValueError:
                    continue

        # Extract date
        date_patterns = [
            r"(\d{4}-\d{2}-\d{2})",
            r"(\d{1,2}/\d{1,2}/\d{2,4})",
            r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}",
        ]
        date = ""
        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # group(0): the third pattern only captures the month name.
                date = match.group(0)
                break

        return {
            "vendor": vendor or "Unknown",
            "total_amount": total_amount,
            "tax_amount": 0.0,
            "date": date or "",
            "category": "Other",
            "confidence": 0.3,  # Low confidence for text extraction
            "extraction_success": True,
            "location": None,
            "calculated_tax": None,
            "is_depreciable": None,
            "name_of_asset": None,
            "cca_rate": None,
            "useful_life": None,
            "residual_value": None,
        }
    except Exception as e:
        logger.error(f"Text extraction error: {str(e)}")
        return {
            "vendor": "Unknown",
            "total_amount": 0.0,
            "tax_amount": 0.0,
            "date": "",
            "category": "Other",
            "confidence": 0.1,
            "extraction_success": False,
            "error": f"Text extraction failed: {str(e)}",
            "location": None,
            "calculated_tax": None,
            "is_depreciable": None,
            "name_of_asset": None,
            "cca_rate": None,
            "useful_life": None,
            "residual_value": None,
        }
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
    """Write an uploaded file into the local ``uploads`` directory.

    The stored name is ``<YYYYmmdd_HHMMSS>_<name>`` where ``<name>`` is the
    final path component of ``filename`` with spaces replaced by
    underscores. Directory parts are discarded so a client-supplied name
    such as ``../../etc/passwd`` cannot escape the upload directory.

    Args:
        file_content: Raw bytes to store.
        filename: Client-supplied file name (treated as untrusted).

    Returns:
        Path of the stored file, relative to the working directory.

    Raises:
        Exception: With a "Failed to save file" message, chained to the
            original error.
    """
    try:
        # Create uploads directory if it doesn't exist
        upload_dir = "uploads"
        os.makedirs(upload_dir, exist_ok=True)

        # Normalise Windows separators first so basename() strips them too.
        base_name = os.path.basename(filename.replace("\\", "/")).replace(" ", "_")
        if not base_name:
            base_name = "upload"

        # Timestamp prefix keeps names unique at one-second resolution.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_path = os.path.join(upload_dir, f"{timestamp}_{base_name}")

        # Save file
        async with aiofiles.open(file_path, "wb") as f:
            await f.write(file_content)
        return file_path
    except Exception as e:
        raise Exception(f"Failed to save file: {str(e)}") from e
async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
    """Ask the vision model to list every transaction visible in an image.

    The image is base64-encoded via ``self._encode_image`` and sent with a
    fixed instruction prompt; the reply is handed to
    ``self._parse_transaction_extraction_result``.

    Args:
        image_path: Path of the statement image to analyse.

    Returns:
        The parser's result dict, or ``{"extraction_success": False,
        "error": ..., "transactions": []}`` if any step raises.
    """
    # Instruction prompt for transaction extraction
    prompt = """
Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.
Look for transaction lists, payment records, or any financial entries that show:
- Date
- Amount (positive or negative)
- Vendor/Description/Payee name
- Any additional notes or memo
Return the transactions as a JSON array:
{
"extraction_success": true,
"transactions": [
{
"date": "YYYY-MM-DD",
"amount": 0.00,
"vendor": "Vendor name",
"memo": "Additional notes"
},
{
"date": "YYYY-MM-DD",
"amount": -0.00,
"vendor": "Another vendor",
"memo": "Payment or charge description"
}
]
}
Rules:
- Extract ALL visible transactions
- Include both positive (credits) and negative (debits) amounts
- Use the actual date format from the document
- Vendor should be the merchant/payee name
- Memo can include transaction type, reference numbers, etc.
- If no transactions found, return empty array but set extraction_success to true
Return only valid JSON.
"""
    try:
        encoded = self._encode_image(image_path)

        # One user message carrying the instructions plus the inline image.
        image_part = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{encoded}",
            },
        }
        user_message = {
            "role": "user",
            "content": [{"type": "text", "text": prompt}, image_part],
        }

        completion = self.client.chat.completions.create(
            messages=[user_message],
            model=self.model,
            max_tokens=2000,  # Higher token limit for multiple transactions
            temperature=0.1,
        )

        raw_reply = completion.choices[0].message.content.strip()
        return self._parse_transaction_extraction_result(raw_reply)
    except Exception as e:
        return {
            "extraction_success": False,
            "error": f"Transaction extraction error: {str(e)}",
            "transactions": [],
        }
def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse the model's transaction-list reply into cleaned transactions.

    The text between the first ``{`` and the last ``}`` is taken as the JSON
    payload, trailing commas are removed, and each entry of its
    ``transactions`` list is normalised to ``date``/``amount``/``vendor``/
    ``memo``. ``null`` text fields become ``""``; a missing or ``null``
    ``transactions`` value is treated as an empty list. Entries whose amount
    cannot be read as a number (after removing ``$`` and ``,``) are skipped
    and logged.

    Args:
        result_text: Raw text returned by the model.

    Returns:
        ``{"extraction_success", "transactions", "total_transactions"}`` on
        success, or ``{"extraction_success": False, "error",
        "transactions": []}`` when no JSON object can be parsed.
    """
    import json
    import re

    def as_text(value):
        """Stripped string form of ``value``; empty string for None."""
        return "" if value is None else str(value).strip()

    try:
        # Find the first '{' and last '}'
        start = result_text.find("{")
        end = result_text.rfind("}")
        if start == -1 or end <= start:
            return {
                "extraction_success": False,
                "error": "Could not find JSON object in AI response",
                "transactions": [],
            }

        # Remove trailing commas before } or ]
        json_str = re.sub(r",\s*([}\]])", r"\1", result_text[start : end + 1])

        try:
            data = json.loads(json_str)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            logger.error(f"JSON parsing error: {str(e)}")
            logger.error(f"Offending JSON string:\n{json_str}")
            return {
                "extraction_success": False,
                "error": f"JSON parsing error: {str(e)}",
                "transactions": [],
            }

        # "transactions": null (or a non-list) means "nothing extracted".
        raw_transactions = data.get("transactions")
        if not isinstance(raw_transactions, list):
            raw_transactions = []

        cleaned_transactions = []
        for index, txn in enumerate(raw_transactions):
            try:
                amount_text = str(txn.get("amount", 0))
                cleaned_transactions.append(
                    {
                        "date": as_text(txn.get("date", "")),
                        "amount": float(
                            amount_text.replace("$", "").replace(",", "").strip()
                        ),
                        "vendor": as_text(txn.get("vendor", "")),
                        "memo": as_text(txn.get("memo", "")),
                    }
                )
            except (AttributeError, TypeError, ValueError) as e:
                # Best effort: drop the bad entry but leave a trace of it.
                logger.warning(f"Skipping malformed transaction {index}: {str(e)}")

        return {
            "extraction_success": data.get("extraction_success", True),
            "transactions": cleaned_transactions,
            "total_transactions": len(cleaned_transactions),
        }
    except Exception as e:
        logger.error(f"JSON parsing error (outer): {str(e)}")
        return {
            "extraction_success": False,
            "error": f"JSON parsing error: {str(e)}",
            "transactions": [],
        }
def _parse_date_to_iso(self, date_str: str) -> str:
    """Convert a date written in one of several formats to ``YYYY-MM-DD``.

    Recognised (case-insensitive, matched at the start of the string):

    * ``MON D``, ``MON D, YYYY`` or ``MON D YYYY`` -- month abbreviation or
      full name; a missing year means the current year.
    * ``YYYY-MM-DD`` -- anything after the date is dropped.
    * ``M/D/YYYY`` and ``M/D/YY``.

    Every candidate is validated as a real calendar date, so values such as
    ``FEB 31`` are rejected instead of being emitted.

    Args:
        date_str: The date text to convert.

    Returns:
        The ISO date string, or ``None`` when the input is not recognised or
        is not a valid date (note: the ``-> str`` annotation predates this
        documentation and does not reflect the ``None`` case).
    """
    import re
    from datetime import date, datetime

    month_names = (
        "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
        "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    )
    try:
        text = date_str.strip().upper()

        # "MAY 22", "JUN 01", "MAY 22, 2024", "MAY 22 2024", "SEPT. 5 2023"
        match = re.match(
            r"(" + "|".join(month_names) + r")[A-Z]*\.?\s+"
            r"(\d{1,2})(?!\d)"
            r"(?:(?:\s*,\s*|\s+)(\d{4})(?!\d))?",
            text,
        )
        if match:
            month_abbr, day, year = match.groups()
            month = month_names.index(month_abbr) + 1
            year = int(year) if year else datetime.now().year
            # date() raises ValueError for impossible dates -> None below.
            return date(year, month, int(day)).isoformat()

        # YYYY-MM-DD (return only the date part, validated)
        match = re.match(r"(\d{4})-(\d{2})-(\d{2})(?!\d)", text)
        if match:
            year, month, day = (int(part) for part in match.groups())
            return date(year, month, day).isoformat()

        # MM/DD/YYYY
        match = re.match(r"\d{1,2}/\d{1,2}/\d{4}(?!\d)", text)
        if match:
            return datetime.strptime(match.group(0), "%m/%d/%Y").strftime("%Y-%m-%d")

        # MM/DD/YY
        match = re.match(r"\d{1,2}/\d{1,2}/\d{2}(?!\d)", text)
        if match:
            return datetime.strptime(match.group(0), "%m/%d/%y").strftime("%Y-%m-%d")

        return None
    except Exception:
        # Unparseable or non-string input is reported as "no date".
        return None
+992
View File
@@ -0,0 +1,992 @@
import json
import logging
from typing import Any, Dict
import groq
from config import settings
from schemas import Receipt, Transaction
logger = logging.getLogger(__name__)
class LLMTaxAnalyzer:
    """
    Uses LLM to intelligently apply tax rules based on context.
    Implements four core tax rules:
    1. Sales Tax Rule - Based on receipt location (shipping/billing address)
    2. Foreign Exchange Rule - Handles currency mismatches
    3. Depreciation Rule - Capital assets (based on user location)
    4. Meals & Entertainment Rule - 50% tax deduction, 100% accounting deduction
    """
    # Provincial tax rates for reference.
    # Keyed by two-letter province/territory code; "rate" is the combined
    # sales-tax rate as a decimal fraction (0.13 == 13%). The table is dumped
    # as JSON into the LLM prompt context and is also used to validate
    # province codes when normalising a user location.
    # NOTE(review): rates are hard-coded and not date-aware. Nova Scotia's HST
    # is believed to have dropped to 14% on 2025-04-01 -- confirm against
    # current CRA guidance before relying on the "NS" entry.
    PROVINCIAL_TAX_RATES = {
        "ON": {"rate": 0.13, "name": "HST", "type": "Harmonized"},
        "QC": {"rate": 0.14975, "name": "QST + GST", "type": "Combined"},
        "BC": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "AB": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "SK": {"rate": 0.11, "name": "PST + GST", "type": "Combined"},
        "MB": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "NS": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NB": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NL": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "PE": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "NU": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "YT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
    }
    # CCA rates by asset class (simplified).
    # Declining-balance rates as decimal fractions, keyed by a coarse asset
    # category; the trailing comment on each line names the CCA class.
    CCA_RATES = {
        "vehicles": 0.30,  # Class 10
        "computer_equipment": 0.55,  # Class 50
        "furniture": 0.20,  # Class 8
        "buildings": 0.04,  # Class 1
        "machinery": 0.20,  # Class 8
    }
def __init__(self):
    """Set up the Groq client, the model name from settings, and max_retries."""
    # NOTE(review): how max_retries is consumed is not visible in this chunk.
    self.max_retries = 3
    self.model = settings.model
    self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
def analyze_and_apply_tax_rules_batch(
    self,
    matches: list,  # List of Match objects
    user_location: str = "ON",
) -> list:
    """
    Run tax analysis for every match with one LLM request.

    The batch reply is expected to map ``"match_<index>"`` to that match's
    analysis dict. Matches without a usable entry are returned untouched
    apart from a note appended to ``match_reason``.

    When the batch call yields nothing, batches of at most five matches are
    re-processed one by one; larger batches are returned with a note only,
    to avoid a burst of API calls.
    """
    if not matches:
        return matches

    total = len(matches)
    logger.info(f"Starting batch tax analysis for {total} matches")

    # Shared prompt context covering all matches
    try:
        batch_context = self._build_batch_analysis_context(matches, user_location)
    except Exception as e:
        logger.error(f"Error building batch context: {str(e)}")
        # Without a context there is nothing to send; hand the matches back.
        for match in matches:
            match.match_reason += " (Batch analysis setup failed)"
        return matches

    # Single LLM round-trip for the whole batch
    llm_batch_analysis = self._get_llm_tax_analysis_batch(batch_context, total)

    if not llm_batch_analysis:
        logger.warning("Batch LLM analysis returned empty results")
        if total > 5:
            logger.warning(
                f"Batch too large ({total} matches) for individual fallback - returning matches without enhanced tax analysis"
            )
            for match in matches:
                match.match_reason += " (Batch tax analysis unavailable)"
            return matches
        # Small batch: retry per match
        logger.info(
            f"Attempting individual processing fallback for {total} matches"
        )
        return self._process_matches_individually(matches, user_location)

    logger.info(f"Received batch analysis for {len(llm_batch_analysis)} matches")

    # Distribute the per-match results
    enhanced_matches = []
    for i, match in enumerate(matches):
        try:
            match_key = f"match_{i}"
            match_analysis = llm_batch_analysis.get(match_key, {})
            if not match_analysis or not isinstance(match_analysis, dict):
                # Nothing usable for this match; keep it as-is
                logger.warning(
                    f"No analysis found for match {i} (key: {match_key})"
                )
                match.match_reason += " (Tax analysis incomplete)"
                enhanced_matches.append(match)
                continue
            enhanced_matches.append(
                self._apply_tax_analysis_to_match(match, match_analysis)
            )
        except Exception as e:
            logger.error(f"Error applying tax analysis to match {i}: {str(e)}")
            match.match_reason += " (Tax analysis error)"
            enhanced_matches.append(match)

    logger.info(
        f"Completed batch tax analysis, enhanced {len(enhanced_matches)} matches"
    )
    return enhanced_matches
def _process_matches_individually(self, matches: list, user_location: str) -> list:
    """
    Per-match fallback used when the batch request produced nothing.

    Each match goes through the legacy single-match analysis; a match whose
    analysis raises is kept, with a failure note appended to its
    ``match_reason``.
    """
    total = len(matches)
    logger.info(f"Processing {total} matches individually as fallback")

    enhanced_matches = []
    for position, match in enumerate(matches, start=1):
        try:
            # Legacy single-match analysis, then apply it to the match
            tax_analysis = self.analyze_and_apply_tax_rules(
                match.receipt, match.transaction, user_location
            )
            enhanced_matches.append(
                self._apply_tax_analysis_to_match(match, tax_analysis)
            )
            logger.info(
                f"Successfully processed match {position}/{total} individually"
            )
        except Exception as e:
            logger.error(
                f"Error in individual processing for match {position - 1}: {str(e)}"
            )
            match.match_reason += " (Individual tax analysis failed)"
            enhanced_matches.append(match)
    return enhanced_matches
def analyze_and_apply_tax_rules(
    self,
    receipt: Receipt,
    transaction: Transaction,
    user_location: str = "ON",  # Default to Ontario
) -> Dict[str, Any]:
    """
    Legacy single-match analysis (kept for backward compatibility; prefer
    analyze_and_apply_tax_rules_batch() for better performance).

    Asks the LLM to apply all four tax rules to one receipt/transaction pair:
    1. Sales tax based on receipt location (shipping/billing address priority)
    2. Foreign exchange rules for currency mismatches
    3. Depreciation rules for capital assets (based on user location)
    4. Meals & Entertainment deduction rules

    Pipeline: build the prompt context, query the LLM, then parse and
    structure the reply.
    """
    context = self._build_analysis_context(receipt, transaction, user_location)
    raw_analysis = self._get_llm_tax_analysis(context)
    return self._structure_analysis_results(
        raw_analysis, receipt, transaction, user_location
    )
def _build_analysis_context(
    self, receipt: Receipt, transaction: Transaction, user_location: str
) -> str:
    """Build comprehensive context for LLM analysis

    Renders the receipt, the transaction, the user's province (with its rate
    and tax name looked up in PROVINCIAL_TAX_RATES) and the two reference
    tables as one text block for the prompt.

    Reads from ``receipt``: vendor, amount, currency, receipt_date, category,
    description, billing_address, shipping_address, is_meals_entertainment.
    Reads from ``transaction``: vendor, amount, currency, transaction_date,
    notes, fx_rate.

    NOTE(review): ``amount`` is formatted with ``:.2f`` and both dates call
    ``strftime``, so a None amount or date raises here -- confirm the schemas
    guarantee these fields.
    """
    # Extract location information
    receipt_location = self._extract_receipt_location(receipt)
    # Normalize user_location to province code (handle "Canada", "Ontario", "ON", etc.)
    user_province = self._normalize_location_to_province(user_location)
    logger.info(
        f"Building tax analysis context - User Location: {user_location} → Province Code: {user_province}"
    )
    # Build tax rates reference
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)
    # The rate line below multiplies a float by 100 without formatting, so the
    # rendered percentage is the raw float repr (e.g. "13.0%").
    context = f"""
RECEIPT DETAILS:
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Currency: {receipt.currency}
- Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
- Category: {receipt.category}
- Description: {receipt.description}
- Billing Address: {self._format_address(receipt.billing_address)}
- Shipping Address: {self._format_address(receipt.shipping_address)}
- Is Meals & Entertainment: {receipt.is_meals_entertainment}
TRANSACTION DETAILS:
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Currency: {transaction.currency}
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
- Notes: {transaction.notes}
- FX Rate: {transaction.fx_rate if transaction.fx_rate else "N/A"}
USER CONTEXT:
- User Location (Province): {user_province}
- User Province Tax Rate: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("rate", 0.13) * 100}%
- User Tax Type: {self.PROVINCIAL_TAX_RATES.get(user_province, {}).get("name", "HST")}
RECEIPT LOCATION DETECTED:
{receipt_location}
PROVINCIAL TAX RATES REFERENCE:
{tax_rates_info}
CCA DEPRECIATION RATES BY ASSET CLASS:
{cca_rates_info}
"""
    return context
def _normalize_location_to_province(self, location: str) -> str:
    """
    Normalize various location formats to a province code.

    Resolution order:
    1. The whole string is a known province code ("on") or full province
       name ("Ontario", "Québec", "PEI").
    2. The whole string is only a country ("Canada", "CA") or empty/None:
       defaults to "ON" with a warning.
    3. Any comma-separated part is a province code or name
       ("Toronto, ON", "Quebec, Canada"); the first such part wins.
    4. Nothing matches: defaults to "ON" with a warning.

    Args:
        location: Free-form user location; None is treated as empty.

    Returns:
        A two-letter province/territory code.
    """
    # Map full province names to codes
    province_name_map = {
        "ONTARIO": "ON",
        "QUEBEC": "QC",
        "QUÉBEC": "QC",
        "BRITISH COLUMBIA": "BC",
        "ALBERTA": "AB",
        "SASKATCHEWAN": "SK",
        "MANITOBA": "MB",
        "NOVA SCOTIA": "NS",
        "NEW BRUNSWICK": "NB",
        "NEWFOUNDLAND AND LABRADOR": "NL",
        "NEWFOUNDLAND": "NL",
        "PRINCE EDWARD ISLAND": "PE",
        "PEI": "PE",
        "NORTHWEST TERRITORIES": "NT",
        "NUNAVUT": "NU",
        "YUKON": "YT",
    }

    def lookup(token):
        """Province code for an upper-cased token, or None if unknown."""
        if token in self.PROVINCIAL_TAX_RATES:
            return token
        return province_name_map.get(token)

    location_upper = "" if location is None else str(location).upper().strip()

    # Direct province code / name match on the whole string
    code = lookup(location_upper)
    if code:
        return code

    # Default to Ontario if country is Canada or unspecified
    if location_upper in ("CANADA", "CAN", "CA", ""):
        logger.warning(f"Location '{location}' is too generic, defaulting to ON")
        return "ON"

    # Composite input such as "City, Province" or "Province, Country"
    for part in location_upper.split(","):
        code = lookup(part.strip())
        if code:
            return code

    # If nothing matches, default to Ontario
    logger.warning(f"Could not parse location '{location}', defaulting to ON")
    return "ON"
def _extract_receipt_location(self, receipt: Receipt) -> str:
    """Describe where the receipt was issued, for inclusion in the prompt.

    The shipping address takes priority; the billing address is used only
    when no shipping address is set. With neither, a placeholder line is
    returned.
    """
    location = receipt.shipping_address or receipt.billing_address
    if not location:
        return "- No address information available (will use user location)"
    return f"""
- Province: {location.province}
- City: {location.city}
- Country: {location.country}
- Postal Code: {location.postal_code}
"""
def _format_address(self, address) -> str:
    """Render an address as "City, Province, Country (Postal)", or "Not provided"."""
    if not address:
        return "Not provided"
    return f"{address.city}, {address.province}, {address.country} ({address.postal_code})"
def _get_llm_tax_analysis(self, context: str) -> str:
    """Get tax rule analysis from LLM

    Embeds ``context`` in a fixed instruction prompt describing the four tax
    rules and the expected JSON layout, sends it with a system message, and
    returns the reply text stripped of surrounding whitespace.

    Returns:
        The raw reply text (expected, but not verified here, to be JSON). If
        the request or reading the reply raises, the JSON string from
        ``self._get_fallback_analysis()`` is returned instead.
    """
    # Doubled braces ({{ }}) in the template are literal braces in the f-string;
    # only {context} is interpolated.
    # NOTE(review): the JSON template shown to the model contains "//" comments,
    # which are not valid JSON if the model echoes them -- confirm the parser
    # downstream tolerates that.
    prompt = f"""
You are a tax expert analyzing a receipt-transaction match. Apply the following tax rules intelligently:
And you are to calculate the tax for the receipt based on the context provided.
{context}
=== FOUR CORE TAX RULES ===
### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
- The user's location is only for depreciation purposes
b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead
c) User in USA (New York), Receipt from California:
- Apply California's sales tax rate (receipt location)
- Not New York's rate (user location)
d) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)
- This is the fallback when receipt location is unknown
**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review
### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.
**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate the absolute discrepancy: |receipt_amount - transaction_amount|
- ALWAYS flag for manual review - DO NOT fetch exchange rates automatically
- If FX rate is provided in transaction data, note it but still require manual review
**Examples**:
- Transaction: USD $100, Receipt: CAD $125 → Discrepancy: $25, Flag for review
- The user must manually approve or adjust the FX difference
### 3. DEPRECIATION RULE
**Purpose**: Calculate depreciation for assets using two methods.
**Key Principle**: Depreciation is ALWAYS based on USER'S location, NOT receipt location.
**Asset Identification**:
- Only applies to capital assets: vehicles, equipment, furniture, buildings, machinery
- Identify from receipt category and description
- Typical threshold: Assets generally > $500
**Two Methods Required**:
a) **Straight-Line Depreciation** (for accounting purposes):
Formula: (Cost - Residual Value) / Useful Life
Example: Asset $10,000, 5-year life, $1,000 residual = $1,800/year
b) **CCA Depreciation** (for tax purposes - Canada):
Method: Declining Balance
Formula: Book Value × CCA Rate each year
Example: Truck $20,000, 30% CCA:
- Year 1: $20,000 × 30% = $6,000
- Year 2: ($20,000 - $6,000) × 30% = $4,200
- Continues declining each year
**CCA Classes** (Canada):
- Vehicles: 30% (Class 10)
- Computer Equipment: 55% (Class 50)
- Furniture/Machinery: 20% (Class 8)
- Buildings: 4% (Class 1)
### 4. MEALS & ENTERTAINMENT TAX DEDUCTION RULE
**Purpose**: Apply correct deductions for meals and entertainment expenses.
**Deduction Rules**:
- **For Tax Purposes**: Only 50% of total receipt amount is deductible
- **For Accounting Purposes**: 100% of total receipt amount is deductible
- **Sales Tax**: Full sales tax amount is deductible for accounting
**Example**:
- Receipt: $100 meal + $12 sales tax = $112 total
- **Tax Deduction**: $50 (50% of meal) + $12 (full tax) = $62
- **Accounting Deduction**: $100 (full meal) + $12 (full tax) = $112
=== LOCATION-BASED SCENARIO HANDLING ===
**When Receipt Location ≠ User Location**:
1. **Sales Tax**: Use RECEIPT's location for tax calculation
- Exception: If international (different country), no Canadian sales tax + flag FX
- Exception: If no location on receipt, use user's location as default
2. **Depreciation**: ALWAYS use USER's location for depreciation rules
- Receipt location is irrelevant for depreciation
- Apply user's country/province depreciation methods
3. **FX Handling**:
- If receipt currency ≠ transaction currency: Flag for manual review
- Do NOT automatically fetch or apply exchange rates
4. **Missing Location**:
- If receipt has no address: Default to user's location for sales tax
- Still apply user's location for depreciation
=== ANALYSIS REQUIRED ===
Provide a structured JSON response with the following format:
**CRITICAL INSTRUCTION FOR final_tax_amount:**
- This field MUST contain ONLY the calculated sales tax amount in dollars
- This is NOT the total amount including tax
- This is ONLY the tax portion (HST/GST/PST/QST)
- Example: If receipt total is $100 and calculated tax is $13, return 13.00 (not 113.00)
- For meals & entertainment: Return the FULL calculated tax amount (not the 50% adjusted amount)
{{
"final_tax_amount": XX.XX, // ONLY the calculated tax amount (e.g., 13.00 for $100 + $13 HST)
"sales_tax": {{
"applicable_province": "XX",
"applicable_rate": 0.XX,
"tax_name": "HST/GST/PST/QST",
"calculated_tax": XX.XX, // This should match final_tax_amount above
"stated_tax": XX.XX,
"discrepancy": XX.XX,
"reason": "Detailed explanation",
"requires_review": true/false
}},
"foreign_exchange": {{
"currency_mismatch": true/false,
"receipt_currency": "XXX",
"transaction_currency": "XXX",
"receipt_amount": XX.XX,
"transaction_amount": XX.XX,
"discrepancy": XX.XX,
"requires_manual_review": true/false,
"reason": "Explanation of FX situation"
}},
"depreciation": {{
"is_capital_asset": true/false,
"asset_class": "category name or N/A",
"suggested_cca_rate": 0.XX,
"straight_line_applicable": true/false,
"cca_applicable": true/false,
"straight_line_example": "Brief calculation example if applicable",
"cca_example": "Brief calculation example if applicable",
"reason": "Why this is/isn't a capital asset, which CCA class, and why depreciation based on user's location"
}},
"meals_entertainment": {{
"is_meals_entertainment": true/false,
"tax_deduction_amount": XX.XX,
"accounting_deduction_amount": XX.XX,
"sales_tax_included": XX.XX,
"reason": "Explanation of M&E rule application"
}},
"confidence_adjustment": {{
"boost": 0.XX,
"reduce": 0.XX,
"reason": "Why confidence should be adjusted based on tax analysis"
}},
"overall_assessment": "Comprehensive summary: which rules applied, why, what location used for what purpose, and any required actions"
}}
**IMPORTANT**: The "final_tax_amount" field at the top level must contain the final calculated tax amount. This should be the calculated_tax from sales_tax analysis. If this is a meals & entertainment expense, ensure you return the FULL tax amount here (not the 50% adjusted amount).
**Critical Reminders**:
- Sales tax uses RECEIPT location (or user location if receipt has none)
- Depreciation ALWAYS uses USER location
- For different addresses, use SHIPPING address for sales tax
- International transactions: no Canadian tax + FX flag
- Be precise with all calculations
- Always explain your reasoning clearly
"""
    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a Canadian tax expert. Analyze transactions and apply tax rules accurately. Always return valid JSON.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=2000,
        )
        content = response.choices[0].message.content.strip()
        logger.info(f"LLM tax analysis received: {len(content)} characters")
        return content
    except Exception as e:
        # Any failure (request error, missing content, ...) degrades to the
        # canned fallback analysis rather than propagating.
        logger.error(f"Error getting LLM tax analysis: {str(e)}")
        return self._get_fallback_analysis()
def _get_fallback_analysis(self) -> str:
    """Build the canned analysis (as a JSON string) used when the LLM call fails.

    It reports zero tax with Ontario HST defaults, marks the sales-tax
    section for review, and suggests a 0.1 confidence reduction.
    """
    unavailable = "Analysis not available"

    sales_tax = {
        "applicable_province": "ON",
        "applicable_rate": 0.13,
        "tax_name": "HST",
        "calculated_tax": 0.0,
        "stated_tax": 0.0,
        "discrepancy": 0.0,
        "reason": "LLM analysis failed - using defaults",
        "requires_review": True,
    }
    foreign_exchange = {
        "currency_mismatch": False,
        "requires_manual_review": False,
        "reason": unavailable,
    }
    confidence_adjustment = {
        "boost": 0.0,
        "reduce": 0.1,
        "reason": "LLM analysis failed - recommend manual review",
    }

    # Key order is kept stable because it determines the serialized text.
    fallback = {
        "final_tax_amount": 0.0,
        "sales_tax": sales_tax,
        "foreign_exchange": foreign_exchange,
        "depreciation": {"is_capital_asset": False, "reason": unavailable},
        "meals_entertainment": {
            "is_meals_entertainment": False,
            "reason": unavailable,
        },
        "confidence_adjustment": confidence_adjustment,
        "overall_assessment": "Automatic analysis failed. Manual review recommended.",
    }
    return json.dumps(fallback)
def _structure_analysis_results(
    self,
    llm_response: str,
    receipt: Receipt,
    transaction: Transaction,
    user_location: str,
) -> Dict[str, Any]:
    """Parse LLM response and structure it for application

    The JSON payload is located in this order: a ```` ```json ```` fenced
    block, any fenced block, the whole reply, and finally the text between
    the first ``{`` and the last ``}`` (for replies that wrap the JSON in
    prose). The payload must decode to a JSON object.

    Args:
        llm_response: Raw reply text from the model.
        receipt: Receipt of the match (``id``, ``tax``, ``currency`` are read).
        transaction: Transaction of the match (``id``, ``currency`` are read).
        user_location: User location as passed in by the caller.

    Returns:
        The decoded analysis with a ``metadata`` entry added. If no JSON
        object can be decoded, a fallback analysis that keeps the receipt's
        stated tax, flags the match for review and carries the parse error
        under ``error``.
    """
    try:
        # Extract JSON from LLM response (may have markdown code blocks)
        json_str = llm_response
        if "```json" in llm_response:
            json_str = llm_response.split("```json")[1].split("```")[0].strip()
        elif "```" in llm_response:
            json_str = llm_response.split("```")[1].split("```")[0].strip()

        try:
            analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # Unfenced JSON surrounded by prose: retry on the outermost braces.
            start = json_str.find("{")
            end = json_str.rfind("}")
            if start == -1 or end <= start:
                raise
            analysis = json.loads(json_str[start : end + 1])

        if not isinstance(analysis, dict):
            # Valid JSON but not an object (e.g. a list); cannot carry metadata.
            raise ValueError(
                f"Expected a JSON object, got {type(analysis).__name__}"
            )

        # Add metadata
        analysis["metadata"] = {
            "user_location": user_location,
            "receipt_id": receipt.id,
            "transaction_id": transaction.id,
            "analysis_method": "LLM-based",
            "model": self.model,
        }
        return analysis
    except (ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; TypeError covers a None reply.
        logger.error(f"Failed to parse LLM response as JSON: {str(e)}")
        logger.error(f"LLM response was: {llm_response}")
        # Return structured fallback
        return {
            "final_tax_amount": receipt.tax if receipt.tax else 0.0,
            "sales_tax": {
                "requires_review": True,
                "reason": "Failed to parse LLM response",
            },
            "foreign_exchange": {
                "requires_manual_review": receipt.currency != transaction.currency
            },
            "depreciation": {"is_capital_asset": False},
            "confidence_adjustment": {
                "boost": 0.0,
                "reduce": 0.15,
                "reason": "Analysis parsing failed",
            },
            "overall_assessment": "Analysis failed. Manual review required.",
            "error": str(e),
            "metadata": {
                "user_location": user_location,
                "analysis_method": "fallback",
            },
        }
def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
    """Assemble the shared prompt context for one batch tax-analysis call.

    The context holds the user's province and its tax rate, the full
    provincial tax-rate and CCA-rate tables as JSON, and one text section per
    match describing its receipt, its transaction and the detected receipt
    location.

    Args:
        matches: Objects exposing ``receipt`` and ``transaction`` attributes.
        user_location: Location string; converted to a province code with
            ``_normalize_location_to_province``.

    Returns:
        The complete context text to embed in the LLM prompt.
    """
    user_province = self._normalize_location_to_province(user_location)
    logger.info(
        f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {user_province}"
    )

    # The reference tables are serialised once and shared by every match.
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)

    def describe(index: int, match) -> str:
        """Render the prompt section for a single match."""
        receipt, transaction = match.receipt, match.transaction
        receipt_location = self._extract_receipt_location(receipt)
        return f"""
MATCH {index} (ID: match_{index}):
Receipt Details:
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Currency: {receipt.currency}
- Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
- Category: {receipt.category}
- Description: {receipt.description}
- Billing Address: {self._format_address(receipt.billing_address)}
- Shipping Address: {self._format_address(receipt.shipping_address)}
- Is Meals & Entertainment: {receipt.is_meals_entertainment}
Transaction Details:
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Currency: {transaction.currency}
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
- Notes: {transaction.notes}
- FX Rate: {transaction.fx_rate or "N/A"}
Receipt Location Detected:
{receipt_location}
"""

    matches_section = "\n".join(
        describe(index, match) for index, match in enumerate(matches)
    )

    # Unknown provinces fall back to a 13% rate labelled HST.
    province_info = self.PROVINCIAL_TAX_RATES.get(user_province, {})
    rate_percent = province_info.get("rate", 0.13) * 100
    tax_type = province_info.get("name", "HST")

    return f"""
USER CONTEXT:
- User Location (Province): {user_province}
- User Province Tax Rate: {rate_percent}%
- User Tax Type: {tax_type}
PROVINCIAL TAX RATES REFERENCE:
{tax_rates_info}
CCA DEPRECIATION RATES BY ASSET CLASS:
{cca_rates_info}
=== MATCHES TO ANALYZE ({len(matches)} total) ===
{matches_section}
"""
def _get_llm_tax_analysis_batch(self, context: str, num_matches: int) -> Dict[str, Any]:
    """Get tax rule analysis from LLM for ALL matches in a single call.

    Sends one chat-completion request containing ``context`` plus the four
    tax rules and the expected response schema, then parses the reply.

    The reply is accepted as bare JSON, as JSON inside a markdown code fence
    (tagged or untagged, closed or left open), or as a JSON object surrounded
    by explanatory text; in the last case the first object found is used.

    Args:
        context: Batch context text describing the user and every match.
        num_matches: Number of matches in ``context``; used in the prompt
            and in log messages only.

    Returns:
        The parsed JSON object (the prompt asks for ``match_<i>`` keys), or
        an empty dict when the reply is empty, cannot be parsed, is not a
        JSON object, or the request raises. Callers are expected to apply
        their own per-match fallback for missing entries.
    """
    prompt = f"""
You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches.
{context}
=== FOUR CORE TAX RULES ===
### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.
**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).
**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead
c) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)
**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review
### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.
**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate expected transaction amount using FX rate if available
- Flag discrepancies > $5 or 5% for manual review
- If FX rate missing but currencies differ, flag for review
### 3. DEPRECIATION RULE
**Purpose**: Identify capital assets requiring depreciation based on USER'S location.
**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.
**Capital Asset Criteria**:
- Cost > $500 typically
- Useful life > 1 year
- Examples: computers, vehicles, furniture, machinery, buildings
**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction
### 4. MEALS & ENTERTAINMENT RULE
**Purpose**: Apply 50% tax deduction limit for M&E expenses.
**Actions**:
- Identify M&E expenses (meals, entertainment, client dinners, etc.)
- Tax Deduction: 50% of total amount (including tax)
- Accounting Deduction: 100% of total amount (including tax)
- Always include sales tax in both calculations
=== YOUR TASK ===
Analyze EACH match and return a JSON object where each key is the match ID and the value is the complete tax analysis.
**CRITICAL INSTRUCTION FOR final_tax_amount:**
- This field MUST contain ONLY the calculated sales tax amount in dollars
- This is NOT the total amount including tax
- This is ONLY the tax portion (HST/GST/PST/QST)
- Example: If receipt total is $100 and calculated tax is $13, return 13.00 (not 113.00)
- For meals & entertainment: Return the FULL calculated tax amount (not the 50% adjusted amount)
- VERIFY: final_tax_amount should equal sales_tax.calculated_tax
-
Return your response as a SINGLE JSON object in this format:
{{
"match_0": {{
"final_tax_amount": XX.XX, // ONLY the calculated tax amount
"sales_tax": {{
"applicable_province": "XX",
"applicable_rate": 0.XX,
"tax_name": "HST/GST/PST",
"calculated_tax": XX.XX,
"stated_tax": XX.XX,
"discrepancy": XX.XX,
"reason": "Detailed explanation",
"requires_review": true/false
}},
"foreign_exchange": {{
"currency_mismatch": true/false,
"receipt_currency": "XXX",
"transaction_currency": "XXX",
"expected_transaction_amount": XX.XX,
"actual_transaction_amount": XX.XX,
"discrepancy": XX.XX,
"requires_manual_review": true/false,
"reason": "Explanation"
}},
"depreciation": {{
"is_capital_asset": true/false,
"asset_class": "class_XX",
"cca_rate": 0.XX,
"applicable_jurisdiction": "XX",
"reason": "Explanation"
}},
"meals_entertainment": {{
"is_meals_entertainment": true/false,
"tax_deduction_amount": XX.XX,
"accounting_deduction_amount": XX.XX,
"sales_tax_included": XX.XX,
"reason": "Explanation"
}},
"confidence_adjustment": {{
"boost": 0.XX,
"reduce": 0.XX,
"reason": "Why confidence should be adjusted"
}},
"overall_assessment": "Summary for this match"
}},
"match_1": {{
... same structure ...
}},
... for all {num_matches} matches ...
}}
"""
    # Kept outside the try so the JSON error handler can always report it.
    json_str = None
    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Return ONLY valid JSON - no markdown code blocks, no explanations, just pure JSON.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=8000,  # Higher limit for batch processing
        )
        content = response.choices[0].message.content

        # Validate that we got content
        if not content:
            logger.error("LLM returned empty response")
            return {}
        content = content.strip()
        # Check if content is empty after stripping
        if not content:
            logger.error("LLM returned whitespace-only response")
            return {}

        logger.info(
            f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
        )
        logger.debug(f"Raw LLM response: {content[:500]}...")  # Log first 500 chars

        # Strip a markdown fence if the model added one despite instructions.
        # Tagged fences are tried first so they win over an earlier untagged
        # fence; a fence that was never closed (e.g. a reply cut off at the
        # token limit) is read up to the end of the text.
        json_str = content
        for fence in ("```json", "```javascript", "```js", "```"):
            if fence not in content:
                continue
            fenced = content.split(fence, 1)[1].split("```", 1)[0]
            # Drop the remainder of the opening-fence line when it is blank
            # or only a language tag.
            first_line, newline, remainder = fenced.partition("\n")
            if newline and first_line.strip().lower() in (
                "",
                "json",
                "javascript",
                "js",
            ):
                fenced = remainder
            json_str = fenced.strip()
            break

        # Validate JSON string is not empty
        if not json_str:
            logger.error("Extracted JSON string is empty")
            logger.error(f"Original content was: {content[:500]}")
            return {}

        try:
            batch_analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # The model may have wrapped the object in prose; decode the
            # first JSON value that starts at the first "{" and ignore
            # whatever follows it.
            start = json_str.find("{")
            if start == -1:
                raise
            batch_analysis, _ = json.JSONDecoder().raw_decode(json_str, start)

        # Validate we got a dictionary back
        if not isinstance(batch_analysis, dict):
            logger.error(f"LLM returned non-dict type: {type(batch_analysis)}")
            return {}

        logger.info(
            f"Successfully parsed batch analysis with {len(batch_analysis)} matches"
        )
        return batch_analysis
    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error in batch LLM tax analysis: {str(e)}")
        logger.error(
            f"Failed to parse: {json_str[:500] if json_str is not None else 'N/A'}"
        )
        return {}
    except Exception as e:
        # Deliberate best-effort boundary: any client/transport failure is
        # logged and reported as "no analysis".
        logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
        logger.error(f"Exception type: {type(e).__name__}")
        # Return empty dict so each match can handle fallback individually
        return {}
def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
    """Apply tax analysis results to a match object.

    Steps, in order:
      1. Reconcile ``final_tax_amount`` with ``sales_tax.calculated_tax``
         (the calculated value wins when the analysis actually provides one).
      2. Store the analysis on ``match.tax_analysis``.
      3. Adjust ``match.confidence_score`` by the analysis' boost/reduce
         values, clamped to the range 0.0-1.0.
      4. Append review flags (sales tax, FX, capital asset, M&E) to
         ``match.match_reason``.

    Numeric fields that are missing, ``null`` or not convertible to a number
    are treated as 0.0, and sub-analyses that are missing or not dicts are
    treated as empty, so partial or fallback analyses can be applied safely.

    Args:
        match: Object with ``receipt.vendor``, ``confidence_score`` and
            ``match_reason`` attributes; it is modified in place.
        tax_analysis: Analysis dict for this match; ``final_tax_amount`` is
            normalised in place.

    Returns:
        The same ``match`` object.
    """

    def as_number(value: Any) -> float:
        """Coerce an analysis value to float; missing/invalid becomes 0.0."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    def section(name: str) -> Dict[str, Any]:
        """Return the named sub-analysis, or an empty dict if absent/null."""
        value = tax_analysis.get(name)
        return value if isinstance(value, dict) else {}

    sales_tax = section("sales_tax")

    # Always leave a numeric final_tax_amount behind so later readers (and
    # the debug log below) never hit a missing key.
    final_tax = as_number(tax_analysis.get("final_tax_amount"))
    tax_analysis["final_tax_amount"] = final_tax

    # Only reconcile when the analysis really reports a calculated tax.
    # Fallback analyses carry just final_tax_amount; treating the missing
    # value as 0.0 would wipe out that amount.
    reported_calculated = sales_tax.get("calculated_tax")
    if reported_calculated is not None:
        calculated_tax = as_number(reported_calculated)

        # If there's a mismatch, use calculated_tax as the source of truth
        if abs(final_tax - calculated_tax) > 0.01:
            logger.warning(
                f"Correcting final_tax_amount mismatch for {match.receipt.vendor}: "
                f"LLM returned final_tax_amount={final_tax}, but calculated_tax={calculated_tax}. "
                f"Using calculated_tax as final value."
            )
            tax_analysis["final_tax_amount"] = calculated_tax

        # Special case: If final_tax is 0 but calculated_tax > 0, always use calculated_tax
        if final_tax == 0.0 and calculated_tax > 0.0:
            logger.warning(
                f"Correcting zero final_tax_amount for {match.receipt.vendor}: "
                f"LLM returned 0 but calculated {calculated_tax} HST. Setting final_tax_amount={calculated_tax}"
            )
            tax_analysis["final_tax_amount"] = calculated_tax
            sales_tax["requires_review"] = True

    # Apply the corrected tax analysis
    match.tax_analysis = tax_analysis
    logger.debug(
        f"Applied tax analysis to match: {match.receipt.vendor} -> "
        f"final_tax_amount={tax_analysis['final_tax_amount']}"
    )

    # Apply confidence adjustments based on tax analysis
    confidence_adj = section("confidence_adjustment")

    # Boost confidence if tax rules validate the match
    boost = as_number(confidence_adj.get("boost"))
    if boost > 0:
        match.confidence_score = min(1.0, match.confidence_score + boost)
        match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"

    # Reduce confidence if tax issues detected
    reduce = as_number(confidence_adj.get("reduce"))
    if reduce > 0:
        match.confidence_score = max(0.0, match.confidence_score - reduce)
        match.match_reason += f" (Tax issues detected: -{reduce:.2f})"

    # Add flags for manual review if needed
    review_flags = []

    # Check sales tax issues
    if sales_tax.get("requires_review", False):
        review_flags.append("Sales Tax Review Required")

    # Check FX issues
    fx_analysis = section("foreign_exchange")
    if fx_analysis.get("requires_manual_review", False):
        fx_discrepancy = as_number(fx_analysis.get("discrepancy"))
        review_flags.append(
            f"FX Review Required (Discrepancy: ${fx_discrepancy:.2f})"
        )

    # Check depreciation
    depreciation = section("depreciation")
    if depreciation.get("is_capital_asset", False):
        review_flags.append(
            f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
        )

    # Check meals & entertainment
    meals_ent = section("meals_entertainment")
    if meals_ent.get("is_meals_entertainment", False):
        tax_deduction = as_number(meals_ent.get("tax_deduction_amount"))
        accounting_deduction = as_number(
            meals_ent.get("accounting_deduction_amount")
        )
        review_flags.append(
            f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
        )

    # Add review flags to match reason
    if review_flags:
        match.match_reason += " | REVIEW: " + "; ".join(review_flags)

    return match
+583
View File
@@ -0,0 +1,583 @@
"""
Manual Tax Calculator - Rule-based tax calculations without LLM
Implements the four core tax rules based on rules.py specifications
"""
import logging
from typing import Any, Dict, Optional, Tuple
from schemas import Receipt, Transaction
logger = logging.getLogger(__name__)
class ManualTaxCalculator:
    """
    Deterministic tax calculator based on explicit rules from rules.py
    No LLM calls - pure business logic for accurate, consistent tax calculations

    All text matching (province codes and names, capital-asset keywords,
    meals & entertainment keywords) is done on whole words. Matching raw
    substrings would read "Montreal" as the province code "ON", "Vancouver"
    as the vehicle keyword "van" and "Barnes" as the M&E keyword "bar".
    """

    # Provincial tax rates for Canada
    PROVINCIAL_TAX_RATES = {
        "ON": {"rate": 0.13, "name": "HST", "type": "Harmonized"},
        "QC": {"rate": 0.14975, "name": "QST + GST", "type": "Combined"},
        "BC": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "AB": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "SK": {"rate": 0.11, "name": "PST + GST", "type": "Combined"},
        "MB": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "NS": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NB": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NL": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "PE": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "NU": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "YT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
    }

    # Full province names (upper-case, single-spaced) mapped to province codes
    PROVINCE_NAMES = {
        "ONTARIO": "ON",
        "QUEBEC": "QC",
        "QUÉBEC": "QC",
        "BRITISH COLUMBIA": "BC",
        "ALBERTA": "AB",
        "SASKATCHEWAN": "SK",
        "MANITOBA": "MB",
        "NOVA SCOTIA": "NS",
        "NEW BRUNSWICK": "NB",
        "NEWFOUNDLAND": "NL",
        "PRINCE EDWARD ISLAND": "PE",
        "NORTHWEST TERRITORIES": "NT",
        "NUNAVUT": "NU",
        "YUKON": "YT",
    }

    # CCA rates by asset class (Canada Revenue Agency rates)
    CCA_RATES = {
        "vehicles": {"rate": 0.30, "class": "Class 10", "description": "Vehicles"},
        "computer_equipment": {
            "rate": 0.55,
            "class": "Class 50",
            "description": "Computer Equipment",
        },
        "furniture": {
            "rate": 0.20,
            "class": "Class 8",
            "description": "Furniture & Fixtures",
        },
        "buildings": {"rate": 0.04, "class": "Class 1", "description": "Buildings"},
        "machinery": {
            "rate": 0.20,
            "class": "Class 8",
            "description": "Machinery & Equipment",
        },
    }

    # Capital asset threshold
    CAPITAL_ASSET_THRESHOLD = 500.00

    # Meals & Entertainment categories
    MEALS_ENTERTAINMENT_KEYWORDS = [
        "restaurant",
        "cafe",
        "coffee",
        "dining",
        "food",
        "meal",
        "catering",
        "entertainment",
        "bar",
        "pub",
        "bistro",
        "eatery",
    ]

    # Capital asset keywords
    CAPITAL_ASSET_KEYWORDS = {
        "vehicles": ["vehicle", "car", "truck", "van", "automobile", "suv"],
        "computer_equipment": [
            "computer",
            "laptop",
            "desktop",
            "server",
            "tablet",
            "monitor",
            "printer",
            "scanner",
        ],
        "furniture": [
            "furniture",
            "desk",
            "chair",
            "table",
            "cabinet",
            "bookshelf",
            "sofa",
        ],
        "buildings": ["building", "property", "real estate", "office space"],
        "machinery": ["machinery", "equipment", "tool", "industrial"],
    }

    def calculate_tax_analysis(
        self, receipt: Receipt, transaction: Transaction, user_location: str = "ON"
    ) -> Dict[str, Any]:
        """
        Calculate comprehensive tax analysis for a receipt-transaction match

        Args:
            receipt: Receipt being analysed.
            transaction: Transaction matched to the receipt.
            user_location: Province code or province name of the user.

        Returns:
            Dict containing:
            - sales_tax: Sales tax calculation and validation
            - foreign_exchange: FX analysis and discrepancies
            - depreciation: Capital asset depreciation details
            - meals_entertainment: M&E deduction calculations
            - confidence_adjustment: Confidence boost/reduction
            - final_tax_amount: same value as sales_tax["calculated_tax"]
        """
        analysis = {}
        # 1. Sales Tax Rule
        analysis["sales_tax"] = self._calculate_sales_tax(
            receipt, transaction, user_location
        )
        # 2. Foreign Exchange Rule
        analysis["foreign_exchange"] = self._calculate_foreign_exchange(
            receipt, transaction
        )
        # 3. Depreciation Rule
        analysis["depreciation"] = self._calculate_depreciation(receipt, user_location)
        # 4. Meals & Entertainment Rule
        analysis["meals_entertainment"] = self._calculate_meals_entertainment(receipt)
        # Calculate confidence adjustments (reads the four sections above)
        analysis["confidence_adjustment"] = self._calculate_confidence_adjustment(
            analysis
        )
        # Calculate final tax amount
        analysis["final_tax_amount"] = analysis["sales_tax"]["calculated_tax"]
        return analysis

    def _calculate_sales_tax(
        self, receipt: Receipt, transaction: Transaction, user_location: str
    ) -> Dict[str, Any]:
        """
        Rule 1: Sales Tax Calculation
        - Priority: shipping address > billing address > user location
        - Different country: no Canadian tax
        - Missing location: default to user location

        The expected tax is the provincial rate applied to
        ``receipt.amount - stated tax``. Review is required when the
        discrepancy exceeds 5% of the stated tax, or - when the receipt
        states no tax - 5% of the calculated tax, so a receipt without tax
        on a taxable amount is flagged rather than silently accepted.

        ``transaction`` is accepted for interface symmetry and is not used.
        """
        # A receipt that carries no tax value is treated as stating 0.0
        stated_tax = receipt.tax or 0.0

        # Determine the applicable location for tax
        receipt_location, location_source = self._determine_receipt_location(
            receipt, user_location
        )

        # Check if international transaction
        if self._is_international_transaction(receipt_location, user_location):
            return {
                "applicable_province": None,
                "applicable_rate": 0.0,
                "tax_name": "N/A",
                "calculated_tax": 0.0,
                "stated_tax": stated_tax,
                "discrepancy": abs(stated_tax),
                "reason": f"International transaction - no Canadian tax applied. Receipt location: {receipt_location}",
                "requires_review": True,
                "location_source": location_source,
                "is_international": True,
            }

        # Get tax rate for the applicable province
        tax_info = self.PROVINCIAL_TAX_RATES.get(
            receipt_location, self.PROVINCIAL_TAX_RATES.get(user_location)
        )

        # Tax should be calculated on pre-tax amount
        pre_tax_amount = receipt.amount - stated_tax
        calculated_tax = round(pre_tax_amount * tax_info["rate"], 2)

        # Calculate discrepancy, relative to the stated tax when there is
        # one and to the calculated tax otherwise
        discrepancy = abs(stated_tax - calculated_tax)
        baseline = stated_tax if stated_tax > 0 else calculated_tax
        discrepancy_percentage = (
            (discrepancy / baseline * 100) if baseline > 0 else 0.0
        )

        # Determine if review is needed (>5% discrepancy)
        requires_review = discrepancy_percentage > 5.0

        return {
            "applicable_province": receipt_location,
            "applicable_rate": tax_info["rate"],
            "tax_name": tax_info["name"],
            "calculated_tax": calculated_tax,
            "stated_tax": stated_tax,
            "discrepancy": round(discrepancy, 2),
            "discrepancy_percentage": round(discrepancy_percentage, 2),
            "reason": f"Tax calculated for {receipt_location} ({tax_info['name']}) - {location_source}",
            "requires_review": requires_review,
            "location_source": location_source,
            "is_international": False,
        }

    def _calculate_foreign_exchange(
        self, receipt: Receipt, transaction: Transaction
    ) -> Dict[str, Any]:
        """
        Rule 2: Foreign Exchange Handling
        - Flag currency mismatches
        - Don't auto-fetch rates
        - Manual review required

        When the transaction carries a positive ``fx_rate`` the expected
        transaction amount (receipt amount x rate) and its distance from the
        actual amount are reported; otherwise both are ``None``.
        """
        currency_mismatch = receipt.currency != transaction.currency
        if not currency_mismatch:
            return {
                "currency_mismatch": False,
                "receipt_currency": receipt.currency,
                "transaction_currency": transaction.currency,
                "requires_manual_review": False,
                "reason": "Currencies match - no FX adjustment needed",
            }

        # Raw difference between the two amounts (they are in different
        # currencies, so this is informational only)
        discrepancy = abs(receipt.amount - transaction.amount)

        # Check if transaction has FX rate
        has_fx_rate = transaction.fx_rate is not None and transaction.fx_rate > 0
        if has_fx_rate:
            expected_amount = round(receipt.amount * transaction.fx_rate, 2)
            calculated_discrepancy = abs(transaction.amount - expected_amount)
        else:
            expected_amount = None
            calculated_discrepancy = None

        return {
            "currency_mismatch": True,
            "receipt_currency": receipt.currency,
            "transaction_currency": transaction.currency,
            "receipt_amount": receipt.amount,
            "transaction_amount": transaction.amount,
            "discrepancy": discrepancy,
            "fx_rate": transaction.fx_rate,
            "expected_amount": expected_amount,
            "calculated_discrepancy": calculated_discrepancy,
            "requires_manual_review": True,
            "reason": f"Currency mismatch detected: {receipt.currency} vs {transaction.currency}. Manual review required.",
        }

    def _calculate_depreciation(
        self, receipt: Receipt, user_location: str
    ) -> Dict[str, Any]:
        """
        Rule 3: Depreciation Calculation
        - Always based on USER location (not receipt location)
        - Threshold: $500+
        - Two methods: Straight-Line (accounting) and CCA (tax)

        A receipt counts as a capital asset only when its amount reaches the
        threshold AND its text matches a known asset class.
        """
        meets_threshold = receipt.amount >= self.CAPITAL_ASSET_THRESHOLD
        if not meets_threshold:
            return {
                "is_capital_asset": False,
                "reason": f"Not a capital asset (Amount: ${receipt.amount:.2f}, Threshold: ${self.CAPITAL_ASSET_THRESHOLD:.2f})",
            }

        # Identify asset class from category and description
        asset_class = self._identify_asset_class(receipt)
        cca_info = self.CCA_RATES.get(asset_class) if asset_class else None
        if not cca_info:
            return {
                "is_capital_asset": False,
                "reason": f"Not a capital asset (Amount: ${receipt.amount:.2f} meets the threshold but no asset class was identified)",
            }

        # Calculate straight-line depreciation (accounting)
        # Default: 5-year useful life, 10% residual value
        useful_life_years = 5
        residual_percentage = 0.10
        residual_value = receipt.amount * residual_percentage
        annual_straight_line = (receipt.amount - residual_value) / useful_life_years

        # Calculate CCA depreciation (tax - declining balance)
        cca_rate = cca_info["rate"]
        year1_cca = receipt.amount * cca_rate
        year2_cca = (receipt.amount - year1_cca) * cca_rate

        return {
            "is_capital_asset": True,
            "asset_class": asset_class,
            "cca_class": cca_info["class"],
            "cca_description": cca_info["description"],
            "asset_cost": receipt.amount,
            "user_location": user_location,
            "straight_line_depreciation": {
                "method": "Straight-Line (Accounting)",
                "useful_life_years": useful_life_years,
                "residual_value": round(residual_value, 2),
                "annual_depreciation": round(annual_straight_line, 2),
            },
            "cca_depreciation": {
                "method": "CCA Declining Balance (Tax)",
                "cca_rate": cca_rate,
                "year_1_depreciation": round(year1_cca, 2),
                "year_2_depreciation": round(year2_cca, 2),
            },
            "reason": f"Capital asset identified: {cca_info['description']} - Depreciation calculated based on user location ({user_location})",
        }

    def _calculate_meals_entertainment(self, receipt: Receipt) -> Dict[str, Any]:
        """
        Rule 4: Meals & Entertainment Deductions
        - Tax: 50% of meal cost + 100% of sales tax
        - Accounting: 100% of meal cost + 100% of sales tax
        """
        # Check if this is meals & entertainment
        if not self._is_meals_entertainment(receipt):
            return {
                "is_meals_entertainment": False,
                "reason": "Not classified as meals & entertainment",
            }

        # Calculate pre-tax meal amount (a missing tax counts as 0.0)
        sales_tax = receipt.tax or 0.0
        meal_amount = receipt.amount - sales_tax

        # Tax deduction: 50% of meal + 100% of tax
        tax_deduction = (meal_amount * 0.50) + sales_tax
        # Accounting deduction: 100% of meal + 100% of tax
        accounting_deduction = meal_amount + sales_tax

        return {
            "is_meals_entertainment": True,
            "meal_amount": round(meal_amount, 2),
            "sales_tax": round(sales_tax, 2),
            "total_receipt": round(receipt.amount, 2),
            "tax_deduction_amount": round(tax_deduction, 2),
            "tax_deduction_percentage": 50.0,
            "accounting_deduction_amount": round(accounting_deduction, 2),
            "accounting_deduction_percentage": 100.0,
            "reason": "Meals & Entertainment: 50% deductible for tax purposes, 100% for accounting",
            "breakdown": {
                "meal_cost": round(meal_amount, 2),
                "tax_50_percent": round(meal_amount * 0.50, 2),
                "full_sales_tax": round(sales_tax, 2),
            },
        }

    def _calculate_confidence_adjustment(
        self, analysis: Dict[str, Any]
    ) -> Dict[str, float]:
        """
        Calculate confidence boost/reduction based on tax analysis

        - Sales tax needing review: reduce 0.05; otherwise a discrepancy
          below 2% gives a 0.05 boost.
        - Currency mismatch: reduce 0.10.
        - Capital asset: reduce 0.05.
        """
        boost = 0.0
        reduce = 0.0

        # Sales tax analysis
        sales_tax = analysis.get("sales_tax", {})
        if sales_tax.get("requires_review"):
            reduce += 0.05
        else:
            # Small discrepancy is good
            discrepancy_pct = sales_tax.get("discrepancy_percentage", 0)
            if discrepancy_pct < 2.0:
                boost += 0.05

        # Foreign exchange
        fx = analysis.get("foreign_exchange", {})
        if fx.get("currency_mismatch"):
            reduce += 0.10  # FX always requires review

        # Depreciation - capital assets need review
        depreciation = analysis.get("depreciation", {})
        if depreciation.get("is_capital_asset"):
            reduce += 0.05

        return {"boost": round(boost, 2), "reduce": round(reduce, 2)}

    def _determine_receipt_location(
        self, receipt: Receipt, user_location: str
    ) -> Tuple[str, str]:
        """
        Determine the applicable location for tax calculation
        Priority: shipping address > billing address > user location

        The user location is normalised to a province code when it names a
        province (e.g. "Ontario" or "on"); anything else is returned as
        given so it can be recognised as non-Canadian.

        Returns: (province_code, source_description)
        """
        # Check shipping address first
        if receipt.shipping_address:
            province = self._extract_province_from_address(receipt.shipping_address)
            if province:
                return province, "shipping address"

        # Check billing address
        if receipt.billing_address:
            province = self._extract_province_from_address(receipt.billing_address)
            if province:
                return province, "billing address"

        # Default to user location
        user_province = (
            self._extract_province_from_address(user_location) or user_location
        )
        return user_province, "user location (default)"

    def _extract_province_from_address(self, address: str) -> Optional[str]:
        """
        Extract Canadian province code from address string

        Two-letter codes must appear as stand-alone words; when several
        appear the last one wins because the province normally follows the
        street and city. If no code is present, full province names are
        searched and the right-most one wins.

        Returns None when nothing is recognised.
        """
        if not address:
            return None

        # NOTE(review): sibling code formats addresses through a helper, so
        # addresses may not always be plain strings - str() keeps this from
        # raising; confirm the actual schema type.
        upper = str(address).upper()
        words = "".join(ch if ch.isalnum() else " " for ch in upper).split()

        # Check for province codes
        for word in reversed(words):
            if word in self.PROVINCIAL_TAX_RATES:
                return word

        # Check for full province names (space padding gives whole-word,
        # whole-phrase matching)
        padded = f" {' '.join(words)} "
        best_code = None
        best_position = -1
        for full_name, code in self.PROVINCE_NAMES.items():
            position = padded.rfind(f" {full_name} ")
            if position > best_position:
                best_code, best_position = code, position
        return best_code

    def _is_international_transaction(
        self, receipt_location: str, user_location: str
    ) -> bool:
        """
        Check if this is an international transaction
        (receipt from outside Canada when user is in Canada, or vice versa)

        NOTE(review): receipt_location comes from
        _determine_receipt_location, which only ever yields a Canadian
        province code or the user location. A receipt with a foreign
        address therefore falls back to the user's province and is not
        detected here; country detection would be needed to implement the
        "different country" rule fully.
        """
        # If receipt location is not a Canadian province, it's international
        is_canadian = receipt_location in self.PROVINCIAL_TAX_RATES
        # For now, assume user_location is always Canadian
        # In future, add support for other countries
        return not is_canadian

    @staticmethod
    def _normalize_words(text: Any) -> str:
        """Lower-case ``text``, turn every non-alphanumeric run into a single
        space and pad the result with one space on each side."""
        cleaned = "".join(ch if ch.isalnum() else " " for ch in str(text).lower())
        return f" {' '.join(cleaned.split())} "

    @staticmethod
    def _mentions_keyword(padded_text: str, keywords) -> bool:
        """Return True when any keyword (or its plain "s" plural) occurs in
        ``padded_text`` as a whole word or phrase.

        ``padded_text`` must come from ``_normalize_words``.
        """
        return any(
            f" {keyword} " in padded_text or f" {keyword}s " in padded_text
            for keyword in keywords
        )

    def _identify_asset_class(self, receipt: Receipt) -> Optional[str]:
        """
        Identify the asset class from receipt category and description

        The vendor name is searched as well. Classes are tried in the order
        they are declared in CAPITAL_ASSET_KEYWORDS; the first match wins.
        """
        search_text = self._normalize_words(
            f"{receipt.category} {receipt.description} {receipt.vendor}"
        )
        for asset_class, keywords in self.CAPITAL_ASSET_KEYWORDS.items():
            if self._mentions_keyword(search_text, keywords):
                return asset_class
        return None

    def _is_meals_entertainment(self, receipt: Receipt) -> bool:
        """
        Check if receipt is for meals & entertainment

        True when the receipt's ``is_meals_entertainment`` flag is set, or
        when its category, description or vendor mentions an M&E keyword.
        """
        # Check explicit flag first
        if getattr(receipt, "is_meals_entertainment", False):
            return True

        # Check category and description
        search_text = self._normalize_words(
            f"{receipt.category} {receipt.description} {receipt.vendor}"
        )
        return self._mentions_keyword(search_text, self.MEALS_ENTERTAINMENT_KEYWORDS)

    def format_analysis_summary(self, analysis: Dict[str, Any]) -> str:
        """
        Format the tax analysis into a human-readable summary

        Expects the structure produced by calculate_tax_analysis; sections
        that are missing are rendered with their "nothing to report" text.
        """
        lines = ["=== Tax Analysis Summary ===", ""]

        # Sales Tax
        st = analysis.get("sales_tax", {})
        lines.append("1. SALES TAX:")
        if st.get("is_international"):
            lines.append(f" - {st['reason']}")
            lines.append(" - ⚠️ Review Required: International Transaction")
        else:
            lines.append(f" - Province: {st.get('applicable_province', 'N/A')}")
            lines.append(
                f" - Tax Rate: {st.get('applicable_rate', 0) * 100:.2f}% ({st.get('tax_name', 'N/A')})"
            )
            lines.append(f" - Calculated Tax: ${st.get('calculated_tax', 0):.2f}")
            lines.append(f" - Stated Tax: ${st.get('stated_tax', 0):.2f}")
            lines.append(
                f" - Discrepancy: ${st.get('discrepancy', 0):.2f} ({st.get('discrepancy_percentage', 0):.1f}%)"
            )
            if st.get("requires_review"):
                lines.append(" - ⚠️ Review Required: Tax discrepancy > 5%")
        lines.append("")

        # Foreign Exchange
        fx = analysis.get("foreign_exchange", {})
        lines.append("2. FOREIGN EXCHANGE:")
        if fx.get("currency_mismatch"):
            lines.append(
                f" - Currency Mismatch: {fx['receipt_currency']} vs {fx['transaction_currency']}"
            )
            lines.append(f" - Receipt Amount: ${fx['receipt_amount']:.2f}")
            lines.append(f" - Transaction Amount: ${fx['transaction_amount']:.2f}")
            lines.append(f" - Discrepancy: ${fx['discrepancy']:.2f}")
            lines.append(" - ⚠️ Manual Review Required")
        else:
            lines.append(" - No currency mismatch")
        lines.append("")

        # Depreciation
        dep = analysis.get("depreciation", {})
        lines.append("3. DEPRECIATION:")
        if dep.get("is_capital_asset"):
            lines.append(f" - Capital Asset: Yes ({dep['cca_description']})")
            lines.append(f" - Asset Cost: ${dep['asset_cost']:.2f}")
            lines.append(
                f" - CCA Class: {dep['cca_class']} ({dep['cca_depreciation']['cca_rate'] * 100:.0f}%)"
            )
            lines.append(
                f" - Year 1 CCA: ${dep['cca_depreciation']['year_1_depreciation']:.2f}"
            )
            lines.append(
                f" - Annual Straight-Line: ${dep['straight_line_depreciation']['annual_depreciation']:.2f}"
            )
        else:
            lines.append(" - Not a capital asset")
        lines.append("")

        # Meals & Entertainment
        me = analysis.get("meals_entertainment", {})
        lines.append("4. MEALS & ENTERTAINMENT:")
        if me.get("is_meals_entertainment"):
            lines.append(" - Type: Meals & Entertainment Expense")
            lines.append(f" - Meal Amount: ${me['meal_amount']:.2f}")
            lines.append(f" - Sales Tax: ${me['sales_tax']:.2f}")
            lines.append(f" - Tax Deduction (50%): ${me['tax_deduction_amount']:.2f}")
            lines.append(
                f" - Accounting Deduction (100%): ${me['accounting_deduction_amount']:.2f}"
            )
        else:
            lines.append(" - Not a meals & entertainment expense")
        lines.append("")

        # Confidence Adjustment
        conf = analysis.get("confidence_adjustment", {})
        lines.append("CONFIDENCE ADJUSTMENT:")
        lines.append(f" - Boost: +{conf.get('boost', 0):.2f}")
        lines.append(f" - Reduce: -{conf.get('reduce', 0):.2f}")

        return "\n".join(lines)
+312
View File
@@ -0,0 +1,312 @@
from typing import Any, Dict, List, Optional
from schemas import Match, Receipt, Transaction
from services.ai_matcher import AIMatcher
from services.ai_rules import AIRulesEngine
from services.ai_rules_matcher import AIRulesMatcher
from services.feedback_logger import FeedbackLogger
from services.llm_tax_analyzer import LLMTaxAnalyzer
from services.manual_tax_calculator import ManualTaxCalculator
class MatchingEngine:
def __init__(self, use_manual_tax_calculator: bool = False):
    """Create the engine together with the service objects it delegates to.

    Args:
        use_manual_tax_calculator: Stored on the instance as
            ``use_manual_tax_calculator``; intended to select the rule-based
            calculator over the LLM analyzer for tax analysis.
    """
    # Plain configuration flag - no side effects, so it is recorded first.
    self.use_manual_tax_calculator = use_manual_tax_calculator

    # Collaborators are created in their original order in case any
    # constructor has side effects.
    self.ai_matcher = AIMatcher()
    self.rules_engine = AIRulesEngine()
    self.feedback_logger = FeedbackLogger()
    self.llm_tax_analyzer = LLMTaxAnalyzer()
    self.manual_tax_calculator = ManualTaxCalculator()
    self.ai_rules_matcher = AIRulesMatcher()
def process_matching(
    self,
    receipts: List[Receipt],
    transactions: List[Transaction],
    user_location: str = "ON",
    ai_rules: Optional[List[Dict]] = None,
) -> List[Match]:
    """Match receipts to transactions, then run the rules matcher over them.

    Args:
        receipts: Receipts to match.
        transactions: Candidate transactions.
        user_location: Accepted for interface compatibility; not used by
            the current pipeline.
        ai_rules: Optional custom rules. A missing or empty list is passed
            to the rules matcher as ``None``.

    Returns:
        Whatever ``ai_rules_matcher.apply_rules_to_matches`` returns for the
        AI-produced matches.
    """
    candidate_matches = self.ai_matcher.match_receipts_to_transactions(
        receipts, transactions
    )

    # NOTE: the traditional rules-engine pass (confidence boost and
    # auto-approve) and the manual tax analysis step are currently disabled
    # in this pipeline; only the post-matching rules evaluation runs.

    # The rules matcher is always invoked. With no custom rules it receives
    # None and, per the original note, still applies its built-in rules
    # (e.g., currency mismatch) that set flag_for_review / auto_approve.
    custom_rules = ai_rules if ai_rules else None
    return self.ai_rules_matcher.apply_rules_to_matches(
        candidate_matches, custom_rules
    )
def _enhance_match_with_rules(
self, match: Match, user_location: str = "ON"
) -> Match:
"""
Enhanced version using LLM to intelligently apply tax rules:
1. Sales tax based on receipt location (shipping/billing address priority)
2. Foreign exchange rules for currency mismatches
3. Depreciation rules for capital assets (based on user location)
4. Meals & Entertainment tax deduction rules (50% for tax, 100% for accounting)
"""
# First, apply traditional rule-based checks for basic matching quality
rule_results = self.rules_engine.apply_rules(match.receipt, match.transaction)
# Apply confidence boost from traditional rules
if rule_results["confidence_boost"] > 0:
match.confidence_score = min(
1.0, match.confidence_score + rule_results["confidence_boost"]
)
# Auto-approve if rules say so
if rule_results["auto_approve"]:
match.confidence_score = 1.0
match.match_reason += " (Auto-approved by rules)"
# Now apply LLM-based tax analysis
try:
llm_tax_analysis = self.llm_tax_analyzer.analyze_and_apply_tax_rules(
match.receipt, match.transaction, user_location
)
# Store the complete tax analysis
match.tax_analysis = llm_tax_analysis
# Apply confidence adjustments based on tax analysis
confidence_adj = llm_tax_analysis.get("confidence_adjustment", {})
# Boost confidence if tax rules validate the match
boost = confidence_adj.get("boost", 0.0)
if boost > 0:
match.confidence_score = min(1.0, match.confidence_score + boost)
match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
# Reduce confidence if tax issues detected
reduce = confidence_adj.get("reduce", 0.0)
if reduce > 0:
match.confidence_score = max(0.0, match.confidence_score - reduce)
match.match_reason += f" (Tax issues detected: -{reduce:.2f})"
# Add flags for manual review if needed
review_flags = []
# Check sales tax issues
sales_tax = llm_tax_analysis.get("sales_tax", {})
if sales_tax.get("requires_review", False):
review_flags.append("Sales Tax Review Required")
# Check FX issues
fx_analysis = llm_tax_analysis.get("foreign_exchange", {})
if fx_analysis.get("requires_manual_review", False):
review_flags.append(
f"FX Review Required (Discrepancy: ${fx_analysis.get('discrepancy', 0):.2f})"
)
# Check depreciation
depreciation = llm_tax_analysis.get("depreciation", {})
if depreciation.get("is_capital_asset", False):
review_flags.append(
f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
)
# Check meals & entertainment
meals_ent = llm_tax_analysis.get("meals_entertainment", {})
if meals_ent.get("is_meals_entertainment", False):
tax_deduction = meals_ent.get("tax_deduction_amount", 0)
accounting_deduction = meals_ent.get("accounting_deduction_amount", 0)
review_flags.append(
f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
)
# Add review flags to match reason
if review_flags:
match.match_reason += " | REVIEW: " + "; ".join(review_flags)
except Exception as e:
# If LLM analysis fails, log it and continue with traditional rules
import logging
logging.error(f"LLM tax analysis failed: {str(e)}")
match.match_reason += " (Note: Advanced tax analysis unavailable)"
# Fall back to traditional tax rules if available
if rule_results.get("tax_analysis"):
match.tax_analysis = rule_results["tax_analysis"]
return match
def _apply_manual_tax_analysis(
self, matches: List[Match], user_location: str = "ON"
) -> List[Match]:
"""
Apply deterministic rule-based tax analysis to all matches
No LLM calls - pure business logic for consistent results
"""
import logging
logger = logging.getLogger(__name__)
logger.info(
f"Applying manual tax analysis to {len(matches)} matches using rule-based calculator"
)
enhanced_matches = []
for match in matches:
try:
# Get comprehensive tax analysis from manual calculator
tax_analysis = self.manual_tax_calculator.calculate_tax_analysis(
match.receipt, match.transaction, user_location
)
# Store the complete tax analysis
match.tax_analysis = tax_analysis
# Apply confidence adjustments
confidence_adj = tax_analysis.get("confidence_adjustment", {})
# Boost confidence if tax rules validate the match
boost = confidence_adj.get("boost", 0.0)
if boost > 0:
match.confidence_score = min(1.0, match.confidence_score + boost)
match.match_reason += f" (Tax validated: +{boost:.2f})"
# Reduce confidence if tax issues detected
reduce = confidence_adj.get("reduce", 0.0)
if reduce > 0:
match.confidence_score = max(0.0, match.confidence_score - reduce)
match.match_reason += f" (Tax issues: -{reduce:.2f})"
# Add flags for manual review
review_flags = []
# Sales tax issues
sales_tax = tax_analysis.get("sales_tax", {})
if sales_tax.get("requires_review"):
if sales_tax.get("is_international"):
review_flags.append("International Transaction - FX Review")
else:
discrepancy_pct = sales_tax.get("discrepancy_percentage", 0)
review_flags.append(
f"Sales Tax Discrepancy: {discrepancy_pct:.1f}%"
)
# FX issues
fx = tax_analysis.get("foreign_exchange", {})
if fx.get("currency_mismatch"):
review_flags.append(
f"FX: {fx['receipt_currency']}{fx['transaction_currency']} (${fx['discrepancy']:.2f})"
)
# Capital asset depreciation
depreciation = tax_analysis.get("depreciation", {})
if depreciation.get("is_capital_asset"):
cca_class = depreciation.get("cca_class", "Unknown")
year1_cca = depreciation.get("cca_depreciation", {}).get(
"year_1_depreciation", 0
)
review_flags.append(
f"Capital Asset ({cca_class}) - Year 1 CCA: ${year1_cca:.2f}"
)
# Meals & entertainment
meals_ent = tax_analysis.get("meals_entertainment", {})
if meals_ent.get("is_meals_entertainment"):
tax_deduction = meals_ent.get("tax_deduction_amount", 0)
accounting_deduction = meals_ent.get(
"accounting_deduction_amount", 0
)
review_flags.append(
f"M&E: Tax ${tax_deduction:.2f} (50%), Accounting ${accounting_deduction:.2f} (100%)"
)
# Add review flags to match reason
if review_flags:
match.match_reason += " | " + "; ".join(review_flags)
enhanced_matches.append(match)
except Exception as e:
logger.error(
f"Manual tax analysis failed for match: {str(e)}", exc_info=True
)
match.match_reason += " (Tax analysis failed)"
enhanced_matches.append(match)
logger.info(
f"Manual tax analysis completed for {len(enhanced_matches)} matches"
)
return enhanced_matches
def approve_match(self, match: Match, user_id: str):
# Log the approval
self.feedback_logger.log_override(
transaction_id=match.transaction.id,
original_match=f"AI Score: {match.confidence_score}",
correction="Approved",
reason="User approved match",
user_id=user_id,
)
def reject_match(self, match: Match, reason: str, user_id: str):
# Log the rejection
self.feedback_logger.log_override(
transaction_id=match.transaction.id,
original_match=f"AI Score: {match.confidence_score}",
correction="Rejected",
reason=reason,
user_id=user_id,
)
def get_matching_stats(self, matches: List[Match]) -> Dict[str, Any]:
if not matches:
return {
"total": 0,
"high_confidence": 0,
"low_confidence": 0,
"avg_score": 0,
}
high_confidence = len([m for m in matches if m.confidence_score >= 0.8])
low_confidence = len([m for m in matches if m.confidence_score < 0.8])
avg_score = sum(m.confidence_score for m in matches) / len(matches)
return {
"total": len(matches),
"high_confidence": high_confidence,
"low_confidence": low_confidence,
"avg_score": round(avg_score, 3),
}
+96
View File
@@ -0,0 +1,96 @@
rule = '''
### Rule Scenarios
Impact of Signup Fields on Tax Calculation and Receipt Matching
**Impact of Signup Fields (Country and Province/State) on Tax Calculation and Matching**
**Scenario 1:** User Location (Canada, Ontario) but Receipt from Another Location (e.g., Quebec)
User's Location: Canada, Ontario (for tax and depreciation purposes).
Receipt Location: The receipt comes from Quebec (the tax rules in Quebec are different from Ontario).
What Happens:
The sales tax rate should be applied based on the location of the receipt, not the user's profile location.
**For example:**
The user in Ontario will have 13% HST applied to their purchases.
If the receipt is from Quebec, the QST (Quebec Sales Tax) of 9.975% applies instead.
**Scenario 2:** User Location (Canada, Ontario) and Receipt Location is Different Country (e.g., USA)
User's Location: Canada, Ontario.
Receipt Location: The receipt is from a business in the USA (e.g., New York).
**What Happens:**
Sales Tax should not be applied for international transactions (USA in this case) unless the user is importing or there is a customs duty involved.
The system will not apply a Canadian sales tax to the receipt from the USA, but the foreign exchange (FX) rule will apply because there is a mismatch between currencies (USD vs. CAD).
**Scenario 3:** User Location (USA, New York) but Receipt from Another Location in the Same Country (e.g., California)
User's Location: USA, New York (for tax purposes).
Receipt Location: The receipt is from California (still in the USA, but the sales tax rate is different).
**What Happens:**
Sales tax should be applied based on the location of the receipt, not the user's location, since the receipt was issued in California.
California may have a different sales tax rate than New York.
**Scenario 4:** User Location (Canada, Ontario) and Receipt Location with No Address Information
User's Location: Canada, Ontario.
Receipt Location: The receipt contains no clear shipping or billing address.
**What Happens:**
If the receipt does not have a clear location, the system will default to the user's location for sales tax and depreciation.
Action:
Sales Tax: Apply the sales tax rate based on the user's location (Ontario). For example, 13% HST will be applied.
Depreciation: Apply the depreciation rules based on the user's location (Ontario), even if the receipt doesn't have address information.
**Summary of Actions in These Scenarios:**
Sales Tax: If the receipt is from a different location (same country or foreign), use the location from the receipt for sales tax calculation.
If the receipt is from a different country, don't apply sales tax from the user's country but flag the FX discrepancy.
If the location is missing, apply the user's location sales tax by default.
**Depreciation:** Always apply depreciation rules based on the user's location, regardless of where the receipt is from.
**FX (Foreign Exchange):** If the receipt is in a different currency, flag the FX difference for manual review but don't fetch exchange rates.
### 2. **Foreign Exchange (FX) Rule**
**Purpose**: To handle discrepancies when transactions and receipts are in different currencies (e.g., USD vs. CAD).
- **Action**: Identify the currency mismatch, but do not automatically fetch the exchange rate. Flag the FX difference for manual review, allowing the user to approve or adjust the balance.
**Example**:
1. A transaction in USD for $100, matched to a receipt in CAD for $125, results in an FX discrepancy of $25.
2. The system flags the discrepancy for manual review by the user. The user can then approve the difference or adjust the amounts manually.
### 3. **Depreciation Rule**
**Purpose**: To calculate the depreciation for assets based on the Straight-Line Method (for accounting) or CCA Depreciation (Declining Balance) for tax purposes.
**Action**:
- Apply Straight-Line Depreciation (for accounting) across the asset's useful life.
- Apply CCA Depreciation (for tax purposes) using a declining balance method.
**Example**:
1. Straight-Line Depreciation: An asset purchased for $10,000, with a 5-year useful life and a residual value of $1,000, will have an annual depreciation of:
- (10,000 - 1,000)/5 = 1,800 per year for 5 years.
2. CCA Depreciation: A truck purchased for $20,000, eligible for 30% CCA per year. The depreciation will be:
- Year 1: 20,000 x 30% = $6,000
- Year 2: (20,000 - 6,000) x 30% = $4,200
- The depreciation will decline each year as the book value reduces.
### 4. **Meals & Entertainment Tax Deduction Rule**
**Purpose**: To apply the correct tax deduction for Meals & Entertainment expenses.
**Action**:
- For Tax Purposes: Only 50% of the total receipt amount is deductible.
- For Accounting Purposes: 100% of the total receipt amount is deductible.
- Sales Tax: The full sales tax will be deducted for accounting purposes.
**Example**:
1. A $100 meal receipt for a business dinner:
- **Tax Purposes**: Only $50 of the total amount is deductible.
- **Accounting Purposes**: The full $100 is deductible.
2. If the sales tax on the meal is $12, the entire $12 is included in the accounting deduction, but for tax purposes, the $50 deduction will reflect the adjusted amount after the 50% rule is applied.
### **When Location on Receipt is Different from User's Location**
**1. Sales Tax**:
- **Scenario 1**: If the **receipt's location** is different (e.g., receipt from Quebec for a user in Ontario), the **sales tax** is applied based on the **receipt's location** (Quebec sales tax).
- **Scenario 2**: If the **receipt** is from a different **country** (e.g., USA), the **system flags** the **currency mismatch** but does not apply **Canadian sales tax**.
**2. Depreciation**:
- Depreciation is always calculated based on the **user's location**, not the receipt's location.
- **Depreciation Method** for **Canada (Ontario)**: **CCA method** will apply, regardless of where the receipt comes from.
**3. FX Handling**:
- If the receipt is in a different **currency** (e.g., USD for a CAD-based user), the system will **flag FX differences** for manual review but won't fetch exchange rates.
**4. General Process**:
- When the **receipt location** is different from the **user's location**, ensure that the **tax and depreciation** are correctly applied based on the **receipt's data**.
- For **foreign transactions**, ensure that **FX differences** are flagged for user review.
- For **missing location information**, apply the **user's location** by default for tax and depreciation.
'''
@@ -1,7 +1,7 @@
import logging
from typing import Any, Dict, Optional
from models import Address, Asset, Receipt, Transaction
from schemas import Address, Asset, Receipt, Transaction
logger = logging.getLogger(__name__)
-15
View File
@@ -1,15 +0,0 @@
import os
from dotenv import load_dotenv
load_dotenv()
# Read the Groq API key from the environment (load_dotenv() above populates it
# from a local .env file when one exists).
# SECURITY: no fallback value is supplied on purpose. A real key must never be
# embedded in source control; any key that was committed here previously should
# be treated as compromised and rotated.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail fast at import time when the key is missing or still the template
# placeholder, instead of failing later on the first API call.
if not GROQ_API_KEY or GROQ_API_KEY == "your_api_key_here":
    raise ValueError("GROQ_API_KEY environment variable is not set or invalid. Please set it in your .env file.")

# Matching tunables exported by this module.
CONFIDENCE_THRESHOLD = 0.3
DATE_TOLERANCE_DAYS = 7
AMOUNT_TOLERANCE_PERCENT = 0.05
-75
View File
@@ -1,75 +0,0 @@
from typing import Annotated
from fastapi import Depends
from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, sessionmaker
# SQLite database stored in the file ./sql_app.db (relative path).
SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db"
# check_same_thread=False is passed through to the SQLite driver so the
# connection may be used from a thread other than the one that created it.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
# Session factory bound to the engine; autocommit and autoflush are disabled,
# so callers commit explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db():
    """Yield a new database session and always close it afterwards.

    Written as a generator so it can be used as a ``Depends`` provider: the
    code after ``yield`` runs once the consumer is done with the session.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
# Annotated alias: a parameter typed with this receives a Session from get_db().
db_dependency = Annotated[Session, Depends(get_db)]
# Declarative base class that the ORM models in this module inherit from.
Base = declarative_base()
def create_db_tables():
    """Create every table registered on ``Base`` using the module engine."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
def clear_all_data():
    """Delete all transaction and receipt rows, then commit (useful for testing)."""
    session = SessionLocal()
    try:
        # Same order as before: transactions first, then receipts.
        for model in (Transaction, Receipt):
            session.query(model).delete()
        session.commit()
    finally:
        session.close()
# Transactions table
class Transaction(Base):
    """ORM model mapped to the ``transactions`` table."""

    __tablename__ = "transactions"
    # Surrogate integer primary key.
    id = Column(Integer, primary_key=True, index=True)
    # Unique, indexed string identifier for the transaction.
    transaction_id = Column(String, unique=True, index=True)
    # Required fields.
    # NOTE(review): amounts are stored as Float; confirm that binary floating
    # point is acceptable for monetary values here.
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)
    # Optional fields.
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    categorisation_id = Column(String, nullable=True)
    user_id = Column(String, nullable=True)
# Receipts table
class Receipt(Base):
    """ORM model mapped to the ``receipts`` table."""

    __tablename__ = "receipts"
    # Surrogate integer primary key.
    id = Column(Integer, primary_key=True, index=True)
    # Unique, indexed string identifiers for the receipt and its source file.
    receipt_id = Column(String, unique=True, index=True)
    file_id = Column(String, unique=True, index=True)
    # Required fields.
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)
    # Optional fields.
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    confidence = Column(Float, nullable=True)
    # NOTE(review): stored as a String although the name suggests a flag;
    # confirm what values writers put here.
    extraction_success = Column(String, nullable=True)
    error_message = Column(String, nullable=True)
-498
View File
@@ -1,498 +0,0 @@
import groq
import base64
import io
from PIL import Image
import PyPDF2
from typing import Dict, Any, List, Optional
import config
import os
import aiofiles
from datetime import datetime
import logging
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Extract structured receipt and transaction data from files using Groq.

    Images are sent to a vision-capable chat model; PDFs are reduced to text
    with PyPDF2 and sent as a text prompt.  All public extraction methods
    return plain dicts and report failures through an ``"error"`` key instead
    of raising.
    """

    # File extensions (lower case, no dot) handled by the vision model.
    _IMAGE_TYPES = ("jpg", "jpeg", "png", "gif", "bmp")

    _RECEIPT_IMAGE_PROMPT = """
Analyze this receipt image and extract the following information in JSON format:
{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other",
"confidence": 0.95
}
Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available
- Date should be the date on the receipt
- Categorize based on vendor type (Starbucks=Food, Shell=Transport, etc.)
- Confidence score 0-1 based on how clear the receipt is
Return only valid JSON.
"""

    # ``str.format`` template: literal braces are doubled, ``{text_content}``
    # is the only placeholder.
    _RECEIPT_TEXT_PROMPT_TEMPLATE = """
Analyze this receipt text and extract the following information in JSON format:
Receipt Text:
{text_content}
Extract:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Food/Transport/Office/Other",
"confidence": 0.95
}}
Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available
- Date should be the date on the receipt
- Categorize based on vendor type
- Confidence score 0-1 based on clarity
Return only valid JSON.
"""

    _TRANSACTION_IMAGE_PROMPT = """
Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.
Look for transaction lists, payment records, or any financial entries that show:
- Date
- Amount (positive or negative)
- Vendor/Description/Payee name
- Any additional notes or memo
Return the transactions as a JSON array:
{
"extraction_success": true,
"transactions": [
{
"date": "YYYY-MM-DD",
"amount": 0.00,
"vendor": "Vendor name",
"memo": "Additional notes"
},
{
"date": "YYYY-MM-DD",
"amount": -0.00,
"vendor": "Another vendor",
"memo": "Payment or charge description"
}
]
}
Rules:
- Extract ALL visible transactions
- Include both positive (credits) and negative (debits) amounts
- Use the actual date format from the document
- Vendor should be the merchant/payee name
- Memo can include transaction type, reference numbers, etc.
- If no transactions found, return empty array but set extraction_success to true
Return only valid JSON.
"""

    def __init__(self):
        """Create the Groq client from ``config.GROQ_API_KEY`` and pick the model."""
        self.client = groq.Groq(api_key=config.GROQ_API_KEY)
        self.model = "meta-llama/llama-4-scout-17b-16e-instruct"  # Vision model

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------
    async def process_file(self, file_path: str, file_type: str) -> Dict[str, Any]:
        """Process an uploaded file and extract receipt data.

        Args:
            file_path: Path of the file on disk.
            file_type: File extension such as ``"png"`` or ``"pdf"``; case,
                surrounding whitespace and a leading dot are ignored.

        Returns:
            The extracted fields, or ``{"error": message}`` on any failure
            (including an unsupported ``file_type``).
        """
        try:
            kind = file_type.strip().lower().lstrip(".")
            if kind in self._IMAGE_TYPES:
                return await self._process_image(file_path)
            if kind == "pdf":
                return await self._process_pdf(file_path)
            raise ValueError(f"Unsupported file type: {file_type}")
        except Exception as e:
            return {"error": str(e)}

    async def _process_image(self, image_path: str) -> Dict[str, Any]:
        """Extract receipt data from an image with the vision model."""
        try:
            result_text = self._vision_completion(
                self._RECEIPT_IMAGE_PROMPT, image_path, max_tokens=500
            )
            return self._parse_extraction_result(result_text)
        except Exception as e:
            return {"error": f"Image processing error: {str(e)}"}

    # ------------------------------------------------------------------
    # Model access helpers
    # ------------------------------------------------------------------
    def _encode_image(self, image_path: str) -> str:
        """Return the file's bytes as a base64-encoded ASCII string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _image_data_url(self, image_path: str) -> str:
        """Build a ``data:`` URL for the image with a MIME type matching its extension.

        Falls back to ``image/jpeg`` (the previously hard-coded value) when
        the extension is unknown or not an image type.
        """
        import mimetypes

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"
        return f"data:{mime_type};base64,{self._encode_image(image_path)}"

    def _vision_completion(self, prompt: str, image_path: str, max_tokens: int) -> str:
        """Send ``prompt`` plus the image to the chat model and return its text reply."""
        response = self.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": self._image_data_url(image_path)},
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=max_tokens,
            temperature=0.1,
        )
        # A reply without content is treated as empty text rather than crashing.
        return (response.choices[0].message.content or "").strip()

    # ------------------------------------------------------------------
    # PDF handling
    # ------------------------------------------------------------------
    async def _process_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Extract receipt data from a PDF via its embedded text.

        Returns an error dict when the PDF yields no text (for example a
        scanned document), rather than asking the model to analyse nothing.
        """
        try:
            text_content = self._extract_text_from_pdf(pdf_path)
            if not text_content.strip():
                return {"error": "PDF processing error: no extractable text found in PDF"}
            return self._process_text_content(text_content)
        except Exception as e:
            return {"error": f"PDF processing error: {str(e)}"}

    def _extract_text_from_pdf(self, pdf_path: str) -> str:
        """Return the text of every page, one page per line group; ``""`` on failure."""
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # extract_text() may yield None for a page without text; treat
                # that as empty instead of losing the whole document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in pdf_reader.pages
                )
        except Exception as e:
            logger.warning("Failed to extract text from PDF %s: %s", pdf_path, e)
            return ""

    def _process_text_content(self, text_content: str) -> Dict[str, Any]:
        """Extract receipt data from plain text using the chat model (used for PDFs)."""
        try:
            prompt = self._RECEIPT_TEXT_PROMPT_TEMPLATE.format(text_content=text_content)
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.1,
            )
            result_text = (response.choices[0].message.content or "").strip()
            return self._parse_extraction_result(result_text)
        except Exception as e:
            return {"error": f"Text processing error: {str(e)}"}

    # ------------------------------------------------------------------
    # Receipt response parsing
    # ------------------------------------------------------------------
    @staticmethod
    def _coerce_float(value, default=0.0):
        """Convert a model-supplied number to float.

        ``None`` and unparsable values give ``default``; ``$`` and thousands
        separators in strings are ignored.
        """
        if value is None:
            return default
        if isinstance(value, str):
            value = value.replace("$", "").replace(",", "").strip()
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _load_receipt_json(self, raw_json: str) -> Dict[str, Any]:
        """Decode the model's JSON, repairing it only when a plain parse fails.

        Order matters: the key-quoting repair can damage valid JSON whose
        string values contain ``, word:``, so it must not run on text that
        already parses.  If the repaired text still fails, individual fields
        are pulled out with regular expressions.
        """
        import json
        import re

        try:
            return json.loads(raw_json)
        except json.JSONDecodeError:
            pass

        repaired = re.sub(r',\s*([}\]])', r'\1', raw_json)  # Remove trailing commas
        repaired = re.sub(
            r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', repaired
        )  # Quote unquoted keys
        try:
            return json.loads(repaired)
        except json.JSONDecodeError as e:
            logger.warning(f"Initial JSON parsing failed: {e}")

        def _text_field(name, default):
            found = re.search(r'"%s"\s*:\s*"([^"]*)"' % name, repaired)
            return found.group(1) if found else default

        def _number_field(name, default):
            found = re.search(r'"%s"\s*:\s*([0-9.]+)' % name, repaired)
            return self._coerce_float(found.group(1), default) if found else default

        return {
            "vendor": _text_field("vendor", ""),
            "description": _text_field("description", ""),
            "total_amount": _number_field("total_amount", 0.0),
            "tax_amount": _number_field("tax_amount", 0.0),
            "date": _text_field("date", ""),
            "category": _text_field("category", "Other"),
            "confidence": _number_field("confidence", 0.5),
        }

    def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
        """Turn the model's reply into a normalised receipt dict.

        The first ``{`` through the last ``}`` is treated as JSON.  When the
        reply holds no braces the plain-text extractor is used instead.

        Returns:
            A dict with ``vendor``, ``description``, ``total_amount``,
            ``tax_amount``, ``date``, ``category``, ``confidence`` and
            ``extraction_success``; or an ``error`` dict with
            ``extraction_success`` False if parsing raises.
        """
        import re

        try:
            json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
            if not json_match:
                logger.warning("No JSON found in response, attempting text extraction")
                return self._extract_from_plain_text(result_text)

            data = self._load_receipt_json(json_match.group())
            return {
                # "or" also maps JSON null to the default instead of "None".
                "vendor": str(data.get("vendor") or "").strip(),
                "description": str(data.get("description") or "").strip(),
                "total_amount": self._coerce_float(data.get("total_amount"), 0.0),
                "tax_amount": self._coerce_float(data.get("tax_amount"), 0.0),
                "date": str(data.get("date") or "").strip(),
                "category": str(data.get("category") or "Other").strip(),
                "confidence": self._coerce_float(data.get("confidence"), 0.5),
                "extraction_success": True,
            }
        except Exception as e:
            logger.error(f"JSON parsing error: {str(e)}")
            return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}

    def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
        """Best-effort receipt extraction from free text when no JSON is present.

        Returns the same keys as ``_parse_extraction_result`` (``description``
        is always empty here) with a low ``confidence``.
        """
        try:
            import re

            # Extract vendor (look for common patterns)
            vendor_patterns = [
                r'(?:vendor|store|merchant|company)\s*[:\-]?\s*([A-Za-z0-9\s&.,]+)',
                r'([A-Z][A-Za-z0-9\s&.,]{3,30})',  # Capitalized words
            ]
            vendor = ""
            for pattern in vendor_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    vendor = match.group(1).strip()
                    break

            # Most specific pattern first: a labelled total beats a
            # dollar-prefixed number, which beats the first bare number.
            # Every pattern requires a leading digit so a lone comma cannot
            # match.
            amount_patterns = [
                r'\b(?:total|amount|sum)\s*[:\-]?\s*\$?\s*([0-9][0-9,]*\.?[0-9]*)',
                r'\$\s*([0-9][0-9,]*\.?[0-9]*)',
                r'([0-9][0-9,]*\.?[0-9]*)',
            ]
            total_amount = 0.0
            for pattern in amount_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    try:
                        total_amount = float(match.group(1).replace(',', ''))
                        break
                    except ValueError:
                        continue

            # Extract date
            date_patterns = [
                r'(\d{4}-\d{2}-\d{2})',
                r'(\d{1,2}/\d{1,2}/\d{2,4})',
                r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}',
            ]
            date = ""
            for pattern in date_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    date = match.group(0)
                    break

            return {
                "vendor": vendor or "Unknown",
                "description": "",
                "total_amount": total_amount,
                "tax_amount": 0.0,
                "date": date or "",
                "category": "Other",
                "confidence": 0.3,  # Low confidence for text extraction
                "extraction_success": True,
            }
        except Exception as e:
            logger.error(f"Text extraction error: {str(e)}")
            return {
                "vendor": "Unknown",
                "description": "",
                "total_amount": 0.0,
                "tax_amount": 0.0,
                "date": "",
                "category": "Other",
                "confidence": 0.1,
                "extraction_success": False,
                "error": f"Text extraction failed: {str(e)}",
            }

    # ------------------------------------------------------------------
    # Upload storage
    # ------------------------------------------------------------------
    async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
        """Write an upload into the ``uploads`` directory and return its path.

        Only the final path component of ``filename`` is used, so a
        client-supplied name containing directory separators cannot direct the
        write anywhere but ``uploads``.

        Raises:
            Exception: if the file cannot be written (original error chained).
        """
        try:
            upload_dir = "uploads"
            os.makedirs(upload_dir, exist_ok=True)

            base_name = os.path.basename(filename.replace("\\", "/")) or "upload"
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_filename = f"{timestamp}_{base_name.replace(' ', '_')}"
            file_path = os.path.join(upload_dir, safe_filename)

            async with aiofiles.open(file_path, 'wb') as f:
                await f.write(file_content)
            return file_path
        except Exception as e:
            raise Exception(f"Failed to save file: {str(e)}") from e

    # ------------------------------------------------------------------
    # Statement (multi-transaction) extraction
    # ------------------------------------------------------------------
    async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract multiple transactions from an image (bank statement, credit card statement, etc.)"""
        try:
            # Higher token limit than receipts: the reply lists many transactions.
            result_text = self._vision_completion(
                self._TRANSACTION_IMAGE_PROMPT, image_path, max_tokens=2000
            )
            return self._parse_transaction_extraction_result(result_text)
        except Exception as e:
            return {
                "extraction_success": False,
                "error": f"Transaction extraction error: {str(e)}",
                "transactions": [],
            }

    def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:
        """Parse the model's reply for transaction extraction.

        Returns:
            ``extraction_success``, the cleaned ``transactions`` list and
            ``total_transactions``; or ``extraction_success`` False with an
            ``error`` and an empty list.  Entries whose amount cannot be read
            as a number, or that are not objects, are skipped.
        """
        import json
        import re

        try:
            # Find the first '{' and last '}'
            start = result_text.find('{')
            end = result_text.rfind('}')
            if start == -1 or end == -1 or end <= start:
                return {
                    "extraction_success": False,
                    "error": "Could not find JSON object in AI response",
                    "transactions": [],
                }

            json_str = result_text[start:end + 1]
            # Remove trailing commas before } or ]
            json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
            try:
                data = json.loads(json_str)
            except Exception as e:
                logger.error(f"JSON parsing error: {str(e)}")
                logger.error(f"Offending JSON string:\n{json_str}")
                return {
                    "extraction_success": False,
                    "error": f"JSON parsing error: {str(e)}",
                    "transactions": [],
                }

            cleaned_transactions = []
            for txn in data.get("transactions") or []:
                try:
                    cleaned_transactions.append(
                        {
                            "date": str(txn.get("date", "")).strip(),
                            "amount": float(
                                str(txn.get("amount", 0)).replace('$', '').replace(',', '')
                            ),
                            "vendor": str(txn.get("vendor", "")).strip(),
                            "memo": str(txn.get("memo", "")).strip(),
                        }
                    )
                except Exception as e:
                    logger.debug("Skipping malformed transaction %r: %s", txn, e)
                    continue

            return {
                "extraction_success": data.get("extraction_success", True),
                "transactions": cleaned_transactions,
                "total_transactions": len(cleaned_transactions),
            }
        except Exception as e:
            logger.error(f"JSON parsing error (outer): {str(e)}")
            return {
                "extraction_success": False,
                "error": f"JSON parsing error: {str(e)}",
                "transactions": [],
            }

    # ------------------------------------------------------------------
    # Date normalisation
    # ------------------------------------------------------------------
    def _parse_date_to_iso(self, date_str: str) -> Optional[str]:
        """Parse common date formats and return ``YYYY-MM-DD``.

        Supported inputs (case-insensitive, leading/trailing space ignored):
        ``MAY 22``, ``MAY 22, 2024``, ``MAY 22 2024``, ``YYYY-MM-DD``,
        ``MM/DD/YYYY`` and ``MM/DD/YY``.  A month/day without a year uses the
        current year.

        Returns:
            The ISO date string, or ``None`` when the text is not recognised
            or does not name a real calendar date.
        """
        try:
            import re
            from datetime import datetime

            text = date_str.strip().upper()

            # "MAY 22", "JUN 01", "MAY 22, 2024", "MAY 22 2024"
            month_pattern = (
                r'(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)'
                r'\s+(\d{1,2})(?!\d)(?:,?\s*(\d{4})\b)?'
            )
            match = re.match(month_pattern, text)
            if match:
                month_abbr, day, year = match.groups()
                month_map = {
                    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
                    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
                }
                resolved_year = int(year) if year else datetime.now().year
                # datetime() rejects impossible dates such as FEB 31.
                return datetime(
                    resolved_year, month_map[month_abbr], int(day)
                ).strftime('%Y-%m-%d')

            # YYYY-MM-DD (validated; trailing text is dropped)
            match = re.match(r'\d{4}-\d{2}-\d{2}', text)
            if match:
                return datetime.strptime(match.group(), '%Y-%m-%d').strftime('%Y-%m-%d')

            # MM/DD/YYYY
            match = re.match(r'(\d{1,2}/\d{1,2}/\d{4})(?!\d)', text)
            if match:
                return datetime.strptime(match.group(1), '%m/%d/%Y').strftime('%Y-%m-%d')

            # MM/DD/YY
            match = re.match(r'(\d{1,2}/\d{1,2}/\d{2})(?!\d)', text)
            if match:
                return datetime.strptime(match.group(1), '%m/%d/%y').strftime('%Y-%m-%d')

            return None
        except Exception:
            return None
-157
View File
@@ -1,157 +0,0 @@
import os
from datetime import datetime, timedelta
from typing import Any, Dict, List
class GoogleDriveSync:
def __init__(self):
self.service = None
self.processed_files = set()
def authenticate(self):
    """Authenticate with the Google Drive API and build ``self.service``.

    Credentials are loaded from ``token.json`` when it exists.  If they are
    missing or invalid they are refreshed (when a refresh token is available)
    or obtained through the local-server OAuth flow using
    ``credentials.json``, and then written back to ``token.json``.

    Returns:
        True when the Drive service was built, False on any failure (the
        error is printed).
    """
    try:
        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow
        from googleapiclient.discovery import build

        SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]

        # Start from "no credentials" so a missing token.json leads to the
        # login flow instead of an AttributeError on an unset attribute.
        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)

        # If no valid credentials available, let user log in
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                if not os.path.exists("credentials.json"):
                    raise Exception(
                        "credentials.json not found. Please download from Google Cloud Console."
                    )
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save credentials for next run
            with open("token.json", "w") as token:
                token.write(creds.to_json())

        self.creds = creds
        # Build the Drive service
        self.service = build("drive", "v3", credentials=creds)
        return True
    except Exception as e:
        print(f"Authentication error: {e}")
        return False
def list_folders(self) -> List[Dict[str, Any]]:
"""List all folders in Google Drive"""
if not self.service:
if not self.authenticate():
return []
try:
results = (
self.service.files()
.list(
q="mimeType='application/vnd.google-apps.folder'",
pageSize=100,
fields="nextPageToken, files(id, name, createdTime, modifiedTime)",
)
.execute()
)
return results.get("files", [])
except Exception as e:
print(f"Error listing folders: {e}")
return []
def get_folder_info(self, folder_id: str) -> Dict[str, Any]:
"""Get information about a Google Drive folder"""
if not self.service:
if not self.authenticate():
return {}
try:
folder = (
self.service.files()
.get(fileId=folder_id, fields="id, name, createdTime, modifiedTime")
.execute()
)
return folder
except Exception as e:
print(f"Error getting folder info: {e}")
return {}
async def process_drive_files(self, folder_id: str = None) -> List[Dict[str, Any]]:
"""Process all receipt files from Google Drive"""
if not self.service:
if not self.authenticate():
return []
results = []
try:
# File types to look for
file_types = [
"'application/pdf'",
"'image/jpeg'",
"'image/png'",
"'image/gif'",
"'image/bmp'",
]
mime_types = " or ".join(file_types)
# Build query
query = f"mimeType contains {mime_types}"
if folder_id:
query += f" and '{folder_id}' in parents"
# Add date filter (last 30 days)
thirty_days_ago = (datetime.now() - timedelta(days=30)).isoformat() + "Z"
query += f" and modifiedTime > '{thirty_days_ago}'"
results_files = (
self.service.files()
.list(
q=query,
pageSize=100,
fields="nextPageToken, files(id, name, mimeType, modifiedTime, size)",
)
.execute()
)
files = results_files.get("files", [])
files = [file for file in files if file["id"] not in self.processed_files]
# For demo purposes, return mock results
for file in files[:3]: # Process first 3 files
mock_result = {
"file_id": file["id"],
"filename": file["name"],
"drive_modified": file["modifiedTime"],
"file_size": file.get("size", 0),
"extraction_success": True,
"vendor": "Demo Vendor",
"description": "Coffee and sandwich",
"total_amount": 25.50,
"tax_amount": 2.04,
"date": "2024-01-15",
"category": "Food",
"confidence": 0.95,
}
results.append(mock_result)
self.processed_files.add(file["id"])
except Exception as e:
print(f"Error processing Drive files: {e}")
return results
-89
View File
@@ -1,89 +0,0 @@
from typing import Any, Dict, List
from ai_matcher import AIMatcher
from ai_rules import AIRulesEngine
from feedback_logger import FeedbackLogger
from models import Match, Receipt, Transaction
class MatchingEngine:
    """Glue between the AI matcher, the rules engine and the feedback log."""

    def __init__(self):
        self.ai_matcher = AIMatcher()
        self.rules_engine = AIRulesEngine()
        self.feedback_logger = FeedbackLogger()

    def process_matching(
        self, receipts: List[Receipt], transactions: List[Transaction]
    ) -> List[Match]:
        """Match receipts to transactions and post-process every match.

        The AI matcher proposes the matches; each one is then passed through
        ``_enhance_match_with_rules`` and returned in the same order.
        """
        proposed = self.ai_matcher.match_receipts_to_transactions(
            receipts, transactions
        )
        return [self._enhance_match_with_rules(candidate) for candidate in proposed]

    def _enhance_match_with_rules(self, match: Match) -> Match:
        """Adjust ``match`` in place from the rules engine output and return it."""
        outcome = self.rules_engine.apply_rules(match.receipt, match.transaction)

        # A positive boost raises the score, capped at 1.0.
        boost = outcome["confidence_boost"]
        if boost > 0:
            match.confidence_score = min(1.0, match.confidence_score + boost)

        # Auto-approval overrides the score and is noted in the reason text.
        if outcome["auto_approve"]:
            match.confidence_score = 1.0
            match.match_reason += " (Auto-approved by rules)"

        # Tax analysis is optional in the rules output.
        tax_analysis = outcome.get("tax_analysis")
        if tax_analysis:
            match.tax_analysis = tax_analysis

        return match

    def _log_decision(self, match: Match, correction: str, reason: str, user_id: str):
        """Record a user decision about ``match`` in the feedback log."""
        self.feedback_logger.log_override(
            transaction_id=match.transaction.id,
            original_match=f"AI Score: {match.confidence_score}",
            correction=correction,
            reason=reason,
            user_id=user_id,
        )

    def approve_match(self, match: Match, user_id: str):
        """Log that ``user_id`` approved ``match``."""
        self._log_decision(match, "Approved", "User approved match", user_id)

    def reject_match(self, match: Match, reason: str, user_id: str):
        """Log that ``user_id`` rejected ``match`` for ``reason``."""
        self._log_decision(match, "Rejected", reason, user_id)

    def get_matching_stats(self, matches: List[Match]) -> Dict[str, Any]:
        """Summarise confidence scores for ``matches``.

        Returns the match count, how many scores are at or above 0.8, how
        many are below 0.8, and the mean score rounded to three decimals.
        An empty input yields all zeros.
        """
        if not matches:
            return {
                "total": 0,
                "high_confidence": 0,
                "low_confidence": 0,
                "avg_score": 0,
            }

        scores = [m.confidence_score for m in matches]
        at_or_above = sum(1 for score in scores if score >= 0.8)
        below = sum(1 for score in scores if score < 0.8)
        mean_score = sum(scores) / len(matches)

        return {
            "total": len(matches),
            "high_confidence": at_or_above,
            "low_confidence": below,
            "avg_score": round(mean_score, 3),
        }
-59
View File
@@ -1,59 +0,0 @@
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
@dataclass
class Address:
    """Postal address attached to a receipt for tax calculations.

    Only ``country`` has a default; the other three fields are required.
    """

    province: str
    city: str
    postal_code: str
    # Defaults to "Canada" when the caller does not supply a country.
    country: str = "Canada"
@dataclass
class Receipt:
    """A receipt record together with the optional tax-rule attributes.

    The first nine fields are required, in declaration order; the tax-rule
    fields all have defaults.
    """

    # --- Core receipt data (required) ---
    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime
    amount: float  # NOTE(review): presumably expressed in `currency` - confirm
    tax: float
    vendor: str
    category: str
    description: str

    # --- Tax rule fields (optional) ---
    billing_address: Optional[Address] = None
    shipping_address: Optional[Address] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
@dataclass
class Transaction:
    """A transaction record that receipts are matched against.

    ``currency`` and ``fx_rate`` are optional tax-rule fields; everything
    else is required.
    """

    # --- Core transaction data (required) ---
    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str

    # --- Tax rule fields (optional) ---
    currency: str = "CAD"
    # No exchange rate is recorded unless the caller provides one.
    fx_rate: Optional[float] = None
@dataclass
class Asset:
    """An asset record used for depreciation calculations.

    Every field is required; none has a default.
    """

    id: str
    name: str

    # Purchase details
    purchase_date: datetime
    purchase_amount: float

    # Depreciation parameters
    useful_life_years: int
    residual_value: float
    cca_rate: float  # Capital Cost Allowance rate
    asset_class: str
@dataclass
class Match:
    """A proposed pairing of one receipt with one transaction."""

    receipt: Receipt
    transaction: Transaction
    # NOTE(review): appears to be on a 0-1 scale - confirm against the matcher.
    confidence_score: float
    # Human-readable explanation of why the pair was matched.
    match_reason: str
    # Optional tax analysis payload; absent unless explicitly set.
    tax_analysis: Optional[dict] = None
+3 -1
View File
@@ -13,4 +13,6 @@ aiofiles
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client
google-api-python-client
sqlalchemy
pydantic-settings