Compare commits
24 Commits
main
..
version_two
| Author | SHA1 | Date | |
|---|---|---|---|
| f6535908fe | |||
| 8d745c1f8e | |||
| 2b83ffe00c | |||
| 85fafae311 | |||
| fa25f7bafd | |||
| 2f917ec085 | |||
| 7296d09319 | |||
| 01aa2efa43 | |||
| c8da3c61ca | |||
| 3559cbe19d | |||
| 2e020437a8 | |||
| f582110674 | |||
| 5116fb5efb | |||
| b2bf631448 | |||
| 659ca4ff15 | |||
| d8315f13ac | |||
| 823c05f78d | |||
| c2a7c5a087 | |||
| e3f610e01a | |||
| 7c412bcf9e | |||
| ae200bd30f | |||
| c45e3fa791 | |||
| c78c4c6fe9 | |||
| 3d48cf0385 |
+10
-225
@@ -1,229 +1,14 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be added to the global gitignore or merged into this project gitignore. For a PyCharm
|
||||
# project, it is recommended to include the following files:
|
||||
# .idea/
|
||||
# *.iml
|
||||
# *.ipr
|
||||
# *.iws
|
||||
|
||||
# VS Code
|
||||
.vscode/
|
||||
|
||||
# macOS
|
||||
.DS_Store
|
||||
.AppleDouble
|
||||
.LSOverride
|
||||
|
||||
# Windows
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
Desktop.ini
|
||||
|
||||
# Linux
|
||||
*~
|
||||
|
||||
# Temporary files
|
||||
*.tmp
|
||||
*.temp
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# Log files
|
||||
*.log
|
||||
|
||||
# Python bytecode and database files
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.pyd
|
||||
*.db
|
||||
*.sqlite
|
||||
*.sqlite3
|
||||
|
||||
# Configuration files with sensitive data
|
||||
config.ini
|
||||
secrets.json
|
||||
.env.local
|
||||
.env.production
|
||||
|
||||
# Test files
|
||||
test_*.py
|
||||
*_test.py
|
||||
tests/
|
||||
|
||||
# Documentation
|
||||
docs/
|
||||
*.md
|
||||
!README.md
|
||||
|
||||
# IDE files
|
||||
.idea/
|
||||
.vscode/
|
||||
*.sublime-*
|
||||
.atom/
|
||||
|
||||
# OS generated files
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
uploads/
|
||||
chequing statement.csv
|
||||
test_images/
|
||||
.cursorrules.md
|
||||
.env
|
||||
*.log
|
||||
/uploads
|
||||
server_manager.sh
|
||||
server.log
|
||||
server.pid
|
||||
@@ -1,262 +0,0 @@
|
||||
# AI Bookkeeper - Data Science Engine
|
||||
|
||||
AI-powered receipt-to-transaction matching engine using Groq LLM. This is a **Data Science Engine** that provides intelligent matching capabilities for backend applications.
|
||||
|
||||
## 🎯 Purpose
|
||||
|
||||
This Data Science Engine receives QuickBooks transaction data from backend applications and provides:
|
||||
- **AI-powered receipt processing** (OCR and data extraction)
|
||||
- **Intelligent receipt-transaction matching** with confidence scores
|
||||
- **Configurable AI rules** for business logic
|
||||
- **Feedback logging** for continuous improvement
|
||||
- **RESTful API** for easy integration
|
||||
|
||||
## 🚀 Quick Start
|
||||
|
||||
### 1. Install Dependencies
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 2. Configure API Keys
|
||||
Create a `.env` file in the project root with your Groq API key:
|
||||
|
||||
```bash
|
||||
# Create .env file
|
||||
echo "GROQ_API_KEY=your_actual_groq_api_key_here" > .env
|
||||
```
|
||||
|
||||
**Important**: Get your API key from [Groq Console](https://console.groq.com/)
|
||||
|
||||
### 3. Start the Server
|
||||
```bash
|
||||
# Option 1: Using the main script
|
||||
python main.py
|
||||
|
||||
# Option 2: Using uvicorn directly
|
||||
uvicorn main:app --host 0.0.0.0 --port 8343 --reload
|
||||
```
|
||||
|
||||
### 4. Access API Documentation
|
||||
- **Swagger UI**: http://localhost:8343/docs
|
||||
- **ReDoc**: http://localhost:8343/redoc
|
||||
|
||||
## 📋 API Endpoints
|
||||
|
||||
### Transaction Import
|
||||
- `POST /transactions/import/csv` - Import transactions from CSV file
|
||||
- `POST /transactions/import/image` - Import transactions from image/PDF
|
||||
|
||||
### Receipt Processing
|
||||
- `POST /upload-multiple` - Upload multiple receipt documents
|
||||
- `POST /process/{file_id}` - Extract data from uploaded documents
|
||||
|
||||
### AI Matching Engine
|
||||
- `POST /match-specific` - Match specific receipts to transactions using AI
|
||||
|
||||
### AI Rules Management
|
||||
- `POST /rules` - Add new AI rules
|
||||
- `GET /rules` - List all active rules
|
||||
- `DELETE /rules/{rule_name}` - Delete rules
|
||||
|
||||
### System Monitoring
|
||||
- `GET /stats` - Get system statistics and performance metrics
|
||||
- `GET /` - Health check endpoint
|
||||
|
||||
## 🔧 Core Components
|
||||
|
||||
### **AIMatcher** (`ai_matcher.py`)
|
||||
- Uses Groq LLM to compare receipts and transactions
|
||||
- Provides confidence scores and reasoning
|
||||
- Configurable matching criteria (amount, date, vendor)
|
||||
- Rate limiting to prevent API quota exhaustion
|
||||
|
||||
### **AIRulesEngine** (`ai_rules.py`)
|
||||
- Applies business rules for auto-approval and categorization
|
||||
- Configurable rule conditions and actions
|
||||
- Supports system and user-generated rules
|
||||
- Safe condition evaluation with proper error handling
|
||||
|
||||
### **DocumentProcessor** (`document_processor.py`)
|
||||
- AI-powered receipt data extraction using Groq vision model
|
||||
- Supports PDF and image formats
|
||||
- Robust JSON parsing with error handling
|
||||
- Extracts vendor, amount, date, tax, and category information
|
||||
|
||||
### **MatchingEngine** (`matching_engine.py`)
|
||||
- Main orchestrator combining all components
|
||||
- Handles the complete matching workflow
|
||||
- Provides statistics and feedback logging
|
||||
- Configurable confidence thresholds
|
||||
|
||||
### **FeedbackLogger** (`feedback_logger.py`)
|
||||
- Tracks manual overrides for AI training
|
||||
- Maintains audit trail of user decisions
|
||||
- Enables continuous model improvement
|
||||
|
||||
## 📊 Configuration
|
||||
|
||||
Edit `config.py` to adjust:
|
||||
- **Confidence threshold** (default: 0.3)
|
||||
- **Date tolerance days** (default: 7)
|
||||
- **Amount tolerance percent** (default: 5%)
|
||||
- **Groq API key** (from environment variable)
|
||||
|
||||
## 🔄 Integration Workflow
|
||||
|
||||
### 1. Import Transactions
|
||||
```bash
|
||||
# Import from CSV
|
||||
curl -X POST -F "file=@transactions.csv" http://localhost:8343/transactions/import/csv
|
||||
|
||||
# Import from image
|
||||
curl -X POST -F "file=@statement.jpg" http://localhost:8343/transactions/import/image
|
||||
```
|
||||
|
||||
### 2. Upload and Process Receipts
|
||||
```bash
|
||||
# Upload receipts
|
||||
curl -X POST -F "files=@receipt1.jpg" -F "files=@receipt2.jpg" http://localhost:8343/upload-multiple
|
||||
|
||||
# Process a specific receipt
|
||||
curl -X POST http://localhost:8343/process/{file_id}
|
||||
```
|
||||
|
||||
### 3. AI Matching
|
||||
```bash
|
||||
# Match specific receipts
|
||||
curl -X POST -H "Content-Type: application/json" \
|
||||
-d '["file_id_1", "file_id_2"]' \
|
||||
http://localhost:8343/match-specific
|
||||
```
|
||||
|
||||
### 4. Check Results
|
||||
```bash
|
||||
# Get system stats
|
||||
curl http://localhost:8343/stats
|
||||
|
||||
# View AI rules
|
||||
curl http://localhost:8343/rules
|
||||
```
|
||||
|
||||
## 🎯 Key Features
|
||||
|
||||
- **AI-powered matching** with confidence scores
|
||||
- **Rule-based auto-approval** and categorization
|
||||
- **Feedback logging** for continuous improvement
|
||||
- **Configurable matching parameters**
|
||||
- **RESTful JSON API** for easy backend integration
|
||||
- **Comprehensive error handling**
|
||||
- **Rate limiting** to prevent API quota exhaustion
|
||||
- **Robust JSON parsing** for AI responses
|
||||
|
||||
## 📝 Data Formats
|
||||
|
||||
### Transaction Input (CSV)
|
||||
```csv
|
||||
Date,Description,Amount,Category
|
||||
2024-01-15,Starbucks Coffee,12.50,Food & Dining
|
||||
2024-01-16,Office Supplies,45.99,Office
|
||||
```
|
||||
|
||||
### Receipt Processing Output
|
||||
```json
|
||||
{
|
||||
"vendor": "Starbucks",
|
||||
"total_amount": 12.50,
|
||||
"tax_amount": 1.25,
|
||||
"date": "2024-01-15",
|
||||
"category": "Food & Dining",
|
||||
"confidence": 0.95,
|
||||
"extraction_success": true
|
||||
}
|
||||
```
|
||||
|
||||
### Match Result Output
|
||||
```json
|
||||
{
|
||||
"receipt_id": "uuid",
|
||||
"transaction_id": "transaction_123",
|
||||
"confidence_score": 0.95,
|
||||
"match_reason": "Same vendor, minor date difference (Auto-approved by rules)",
|
||||
"receipt_vendor": "Starbucks",
|
||||
"receipt_amount": 12.50,
|
||||
"transaction_vendor": "STARBUCKS",
|
||||
"transaction_amount": 12.50
|
||||
}
|
||||
```
|
||||
|
||||
## 🔍 AI Matching Criteria
|
||||
|
||||
The engine uses multiple criteria for matching:
|
||||
|
||||
1. **Amount Similarity** - Compares receipt and transaction amounts (5% tolerance)
|
||||
2. **Date Proximity** - Checks date closeness (7-day tolerance)
|
||||
3. **Vendor Matching** - AI-powered vendor name comparison using Groq LLM
|
||||
4. **Rule-based Auto-approval** - Automatic approval for exact matches and high-confidence matches
|
||||
|
||||
## 🛠️ Development
|
||||
|
||||
### Project Structure
|
||||
```
|
||||
├── main.py # FastAPI application entry point
|
||||
├── ai_matcher.py # AI-powered matching logic
|
||||
├── ai_rules.py # Business rules engine
|
||||
├── document_processor.py # Receipt data extraction
|
||||
├── matching_engine.py # Main matching orchestrator
|
||||
├── feedback_logger.py # User feedback tracking
|
||||
├── models.py # Pydantic data models
|
||||
├── api_models.py # API request/response models
|
||||
├── config.py # Configuration settings
|
||||
├── requirements.txt # Python dependencies
|
||||
└── test_images/ # Test image files
|
||||
```
|
||||
|
||||
### Running Tests
|
||||
```bash
|
||||
# Test the server
|
||||
curl http://localhost:8343/
|
||||
|
||||
# Test stats endpoint
|
||||
curl http://localhost:8343/stats
|
||||
|
||||
# Test rules endpoint
|
||||
curl http://localhost:8343/rules
|
||||
```
|
||||
|
||||
## 🚀 Production Deployment
|
||||
|
||||
For production deployment:
|
||||
- Replace in-memory storage with a database (PostgreSQL recommended)
|
||||
- Configure proper authentication and authorization
|
||||
- Set up monitoring and logging (ELK stack recommended)
|
||||
- Use environment variables for all configuration
|
||||
- Implement proper error handling and retries
|
||||
- Set up rate limiting and API quotas
|
||||
- Configure CORS for frontend integration
|
||||
- Use HTTPS in production
|
||||
|
||||
## 📞 Support
|
||||
|
||||
This Data Science Engine is designed to be integrated with backend applications that handle:
|
||||
- QuickBooks API connections
|
||||
- User interface and workflows
|
||||
- Data persistence and management
|
||||
- External integrations
|
||||
|
||||
The engine focuses purely on AI/ML capabilities and provides a clean JSON API for backend integration.
|
||||
|
||||
## 🔧 Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **API Key Error**: Ensure `GROQ_API_KEY` is set in your `.env` file
|
||||
2. **Port Already in Use**: Kill existing process with `pkill -f "python main.py"`
|
||||
3. **Import Errors**: Install dependencies with `pip install -r requirements.txt`
|
||||
4. **Rate Limiting**: The system includes built-in rate limiting to prevent API quota exhaustion
|
||||
|
||||
### Logs
|
||||
Check the application logs for detailed error information:
|
||||
```bash
|
||||
tail -f app.log
|
||||
```
|
||||
-142
@@ -1,142 +0,0 @@
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class AddressRequest(BaseModel):
    """Address payload: province, city, postal code and country."""

    province: str
    city: str
    postal_code: str
    country: str = "Canada"  # used when the caller omits the country
|
||||
|
||||
|
||||
class ReceiptRequest(BaseModel):
    """Receipt payload: file metadata, amounts, vendor and tax-rule fields."""

    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime
    amount: float
    tax: float
    vendor: str
    category: str
    description: str
    # Tax rule fields
    billing_address: Optional[AddressRequest] = None
    shipping_address: Optional[AddressRequest] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
|
||||
|
||||
|
||||
class TransactionRequest(BaseModel):
    """Transaction payload: date, amount, vendor, notes and currency data."""

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str
    # Tax rule fields
    currency: str = "CAD"
    fx_rate: Optional[float] = None
|
||||
|
||||
|
||||
class AssetRequest(BaseModel):
    """Asset payload carrying the inputs listed for depreciation requests."""

    id: str
    name: str
    purchase_date: datetime
    purchase_amount: float
    useful_life_years: int
    residual_value: float
    cca_rate: float
    asset_class: str
|
||||
|
||||
|
||||
class MatchingRequest(BaseModel):
    """Identifiers of the receipts and transactions named in a matching request."""

    receipt_ids: List[str]
    transaction_ids: List[str]
|
||||
|
||||
|
||||
class MatchResponse(BaseModel):
    """One receipt/transaction pairing with its confidence score and reason."""

    receipt_id: str
    transaction_id: str
    confidence_score: float
    match_reason: str
    receipt_vendor: str
    receipt_amount: float
    receipt_description: str
    receipt_category: str
    receipt_tax_amount: float
    transaction_vendor: str
    transaction_amount: float
|
||||
|
||||
|
||||
class MatchingResponse(BaseModel):
    """List of match results together with a free-form ``stats`` dictionary."""

    matches: List[MatchResponse]
    stats: dict
|
||||
|
||||
|
||||
class ApprovalRequest(BaseModel):
    """Approve/reject decision for a match, with an optional reason."""

    match_id: str
    approved: bool
    reason: Optional[str] = None
|
||||
|
||||
|
||||
class RuleRequest(BaseModel):
    """Rule definition: a name, a condition string and an action string."""

    name: str
    condition: str
    action: str
    source: str = "user"  # origin label; defaults to "user"
|
||||
|
||||
|
||||
class DocumentUploadResponse(BaseModel):
    """Metadata describing an uploaded document."""

    file_id: str
    filename: str
    upload_date: datetime
    status: str
|
||||
|
||||
|
||||
class DocumentProcessResponse(BaseModel):
    """Extraction result for a document.

    Only ``file_id`` and ``extraction_success`` are required; every
    extracted field and ``error`` default to ``None``.
    """

    file_id: str
    extraction_success: bool
    vendor: Optional[str] = None
    description: Optional[str] = None
    total_amount: Optional[float] = None
    tax_amount: Optional[float] = None
    date: Optional[str] = None
    category: Optional[str] = None
    confidence: Optional[float] = None
    error: Optional[str] = None
|
||||
|
||||
|
||||
# New tax-related models
|
||||
class TaxCalculationRequest(BaseModel):
    """Tax calculation input: a receipt id and an optional transaction id."""

    receipt_id: str
    transaction_id: Optional[str] = None
|
||||
|
||||
|
||||
class TaxCalculationResponse(BaseModel):
    """Tax calculation output for one receipt.

    ``sales_tax``, ``fx_analysis`` and ``meals_entertainment`` are untyped
    dictionaries; their key layout is not defined in this model.
    """

    receipt_id: str
    rules_applied: List[str]
    sales_tax: dict
    fx_analysis: Optional[dict] = None
    meals_entertainment: dict
|
||||
|
||||
|
||||
class DepreciationRequest(BaseModel):
    """Depreciation input: the asset, the year and the method name."""

    asset: AssetRequest
    year: int
    method: str  # "straight_line" or "cca"
|
||||
|
||||
|
||||
class DepreciationResponse(BaseModel):
    """Depreciation output for one asset and year, with a success flag."""

    asset_id: str
    year: int
    method: str
    depreciation: float
    book_value: float
    total_depreciation: Optional[float] = None
    success: bool
    error: Optional[str] = None
|
||||
|
||||
class MatchSpecificRequest(BaseModel):
    """File ids to match, plus a categorization id."""

    file_ids: List[str]
    categorization_id: str
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
from pydantic_settings import BaseSettings
|
||||
from typing import Optional
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings; values are also read from the ``.env`` file.

    ``GROQ_API_KEY`` is the only field without a default, so a value must
    be supplied for it when the settings object is created.
    """

    database_url: Optional[str] = None
    secret_key: Optional[str] = None
    api_key: Optional[str] = None
    model: str = "openai/gpt-oss-120b"
    GROQ_API_KEY: str

    class Config:
        env_file = ".env"


# Module-level instance, built once at import time.
settings = Settings()
|
||||
+139
@@ -0,0 +1,139 @@
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import Depends
|
||||
from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
# SQLite database file, addressed relative to the working directory.
SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db"

# check_same_thread=False is passed through to the sqlite3 driver so a
# connection may be used from a thread other than the one that created it.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)

# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
|
||||
def get_db():
    """Yield a new ``SessionLocal`` session and always close it afterwards.

    Written as a generator so it can be used as a dependency: the code
    after ``yield`` runs once the consumer is finished, including when
    the consumer raised.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
|
||||
|
||||
|
||||
# Annotated alias: a parameter typed ``db_dependency`` is resolved through
# ``Depends(get_db)`` and receives the yielded Session.
db_dependency = Annotated[Session, Depends(get_db)]

# Declarative base class that the ORM models below inherit from.
Base = declarative_base()
|
||||
|
||||
|
||||
def create_db_tables():
    """Create any missing tables declared on ``Base``, logging failures.

    Existing tables are logged for information only. Creation is always
    attempted with ``checkfirst=True``, which skips tables that are
    already present, so models added after the database was first
    created still get their tables.

    Errors other than ``KeyboardInterrupt`` are logged with a traceback
    and swallowed so that start-up continues (best effort).
    """
    import logging

    from sqlalchemy import inspect

    logger = logging.getLogger(__name__)

    try:
        existing_tables = inspect(engine).get_table_names()
        if existing_tables:
            logger.info(f"Database tables already exist: {existing_tables}")

        # No early return here: create_all(checkfirst=True) only emits DDL
        # for tables that do not exist yet.
        logger.info("Creating database tables...")
        Base.metadata.create_all(bind=engine, checkfirst=True)
        logger.info("Database tables created successfully")

    except KeyboardInterrupt:
        logger.warning("Database creation interrupted by user")
        raise
    except Exception as e:
        # Deliberately not re-raised; logger.exception keeps the traceback.
        logger.exception(f"Error creating database tables: {e}")
|
||||
|
||||
|
||||
def clear_all_data():
    """Clear all data from the database (useful for testing).

    Deletes every row of the transactions, receipts and uploaded-files
    tables in one session and commits once; the session is closed even
    if a delete or the commit raises.
    """
    db = SessionLocal()
    try:
        db.query(DBTransaction).delete()
        db.query(DBReceipt).delete()
        db.query(DBUploadedFile).delete()
        db.commit()
    finally:
        db.close()
|
||||
|
||||
|
||||
# Transactions table
|
||||
class DBTransaction(Base):
    """ORM model for the ``transactions`` table."""

    __tablename__ = "transactions"

    id = Column(Integer, primary_key=True, index=True)
    transaction_id = Column(String, index=True)  # indexed, not declared unique
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    categorisation_id = Column(String, nullable=True)
    user_id = Column(String, nullable=True)
    source = Column(String, nullable=True)  # e.g., "csv", "image", "manual", "api"

    # Additional QuickBooks CSV columns (all stored as nullable strings)
    TxnId = Column(String, nullable=True)
    AccountType = Column(String, nullable=True)
    AccountNumber = Column(String, nullable=True)
    TransactionDate = Column(String, nullable=True)
    TransactionType = Column(String, nullable=True)
    ChequeNumber = Column(String, nullable=True)
    Description1 = Column(String, nullable=True)
    Description2 = Column(String, nullable=True)
    VendorId = Column(String, nullable=True)
    VendorName = Column(String, nullable=True)
    AccountId = Column(String, nullable=True)
    AccountName = Column(String, nullable=True)
|
||||
|
||||
|
||||
# Uploaded Files table
|
||||
class DBUploadedFile(Base):
    """ORM model for the ``uploaded_files`` table."""

    __tablename__ = "uploaded_files"

    id = Column(Integer, primary_key=True, index=True)
    file_id = Column(String, unique=True, index=True)
    filename = Column(String, nullable=False)
    file_path = Column(String, nullable=False)
    file_type = Column(String, nullable=False)
    upload_date = Column(DateTime, nullable=False)
    status = Column(String, nullable=False, default="uploaded")
|
||||
|
||||
|
||||
# Receipts table
|
||||
class DBReceipt(Base):
    """ORM model for the ``receipts`` table.

    ``receipt_id`` and ``file_id`` are both declared unique.
    """

    __tablename__ = "receipts"

    id = Column(Integer, primary_key=True, index=True)
    receipt_id = Column(String, unique=True, index=True)
    file_id = Column(String, unique=True, index=True)
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    confidence = Column(Float, nullable=True)
    # NOTE(review): a String column, unlike the bool field of the same name on
    # DocumentProcessResponse; confirm how callers convert between the two.
    extraction_success = Column(String, nullable=True)
    error_message = Column(String, nullable=True)
    receipt_currency = Column(String, nullable=True)
    receipt_location = Column(String, nullable=True)
    calculated_tax = Column(Float, nullable=True)
    is_depreciable = Column(String, nullable=True)  # Store as string "True"/"False"
    name_of_asset = Column(String, nullable=True)  # Name/description of the asset
    cca_rate = Column(Float, nullable=True)
    useful_life = Column(Integer, nullable=True)
    residual_value = Column(Float, nullable=True)
|
||||
+456
-397
File diff suppressed because it is too large
Load Diff
+351
@@ -0,0 +1,351 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
@dataclass
class Address:
    """Address information for tax calculations"""

    province: str
    city: str
    postal_code: str
    country: str = "Canada"  # used when no country is given
|
||||
|
||||
|
||||
@dataclass
class Receipt:
    """Receipt record: file metadata, amounts, vendor and tax-rule fields."""

    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime
    amount: float
    tax: float
    vendor: str
    category: str
    description: str
    # Tax rule fields
    billing_address: Optional[Address] = None
    shipping_address: Optional[Address] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
|
||||
|
||||
|
||||
@dataclass
class Transaction:
    """Transaction record with optional QuickBooks CSV fields.

    The first five fields are required; every other field has a default.
    """

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str
    # Tax rule fields
    currency: str = "CAD"
    fx_rate: Optional[float] = None
    source: Optional[str] = None  # e.g., "csv", "image", "manual", "api"

    # QuickBooks CSV fields
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class Asset:
    """Asset for depreciation calculations"""

    id: str
    name: str
    purchase_date: datetime
    purchase_amount: float
    useful_life_years: int
    residual_value: float
    cca_rate: float  # Capital Cost Allowance rate
    asset_class: str
|
||||
|
||||
|
||||
@dataclass
class Match:
    """Pairing of a receipt and a transaction with a score and reason."""

    receipt: Receipt
    transaction: Transaction
    confidence_score: float
    match_reason: str
    tax_analysis: Optional[dict] = None
|
||||
|
||||
|
||||
class AddressRequest(BaseModel):
    """Address payload: province, city, postal code and country."""

    province: str
    city: str
    postal_code: str
    country: str = "Canada"  # used when the caller omits the country
|
||||
|
||||
|
||||
class ReceiptRequest(BaseModel):
    """Receipt payload: file metadata, amounts, vendor and tax-rule fields."""

    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime
    amount: float
    tax: float
    vendor: str
    category: str
    description: str
    # Tax rule fields
    billing_address: Optional[AddressRequest] = None
    shipping_address: Optional[AddressRequest] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
|
||||
|
||||
|
||||
class TransactionRequest(BaseModel):
    """Transaction payload with optional QuickBooks CSV fields.

    The first five fields are required; every other field has a default.
    """

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str
    # Tax rule fields
    currency: str = "CAD"
    fx_rate: Optional[float] = None
    source: Optional[str] = None  # e.g., "csv", "image", "manual", "api"

    # QuickBooks CSV fields
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
|
||||
|
||||
|
||||
class AssetRequest(BaseModel):
    """Asset payload carrying the inputs listed for depreciation requests."""

    id: str
    name: str
    purchase_date: datetime
    purchase_amount: float
    useful_life_years: int
    residual_value: float
    cca_rate: float
    asset_class: str
|
||||
|
||||
|
||||
class MatchingRequest(BaseModel):
    """Identifiers of the receipts and transactions named in a matching request."""

    receipt_ids: List[str]
    transaction_ids: List[str]
|
||||
|
||||
|
||||
class MatchResponse(BaseModel):
    """One receipt/transaction pairing with score, reason and metadata.

    The receipt and transaction summary fields are required; the tax
    analysis, review flags and QuickBooks fields default to ``None``.
    """

    receipt_id: str
    transaction_id: str
    confidence_score: float
    match_reason: str
    receipt_vendor: str
    receipt_amount: float
    receipt_description: str
    receipt_category: str
    receipt_tax_amount: float
    transaction_vendor: str
    transaction_amount: float
    tax_analysis: Optional[dict] = None
    flag_for_review: Optional[bool] = None
    auto_approve: Optional[bool] = None

    # Transaction metadata
    transaction_source: Optional[str] = None  # Source of the transaction

    # QuickBooks CSV fields from transaction
    TxnId: Optional[str] = None
    AccountType: Optional[str] = None
    AccountNumber: Optional[str] = None
    TransactionDate: Optional[str] = None
    TransactionType: Optional[str] = None
    ChequeNumber: Optional[str] = None
    Description1: Optional[str] = None
    Description2: Optional[str] = None
    VendorId: Optional[str] = None
    VendorName: Optional[str] = None
    AccountId: Optional[str] = None
    AccountName: Optional[str] = None
    # NOTE(review): separate from ``transaction_source`` above; confirm
    # whether both are populated by callers.
    Source: Optional[str] = None
|
||||
|
||||
|
||||
class MatchingResponse(BaseModel):
    """List of match results together with a free-form ``stats`` dictionary."""

    matches: List[MatchResponse]
    stats: dict
|
||||
|
||||
|
||||
class ApprovalRequest(BaseModel):
    """Approve/reject decision for a match, with an optional reason."""

    match_id: str
    approved: bool
    reason: Optional[str] = None
|
||||
|
||||
|
||||
class RuleRequest(BaseModel):
    """Rule definition: a name, a condition string and an action string."""

    name: str
    condition: str
    action: str
    source: str = "user"  # origin label; defaults to "user"
|
||||
|
||||
|
||||
class DocumentUploadResponse(BaseModel):
    """Metadata describing an uploaded document, including its file type."""

    file_id: str
    filename: str
    file_type: str
    upload_date: datetime
    status: str
|
||||
|
||||
|
||||
class AIRules(BaseModel):
    """A condition/action pair, both given as strings."""

    condition: str
    action: str
|
||||
|
||||
|
||||
class DocumentProcessRequest(BaseModel):
    """Optional inputs for processing a document; every field may be omitted."""

    file_id: Optional[str] = None
    # Format: "State/Province, Country" (e.g., "Ontario, Canada")
    user_location: Optional[str] = None
    ai_rules: Optional[List[AIRules]] = None
|
||||
|
||||
|
||||
class DocumentProcessResponse(BaseModel):
    """Extraction result for a document, including tax and depreciation data.

    ``file_id``, ``receipt_id`` and ``extraction_success`` are required;
    all remaining fields are optional.
    """

    file_id: str
    receipt_id: str
    extraction_success: bool
    vendor: Optional[str] = None
    description: Optional[str] = None
    total_amount: Optional[float] = None
    tax_amount: Optional[float] = None
    date: Optional[str] = None
    category: Optional[str] = None
    confidence: Optional[float] = None
    error: Optional[str] = None
    receipt_currency: Optional[str] = "CAD"
    # Location from receipt (e.g., "Ontario, Canada" or "California, USA")
    receipt_location: Optional[str] = None
    calculated_tax: Optional[float] = None  # Calculated sales tax if not clearly shown
    is_depreciable: Optional[bool] = None  # Whether item is a depreciable asset
    name_of_asset: Optional[str] = None  # Name/description of the asset if depreciable
    # CCA rate for tax depreciation (e.g., 0.30 for 30%)
    cca_rate: Optional[float] = None
    # Useful life in years for straight-line depreciation
    useful_life: Optional[int] = None
    # Residual value for straight-line depreciation
    residual_value: Optional[float] = None
|
||||
|
||||
|
||||
# New tax-related models
|
||||
class TaxCalculationRequest(BaseModel):
    """Request payload for a tax calculation on one receipt.

    ``receipt_id`` is required; ``transaction_id`` may be omitted.
    """

    receipt_id: str
    transaction_id: Optional[str] = None
|
||||
|
||||
|
||||
class TaxCalculationResponse(BaseModel):
    """Response payload carrying the outcome of a tax calculation.

    The ``dict`` fields are untyped here; their key schema is defined by
    whatever produces them and is not visible in this model.
    """

    receipt_id: str
    rules_applied: List[str]
    sales_tax: dict
    # Only this analysis section is optional.
    fx_analysis: Optional[dict] = None
    meals_entertainment: dict
|
||||
|
||||
|
||||
class DepreciationRequest(BaseModel):
    """Request payload for a depreciation calculation on one asset."""

    asset: AssetRequest
    year: int
    # NOTE(review): the expected values are only documented in the comment;
    # the model itself accepts any string.
    method: str  # "straight_line" or "cca"
|
||||
|
||||
|
||||
class DepreciationResponse(BaseModel):
    """Response payload carrying the result of a depreciation calculation.

    ``total_depreciation`` and ``error`` are optional; all other fields are
    required.
    """

    asset_id: str
    year: int
    method: str
    depreciation: float
    book_value: float
    total_depreciation: Optional[float] = None
    success: bool
    error: Optional[str] = None
|
||||
|
||||
|
||||
class CityInfo(BaseModel):
    """City information from user tax info"""

    id: int
    name: str
    state_id: int
    state_code: str
    country_id: int
    country_code: str
    # Coordinates are carried as optional strings, not floats.
    latitude: Optional[str] = None
    longitude: Optional[str] = None
|
||||
|
||||
|
||||
class StateInfo(BaseModel):
    """State/Province information from user tax info"""

    # All fields are required.
    id: int
    name: str
    country_id: int
    country_code: str
    state_code: str
|
||||
|
||||
|
||||
class CountryInfo(BaseModel):
    """Country information from user tax info"""

    # Required fields.
    id: int
    name: str
    iso3: str
    iso2: str
    phone_code: str
    capital: str
    currency: str
    # Optional descriptive fields; each defaults to None.
    native: Optional[str] = None
    region: Optional[str] = None
    subregion: Optional[str] = None
    emoji: Optional[str] = None
    emojiU: Optional[str] = None
|
||||
|
||||
|
||||
class UserTaxInfo(BaseModel):
    """User tax information for location-based tax calculations"""

    id: int
    user_id: int
    company_name: str
    # Optional text fields default to an empty string rather than None.
    tax_id: Optional[str] = ""
    tax_id_type: Optional[str] = "EIN"
    address_line_1: Optional[str] = ""
    address_line_2: Optional[str] = ""
    # Nested location records; all three are required.
    city: CityInfo
    state: StateInfo
    zip_postal_code: Optional[str] = ""
    country: CountryInfo
    # NOTE(review): presumably a 0/1 flag stored as an int - confirm with the
    # producer of this payload.
    include_on_invoices: Optional[int] = 1
    # Timestamps are carried as strings; their format is not defined here.
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
|
||||
|
||||
|
||||
class MatchSpecificRequest(BaseModel):
    """Request payload for matching a specific set of files.

    ``file_ids`` and ``categorization_id`` are required; the remaining
    fields are optional.
    """

    file_ids: List[str]
    categorization_id: str
    user_location: Optional[str] = "Canada"  # Kept for backward compatibility
    user_tax_info: Optional[UserTaxInfo] = None
    ai_rules: Optional[List[AIRules]] = None
|
||||
@@ -4,8 +4,8 @@ from typing import List, Tuple
|
||||
|
||||
import groq
|
||||
|
||||
import config
|
||||
from models import Match, Receipt, Transaction
|
||||
from config import settings
|
||||
from schemas import Match, Receipt, Transaction
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
@@ -14,8 +14,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
class AIMatcher:
|
||||
def __init__(self, use_batch_matching=True):
|
||||
self.client = groq.Groq(api_key=config.GROQ_API_KEY)
|
||||
self.model = "llama3-8b-8192"
|
||||
self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
|
||||
self.model = settings.model
|
||||
self.max_retries = 3
|
||||
self.retry_delay = 2 # seconds - increased for rate limiting
|
||||
self.rate_limit_delay = 1.0 # seconds between API calls
|
||||
@@ -116,7 +116,7 @@ class AIMatcher:
|
||||
for i, transaction in enumerate(candidates):
|
||||
transaction_amount_abs = abs(transaction.amount)
|
||||
date_diff = abs((receipt.receipt_date - transaction.transaction_date).days)
|
||||
amount_diff = abs(receipt.amount - transaction_amount_abs)
|
||||
amount_diff = abs(receipt.amount - transaction_amount_abs - receipt.tax)
|
||||
amount_percent_diff = (
|
||||
(amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
|
||||
)
|
||||
@@ -127,11 +127,12 @@ Candidate {i + 1}:
|
||||
- Amount: ${transaction.amount} (absolute: ${transaction_amount_abs})
|
||||
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")} ({date_diff} days difference)
|
||||
- Notes: {transaction.notes}
|
||||
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%)
|
||||
- Amount difference: ${amount_diff} ({amount_percent_diff:.1f}%) Taking in account receipt tax
|
||||
"""
|
||||
logger.info(f"\nThis is the receipt: {receipt}\n")
|
||||
logger.info(f"\nCandidate text: {candidates_text}\n")
|
||||
|
||||
prompt = f"""
|
||||
You are an expert at matching receipts to bank transactions. Analyze the receipt below against ALL the candidate transactions and return the BEST match.
|
||||
prompt = f"""You are an expert at matching receipts to bank transactions. Your PRIMARY goal is to find the candidate with the CLOSEST AMOUNT match.
|
||||
|
||||
RECEIPT TO MATCH:
|
||||
- Vendor: {receipt.vendor}
|
||||
@@ -143,25 +144,52 @@ RECEIPT TO MATCH:
|
||||
CANDIDATE TRANSACTIONS:
|
||||
{candidates_text}
|
||||
|
||||
SCORING CRITERIA:
|
||||
- Perfect matches (same vendor, amount, date): 0.95-1.0
|
||||
- High confidence (minor differences): 0.8-0.94
|
||||
- Medium confidence (moderate differences): 0.6-0.79
|
||||
- Low confidence (significant differences): 0.4-0.59
|
||||
- Very low confidence (major differences): 0.2-0.39
|
||||
- Minimal similarity: 0.1-0.19
|
||||
- No meaningful similarity: 0.0-0.09
|
||||
CRITICAL INSTRUCTIONS FOR SELECTION:
|
||||
1. FIRST: Find the candidate(s) with the SMALLEST amount percentage difference
|
||||
2. ONLY if multiple candidates have similar amounts (within 2% of each other), THEN consider vendor/date/notes
|
||||
3. USE THE PERCENTAGE DIFFERENCE PROVIDED for each candidate - DO NOT calculate yourself
|
||||
4. IGNORE vendor/description matches if amounts are far apart (>20% difference)
|
||||
5. The candidate with the closest amount is almost always the correct match
|
||||
|
||||
Consider vendor name similarity, amount accuracy, date proximity, and description/notes relevance.
|
||||
SCORING CRITERIA - AMOUNT DIFFERENCE IS 90% OF THE DECISION:
|
||||
|
||||
IMPORTANT: You MUST return the candidate with the highest match score, even if it's very low. Never return NONE.
|
||||
Return ONLY the best match in this exact format:
|
||||
CANDIDATE_NUMBER|CONFIDENCE_SCORE|REASON
|
||||
Step 1: Calculate BASE SCORE using the provided amount percentage difference:
|
||||
- 0-1% difference: Base score = 0.95
|
||||
- 1-2% difference: Base score = 0.90
|
||||
- 2-3% difference: Base score = 0.85
|
||||
- 3-5% difference: Base score = 0.75
|
||||
- 5-7% difference: Base score = 0.65
|
||||
- 7-10% difference: Base score = 0.55
|
||||
- 10-15% difference: Base score = 0.40
|
||||
- 15-20% difference: Base score = 0.25
|
||||
- 20-30% difference: Base score = 0.15
|
||||
- 30-50% difference: Base score = 0.08
|
||||
- 50-100% difference: Base score = 0.03
|
||||
- >100% difference: Base score = 0.01
|
||||
|
||||
Example: 3|0.87|Same vendor name, exact amount match, 1 day apart
|
||||
Example of low match: 5|0.15|Best available option despite significant differences in vendor and amount
|
||||
"""
|
||||
Step 2: ADJUST the base score (±0.10 maximum):
|
||||
- Vendor exact match: +0.10
|
||||
- Vendor similar/partial match: +0.05
|
||||
- Date within 7 days: +0.05
|
||||
- Date within 30 days: +0.02
|
||||
- Description/notes keywords match: +0.02
|
||||
- Vendor completely different: -0.05
|
||||
- Date >90 days apart: -0.03
|
||||
|
||||
Step 3: Ensure final score is between 0.0 and 1.0
|
||||
|
||||
|
||||
CRITICAL: You MUST return valid JSON only. No explanations, no text before or after.
|
||||
|
||||
Return format:
|
||||
{{"candidate_number": 1, "confidence_score": 0.65, "reason": "5.8% amount difference with similar vendor"}}
|
||||
|
||||
Another example:
|
||||
{{"candidate_number": 2, "confidence_score": 0.01, "reason": "9850% amount difference, extremely poor match"}}
|
||||
|
||||
Return ONLY JSON for the best candidate:"""
|
||||
|
||||
# logger.info(f"This is the prompt: {prompt}")
|
||||
for attempt in range(self.max_retries):
|
||||
try:
|
||||
result = self._call_groq_api_with_timeout(
|
||||
@@ -179,6 +207,22 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
|
||||
if 0 <= candidate_num < len(candidates):
|
||||
best_transaction = candidates[candidate_num]
|
||||
|
||||
# Validate the match - catch AI errors with extreme amount differences
|
||||
transaction_amount_abs = abs(best_transaction.amount)
|
||||
amount_diff = abs(receipt.amount - transaction_amount_abs)
|
||||
amount_percent_diff = (
|
||||
(amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
|
||||
)
|
||||
|
||||
# If amount difference is >100%, force very low score
|
||||
if amount_percent_diff > 100:
|
||||
logger.warning(
|
||||
f"Overriding AI score for extreme mismatch: {receipt.amount} vs {transaction_amount_abs} ({amount_percent_diff:.1f}% diff)"
|
||||
)
|
||||
score = min(0.05, score) # Cap at 0.05 for extreme mismatches
|
||||
reason = f"{amount_percent_diff:.1f}% amount difference, extreme mismatch"
|
||||
|
||||
logger.info(
|
||||
f"AI selected candidate {candidate_num + 1}: {best_transaction.vendor} (score: {score:.3f})"
|
||||
)
|
||||
@@ -204,55 +248,93 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
return None
|
||||
|
||||
def _parse_single_match_response(self, result: str) -> Tuple[int, float, str]:
|
||||
"""Parse AI response for single best match"""
|
||||
"""Parse AI response for single best match (JSON format)"""
|
||||
import json
|
||||
import re
|
||||
|
||||
result = result.strip()
|
||||
logger.debug(f"Parsing single match response: {result}")
|
||||
|
||||
try:
|
||||
if result.upper().startswith("NONE"):
|
||||
# This should not happen with new prompt, but handle as parsing error
|
||||
logger.warning(
|
||||
"AI returned NONE despite being instructed to always return best match"
|
||||
)
|
||||
return -1, 0.0, "AI returned NONE unexpectedly"
|
||||
# First, try to parse the entire result as JSON
|
||||
try:
|
||||
data = json.loads(result)
|
||||
candidate_num = int(data.get("candidate_number", -1)) - 1
|
||||
score = float(data.get("confidence_score", 0.0))
|
||||
reason = str(data.get("reason", "No reason provided"))
|
||||
score = max(0.0, min(1.0, score))
|
||||
logger.debug(f"Parsed JSON: candidate={candidate_num}, score={score}, reason={reason}")
|
||||
return candidate_num, score, reason
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Try to extract JSON object from the response using improved regex
|
||||
# This handles nested braces better
|
||||
json_pattern = r'\{[^{}]*"candidate_number"[^{}]*"confidence_score"[^{}]*"reason"[^{}]*\}'
|
||||
json_match = re.search(json_pattern, result)
|
||||
|
||||
if json_match:
|
||||
json_str = json_match.group()
|
||||
data = json.loads(json_str)
|
||||
candidate_num = int(data.get("candidate_number", -1)) - 1
|
||||
score = float(data.get("confidence_score", 0.0))
|
||||
reason = str(data.get("reason", "No reason provided"))
|
||||
score = max(0.0, min(1.0, score))
|
||||
logger.debug(f"Parsed extracted JSON: candidate={candidate_num}, score={score}, reason={reason}")
|
||||
return candidate_num, score, reason
|
||||
|
||||
# Try to find any JSON-like structure with the required fields
|
||||
candidate_match = re.search(r'"candidate_number"\s*:\s*(\d+)', result)
|
||||
score_match = re.search(r'"confidence_score"\s*:\s*([\d.]+)', result)
|
||||
reason_match = re.search(r'"reason"\s*:\s*"([^"]*)"', result)
|
||||
|
||||
if candidate_match and score_match and reason_match:
|
||||
candidate_num = int(candidate_match.group(1)) - 1
|
||||
score = float(score_match.group(1))
|
||||
reason = reason_match.group(1)
|
||||
score = max(0.0, min(1.0, score))
|
||||
logger.debug(f"Parsed fields individually: candidate={candidate_num}, score={score}, reason={reason}")
|
||||
return candidate_num, score, reason
|
||||
|
||||
if "|" in result:
|
||||
parts = result.split("|")
|
||||
if len(parts) >= 3:
|
||||
candidate_str = parts[0].strip()
|
||||
score_str = parts[1].strip()
|
||||
reason = "|".join(parts[2:]).strip()
|
||||
except (json.JSONDecodeError, ValueError, KeyError) as e:
|
||||
logger.warning(f"Error parsing JSON response: {e}")
|
||||
|
||||
# Fallback to old pipe-delimited format for backwards compatibility
|
||||
try:
|
||||
if "|" in result:
|
||||
parts = result.split("|")
|
||||
if len(parts) >= 3:
|
||||
candidate_str = parts[0].strip()
|
||||
score_str = parts[1].strip()
|
||||
reason = "|".join(parts[2:]).strip()
|
||||
|
||||
# Extract candidate number
|
||||
import re
|
||||
# Extract candidate number
|
||||
candidate_match = re.search(r"\d+", candidate_str)
|
||||
if candidate_match:
|
||||
candidate_num = (
|
||||
int(candidate_match.group()) - 1
|
||||
) # Convert to 0-based index
|
||||
else:
|
||||
raise ValueError("No candidate number found")
|
||||
|
||||
candidate_match = re.search(r"\d+", candidate_str)
|
||||
if candidate_match:
|
||||
candidate_num = (
|
||||
int(candidate_match.group()) - 1
|
||||
) # Convert to 0-based index
|
||||
else:
|
||||
raise ValueError("No candidate number found")
|
||||
# Extract score
|
||||
score_clean = "".join(
|
||||
c for c in score_str if c.isdigit() or c == "."
|
||||
)
|
||||
score = float(score_clean) if score_clean else 0.0
|
||||
|
||||
# Extract score
|
||||
score_clean = "".join(
|
||||
c for c in score_str if c.isdigit() or c == "."
|
||||
)
|
||||
score = float(score_clean) if score_clean else 0.0
|
||||
# Ensure score is in valid range
|
||||
score = max(0.0, min(1.0, score))
|
||||
|
||||
# Ensure score is in valid range
|
||||
score = max(0.0, min(1.0, score))
|
||||
logger.debug(
|
||||
f"Parsed (fallback): candidate={candidate_num}, score={score}, reason={reason}"
|
||||
)
|
||||
return candidate_num, score, reason
|
||||
except Exception as fallback_error:
|
||||
logger.warning(f"Fallback parsing also failed: {fallback_error}")
|
||||
|
||||
logger.debug(
|
||||
f"Parsed: candidate={candidate_num}, score={score}, reason={reason}"
|
||||
)
|
||||
return candidate_num, score, reason
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Error parsing single match response: {e}")
|
||||
|
||||
# Fallback
|
||||
logger.warning(f"Could not parse single match response: {result}")
|
||||
# Final fallback
|
||||
# logger.warning(f"Could not parse single match response: {result}")
|
||||
return -1, 0.0, f"Parse error: {result[:50]}..."
|
||||
|
||||
def _filter_candidates(
|
||||
@@ -260,18 +342,29 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
) -> List[Transaction]:
|
||||
"""Filter transactions to create a reasonable candidate list"""
|
||||
candidates = []
|
||||
amount_threshold = receipt.amount * 2.0 # 200% threshold - very inclusive
|
||||
|
||||
|
||||
for transaction in transactions:
|
||||
# Use absolute value for transaction amount comparison
|
||||
transaction_amount_abs = abs(transaction.amount)
|
||||
|
||||
# Only exclude transactions with obviously different amounts
|
||||
if abs(receipt.amount - transaction_amount_abs) <= amount_threshold:
|
||||
amount_diff = abs(receipt.amount - transaction_amount_abs)
|
||||
|
||||
# Calculate percentage difference
|
||||
if receipt.amount > 0:
|
||||
percent_diff = (amount_diff / receipt.amount) * 100
|
||||
else:
|
||||
percent_diff = 0
|
||||
|
||||
# Be more restrictive: exclude transactions with >300% difference
|
||||
# This prevents extreme mismatches while still being generous
|
||||
if percent_diff <= 300:
|
||||
candidates.append(transaction)
|
||||
else:
|
||||
logger.debug(
|
||||
f"Filtered out transaction ${transaction_amount_abs} for receipt ${receipt.amount} ({percent_diff:.1f}% difference)"
|
||||
)
|
||||
|
||||
logger.debug(
|
||||
f"Filtered {len(transactions)} transactions to {len(candidates)} candidates"
|
||||
f"Filtered {len(transactions)} transactions to {len(candidates)} candidates for receipt ${receipt.amount}"
|
||||
)
|
||||
return candidates
|
||||
|
||||
@@ -338,6 +431,10 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
|
||||
Consider description and category similarity in your scoring.
|
||||
|
||||
THINGS TO NOTE:
|
||||
The most important factor to consider is the Amount for both the transaction and the receipt, the closer the amounts, the higher the score.
|
||||
If the amounts are different or not close return a low score (0-0.1) based on other factors.
|
||||
|
||||
IMPORTANT: Return ONLY the score and reason separated by a pipe character.
|
||||
Format: [score]|[reason]
|
||||
Example: 0.85|Same vendor, same amount, 2 days apart
|
||||
@@ -352,8 +449,8 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
# Parse the result - handle multiple formats
|
||||
score, reason = self._parse_ai_response(result)
|
||||
|
||||
logger.debug(f"AI Response: {result}")
|
||||
logger.debug(f"Parsed: score={score}, reason={reason}")
|
||||
# logger.debug(f"AI Response: {result}")
|
||||
# logger.debug(f"Parsed: score={score}, reason={reason}")
|
||||
|
||||
return score, reason
|
||||
|
||||
@@ -451,9 +548,12 @@ Example of low match: 5|0.15|Best available option despite significant differenc
|
||||
try:
|
||||
response = self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
max_tokens=200,
|
||||
temperature=0.1,
|
||||
messages=[
|
||||
{"role": "system", "content": "You are a JSON-only response assistant. Return only valid JSON, no explanations."},
|
||||
{"role": "user", "content": prompt}
|
||||
],
|
||||
max_tokens=150,
|
||||
temperature=0,
|
||||
)
|
||||
return response.choices[0].message.content.strip()
|
||||
except Exception as e:
|
||||
@@ -1,8 +1,8 @@
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from models import Receipt, Transaction
|
||||
from tax_rules_engine import TaxRulesEngine
|
||||
from schemas import Receipt, Transaction
|
||||
from services.tax_rules_engine import TaxRulesEngine
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -0,0 +1,273 @@
|
||||
import json
import logging
from typing import Any, Dict, List, Optional

import groq
from config import settings
from schemas import Match
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AIRulesMatcher:
    """
    AI-powered rules engine for post-matching evaluation.

    Uses LLM to intelligently apply custom rules and determine if matches should be:
    - Flagged for manual review (flag_for_review=True)
    - Auto-approved (auto_approve=True)

    Every failure path (LLM call error, unparsable reply, unexpected
    exception while updating a match) fails safe by flagging the match for
    manual review and never auto-approving it.
    """

    def __init__(self):
        """Create the Groq client and pick the model from application settings."""
        self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
        self.model = settings.model

    def apply_rules_to_matches(
        self, matches: List[Match], ai_rules: Optional[List[Dict]] = None
    ) -> List[Match]:
        """
        Apply AI rules to all matches and add flag_for_review and auto_approve fields.

        Args:
            matches: List of Match objects from the matching engine
            ai_rules: Optional list of custom rules (format: [{"condition": str, "action": str}])

        Returns:
            The same list, with each match's tax_analysis updated in place to
            contain flag_for_review, auto_approve, rules_applied and
            rule_reasons.
        """
        if not matches:
            return matches

        # Copy into a list so any sequence type is accepted and the caller's
        # object is never concatenated or mutated directly.
        custom_rules = list(ai_rules) if ai_rules else []

        logger.info(
            "Applying AI rules to %d matches with %d custom rules",
            len(matches),
            len(custom_rules),
        )

        # Built-in rule: currency mismatch should always flag for review
        builtin_rules = [
            {
                "condition": "receipt currency differs from transaction currency",
                "action": "flag_for_review",
            }
        ]

        # Combine built-in rules with user-provided rules
        all_rules = builtin_rules + custom_rules

        for match in matches:
            try:
                rule_evaluation = self._evaluate_rules_for_match(match, all_rules)

                # Initialize or update tax_analysis
                if match.tax_analysis is None:
                    match.tax_analysis = {}

                flagged = rule_evaluation["flag_for_review"]
                approved = rule_evaluation["auto_approve"]

                match.tax_analysis["flag_for_review"] = flagged
                match.tax_analysis["auto_approve"] = approved
                match.tax_analysis["rules_applied"] = rule_evaluation["rules_applied"]
                match.tax_analysis["rule_reasons"] = rule_evaluation["reasons"]

                # Update match reason with rule information; tolerate a
                # missing reason instead of failing the whole evaluation.
                if flagged:
                    match.match_reason = (
                        match.match_reason or ""
                    ) + " | 🚩 FLAGGED FOR REVIEW"
                if approved:
                    match.match_reason = (
                        match.match_reason or ""
                    ) + " | ✅ AUTO-APPROVED"

                logger.info(
                    "Match %s → %s: flag_for_review=%s, auto_approve=%s",
                    match.receipt.id,
                    match.transaction.id,
                    flagged,
                    approved,
                )

            except Exception as e:
                logger.exception("Error applying rules to match: %s", e)
                # Fail safe: flag for review if rule processing fails
                if match.tax_analysis is None:
                    match.tax_analysis = {}
                match.tax_analysis["flag_for_review"] = True
                match.tax_analysis["auto_approve"] = False
                # Keep the key set identical to the success path.
                match.tax_analysis.setdefault("rules_applied", [])
                match.tax_analysis["rule_reasons"] = [
                    f"Rule evaluation error: {str(e)}"
                ]

        return matches

    def _evaluate_rules_for_match(
        self, match: Match, rules: List[Dict]
    ) -> Dict[str, Any]:
        """
        Use LLM to evaluate all rules for a single match.

        Returns:
            {
                "flag_for_review": bool,
                "auto_approve": bool,
                "rules_applied": List[str],
                "reasons": List[str]
            }

        Never raises: any error while building the prompt, calling the LLM
        or parsing its reply yields a result with flag_for_review=True.
        """
        try:
            # Build context about the match
            match_context = self._build_match_context(match)

            # Build rules context
            rules_context = self._build_rules_context(rules)

            # Create prompt for LLM
            prompt = f"""You are a financial matching rules engine. Analyze the following receipt-to-transaction match and apply the specified rules.

MATCH DETAILS:
{match_context}

RULES TO APPLY:
{rules_context}

INSTRUCTIONS:
1. Evaluate each rule's condition against the match details
2. If a rule's condition is TRUE, apply the action:
- If action is "flag_for_review" or "review" → set flag_for_review = true
- If action is "auto_approve" or "approve" → set auto_approve = true
- For other actions, determine if they imply review or approval
3. If BOTH flag_for_review and auto_approve are triggered, flag_for_review takes priority
4. If NO rules match, set both to false (default behavior)

IMPORTANT BUILT-IN RULE:
- If receipt currency differs from transaction currency → ALWAYS set flag_for_review = true

Return ONLY a valid JSON object with this exact format:
{{
"flag_for_review": boolean,
"auto_approve": boolean,
"rules_applied": ["list of rule conditions that matched"],
"reasons": ["list of reasons for the decisions"]
}}
"""

            # Call LLM
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a financial rules evaluation assistant. You analyze transaction matches and apply business rules. Always respond with valid JSON only.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=500,
            )

            result_text = response.choices[0].message.content.strip()

            # Parse JSON response
            result = self._parse_llm_response(result_text)

            # Validate and enforce constraints: review always wins over approval.
            if result["flag_for_review"] and result["auto_approve"]:
                logger.warning(
                    "Both flag_for_review and auto_approve were true, prioritizing flag_for_review"
                )
                result["auto_approve"] = False
                result["reasons"].append(
                    "Conflicting rules: prioritized manual review over auto-approval"
                )

            return result

        except Exception as e:
            logger.error("LLM evaluation failed: %s", e)
            # Fail safe: flag for review
            return {
                "flag_for_review": True,
                "auto_approve": False,
                "rules_applied": [],
                "reasons": [f"Error evaluating rules: {str(e)}"],
            }

    def _build_match_context(self, match: Match) -> str:
        """Build a text description of the match for the LLM"""
        receipt = match.receipt
        transaction = match.transaction

        context = f"""Receipt Information:
- ID: {receipt.id}
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Tax: ${receipt.tax:.2f}
- Category: {receipt.category}
- Description: {receipt.description}
- Date: {receipt.receipt_date}
- Currency: {receipt.currency}

Transaction Information:
- ID: {transaction.id}
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Date: {transaction.transaction_date}
- Notes: {transaction.notes}
- Currency: {transaction.currency}

Match Quality:
- Confidence Score: {match.confidence_score:.2%}
- Match Reason: {match.match_reason}
"""

        # Add tax analysis if available. default=str is deliberate: this text
        # only feeds the LLM prompt, so a non-JSON value (e.g. a date) should
        # be rendered as text rather than abort the evaluation.
        if match.tax_analysis:
            context += (
                f"\nTax Analysis:\n"
                f"{json.dumps(match.tax_analysis, indent=2, default=str)}"
            )

        return context

    @staticmethod
    def _rule_field(rule: Any, field: str) -> str:
        """Read ``field`` from a rule given as a dict or as an attribute-style object."""
        if isinstance(rule, dict):
            return rule.get(field, "")
        return getattr(rule, field, "")

    def _build_rules_context(self, rules: List[Dict]) -> str:
        """Build a formatted list of rules for the LLM.

        Each rule becomes one numbered "IF condition → THEN action" line.
        Rules may be dicts or objects exposing ``condition`` / ``action``
        attributes; a missing field is rendered as an empty string.
        """
        if not rules:
            return "No custom rules provided. Apply default evaluation."

        return "".join(
            f"{idx}. IF {self._rule_field(rule, 'condition')} "
            f"→ THEN {self._rule_field(rule, 'action')}\n"
            for idx, rule in enumerate(rules, 1)
        )

    @staticmethod
    def _coerce_bool(value: Any) -> bool:
        """Interpret an LLM-provided flag, treating strings like "false" as False."""
        if isinstance(value, str):
            # bool("false") is True, so strings need explicit interpretation.
            return value.strip().lower() in {"true", "yes", "1"}
        return bool(value)

    @staticmethod
    def _coerce_list(value: Any) -> List[Any]:
        """Return ``value`` as a list so callers can always append to it."""
        if isinstance(value, list):
            return value
        if value is None:
            return []
        return [value]

    def _parse_llm_response(self, response_text: str) -> Dict[str, Any]:
        """Parse and validate LLM JSON response.

        Strips an optional markdown code fence, decodes the JSON and
        normalises the result so that both flags are real booleans and both
        ``rules_applied`` and ``reasons`` are lists. If the reply is not
        valid JSON, or is JSON but not an object, the fail-safe result
        (flag_for_review=True) is returned instead.
        """
        # Remove markdown code blocks if present
        if "```json" in response_text:
            response_text = response_text.split("```json")[1].split("```")[0]
        elif "```" in response_text:
            response_text = response_text.split("```")[1].split("```")[0]

        try:
            result = json.loads(response_text.strip())
        except json.JSONDecodeError as e:
            logger.error("Failed to parse LLM response as JSON: %s", e)
            logger.error("Response text: %s", response_text)
            result = None
        else:
            if not isinstance(result, dict):
                logger.error(
                    "LLM response is not a JSON object: %s", response_text
                )
                result = None

        if result is None:
            # Return safe defaults
            return {
                "flag_for_review": True,  # Fail safe to manual review
                "auto_approve": False,
                "rules_applied": [],
                "reasons": ["Failed to parse LLM response"],
            }

        # Fill in missing fields, then normalise types.
        result["flag_for_review"] = self._coerce_bool(
            result.get("flag_for_review", False)
        )
        result["auto_approve"] = self._coerce_bool(result.get("auto_approve", False))
        result["rules_applied"] = self._coerce_list(result.get("rules_applied"))
        result["reasons"] = self._coerce_list(result.get("reasons"))

        return result
|
||||
@@ -0,0 +1,859 @@
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict
|
||||
|
||||
import aiofiles
|
||||
import groq
|
||||
import PyPDF2
|
||||
from config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentProcessor:
|
||||
def __init__(self):
    """Create the Groq client and select the model used for extraction."""
    # API key comes from the application settings object.
    self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
    # The model identifier is hard-coded here rather than read from settings.
    self.model = "meta-llama/llama-4-scout-17b-16e-instruct"  # Vision model
|
||||
|
||||
def _extract_first_json(self, raw: str) -> dict:
|
||||
"""Extract the first valid JSON object from raw LLM output.
|
||||
|
||||
Handles cases where LLM returns extra text after/before the JSON.
|
||||
"""
|
||||
try:
|
||||
# First try direct parsing (fastest path)
|
||||
return json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Find the first '{' and match closing '}'
|
||||
start = raw.find("{")
|
||||
if start == -1:
|
||||
raise ValueError("No JSON object found in LLM output")
|
||||
|
||||
depth = 0
|
||||
end = -1
|
||||
in_string = False
|
||||
escape_next = False
|
||||
|
||||
for i in range(start, len(raw)):
|
||||
ch = raw[i]
|
||||
|
||||
# Handle string escaping
|
||||
if escape_next:
|
||||
escape_next = False
|
||||
continue
|
||||
if ch == "\\":
|
||||
escape_next = True
|
||||
continue
|
||||
|
||||
# Track if we're inside a string
|
||||
if ch == '"':
|
||||
in_string = not in_string
|
||||
continue
|
||||
|
||||
# Only count braces outside of strings
|
||||
if not in_string:
|
||||
if ch == "{":
|
||||
depth += 1
|
||||
elif ch == "}":
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
end = i + 1
|
||||
break
|
||||
|
||||
if end == -1:
|
||||
raise ValueError("Unbalanced JSON braces in LLM output")
|
||||
|
||||
json_str = raw[start:end]
|
||||
return json.loads(json_str)
|
||||
|
||||
async def process_file(
    self,
    file_path: str,
    file_type: str,
    user_location: str = None,
    ai_rules: list = None,
) -> Dict[str, Any]:
    """Process uploaded file and extract receipt data

    Dispatches to the image or PDF extractor based on ``file_type``
    (matched case-insensitively). Never raises: any failure is logged and
    reported as ``{"error": <message>}``.

    Args:
        file_path: Path to the file to process
        file_type: Type of file (jpg, pdf, etc.)
        user_location: User's location string in format "State/Province, Country" (e.g., "Ontario, Canada")
        ai_rules: List of AI rules for categorization (e.g., [{"condition": "vendor is Starbucks", "action": "Food"}])

    Returns:
        The extractor's result dict, or a dict with a single ``"error"`` key.
    """
    try:
        kind = file_type.lower()
        if kind in ("jpg", "jpeg", "png", "gif", "bmp"):
            return await self._process_image(file_path, user_location, ai_rules)
        if kind == "pdf":
            return await self._process_pdf(file_path, user_location, ai_rules)
    except Exception as e:
        # Keep the best-effort contract, but record the failure instead of
        # silently discarding the traceback.
        logger.exception("Failed to process %s file %s", file_type, file_path)
        return {"error": str(e)}

    # An unsupported type is an expected outcome, so it is reported directly
    # rather than raised and caught.
    return {"error": f"Unsupported file type: {file_type}"}
|
||||
|
||||
async def _process_image(
    self, image_path: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Extract receipt data from an image using the Groq vision model.

    Builds a prompt (optional categorization rules, the JSON schema, and the
    extraction/tax/depreciation instructions), sends it together with the
    base64-encoded image, and hands the reply to
    ``_parse_extraction_result``.

    Args:
        image_path: Path to the image file.
        user_location: Optional user location in the format
            "State/Province, Country" (e.g., "Ontario, Canada").
        ai_rules: Optional list of rule dicts. Only rules whose ``condition``
            matches ``CONTAINS "<keyword>"`` and whose ``action`` matches
            ``SET_CATEGORY: <category>`` are added to the prompt.

    Returns:
        The parsed extraction result, or
        ``{"error": "Image processing error: ..."}`` on any failure.
    """
    import asyncio
    import functools
    import mimetypes

    try:
        # Encode image to base64
        base64_image = self._encode_image(image_path)

        # Label the payload with the file's real type; png/gif/bmp uploads are
        # accepted upstream, so "image/jpeg" is only the fallback.
        mime_type = mimetypes.guess_type(image_path)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Build user location context
        user_location_context = ""
        if user_location:
            user_location_context = f"""

USER LOCATION CONTEXT:
The user is located in {user_location}.
- If the receipt location is MISSING or UNCLEAR, use the user's location ({user_location}) for tax calculations.
- If the receipt clearly shows a different location, use the receipt's location instead.
- Apply depreciation rules based on the user's location.
"""

        # Build AI rules context for categorization: one instruction per rule
        ai_rules_context = ""
        if ai_rules:
            ai_rules_context = "\n "
            for rule in ai_rules:
                # "or ''" guards against keys that are present but null.
                condition = rule.get("condition") or ""
                action = rule.get("action") or ""

                keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
                category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)

                if keyword_match and category_match:
                    keyword = keyword_match.group(1)
                    category = category_match.group(1).strip()
                    ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '

            ai_rules_context += "\n"

        # Create Groq vision prompt. The schema example must itself be valid
        # JSON (lowercase true), otherwise the model tends to echo an
        # unparseable Python literal.
        prompt = f"""
Analyze this receipt image and extract the following information in JSON format.
{ai_rules_context}
JSON Format:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
"calculated_tax": 0.00,
"is_depreciable": false,
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null,
"extraction_success": true
}}

EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
- Date should be the date on the receipt
- Confidence score 0-1 based on how clear the receipt is
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")

{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as "Province/State, Country" (e.g., "Ontario, Canada" or "California, USA")
- If location not shown on receipt, return null for location (system will use user location as fallback)

TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows "Tax: $0", "Tax: $0.00", "Tax (0%)", or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null

- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
* Store calculated tax in "calculated_tax" field and set tax_amount to the calculated value

DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
* name_of_asset: Specific name/model of the asset (e.g., "2024 Honda Accord", "Dell Laptop XPS 15", "Office Desk")
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null

Return only valid JSON.
"""
        logger.info("This is the prompt: %s", prompt)

        # The Groq client call is synchronous; run it in the default executor
        # so this coroutine does not stall the event loop while waiting.
        request = functools.partial(
            self.client.chat.completions.create,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=800,
            temperature=0.1,
        )
        response = await asyncio.get_running_loop().run_in_executor(None, request)

        # Parse response
        result_text = response.choices[0].message.content.strip()
        return self._parse_extraction_result(result_text)

    except Exception as e:
        return {"error": f"Image processing error: {str(e)}"}
|
||||
|
||||
def _encode_image(self, image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
||||
|
||||
async def _process_pdf(
    self, pdf_path: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Extract receipt data from a PDF using its embedded text.

    The PDF is not rendered to an image: its text is pulled out with
    ``_extract_text_from_pdf`` and sent through ``_process_text_content``.

    Args:
        pdf_path: Path to the PDF file.
        user_location: Optional user location in the format
            "State/Province, Country" (e.g., "Ontario, Canada").
        ai_rules: Optional list of AI rules for categorization.

    Returns:
        The parsed extraction result, or
        ``{"error": "PDF processing error: ..."}`` when the PDF yields no
        text or processing fails.
    """
    import asyncio

    try:
        # Both helpers are synchronous (file parsing and a network call), so
        # run them in the default executor to keep the event loop responsive.
        loop = asyncio.get_running_loop()
        text_content = await loop.run_in_executor(
            None, self._extract_text_from_pdf, pdf_path
        )

        # Without any text (scanned or unreadable PDF) the model would have
        # nothing to read and could only invent a receipt.
        if not text_content or not text_content.strip():
            return {"error": "PDF processing error: no extractable text found in PDF"}

        return await loop.run_in_executor(
            None, self._process_text_content, text_content, user_location, ai_rules
        )

    except Exception as e:
        return {"error": f"PDF processing error: {str(e)}"}
|
||||
|
||||
def _extract_text_from_pdf(self, pdf_path: str) -> str:
    """Return the text of every page of a PDF, one page per line block.

    Extraction is best effort: a page that yields no text, or whose
    extraction raises, contributes an empty entry instead of discarding the
    text of the other pages.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Each page's text followed by a newline, concatenated in page order;
        an empty string when the file cannot be opened or parsed at all.
    """
    try:
        with open(pdf_path, "rb") as file:
            pdf_reader = PyPDF2.PdfReader(file)
            page_texts = []
            for page_number, page in enumerate(pdf_reader.pages, 1):
                try:
                    # "or ''" tolerates extractors that report no text as None.
                    page_texts.append(page.extract_text() or "")
                except Exception:
                    logger.warning(
                        "Could not extract text from page %d of %s",
                        page_number,
                        pdf_path,
                        exc_info=True,
                    )
                    page_texts.append("")
            return "".join(f"{page_text}\n" for page_text in page_texts)
    except Exception:
        # Keep the "" contract for callers, but leave a trace of the cause.
        logger.exception("Failed to extract text from PDF %s", pdf_path)
        return ""
|
||||
|
||||
def _process_text_content(
    self, text_content: str, user_location: str = None, ai_rules: list = None
) -> Dict[str, Any]:
    """Extract receipt data from plain text using Groq (used for PDFs).

    Builds the same instructions as the image pipeline, embeds
    ``text_content`` in the prompt, and hands the model's reply to
    ``_parse_extraction_result``.

    Args:
        text_content: Text extracted from the PDF.
        user_location: Optional user location in the format
            "State/Province, Country" (e.g., "Ontario, Canada").
        ai_rules: Optional list of rule dicts. Only rules whose ``condition``
            matches ``CONTAINS "<keyword>"`` and whose ``action`` matches
            ``SET_CATEGORY: <category>`` are added to the prompt.

    Returns:
        The parsed extraction result, or
        ``{"error": "Text processing error: ..."}`` on any failure.
    """
    try:
        # Build user location context
        user_location_context = ""
        if user_location:
            user_location_context = f"""

USER LOCATION CONTEXT:
The user is located in {user_location}.
- If the receipt location is MISSING or UNCLEAR, use the user's location ({user_location}) for tax calculations.
- If the receipt clearly shows a different location, use the receipt's location instead.
- Apply depreciation rules based on the user's location.
"""

        # Build AI rules context for categorization: one instruction per rule
        ai_rules_context = ""
        if ai_rules:
            ai_rules_context = "\n "
            for rule in ai_rules:
                # "or ''" guards against keys that are present but null.
                condition = rule.get("condition") or ""
                action = rule.get("action") or ""

                keyword_match = re.search(r'CONTAINS\s+"([^"]+)"', condition, re.IGNORECASE)
                category_match = re.search(r'SET_CATEGORY:\s*(.+)', action, re.IGNORECASE)

                if keyword_match and category_match:
                    keyword = keyword_match.group(1)
                    category = category_match.group(1).strip()
                    ai_rules_context += f'If the Vendor name contains "{keyword}": Set category to "{category}"\n '

            ai_rules_context += "\n"

        # The schema example must itself be valid JSON (lowercase true),
        # otherwise the model tends to echo an unparseable Python literal.
        prompt = f"""
Analyze this receipt text and extract the following information in JSON format.
{ai_rules_context}
Receipt Text:
{text_content}

Extract:
{{
"vendor": "Store/company name",
"description": "Detailed description of items/services purchased",
"total_amount": 0.00,
"tax_amount": 0.00,
"date": "YYYY-MM-DD",
"category": "Check rules above first",
"confidence": 0.95,
"currency": "USD",
"location": "Province/State, Country",
"calculated_tax": 0.00,
"is_depreciable": false,
"name_of_asset": null,
"cca_rate": null,
"useful_life": null,
"residual_value": null,
"extraction_success": true
}}

EXTRACTION Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
- Total amount should be the final total including tax
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
- Date should be the date on the receipt
- Confidence score 0-1 based on clarity
- Currency should be the currency used on the receipt (e.g., "USD", "EUR", "CAD")
{user_location_context}
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as "Province/State, Country" (e.g., "Ontario, Canada" or "California, USA")
- If location not shown on receipt, return null for location (system will use user location as fallback)

TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows "Tax: $0", "Tax: $0.00", "Tax (0%)", or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null

- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
* Store calculated tax in "calculated_tax" field and set tax_amount to the calculated value

DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
* name_of_asset: Specific name/model of the asset (e.g., "2024 Honda Accord", "Dell Laptop XPS 15", "Office Desk")
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null

Return only valid JSON.
"""

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=800,
            temperature=0.1,
        )

        result_text = response.choices[0].message.content.strip()
        return self._parse_extraction_result(result_text)

    except Exception as e:
        return {"error": f"Text processing error: {str(e)}"}
|
||||
|
||||
def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse the Groq response and extract the receipt data.

    Strategies, in order:
      1. ``_extract_first_json`` - its result is returned unchanged.
      2. Greedy ``{...}`` match with trailing commas removed and bare keys
         quoted, then ``json.loads``.
      3. If that still fails, field-by-field regex extraction.
      4. If the response contains no braces at all,
         ``_extract_from_plain_text``.

    Results of strategies 2 and 3 are normalized: text fields are stripped,
    amounts become floats, and a missing or ``null`` value falls back to its
    default instead of raising or turning into the string "None".

    Args:
        result_text: Raw text returned by the model.

    Returns:
        The receipt dictionary, or
        ``{"error": ..., "extraction_success": False}`` if parsing raises.
    """

    def _as_text(value, default=""):
        # JSON null must not become the literal string "None".
        return default if value is None else str(value).strip()

    def _as_float(value, default):
        # Accept numbers and numeric strings such as "$1,234.50".
        if value is None:
            return default
        if isinstance(value, str):
            value = value.replace("$", "").replace(",", "").strip()
            if not value:
                return default
        return float(value)

    try:
        import re

        # Try robust JSON extraction first (handles extra text)
        try:
            return self._extract_first_json(result_text)
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning(f"Robust JSON extraction failed: {e}. Trying fallback methods...")

        # Fallback: take everything from the first "{" to the last "}"
        json_match = re.search(r"\{.*\}", result_text, re.DOTALL)
        if not json_match:
            logger.warning("No JSON found in response, attempting text extraction")
            return self._extract_from_plain_text(result_text)

        json_str = json_match.group()

        # Clean up common JSON issues
        json_str = re.sub(r",\s*([}\]])", r"\1", json_str)  # Remove trailing commas
        json_str = re.sub(
            r"([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:", r'\1"\2":', json_str
        )  # Quote unquoted keys

        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning(f"Fallback JSON parsing also failed: {e}")

            # Last resort: pull individual fields out with regexes.
            quoted = r'"([^"]*)"'
            number = r"([0-9.]+)"
            nullable_number = r"([0-9.]+|null)"

            def _grab(field, value_pattern):
                found = re.search(rf'"{field}"\s*:\s*{value_pattern}', json_str)
                return found.group(1) if found else None

            def _number(field, value_pattern, cast=float):
                raw = _grab(field, value_pattern)
                return None if raw is None or raw == "null" else cast(raw)

            depreciable = _grab("is_depreciable", r"(true|false)")
            data = {
                "vendor": _grab("vendor", quoted),
                "description": _grab("description", quoted),
                "total_amount": _number("total_amount", number),
                "tax_amount": _number("tax_amount", number),
                "date": _grab("date", quoted),
                "category": _grab("category", quoted),
                "confidence": _number("confidence", number),
                "currency": _grab("currency", quoted),
                "location": _grab("location", quoted),
                "calculated_tax": _number("calculated_tax", nullable_number),
                "is_depreciable": None if depreciable is None else depreciable == "true",
                "name_of_asset": _grab("name_of_asset", quoted),
                "cca_rate": _number("cca_rate", nullable_number),
                "useful_life": _number("useful_life", r"([0-9]+|null)", int),
                "residual_value": _number("residual_value", nullable_number),
            }

        # Validate and clean data
        return {
            "vendor": _as_text(data.get("vendor")),
            "description": _as_text(data.get("description")),
            "total_amount": _as_float(data.get("total_amount"), 0.0),
            "tax_amount": _as_float(data.get("tax_amount"), 0.0),
            "date": _as_text(data.get("date")),
            "category": _as_text(data.get("category"), "Other"),
            "confidence": _as_float(data.get("confidence"), 0.5),
            "extraction_success": True,
            "currency": _as_text(data.get("currency"), "CAD"),
            "location": data.get("location"),
            "calculated_tax": data.get("calculated_tax"),
            "is_depreciable": data.get("is_depreciable"),
            "name_of_asset": data.get("name_of_asset"),
            "cca_rate": data.get("cca_rate"),
            "useful_life": data.get("useful_life"),
            "residual_value": data.get("residual_value"),
        }

    except Exception as e:
        logger.error(f"JSON parsing error: {str(e)}")
        return {
            "error": f"JSON parsing error: {str(e)}",
            "extraction_success": False,
        }
|
||||
|
||||
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
|
||||
"""Extract receipt data from plain text when JSON parsing fails"""
|
||||
try:
|
||||
import re
|
||||
|
||||
# Extract vendor (look for common patterns)
|
||||
vendor_patterns = [
|
||||
r"(?:vendor|store|merchant|company)\s*[:\-]?\s*([A-Za-z0-9\s&.,]+)",
|
||||
r"([A-Z][A-Za-z0-9\s&.,]{3,30})", # Capitalized words
|
||||
]
|
||||
|
||||
vendor = ""
|
||||
for pattern in vendor_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
vendor = match.group(1).strip()
|
||||
break
|
||||
|
||||
# Extract amount (look for currency patterns)
|
||||
amount_patterns = [
|
||||
r"\$?\s*([0-9,]+\.?[0-9]*)",
|
||||
r"(?:total|amount|sum)\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]*)",
|
||||
]
|
||||
|
||||
total_amount = 0.0
|
||||
for pattern in amount_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
try:
|
||||
total_amount = float(match.group(1).replace(",", ""))
|
||||
break
|
||||
except ValueError:
|
||||
continue
|
||||
|
||||
# Extract date
|
||||
date_patterns = [
|
||||
r"(\d{4}-\d{2}-\d{2})",
|
||||
r"(\d{1,2}/\d{1,2}/\d{2,4})",
|
||||
r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}",
|
||||
]
|
||||
|
||||
date = ""
|
||||
for pattern in date_patterns:
|
||||
match = re.search(pattern, text, re.IGNORECASE)
|
||||
if match:
|
||||
date = match.group(0)
|
||||
break
|
||||
|
||||
return {
|
||||
"vendor": vendor or "Unknown",
|
||||
"total_amount": total_amount,
|
||||
"tax_amount": 0.0,
|
||||
"date": date or "",
|
||||
"category": "Other",
|
||||
"confidence": 0.3, # Low confidence for text extraction
|
||||
"extraction_success": True,
|
||||
"location": None,
|
||||
"calculated_tax": None,
|
||||
"is_depreciable": None,
|
||||
"name_of_asset": None,
|
||||
"cca_rate": None,
|
||||
"useful_life": None,
|
||||
"residual_value": None,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Text extraction error: {str(e)}")
|
||||
return {
|
||||
"vendor": "Unknown",
|
||||
"total_amount": 0.0,
|
||||
"tax_amount": 0.0,
|
||||
"date": "",
|
||||
"category": "Other",
|
||||
"confidence": 0.1,
|
||||
"extraction_success": False,
|
||||
"error": f"Text extraction failed: {str(e)}",
|
||||
"location": None,
|
||||
"calculated_tax": None,
|
||||
"is_depreciable": None,
|
||||
"name_of_asset": None,
|
||||
"cca_rate": None,
|
||||
"useful_life": None,
|
||||
"residual_value": None,
|
||||
}
|
||||
|
||||
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
    """Save an uploaded file into the local ``uploads`` directory.

    Args:
        file_content: Raw bytes of the upload.
        filename: Client-supplied file name. Only its final path component is
            used and spaces are replaced with underscores.

    Returns:
        Path of the written file: ``uploads/<YYYYMMDD_HHMMSS>_<name>``.

    Raises:
        Exception: If the directory cannot be created or the file cannot be
            written; the original error is attached as the cause.
    """
    try:
        # Create uploads directory if it doesn't exist
        upload_dir = "uploads"
        os.makedirs(upload_dir, exist_ok=True)

        # The name comes from the client: drop any directory part (either
        # separator style) so the file always lands directly in upload_dir.
        base_name = os.path.basename((filename or "").replace("\\", "/"))
        base_name = base_name.replace(" ", "_")
        if base_name in ("", ".", ".."):
            base_name = "upload"

        # NOTE(review): the timestamp has one-second resolution, so two
        # uploads with the same name in the same second share a path.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = f"{timestamp}_{base_name}"
        file_path = os.path.join(upload_dir, safe_filename)

        # Save file
        async with aiofiles.open(file_path, "wb") as f:
            await f.write(file_content)

        return file_path

    except Exception as e:
        raise Exception(f"Failed to save file: {str(e)}") from e
|
||||
|
||||
async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
    """Extract multiple transactions from an image of a financial document.

    Sends the image (bank statement, credit card statement, etc.) with an
    extraction prompt to the Groq vision model and hands the reply to
    ``_parse_transaction_extraction_result``.

    Args:
        image_path: Path to the image file.

    Returns:
        The parsed transaction result, or a dictionary with
        ``extraction_success`` False, an ``error`` message and an empty
        ``transactions`` list on any failure.
    """
    import asyncio
    import functools
    import mimetypes

    try:
        # Encode image to base64
        base64_image = self._encode_image(image_path)

        # Label the payload with the file's real type; "image/jpeg" is only
        # the fallback when the extension is unknown.
        mime_type = mimetypes.guess_type(image_path)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Create Groq vision prompt for transaction extraction
        prompt = """
Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.

Look for transaction lists, payment records, or any financial entries that show:
- Date
- Amount (positive or negative)
- Vendor/Description/Payee name
- Any additional notes or memo

Return the transactions as a JSON array:
{
"extraction_success": true,
"transactions": [
{
"date": "YYYY-MM-DD",
"amount": 0.00,
"vendor": "Vendor name",
"memo": "Additional notes"
},
{
"date": "YYYY-MM-DD",
"amount": -0.00,
"vendor": "Another vendor",
"memo": "Payment or charge description"
}
]
}

Rules:
- Extract ALL visible transactions
- Include both positive (credits) and negative (debits) amounts
- Use the actual date format from the document
- Vendor should be the merchant/payee name
- Memo can include transaction type, reference numbers, etc.
- If no transactions found, return empty array but set extraction_success to true

Return only valid JSON.
"""

        # The Groq client call is synchronous; run it in the default executor
        # so this coroutine does not stall the event loop while waiting.
        request = functools.partial(
            self.client.chat.completions.create,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=2000,  # Higher token limit for multiple transactions
            temperature=0.1,
        )
        response = await asyncio.get_running_loop().run_in_executor(None, request)

        # Parse response
        result_text = response.choices[0].message.content.strip()
        return self._parse_transaction_extraction_result(result_text)

    except Exception as e:
        return {
            "extraction_success": False,
            "error": f"Transaction extraction error: {str(e)}",
            "transactions": [],
        }
|
||||
|
||||
def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse the Groq response for transaction extraction.

    Accepts either a JSON object with a ``transactions`` list or a bare
    top-level JSON array of transactions, optionally surrounded by extra
    text. Trailing commas are removed before parsing. Entries that are not
    objects or whose amount is not numeric are dropped.

    Args:
        result_text: Raw text returned by the model.

    Returns:
        ``{"extraction_success", "transactions", "total_transactions"}`` on
        success, or ``{"extraction_success": False, "error", "transactions": []}``
        when no JSON can be found or parsed.
    """
    import json
    import re

    def _failure(message):
        return {
            "extraction_success": False,
            "error": message,
            "transactions": [],
        }

    def _text(value):
        # JSON null must not become the literal string "None".
        return "" if value is None else str(value).strip()

    try:
        # Candidate spans: first "{" .. last "}" and first "[" .. last "]",
        # tried in order of appearance so a top-level array is not truncated
        # to the braces of its elements.
        candidates = []
        for opener, closer in (("{", "}"), ("[", "]")):
            start = result_text.find(opener)
            end = result_text.rfind(closer)
            if start != -1 and end > start:
                candidates.append((start, end))
        if not candidates:
            return _failure("Could not find JSON object in AI response")
        candidates.sort()

        data = None
        parsed = False
        first_error = None
        for start, end in candidates:
            # Remove trailing commas before } or ]
            json_str = re.sub(r",\s*([}\]])", r"\1", result_text[start : end + 1])
            try:
                data = json.loads(json_str)
            except ValueError as e:
                if first_error is None:
                    first_error = (e, json_str)
                continue
            parsed = True
            break

        if not parsed:
            error, offending = first_error
            logger.error(f"JSON parsing error: {str(error)}")
            logger.error(f"Offending JSON string:\n{offending}")
            return _failure(f"JSON parsing error: {str(error)}")

        if isinstance(data, list):
            transactions = data
            extraction_success = True
        else:
            transactions = data.get("transactions") or []
            extraction_success = data.get("extraction_success", True)
        if not isinstance(transactions, list):
            transactions = []

        # Validate and clean data
        cleaned_transactions = []
        for txn in transactions:
            try:
                cleaned_transactions.append(
                    {
                        "date": _text(txn.get("date")),
                        "amount": float(
                            str(txn.get("amount", 0)).replace("$", "").replace(",", "")
                        ),
                        "vendor": _text(txn.get("vendor")),
                        "memo": _text(txn.get("memo")),
                    }
                )
            except (AttributeError, TypeError, ValueError):
                logger.warning("Skipping malformed transaction entry: %r", txn)
                continue

        return {
            "extraction_success": extraction_success,
            "transactions": cleaned_transactions,
            "total_transactions": len(cleaned_transactions),
        }

    except Exception as e:
        logger.error(f"JSON parsing error (outer): {str(e)}")
        return _failure(f"JSON parsing error: {str(e)}")
|
||||
|
||||
def _parse_date_to_iso(self, date_str: str) -> str:
    """Parse various date formats and convert them to YYYY-MM-DD.

    Supported inputs (case-insensitive, surrounding whitespace ignored):
      * month name or abbreviation plus day, with an optional 4-digit year
        ("MAY 22", "Jun 01, 2024", "May 22 2024", "September 3, 2023");
        when the year is absent the current year is used
      * ISO dates ("2024-01-15", also with trailing text such as a time)
      * MM/DD/YYYY and MM/DD/YY

    Args:
        date_str: The date text to parse.

    Returns:
        The date as "YYYY-MM-DD", or ``None`` when the input is not a string,
        matches no supported format, or is not a real calendar date
        (e.g. "Feb 31").
    """
    try:
        import re
        from datetime import datetime

        date_str = date_str.strip().upper()

        # Month name first. The year may follow after a comma or plain
        # whitespace; the (?!\d) guards stop "MAY 2024" from being read as
        # "May 20" and keep a longer number from being cut into a year.
        month_pattern = (
            r"(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[A-Z]*\.?"
            r"\s+(\d{1,2})(?!\d)(?:(?:,\s*|\s+)(\d{4})(?!\d))?"
        )
        match = re.match(month_pattern, date_str)

        if match:
            month_abbr, day, year = match.groups()
            month_map = {
                "JAN": 1,
                "FEB": 2,
                "MAR": 3,
                "APR": 4,
                "MAY": 5,
                "JUN": 6,
                "JUL": 7,
                "AUG": 8,
                "SEP": 9,
                "OCT": 10,
                "NOV": 11,
                "DEC": 12,
            }

            month = month_map[month_abbr]
            day = int(day)
            year = int(year) if year else datetime.now().year

            # Constructing the datetime rejects impossible dates (ValueError).
            datetime(year, month, day)
            return f"{year:04d}-{month:02d}-{day:02d}"

        # Handle YYYY-MM-DD format (validated, trailing text dropped)
        iso_match = re.match(r"(\d{4}-\d{2}-\d{2})(?!\d)", date_str)
        if iso_match:
            datetime.strptime(iso_match.group(1), "%Y-%m-%d")
            return iso_match.group(1)

        # Handle MM/DD/YYYY and MM/DD/YY formats
        slash_match = re.match(r"\d{1,2}/\d{1,2}/(\d{4}|\d{2})(?!\d)", date_str)
        if slash_match:
            year_format = "%Y" if len(slash_match.group(1)) == 4 else "%y"
            parsed = datetime.strptime(slash_match.group(0), f"%m/%d/{year_format}")
            return f"{parsed.year:04d}-{parsed.month:02d}-{parsed.day:02d}"

        return None

    except Exception:
        # Unparseable or non-string input is reported as None, not raised.
        return None
|
||||
@@ -0,0 +1,992 @@
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict
|
||||
|
||||
import groq
|
||||
from config import settings
|
||||
from schemas import Receipt, Transaction
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LLMTaxAnalyzer:
|
||||
"""
|
||||
Uses LLM to intelligently apply tax rules based on context.
|
||||
|
||||
Implements four core tax rules:
|
||||
1. Sales Tax Rule - Based on receipt location (shipping/billing address)
|
||||
2. Foreign Exchange Rule - Handles currency mismatches
|
||||
3. Depreciation Rule - Capital assets (based on user location)
|
||||
4. Meals & Entertainment Rule - 50% tax deduction, 100% accounting deduction
|
||||
"""
|
||||
|
||||
# Provincial tax rates for reference
|
||||
PROVINCIAL_TAX_RATES = {
|
||||
"ON": {"rate": 0.13, "name": "HST", "type": "Harmonized"},
|
||||
"QC": {"rate": 0.14975, "name": "QST + GST", "type": "Combined"},
|
||||
"BC": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
|
||||
"AB": {"rate": 0.05, "name": "GST", "type": "Federal only"},
|
||||
"SK": {"rate": 0.11, "name": "PST + GST", "type": "Combined"},
|
||||
"MB": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
|
||||
"NS": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
|
||||
"NB": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
|
||||
"NL": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
|
||||
"PE": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
|
||||
"NT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
|
||||
"NU": {"rate": 0.05, "name": "GST", "type": "Federal only"},
|
||||
"YT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
|
||||
}
|
||||
|
||||
# CCA rates by asset class (simplified)
|
||||
CCA_RATES = {
|
||||
"vehicles": 0.30, # Class 10
|
||||
"computer_equipment": 0.55, # Class 50
|
||||
"furniture": 0.20, # Class 8
|
||||
"buildings": 0.04, # Class 1
|
||||
"machinery": 0.20, # Class 8
|
||||
}
|
||||
|
||||
def __init__(self):
    """Set up the Groq client, the model name, and the retry limit."""
    api_key = settings.GROQ_API_KEY
    self.client = groq.Groq(api_key=api_key)
    self.model = settings.model
    # NOTE(review): presumably the retry budget for LLM calls; the code that
    # reads this value is outside this block.
    self.max_retries = 3
|
||||
|
||||
def analyze_and_apply_tax_rules_batch(
    self,
    matches: list,  # List of Match objects
    user_location: str = "ON",
) -> list:
    """
    Run tax analysis for every match using one LLM request.

    Flow:
      1. Build one combined context; if that raises, tag every match with
         "(Batch analysis setup failed)" and return the input list.
      2. Ask the LLM for a dict keyed "match_<index>".
      3. If nothing usable comes back, process the matches one by one when
         there are at most 5 of them; otherwise tag every match with
         "(Batch tax analysis unavailable)" and return the input list.
      4. Apply each per-match analysis; matches whose entry is missing or
         fails to apply are kept with an explanatory suffix on match_reason.

    Args:
        matches: Match objects exposing ``match_reason`` (plus whatever the
            context builder and ``_apply_tax_analysis_to_match`` read).
        user_location: Location of the user, forwarded to the context builder.

    Returns:
        A list with one entry per input match, in input order. An empty
        input is returned unchanged.
    """
    if not matches:
        return matches

    total = len(matches)
    logger.info(f"Starting batch tax analysis for {total} matches")

    # Step 1: one shared context for the whole batch.
    try:
        batch_context = self._build_batch_analysis_context(matches, user_location)
    except Exception as exc:
        logger.error(f"Error building batch context: {exc}")
        # Without a context there is nothing to send; hand the matches back annotated.
        for item in matches:
            item.match_reason += " (Batch analysis setup failed)"
        return matches

    # Step 2: a single LLM round-trip for all matches.
    batch_results = self._get_llm_tax_analysis_batch(batch_context, total)

    # Step 3: empty result -> fallback strategy depends on batch size.
    if not batch_results:
        logger.warning("Batch LLM analysis returned empty results")

        if total > 5:
            # Large batches are not retried individually to avoid excessive API calls.
            logger.warning(
                f"Batch too large ({total} matches) for individual fallback - returning matches without enhanced tax analysis"
            )
            for item in matches:
                item.match_reason += " (Batch tax analysis unavailable)"
            return matches

        logger.info(
            f"Attempting individual processing fallback for {total} matches"
        )
        return self._process_matches_individually(matches, user_location)

    logger.info(f"Received batch analysis for {len(batch_results)} matches")

    # Step 4: distribute the per-match results.
    enhanced = []
    for index, item in enumerate(matches):
        key = f"match_{index}"
        try:
            analysis = batch_results.get(key, {})
            if isinstance(analysis, dict) and analysis:
                enhanced.append(self._apply_tax_analysis_to_match(item, analysis))
            else:
                # The LLM skipped this match (or returned a non-dict entry).
                logger.warning(f"No analysis found for match {index} (key: {key})")
                item.match_reason += " (Tax analysis incomplete)"
                enhanced.append(item)
        except Exception as exc:
            logger.error(f"Error applying tax analysis to match {index}: {exc}")
            item.match_reason += " (Tax analysis error)"
            enhanced.append(item)

    logger.info(
        f"Completed batch tax analysis, enhanced {len(enhanced)} matches"
    )
    return enhanced
|
||||
|
||||
def _process_matches_individually(self, matches: list, user_location: str) -> list:
    """
    Fallback path: analyse each match with its own LLM request.

    Each match is passed through ``analyze_and_apply_tax_rules`` and the
    result is applied with ``_apply_tax_analysis_to_match``. A match whose
    processing raises is kept as-is with " (Individual tax analysis failed)"
    appended to its ``match_reason``.

    Returns:
        A list with one entry per input match, in input order.
    """
    total = len(matches)
    logger.info(f"Processing {total} matches individually as fallback")

    results = []
    for position, item in enumerate(matches, start=1):
        try:
            analysis = self.analyze_and_apply_tax_rules(
                item.receipt, item.transaction, user_location
            )
            results.append(self._apply_tax_analysis_to_match(item, analysis))
            logger.info(
                f"Successfully processed match {position}/{total} individually"
            )
        except Exception as exc:
            # The error message reports the zero-based index of the match.
            logger.error(
                f"Error in individual processing for match {position - 1}: {exc}"
            )
            item.match_reason += " (Individual tax analysis failed)"
            results.append(item)

    return results
|
||||
|
||||
def analyze_and_apply_tax_rules(
    self,
    receipt: Receipt,
    transaction: Transaction,
    user_location: str = "ON",  # Default to Ontario
) -> Dict[str, Any]:
    """
    Single-match tax analysis (legacy entry point, kept for compatibility).

    Prefer analyze_and_apply_tax_rules_batch() when several matches are
    processed together.

    The work is a three-stage pipeline:
      1. ``_build_analysis_context`` renders the receipt, the transaction and
         the user location into prompt text.
      2. ``_get_llm_tax_analysis`` asks the LLM to apply the four rules
         (sales tax, foreign exchange, depreciation, meals & entertainment).
      3. ``_structure_analysis_results`` parses the reply into a dict.

    Returns:
        Whatever ``_structure_analysis_results`` produces for the reply.
    """
    context_text = self._build_analysis_context(receipt, transaction, user_location)
    raw_reply = self._get_llm_tax_analysis(context_text)
    return self._structure_analysis_results(
        raw_reply, receipt, transaction, user_location
    )
|
||||
|
||||
def _build_analysis_context(
    self, receipt: Receipt, transaction: Transaction, user_location: str
) -> str:
    """Render one receipt/transaction pair plus reference tables as prompt text.

    The text contains the receipt details, the transaction details, the
    user's province with its rate and tax name (falling back to 0.13 / "HST"
    when the province is not in PROVINCIAL_TAX_RATES), the detected receipt
    location, and JSON dumps of PROVINCIAL_TAX_RATES and CCA_RATES.
    """
    # Where the purchase happened, as far as the receipt tells us.
    receipt_location = self._extract_receipt_location(receipt)

    # "Canada", "Ontario", "ON", ... -> province code.
    user_province = self._normalize_location_to_province(user_location)

    logger.info(
        f"Building tax analysis context - User Location: {user_location} → Province Code: {user_province}"
    )

    # Reference tables handed to the LLM verbatim.
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)

    province_entry = self.PROVINCIAL_TAX_RATES.get(user_province, {})
    rate_percent = province_entry.get("rate", 0.13) * 100
    tax_label = province_entry.get("name", "HST")

    return f"""
RECEIPT DETAILS:
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Currency: {receipt.currency}
- Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
- Category: {receipt.category}
- Description: {receipt.description}
- Billing Address: {self._format_address(receipt.billing_address)}
- Shipping Address: {self._format_address(receipt.shipping_address)}
- Is Meals & Entertainment: {receipt.is_meals_entertainment}

TRANSACTION DETAILS:
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Currency: {transaction.currency}
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
- Notes: {transaction.notes}
- FX Rate: {transaction.fx_rate or "N/A"}

USER CONTEXT:
- User Location (Province): {user_province}
- User Province Tax Rate: {rate_percent}%
- User Tax Type: {tax_label}

RECEIPT LOCATION DETECTED:
{receipt_location}

PROVINCIAL TAX RATES REFERENCE:
{tax_rates_info}

CCA DEPRECIATION RATES BY ASSET CLASS:
{cca_rates_info}
"""
|
||||
|
||||
def _normalize_location_to_province(self, location: str) -> str:
    """
    Normalize a free-form location to a two-letter province/territory code.

    Recognised inputs (case-insensitive, surrounding and repeated internal
    whitespace ignored, accents ignored):
      * a code that is a key of ``PROVINCIAL_TAX_RATES`` ("ON", " qc ")
      * a full province/territory name ("Ontario", "Québec",
        "Newfoundland and Labrador")

    Everything else -- country-level values such as "Canada", the empty
    string, unknown text, ``None`` or a non-string -- falls back to "ON"
    and a warning is logged.

    Args:
        location: Province code or name as supplied by the caller.

    Returns:
        The province code, or "ON" when the input cannot be resolved.
    """
    import unicodedata  # local import: only this method needs it

    if not isinstance(location, str):
        # None / non-string input used to crash on .upper(); treat it like
        # any other unparseable value instead.
        logger.warning(f"Could not parse location '{location}', defaulting to ON")
        return "ON"

    # Drop combining marks so "Québec" compares equal to "QUEBEC", then
    # upper-case and collapse whitespace ("british   columbia" -> "BRITISH COLUMBIA").
    decomposed = unicodedata.normalize("NFKD", location)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    location_upper = " ".join(unaccented.upper().split())

    # Direct province code match
    if location_upper in self.PROVINCIAL_TAX_RATES:
        return location_upper

    # Map full province names to codes
    province_name_map = {
        "ONTARIO": "ON",
        "QUEBEC": "QC",
        "BRITISH COLUMBIA": "BC",
        "ALBERTA": "AB",
        "SASKATCHEWAN": "SK",
        "MANITOBA": "MB",
        "NOVA SCOTIA": "NS",
        "NEW BRUNSWICK": "NB",
        "NEWFOUNDLAND AND LABRADOR": "NL",
        "NEWFOUNDLAND": "NL",
        "PRINCE EDWARD ISLAND": "PE",
        "NORTHWEST TERRITORIES": "NT",
        "NUNAVUT": "NU",
        "YUKON": "YT",
    }

    code = province_name_map.get(location_upper)
    if code is not None:
        return code

    # Country-level or empty input carries no province information.
    if location_upper in ("CANADA", "CAN", "CA", ""):
        logger.warning(f"Location '{location}' is too generic, defaulting to ON")
        return "ON"

    # If nothing matches, default to Ontario
    logger.warning(f"Could not parse location '{location}', defaulting to ON")
    return "ON"
|
||||
|
||||
def _extract_receipt_location(self, receipt: Receipt) -> str:
    """Describe where the receipt was issued, as bullet lines for the prompt.

    The shipping address wins when it is present (truthy); otherwise the
    billing address is used. With neither, a single line saying that no
    address information is available is returned.
    """
    # Shipping address takes priority over billing address.
    location = receipt.shipping_address or receipt.billing_address

    if not location:
        return "- No address information available (will use user location)"

    return f"""
- Province: {location.province}
- City: {location.city}
- Country: {location.country}
- Postal Code: {location.postal_code}
"""
|
||||
|
||||
def _format_address(self, address) -> str:
    """Render an address as "city, province, country (postal_code)".

    Returns "Not provided" when ``address`` is falsy (e.g. ``None``).
    """
    if not address:
        return "Not provided"

    place = ", ".join(
        f"{part}" for part in (address.city, address.province, address.country)
    )
    return f"{place} ({address.postal_code})"
|
||||
|
||||
def _get_llm_tax_analysis(self, context: str) -> str:
    """Ask the LLM to apply the four tax rules to a single match.

    Args:
        context: Prompt text describing the receipt, the transaction and the
            user, as produced by ``_build_analysis_context``.

    Returns:
        The stripped text of the first completion choice. If the request (or
        reading the reply) raises, the JSON string produced by
        ``_get_fallback_analysis`` is returned instead.
    """
    prompt = f"""
You are a tax expert analyzing a receipt-transaction match. Apply the following tax rules intelligently:
And you are to calculate the tax for the receipt based on the context provided.

{context}

=== FOUR CORE TAX RULES ===

### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.

**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).

**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST
- The user's location is only for depreciation purposes

b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead

c) User in USA (New York), Receipt from California:
- Apply California's sales tax rate (receipt location)
- Not New York's rate (user location)

d) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)
- This is the fallback when receipt location is unknown

**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review

### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.

**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate the absolute discrepancy: |receipt_amount - transaction_amount|
- ALWAYS flag for manual review - DO NOT fetch exchange rates automatically
- If FX rate is provided in transaction data, note it but still require manual review

**Examples**:
- Transaction: USD $100, Receipt: CAD $125 → Discrepancy: $25, Flag for review
- The user must manually approve or adjust the FX difference

### 3. DEPRECIATION RULE
**Purpose**: Calculate depreciation for assets using two methods.

**Key Principle**: Depreciation is ALWAYS based on USER'S location, NOT receipt location.

**Asset Identification**:
- Only applies to capital assets: vehicles, equipment, furniture, buildings, machinery
- Identify from receipt category and description
- Typical threshold: Assets generally > $500

**Two Methods Required**:
a) **Straight-Line Depreciation** (for accounting purposes):
Formula: (Cost - Residual Value) / Useful Life
Example: Asset $10,000, 5-year life, $1,000 residual = $1,800/year

b) **CCA Depreciation** (for tax purposes - Canada):
Method: Declining Balance
Formula: Book Value × CCA Rate each year
Example: Truck $20,000, 30% CCA:
- Year 1: $20,000 × 30% = $6,000
- Year 2: ($20,000 - $6,000) × 30% = $4,200
- Continues declining each year

**CCA Classes** (Canada):
- Vehicles: 30% (Class 10)
- Computer Equipment: 55% (Class 50)
- Furniture/Machinery: 20% (Class 8)
- Buildings: 4% (Class 1)

### 4. MEALS & ENTERTAINMENT TAX DEDUCTION RULE
**Purpose**: Apply correct deductions for meals and entertainment expenses.

**Deduction Rules**:
- **For Tax Purposes**: Only 50% of total receipt amount is deductible
- **For Accounting Purposes**: 100% of total receipt amount is deductible
- **Sales Tax**: Full sales tax amount is deductible for accounting

**Example**:
- Receipt: $100 meal + $12 sales tax = $112 total
- **Tax Deduction**: $50 (50% of meal) + $12 (full tax) = $62
- **Accounting Deduction**: $100 (full meal) + $12 (full tax) = $112

=== LOCATION-BASED SCENARIO HANDLING ===

**When Receipt Location ≠ User Location**:

1. **Sales Tax**: Use RECEIPT's location for tax calculation
- Exception: If international (different country), no Canadian sales tax + flag FX
- Exception: If no location on receipt, use user's location as default

2. **Depreciation**: ALWAYS use USER's location for depreciation rules
- Receipt location is irrelevant for depreciation
- Apply user's country/province depreciation methods

3. **FX Handling**:
- If receipt currency ≠ transaction currency: Flag for manual review
- Do NOT automatically fetch or apply exchange rates

4. **Missing Location**:
- If receipt has no address: Default to user's location for sales tax
- Still apply user's location for depreciation

=== ANALYSIS REQUIRED ===

Provide a structured JSON response with the following format:

**CRITICAL INSTRUCTION FOR final_tax_amount:**
- This field MUST contain ONLY the calculated sales tax amount in dollars
- This is NOT the total amount including tax
- This is ONLY the tax portion (HST/GST/PST/QST)
- Example: If receipt total is $100 and calculated tax is $13, return 13.00 (not 113.00)
- For meals & entertainment: Return the FULL calculated tax amount (not the 50% adjusted amount)

{{
"final_tax_amount": XX.XX, // ONLY the calculated tax amount (e.g., 13.00 for $100 + $13 HST)
"sales_tax": {{
"applicable_province": "XX",
"applicable_rate": 0.XX,
"tax_name": "HST/GST/PST/QST",
"calculated_tax": XX.XX, // This should match final_tax_amount above
"stated_tax": XX.XX,
"discrepancy": XX.XX,
"reason": "Detailed explanation",
"requires_review": true/false
}},
"foreign_exchange": {{
"currency_mismatch": true/false,
"receipt_currency": "XXX",
"transaction_currency": "XXX",
"receipt_amount": XX.XX,
"transaction_amount": XX.XX,
"discrepancy": XX.XX,
"requires_manual_review": true/false,
"reason": "Explanation of FX situation"
}},
"depreciation": {{
"is_capital_asset": true/false,
"asset_class": "category name or N/A",
"suggested_cca_rate": 0.XX,
"straight_line_applicable": true/false,
"cca_applicable": true/false,
"straight_line_example": "Brief calculation example if applicable",
"cca_example": "Brief calculation example if applicable",
"reason": "Why this is/isn't a capital asset, which CCA class, and why depreciation based on user's location"
}},
"meals_entertainment": {{
"is_meals_entertainment": true/false,
"tax_deduction_amount": XX.XX,
"accounting_deduction_amount": XX.XX,
"sales_tax_included": XX.XX,
"reason": "Explanation of M&E rule application"
}},
"confidence_adjustment": {{
"boost": 0.XX,
"reduce": 0.XX,
"reason": "Why confidence should be adjusted based on tax analysis"
}},
"overall_assessment": "Comprehensive summary: which rules applied, why, what location used for what purpose, and any required actions"
}}

**IMPORTANT**: The "final_tax_amount" field at the top level must contain the final calculated tax amount. This should be the calculated_tax from sales_tax analysis. If this is a meals & entertainment expense, ensure you return the FULL tax amount here (not the 50% adjusted amount).

**Critical Reminders**:
- Sales tax uses RECEIPT location (or user location if receipt has none)
- Depreciation ALWAYS uses USER location
- For different addresses, use SHIPPING address for sales tax
- International transactions: no Canadian tax + FX flag
- Be precise with all calculations
- Always explain your reasoning clearly
"""

    system_message = {
        "role": "system",
        "content": "You are a Canadian tax expert. Analyze transactions and apply tax rules accurately. Always return valid JSON.",
    }
    user_message = {"role": "user", "content": prompt}

    try:
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[system_message, user_message],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=2000,
        )
        reply_text = completion.choices[0].message.content.strip()
    except Exception as exc:
        # Any failure (request error, missing content, ...) degrades to the canned analysis.
        logger.error(f"Error getting LLM tax analysis: {exc}")
        return self._get_fallback_analysis()

    try:
        logger.info(f"LLM tax analysis received: {len(reply_text)} characters")
    except Exception as exc:
        logger.error(f"Error getting LLM tax analysis: {exc}")
        return self._get_fallback_analysis()
    return reply_text
|
||||
|
||||
def _get_fallback_analysis(self) -> str:
    """Return the canned analysis (as a JSON string) used when the LLM call fails.

    The payload reports zero tax with Ontario HST defaults, marks the sales
    tax section for review, and reduces confidence by 0.1.
    """
    sales_tax = {
        "applicable_province": "ON",
        "applicable_rate": 0.13,
        "tax_name": "HST",
        "calculated_tax": 0.0,
        "stated_tax": 0.0,
        "discrepancy": 0.0,
        "reason": "LLM analysis failed - using defaults",
        "requires_review": True,
    }
    foreign_exchange = {
        "currency_mismatch": False,
        "requires_manual_review": False,
        "reason": "Analysis not available",
    }
    depreciation = {
        "is_capital_asset": False,
        "reason": "Analysis not available",
    }
    meals_entertainment = {
        "is_meals_entertainment": False,
        "reason": "Analysis not available",
    }
    confidence_adjustment = {
        "boost": 0.0,
        "reduce": 0.1,
        "reason": "LLM analysis failed - recommend manual review",
    }

    # Key order matters for the serialized text, so the sections are
    # assembled in the same order the single-match prompt lists them.
    payload = {
        "final_tax_amount": 0.0,
        "sales_tax": sales_tax,
        "foreign_exchange": foreign_exchange,
        "depreciation": depreciation,
        "meals_entertainment": meals_entertainment,
        "confidence_adjustment": confidence_adjustment,
        "overall_assessment": "Automatic analysis failed. Manual review recommended.",
    }
    return json.dumps(payload)
|
||||
|
||||
def _structure_analysis_results(
    self,
    llm_response: str,
    receipt: Receipt,
    transaction: Transaction,
    user_location: str,
) -> Dict[str, Any]:
    """
    Parse the raw LLM reply into an analysis dict and attach metadata.

    The reply may be bare JSON or JSON wrapped in a markdown code fence
    (```json ... ``` or ``` ... ```). On success the parsed object gets a
    "metadata" entry (user location, receipt/transaction ids, method, model).

    A structured fallback is returned -- instead of raising -- when the reply
    is missing/empty, is not valid JSON, or is valid JSON that is not an
    object (e.g. a list). Previously only invalid JSON was handled; the
    other two cases escaped as ``TypeError``.

    Args:
        llm_response: Raw text returned by the LLM (``None`` is tolerated).
        receipt: Receipt of the match; ``id``, ``tax`` and ``currency`` are read.
        transaction: Transaction of the match; ``id`` and ``currency`` are read.
        user_location: Location string recorded in the metadata.

    Returns:
        The parsed analysis with "metadata" added, or the fallback dict
        (which carries an "error" message and
        ``metadata["analysis_method"] == "fallback"``).
    """
    try:
        if not isinstance(llm_response, str) or not llm_response.strip():
            raise ValueError("LLM response is empty")

        # Extract JSON from LLM response (may have markdown code blocks)
        json_str = llm_response
        if "```json" in llm_response:
            json_str = llm_response.split("```json")[1].split("```")[0].strip()
        elif "```" in llm_response:
            json_str = llm_response.split("```")[1].split("```")[0].strip()

        analysis = json.loads(json_str)

        # Metadata is attached by key, so anything but an object is unusable.
        if not isinstance(analysis, dict):
            raise ValueError(
                f"expected a JSON object, got {type(analysis).__name__}"
            )

        analysis["metadata"] = {
            "user_location": user_location,
            "receipt_id": receipt.id,
            "transaction_id": transaction.id,
            "analysis_method": "LLM-based",
            "model": self.model,
        }
        return analysis

    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
        logger.error(f"Failed to parse LLM response as JSON: {str(e)}")
        logger.error(f"LLM response was: {llm_response}")

        # Structured fallback: trust the receipt's own tax figure and push
        # the match towards manual review.
        return {
            "final_tax_amount": receipt.tax if receipt.tax else 0.0,
            "sales_tax": {
                "requires_review": True,
                "reason": "Failed to parse LLM response",
            },
            "foreign_exchange": {
                "requires_manual_review": receipt.currency != transaction.currency
            },
            "depreciation": {"is_capital_asset": False},
            # Present in every other analysis shape, so include it here too;
            # the receipt's own flag is the best information available.
            "meals_entertainment": {
                "is_meals_entertainment": bool(
                    getattr(receipt, "is_meals_entertainment", False)
                ),
                "reason": "Analysis not available",
            },
            "confidence_adjustment": {
                "boost": 0.0,
                "reduce": 0.15,
                "reason": "Analysis parsing failed",
            },
            "overall_assessment": "Analysis failed. Manual review required.",
            "error": str(e),
            "metadata": {
                "user_location": user_location,
                "analysis_method": "fallback",
            },
        }
|
||||
|
||||
def _build_batch_analysis_context(self, matches: list, user_location: str) -> str:
    """Render every match plus the shared reference tables as one prompt text.

    The user context and the PROVINCIAL_TAX_RATES / CCA_RATES dumps appear
    once at the top; each match then gets its own "MATCH <i> (ID: match_<i>)"
    section, numbered by its position in ``matches``.
    """
    # "Canada", "Ontario", "ON", ... -> province code.
    user_province = self._normalize_location_to_province(user_location)

    logger.info(
        f"Building batch tax analysis context for {len(matches)} matches - User Location: {user_location} → Province Code: {user_province}"
    )

    # Reference tables are serialized a single time for the whole batch.
    tax_rates_info = json.dumps(self.PROVINCIAL_TAX_RATES, indent=2)
    cca_rates_info = json.dumps(self.CCA_RATES, indent=2)

    sections = []
    for i, match in enumerate(matches):
        receipt, transaction = match.receipt, match.transaction
        receipt_location = self._extract_receipt_location(receipt)

        sections.append(f"""
MATCH {i} (ID: match_{i}):
Receipt Details:
- Vendor: {receipt.vendor}
- Amount: ${receipt.amount:.2f}
- Currency: {receipt.currency}
- Date: {receipt.receipt_date.strftime("%Y-%m-%d")}
- Category: {receipt.category}
- Description: {receipt.description}
- Billing Address: {self._format_address(receipt.billing_address)}
- Shipping Address: {self._format_address(receipt.shipping_address)}
- Is Meals & Entertainment: {receipt.is_meals_entertainment}

Transaction Details:
- Vendor: {transaction.vendor}
- Amount: ${transaction.amount:.2f}
- Currency: {transaction.currency}
- Date: {transaction.transaction_date.strftime("%Y-%m-%d")}
- Notes: {transaction.notes}
- FX Rate: {transaction.fx_rate or "N/A"}

Receipt Location Detected:
{receipt_location}
""")

    matches_section = "\n".join(sections)

    province_entry = self.PROVINCIAL_TAX_RATES.get(user_province, {})
    rate_percent = province_entry.get("rate", 0.13) * 100
    tax_label = province_entry.get("name", "HST")

    return f"""
USER CONTEXT:
- User Location (Province): {user_province}
- User Province Tax Rate: {rate_percent}%
- User Tax Type: {tax_label}

PROVINCIAL TAX RATES REFERENCE:
{tax_rates_info}

CCA DEPRECIATION RATES BY ASSET CLASS:
{cca_rates_info}

=== MATCHES TO ANALYZE ({len(matches)} total) ===
{matches_section}
"""
|
||||
|
||||
def _get_llm_tax_analysis_batch(self, context: str, num_matches: int) -> Dict[str, Any]:
    """Get tax rule analysis from the LLM for ALL matches in a single call.

    Args:
        context: Prompt text describing the user and every match, as
            produced by ``_build_batch_analysis_context``.
        num_matches: Number of matches described in ``context``; used in the
            prompt and in log messages.

    Returns:
        The parsed JSON object (expected keys: "match_0", "match_1", ...).
        An empty dict is returned when the request fails, the reply is
        empty, no JSON object can be extracted, or the JSON is not an object.

    Reply parsing is tolerant of common wrappers:
      * a markdown fence, with or without a language tag, closed or left
        open (e.g. when the reply was cut off);
      * explanatory prose before/after the JSON -- if the text does not
        parse as-is, the span from the first "{" to the last "}" is retried.
    """
    prompt = f"""
You are a Canadian tax expert analyzing MULTIPLE receipt-transaction matches.

{context}

=== FOUR CORE TAX RULES ===

### 1. SALES TAX RULE
**Purpose**: Calculate and apply correct sales tax based on shipping and billing addresses.

**Key Principles**:
- When billing and shipping addresses are THE SAME: Apply sales tax based on that address location.
- When billing and shipping addresses are DIFFERENT: Apply sales tax based on the SHIPPING address.
- Tax rate is determined by the RECEIPT'S location, NOT the user's location (unless no receipt location).

**Scenario Examples**:
a) User in Ontario, Receipt from Quebec:
- Apply Quebec's tax rate (14.975% QST+GST), not Ontario's 13% HST

b) User in Ontario, Receipt from USA (New York):
- DO NOT apply Canadian sales tax
- This is an international transaction
- Flag for FX review instead

c) User in Ontario, Receipt has NO address information:
- DEFAULT to user's location (Ontario 13% HST)

**Tax Calculation**:
- Compare calculated tax vs stated tax on receipt
- Flag discrepancies for review

### 2. FOREIGN EXCHANGE (FX) RULE
**Purpose**: Handle currency mismatches between receipts and transactions.

**Actions**:
- Identify when receipt currency ≠ transaction currency (e.g., USD vs CAD)
- Calculate expected transaction amount using FX rate if available
- Flag discrepancies > $5 or 5% for manual review
- If FX rate missing but currencies differ, flag for review

### 3. DEPRECIATION RULE
**Purpose**: Identify capital assets requiring depreciation based on USER'S location.

**Critical**: Depreciation is ALWAYS based on the USER'S location (for Canadian tax filing), NOT the receipt location.

**Capital Asset Criteria**:
- Cost > $500 typically
- Useful life > 1 year
- Examples: computers, vehicles, furniture, machinery, buildings

**CCA Classes**: Assign appropriate class and rate based on asset type and user's jurisdiction

### 4. MEALS & ENTERTAINMENT RULE
**Purpose**: Apply 50% tax deduction limit for M&E expenses.

**Actions**:
- Identify M&E expenses (meals, entertainment, client dinners, etc.)
- Tax Deduction: 50% of total amount (including tax)
- Accounting Deduction: 100% of total amount (including tax)
- Always include sales tax in both calculations

=== YOUR TASK ===

Analyze EACH match and return a JSON object where each key is the match ID and the value is the complete tax analysis.

**CRITICAL INSTRUCTION FOR final_tax_amount:**
- This field MUST contain ONLY the calculated sales tax amount in dollars
- This is NOT the total amount including tax
- This is ONLY the tax portion (HST/GST/PST/QST)
- Example: If receipt total is $100 and calculated tax is $13, return 13.00 (not 113.00)
- For meals & entertainment: Return the FULL calculated tax amount (not the 50% adjusted amount)
- VERIFY: final_tax_amount should equal sales_tax.calculated_tax
-

Return your response as a SINGLE JSON object in this format:

{{
"match_0": {{
"final_tax_amount": XX.XX, // ONLY the calculated tax amount
"sales_tax": {{
"applicable_province": "XX",
"applicable_rate": 0.XX,
"tax_name": "HST/GST/PST",
"calculated_tax": XX.XX,
"stated_tax": XX.XX,
"discrepancy": XX.XX,
"reason": "Detailed explanation",
"requires_review": true/false
}},
"foreign_exchange": {{
"currency_mismatch": true/false,
"receipt_currency": "XXX",
"transaction_currency": "XXX",
"expected_transaction_amount": XX.XX,
"actual_transaction_amount": XX.XX,
"discrepancy": XX.XX,
"requires_manual_review": true/false,
"reason": "Explanation"
}},
"depreciation": {{
"is_capital_asset": true/false,
"asset_class": "class_XX",
"cca_rate": 0.XX,
"applicable_jurisdiction": "XX",
"reason": "Explanation"
}},
"meals_entertainment": {{
"is_meals_entertainment": true/false,
"tax_deduction_amount": XX.XX,
"accounting_deduction_amount": XX.XX,
"sales_tax_included": XX.XX,
"reason": "Explanation"
}},
"confidence_adjustment": {{
"boost": 0.XX,
"reduce": 0.XX,
"reason": "Why confidence should be adjusted"
}},
"overall_assessment": "Summary for this match"
}},
"match_1": {{
... same structure ...
}},
... for all {num_matches} matches ...
}}
"""
    # Defined before the try block so the JSON error handler can always log it.
    json_str = ""
    try:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a Canadian tax expert. Analyze multiple transactions in batch and apply tax rules accurately. Return ONLY valid JSON - no markdown code blocks, no explanations, just pure JSON.",
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,  # Low temperature for consistent, factual responses
            max_tokens=8000,  # Higher limit for batch processing
        )

        content = response.choices[0].message.content

        # Validate that we got content
        if not content:
            logger.error("LLM returned empty response")
            return {}

        content = content.strip()

        # Check if content is empty after stripping
        if not content:
            logger.error("LLM returned whitespace-only response")
            return {}

        logger.info(
            f"LLM batch tax analysis received: {len(content)} characters for {num_matches} matches"
        )
        logger.debug(f"Raw LLM response: {content[:500]}...")  # Log first 500 chars

        # Unwrap a markdown code fence if the model added one despite instructions.
        json_str = content
        if "```" in content:
            # Prefer an explicit ```json fence; otherwise take the first fence.
            marker = "```json" if "```json" in content else "```"
            # Text after the opening fence, up to the closing fence -- or to
            # the end of the reply when the closing fence is missing.
            fenced = content.split(marker, 1)[1].split("```", 1)[0]
            # Drop a language tag sitting alone on the first line.
            first_line, newline, remainder = fenced.partition("\n")
            if newline and first_line.strip() in ("json", "javascript", "js", ""):
                fenced = remainder
            json_str = fenced.strip()

        # Validate JSON string is not empty
        if not json_str:
            logger.error("Extracted JSON string is empty")
            logger.error(f"Original content was: {content[:500]}")
            return {}

        try:
            batch_analysis = json.loads(json_str)
        except json.JSONDecodeError:
            # Prose around the payload: retry on the outermost {...} span.
            start, end = json_str.find("{"), json_str.rfind("}")
            if start == -1 or end <= start:
                raise
            json_str = json_str[start : end + 1]
            batch_analysis = json.loads(json_str)

        # Validate we got a dictionary back
        if not isinstance(batch_analysis, dict):
            logger.error(f"LLM returned non-dict type: {type(batch_analysis)}")
            return {}

        logger.info(
            f"Successfully parsed batch analysis with {len(batch_analysis)} matches"
        )
        return batch_analysis

    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error in batch LLM tax analysis: {str(e)}")
        logger.error(f"Failed to parse: {json_str[:500] if json_str else 'N/A'}")
        return {}
    except Exception as e:
        logger.error(f"Error getting batch LLM tax analysis: {str(e)}")
        logger.error(f"Exception type: {type(e).__name__}")
        # Return empty dict so the caller can decide on a fallback strategy
        return {}
|
||||
|
||||
def _apply_tax_analysis_to_match(self, match, tax_analysis: Dict[str, Any]):
    """Apply tax analysis results to a match object.

    Reconciles ``final_tax_amount`` with ``sales_tax.calculated_tax``
    (the calculated value wins on disagreement), stores the analysis on the
    match, applies the confidence boost/reduction it carries, and appends
    any manual-review flags to ``match.match_reason``.

    Args:
        match: Object exposing ``receipt.vendor``, ``confidence_score`` and
            ``match_reason``; it is updated in place.
        tax_analysis: Parsed analysis for this match. Mutated in place:
            ``final_tax_amount`` is always written, and
            ``sales_tax.requires_review`` may be set.

    Returns:
        The same ``match`` object that was passed in.
    """
    # Sections and numeric fields may be missing or null in the parsed
    # response, so every lookup falls back to a neutral default instead of
    # raising on ``None``.
    sales_tax = tax_analysis.get("sales_tax") or {}
    final_tax = tax_analysis.get("final_tax_amount") or 0.0
    calculated_tax = sales_tax.get("calculated_tax") or 0.0

    if final_tax == 0.0 and calculated_tax > 0.0:
        # Special case: a zero final amount with a positive calculated tax
        # is always corrected and additionally flagged for review.
        # ``calculated_tax > 0`` implies ``sales_tax`` is the dict stored in
        # ``tax_analysis``, so this mutation is visible to callers.
        logger.warning(
            f"Correcting zero final_tax_amount for {match.receipt.vendor}: "
            f"LLM returned 0 but calculated {calculated_tax} HST. Setting final_tax_amount={calculated_tax}"
        )
        final_tax = calculated_tax
        sales_tax["requires_review"] = True
    elif abs(final_tax - calculated_tax) > 0.01:
        # Any other disagreement beyond one cent: calculated_tax is the
        # source of truth.
        logger.warning(
            f"Correcting final_tax_amount mismatch for {match.receipt.vendor}: "
            f"LLM returned final_tax_amount={final_tax}, but calculated_tax={calculated_tax}. "
            f"Using calculated_tax as final value."
        )
        final_tax = calculated_tax

    # Always write the key back so later readers (including the debug log
    # below) never hit a KeyError when the response omitted it.
    tax_analysis["final_tax_amount"] = final_tax

    # Apply the corrected tax analysis
    match.tax_analysis = tax_analysis

    logger.debug(
        f"Applied tax analysis to match: {match.receipt.vendor} -> "
        f"final_tax_amount={tax_analysis['final_tax_amount']}"
    )

    # Apply confidence adjustments based on tax analysis
    confidence_adj = tax_analysis.get("confidence_adjustment") or {}

    # Boost confidence if tax rules validate the match (capped at 1.0)
    boost = confidence_adj.get("boost") or 0.0
    if boost > 0:
        match.confidence_score = min(1.0, match.confidence_score + boost)
        match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"

    # Reduce confidence if tax issues detected (floored at 0.0)
    reduce = confidence_adj.get("reduce") or 0.0
    if reduce > 0:
        match.confidence_score = max(0.0, match.confidence_score - reduce)
        match.match_reason += f" (Tax issues detected: -{reduce:.2f})"

    # Collect flags for manual review
    review_flags = []

    # Sales tax issues (includes the zero-tax correction above)
    if sales_tax.get("requires_review", False):
        review_flags.append("Sales Tax Review Required")

    # FX issues
    fx_analysis = tax_analysis.get("foreign_exchange") or {}
    if fx_analysis.get("requires_manual_review", False):
        review_flags.append(
            f"FX Review Required (Discrepancy: ${fx_analysis.get('discrepancy') or 0:.2f})"
        )

    # Depreciation
    depreciation = tax_analysis.get("depreciation") or {}
    if depreciation.get("is_capital_asset", False):
        review_flags.append(
            f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
        )

    # Meals & entertainment
    meals_ent = tax_analysis.get("meals_entertainment") or {}
    if meals_ent.get("is_meals_entertainment", False):
        tax_deduction = meals_ent.get("tax_deduction_amount") or 0
        accounting_deduction = meals_ent.get("accounting_deduction_amount") or 0
        review_flags.append(
            f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
        )

    # Add review flags to match reason
    if review_flags:
        match.match_reason += " | REVIEW: " + "; ".join(review_flags)

    return match
|
||||
@@ -0,0 +1,583 @@
|
||||
"""
|
||||
Manual Tax Calculator - Rule-based tax calculations without LLM
|
||||
Implements the four core tax rules based on rules.py specifications
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
from schemas import Receipt, Transaction
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ManualTaxCalculator:
    """
    Deterministic tax calculator based on explicit rules from rules.py
    No LLM calls - pure business logic for accurate, consistent tax calculations

    The public entry points are ``calculate_tax_analysis`` (builds the full
    analysis dict for one receipt/transaction pair) and
    ``format_analysis_summary`` (renders that dict as text).
    """

    # Provincial tax rates for Canada, keyed by two-letter province code
    PROVINCIAL_TAX_RATES = {
        "ON": {"rate": 0.13, "name": "HST", "type": "Harmonized"},
        "QC": {"rate": 0.14975, "name": "QST + GST", "type": "Combined"},
        "BC": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "AB": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "SK": {"rate": 0.11, "name": "PST + GST", "type": "Combined"},
        "MB": {"rate": 0.12, "name": "PST + GST", "type": "Combined"},
        "NS": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NB": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NL": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "PE": {"rate": 0.15, "name": "HST", "type": "Harmonized"},
        "NT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "NU": {"rate": 0.05, "name": "GST", "type": "Federal only"},
        "YT": {"rate": 0.05, "name": "GST", "type": "Federal only"},
    }

    # Upper-case full province names mapped to their province code; used
    # when an address spells the province out instead of abbreviating it.
    PROVINCE_NAMES = {
        "ONTARIO": "ON",
        "QUEBEC": "QC",
        "BRITISH COLUMBIA": "BC",
        "ALBERTA": "AB",
        "SASKATCHEWAN": "SK",
        "MANITOBA": "MB",
        "NOVA SCOTIA": "NS",
        "NEW BRUNSWICK": "NB",
        "NEWFOUNDLAND": "NL",
        "PRINCE EDWARD ISLAND": "PE",
        "NORTHWEST TERRITORIES": "NT",
        "NUNAVUT": "NU",
        "YUKON": "YT",
    }

    # CCA rates by asset class (Canada Revenue Agency rates)
    CCA_RATES = {
        "vehicles": {"rate": 0.30, "class": "Class 10", "description": "Vehicles"},
        "computer_equipment": {
            "rate": 0.55,
            "class": "Class 50",
            "description": "Computer Equipment",
        },
        "furniture": {
            "rate": 0.20,
            "class": "Class 8",
            "description": "Furniture & Fixtures",
        },
        "buildings": {"rate": 0.04, "class": "Class 1", "description": "Buildings"},
        "machinery": {
            "rate": 0.20,
            "class": "Class 8",
            "description": "Machinery & Equipment",
        },
    }

    # Capital asset threshold: receipts at or above this amount are
    # candidates for depreciation
    CAPITAL_ASSET_THRESHOLD = 500.00

    # Meals & Entertainment keywords, matched as lower-case substrings of
    # the receipt's category/description/vendor text
    MEALS_ENTERTAINMENT_KEYWORDS = [
        "restaurant",
        "cafe",
        "coffee",
        "dining",
        "food",
        "meal",
        "catering",
        "entertainment",
        "bar",
        "pub",
        "bistro",
        "eatery",
    ]

    # Capital asset keywords per asset class (keys match CCA_RATES)
    CAPITAL_ASSET_KEYWORDS = {
        "vehicles": ["vehicle", "car", "truck", "van", "automobile", "suv"],
        "computer_equipment": [
            "computer",
            "laptop",
            "desktop",
            "server",
            "tablet",
            "monitor",
            "printer",
            "scanner",
        ],
        "furniture": [
            "furniture",
            "desk",
            "chair",
            "table",
            "cabinet",
            "bookshelf",
            "sofa",
        ],
        "buildings": ["building", "property", "real estate", "office space"],
        "machinery": ["machinery", "equipment", "tool", "industrial"],
    }

    def calculate_tax_analysis(
        self, receipt: Receipt, transaction: Transaction, user_location: str = "ON"
    ) -> Dict[str, Any]:
        """
        Calculate comprehensive tax analysis for a receipt-transaction match

        Args:
            receipt: Receipt being analysed.
            transaction: Transaction the receipt was matched to.
            user_location: Province code used as the fallback tax location
                and recorded on depreciation results.

        Returns:
            Dict containing:
            - sales_tax: Sales tax calculation and validation
            - foreign_exchange: FX analysis and discrepancies
            - depreciation: Capital asset depreciation details
            - meals_entertainment: M&E deduction calculations
            - confidence_adjustment: Confidence boost/reduction
            - final_tax_amount: Copy of sales_tax["calculated_tax"]
        """
        analysis = {}

        # 1. Sales Tax Rule
        analysis["sales_tax"] = self._calculate_sales_tax(
            receipt, transaction, user_location
        )

        # 2. Foreign Exchange Rule
        analysis["foreign_exchange"] = self._calculate_foreign_exchange(
            receipt, transaction
        )

        # 3. Depreciation Rule
        analysis["depreciation"] = self._calculate_depreciation(receipt, user_location)

        # 4. Meals & Entertainment Rule
        analysis["meals_entertainment"] = self._calculate_meals_entertainment(receipt)

        # Confidence adjustments are derived from the sections computed above
        analysis["confidence_adjustment"] = self._calculate_confidence_adjustment(
            analysis
        )

        # Calculate final tax amount
        analysis["final_tax_amount"] = analysis["sales_tax"]["calculated_tax"]

        return analysis

    def _calculate_sales_tax(
        self, receipt: Receipt, transaction: Transaction, user_location: str
    ) -> Dict[str, Any]:
        """
        Rule 1: Sales Tax Calculation
        - Priority: shipping address > billing address > user location
        - Different country: no Canadian tax
        - Missing location: default to user location

        The expected tax is the province rate applied to the pre-tax amount
        (``receipt.amount - receipt.tax``). A discrepancy above 5% of the
        stated tax sets ``requires_review``; a receipt stating no tax while
        tax is expected counts as a 100% discrepancy.
        """
        # Determine the applicable location for tax
        receipt_location, location_source = self._determine_receipt_location(
            receipt, user_location
        )

        # A missing tax value is treated as "no tax stated"
        stated_tax = receipt.tax or 0.0

        # Check if international transaction
        is_international = self._is_international_transaction(
            receipt_location, user_location
        )

        if is_international:
            return {
                "applicable_province": None,
                "applicable_rate": 0.0,
                "tax_name": "N/A",
                "calculated_tax": 0.0,
                "stated_tax": stated_tax,
                "discrepancy": abs(stated_tax - 0.0),
                "reason": f"International transaction - no Canadian tax applied. Receipt location: {receipt_location}",
                "requires_review": True,
                "location_source": location_source,
                "is_international": True,
            }

        # Get tax rate for the applicable province
        tax_info = self.PROVINCIAL_TAX_RATES.get(
            receipt_location, self.PROVINCIAL_TAX_RATES.get(user_location)
        )

        # Tax should be calculated on the pre-tax amount
        pre_tax_amount = receipt.amount - stated_tax
        calculated_tax = round(pre_tax_amount * tax_info["rate"], 2)

        # Calculate discrepancy
        discrepancy = abs(stated_tax - calculated_tax)
        if stated_tax > 0:
            discrepancy_percentage = discrepancy / stated_tax * 100
        elif stated_tax == 0 and discrepancy > 0:
            # No tax stated but tax expected: a ratio is undefined, so
            # report a full mismatch rather than a perfect 0% agreement
            # (which would wrongly boost confidence).
            discrepancy_percentage = 100.0
        else:
            discrepancy_percentage = 0

        # Determine if review is needed (>5% discrepancy)
        requires_review = discrepancy_percentage > 5.0

        return {
            "applicable_province": receipt_location,
            "applicable_rate": tax_info["rate"],
            "tax_name": tax_info["name"],
            "calculated_tax": calculated_tax,
            "stated_tax": stated_tax,
            "discrepancy": discrepancy,
            "discrepancy_percentage": round(discrepancy_percentage, 2),
            "reason": f"Tax calculated for {receipt_location} ({tax_info['name']}) - {location_source}",
            "requires_review": requires_review,
            "location_source": location_source,
            "is_international": False,
        }

    def _calculate_foreign_exchange(
        self, receipt: Receipt, transaction: Transaction
    ) -> Dict[str, Any]:
        """
        Rule 2: Foreign Exchange Handling
        - Flag currency mismatches
        - Don't auto-fetch rates
        - Manual review required

        When the transaction carries a positive ``fx_rate``, the expected
        converted amount and its distance from the transaction amount are
        reported as well; otherwise both are ``None``.
        """
        currency_mismatch = receipt.currency != transaction.currency

        if not currency_mismatch:
            return {
                "currency_mismatch": False,
                "receipt_currency": receipt.currency,
                "transaction_currency": transaction.currency,
                "requires_manual_review": False,
                "reason": "Currencies match - no FX adjustment needed",
            }

        # Raw difference between the two amounts (no conversion applied)
        discrepancy = abs(receipt.amount - transaction.amount)

        # Check if transaction has FX rate
        has_fx_rate = transaction.fx_rate is not None and transaction.fx_rate > 0

        if has_fx_rate:
            expected_amount = round(receipt.amount * transaction.fx_rate, 2)
            calculated_discrepancy = abs(transaction.amount - expected_amount)
        else:
            expected_amount = None
            calculated_discrepancy = None

        return {
            "currency_mismatch": True,
            "receipt_currency": receipt.currency,
            "transaction_currency": transaction.currency,
            "receipt_amount": receipt.amount,
            "transaction_amount": transaction.amount,
            "discrepancy": discrepancy,
            "fx_rate": transaction.fx_rate,
            "expected_amount": expected_amount,
            "calculated_discrepancy": calculated_discrepancy,
            "requires_manual_review": True,
            "reason": f"Currency mismatch detected: {receipt.currency} → {transaction.currency}. Manual review required.",
        }

    def _calculate_depreciation(
        self, receipt: Receipt, user_location: str
    ) -> Dict[str, Any]:
        """
        Rule 3: Depreciation Calculation
        - Always based on USER location (not receipt location)
        - Threshold: $500+
        - Two methods: Straight-Line (accounting) and CCA (tax)

        A receipt is only reported as a capital asset when it meets the
        threshold AND an asset class can be identified from its text.
        """
        # Check if this is a capital asset
        is_capital_asset = receipt.amount >= self.CAPITAL_ASSET_THRESHOLD
        asset_class = None
        cca_info = None

        if is_capital_asset:
            # Identify asset class from category and description
            asset_class = self._identify_asset_class(receipt)
            if asset_class:
                cca_info = self.CCA_RATES.get(asset_class)

        if not is_capital_asset or not asset_class:
            return {
                "is_capital_asset": False,
                "reason": f"Not a capital asset (Amount: ${receipt.amount:.2f}, Threshold: ${self.CAPITAL_ASSET_THRESHOLD:.2f})",
            }

        # Calculate straight-line depreciation (accounting)
        # Default: 5-year useful life, 10% residual value
        useful_life_years = 5
        residual_percentage = 0.10
        residual_value = receipt.amount * residual_percentage
        annual_straight_line = (receipt.amount - residual_value) / useful_life_years

        # Calculate CCA depreciation (tax - declining balance): year 2 is
        # computed on the balance remaining after year 1
        cca_rate = cca_info["rate"]
        year1_cca = receipt.amount * cca_rate
        year2_cca = (receipt.amount - year1_cca) * cca_rate

        return {
            "is_capital_asset": True,
            "asset_class": asset_class,
            "cca_class": cca_info["class"],
            "cca_description": cca_info["description"],
            "asset_cost": receipt.amount,
            "user_location": user_location,
            "straight_line_depreciation": {
                "method": "Straight-Line (Accounting)",
                "useful_life_years": useful_life_years,
                "residual_value": round(residual_value, 2),
                "annual_depreciation": round(annual_straight_line, 2),
            },
            "cca_depreciation": {
                "method": "CCA Declining Balance (Tax)",
                "cca_rate": cca_rate,
                "year_1_depreciation": round(year1_cca, 2),
                "year_2_depreciation": round(year2_cca, 2),
            },
            "reason": f"Capital asset identified: {cca_info['description']} - Depreciation calculated based on user location ({user_location})",
        }

    def _calculate_meals_entertainment(self, receipt: Receipt) -> Dict[str, Any]:
        """
        Rule 4: Meals & Entertainment Deductions
        - Tax: 50% of meal cost + 100% of sales tax
        - Accounting: 100% of meal cost + 100% of sales tax
        """
        # Check if this is meals & entertainment
        is_meals_entertainment = self._is_meals_entertainment(receipt)

        if not is_meals_entertainment:
            return {
                "is_meals_entertainment": False,
                "reason": "Not classified as meals & entertainment",
            }

        # Calculate pre-tax meal amount; a missing tax value counts as 0
        sales_tax = receipt.tax or 0.0
        meal_amount = receipt.amount - sales_tax

        # Tax deduction: 50% of meal + 100% of tax
        tax_deduction = (meal_amount * 0.50) + sales_tax

        # Accounting deduction: 100% of meal + 100% of tax
        accounting_deduction = meal_amount + sales_tax

        return {
            "is_meals_entertainment": True,
            "meal_amount": round(meal_amount, 2),
            "sales_tax": round(sales_tax, 2),
            "total_receipt": round(receipt.amount, 2),
            "tax_deduction_amount": round(tax_deduction, 2),
            "tax_deduction_percentage": 50.0,
            "accounting_deduction_amount": round(accounting_deduction, 2),
            "accounting_deduction_percentage": 100.0,
            "reason": "Meals & Entertainment: 50% deductible for tax purposes, 100% for accounting",
            "breakdown": {
                "meal_cost": round(meal_amount, 2),
                "tax_50_percent": round(meal_amount * 0.50, 2),
                "full_sales_tax": round(sales_tax, 2),
            },
        }

    def _calculate_confidence_adjustment(
        self, analysis: Dict[str, Any]
    ) -> Dict[str, float]:
        """
        Calculate confidence boost/reduction based on tax analysis

        Returns a dict with ``boost`` and ``reduce`` totals, each rounded to
        two decimals.
        """
        boost = 0.0
        reduce = 0.0

        # Sales tax analysis
        sales_tax = analysis.get("sales_tax", {})
        if sales_tax.get("requires_review"):
            reduce += 0.05
        else:
            # Small discrepancy is good
            discrepancy_pct = sales_tax.get("discrepancy_percentage", 0)
            if discrepancy_pct < 2.0:
                boost += 0.05

        # Foreign exchange
        fx = analysis.get("foreign_exchange", {})
        if fx.get("currency_mismatch"):
            reduce += 0.10  # FX always requires review

        # Depreciation - capital assets need review
        depreciation = analysis.get("depreciation", {})
        if depreciation.get("is_capital_asset"):
            reduce += 0.05

        return {"boost": round(boost, 2), "reduce": round(reduce, 2)}

    def _determine_receipt_location(
        self, receipt: Receipt, user_location: str
    ) -> Tuple[str, str]:
        """
        Determine the applicable location for tax calculation
        Priority: shipping address > billing address > user location
        Returns: (province_code, source_description)
        """
        # Check shipping address first
        if receipt.shipping_address:
            province = self._extract_province_from_address(receipt.shipping_address)
            if province:
                return province, "shipping address"

        # Check billing address
        if receipt.billing_address:
            province = self._extract_province_from_address(receipt.billing_address)
            if province:
                return province, "billing address"

        # Default to user location
        return user_location, "user location (default)"

    def _extract_province_from_address(self, address: str) -> Optional[str]:
        """
        Extract Canadian province code from address string

        Province codes and full province names are matched as whole words
        only, so a code embedded in another word (the "ON" in "EDMONTON" or
        "MONTREAL") is not mistaken for the province. Codes are checked
        before full names. Returns ``None`` when nothing matches.
        """
        if not address:
            return None

        # Replace punctuation with spaces and split, giving upper-case
        # whole-word tokens ("Edmonton, AB" -> ["EDMONTON", "AB"]).
        tokens = "".join(
            ch if ch.isalnum() else " " for ch in address.upper()
        ).split()
        token_set = set(tokens)

        # Check for province codes as standalone tokens
        for province_code in self.PROVINCIAL_TAX_RATES:
            if province_code in token_set:
                return province_code

        # Check for full province names; padding with spaces makes
        # multi-word names match only on word boundaries.
        padded = f" {' '.join(tokens)} "
        for full_name, code in self.PROVINCE_NAMES.items():
            if f" {full_name} " in padded:
                return code

        return None

    def _is_international_transaction(
        self, receipt_location: str, user_location: str
    ) -> bool:
        """
        Check if this is an international transaction
        (receipt from outside Canada when user is in Canada, or vice versa)

        Only ``receipt_location`` is inspected: anything that is not a key
        of PROVINCIAL_TAX_RATES counts as international.
        """
        # For now, assume user_location is always Canadian
        # In future, add support for other countries
        return receipt_location not in self.PROVINCIAL_TAX_RATES

    def _identify_asset_class(self, receipt: Receipt) -> Optional[str]:
        """
        Identify the asset class from receipt category and description

        Returns the first CAPITAL_ASSET_KEYWORDS class with a keyword found
        in the lower-cased category/description/vendor text, else ``None``.
        """
        # NOTE(review): keywords are matched as substrings, so short ones
        # such as "car" or "van" also hit inside longer words - confirm
        # whether whole-word matching is wanted here.
        search_text = (
            f"{receipt.category} {receipt.description} {receipt.vendor}".lower()
        )

        for asset_class, keywords in self.CAPITAL_ASSET_KEYWORDS.items():
            if any(keyword in search_text for keyword in keywords):
                return asset_class

        return None

    def _is_meals_entertainment(self, receipt: Receipt) -> bool:
        """
        Check if receipt is for meals & entertainment

        An explicit truthy ``is_meals_entertainment`` attribute on the
        receipt wins; otherwise the category/description/vendor text is
        searched for MEALS_ENTERTAINMENT_KEYWORDS.
        """
        # Check explicit flag first (the attribute is optional)
        if getattr(receipt, "is_meals_entertainment", False):
            return True

        # NOTE(review): substring matching - "bar"/"pub" also match inside
        # longer words; confirm whether that is intended.
        search_text = (
            f"{receipt.category} {receipt.description} {receipt.vendor}".lower()
        )

        return any(
            keyword in search_text for keyword in self.MEALS_ENTERTAINMENT_KEYWORDS
        )

    def format_analysis_summary(self, analysis: Dict[str, Any]) -> str:
        """
        Format the tax analysis into a human-readable summary

        Expects a dict shaped like the result of ``calculate_tax_analysis``
        and returns the report as a single newline-joined string.
        """
        lines = ["=== Tax Analysis Summary ===", ""]

        # Sales Tax
        st = analysis.get("sales_tax", {})
        lines.append("1. SALES TAX:")
        if st.get("is_international"):
            lines.append(f" - {st['reason']}")
            lines.append(" - ⚠️ Review Required: International Transaction")
        else:
            lines.append(f" - Province: {st.get('applicable_province', 'N/A')}")
            lines.append(
                f" - Tax Rate: {st.get('applicable_rate', 0) * 100:.2f}% ({st.get('tax_name', 'N/A')})"
            )
            lines.append(f" - Calculated Tax: ${st.get('calculated_tax', 0):.2f}")
            lines.append(f" - Stated Tax: ${st.get('stated_tax', 0):.2f}")
            lines.append(
                f" - Discrepancy: ${st.get('discrepancy', 0):.2f} ({st.get('discrepancy_percentage', 0):.1f}%)"
            )
            if st.get("requires_review"):
                lines.append(" - ⚠️ Review Required: Tax discrepancy > 5%")
        lines.append("")

        # Foreign Exchange
        fx = analysis.get("foreign_exchange", {})
        lines.append("2. FOREIGN EXCHANGE:")
        if fx.get("currency_mismatch"):
            lines.append(
                f" - Currency Mismatch: {fx['receipt_currency']} → {fx['transaction_currency']}"
            )
            lines.append(f" - Receipt Amount: ${fx['receipt_amount']:.2f}")
            lines.append(f" - Transaction Amount: ${fx['transaction_amount']:.2f}")
            lines.append(f" - Discrepancy: ${fx['discrepancy']:.2f}")
            lines.append(" - ⚠️ Manual Review Required")
        else:
            lines.append(" - No currency mismatch")
        lines.append("")

        # Depreciation
        dep = analysis.get("depreciation", {})
        lines.append("3. DEPRECIATION:")
        if dep.get("is_capital_asset"):
            lines.append(f" - Capital Asset: Yes ({dep['cca_description']})")
            lines.append(f" - Asset Cost: ${dep['asset_cost']:.2f}")
            lines.append(
                f" - CCA Class: {dep['cca_class']} ({dep['cca_depreciation']['cca_rate'] * 100:.0f}%)"
            )
            lines.append(
                f" - Year 1 CCA: ${dep['cca_depreciation']['year_1_depreciation']:.2f}"
            )
            lines.append(
                f" - Annual Straight-Line: ${dep['straight_line_depreciation']['annual_depreciation']:.2f}"
            )
        else:
            lines.append(" - Not a capital asset")
        lines.append("")

        # Meals & Entertainment
        me = analysis.get("meals_entertainment", {})
        lines.append("4. MEALS & ENTERTAINMENT:")
        if me.get("is_meals_entertainment"):
            lines.append(" - Type: Meals & Entertainment Expense")
            lines.append(f" - Meal Amount: ${me['meal_amount']:.2f}")
            lines.append(f" - Sales Tax: ${me['sales_tax']:.2f}")
            lines.append(f" - Tax Deduction (50%): ${me['tax_deduction_amount']:.2f}")
            lines.append(
                f" - Accounting Deduction (100%): ${me['accounting_deduction_amount']:.2f}"
            )
        else:
            lines.append(" - Not a meals & entertainment expense")
        lines.append("")

        # Confidence Adjustment
        conf = analysis.get("confidence_adjustment", {})
        lines.append("CONFIDENCE ADJUSTMENT:")
        lines.append(f" - Boost: +{conf.get('boost', 0):.2f}")
        lines.append(f" - Reduce: -{conf.get('reduce', 0):.2f}")

        return "\n".join(lines)
|
||||
@@ -0,0 +1,312 @@
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from schemas import Match, Receipt, Transaction
|
||||
from services.ai_matcher import AIMatcher
|
||||
from services.ai_rules import AIRulesEngine
|
||||
from services.ai_rules_matcher import AIRulesMatcher
|
||||
from services.feedback_logger import FeedbackLogger
|
||||
from services.llm_tax_analyzer import LLMTaxAnalyzer
|
||||
from services.manual_tax_calculator import ManualTaxCalculator
|
||||
|
||||
|
||||
class MatchingEngine:
|
||||
def __init__(self, use_manual_tax_calculator: bool = False):
    """Create the engine's collaborators and remember the tax-calculator flag.

    Args:
        use_manual_tax_calculator: Stored on the instance as
            ``self.use_manual_tax_calculator``.
    """
    # Configuration flag; storing it does not depend on any collaborator.
    self.use_manual_tax_calculator = use_manual_tax_calculator

    # Collaborators are instantiated in the same order as before, in case
    # their constructors have side effects.
    self.ai_matcher = AIMatcher()
    self.rules_engine = AIRulesEngine()
    self.feedback_logger = FeedbackLogger()
    self.llm_tax_analyzer = LLMTaxAnalyzer()
    self.manual_tax_calculator = ManualTaxCalculator()
    self.ai_rules_matcher = AIRulesMatcher()
|
||||
|
||||
def process_matching(
    self,
    receipts: List[Receipt],
    transactions: List[Transaction],
    user_location: str = "ON",
    ai_rules: Optional[List[Dict]] = None,
) -> List[Match]:
    """Match receipts to transactions, then run the AI rules pass.

    Args:
        receipts: Receipts to match.
        transactions: Candidate transactions.
        user_location: Accepted for interface compatibility; not read by
            the currently active code path.
        ai_rules: Optional custom rules. A missing or empty list is passed
            to the rules matcher as ``None``.

    Returns:
        Whatever ``ai_rules_matcher.apply_rules_to_matches`` returns for the
        AI-produced matches.
    """
    # Step 1: candidate matches from the AI matcher
    candidate_matches = self.ai_matcher.match_receipts_to_transactions(
        receipts, transactions
    )

    # NOTE: the traditional-rules pass (rules_engine.apply_rules) and the
    # optional manual tax analysis (_apply_manual_tax_analysis) that used to
    # run between these two steps are currently disabled.

    # Step 2: post-matching evaluation. The rules matcher is always invoked,
    # so rules that do not depend on custom input still run when none are
    # supplied; it adds flag_for_review / auto_approve fields.
    rules_argument = ai_rules if ai_rules else None
    return self.ai_rules_matcher.apply_rules_to_matches(
        candidate_matches, rules_argument
    )
|
||||
|
||||
def _enhance_match_with_rules(
|
||||
self, match: Match, user_location: str = "ON"
|
||||
) -> Match:
|
||||
"""
|
||||
Enhanced version using LLM to intelligently apply tax rules:
|
||||
1. Sales tax based on receipt location (shipping/billing address priority)
|
||||
2. Foreign exchange rules for currency mismatches
|
||||
3. Depreciation rules for capital assets (based on user location)
|
||||
4. Meals & Entertainment tax deduction rules (50% for tax, 100% for accounting)
|
||||
"""
|
||||
|
||||
# First, apply traditional rule-based checks for basic matching quality
|
||||
rule_results = self.rules_engine.apply_rules(match.receipt, match.transaction)
|
||||
|
||||
# Apply confidence boost from traditional rules
|
||||
if rule_results["confidence_boost"] > 0:
|
||||
match.confidence_score = min(
|
||||
1.0, match.confidence_score + rule_results["confidence_boost"]
|
||||
)
|
||||
|
||||
# Auto-approve if rules say so
|
||||
if rule_results["auto_approve"]:
|
||||
match.confidence_score = 1.0
|
||||
match.match_reason += " (Auto-approved by rules)"
|
||||
|
||||
# Now apply LLM-based tax analysis
|
||||
try:
|
||||
llm_tax_analysis = self.llm_tax_analyzer.analyze_and_apply_tax_rules(
|
||||
match.receipt, match.transaction, user_location
|
||||
)
|
||||
|
||||
# Store the complete tax analysis
|
||||
match.tax_analysis = llm_tax_analysis
|
||||
|
||||
# Apply confidence adjustments based on tax analysis
|
||||
confidence_adj = llm_tax_analysis.get("confidence_adjustment", {})
|
||||
|
||||
# Boost confidence if tax rules validate the match
|
||||
boost = confidence_adj.get("boost", 0.0)
|
||||
if boost > 0:
|
||||
match.confidence_score = min(1.0, match.confidence_score + boost)
|
||||
match.match_reason += f" (Tax analysis confidence boost: +{boost:.2f})"
|
||||
|
||||
# Reduce confidence if tax issues detected
|
||||
reduce = confidence_adj.get("reduce", 0.0)
|
||||
if reduce > 0:
|
||||
match.confidence_score = max(0.0, match.confidence_score - reduce)
|
||||
match.match_reason += f" (Tax issues detected: -{reduce:.2f})"
|
||||
|
||||
# Add flags for manual review if needed
|
||||
review_flags = []
|
||||
|
||||
# Check sales tax issues
|
||||
sales_tax = llm_tax_analysis.get("sales_tax", {})
|
||||
if sales_tax.get("requires_review", False):
|
||||
review_flags.append("Sales Tax Review Required")
|
||||
|
||||
# Check FX issues
|
||||
fx_analysis = llm_tax_analysis.get("foreign_exchange", {})
|
||||
if fx_analysis.get("requires_manual_review", False):
|
||||
review_flags.append(
|
||||
f"FX Review Required (Discrepancy: ${fx_analysis.get('discrepancy', 0):.2f})"
|
||||
)
|
||||
|
||||
# Check depreciation
|
||||
depreciation = llm_tax_analysis.get("depreciation", {})
|
||||
if depreciation.get("is_capital_asset", False):
|
||||
review_flags.append(
|
||||
f"Capital Asset - Depreciation Applicable ({depreciation.get('asset_class', 'Unknown')})"
|
||||
)
|
||||
|
||||
# Check meals & entertainment
|
||||
meals_ent = llm_tax_analysis.get("meals_entertainment", {})
|
||||
if meals_ent.get("is_meals_entertainment", False):
|
||||
tax_deduction = meals_ent.get("tax_deduction_amount", 0)
|
||||
accounting_deduction = meals_ent.get("accounting_deduction_amount", 0)
|
||||
review_flags.append(
|
||||
f"M&E Expense - Tax Deduction: ${tax_deduction:.2f} (50%), Accounting: ${accounting_deduction:.2f} (100%)"
|
||||
)
|
||||
|
||||
# Add review flags to match reason
|
||||
if review_flags:
|
||||
match.match_reason += " | REVIEW: " + "; ".join(review_flags)
|
||||
|
||||
except Exception as e:
|
||||
# If LLM analysis fails, log it and continue with traditional rules
|
||||
import logging
|
||||
|
||||
logging.error(f"LLM tax analysis failed: {str(e)}")
|
||||
match.match_reason += " (Note: Advanced tax analysis unavailable)"
|
||||
|
||||
# Fall back to traditional tax rules if available
|
||||
if rule_results.get("tax_analysis"):
|
||||
match.tax_analysis = rule_results["tax_analysis"]
|
||||
|
||||
return match
|
||||
|
||||
def _apply_manual_tax_analysis(
    self, matches: List[Match], user_location: str = "ON"
) -> List[Match]:
    """
    Apply deterministic rule-based tax analysis to all matches.

    No LLM calls - pure business logic for consistent results.

    For every match the result of
    ``self.manual_tax_calculator.calculate_tax_analysis`` is stored on
    ``match.tax_analysis``, ``match.confidence_score`` is boosted/reduced
    (clamped to the 0.0-1.0 range) and review flags are appended to
    ``match.match_reason``.

    Args:
        matches: Matches to analyse; each one is mutated in place.
        user_location: Location code forwarded to the calculator.

    Returns:
        A new list containing every input match in its original order.
        Matches whose analysis raised get " (Tax analysis failed)" appended
        to their reason instead of being dropped.
    """
    # Local import kept from the original implementation.
    import logging

    logger = logging.getLogger(__name__)
    logger.info(
        "Applying manual tax analysis to %d matches using rule-based calculator",
        len(matches),
    )

    enhanced_matches = []

    for match in matches:
        try:
            # Get comprehensive tax analysis from manual calculator
            tax_analysis = self.manual_tax_calculator.calculate_tax_analysis(
                match.receipt, match.transaction, user_location
            )

            # Store the complete tax analysis
            match.tax_analysis = tax_analysis

            # A calculator returning None is treated as "no findings" rather
            # than as a failure of the whole match.
            analysis = tax_analysis or {}
            confidence_adj = analysis.get("confidence_adjustment") or {}

            # Boost confidence if tax rules validate the match
            boost = confidence_adj.get("boost") or 0.0
            if boost > 0:
                match.confidence_score = min(1.0, match.confidence_score + boost)
                match.match_reason += f" (Tax validated: +{boost:.2f})"

            # Reduce confidence if tax issues detected
            reduce = confidence_adj.get("reduce") or 0.0
            if reduce > 0:
                match.confidence_score = max(0.0, match.confidence_score - reduce)
                match.match_reason += f" (Tax issues: -{reduce:.2f})"

            # Add review flags to match reason
            review_flags = self._manual_review_flags(analysis)
            if review_flags:
                match.match_reason += " | " + "; ".join(review_flags)

        except Exception as e:
            # Best effort: one bad match must not abort the whole batch.
            logger.error(
                "Manual tax analysis failed for match: %s", e, exc_info=True
            )
            match.match_reason += " (Tax analysis failed)"

        enhanced_matches.append(match)

    logger.info(
        "Manual tax analysis completed for %d matches", len(enhanced_matches)
    )
    return enhanced_matches

def _manual_review_flags(self, tax_analysis: Dict[str, Any]) -> List[str]:
    """Build the human-readable review flags for one tax analysis result.

    Every lookup uses ``.get`` with a fallback so that a partially filled
    section (for example an FX section without ``discrepancy``) still yields
    a flag instead of raising and failing the whole match.
    """
    flags = []

    # Sales tax issues
    sales_tax = tax_analysis.get("sales_tax") or {}
    if sales_tax.get("requires_review"):
        if sales_tax.get("is_international"):
            flags.append("International Transaction - FX Review")
        else:
            discrepancy_pct = sales_tax.get("discrepancy_percentage") or 0
            flags.append(f"Sales Tax Discrepancy: {discrepancy_pct:.1f}%")

    # FX issues
    fx = tax_analysis.get("foreign_exchange") or {}
    if fx.get("currency_mismatch"):
        receipt_currency = fx.get("receipt_currency", "Unknown")
        transaction_currency = fx.get("transaction_currency", "Unknown")
        fx_discrepancy = fx.get("discrepancy") or 0
        flags.append(
            f"FX: {receipt_currency} → {transaction_currency} (${fx_discrepancy:.2f})"
        )

    # Capital asset depreciation
    depreciation = tax_analysis.get("depreciation") or {}
    if depreciation.get("is_capital_asset"):
        cca_class = depreciation.get("cca_class", "Unknown")
        cca_details = depreciation.get("cca_depreciation") or {}
        year1_cca = cca_details.get("year_1_depreciation") or 0
        flags.append(f"Capital Asset ({cca_class}) - Year 1 CCA: ${year1_cca:.2f}")

    # Meals & entertainment
    meals_ent = tax_analysis.get("meals_entertainment") or {}
    if meals_ent.get("is_meals_entertainment"):
        tax_deduction = meals_ent.get("tax_deduction_amount") or 0
        accounting_deduction = meals_ent.get("accounting_deduction_amount") or 0
        flags.append(
            f"M&E: Tax ${tax_deduction:.2f} (50%), Accounting ${accounting_deduction:.2f} (100%)"
        )

    return flags
|
||||
|
||||
def approve_match(self, match: Match, user_id: str):
    """Record the user's approval of ``match`` through the feedback logger."""
    override_entry = {
        "transaction_id": match.transaction.id,
        "original_match": f"AI Score: {match.confidence_score}",
        "correction": "Approved",
        "reason": "User approved match",
        "user_id": user_id,
    }
    # Log the approval
    self.feedback_logger.log_override(**override_entry)
|
||||
|
||||
def reject_match(self, match: Match, reason: str, user_id: str):
    """Record the user's rejection of ``match``, with the given reason, through the feedback logger."""
    override_entry = {
        "transaction_id": match.transaction.id,
        "original_match": f"AI Score: {match.confidence_score}",
        "correction": "Rejected",
        "reason": reason,
        "user_id": user_id,
    }
    # Log the rejection
    self.feedback_logger.log_override(**override_entry)
|
||||
|
||||
def get_matching_stats(self, matches: List[Match]) -> Dict[str, Any]:
    """Summarise confidence scores across ``matches``.

    Returns a dict with ``total``, ``high_confidence`` (score >= 0.8),
    ``low_confidence`` (score < 0.8) and ``avg_score`` (mean rounded to three
    decimals). An empty input yields all zeros.
    """
    if not matches:
        return dict(total=0, high_confidence=0, low_confidence=0, avg_score=0)

    scores = [candidate.confidence_score for candidate in matches]
    strong = sum(1 for score in scores if score >= 0.8)
    weak = sum(1 for score in scores if score < 0.8)
    mean_score = sum(scores) / len(matches)

    summary = {}
    summary["total"] = len(matches)
    summary["high_confidence"] = strong
    summary["low_confidence"] = weak
    summary["avg_score"] = round(mean_score, 3)
    return summary
|
||||
@@ -0,0 +1,96 @@
|
||||
rule = '''
|
||||
### Rule Scenarios
|
||||
Impact of Signup Fields on Tax Calculation and Receipt Matching
|
||||
Impact of Signup Fields (Country and Province/State) on Tax Calculation and Matching**
|
||||
|
||||
**Scenario 1:** User Location (Canada, Ontario) but Receipt from Another Location (e.g., Quebec)
|
||||
User's Location: Canada, Ontario (for tax and depreciation purposes).
|
||||
Receipt Location: The receipt comes from Quebec (the tax rules in Quebec are different from Ontario).
|
||||
What Happens:
|
||||
The sales tax rate should be applied based on the location of the receipt, not the user's profile location.
|
||||
**For example:**
|
||||
The user in Ontario will have 13% HST applied to their purchases.
|
||||
If the receipt is from Quebec, the QST (Quebec Sales Tax) of 9.975% applies instead.
|
||||
|
||||
**Scenario 2:** User Location (Canada, Ontario) and Receipt Location is Different Country (e.g., USA)
|
||||
User's Location: Canada, Ontario.
|
||||
Receipt Location: The receipt is from a business in the USA (e.g., New York).
|
||||
**What Happens:**
|
||||
Sales Tax should not be applied for international transactions (USA in this case) unless the user is importing or there is a customs duty involved.
|
||||
The system will not apply a Canadian sales tax to the receipt from the USA, but the foreign exchange (FX) rule will apply because there is a mismatch between currencies (USD vs. CAD).
|
||||
|
||||
**Scenario 3:** User Location (USA, New York) but Receipt from Another Location in the Same Country (e.g., California)
|
||||
User's Location: USA, New York (for tax purposes).
|
||||
Receipt Location: The receipt is from California (still in the USA, but the sales tax rate is different).
|
||||
**What Happens:**
|
||||
Sales tax should be applied based on the location of the receipt, not the user’s location, since the receipt was issued in California.
|
||||
California may have a different sales tax rate than New York.
|
||||
|
||||
**Scenario 4:** User Location (Canada, Ontario) and Receipt Location with No Address Information
|
||||
User's Location: Canada, Ontario.
|
||||
Receipt Location: The receipt contains no clear shipping or billing address.
|
||||
**What Happens:**
|
||||
If the receipt does not have a clear location, the system will default to the user’s location for sales tax and depreciation.
|
||||
Action:
|
||||
Sales Tax: Apply the sales tax rate based on the user's location (Ontario). For example, 13% HST will be applied.
|
||||
Depreciation: Apply the depreciation rules based on the user’s location (Ontario), even if the receipt doesn’t have address information.
|
||||
|
||||
**Summary of Actions in These Scenarios:**
|
||||
Sales Tax:If the receipt is from a different location (same country or foreign), use the location from the receipt for sales tax calculation.
|
||||
If the receipt is from a different country, don’t apply sales tax from the user's country but flag the FX discrepancy.
|
||||
If the location is missing, apply the user’s location sales tax by default.
|
||||
|
||||
**Depreciation:** Always apply depreciation rules based on the user’s location, regardless of where the receipt is from.
|
||||
**FX (Foreign Exchange):** If the receipt is in a different currency, flag the FX difference for manual review but don’t fetch exchange rates.
|
||||
|
||||
|
||||
|
||||
### 2. **Foreign Exchange (FX) Rule**
|
||||
**Purpose**: To handle discrepancies when transactions and receipts are in different currencies (e.g., USD vs. CAD).
|
||||
- **Action**: Identify the currency mismatch, but do not automatically fetch the exchange rate. Flag the FX difference for manual review, allowing the user to approve or adjust the balance.
|
||||
**Example**:
|
||||
1. A transaction in USD for $100, matched to a receipt in CAD for $125, results in an FX discrepancy of $25.
|
||||
2. The system flags the discrepancy for manual review by the user. The user can then approve the difference or adjust the amounts manually.
|
||||
|
||||
### 3. **Depreciation Rule**
|
||||
**Purpose**: To calculate the depreciation for assets based on the Straight-Line Method (for accounting) or CCA Depreciation (Declining Balance) for tax purposes.
|
||||
**Action**:
|
||||
- Apply Straight-Line Depreciation (for accounting) across the asset’s useful life.
|
||||
- Apply CCA Depreciation (for tax purposes) using a declining balance method.
|
||||
**Example**:
|
||||
1. Straight-Line Depreciation: An asset purchased for $10,000, with a 5-year useful life and a residual value of $1,000, will have an annual depreciation of:
|
||||
- (10,000 - 1,000)/5 = 1,800 per year for 5 years.
|
||||
2. CCA Depreciation: A truck purchased for $20,000, eligible for 30% CCA per year. The depreciation will be:
|
||||
- Year 1: 20,000 x 30% = $6,000
|
||||
- Year 2: (20,000 - 6,000) x 30% = $4,200
|
||||
- The depreciation will decline each year as the book value reduces.
|
||||
|
||||
### 4. **Meals & Entertainment Tax Deduction Rule**
|
||||
**Purpose**: To apply the correct tax deduction for Meals & Entertainment expenses.
|
||||
**Action**:
|
||||
- For Tax Purposes: Only 50% of the total receipt amount is deductible.
|
||||
- For Accounting Purposes: 100% of the total receipt amount is deductible.
|
||||
- Sales Tax: The full sales tax will be deducted for accounting purposes.
|
||||
**Example**:
|
||||
1. A $100 meal receipt for a business dinner:
|
||||
- **Tax Purposes**: Only $50 of the total amount is deductible.
|
||||
- **Accounting Purposes**: The full $100 is deductible.
|
||||
2. If the sales tax on the meal is $12, the entire $12 is included in the accounting deduction, but for tax purposes, the $50 deduction will reflect the adjusted amount after the 50% rule is applied.
|
||||
|
||||
### **When Location on Receipt is Different from User's Location**
|
||||
**1. Sales Tax**:
|
||||
- **Scenario 1**: If the **receipt's location** is different (e.g., receipt from Quebec for a user in Ontario), the **sales tax** is applied based on the **receipt's location** (Quebec sales tax).
|
||||
- **Scenario 2**: If the **receipt** is from a different **country** (e.g., USA), the **system flags** the **currency mismatch** but does not apply **Canadian sales tax**.
|
||||
|
||||
**2. Depreciation**:
|
||||
- Depreciation is always calculated based on the **user's location**, not the receipt's location.
|
||||
- **Depreciation Method** for **Canada (Ontario)**: **CCA method** will apply, regardless of where the receipt comes from.
|
||||
|
||||
**3. FX Handling**:
|
||||
- If the receipt is in a different **currency** (e.g., USD for a CAD-based user), the system will **flag FX differences** for manual review but won’t fetch exchange rates.
|
||||
|
||||
**4. General Process**:
|
||||
- When the **receipt location** is different from the **user's location**, ensure that the **tax and depreciation** are correctly applied based on the **receipt's data**.
|
||||
- For **foreign transactions**, ensure that **FX differences** are flagged for user review.
|
||||
- For **missing location information**, apply **user’s location** by default for tax and depreciation.
|
||||
'''
|
||||
@@ -1,7 +1,7 @@
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from models import Address, Asset, Receipt, Transaction
|
||||
from schemas import Address, Asset, Receipt, Transaction
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Read the API key from the environment only. A real key must never be used
# as the fallback value: it leaks the credential through version control and
# makes the validation below unreachable.
# NOTE(review): the key that used to be hard-coded here should be treated as
# compromised and rotated.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()

# Validate API key: fail fast at import time instead of at the first API call.
if not GROQ_API_KEY or GROQ_API_KEY == "your_api_key_here":
    raise ValueError("GROQ_API_KEY environment variable is not set or invalid. Please set it in your .env file.")
|
||||
|
||||
CONFIDENCE_THRESHOLD = 0.3
|
||||
DATE_TOLERANCE_DAYS = 7
|
||||
AMOUNT_TOLERANCE_PERCENT = 0.05
|
||||
-75
@@ -1,75 +0,0 @@
|
||||
from typing import Annotated
|
||||
|
||||
from fastapi import Depends
|
||||
from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import Session, sessionmaker
|
||||
|
||||
SQLALCHEMY_DATABASE_URL = "sqlite:///./sql_app.db"
|
||||
|
||||
engine = create_engine(
|
||||
SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
|
||||
)
|
||||
|
||||
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
||||
|
||||
|
||||
def get_db():
    """Yield a session from ``SessionLocal`` and close it once the caller is done.

    Written as a generator so the ``finally`` clause runs whether or not the
    consumer raised.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
db_dependency = Annotated[Session, Depends(get_db)]
|
||||
Base = declarative_base()
|
||||
|
||||
|
||||
def create_db_tables():
    """Create every table registered on ``Base.metadata`` using ``engine``."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
|
||||
|
||||
|
||||
def clear_all_data():
    """Clear all data from the database (useful for testing)"""
    session = SessionLocal()
    try:
        # Same order as before: transactions first, then receipts.
        for model in (Transaction, Receipt):
            session.query(model).delete()
        session.commit()
    finally:
        session.close()
|
||||
|
||||
|
||||
# Transactions table
|
||||
class Transaction(Base):
    """ORM model mapped to the ``transactions`` table."""

    __tablename__ = "transactions"

    # Keys: surrogate primary key plus a unique, indexed external identifier.
    id = Column(Integer, primary_key=True, index=True)
    transaction_id = Column(String, unique=True, index=True)

    # Required fields.
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)

    # Optional fields.
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)
    categorisation_id = Column(String, nullable=True)
    user_id = Column(String, nullable=True)
|
||||
|
||||
|
||||
# Receipts table
|
||||
class Receipt(Base):
    """ORM model mapped to the ``receipts`` table."""

    __tablename__ = "receipts"

    # Keys: surrogate primary key plus unique, indexed receipt and file ids.
    id = Column(Integer, primary_key=True, index=True)
    receipt_id = Column(String, unique=True, index=True)
    file_id = Column(String, unique=True, index=True)

    # Required fields.
    amount = Column(Float, nullable=False)
    date = Column(DateTime, nullable=False)
    vendor = Column(String, nullable=False)

    # Optional fields.
    description = Column(String, nullable=True)
    category = Column(String, nullable=True)
    tax_amount = Column(Float, nullable=True)

    # Extraction outcome columns (note that extraction_success is a String column).
    confidence = Column(Float, nullable=True)
    extraction_success = Column(String, nullable=True)
    error_message = Column(String, nullable=True)
|
||||
@@ -1,498 +0,0 @@
|
||||
import groq
|
||||
import base64
|
||||
import io
|
||||
from PIL import Image
|
||||
import PyPDF2
|
||||
from typing import Dict, Any, List, Optional
|
||||
import config
|
||||
import os
|
||||
import aiofiles
|
||||
from datetime import datetime
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DocumentProcessor:
|
||||
def __init__(self):
    """Create the Groq client from ``config.GROQ_API_KEY`` and pick the model."""
    api_key = config.GROQ_API_KEY
    self.client = groq.Groq(api_key=api_key)
    # Vision model
    self.model = "meta-llama/llama-4-scout-17b-16e-instruct"
|
||||
|
||||
async def process_file(self, file_path: str, file_type: str) -> Dict[str, Any]:
    """Process uploaded file and extract receipt data.

    Args:
        file_path: Path of the stored upload.
        file_type: File extension; case, surrounding whitespace and a
            leading dot are ignored ("PNG", ".png" and " png " all work).

    Returns:
        The dict produced by the image or PDF pipeline, or
        ``{"error": <message>}`` when the type is unsupported or processing
        raised.
    """
    try:
        # Normalise so callers can pass either "jpg" or os.path.splitext's ".jpg".
        normalized_type = (file_type or "").strip().lower().lstrip(".")

        if normalized_type in ('jpg', 'jpeg', 'png', 'gif', 'bmp'):
            return await self._process_image(file_path)
        if normalized_type == 'pdf':
            return await self._process_pdf(file_path)
        raise ValueError(f"Unsupported file type: {file_type}")
    except Exception as e:
        # Errors are reported in-band; callers check for the "error" key.
        return {"error": str(e)}
|
||||
|
||||
async def _process_image(self, image_path: str) -> Dict[str, Any]:
    """Extract data from image using Groq vision.

    The image is base64-encoded, sent together with the extraction prompt to
    ``self.model`` and the reply is handed to ``_parse_extraction_result``.

    Returns:
        The parsed extraction dict, or
        ``{"error": "Image processing error: ..."}`` if anything raised.
    """
    # Local import: only needed to label the data URL.
    import mimetypes

    try:
        # Encode image to base64
        base64_image = self._encode_image(image_path)

        # Label the payload with its real media type; PNG/GIF/BMP uploads
        # were previously always sent as image/jpeg.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Create Groq vision prompt
        prompt = """
        Analyze this receipt image and extract the following information in JSON format:
        {
            "vendor": "Store/company name",
            "description": "Detailed description of items/services purchased",
            "total_amount": 0.00,
            "tax_amount": 0.00,
            "date": "YYYY-MM-DD",
            "category": "Food/Transport/Office/Other",
            "confidence": 0.95
        }

        Rules:
        - Extract vendor name as it appears on receipt
        - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
        - Total amount should be the final total including tax
        - Tax amount is separate tax line if available
        - Date should be the date on the receipt
        - Categorize based on vendor type (Starbucks=Food, Shell=Transport, etc.)
        - Confidence score 0-1 based on how clear the receipt is

        Return only valid JSON.
        """

        # Call Groq vision API with correct format.
        # NOTE(review): this is a synchronous client call inside an async
        # method, so it blocks the event loop while the request is in flight.
        response = self.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=500,
            temperature=0.1
        )

        # Parse response
        result_text = response.choices[0].message.content.strip()
        return self._parse_extraction_result(result_text)

    except Exception as e:
        return {"error": f"Image processing error: {str(e)}"}
|
||||
|
||||
def _encode_image(self, image_path: str) -> str:
    """Read the file at ``image_path`` and return its bytes as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
|
||||
|
||||
async def _process_pdf(self, pdf_path: str) -> Dict[str, Any]:
    """Extract receipt data from a PDF through its text layer.

    The PDF is not rendered to an image: its text is pulled out with
    ``_extract_text_from_pdf`` and analysed by ``_process_text_content``.

    Returns:
        The parsed extraction dict, or ``{"error": "PDF processing error: ..."}``
        when no text could be extracted or processing raised.
    """
    try:
        # For now, extract text from PDF and process as text
        text_content = self._extract_text_from_pdf(pdf_path)

        # _extract_text_from_pdf returns "" on failure; do not ask the model
        # to "extract" a receipt from an empty document.
        if not text_content or not text_content.strip():
            return {"error": "PDF processing error: no extractable text found in PDF"}

        return self._process_text_content(text_content)

    except Exception as e:
        return {"error": f"PDF processing error: {str(e)}"}
|
||||
|
||||
def _extract_text_from_pdf(self, pdf_path: str) -> str:
    """Extract text from PDF.

    Returns:
        The text of every page, each followed by a newline, or ``""`` when
        the file cannot be opened or parsed (the failure is logged).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding no string, so one such
            # page cannot discard the text of all the others.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
        return "".join(page_texts)
    except Exception as e:
        # Best effort by design, but leave a trace instead of failing silently.
        logging.getLogger(__name__).warning(
            "Could not extract text from PDF %s: %s", pdf_path, e
        )
        return ""
|
||||
|
||||
def _process_text_content(self, text_content: str) -> Dict[str, Any]:
    """Send receipt text to the Groq model and parse the JSON it returns (fallback for PDFs)."""
    try:
        prompt = f"""
        Analyze this receipt text and extract the following information in JSON format:

        Receipt Text:
        {text_content}

        Extract:
        {{
            "vendor": "Store/company name",
            "description": "Detailed description of items/services purchased",
            "total_amount": 0.00,
            "tax_amount": 0.00,
            "date": "YYYY-MM-DD",
            "category": "Food/Transport/Office/Other",
            "confidence": 0.95
        }}

        Rules:
        - Extract vendor name as it appears on receipt
        - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
        - Total amount should be the final total including tax
        - Tax amount is separate tax line if available
        - Date should be the date on the receipt
        - Categorize based on vendor type
        - Confidence score 0-1 based on clarity

        Return only valid JSON.
        """

        chat_messages = [{"role": "user", "content": prompt}]
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=chat_messages,
            max_tokens=500,
            temperature=0.1
        )

        answer = completion.choices[0].message.content.strip()
        return self._parse_extraction_result(answer)

    except Exception as exc:
        return {"error": f"Text processing error: {str(exc)}"}
|
||||
|
||||
def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse Groq response and extract JSON data.

    Strategy, from most to least trustworthy:
      1. parse the ``{...}`` span exactly as the model wrote it;
      2. on failure, repair trailing commas / unquoted keys and parse again;
      3. on failure, pull the individual fields out with regexes;
      4. if the reply holds no ``{...}`` at all, fall back to
         ``_extract_from_plain_text``.

    The repairs are applied only after a failed parse because they are
    text-based and can corrupt valid JSON (a string value containing
    ``", note: ..."`` looks like an unquoted key).

    Returns:
        A dict with ``vendor``, ``description``, ``total_amount``,
        ``tax_amount``, ``date``, ``category``, ``confidence`` and
        ``extraction_success``; or ``{"error": ..., "extraction_success": False}``
        when parsing raised unexpectedly.
    """
    import json
    import re

    log = logging.getLogger(__name__)

    def _as_float(value, default):
        """Coerce a model-supplied number (float, "12.50", "$1,234", None) to float."""
        if value is None or isinstance(value, bool):
            return default
        if isinstance(value, (int, float)):
            return float(value)
        try:
            return float(re.sub(r"[^0-9.\-]", "", str(value)))
        except ValueError:
            return default

    def _as_text(value, default=""):
        """Return the stripped text, or ``default`` for None/empty (never "None")."""
        if value is None:
            return default
        return str(value).strip() or default

    try:
        # Find JSON in response
        json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
        if not json_match:
            # Try to extract fields from plain text
            log.warning("No JSON found in response, attempting text extraction")
            return self._extract_from_plain_text(result_text)

        raw_json = json_match.group()
        try:
            data = json.loads(raw_json)
        except json.JSONDecodeError:
            # Clean up common JSON issues
            repaired = re.sub(r',\s*([}\]])', r'\1', raw_json)  # Remove trailing commas
            repaired = re.sub(
                r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', repaired
            )  # Quote unquoted keys
            try:
                data = json.loads(repaired)
            except json.JSONDecodeError as e:
                log.warning(f"Initial JSON parsing failed: {e}")

                # Try to extract individual fields using regex
                def _text_field(key):
                    found = re.search(rf'"{key}"\s*:\s*"([^"]*)"', repaired)
                    return found.group(1) if found else None

                def _number_field(key):
                    found = re.search(
                        rf'"{key}"\s*:\s*"?\$?(-?[0-9][0-9,]*\.?[0-9]*)', repaired
                    )
                    return found.group(1) if found else None

                data = {
                    "vendor": _text_field("vendor"),
                    "description": _text_field("description"),
                    "total_amount": _number_field("total_amount"),
                    "tax_amount": _number_field("tax_amount"),
                    "date": _text_field("date"),
                    "category": _text_field("category"),
                    "confidence": _number_field("confidence"),
                }

        # Validate and clean data
        return {
            "vendor": _as_text(data.get("vendor")),
            "description": _as_text(data.get("description")),
            "total_amount": _as_float(data.get("total_amount"), 0.0),
            "tax_amount": _as_float(data.get("tax_amount"), 0.0),
            "date": _as_text(data.get("date")),
            "category": _as_text(data.get("category"), "Other"),
            "confidence": _as_float(data.get("confidence"), 0.5),
            "extraction_success": True
        }

    except Exception as e:
        log.error(f"JSON parsing error: {str(e)}")
        return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}
|
||||
|
||||
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
    """Extract receipt data from plain text when JSON parsing fails.

    Heuristic, low-confidence extraction:
      * vendor: text after a ``vendor/store/merchant/company`` label on the
        same line, else the first capitalized run of words;
      * total: number after a ``total/amount/sum`` label, else the first
        ``$``-prefixed number, else the first number of any kind;
      * date: first ``YYYY-MM-DD``, ``M/D/Y`` or ``Mon D, YYYY`` occurrence.

    Returns:
        A result dict with ``confidence`` 0.3 on success, or one with
        ``extraction_success`` False and an ``error`` message if extraction
        raised.
    """
    try:
        import re

        # Extract vendor (look for common patterns). Each pattern carries its
        # own flags: the labelled form is case-insensitive, while the
        # "capitalized words" fallback must stay case-sensitive to mean
        # anything. Captures are limited to a single line.
        vendor_patterns = [
            (
                r'\b(?:vendor|store|merchant|company)\b\s*[:\-]?\s*'
                r'([A-Za-z0-9&.,][A-Za-z0-9 \t&.,]*)',
                re.IGNORECASE,
            ),
            (r'([A-Z][A-Za-z0-9 \t&.,]{3,30})', 0),  # Capitalized words
        ]

        vendor = ""
        for pattern, flags in vendor_patterns:
            match = re.search(pattern, text, flags)
            if match:
                vendor = match.group(1).strip()
                break

        # Extract amount (look for currency patterns), most specific first so
        # a labelled total beats whatever digits happen to come first
        # (dates, phone numbers, ...). \b keeps "Subtotal" from matching.
        amount_patterns = [
            r'\b(?:total|amount|sum)\b\s*[:\-]?\s*\$?\s*([0-9][0-9,]*\.?[0-9]*)',
            r'\$\s*([0-9][0-9,]*\.?[0-9]*)',
            r'([0-9][0-9,]*\.?[0-9]*)',
        ]

        total_amount = 0.0
        for pattern in amount_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                try:
                    total_amount = float(match.group(1).replace(',', ''))
                    break
                except ValueError:
                    continue

        # Extract date
        date_patterns = [
            r'(\d{4}-\d{2}-\d{2})',
            r'(\d{1,2}/\d{1,2}/\d{2,4})',
            r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}',
        ]

        date = ""
        for pattern in date_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                # group(0): the third pattern only groups the month name.
                date = match.group(0)
                break

        return {
            "vendor": vendor or "Unknown",
            # Same key set as _parse_extraction_result.
            "description": "",
            "total_amount": total_amount,
            "tax_amount": 0.0,
            "date": date or "",
            "category": "Other",
            "confidence": 0.3,  # Low confidence for text extraction
            "extraction_success": True
        }

    except Exception as e:
        logging.getLogger(__name__).error(f"Text extraction error: {str(e)}")
        return {
            "vendor": "Unknown",
            "total_amount": 0.0,
            "tax_amount": 0.0,
            "date": "",
            "category": "Other",
            "confidence": 0.1,
            "extraction_success": False,
            "error": f"Text extraction failed: {str(e)}"
        }
|
||||
|
||||
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
    """Save uploaded file to temporary storage.

    Args:
        file_content: Raw bytes of the upload.
        filename: Client-supplied file name; only its final path component
            is used.

    Returns:
        Path of the written file inside the ``uploads`` directory.

    Raises:
        Exception: If the directory cannot be created or the file cannot be
            written (the original error is chained as the cause).
    """
    try:
        # Create uploads directory if it doesn't exist
        upload_dir = "uploads"
        os.makedirs(upload_dir, exist_ok=True)

        # The name comes from the client: drop any directory part
        # ("../../x", "C:\\x") so the file cannot land outside upload_dir.
        base_name = os.path.basename((filename or "").replace("\\", "/"))
        base_name = base_name.replace(' ', '_') or "upload"

        # Generate unique filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_filename = f"{timestamp}_{base_name}"
        file_path = os.path.join(upload_dir, safe_filename)

        # Save file
        async with aiofiles.open(file_path, 'wb') as f:
            await f.write(file_content)

        return file_path

    except Exception as e:
        raise Exception(f"Failed to save file: {str(e)}") from e
|
||||
|
||||
async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
    """Extract multiple transactions from an image (bank statement, credit card statement, etc.)

    The image is base64-encoded, sent with the extraction prompt to
    ``self.model`` and the reply is parsed by
    ``_parse_transaction_extraction_result``.

    Returns:
        The parsed result dict, or a dict with ``extraction_success`` False,
        an ``error`` message and an empty ``transactions`` list if anything
        raised.
    """
    # Local import: only needed to label the data URL.
    import mimetypes

    try:
        # Encode image to base64
        base64_image = self._encode_image(image_path)

        # Label the payload with its real media type instead of always
        # claiming image/jpeg.
        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Create Groq vision prompt for transaction extraction
        prompt = """
        Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.

        Look for transaction lists, payment records, or any financial entries that show:
        - Date
        - Amount (positive or negative)
        - Vendor/Description/Payee name
        - Any additional notes or memo

        Return the transactions as a JSON array:
        {
            "extraction_success": true,
            "transactions": [
                {
                    "date": "YYYY-MM-DD",
                    "amount": 0.00,
                    "vendor": "Vendor name",
                    "memo": "Additional notes"
                },
                {
                    "date": "YYYY-MM-DD",
                    "amount": -0.00,
                    "vendor": "Another vendor",
                    "memo": "Payment or charge description"
                }
            ]
        }

        Rules:
        - Extract ALL visible transactions
        - Include both positive (credits) and negative (debits) amounts
        - Use the actual date format from the document
        - Vendor should be the merchant/payee name
        - Memo can include transaction type, reference numbers, etc.
        - If no transactions found, return empty array but set extraction_success to true

        Return only valid JSON.
        """

        # Call Groq vision API
        response = self.client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            model=self.model,
            max_tokens=2000,  # Higher token limit for multiple transactions
            temperature=0.1
        )

        # Parse response
        result_text = response.choices[0].message.content.strip()
        return self._parse_transaction_extraction_result(result_text)

    except Exception as e:
        return {
            "extraction_success": False,
            "error": f"Transaction extraction error: {str(e)}",
            "transactions": []
        }
|
||||
|
||||
def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:
    """Parse Groq response for transaction extraction.

    The span from the first ``{`` to the last ``}`` is parsed as JSON,
    first verbatim and then, if that fails, with trailing commas removed.
    Each transaction is normalised to ``date``/``amount``/``vendor``/``memo``;
    entries that are not objects or have no usable amount are skipped and
    the number skipped is logged.

    Returns:
        ``{"extraction_success", "transactions", "total_transactions"}`` on
        success, or ``{"extraction_success": False, "error", "transactions": []}``
        when no JSON object is found or it cannot be parsed.
    """
    import json
    import logging
    import re

    log = logging.getLogger(__name__)

    def _failure(message):
        return {
            "extraction_success": False,
            "error": message,
            "transactions": []
        }

    def _text(value):
        # JSON null must become "", not the literal string "None".
        return "" if value is None else str(value).strip()

    try:
        # Find the first '{' and last '}'
        start = result_text.find('{')
        end = result_text.rfind('}')
        if start == -1 or end == -1 or end <= start:
            return _failure("Could not find JSON object in AI response")
        json_str = result_text[start:end+1]

        try:
            data = json.loads(json_str)
        except json.JSONDecodeError:
            try:
                # Remove trailing commas before } or ] and retry; done only
                # after a failed parse because the edit is purely textual.
                data = json.loads(re.sub(r',\s*([}\]])', r'\1', json_str))
            except Exception as e:
                log.error(f"JSON parsing error: {str(e)}")
                log.error(f"Offending JSON string:\n{json_str}")
                return _failure(f"JSON parsing error: {str(e)}")

        # Validate and clean data
        transactions = data.get("transactions")
        if not isinstance(transactions, list):
            transactions = []

        cleaned_transactions = []
        skipped = 0
        for txn in transactions:
            try:
                cleaned_transactions.append({
                    "date": _text(txn.get("date", "")),
                    "amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
                    "vendor": _text(txn.get("vendor", "")),
                    "memo": _text(txn.get("memo", ""))
                })
            except (AttributeError, TypeError, ValueError):
                # Not an object, or the amount is not numeric.
                skipped += 1

        if skipped:
            log.warning("Skipped %d malformed transaction entries", skipped)

        return {
            "extraction_success": data.get("extraction_success", True),
            "transactions": cleaned_transactions,
            "total_transactions": len(cleaned_transactions)
        }
    except Exception as e:
        log.error(f"JSON parsing error (outer): {str(e)}")
        return _failure(f"JSON parsing error: {str(e)}")
|
||||
|
||||
def _parse_date_to_iso(self, date_str: str) -> Optional[str]:
    """Parse various date formats and convert to YYYY-MM-DD.

    Supported inputs (case-insensitive, surrounding whitespace ignored):
      * ``MON D``, ``MON D, YYYY`` and ``MON D YYYY`` with a three-letter
        month abbreviation; a missing year means the current year;
      * ``YYYY-MM-DD`` (anything after the date, e.g. a time, is dropped);
      * ``M/D/YYYY`` and ``M/D/YY``.

    Returns:
        The ISO date string, or ``None`` when the input is not a string, has
        an unsupported format, or names a day that does not exist (Feb 31).
    """
    import re
    from datetime import datetime

    month_abbrs = (
        'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
        'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
    )

    try:
        text = date_str.strip().upper()

        # Handle formats like "MAY 22", "JUN 01", "MAY 22, 2024", "MAY 22 2024"
        match = re.match(
            r'(' + '|'.join(month_abbrs) + r')\s+(\d{1,2})(?!\d)'
            r'(?:(?:\s*,\s*|\s+)(\d{4})(?!\d))?',
            text,
        )
        if match:
            month_abbr, day, year = match.groups()
            month = month_abbrs.index(month_abbr) + 1
            year = int(year) if year else datetime.now().year
            # Going through datetime rejects impossible dates.
            return datetime(year, month, int(day)).strftime('%Y-%m-%d')

        # Handle YYYY-MM-DD format
        match = re.match(r'(\d{4})-(\d{2})-(\d{2})(?!\d)', text)
        if match:
            year, month, day = (int(part) for part in match.groups())
            return datetime(year, month, day).strftime('%Y-%m-%d')

        # Handle MM/DD/YYYY format
        if re.fullmatch(r'\d{1,2}/\d{1,2}/\d{4}', text):
            return datetime.strptime(text, '%m/%d/%Y').strftime('%Y-%m-%d')

        # Handle MM/DD/YY format
        if re.fullmatch(r'\d{1,2}/\d{1,2}/\d{2}', text):
            return datetime.strptime(text, '%m/%d/%y').strftime('%Y-%m-%d')

        return None

    except (AttributeError, TypeError, ValueError):
        # Non-string input or a calendar-invalid date.
        return None
|
||||
@@ -1,157 +0,0 @@
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
class GoogleDriveSync:
    """Read-only Google Drive client used to discover receipt files.

    Attributes:
        service: Drive v3 service object, or ``None`` until
            :meth:`authenticate` succeeds.
        creds: OAuth credentials in use, or ``None`` before authentication.
        processed_files: IDs of files already returned by
            :meth:`process_drive_files`, so they are not reported twice.
    """

    # MIME types treated as receipt documents.
    _RECEIPT_MIME_TYPES = (
        "application/pdf",
        "image/jpeg",
        "image/png",
        "image/gif",
        "image/bmp",
    )

    def __init__(self):
        self.service = None
        # Must exist before authenticate() reads it; without this the first
        # run (no token.json yet) failed with AttributeError.
        self.creds = None
        self.processed_files = set()

    def authenticate(self):
        """Authenticate with the Google Drive API.

        Loads cached credentials from ``token.json`` when present, refreshes
        them if expired, and otherwise runs the local-server OAuth flow using
        ``credentials.json``. New or refreshed credentials are written back
        to ``token.json``.

        Returns:
            ``True`` when the Drive service was built, ``False`` on any
            failure (the error is printed).
        """
        try:
            from google.auth.transport.requests import Request
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
            from googleapiclient.discovery import build

            SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]

            creds = None
            # Load existing credentials
            if os.path.exists("token.json"):
                creds = Credentials.from_authorized_user_file("token.json", SCOPES)

            # If no valid credentials available, let user log in
            if not creds or not creds.valid:
                if creds and creds.expired and creds.refresh_token:
                    creds.refresh(Request())
                else:
                    if not os.path.exists("credentials.json"):
                        raise Exception(
                            "credentials.json not found. Please download from Google Cloud Console."
                        )
                    flow = InstalledAppFlow.from_client_secrets_file(
                        "credentials.json", SCOPES
                    )
                    creds = flow.run_local_server(port=0)

                # Save credentials for next run
                with open("token.json", "w") as token:
                    token.write(creds.to_json())

            self.creds = creds
            # Build the Drive service
            self.service = build("drive", "v3", credentials=self.creds)
            return True

        except Exception as e:
            print(f"Authentication error: {e}")
            return False

    def _ensure_service(self) -> bool:
        """Return True when a Drive service is available, authenticating if needed."""
        return bool(self.service) or self.authenticate()

    def list_folders(self) -> List[Dict[str, Any]]:
        """List all folders in Google Drive.

        Follows ``nextPageToken`` so every result page is collected.

        Returns:
            A list of folder resources (``id``, ``name``, ``createdTime``,
            ``modifiedTime``), or an empty list when authentication or the
            API call fails.
        """
        if not self._ensure_service():
            return []

        folders: List[Dict[str, Any]] = []
        page_token = None
        try:
            while True:
                response = (
                    self.service.files()
                    .list(
                        q="mimeType='application/vnd.google-apps.folder'",
                        pageSize=100,
                        fields="nextPageToken, files(id, name, createdTime, modifiedTime)",
                        pageToken=page_token,
                    )
                    .execute()
                )
                folders.extend(response.get("files", []))
                page_token = response.get("nextPageToken")
                if not page_token:
                    return folders

        except Exception as e:
            print(f"Error listing folders: {e}")
            return []

    def get_folder_info(self, folder_id: str) -> Dict[str, Any]:
        """Get information about a Google Drive folder.

        Args:
            folder_id: Drive file ID of the folder.

        Returns:
            The folder resource (``id``, ``name``, ``createdTime``,
            ``modifiedTime``), or an empty dict when authentication or the
            API call fails.
        """
        if not self._ensure_service():
            return {}

        try:
            return (
                self.service.files()
                .get(fileId=folder_id, fields="id, name, createdTime, modifiedTime")
                .execute()
            )

        except Exception as e:
            print(f"Error getting folder info: {e}")
            return {}

    async def process_drive_files(self, folder_id: str = None) -> List[Dict[str, Any]]:
        """Process receipt files from Google Drive.

        Searches for PDF/image files modified in the last 30 days, optionally
        restricted to one folder, skips files already seen, and returns a
        result record for up to three of them. The extraction fields in each
        record are placeholder demo values, not real OCR output.

        Args:
            folder_id: Optional Drive folder ID; when given, only direct
                children of that folder are considered.

        Returns:
            A list of result dicts (possibly empty). API errors are printed
            and whatever was collected so far is returned.
        """
        if not self._ensure_service():
            return []

        results = []

        try:
            from datetime import timezone

            # Each MIME type needs its own comparison, and the OR group must be
            # parenthesised so the AND clauses below apply to all of them.
            mime_clause = " or ".join(
                f"mimeType = '{mime}'" for mime in self._RECEIPT_MIME_TYPES
            )
            query = f"({mime_clause})"

            if folder_id:
                # Escape backslashes and quotes so the ID cannot break the query.
                safe_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")
                query += f" and '{safe_id}' in parents"

            # Add date filter (last 30 days); the timestamp must really be UTC
            # because it carries the "Z" suffix.
            thirty_days_ago = (
                datetime.now(timezone.utc) - timedelta(days=30)
            ).strftime("%Y-%m-%dT%H:%M:%SZ")
            query += f" and modifiedTime > '{thirty_days_ago}'"

            results_files = (
                self.service.files()
                .list(
                    q=query,
                    pageSize=100,
                    fields="nextPageToken, files(id, name, mimeType, modifiedTime, size)",
                )
                .execute()
            )

            files = [
                file
                for file in results_files.get("files", [])
                if file["id"] not in self.processed_files
            ]

            # For demo purposes, return mock results
            for file in files[:3]:  # Process first 3 files
                mock_result = {
                    "file_id": file["id"],
                    "filename": file["name"],
                    "drive_modified": file["modifiedTime"],
                    "file_size": file.get("size", 0),
                    "extraction_success": True,
                    "vendor": "Demo Vendor",
                    "description": "Coffee and sandwich",
                    "total_amount": 25.50,
                    "tax_amount": 2.04,
                    "date": "2024-01-15",
                    "category": "Food",
                    "confidence": 0.95,
                }
                results.append(mock_result)
                self.processed_files.add(file["id"])

        except Exception as e:
            print(f"Error processing Drive files: {e}")

        return results
|
||||
@@ -1,89 +0,0 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from ai_matcher import AIMatcher
|
||||
from ai_rules import AIRulesEngine
|
||||
from feedback_logger import FeedbackLogger
|
||||
from models import Match, Receipt, Transaction
|
||||
|
||||
|
||||
class MatchingEngine:
    """Pairs receipts with transactions and records user decisions.

    AI-proposed matches are post-processed by the rules engine, and every
    approval or rejection is written to the feedback logger.
    """

    def __init__(self):
        self.ai_matcher = AIMatcher()
        self.rules_engine = AIRulesEngine()
        self.feedback_logger = FeedbackLogger()

    def process_matching(
        self, receipts: List[Receipt], transactions: List[Transaction]
    ) -> List[Match]:
        """Return the AI matcher's matches after each has been run through the rules."""
        candidates = self.ai_matcher.match_receipts_to_transactions(
            receipts, transactions
        )
        return [self._enhance_match_with_rules(candidate) for candidate in candidates]

    def _enhance_match_with_rules(self, match: Match) -> Match:
        """Adjust ``match`` in place according to the rules engine and return it."""
        outcome = self.rules_engine.apply_rules(match.receipt, match.transaction)

        # A positive boost raises the score, capped at 1.0.
        boost = outcome["confidence_boost"]
        if boost > 0:
            match.confidence_score = min(1.0, match.confidence_score + boost)

        # Auto-approval pins the score to the maximum and annotates the reason.
        if outcome["auto_approve"]:
            match.confidence_score = 1.0
            match.match_reason += " (Auto-approved by rules)"

        # Carry over the tax analysis only when the rules produced one.
        tax_analysis = outcome.get("tax_analysis")
        if tax_analysis:
            match.tax_analysis = outcome["tax_analysis"]

        return match

    def approve_match(self, match: Match, user_id: str):
        """Log that ``user_id`` approved ``match``."""
        self.feedback_logger.log_override(
            transaction_id=match.transaction.id,
            original_match=f"AI Score: {match.confidence_score}",
            correction="Approved",
            reason="User approved match",
            user_id=user_id,
        )

    def reject_match(self, match: Match, reason: str, user_id: str):
        """Log that ``user_id`` rejected ``match`` for the given ``reason``."""
        self.feedback_logger.log_override(
            transaction_id=match.transaction.id,
            original_match=f"AI Score: {match.confidence_score}",
            correction="Rejected",
            reason=reason,
            user_id=user_id,
        )

    def get_matching_stats(self, matches: List[Match]) -> Dict[str, Any]:
        """Summarise ``matches``: count, high/low confidence split at 0.8, mean score."""
        if not matches:
            return {
                "total": 0,
                "high_confidence": 0,
                "low_confidence": 0,
                "avg_score": 0,
            }

        total = len(matches)
        high = sum(1 for m in matches if m.confidence_score >= 0.8)
        low = sum(1 for m in matches if m.confidence_score < 0.8)
        mean_score = sum(m.confidence_score for m in matches) / total

        return {
            "total": total,
            "high_confidence": high,
            "low_confidence": low,
            "avg_score": round(mean_score, 3),
        }
|
||||
@@ -1,59 +0,0 @@
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
@dataclass
class Address:
    """Location details consulted by tax calculations."""

    province: str
    city: str
    postal_code: str
    # Defaults to Canada when no country is supplied.
    country: str = "Canada"
|
||||
|
||||
@dataclass
class Receipt:
    """Fields recorded for a single receipt."""

    id: str
    file_name: str
    upload_date: datetime
    receipt_date: datetime
    amount: float
    tax: float
    vendor: str
    category: str
    description: str

    # Optional inputs for tax rules; addresses may be absent.
    billing_address: Optional[Address] = None
    shipping_address: Optional[Address] = None
    currency: str = "CAD"
    is_meals_entertainment: bool = False
|
||||
|
||||
@dataclass
class Transaction:
    """Fields recorded for a single transaction."""

    id: str
    transaction_date: datetime
    amount: float
    vendor: str
    notes: str

    # Optional inputs for tax rules; fx_rate is unset unless provided.
    currency: str = "CAD"
    fx_rate: Optional[float] = None
|
||||
|
||||
@dataclass
class Asset:
    """Capital asset record used for depreciation calculations."""

    id: str
    name: str
    purchase_date: datetime
    purchase_amount: float
    useful_life_years: int
    residual_value: float
    # Capital Cost Allowance rate.
    cca_rate: float
    asset_class: str
|
||||
|
||||
@dataclass
class Match:
    """A receipt paired with a transaction, plus its score and rationale."""

    receipt: Receipt
    transaction: Transaction
    confidence_score: float
    match_reason: str
    # Left as None unless a tax analysis is attached.
    tax_analysis: Optional[dict] = None
|
||||
+3
-1
@@ -13,4 +13,6 @@ aiofiles
|
||||
google-auth
|
||||
google-auth-oauthlib
|
||||
google-auth-httplib2
|
||||
google-api-python-client
|
||||
google-api-python-client
|
||||
sqlalchemy
|
||||
pydantic-settings
|
||||
Reference in New Issue
Block a user