Update README and core files, remove test/debug scripts, improve documentation and robustness
This commit is contained in:
@@ -7,9 +7,9 @@ AI-powered receipt-to-transaction matching engine using Groq LLM. This is a **Da
|
|||||||
This Data Science Engine receives QuickBooks transaction data from backend applications and provides:
|
This Data Science Engine receives QuickBooks transaction data from backend applications and provides:
|
||||||
- **AI-powered receipt processing** (OCR and data extraction)
|
- **AI-powered receipt processing** (OCR and data extraction)
|
||||||
- **Intelligent receipt-transaction matching** with confidence scores
|
- **Intelligent receipt-transaction matching** with confidence scores
|
||||||
- **Google Drive integration** for batch receipt processing
|
|
||||||
- **Configurable AI rules** for business logic
|
- **Configurable AI rules** for business logic
|
||||||
- **Feedback logging** for continuous improvement
|
- **Feedback logging** for continuous improvement
|
||||||
|
- **RESTful API** for easy integration
|
||||||
|
|
||||||
## 🚀 Quick Start
|
## 🚀 Quick Start
|
||||||
|
|
||||||
@@ -19,11 +19,22 @@ pip install -r requirements.txt
|
|||||||
```
|
```
|
||||||
|
|
||||||
### 2. Configure API Keys
|
### 2. Configure API Keys
|
||||||
The Groq API key is already configured in `config.py`
|
Create a `.env` file in the project root with your Groq API key:
|
||||||
|
|
||||||
### 3. Start the DS Engine
|
|
||||||
```bash
|
```bash
|
||||||
|
# Create .env file
|
||||||
|
echo "GROQ_API_KEY=your_actual_groq_api_key_here" > .env
|
||||||
|
```
|
||||||
|
|
||||||
|
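**Important**: Get your API key from [Groq Console](https://console.groq.com/) and replace the `your_actual_groq_api_key_here` placeholder in `.env` with it before starting the server.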
|
||||||
|
|
||||||
|
### 3. Start the Server
|
||||||
|
```bash
|
||||||
|
# Option 1: Using the main script
|
||||||
python main.py
|
python main.py
|
||||||
|
|
||||||
|
# Option 2: Using uvicorn directly
|
||||||
|
uvicorn main:app --host 0.0.0.0 --port 8343 --reload
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. Access API Documentation
|
### 4. Access API Documentation
|
||||||
@@ -32,22 +43,16 @@ python main.py
|
|||||||
|
|
||||||
## 📋 API Endpoints
|
## 📋 API Endpoints
|
||||||
|
|
||||||
### QuickBooks Data Import
|
### Transaction Import
|
||||||
- `POST /transactions/import/quickbooks` - Import and convert QuickBooks transactions
|
- `POST /transactions/import/csv` - Import transactions from CSV file
|
||||||
|
- `POST /transactions/import/image` - Import transactions from image/PDF
|
||||||
|
|
||||||
### Receipt Processing
|
### Receipt Processing
|
||||||
- `POST /upload` - Upload receipt documents (PDF/images)
|
- `POST /upload-multiple` - Upload multiple receipt documents
|
||||||
- `POST /process/{file_id}` - Extract data from uploaded documents
|
- `POST /process/{file_id}` - Extract data from uploaded documents
|
||||||
- `GET /documents` - List all processed documents
|
|
||||||
|
|
||||||
### Google Drive Integration
|
|
||||||
- `POST /drive/sync` - Sync and process receipts from Google Drive
|
|
||||||
- `GET /drive/folders` - List accessible Google Drive folders
|
|
||||||
- `GET /drive/folder/{folder_id}` - Get folder information
|
|
||||||
|
|
||||||
### AI Matching Engine
|
### AI Matching Engine
|
||||||
- `POST /match` - Match receipts to transactions using AI
|
- `POST /match-specific` - Match specific receipts to transactions using AI
|
||||||
- `POST /approve` - Approve or reject AI matches
|
|
||||||
|
|
||||||
### AI Rules Management
|
### AI Rules Management
|
||||||
- `POST /rules` - Add new AI rules
|
- `POST /rules` - Add new AI rules
|
||||||
@@ -56,6 +61,7 @@ python main.py
|
|||||||
|
|
||||||
### System Monitoring
|
### System Monitoring
|
||||||
- `GET /stats` - Get system statistics and performance metrics
|
- `GET /stats` - Get system statistics and performance metrics
|
||||||
|
- `GET /` - Health check endpoint
|
||||||
|
|
||||||
## 🔧 Core Components
|
## 🔧 Core Components
|
||||||
|
|
||||||
@@ -63,21 +69,25 @@ python main.py
|
|||||||
- Uses Groq LLM to compare receipts and transactions
|
- Uses Groq LLM to compare receipts and transactions
|
||||||
- Provides confidence scores and reasoning
|
- Provides confidence scores and reasoning
|
||||||
- Configurable matching criteria (amount, date, vendor)
|
- Configurable matching criteria (amount, date, vendor)
|
||||||
|
- Rate limiting to prevent API quota exhaustion
|
||||||
|
|
||||||
### **AIRulesEngine** (`ai_rules.py`)
|
### **AIRulesEngine** (`ai_rules.py`)
|
||||||
- Applies business rules for auto-approval and categorization
|
- Applies business rules for auto-approval and categorization
|
||||||
- Configurable rule conditions and actions
|
- Configurable rule conditions and actions
|
||||||
- Supports system and user-generated rules
|
- Supports system and user-generated rules
|
||||||
|
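- Built-in rule conditions are matched explicitly; any other condition still falls back to Python's `eval()`, so only load rules from trusted sources. Invalid conditions are reported and treated as not matching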
|
||||||
|
|
||||||
### **DocumentProcessor** (`document_processor.py`)
|
### **DocumentProcessor** (`document_processor.py`)
|
||||||
- AI-powered receipt data extraction
|
- AI-powered receipt data extraction using Groq vision model
|
||||||
- Supports PDF and image formats
|
- Supports PDF and image formats
|
||||||
- Uses Groq vision model for OCR
|
- Robust JSON parsing with error handling
|
||||||
|
- Extracts vendor, amount, date, tax, and category information
|
||||||
|
|
||||||
### **MatchingEngine** (`matching_engine.py`)
|
### **MatchingEngine** (`matching_engine.py`)
|
||||||
- Main orchestrator combining all components
|
- Main orchestrator combining all components
|
||||||
- Handles the complete matching workflow
|
- Handles the complete matching workflow
|
||||||
- Provides statistics and feedback logging
|
- Provides statistics and feedback logging
|
||||||
|
- Configurable confidence thresholds
|
||||||
|
|
||||||
### **FeedbackLogger** (`feedback_logger.py`)
|
### **FeedbackLogger** (`feedback_logger.py`)
|
||||||
- Tracks manual overrides for AI training
|
- Tracks manual overrides for AI training
|
||||||
@@ -87,70 +97,46 @@ python main.py
|
|||||||
## 📊 Configuration
|
## 📊 Configuration
|
||||||
|
|
||||||
Edit `config.py` to adjust:
|
Edit `config.py` to adjust:
|
||||||
- **Confidence threshold** (default: 0.8)
|
- **Confidence threshold** (default: 0.3)
|
||||||
- **Date tolerance days** (default: 7)
|
- **Date tolerance days** (default: 7)
|
||||||
- **Amount tolerance percent** (default: 5%)
|
- **Amount tolerance percent** (default: 5%)
|
||||||
- **Groq API key** (already configured)
|
- **Groq API key** (from environment variable)
|
||||||
|
|
||||||
## 🔄 Integration Workflow
|
## 🔄 Integration Workflow
|
||||||
|
|
||||||
### 1. Backend Sends QuickBooks Data
|
### 1. Import Transactions
|
||||||
```python
|
```bash
|
||||||
# Backend sends QuickBooks transactions
|
# Import from CSV
|
||||||
response = requests.post(
|
curl -X POST -F "file=@transactions.csv" http://localhost:8343/transactions/import/csv
|
||||||
"http://localhost:8343/transactions/import/quickbooks",
|
|
||||||
json={
|
# Import from image
|
||||||
"transactions": [
|
curl -X POST -F "file=@statement.jpg" http://localhost:8343/transactions/import/image
|
||||||
{
|
|
||||||
"id": "QB_TXN_123",
|
|
||||||
"txn_date": "2024-01-15",
|
|
||||||
"amount": 12.50,
|
|
||||||
"payee_name": "Starbucks",
|
|
||||||
"memo": "Coffee purchase"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 2. Process Receipts
|
### 2. Upload and Process Receipts
|
||||||
```python
|
```bash
|
||||||
# Sync from Google Drive
|
# Upload receipts
|
||||||
response = requests.post(
|
curl -X POST -F "files=@receipt1.jpg" -F "files=@receipt2.jpg" http://localhost:8343/upload-multiple
|
||||||
"http://localhost:8343/drive/sync",
|
|
||||||
json={"folder_id": "your_folder_id"}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Or upload directly
|
# Process a specific receipt
|
||||||
response = requests.post(
|
curl -X POST http://localhost:8343/process/{file_id}
|
||||||
"http://localhost:8343/upload",
|
|
||||||
files={"file": receipt_file}
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 3. AI Matching
|
### 3. AI Matching
|
||||||
```python
|
```bash
|
||||||
# Match receipts to transactions
|
# Match specific receipts
|
||||||
response = requests.post(
|
curl -X POST -H "Content-Type: application/json" \
|
||||||
"http://localhost:8343/match",
|
-d '["file_id_1", "file_id_2"]' \
|
||||||
json={
|
http://localhost:8343/match-specific
|
||||||
"receipts": processed_receipts,
|
|
||||||
"transactions": converted_transactions
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4. User Feedback
|
### 4. Check Results
|
||||||
```python
|
```bash
|
||||||
# Approve or reject matches
|
# Get system stats
|
||||||
response = requests.post(
|
curl http://localhost:8343/stats
|
||||||
"http://localhost:8343/approve",
|
|
||||||
json={
|
# View AI rules
|
||||||
"match_id": "match_123",
|
curl http://localhost:8343/rules
|
||||||
"user_id": "user_456",
|
|
||||||
"action": "approve"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🎯 Key Features
|
## 🎯 Key Features
|
||||||
@@ -159,55 +145,96 @@ response = requests.post(
|
|||||||
- **Rule-based auto-approval** and categorization
|
- **Rule-based auto-approval** and categorization
|
||||||
- **Feedback logging** for continuous improvement
|
- **Feedback logging** for continuous improvement
|
||||||
- **Configurable matching parameters**
|
- **Configurable matching parameters**
|
||||||
- **Google Drive integration** for batch processing
|
- **RESTful JSON API** for easy backend integration
|
||||||
- **JSON API** for easy backend integration
|
|
||||||
- **Comprehensive error handling**
|
- **Comprehensive error handling**
|
||||||
|
- **Rate limiting** to prevent API quota exhaustion
|
||||||
|
- **Robust JSON parsing** for AI responses
|
||||||
|
|
||||||
## 📝 Data Formats
|
## 📝 Data Formats
|
||||||
|
|
||||||
### QuickBooks Transaction Input
|
### Transaction Input (CSV)
|
||||||
|
```csv
|
||||||
|
Date,Description,Amount,Category
|
||||||
|
2024-01-15,Starbucks Coffee,12.50,Food & Dining
|
||||||
|
2024-01-16,Office Supplies,45.99,Office
|
||||||
|
```
|
||||||
|
|
||||||
|
### Receipt Processing Output
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"id": "string",
|
"vendor": "Starbucks",
|
||||||
"txn_date": "YYYY-MM-DD",
|
"total_amount": 12.50,
|
||||||
"amount": 0.00,
|
"tax_amount": 1.25,
|
||||||
"payee_name": "string",
|
"date": "2024-01-15",
|
||||||
"memo": "string (optional)",
|
"category": "Food & Dining",
|
||||||
"account_name": "string (optional)",
|
"confidence": 0.95,
|
||||||
"txn_type": "string (optional)"
|
"extraction_success": true
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
### Match Result Output
|
### Match Result Output
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"receipt_id": "string",
|
"receipt_id": "uuid",
|
||||||
"transaction_id": "string",
|
"transaction_id": "transaction_123",
|
||||||
"confidence_score": 0.95,
|
"confidence_score": 0.95,
|
||||||
"match_reason": "string",
|
"match_reason": "Same vendor, minor date difference (Auto-approved by rules)",
|
||||||
"receipt_vendor": "string",
|
"receipt_vendor": "Starbucks",
|
||||||
"receipt_amount": 0.00,
|
"receipt_amount": 12.50,
|
||||||
"transaction_vendor": "string",
|
"transaction_vendor": "STARBUCKS",
|
||||||
"transaction_amount": 0.00
|
"transaction_amount": 12.50
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🔍 AI Matching Criteria
|
## 🔍 AI Matching Criteria
|
||||||
|
|
||||||
The engine uses three primary criteria for matching:
|
The engine uses multiple criteria for matching:
|
||||||
|
|
||||||
1. **Amount Similarity** - Compares receipt and transaction amounts (5% tolerance)
|
1. **Amount Similarity** - Compares receipt and transaction amounts (5% tolerance)
|
||||||
2. **Date Proximity** - Checks date closeness (7-day tolerance)
|
2. **Date Proximity** - Checks date closeness (7-day tolerance)
|
||||||
3. **Vendor Matching** - AI-powered vendor name comparison
|
3. **Vendor Matching** - AI-powered vendor name comparison using Groq LLM
|
||||||
|
4. **Rule-based Auto-approval** - Automatic approval for exact matches and high-confidence matches
|
||||||
|
|
||||||
|
## 🛠️ Development
|
||||||
|
|
||||||
|
### Project Structure
|
||||||
|
```
|
||||||
|
├── main.py # FastAPI application entry point
|
||||||
|
├── ai_matcher.py # AI-powered matching logic
|
||||||
|
├── ai_rules.py # Business rules engine
|
||||||
|
├── document_processor.py # Receipt data extraction
|
||||||
|
├── matching_engine.py # Main matching orchestrator
|
||||||
|
├── feedback_logger.py # User feedback tracking
|
||||||
|
├── models.py # Pydantic data models
|
||||||
|
├── api_models.py # API request/response models
|
||||||
|
├── config.py # Configuration settings
|
||||||
|
├── requirements.txt # Python dependencies
|
||||||
|
└── test_images/ # Test image files
|
||||||
|
```
|
||||||
|
|
||||||
|
### Running Tests
|
||||||
|
```bash
|
||||||
|
# Test the server
|
||||||
|
curl http://localhost:8343/
|
||||||
|
|
||||||
|
# Test stats endpoint
|
||||||
|
curl http://localhost:8343/stats
|
||||||
|
|
||||||
|
# Test rules endpoint
|
||||||
|
curl http://localhost:8343/rules
|
||||||
|
```
|
||||||
|
|
||||||
## 🚀 Production Deployment
|
## 🚀 Production Deployment
|
||||||
|
|
||||||
For production deployment:
|
For production deployment:
|
||||||
- Replace in-memory storage with a database
|
- Replace in-memory storage with a database (PostgreSQL recommended)
|
||||||
- Configure proper authentication
|
- Configure proper authentication and authorization
|
||||||
- Set up monitoring and logging
|
- Set up monitoring and logging (ELK stack recommended)
|
||||||
- Use environment variables for configuration
|
- Use environment variables for all configuration
|
||||||
- Implement proper error handling and retries
|
- Implement proper error handling and retries
|
||||||
|
- Set up rate limiting and API quotas
|
||||||
|
- Configure CORS for frontend integration
|
||||||
|
- Use HTTPS in production
|
||||||
|
|
||||||
## 📞 Support
|
## 📞 Support
|
||||||
|
|
||||||
@@ -218,3 +245,18 @@ This Data Science Engine is designed to be integrated with backend applications
|
|||||||
- External integrations
|
- External integrations
|
||||||
|
|
||||||
The engine focuses purely on AI/ML capabilities and provides a clean JSON API for backend integration.
|
The engine focuses purely on AI/ML capabilities and provides a clean JSON API for backend integration.
|
||||||
|
|
||||||
|
## 🔧 Troubleshooting
|
||||||
|
|
||||||
|
### Common Issues
|
||||||
|
|
||||||
|
1. **API Key Error**: Ensure `GROQ_API_KEY` is set in your `.env` file
|
||||||
|
2. **Port Already in Use**: Kill existing process with `pkill -f "python main.py"`
|
||||||
|
3. **Import Errors**: Install dependencies with `pip install -r requirements.txt`
|
||||||
|
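4. **Slow Matching / Rate Limiting**: To avoid exhausting the Groq API quota, the matcher waits about one second between receipts and retries failed API calls with increasing delays, so large batches take noticeably longer; this is expected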
|
||||||
|
|
||||||
|
### Logs
|
||||||
|
Check the application logs for detailed error information:
|
||||||
|
```bash
|
||||||
|
tail -f app.log
|
||||||
|
```
|
||||||
+151
-27
@@ -3,34 +3,75 @@ from datetime import datetime, timedelta
|
|||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
import config
|
import config
|
||||||
from models import Receipt, Transaction, Match
|
from models import Receipt, Transaction, Match
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
# Set up logging
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class AIMatcher:
|
class AIMatcher:
|
||||||
def __init__(self):
    """Create the Groq client and set the retry and throttling defaults."""
    # Throttling state: minimum spacing between calls, and the timestamp
    # of the most recent one (0 means "no call made yet").
    self.last_api_call = 0
    self.rate_limit_delay = 1.0  # seconds between API calls

    # Retry policy for failed API calls.
    self.max_retries = 3
    self.retry_delay = 2  # seconds - increased for rate limiting

    # Groq client and the chat model used for scoring.
    self.model = "llama3-8b-8192"
    self.client = groq.Groq(api_key=config.GROQ_API_KEY)
|
||||||
|
|
||||||
def match_receipts_to_transactions(self, receipts: List[Receipt], transactions: List[Transaction]) -> List[Match]:
    """Find the best transaction for each receipt and rank the results.

    Each receipt is throttled through ``_rate_limit`` and then handed to
    ``_find_best_match``; receipts for which that returns nothing are
    logged and left out.

    Returns:
        The matches found, ordered by ``confidence_score``, highest first.
    """
    total = len(receipts)
    logger.info(f"Starting AI matching for {total} receipts against {len(transactions)} transactions")

    found = []
    for position, receipt in enumerate(receipts, start=1):
        logger.info(f"Processing receipt {position}/{total}: {receipt.vendor} - ${receipt.amount}")

        # Space out the work before each receipt is scored.
        self._rate_limit()

        # Only the single highest-scoring candidate is kept per receipt.
        candidate = self._find_best_match(receipt, transactions)
        if not candidate:
            logger.warning(f"No match found for receipt: {receipt.vendor} - ${receipt.amount}")
            continue

        found.append(candidate)
        logger.info(f"Found match: {candidate.confidence_score:.3f} - {candidate.match_reason}")

    # Highest confidence first; the sort is stable, so ties keep receipt order.
    found.sort(key=lambda match: match.confidence_score, reverse=True)
    logger.info(f"AI matching completed. Found {len(found)} matches")
    return found
|
||||||
|
|
||||||
|
def _rate_limit(self):
    """Sleep as needed so calls are at least ``rate_limit_delay`` seconds apart.

    Reads ``self.last_api_call`` (a ``time.time()`` timestamp) and
    ``self.rate_limit_delay``, then stores the current time in
    ``self.last_api_call`` before returning.
    """
    elapsed = time.time() - self.last_api_call
    # time.time() is wall-clock time and can jump backwards (NTP or manual
    # adjustment). A negative interval would otherwise turn into a sleep
    # far longer than the configured delay, so treat it as "no time passed".
    elapsed = max(elapsed, 0.0)

    wait = self.rate_limit_delay - elapsed
    if wait > 0:
        logger.debug(f"Rate limiting: sleeping for {wait:.2f} seconds")
        time.sleep(wait)

    self.last_api_call = time.time()
|
||||||
|
|
||||||
def _find_best_match(self, receipt: Receipt, transactions: List[Transaction]) -> Match:
|
def _find_best_match(self, receipt: Receipt, transactions: List[Transaction]) -> Match:
|
||||||
"""Find the BEST match for a receipt (highest confidence score)"""
|
"""Find the BEST match for a receipt (highest confidence score)"""
|
||||||
candidates = self._filter_candidates(receipt, transactions)
|
candidates = self._filter_candidates(receipt, transactions)
|
||||||
if not candidates:
|
if not candidates:
|
||||||
|
logger.warning(f"No candidates found for receipt: {receipt.vendor} - ${receipt.amount}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
logger.info(f"Found {len(candidates)} candidates for receipt: {receipt.vendor}")
|
||||||
|
|
||||||
best_match = None
|
best_match = None
|
||||||
highest_score = 0
|
highest_score = 0
|
||||||
|
|
||||||
for transaction in candidates:
|
for transaction in candidates:
|
||||||
score, reason = self._calculate_match_score(receipt, transaction)
|
score, reason = self._calculate_match_score(receipt, transaction)
|
||||||
|
logger.debug(f"Score {score:.3f} for transaction {transaction.vendor}: {reason}")
|
||||||
|
|
||||||
# Keep the match with the highest score, regardless of how low it is
|
# Keep the match with the highest score, regardless of how low it is
|
||||||
if score > highest_score:
|
if score > highest_score:
|
||||||
highest_score = score
|
highest_score = score
|
||||||
@@ -39,21 +80,23 @@ class AIMatcher:
|
|||||||
return best_match
|
return best_match
|
||||||
|
|
||||||
def _filter_candidates(self, receipt: Receipt, transactions: List[Transaction]) -> List[Transaction]:
    """Return the transactions whose amount is close enough to be worth scoring.

    Amounts are compared by magnitude on both sides. A transaction is kept
    when its magnitude differs from the receipt's by at most twice the
    receipt amount, which only removes obvious mismatches.

    Args:
        receipt: The receipt being matched; only ``amount`` is read.
        transactions: All transactions to consider; only ``amount`` is read.

    Returns:
        The retained transactions, in their original order.
    """
    # Use the magnitude of the receipt amount: a negative amount would
    # otherwise produce a negative threshold that no transaction can satisfy.
    receipt_amount = abs(receipt.amount)
    amount_threshold = receipt_amount * 2.0  # 200% threshold - very inclusive

    candidates = [
        transaction
        for transaction in transactions
        if abs(receipt_amount - abs(transaction.amount)) <= amount_threshold
    ]

    logger.debug(f"Filtered {len(transactions)} transactions to {len(candidates)} candidates")
    return candidates
|
||||||
|
|
||||||
def _calculate_match_score(self, receipt: Receipt, transaction: Transaction) -> Tuple[float, str]:
|
def _calculate_match_score(self, receipt: Receipt, transaction: Transaction) -> Tuple[float, str]:
|
||||||
|
"""Calculate match score using AI"""
|
||||||
# Calculate differences for the AI to consider
|
# Calculate differences for the AI to consider
|
||||||
date_diff = abs((receipt.receipt_date - transaction.transaction_date).days)
|
date_diff = abs((receipt.receipt_date - transaction.transaction_date).days)
|
||||||
transaction_amount_abs = abs(transaction.amount)
|
transaction_amount_abs = abs(transaction.amount)
|
||||||
@@ -61,7 +104,7 @@ class AIMatcher:
|
|||||||
amount_percent_diff = (amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
|
amount_percent_diff = (amount_diff / receipt.amount) * 100 if receipt.amount > 0 else 0
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
Compare this receipt with this transaction and provide a confidence score (0-1) and brief reason:
|
Compare this receipt with this transaction and provide a confidence score (0-1) and brief reason.
|
||||||
|
|
||||||
Receipt: {receipt.vendor}, ${receipt.amount}, {receipt.receipt_date.strftime('%Y-%m-%d')}
|
Receipt: {receipt.vendor}, ${receipt.amount}, {receipt.receipt_date.strftime('%Y-%m-%d')}
|
||||||
Transaction: {transaction.vendor}, ${transaction.amount} (absolute: ${transaction_amount_abs}), {transaction.transaction_date.strftime('%Y-%m-%d')}
|
Transaction: {transaction.vendor}, ${transaction.amount} (absolute: ${transaction_amount_abs}), {transaction.transaction_date.strftime('%Y-%m-%d')}
|
||||||
@@ -81,33 +124,114 @@ class AIMatcher:
|
|||||||
- Minimal similarity: 0.1-0.19
|
- Minimal similarity: 0.1-0.19
|
||||||
- No meaningful similarity: 0.0-0.09
|
- No meaningful similarity: 0.0-0.09
|
||||||
|
|
||||||
Examples:
|
IMPORTANT: Return ONLY the score and reason separated by a pipe character.
|
||||||
- Same vendor, same amount, 11 days apart: 0.7-0.8
|
Format: [score]|[reason]
|
||||||
- Similar vendor name, same amount, same date: 0.8-0.9
|
Example: 0.85|Same vendor, same amount, 2 days apart
|
||||||
- Same vendor, 10% amount difference, same date: 0.6-0.7
|
|
||||||
- Different vendor, same amount, same date: 0.3-0.4
|
|
||||||
- Completely different vendor, amount, date: 0.1-0.2
|
|
||||||
|
|
||||||
Consider vendor name similarity, amount accuracy, and date proximity. Score based on overall likelihood this is the correct match.
|
|
||||||
|
|
||||||
Return only: score|reason
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
for attempt in range(self.max_retries):
|
||||||
|
try:
|
||||||
|
result = self._call_groq_api_with_timeout(prompt, timeout=30) # Increased timeout
|
||||||
|
|
||||||
|
# Parse the result - handle multiple formats
|
||||||
|
score, reason = self._parse_ai_response(result)
|
||||||
|
|
||||||
|
logger.debug(f"AI Response: {result}")
|
||||||
|
logger.debug(f"Parsed: score={score}, reason={reason}")
|
||||||
|
|
||||||
|
return score, reason
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Attempt {attempt + 1} failed for receipt {receipt.id}: {str(e)}")
|
||||||
|
if attempt < self.max_retries - 1:
|
||||||
|
# Exponential backoff for rate limiting
|
||||||
|
sleep_time = self.retry_delay * (2 ** attempt)
|
||||||
|
logger.info(f"Waiting {sleep_time} seconds before retry...")
|
||||||
|
time.sleep(sleep_time)
|
||||||
|
else:
|
||||||
|
logger.error(f"All attempts failed for receipt {receipt.id}")
|
||||||
|
return 0.0, f"AI error after {self.max_retries} attempts: {str(e)}"
|
||||||
|
|
||||||
|
def _parse_ai_response(self, result: str) -> Tuple[float, str]:
    """Extract a ``(score, reason)`` pair from the model's raw reply.

    The prompt asks for ``score|reason``, but replies are free text, so
    progressively looser strategies are tried:

    1. Pipe-separated reply: the first segment that is *only* a number in
       ``[0, 1]`` (optionally bracketed or prefixed by a short ``label:``)
       is the score; the other non-empty segments, joined with `` | ``,
       form the reason.
    2. The first standalone number in ``[0, 1]`` anywhere in the reply.
    3. The first standalone number in ``(1, 100]``, read as a percentage.

    Numbers glued to words, currency amounts (``$12.50``), ISO dates and
    negative values are never taken as a score.

    Args:
        result: Raw text returned by the model.

    Returns:
        ``(score, reason)`` with ``0.0 <= score <= 1.0``. When nothing
        usable is found the score is ``0.0`` and the reason starts with
        ``"Unparseable response:"``.
    """
    import re  # function-scope import, as in the original implementation

    text = result.strip()
    logger.debug(f"Parsing AI response: {text}")

    # A whole segment that is nothing but a number: "0.85", "[0.85]", "Score: 0.85".
    # Matching the full segment avoids gluing unrelated digits together
    # (e.g. "0 days apart, 1 cent off" must not become "01").
    score_segment = re.compile(r"^[\s\[(*`]*(?:[A-Za-z ]+[:=]\s*)?(-?\d*\.?\d+)[\s\])*`]*$")
    # A number that does not continue a word, a decimal, or a currency amount.
    # The optional sign is kept so negative values can be rejected, not flipped.
    standalone_number = re.compile(r"(?<![\w.$])-?\d+(?:\.\d+)?")

    if '|' in text:
        segments = [segment.strip() for segment in text.split('|')]
        logger.debug(f"Split response into {len(segments)} parts: {segments}")

        for index, segment in enumerate(segments):
            found = score_segment.match(segment)
            if not found:
                continue
            score = float(found.group(1))
            if 0 <= score <= 1:  # Valid confidence score
                others = [s for j, s in enumerate(segments) if j != index and s]
                reason = ' | '.join(others) if others else "Score extracted"
                logger.debug(f"Found score {score} in part {index}, reason: {reason}")
                return score, reason

    # Dates echoed back from the prompt (YYYY-MM-DD) are not scores.
    scrubbed = re.sub(r"\d{4}-\d{2}-\d{2}", " ", text)
    values = [float(token) for token in standalone_number.findall(scrubbed)]

    for value in values:
        if 0 <= value <= 1:  # Valid confidence score
            logger.debug(f"Extracted score {value} from response")
            return value, f"Extracted from response: {text[:50]}..."

    # Only values that can plausibly be a percentage are normalized; larger
    # numbers (amounts, ids) are not evidence of a confident match.
    for value in values:
        if 1 < value <= 100:
            score = value / 100
            logger.debug(f"Normalized score {score} from response")
            return score, f"Normalized from response: {text[:50]}..."

    logger.warning(f"Could not parse AI response: {text}")
    return 0.0, f"Unparseable response: {text[:50]}..."
|
||||||
|
|
||||||
|
def _call_groq_api_with_timeout(self, prompt: str, timeout: int = 15) -> str:
    """Send ``prompt`` to the chat model and return its reply, waiting at most ``timeout`` seconds.

    The request runs in a worker thread so the caller can stop waiting
    after ``timeout`` seconds. This method does not retry; retrying is
    left to the caller.

    Args:
        prompt: The user message sent to the model.
        timeout: Maximum number of seconds to wait for the reply.

    Returns:
        The reply text with surrounding whitespace stripped.

    Raises:
        TimeoutError: If no reply arrives within ``timeout`` seconds.
        Exception: Any error raised by the API call is propagated unchanged.
    """
    import concurrent.futures  # function-scope import, as in the original

    def api_call() -> str:
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
            temperature=0.1,
        )
        return response.choices[0].message.content.strip()

    # The executor is deliberately not used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which blocks until the worker
    # finishes and would make the timeout ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(api_call)
        done, _ = concurrent.futures.wait([future], timeout=timeout)
        if not done:
            # A running thread cannot be interrupted; cancel() only helps if
            # the call has not started. The worker is left to finish on its own.
            future.cancel()
            raise TimeoutError(f"API call timed out after {timeout} seconds")
        return future.result()
    finally:
        executor.shutdown(wait=False)
|
||||||
+30
-5
@@ -20,7 +20,7 @@ class AIRulesEngine:
|
|||||||
self.rules = [
|
self.rules = [
|
||||||
AIRule("exact_amount_match", "amount_diff <= 0.01", "auto_approve", "system"),
|
AIRule("exact_amount_match", "amount_diff <= 0.01", "auto_approve", "system"),
|
||||||
AIRule("same_vendor_same_date", "vendor_match and date_diff <= 1", "high_confidence", "system"),
|
AIRule("same_vendor_same_date", "vendor_match and date_diff <= 1", "high_confidence", "system"),
|
||||||
AIRule("gas_station_pattern", "vendor contains 'gas' or 'fuel'", "categorize_transport", "system")
|
AIRule("gas_station_pattern", "vendor_contains_gas_or_fuel", "categorize_transport", "system")
|
||||||
]
|
]
|
||||||
|
|
||||||
def apply_rules(self, receipt: Receipt, transaction: Transaction) -> Dict[str, Any]:
|
def apply_rules(self, receipt: Receipt, transaction: Transaction) -> Dict[str, Any]:
|
||||||
@@ -36,17 +36,42 @@ class AIRulesEngine:
|
|||||||
return results
|
return results
|
||||||
|
|
||||||
def _evaluate_condition(self, condition: str, receipt: Receipt, transaction: Transaction) -> bool:
    """Decide whether a rule's condition holds for a receipt/transaction pair.

    The three built-in conditions are recognized by exact string and
    computed directly. Any other condition is evaluated with ``eval()``
    against a fixed set of names: ``amount_diff``, ``date_diff``,
    ``vendor_match``, ``vendor_contains_gas_or_fuel``, ``receipt``,
    ``transaction`` and the helpers ``abs``, ``len``, ``min``, ``max``,
    ``sum``, ``round``.

    NOTE(security): ``eval()`` is not a sandbox. Built-ins are removed
    here, but the exposed objects still allow attribute access, so
    conditions must only come from trusted sources.

    Args:
        condition: The rule's condition string.
        receipt: Receipt providing ``amount``, ``receipt_date`` and ``vendor``.
        transaction: Transaction providing ``amount``, ``transaction_date``
            and ``vendor``.

    Returns:
        True when the condition holds; False when it does not or when a
        custom condition cannot be evaluated.
    """
    amount_diff = abs(receipt.amount - abs(transaction.amount))
    date_diff = abs((receipt.receipt_date - transaction.transaction_date).days)

    receipt_vendor = receipt.vendor.lower()
    transaction_vendor = transaction.vendor.lower()
    # An empty string is a substring of every string, so an empty vendor
    # name must not count as a vendor match.
    vendor_match = bool(receipt_vendor and transaction_vendor) and (
        receipt_vendor in transaction_vendor or transaction_vendor in receipt_vendor
    )
    vendor_contains_gas_or_fuel = 'gas' in receipt_vendor or 'fuel' in receipt_vendor

    # Built-in conditions are handled without eval().
    if condition == "amount_diff <= 0.01":
        return amount_diff <= 0.01
    if condition == "vendor_match and date_diff <= 1":
        return vendor_match and date_diff <= 1
    if condition == "vendor_contains_gas_or_fuel":
        return vendor_contains_gas_or_fuel

    # Custom condition. Without an explicit "__builtins__" entry Python
    # injects the full builtins module into the eval globals, which would
    # make open(), __import__() etc. reachable from a rule.
    allowed_names = {
        "__builtins__": {},
        "amount_diff": amount_diff,
        "date_diff": date_diff,
        "vendor_match": vendor_match,
        "vendor_contains_gas_or_fuel": vendor_contains_gas_or_fuel,
        "receipt": receipt,
        "transaction": transaction,
        "abs": abs,
        "len": len,
        "min": min,
        "max": max,
        "sum": sum,
        "round": round,
    }
    try:
        return bool(eval(condition, allowed_names, {}))
    except (SyntaxError, NameError, TypeError, AttributeError) as e:
        # A malformed rule is reported and treated as "does not apply".
        print(f"Warning: Invalid condition '{condition}': {e}")
        return False
|
||||||
|
|
||||||
def _execute_action(self, action: str, results: Dict[str, Any], receipt: Receipt, transaction: Transaction):
|
def _execute_action(self, action: str, results: Dict[str, Any], receipt: Receipt, transaction: Transaction):
|
||||||
if action == "auto_approve":
|
if action == "auto_approve":
|
||||||
|
|||||||
@@ -3,7 +3,13 @@ from dotenv import load_dotenv
|
|||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
# Read the Groq API key from the environment only (load_dotenv() above makes
# values from a local .env file visible to os.getenv).
# There is deliberately NO hard-coded fallback: a real key must never be
# committed to source control, and a non-empty fallback would make the
# validation below unreachable.
# NOTE(review): the key that was previously committed here should be treated
# as leaked and revoked/rotated in the Groq console.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "").strip()

# Validate API key: fail fast at import time when the key is missing or is
# still one of the documentation placeholders.
if not GROQ_API_KEY or GROQ_API_KEY in ("your_api_key_here", "your_actual_groq_api_key_here"):
    raise ValueError("GROQ_API_KEY environment variable is not set or invalid. Please set it in your .env file.")
|
||||||
|
|
||||||
CONFIDENCE_THRESHOLD = 0.3
|
CONFIDENCE_THRESHOLD = 0.3
|
||||||
DATE_TOLERANCE_DAYS = 7
|
DATE_TOLERANCE_DAYS = 7
|
||||||
AMOUNT_TOLERANCE_PERCENT = 0.05
|
AMOUNT_TOLERANCE_PERCENT = 0.05
|
||||||
@@ -1,82 +0,0 @@
|
|||||||
import csv
|
|
||||||
from dateutil import parser
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
# Config values
|
|
||||||
DATE_TOLERANCE_DAYS = 7
|
|
||||||
AMOUNT_TOLERANCE_PERCENT = 0.05
|
|
||||||
CONFIDENCE_THRESHOLD = 0.8
|
|
||||||
|
|
||||||
# Receipt data
|
|
||||||
receipt_date = datetime(2025, 2, 7)
|
|
||||||
receipt_amount = 1412.5
|
|
||||||
receipt_vendor = "Ajai Srivastava CPA, Accounting Services & Taxes"
|
|
||||||
|
|
||||||
print("=== DEBUGGING AJAI RECEIPT MATCH ===")
|
|
||||||
print(f"Receipt Date: {receipt_date}")
|
|
||||||
print(f"Receipt Amount: ${receipt_amount}")
|
|
||||||
print(f"Receipt Vendor: {receipt_vendor}")
|
|
||||||
print(f"Date Tolerance: {DATE_TOLERANCE_DAYS} days")
|
|
||||||
print(f"Amount Tolerance: {AMOUNT_TOLERANCE_PERCENT * 100}%")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Check CSV transaction
|
|
||||||
csv_transaction = {
|
|
||||||
"date": "2/18/2025",
|
|
||||||
"amount": -1412.5,
|
|
||||||
"vendor": "Ajai Srivastava"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Parse CSV date
|
|
||||||
csv_date = parser.parse(csv_transaction["date"])
|
|
||||||
csv_amount = csv_transaction["amount"]
|
|
||||||
csv_vendor = csv_transaction["vendor"]
|
|
||||||
|
|
||||||
print("=== CSV TRANSACTION ===")
|
|
||||||
print(f"CSV Date: {csv_date}")
|
|
||||||
print(f"CSV Amount: ${csv_amount}")
|
|
||||||
print(f"CSV Vendor: {csv_vendor}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Check date tolerance
|
|
||||||
date_diff = abs((receipt_date - csv_date).days)
|
|
||||||
date_match = date_diff <= DATE_TOLERANCE_DAYS
|
|
||||||
|
|
||||||
print("=== DATE CHECK ===")
|
|
||||||
print(f"Date Difference: {date_diff} days")
|
|
||||||
print(f"Date Match: {date_match}")
|
|
||||||
print(f"Tolerance: {DATE_TOLERANCE_DAYS} days")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Check amount tolerance
|
|
||||||
amount_tolerance = receipt_amount * AMOUNT_TOLERANCE_PERCENT
|
|
||||||
amount_diff = abs(receipt_amount - abs(csv_amount)) # Use absolute value for negative amounts
|
|
||||||
amount_match = amount_diff <= amount_tolerance
|
|
||||||
|
|
||||||
print("=== AMOUNT CHECK ===")
|
|
||||||
print(f"Receipt Amount: ${receipt_amount}")
|
|
||||||
print(f"CSV Amount (abs): ${abs(csv_amount)}")
|
|
||||||
print(f"Amount Difference: ${amount_diff}")
|
|
||||||
print(f"Amount Tolerance: ${amount_tolerance}")
|
|
||||||
print(f"Amount Match: {amount_match}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Check vendor similarity
|
|
||||||
vendor_similarity = "Ajai Srivastava" in receipt_vendor
|
|
||||||
print("=== VENDOR CHECK ===")
|
|
||||||
print(f"Receipt Vendor: {receipt_vendor}")
|
|
||||||
print(f"CSV Vendor: {csv_vendor}")
|
|
||||||
print(f"Vendor Similarity: {vendor_similarity}")
|
|
||||||
print()
|
|
||||||
|
|
||||||
# Overall result
|
|
||||||
print("=== RESULT ===")
|
|
||||||
if date_match and amount_match:
|
|
||||||
print("✅ Transaction would pass initial filtering")
|
|
||||||
print("Would proceed to AI matching stage")
|
|
||||||
else:
|
|
||||||
print("❌ Transaction filtered out before AI matching")
|
|
||||||
if not date_match:
|
|
||||||
print(f" - Date difference ({date_diff} days) > tolerance ({DATE_TOLERANCE_DAYS} days)")
|
|
||||||
if not amount_match:
|
|
||||||
print(f" - Amount difference (${amount_diff}) > tolerance (${amount_tolerance})")
|
|
||||||
+135
-21
@@ -8,6 +8,9 @@ import config
|
|||||||
import os
|
import os
|
||||||
import aiofiles
|
import aiofiles
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
class DocumentProcessor:
|
class DocumentProcessor:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@@ -160,27 +163,127 @@ class DocumentProcessor:
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Find JSON in response
|
# Find JSON in response - try multiple patterns
|
||||||
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
|
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
|
||||||
if json_match:
|
if json_match:
|
||||||
json_str = json_match.group()
|
json_str = json_match.group()
|
||||||
|
|
||||||
|
# Clean up common JSON issues
|
||||||
|
json_str = re.sub(r',\s*([}\]])', r'\1', json_str) # Remove trailing commas
|
||||||
|
json_str = re.sub(r'([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:', r'\1"\2":', json_str) # Quote unquoted keys
|
||||||
|
|
||||||
|
try:
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
# Try to fix common JSON issues
|
||||||
|
logger.warning(f"Initial JSON parsing failed: {e}")
|
||||||
|
|
||||||
|
# Try to extract individual fields using regex
|
||||||
|
vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
|
||||||
|
total_amount_match = re.search(r'"total_amount"\s*:\s*([0-9.]+)', json_str)
|
||||||
|
tax_amount_match = re.search(r'"tax_amount"\s*:\s*([0-9.]+)', json_str)
|
||||||
|
date_match = re.search(r'"date"\s*:\s*"([^"]*)"', json_str)
|
||||||
|
category_match = re.search(r'"category"\s*:\s*"([^"]*)"', json_str)
|
||||||
|
confidence_match = re.search(r'"confidence"\s*:\s*([0-9.]+)', json_str)
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"vendor": vendor_match.group(1) if vendor_match else "",
|
||||||
|
"total_amount": float(total_amount_match.group(1)) if total_amount_match else 0.0,
|
||||||
|
"tax_amount": float(tax_amount_match.group(1)) if tax_amount_match else 0.0,
|
||||||
|
"date": date_match.group(1) if date_match else "",
|
||||||
|
"category": category_match.group(1) if category_match else "Other",
|
||||||
|
"confidence": float(confidence_match.group(1)) if confidence_match else 0.5
|
||||||
|
}
|
||||||
|
|
||||||
# Validate and clean data
|
# Validate and clean data
|
||||||
return {
|
return {
|
||||||
"vendor": data.get("vendor", "").strip(),
|
"vendor": str(data.get("vendor", "")).strip(),
|
||||||
"total_amount": float(data.get("total_amount", 0)),
|
"total_amount": float(data.get("total_amount", 0)),
|
||||||
"tax_amount": float(data.get("tax_amount", 0)),
|
"tax_amount": float(data.get("tax_amount", 0)),
|
||||||
"date": data.get("date", ""),
|
"date": str(data.get("date", "")).strip(),
|
||||||
"category": data.get("category", "Other"),
|
"category": str(data.get("category", "Other")).strip(),
|
||||||
"confidence": float(data.get("confidence", 0.5)),
|
"confidence": float(data.get("confidence", 0.5)),
|
||||||
"extraction_success": True
|
"extraction_success": True
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
return {"error": "Could not parse JSON from AI response"}
|
# Try to extract fields from plain text
|
||||||
|
logger.warning("No JSON found in response, attempting text extraction")
|
||||||
|
return self._extract_from_plain_text(result_text)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"error": f"JSON parsing error: {str(e)}"}
|
logger.error(f"JSON parsing error: {str(e)}")
|
||||||
|
return {"error": f"JSON parsing error: {str(e)}", "extraction_success": False}
|
||||||
|
|
||||||
|
def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
|
||||||
|
"""Extract receipt data from plain text when JSON parsing fails"""
|
||||||
|
try:
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Extract vendor (look for common patterns)
|
||||||
|
vendor_patterns = [
|
||||||
|
r'(?:vendor|store|merchant|company)\s*[:\-]?\s*([A-Za-z0-9\s&.,]+)',
|
||||||
|
r'([A-Z][A-Za-z0-9\s&.,]{3,30})', # Capitalized words
|
||||||
|
]
|
||||||
|
|
||||||
|
vendor = ""
|
||||||
|
for pattern in vendor_patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
vendor = match.group(1).strip()
|
||||||
|
break
|
||||||
|
|
||||||
|
# Extract amount (look for currency patterns)
|
||||||
|
amount_patterns = [
|
||||||
|
r'\$?\s*([0-9,]+\.?[0-9]*)',
|
||||||
|
r'(?:total|amount|sum)\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]*)',
|
||||||
|
]
|
||||||
|
|
||||||
|
total_amount = 0.0
|
||||||
|
for pattern in amount_patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
try:
|
||||||
|
total_amount = float(match.group(1).replace(',', ''))
|
||||||
|
break
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Extract date
|
||||||
|
date_patterns = [
|
||||||
|
r'(\d{4}-\d{2}-\d{2})',
|
||||||
|
r'(\d{1,2}/\d{1,2}/\d{2,4})',
|
||||||
|
r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}',
|
||||||
|
]
|
||||||
|
|
||||||
|
date = ""
|
||||||
|
for pattern in date_patterns:
|
||||||
|
match = re.search(pattern, text, re.IGNORECASE)
|
||||||
|
if match:
|
||||||
|
date = match.group(0)
|
||||||
|
break
|
||||||
|
|
||||||
|
return {
|
||||||
|
"vendor": vendor or "Unknown",
|
||||||
|
"total_amount": total_amount,
|
||||||
|
"tax_amount": 0.0,
|
||||||
|
"date": date or "",
|
||||||
|
"category": "Other",
|
||||||
|
"confidence": 0.3, # Low confidence for text extraction
|
||||||
|
"extraction_success": True
|
||||||
|
}
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Text extraction error: {str(e)}")
|
||||||
|
return {
|
||||||
|
"vendor": "Unknown",
|
||||||
|
"total_amount": 0.0,
|
||||||
|
"tax_amount": 0.0,
|
||||||
|
"date": "",
|
||||||
|
"category": "Other",
|
||||||
|
"confidence": 0.1,
|
||||||
|
"extraction_success": False,
|
||||||
|
"error": f"Text extraction failed: {str(e)}"
|
||||||
|
}
|
||||||
|
|
||||||
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
|
async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
|
||||||
"""Save uploaded file to temporary storage"""
|
"""Save uploaded file to temporary storage"""
|
||||||
@@ -287,19 +390,37 @@ class DocumentProcessor:
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
# Find JSON in response
|
# Find the first '{' and last '}'
|
||||||
json_match = re.search(r'\{.*\}', result_text, re.DOTALL)
|
start = result_text.find('{')
|
||||||
if json_match:
|
end = result_text.rfind('}')
|
||||||
json_str = json_match.group()
|
if start == -1 or end == -1 or end <= start:
|
||||||
|
return {
|
||||||
|
"extraction_success": False,
|
||||||
|
"error": "Could not find JSON object in AI response",
|
||||||
|
"transactions": []
|
||||||
|
}
|
||||||
|
json_str = result_text[start:end+1]
|
||||||
|
|
||||||
|
# Remove trailing commas before } or ]
|
||||||
|
json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
|
||||||
|
|
||||||
|
try:
|
||||||
data = json.loads(json_str)
|
data = json.loads(json_str)
|
||||||
|
except Exception as e:
|
||||||
|
import logging
|
||||||
|
logging.error(f"JSON parsing error: {str(e)}")
|
||||||
|
logging.error(f"Offending JSON string:\n{json_str}")
|
||||||
|
return {
|
||||||
|
"extraction_success": False,
|
||||||
|
"error": f"JSON parsing error: {str(e)}",
|
||||||
|
"transactions": []
|
||||||
|
}
|
||||||
|
|
||||||
# Validate and clean data
|
# Validate and clean data
|
||||||
transactions = data.get("transactions", [])
|
transactions = data.get("transactions", [])
|
||||||
cleaned_transactions = []
|
cleaned_transactions = []
|
||||||
|
|
||||||
for txn in transactions:
|
for txn in transactions:
|
||||||
try:
|
try:
|
||||||
# Clean and validate each transaction
|
|
||||||
cleaned_txn = {
|
cleaned_txn = {
|
||||||
"date": str(txn.get("date", "")).strip(),
|
"date": str(txn.get("date", "")).strip(),
|
||||||
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
|
"amount": float(str(txn.get("amount", 0)).replace('$', '').replace(',', '')),
|
||||||
@@ -308,22 +429,15 @@ class DocumentProcessor:
|
|||||||
}
|
}
|
||||||
cleaned_transactions.append(cleaned_txn)
|
cleaned_transactions.append(cleaned_txn)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Skip invalid transactions
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"extraction_success": data.get("extraction_success", True),
|
"extraction_success": data.get("extraction_success", True),
|
||||||
"transactions": cleaned_transactions,
|
"transactions": cleaned_transactions,
|
||||||
"total_transactions": len(cleaned_transactions)
|
"total_transactions": len(cleaned_transactions)
|
||||||
}
|
}
|
||||||
else:
|
|
||||||
return {
|
|
||||||
"extraction_success": False,
|
|
||||||
"error": "Could not parse JSON from AI response",
|
|
||||||
"transactions": []
|
|
||||||
}
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
import logging
|
||||||
|
logging.error(f"JSON parsing error (outer): {str(e)}")
|
||||||
return {
|
return {
|
||||||
"extraction_success": False,
|
"extraction_success": False,
|
||||||
"error": f"JSON parsing error: {str(e)}",
|
"error": f"JSON parsing error: {str(e)}",
|
||||||
|
|||||||
@@ -1,49 +0,0 @@
|
|||||||
import json
|
|
||||||
import requests
|
|
||||||
import csv
|
|
||||||
from dateutil import parser
|
|
||||||
|
|
||||||
# Prepare transactions
|
|
||||||
transactions = []
|
|
||||||
with open("chequing statement.csv", newline="") as f:
|
|
||||||
reader = csv.DictReader(f)
|
|
||||||
idx = 1
|
|
||||||
for row in reader:
|
|
||||||
try:
|
|
||||||
txn_id = f"{row['Account Number']}_{idx}"
|
|
||||||
txn_date = parser.parse(row["Transaction Date"]).isoformat()
|
|
||||||
amount = float(row["Amount"].replace(",", "").strip())
|
|
||||||
vendor = row["Description 2"].strip()
|
|
||||||
notes = f"{row['Account Type']} {row['Cheque Number']} {row['Description 1']}".strip()
|
|
||||||
transactions.append({
|
|
||||||
"id": txn_id,
|
|
||||||
"transaction_date": txn_date,
|
|
||||||
"amount": amount,
|
|
||||||
"vendor": vendor,
|
|
||||||
"notes": notes
|
|
||||||
})
|
|
||||||
idx += 1
|
|
||||||
except Exception as e:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Receipt data for Ajai Invoice (3).jpg
|
|
||||||
receipt = {
|
|
||||||
"id": "33754868-bff5-4caf-9ece-cfd63f4e52d9",
|
|
||||||
"file_name": "Ajai Invoice (3).jpg",
|
|
||||||
"upload_date": "2025-07-02T15:31:23.641315",
|
|
||||||
"receipt_date": "2025-02-07T00:00:00",
|
|
||||||
"amount": 1412.5,
|
|
||||||
"tax": 162.5,
|
|
||||||
"vendor": "Ajai Srivastava CPA, Accounting Services & Taxes",
|
|
||||||
"category": "Office"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Build request
|
|
||||||
data = {
|
|
||||||
"receipts": [receipt],
|
|
||||||
"transactions": transactions
|
|
||||||
}
|
|
||||||
|
|
||||||
# Post to /match
|
|
||||||
response = requests.post("http://localhost:8000/match", json=data)
|
|
||||||
print(json.dumps(response.json(), indent=2))
|
|
||||||
Reference in New Issue
Block a user