"""HTTP API for the AI Bookkeeper data-science engine.

Exposes FastAPI endpoints to import bank transactions (CSV or image), upload
and process receipt documents, match receipts against imported transactions,
and manage the matching rules. All state is kept in module-level, in-memory
containers and is lost when the process exits.
"""

import csv
import io
import logging
import uuid
from datetime import datetime
from typing import Any, Dict, List, Optional

from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware

from ai_rules import AIRule
from api_models import (
    DocumentProcessResponse,
    DocumentUploadResponse,
    MatchingResponse,
    MatchResponse,
    MatchSpecificRequest,
    RuleRequest,
)
from document_processor import DocumentProcessor
from matching_engine import MatchingEngine
from models import Receipt, Transaction

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="AI Bookkeeper - Data Science Engine",
    description="AI-powered receipt-to-transaction matching engine. Receives transaction data and provides intelligent matching capabilities.",
    version="1.0.0",
)

# CORS middleware
# NOTE(review): wildcard origins together with allow_credentials=True is very
# permissive; consider restricting allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize DS Engine components
matching_engine = MatchingEngine()
document_processor = DocumentProcessor()

# In-memory storage for uploaded files (in production, use a database)
uploaded_files: Dict[str, Dict[str, Any]] = {}

# Store imported transactions globally for easy access
stored_transactions: List[Dict[str, Any]] = []
processed_receipts: Dict[str, Any] = {}

# File extensions accepted by the upload / image-import endpoints. Kept as a
# list because it is interpolated verbatim into client-facing error details.
ALLOWED_EXTENSIONS = ["jpg", "jpeg", "png", "gif", "bmp", "pdf"]

# Date layouts tried, in order, for the CSV "Transaction Date" column.
_CSV_DATE_FORMATS = ("%m/%d/%y", "%m/%d/%Y")


def _file_extension(filename: Optional[str]) -> str:
    """Return the lower-cased text after the last dot of *filename*.

    A missing filename yields an empty string, which is never an allowed
    extension, so callers reject it through the normal validation path.
    """
    return (filename or "").split(".")[-1].lower()


def _to_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to float, returning *default* when it is not numeric."""
    try:
        return float(value)
    except (ValueError, TypeError):
        return default


@app.get("/")
async def root():
    """Health check endpoint"""
    return {
        "message": "AI Bookkeeper Data Science Engine is running",
        "version": "1.0.0",
        "status": "healthy",
    }


# ============================================================================
# TRANSACTION IMPORT ENDPOINTS
# ============================================================================


def _parse_csv_row(row: Dict[Any, Any], row_number: int) -> Dict[str, Any]:
    """Convert one ``csv.DictReader`` row into a stored-transaction dict.

    Args:
        row: Mapping of header name to cell text as produced by DictReader.
        row_number: 1-based position of the row, used to build the id.

    Returns:
        Dict with keys ``id``, ``txn_date`` (YYYY-MM-DD), ``amount``,
        ``payee_name`` and ``memo``.

    Raises:
        ValueError: If a required column is absent, the date matches none of
            the supported formats, or the amount is not numeric.
    """
    # Strip whitespace around header names so "Amount " matches "Amount".
    # DictReader collects surplus cells under a None key; those are skipped.
    fields = {
        key.strip(): value for key, value in row.items() if isinstance(key, str)
    }

    def required(column: str) -> str:
        value = fields.get(column)
        if value is None:
            raise ValueError(f"Missing value for column '{column}'")
        return value

    def optional(column: str) -> str:
        return (fields.get(column) or "").strip()

    account_number = fields.get("Account Number")
    txn_date_str = required("Transaction Date").strip()
    amount_raw = required("Amount")
    payee_name = required("Description 2")
    memo = (
        f"{optional('Account Type')} {optional('Cheque Number')} "
        f"{optional('Description 1')}"
    ).strip()

    # Parse date (try multiple formats)
    txn_date = None
    for fmt in _CSV_DATE_FORMATS:
        try:
            txn_date = datetime.strptime(txn_date_str, fmt).strftime("%Y-%m-%d")
            break
        except ValueError:
            continue
    if not txn_date:
        raise ValueError(f"Could not parse date: {txn_date_str}")

    # Parse amount (thousands separators are removed first)
    amount = float(amount_raw.replace(",", "").strip())

    return {
        "id": f"{account_number}_{row_number}",
        "txn_date": txn_date,
        "amount": amount,
        "payee_name": payee_name.strip(),
        "memo": memo,
    }


@app.post("/transactions/import/csv")
async def import_transactions_csv(
    file: UploadFile = File(...),
    categorization_id: str = Form(...),
    user_id: str = Form(...),
):
    """
    Import transactions from a CSV file (custom bank export format).

    Rows that cannot be parsed are skipped and reported in ``errors``; the
    successfully parsed rows replace the previously imported transactions.
    """
    global stored_transactions
    try:
        content = await file.read()
        try:
            # utf-8-sig drops a leading BOM, which would otherwise become
            # part of the first header name and hide that column.
            decoded = content.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            raise HTTPException(
                status_code=400, detail=f"File is not valid UTF-8: {e}"
            )

        reader = csv.DictReader(io.StringIO(decoded))
        transactions = []
        errors = []
        for idx, row in enumerate(reader):
            try:
                transactions.append(_parse_csv_row(row, idx + 1))
            except Exception as e:
                # Best effort: one bad row must not abort the whole import.
                errors.append(f"Row {idx + 1}: {str(e)}")

        # Store transactions globally for auto-matching
        stored_transactions = transactions
        return {
            "imported_count": len(transactions),
            "converted_transactions": transactions,
            "errors": errors,
            "categorization_id": categorization_id,
            "user_id": user_id,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.exception("Error importing transactions from CSV")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/transactions/import/image")
async def import_transactions_from_image(file: UploadFile = File(...)):
    """
    Import transactions from an image (bank statement, credit card statement, etc.) using AI extraction.

    Entries that cannot be converted are skipped and reported in ``errors``;
    the converted entries replace the previously imported transactions.
    """
    global stored_transactions
    try:
        # Validate file type
        file_extension = _file_extension(file.filename)
        if file_extension not in ALLOWED_EXTENSIONS:
            raise HTTPException(
                status_code=400,
                detail=f"Unsupported file type. Allowed: {ALLOWED_EXTENSIONS}",
            )

        # Read file content
        content = await file.read()

        # Save file to disk
        image_path = await document_processor.save_uploaded_file(
            content, file.filename
        )

        # Extract transactions from image (pass file path)
        extraction_result = await document_processor.extract_transactions_from_image(
            image_path
        )
        if not extraction_result.get("extraction_success", False):
            raise HTTPException(
                status_code=500,
                detail=extraction_result.get("error", "Extraction failed"),
            )
        extracted_transactions = extraction_result.get("transactions", [])

        transactions = []
        errors = []
        for idx, txn in enumerate(extracted_transactions):
            try:
                txn_date_raw = txn.get("date")
                # Parse date to YYYY-MM-DD format
                txn_date = document_processor._parse_date_to_iso(txn_date_raw)
                if not txn_date:
                    if not txn_date_raw:
                        raise ValueError("Missing transaction date")
                    # Fallback: assume the raw value lacks only the year and
                    # prefix the current one.
                    txn_date = f"{datetime.now().year}-{txn_date_raw}"
                transactions.append(
                    {
                        "id": f"img_{file.filename}_{idx + 1}",
                        "txn_date": txn_date,
                        "amount": txn.get("amount"),
                        "payee_name": txn.get("vendor"),
                        "memo": txn.get("memo", ""),
                    }
                )
            except Exception as e:
                # Best effort: skip the entry but tell the caller why.
                errors.append(f"Transaction {idx + 1}: {str(e)}")

        # Store transactions globally for auto-matching
        stored_transactions = transactions
        return {
            "imported_count": len(transactions),
            "converted_transactions": transactions,
            "errors": errors,
        }
    except HTTPException:
        # Preserve deliberate status codes (e.g. 400 for a bad file type).
        raise
    except Exception as e:
        logger.exception(f"Error importing transactions from image: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


# ============================================================================
# DOCUMENT PROCESSING ENDPOINTS
# ============================================================================


@app.post("/upload-multiple", response_model=List[DocumentUploadResponse])
async def upload_multiple_documents(files: List[UploadFile] = File(...)):
    """
    Upload multiple receipt images for processing.

    This endpoint accepts multiple image files and returns file IDs that can be used with the /process/{file_id} endpoint.
    """
    try:
        # Validate every file before storing any, so a rejected batch does
        # not leave orphaned entries whose IDs the client never received.
        extensions = []
        for file in files:
            file_extension = _file_extension(file.filename)
            if file_extension not in ALLOWED_EXTENSIONS:
                raise HTTPException(
                    status_code=400,
                    detail=f"Unsupported file type for {file.filename}. Allowed: {ALLOWED_EXTENSIONS}",
                )
            extensions.append(file_extension)

        responses = []
        for file, file_extension in zip(files, extensions):
            # Generate unique file ID
            file_id = str(uuid.uuid4())

            # Read and store file content
            content = await file.read()
            uploaded_at = datetime.now()
            uploaded_files[file_id] = {
                "filename": file.filename,
                "content": content,
                "upload_date": uploaded_at,
            }
            responses.append(
                DocumentUploadResponse(
                    file_id=file_id,
                    filename=file.filename,
                    file_type=file_extension,
                    upload_date=uploaded_at,
                    status="uploaded",
                )
            )
        return responses
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Error uploading documents: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/process/{file_id}", response_model=DocumentProcessResponse)
async def process_document(file_id: str):
    """
    Process a previously uploaded document to extract receipt information.

    This endpoint uses AI to extract structured data from receipt images, including vendor, amount, date, and category information.
    """
    try:
        # Check if file exists
        if file_id not in uploaded_files:
            raise HTTPException(status_code=404, detail=f"File {file_id} not found")

        file_data = uploaded_files[file_id]

        # Save file temporarily and process it
        file_path = await document_processor.save_uploaded_file(
            file_data["content"], file_data["filename"]
        )
        file_type = _file_extension(file_data["filename"])
        receipt_data = await document_processor.process_file(file_path, file_type)

        # Store processed receipt
        processed_receipts[file_id] = receipt_data

        return DocumentProcessResponse(
            file_id=file_id,
            extraction_success=receipt_data.get("extraction_success", False),
            vendor=receipt_data.get("vendor", ""),
            description=receipt_data.get("description", ""),
            total_amount=receipt_data.get("total_amount", 0.0),
            tax_amount=receipt_data.get("tax_amount", 0.0),
            date=receipt_data.get("date", ""),
            category=receipt_data.get("category", ""),
            confidence=receipt_data.get("confidence", 0.0),
            error=receipt_data.get("error", None),
        )
    except HTTPException:
        # Keep the 404 for unknown file IDs instead of masking it as a 500.
        raise
    except Exception as e:
        logger.exception(f"Error processing document {file_id}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


# ============================================================================
# MATCHING ENDPOINTS
# ============================================================================


def _stored_to_transactions(records: List[Dict[str, Any]]) -> List[Transaction]:
    """Build Transaction objects from stored dicts, skipping invalid ones."""
    transactions = []
    for txn in records:
        try:
            transactions.append(
                Transaction(
                    id=txn["id"],
                    transaction_date=datetime.strptime(txn["txn_date"], "%Y-%m-%d"),
                    amount=txn["amount"],
                    vendor=txn["payee_name"],
                    notes=txn["memo"],
                )
            )
        except Exception as e:
            # .get: the record may be the very thing missing its "id" key.
            logger.warning(f"Error converting transaction {txn.get('id')}: {str(e)}")
    return transactions


def _build_receipt(file_id: str, receipt_data: Any) -> Receipt:
    """Build a Receipt from extracted data, defaulting missing fields.

    Raises whatever the date parsing, the ``uploaded_files`` lookup or the
    ``Receipt`` constructor raise; the caller reports that per file.
    """
    # Handle missing date field
    if not receipt_data.get("date"):
        logger.warning(f"Missing date for receipt {file_id}, using current date")
        receipt_date = datetime.now()
    else:
        receipt_date = datetime.strptime(receipt_data["date"], "%Y-%m-%d")

    # Handle missing amount field - try multiple possible keys
    amount = None
    for key in ("amount", "total_amount", "amount_total"):
        amount = receipt_data.get(key)
        if amount is not None:
            break
    if amount is None:
        logger.warning(f"Missing amount for receipt {file_id}, using 0.0")
        amount = 0.0
    try:
        amount = float(amount)
    except (ValueError, TypeError):
        logger.warning(f"Invalid amount '{amount}' for receipt {file_id}, using 0.0")
        amount = 0.0

    # Handle missing vendor field
    vendor = receipt_data.get("vendor", "")
    if not vendor:
        logger.warning(f"Missing vendor for receipt {file_id}, using 'Unknown'")
        vendor = "Unknown"

    upload = uploaded_files[file_id]
    return Receipt(
        id=file_id,
        file_name=upload["filename"],
        upload_date=upload["upload_date"],
        receipt_date=receipt_date,
        amount=amount,
        tax=_to_float(receipt_data.get("tax", receipt_data.get("tax_amount", 0.0))),
        vendor=vendor,
        category=receipt_data.get("category", "Other"),
        description=receipt_data.get("description", ""),
    )


def _match_stats(match_responses: List[MatchResponse]) -> Dict[str, Any]:
    """Summarise matches: total, high/low confidence counts, average score."""
    if match_responses:
        high_confidence = sum(1 for m in match_responses if m.confidence_score >= 0.8)
        low_confidence = len(match_responses) - high_confidence
        avg_score = sum(m.confidence_score for m in match_responses) / len(
            match_responses
        )
    else:
        high_confidence = low_confidence = avg_score = 0
    return {
        "total": len(match_responses),
        "high_confidence": high_confidence,
        "low_confidence": low_confidence,
        "avg_score": round(avg_score, 2),
    }


@app.post("/match-specific", response_model=MatchingResponse)
async def match_specific_receipts(request: MatchSpecificRequest):
    """
    Match specific receipts against imported transactions.

    This endpoint takes a request with receipt file IDs and categorization ID, and matches them against the currently imported transactions using AI-powered matching logic.
    """
    try:
        file_ids = request.file_ids
        categorization_id = request.categorization_id
        logger.info(
            f"Starting match-specific for file IDs: {file_ids}, categorization_id: {categorization_id}"
        )

        # Check if transactions are imported
        if not stored_transactions:
            logger.warning("No transactions imported")
            raise HTTPException(
                status_code=400,
                detail="No transactions imported. Please upload CSV first.",
            )
        logger.info(f"Found {len(stored_transactions)} stored transactions")

        # Convert stored transactions to Transaction objects
        transactions = _stored_to_transactions(stored_transactions)
        logger.info(f"Converted {len(transactions)} transactions")

        # Get receipts for the specified file IDs
        receipts = []
        missing_files = []
        for file_id in file_ids:
            if file_id not in processed_receipts:
                logger.warning(f"Receipt {file_id} not found in processed_receipts")
                missing_files.append(f"{file_id} (not found)")
                continue
            receipt_data = processed_receipts[file_id]
            # Full receipt payloads are diagnostic detail, so keep them out
            # of the INFO log.
            logger.debug(f"receipt_data for {file_id}: {receipt_data}")
            try:
                receipt = _build_receipt(file_id, receipt_data)
            except Exception as e:
                logger.warning(f"Error creating receipt object for {file_id}: {str(e)}")
                missing_files.append(f"{file_id} (error: {str(e)})")
                continue
            receipts.append(receipt)
            logger.info(f"Added receipt: {receipt.vendor} - ${receipt.amount}")

        if missing_files:
            logger.error(f"Missing files: {missing_files}")
            raise HTTPException(
                status_code=400, detail=f"Missing files: {missing_files}"
            )

        logger.info(
            f"Processing {len(receipts)} receipts against {len(transactions)} transactions"
        )

        # Perform matching
        try:
            matches = matching_engine.process_matching(receipts, transactions)
            logger.info(
                f"Matching completed successfully. Found {len(matches)} matches"
            )

            # Convert matches to response format
            match_responses = []
            for match in matches:
                logger.debug(f"Raw match object: {match}")
                match_responses.append(
                    MatchResponse(
                        receipt_id=match.receipt.id,
                        transaction_id=match.transaction.id,
                        confidence_score=match.confidence_score,
                        match_reason=match.match_reason,
                        receipt_vendor=match.receipt.vendor,
                        receipt_amount=match.receipt.amount,
                        receipt_description=match.receipt.description,
                        receipt_category=match.receipt.category,
                        receipt_tax_amount=match.receipt.tax,
                        transaction_vendor=match.transaction.vendor,
                        transaction_amount=match.transaction.amount,
                    )
                )
                logger.info(
                    f"Successfully created MatchResponse for {match.receipt.vendor} -> {match.transaction.vendor}"
                )
            logger.info(f"Formatted {len(match_responses)} match responses")

            # Calculate statistics
            stats = _match_stats(match_responses)
            logger.info(f"Generated stats: {stats}")
            logger.info(
                f"Match-specific completed successfully with {len(match_responses)} matches"
            )
            return MatchingResponse(matches=match_responses, stats=stats)
        except Exception as e:
            # logger.exception records the full stack trace; formatting
            # e.__traceback__ would only print an object repr.
            logger.exception(f"Exception in matching section: {str(e)}")
            raise HTTPException(
                status_code=500, detail=f"Unexpected matching error: {str(e)}"
            )
    except HTTPException:
        raise
    except Exception as e:
        logger.exception(f"Unexpected error in match_specific_receipts: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


# ============================================================================
# RULES MANAGEMENT ENDPOINTS
# ============================================================================


@app.post("/rules")
async def add_rule(request: RuleRequest):
    """
    Add a new AI rule for transaction matching.
    """
    try:
        new_rule = AIRule(
            name=request.name,
            condition=request.condition,
            action=request.action,
            source=request.source,
        )
        matching_engine.rules_engine.rules.append(new_rule)
        return {"message": f"Rule '{request.name}' added successfully"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/rules")
async def get_rules():
    """
    Get all current AI rules.
    """
    try:
        rules = [
            {
                "name": rule.name,
                "condition": rule.condition,
                "action": rule.action,
                "source": rule.source,
                "status": rule.status,
            }
            for rule in matching_engine.rules_engine.rules
        ]
        return {"rules": rules}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.delete("/rules/{rule_name}")
async def delete_rule(rule_name: str):
    """
    Delete an AI rule by name.
    """
    try:
        rules = matching_engine.rules_engine.rules
        for i, rule in enumerate(rules):
            if rule.name == rule_name:
                # Returning immediately after the deletion keeps the
                # in-place removal safe while iterating.
                del rules[i]
                return {"message": f"Rule '{rule_name}' deleted successfully"}
        raise HTTPException(status_code=404, detail=f"Rule '{rule_name}' not found")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ============================================================================
# STATISTICS ENDPOINT
# ============================================================================


@app.get("/stats")
async def get_stats():
    """
    Get system statistics.
    """
    try:
        return {
            "total_transactions": len(stored_transactions),
            "total_receipts": len(processed_receipts),
            "total_uploaded_files": len(uploaded_files),
            "rules_count": len(matching_engine.rules_engine.rules),
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8343)