2025-10-05 11:29:45 +00:00
import base64
2025-10-09 19:56:22 +00:00
import json
2025-10-05 11:29:45 +00:00
import logging
import os
from datetime import datetime
from typing import Any , Dict
import aiofiles
import groq
import PyPDF2
from config import settings
logger = logging . getLogger ( __name__ )
class DocumentProcessor :
def __init__ ( self ) :
self . client = groq . Groq ( api_key = settings . GROQ_API_KEY )
self . model = " meta-llama/llama-4-scout-17b-16e-instruct " # Vision model
2025-10-09 19:56:22 +00:00
def _extract_first_json ( self , raw : str ) - > dict :
""" Extract the first valid JSON object from raw LLM output.
Handles cases where LLM returns extra text after/before the JSON.
"""
try :
# First try direct parsing (fastest path)
return json . loads ( raw )
except json . JSONDecodeError :
pass
# Find the first '{' and match closing '}'
start = raw . find ( " { " )
if start == - 1 :
raise ValueError ( " No JSON object found in LLM output " )
depth = 0
end = - 1
in_string = False
escape_next = False
for i in range ( start , len ( raw ) ) :
ch = raw [ i ]
# Handle string escaping
if escape_next :
escape_next = False
continue
if ch == " \\ " :
escape_next = True
continue
# Track if we're inside a string
if ch == ' " ' :
in_string = not in_string
continue
# Only count braces outside of strings
if not in_string :
if ch == " { " :
depth + = 1
elif ch == " } " :
depth - = 1
if depth == 0 :
end = i + 1
break
if end == - 1 :
raise ValueError ( " Unbalanced JSON braces in LLM output " )
json_str = raw [ start : end ]
return json . loads ( json_str )
2025-10-07 12:03:26 +01:00
async def process_file (
2025-10-08 00:12:09 +01:00
self ,
file_path : str ,
file_type : str ,
user_location : str = None ,
ai_rules : list = None ,
2025-10-07 12:03:26 +01:00
) - > Dict [ str , Any ] :
""" Process uploaded file and extract receipt data
Args:
file_path: Path to the file to process
file_type: Type of file (jpg, pdf, etc.)
user_location: User ' s location string in format " State/Province, Country " (e.g., " Ontario, Canada " )
2025-10-08 00:12:09 +01:00
ai_rules: List of AI rules for categorization (e.g., [ { " condition " : " vendor is Starbucks " , " action " : " Food " }])
2025-10-07 12:03:26 +01:00
"""
2025-10-05 11:29:45 +00:00
try :
if file_type . lower ( ) in [ " jpg " , " jpeg " , " png " , " gif " , " bmp " ] :
2025-10-08 00:12:09 +01:00
return await self . _process_image ( file_path , user_location , ai_rules )
2025-10-05 11:29:45 +00:00
elif file_type . lower ( ) == " pdf " :
2025-10-08 00:12:09 +01:00
return await self . _process_pdf ( file_path , user_location , ai_rules )
2025-10-05 11:29:45 +00:00
else :
raise ValueError ( f " Unsupported file type: { file_type } " )
except Exception as e :
return { " error " : str ( e ) }
2025-10-07 12:03:26 +01:00
async def _process_image (
2025-10-08 00:12:09 +01:00
self , image_path : str , user_location : str = None , ai_rules : list = None
2025-10-07 12:03:26 +01:00
) - > Dict [ str , Any ] :
""" Extract data from image using Groq vision
Args:
image_path: Path to the image file
user_location: User ' s location string in format " State/Province, Country " (e.g., " Ontario, Canada " )
2025-10-08 00:12:09 +01:00
ai_rules: List of AI rules for categorization
2025-10-07 12:03:26 +01:00
"""
2025-10-05 11:29:45 +00:00
try :
# Encode image to base64
base64_image = self . _encode_image ( image_path )
2025-10-07 12:03:26 +01:00
# Build user location context
user_location_context = " "
if user_location :
user_location_context = f """
USER LOCATION CONTEXT:
The user is located in { user_location } .
- If the receipt location is MISSING or UNCLEAR, use the user ' s location ( { user_location } ) for tax calculations.
- If the receipt clearly shows a different location, use the receipt ' s location instead.
- Apply depreciation rules based on the user ' s location.
"""
2025-10-08 00:12:09 +01:00
# Build AI rules context for categorization
ai_rules_context = " "
if ai_rules and len ( ai_rules ) > 0 :
ai_rules_context = " \n CATEGORIZATION RULES (IMPORTANT - Apply these first): "
for idx , rule in enumerate ( ai_rules , 1 ) :
condition = rule . get ( " condition " , " " )
action = rule . get ( " action " , " " )
ai_rules_context + = f " \n { idx } . If { condition } → set category to ' { action } ' "
ai_rules_context + = " \n - Apply these custom rules before using default categorization logic \n - If multiple rules match, use the first matching rule \n - If no rules match, use default categorization based on vendor type "
2025-10-05 11:29:45 +00:00
# Create Groq vision prompt
2025-10-07 12:03:26 +01:00
prompt = f """
2025-10-05 11:29:45 +00:00
Analyze this receipt image and extract the following information in JSON format:
2025-10-07 12:03:26 +01:00
{{
2025-10-05 11:29:45 +00:00
" vendor " : " Store/company name " ,
" description " : " Detailed description of items/services purchased " ,
" total_amount " : 0.00,
" tax_amount " : 0.00,
" date " : " YYYY-MM-DD " ,
" category " : " Food/Transport/Office/Other " ,
" confidence " : 0.95,
2025-10-07 11:15:26 +01:00
" currency " : " USD " ,
" location " : " Province/State, Country " ,
" calculated_tax " : 0.00,
" is_depreciable " : false,
2025-10-07 20:35:43 +01:00
" name_of_asset " : null,
2025-10-07 11:15:26 +01:00
" cca_rate " : null,
" useful_life " : null,
" residual_value " : null
2025-10-07 12:03:26 +01:00
}}
2025-10-05 11:29:45 +00:00
Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., " Coffee and sandwich " , " Gasoline " , " Office supplies " )
- Total amount should be the final total including tax
2025-10-07 11:15:26 +01:00
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
2025-10-05 11:29:45 +00:00
- Date should be the date on the receipt
- Confidence score 0-1 based on how clear the receipt is
2025-10-07 11:15:26 +01:00
- Currency should be the currency used on the receipt (e.g., " USD " , " EUR " , " CAD " )
2025-10-08 00:12:09 +01:00
{ ai_rules_context }
2025-10-07 12:03:26 +01:00
{ user_location_context }
2025-10-07 11:15:26 +01:00
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as " Province/State, Country " (e.g., " Ontario, Canada " or " California, USA " )
2025-10-07 12:03:26 +01:00
- If location not shown on receipt, return null for location (system will use user location as fallback)
2025-10-07 11:15:26 +01:00
2025-10-07 12:44:27 +01:00
TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows " Tax: $0 " , " Tax: $0.00 " , " Tax (0%) " , or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null
- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
2025-10-07 11:15:26 +01:00
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
2025-10-07 12:44:27 +01:00
* Store calculated tax in " calculated_tax " field and set tax_amount to the calculated value
2025-10-07 11:15:26 +01:00
DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
2025-10-07 20:35:43 +01:00
* name_of_asset: Specific name/model of the asset (e.g., " 2024 Honda Accord " , " Dell Laptop XPS 15 " , " Office Desk " )
2025-10-07 11:15:26 +01:00
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
2025-10-07 20:35:43 +01:00
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
2025-10-05 11:29:45 +00:00
2025-10-09 19:56:22 +00:00
CATEGORY RULES:
- Assign the category based on all the details in the receipt
2025-10-05 11:29:45 +00:00
Return only valid JSON.
"""
# Call Groq vision API with correct format
response = self . client . chat . completions . create (
messages = [
{
" role " : " user " ,
" content " : [
{ " type " : " text " , " text " : prompt } ,
{
" type " : " image_url " ,
" image_url " : {
" url " : f " data:image/jpeg;base64, { base64_image } " ,
} ,
} ,
] ,
}
] ,
model = self . model ,
2025-10-07 11:15:26 +01:00
max_tokens = 800 ,
2025-10-05 11:29:45 +00:00
temperature = 0.1 ,
)
# Parse response
result_text = response . choices [ 0 ] . message . content . strip ( )
return self . _parse_extraction_result ( result_text )
except Exception as e :
return { " error " : f " Image processing error: { str ( e ) } " }
def _encode_image ( self , image_path : str ) - > str :
""" Encode image to base64 string """
with open ( image_path , " rb " ) as image_file :
return base64 . b64encode ( image_file . read ( ) ) . decode ( " utf-8 " )
2025-10-07 12:03:26 +01:00
async def _process_pdf (
2025-10-08 00:12:09 +01:00
self , pdf_path : str , user_location : str = None , ai_rules : list = None
2025-10-07 12:03:26 +01:00
) - > Dict [ str , Any ] :
""" Extract data from PDF by converting to image first
Args:
pdf_path: Path to the PDF file
user_location: User ' s location string in format " State/Province, Country " (e.g., " Ontario, Canada " )
2025-10-08 00:12:09 +01:00
ai_rules: List of AI rules for categorization
2025-10-07 12:03:26 +01:00
"""
2025-10-05 11:29:45 +00:00
try :
# For now, extract text from PDF and process as text
text_content = self . _extract_text_from_pdf ( pdf_path )
2025-10-08 00:12:09 +01:00
return self . _process_text_content ( text_content , user_location , ai_rules )
2025-10-05 11:29:45 +00:00
except Exception as e :
return { " error " : f " PDF processing error: { str ( e ) } " }
def _extract_text_from_pdf ( self , pdf_path : str ) - > str :
""" Extract text from PDF """
try :
with open ( pdf_path , " rb " ) as file :
pdf_reader = PyPDF2 . PdfReader ( file )
text = " "
for page in pdf_reader . pages :
text + = page . extract_text ( ) + " \n "
return text
except Exception :
return " "
2025-10-07 12:03:26 +01:00
def _process_text_content (
2025-10-08 00:12:09 +01:00
self , text_content : str , user_location : str = None , ai_rules : list = None
2025-10-07 12:03:26 +01:00
) - > Dict [ str , Any ] :
""" Process text content using Groq (fallback for PDFs)
Args:
text_content: Extracted text from PDF
user_location: User ' s location string in format " State/Province, Country " (e.g., " Ontario, Canada " )
2025-10-08 00:12:09 +01:00
ai_rules: List of AI rules for categorization
2025-10-07 12:03:26 +01:00
"""
2025-10-05 11:29:45 +00:00
try :
2025-10-07 12:03:26 +01:00
# Build user location context
user_location_context = " "
if user_location :
user_location_context = f """
USER LOCATION CONTEXT:
The user is located in { user_location } .
- If the receipt location is MISSING or UNCLEAR, use the user ' s location ( { user_location } ) for tax calculations.
- If the receipt clearly shows a different location, use the receipt ' s location instead.
- Apply depreciation rules based on the user ' s location.
"""
2025-10-08 00:12:09 +01:00
# Build AI rules context for categorization
ai_rules_context = " "
if ai_rules and len ( ai_rules ) > 0 :
ai_rules_context = " \n CATEGORIZATION RULES (IMPORTANT - Apply these first): "
for idx , rule in enumerate ( ai_rules , 1 ) :
condition = rule . get ( " condition " , " " )
action = rule . get ( " action " , " " )
ai_rules_context + = f " \n { idx } . If { condition } → set category to ' { action } ' "
ai_rules_context + = " \n - Apply these custom rules before using default categorization logic \n - If multiple rules match, use the first matching rule \n - If no rules match, use default categorization based on vendor type "
2025-10-05 11:29:45 +00:00
prompt = f """
Analyze this receipt text and extract the following information in JSON format:
Receipt Text:
{ text_content }
Extract:
{{
" vendor " : " Store/company name " ,
" description " : " Detailed description of items/services purchased " ,
" total_amount " : 0.00,
" tax_amount " : 0.00,
" date " : " YYYY-MM-DD " ,
" category " : " Food/Transport/Office/Other " ,
" confidence " : 0.95,
2025-10-07 11:15:26 +01:00
" currency " : " USD " ,
" location " : " Province/State, Country " ,
" calculated_tax " : 0.00,
" is_depreciable " : false,
2025-10-07 20:35:43 +01:00
" name_of_asset " : null,
2025-10-07 11:15:26 +01:00
" cca_rate " : null,
" useful_life " : null,
" residual_value " : null
2025-10-05 11:29:45 +00:00
}}
Rules:
- Extract vendor name as it appears on receipt
- Extract description of items/services purchased (e.g., " Coffee and sandwich " , " Gasoline " , " Office supplies " )
- Total amount should be the final total including tax
2025-10-07 11:15:26 +01:00
- Tax amount is separate tax line if available (if not clearly shown, calculate based on location)
2025-10-05 11:29:45 +00:00
- Date should be the date on the receipt
- Confidence score 0-1 based on clarity
2025-10-07 11:15:26 +01:00
- Currency should be the currency used on the receipt (e.g., " USD " , " EUR " , " CAD " )
2025-10-08 00:12:09 +01:00
{ ai_rules_context }
2025-10-07 12:03:26 +01:00
{ user_location_context }
2025-10-07 11:15:26 +01:00
LOCATION & TAX RULES:
- Extract location from receipt (look for store address, province/state, country)
- Format location as " Province/State, Country " (e.g., " Ontario, Canada " or " California, USA " )
2025-10-07 12:03:26 +01:00
- If location not shown on receipt, return null for location (system will use user location as fallback)
2025-10-07 11:15:26 +01:00
2025-10-07 12:44:27 +01:00
TAX EXTRACTION RULES (IMPORTANT):
- If tax is EXPLICITLY shown on receipt (even if $0 or 0%), use that exact value:
* If receipt shows " Tax: $0 " , " Tax: $0.00 " , " Tax (0%) " , or similar → set tax_amount to 0.00 and calculated_tax to null
* If receipt shows any other tax amount → set tax_amount to that value and calculated_tax to null
- If tax_amount is NOT shown or UNCLEAR on receipt, calculate it based on location:
2025-10-07 11:15:26 +01:00
* Ontario, Canada: 13% HST
* Quebec, Canada: 9.975% QST + 5% GST = 14.975% total
* British Columbia, Canada: 12% (5% GST + 7% PST)
* Alberta, Canada: 5% GST
* California, USA: ~7.25% (varies by locality)
* New York, USA: ~8.875% (varies by locality)
* Texas, USA: 6.25%
* For other locations, estimate based on typical rates
2025-10-07 12:44:27 +01:00
* Store calculated tax in " calculated_tax " field and set tax_amount to the calculated value
2025-10-07 11:15:26 +01:00
DEPRECIATION RULES:
- Determine if item is a depreciable asset (vehicles, machinery, equipment, computers, furniture, buildings)
- Set is_depreciable to true only for capital assets, false for consumables/services
- If is_depreciable is true, provide:
2025-10-07 20:35:43 +01:00
* name_of_asset: Specific name/model of the asset (e.g., " 2024 Honda Accord " , " Dell Laptop XPS 15 " , " Office Desk " )
2025-10-07 11:15:26 +01:00
* cca_rate: CCA rate as decimal (e.g., 0.30 for 30%, 0.20 for 20%, 0.04 for 4%)
- Class 10 (Vehicles): 30%
- Class 8 (Furniture, equipment): 20%
- Class 50 (Computers, software): 55%
- Class 1 (Buildings): 4%
- Class 10.1 (Passenger vehicles >$30k): 30%
* useful_life: Expected years of use (e.g., 5 for computers, 8 for vehicles, 10 for furniture)
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
2025-10-07 20:35:43 +01:00
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
2025-10-05 11:29:45 +00:00
Return only valid JSON.
"""
response = self . client . chat . completions . create (
model = self . model ,
messages = [ { " role " : " user " , " content " : prompt } ] ,
2025-10-07 11:15:26 +01:00
max_tokens = 800 ,
2025-10-05 11:29:45 +00:00
temperature = 0.1 ,
)
result_text = response . choices [ 0 ] . message . content . strip ( )
return self . _parse_extraction_result ( result_text )
except Exception as e :
return { " error " : f " Text processing error: { str ( e ) } " }
def _parse_extraction_result ( self , result_text : str ) - > Dict [ str , Any ] :
""" Parse Groq response and extract JSON data """
try :
import re
2025-10-09 19:56:22 +00:00
# Try robust JSON extraction first (handles extra text)
try :
data = self . _extract_first_json ( result_text )
return data
except ( json . JSONDecodeError , ValueError ) as e :
logger . warning ( f " Robust JSON extraction failed: { e } . Trying fallback methods... " )
# Fallback: Find JSON in response - try multiple patterns
2025-10-05 11:29:45 +00:00
json_match = re . search ( r " \ { .* \ } " , result_text , re . DOTALL )
if json_match :
json_str = json_match . group ( )
# Clean up common JSON issues
json_str = re . sub (
r " , \ s*([} \ ]]) " , r " \ 1 " , json_str
) # Remove trailing commas
json_str = re . sub (
r " ([ { ,]) \ s*([a-zA-Z_][a-zA-Z0-9_]*) \ s*: " , r ' \ 1 " \ 2 " : ' , json_str
) # Quote unquoted keys
try :
data = json . loads ( json_str )
except json . JSONDecodeError as e :
# Try to fix common JSON issues
2025-10-09 19:56:22 +00:00
logger . warning ( f " Fallback JSON parsing also failed: { e } " )
2025-10-05 11:29:45 +00:00
# Try to extract individual fields using regex
vendor_match = re . search ( r ' " vendor " \ s*: \ s* " ([^ " ]*) " ' , json_str )
description_match = re . search (
r ' " description " \ s*: \ s* " ([^ " ]*) " ' , json_str
)
total_amount_match = re . search (
r ' " total_amount " \ s*: \ s*([0-9.]+) ' , json_str
)
tax_amount_match = re . search (
r ' " tax_amount " \ s*: \ s*([0-9.]+) ' , json_str
)
date_match = re . search ( r ' " date " \ s*: \ s* " ([^ " ]*) " ' , json_str )
category_match = re . search ( r ' " category " \ s*: \ s* " ([^ " ]*) " ' , json_str )
confidence_match = re . search (
r ' " confidence " \ s*: \ s*([0-9.]+) ' , json_str
)
2025-10-07 11:15:26 +01:00
currency_match = re . search ( r ' " currency " \ s*: \ s* " ([^ " ]*) " ' , json_str )
location_match = re . search ( r ' " location " \ s*: \ s* " ([^ " ]*) " ' , json_str )
calculated_tax_match = re . search (
r ' " calculated_tax " \ s*: \ s*([0-9.]+|null) ' , json_str
)
is_depreciable_match = re . search (
r ' " is_depreciable " \ s*: \ s*(true|false) ' , json_str
)
2025-10-07 20:35:43 +01:00
name_of_asset_match = re . search (
r ' " name_of_asset " \ s*: \ s* " ([^ " ]*) " ' , json_str
)
2025-10-07 11:15:26 +01:00
cca_rate_match = re . search (
r ' " cca_rate " \ s*: \ s*([0-9.]+|null) ' , json_str
)
useful_life_match = re . search (
r ' " useful_life " \ s*: \ s*([0-9]+|null) ' , json_str
)
residual_value_match = re . search (
r ' " residual_value " \ s*: \ s*([0-9.]+|null) ' , json_str
2025-10-05 11:29:45 +00:00
)
data = {
" vendor " : vendor_match . group ( 1 ) if vendor_match else " " ,
" description " : description_match . group ( 1 )
if description_match
else " " ,
" total_amount " : float ( total_amount_match . group ( 1 ) )
if total_amount_match
else 0.0 ,
" tax_amount " : float ( tax_amount_match . group ( 1 ) )
if tax_amount_match
else 0.0 ,
" date " : date_match . group ( 1 ) if date_match else " " ,
" category " : category_match . group ( 1 )
if category_match
else " Other " ,
" confidence " : float ( confidence_match . group ( 1 ) )
if confidence_match
else 0.5 ,
2025-10-07 11:15:26 +01:00
" currency " : currency_match . group ( 1 )
if currency_match
else " CAD " ,
" location " : location_match . group ( 1 ) if location_match else None ,
" calculated_tax " : float ( calculated_tax_match . group ( 1 ) )
if calculated_tax_match
and calculated_tax_match . group ( 1 ) != " null "
else None ,
" is_depreciable " : is_depreciable_match . group ( 1 ) == " true "
if is_depreciable_match
else None ,
2025-10-07 20:35:43 +01:00
" name_of_asset " : name_of_asset_match . group ( 1 )
if name_of_asset_match
else None ,
2025-10-07 11:15:26 +01:00
" cca_rate " : float ( cca_rate_match . group ( 1 ) )
if cca_rate_match and cca_rate_match . group ( 1 ) != " null "
else None ,
" useful_life " : int ( useful_life_match . group ( 1 ) )
if useful_life_match and useful_life_match . group ( 1 ) != " null "
else None ,
" residual_value " : float ( residual_value_match . group ( 1 ) )
if residual_value_match
and residual_value_match . group ( 1 ) != " null "
else None ,
2025-10-05 11:29:45 +00:00
}
# Validate and clean data
return {
" vendor " : str ( data . get ( " vendor " , " " ) ) . strip ( ) ,
" description " : str ( data . get ( " description " , " " ) ) . strip ( ) ,
" total_amount " : float ( data . get ( " total_amount " , 0 ) ) ,
" tax_amount " : float ( data . get ( " tax_amount " , 0 ) ) ,
" date " : str ( data . get ( " date " , " " ) ) . strip ( ) ,
" category " : str ( data . get ( " category " , " Other " ) ) . strip ( ) ,
" confidence " : float ( data . get ( " confidence " , 0.5 ) ) ,
" extraction_success " : True ,
" currency " : data . get ( " currency " , " CAD " ) . strip ( ) ,
2025-10-07 11:15:26 +01:00
" location " : data . get ( " location " ) ,
" calculated_tax " : data . get ( " calculated_tax " ) ,
" is_depreciable " : data . get ( " is_depreciable " ) ,
2025-10-07 20:35:43 +01:00
" name_of_asset " : data . get ( " name_of_asset " ) ,
2025-10-07 11:15:26 +01:00
" cca_rate " : data . get ( " cca_rate " ) ,
" useful_life " : data . get ( " useful_life " ) ,
" residual_value " : data . get ( " residual_value " ) ,
2025-10-05 11:29:45 +00:00
}
else :
# Try to extract fields from plain text
logger . warning ( " No JSON found in response, attempting text extraction " )
return self . _extract_from_plain_text ( result_text )
except Exception as e :
logger . error ( f " JSON parsing error: { str ( e ) } " )
return {
" error " : f " JSON parsing error: { str ( e ) } " ,
" extraction_success " : False ,
}
def _extract_from_plain_text ( self , text : str ) - > Dict [ str , Any ] :
""" Extract receipt data from plain text when JSON parsing fails """
try :
import re
# Extract vendor (look for common patterns)
vendor_patterns = [
r " (?:vendor|store|merchant|company) \ s*[: \ -]? \ s*([A-Za-z0-9 \ s&.,]+) " ,
r " ([A-Z][A-Za-z0-9 \ s&.,] { 3,30}) " , # Capitalized words
]
vendor = " "
for pattern in vendor_patterns :
match = re . search ( pattern , text , re . IGNORECASE )
if match :
vendor = match . group ( 1 ) . strip ( )
break
# Extract amount (look for currency patterns)
amount_patterns = [
r " \ $? \ s*([0-9,]+ \ .?[0-9]*) " ,
r " (?:total|amount|sum) \ s*[: \ -]? \ s* \ $? \ s*([0-9,]+ \ .?[0-9]*) " ,
]
total_amount = 0.0
for pattern in amount_patterns :
match = re . search ( pattern , text , re . IGNORECASE )
if match :
try :
total_amount = float ( match . group ( 1 ) . replace ( " , " , " " ) )
break
except ValueError :
continue
# Extract date
date_patterns = [
r " ( \ d {4} - \ d {2} - \ d {2} ) " ,
r " ( \ d { 1,2}/ \ d { 1,2}/ \ d { 2,4}) " ,
r " (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \ s+ \ d { 1,2},? \ s+ \ d {4} " ,
]
date = " "
for pattern in date_patterns :
match = re . search ( pattern , text , re . IGNORECASE )
if match :
date = match . group ( 0 )
break
return {
" vendor " : vendor or " Unknown " ,
" total_amount " : total_amount ,
" tax_amount " : 0.0 ,
" date " : date or " " ,
" category " : " Other " ,
" confidence " : 0.3 , # Low confidence for text extraction
" extraction_success " : True ,
2025-10-07 11:15:26 +01:00
" location " : None ,
" calculated_tax " : None ,
" is_depreciable " : None ,
2025-10-07 20:35:43 +01:00
" name_of_asset " : None ,
2025-10-07 11:15:26 +01:00
" cca_rate " : None ,
" useful_life " : None ,
" residual_value " : None ,
2025-10-05 11:29:45 +00:00
}
except Exception as e :
logger . error ( f " Text extraction error: { str ( e ) } " )
return {
" vendor " : " Unknown " ,
" total_amount " : 0.0 ,
" tax_amount " : 0.0 ,
" date " : " " ,
" category " : " Other " ,
" confidence " : 0.1 ,
" extraction_success " : False ,
" error " : f " Text extraction failed: { str ( e ) } " ,
2025-10-07 11:15:26 +01:00
" location " : None ,
" calculated_tax " : None ,
" is_depreciable " : None ,
2025-10-07 20:35:43 +01:00
" name_of_asset " : None ,
2025-10-07 11:15:26 +01:00
" cca_rate " : None ,
" useful_life " : None ,
" residual_value " : None ,
2025-10-05 11:29:45 +00:00
}
async def save_uploaded_file ( self , file_content : bytes , filename : str ) - > str :
""" Save uploaded file to temporary storage """
try :
# Create uploads directory if it doesn't exist
upload_dir = " uploads "
os . makedirs ( upload_dir , exist_ok = True )
# Generate unique filename
timestamp = datetime . now ( ) . strftime ( " % Y % m %d _ % H % M % S " )
safe_filename = f " { timestamp } _ { filename . replace ( ' ' , ' _ ' ) } "
file_path = os . path . join ( upload_dir , safe_filename )
# Save file
async with aiofiles . open ( file_path , " wb " ) as f :
await f . write ( file_content )
return file_path
except Exception as e :
raise Exception ( f " Failed to save file: { str ( e ) } " )
async def extract_transactions_from_image ( self , image_path : str ) - > Dict [ str , Any ] :
""" Extract multiple transactions from an image (bank statement, credit card statement, etc.) """
try :
# Encode image to base64
base64_image = self . _encode_image ( image_path )
# Create Groq vision prompt for transaction extraction
prompt = """
Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.
Look for transaction lists, payment records, or any financial entries that show:
- Date
- Amount (positive or negative)
- Vendor/Description/Payee name
- Any additional notes or memo
Return the transactions as a JSON array:
{
" extraction_success " : true,
" transactions " : [
{
" date " : " YYYY-MM-DD " ,
" amount " : 0.00,
" vendor " : " Vendor name " ,
" memo " : " Additional notes "
},
{
" date " : " YYYY-MM-DD " ,
" amount " : -0.00,
" vendor " : " Another vendor " ,
" memo " : " Payment or charge description "
}
]
}
Rules:
- Extract ALL visible transactions
- Include both positive (credits) and negative (debits) amounts
- Use the actual date format from the document
- Vendor should be the merchant/payee name
- Memo can include transaction type, reference numbers, etc.
- If no transactions found, return empty array but set extraction_success to true
Return only valid JSON.
"""
# Call Groq vision API
response = self . client . chat . completions . create (
messages = [
{
" role " : " user " ,
" content " : [
{ " type " : " text " , " text " : prompt } ,
{
" type " : " image_url " ,
" image_url " : {
" url " : f " data:image/jpeg;base64, { base64_image } " ,
} ,
} ,
] ,
}
] ,
model = self . model ,
max_tokens = 2000 , # Higher token limit for multiple transactions
temperature = 0.1 ,
)
# Parse response
result_text = response . choices [ 0 ] . message . content . strip ( )
return self . _parse_transaction_extraction_result ( result_text )
except Exception as e :
return {
" extraction_success " : False ,
" error " : f " Transaction extraction error: { str ( e ) } " ,
" transactions " : [ ] ,
}
def _parse_transaction_extraction_result ( self , result_text : str ) - > Dict [ str , Any ] :
""" Parse Groq response for transaction extraction """
try :
import json
import re
# Find the first '{' and last '}'
start = result_text . find ( " { " )
end = result_text . rfind ( " } " )
if start == - 1 or end == - 1 or end < = start :
return {
" extraction_success " : False ,
" error " : " Could not find JSON object in AI response " ,
" transactions " : [ ] ,
}
json_str = result_text [ start : end + 1 ]
# Remove trailing commas before } or ]
json_str = re . sub ( r " , \ s*([} \ ]]) " , r " \ 1 " , json_str )
try :
data = json . loads ( json_str )
except Exception as e :
import logging
logging . error ( f " JSON parsing error: { str ( e ) } " )
logging . error ( f " Offending JSON string: \n { json_str } " )
return {
" extraction_success " : False ,
" error " : f " JSON parsing error: { str ( e ) } " ,
" transactions " : [ ] ,
}
# Validate and clean data
transactions = data . get ( " transactions " , [ ] )
cleaned_transactions = [ ]
for txn in transactions :
try :
cleaned_txn = {
" date " : str ( txn . get ( " date " , " " ) ) . strip ( ) ,
" amount " : float (
str ( txn . get ( " amount " , 0 ) ) . replace ( " $ " , " " ) . replace ( " , " , " " )
) ,
" vendor " : str ( txn . get ( " vendor " , " " ) ) . strip ( ) ,
" memo " : str ( txn . get ( " memo " , " " ) ) . strip ( ) ,
}
cleaned_transactions . append ( cleaned_txn )
except Exception :
continue
return {
" extraction_success " : data . get ( " extraction_success " , True ) ,
" transactions " : cleaned_transactions ,
" total_transactions " : len ( cleaned_transactions ) ,
}
except Exception as e :
import logging
logging . error ( f " JSON parsing error (outer): { str ( e ) } " )
return {
" extraction_success " : False ,
" error " : f " JSON parsing error: { str ( e ) } " ,
" transactions " : [ ] ,
}
def _parse_date_to_iso ( self , date_str : str ) - > str :
""" Parse various date formats and convert to YYYY-MM-DD """
try :
import re
from datetime import datetime
date_str = date_str . strip ( ) . upper ( )
# Handle formats like "MAY 22", "JUN 01", "MAY 22, 2024"
month_pattern = r " (JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC) \ s+( \ d { 1,2})(?:, \ s*( \ d {4} ))? "
match = re . match ( month_pattern , date_str )
if match :
month_abbr , day , year = match . groups ( )
month_map = {
" JAN " : 1 ,
" FEB " : 2 ,
" MAR " : 3 ,
" APR " : 4 ,
" MAY " : 5 ,
" JUN " : 6 ,
" JUL " : 7 ,
" AUG " : 8 ,
" SEP " : 9 ,
" OCT " : 10 ,
" NOV " : 11 ,
" DEC " : 12 ,
}
month = month_map [ month_abbr ]
day = int ( day )
year = int ( year ) if year else datetime . now ( ) . year
# Handle 2-digit years
if year < 100 :
year + = 2000
return f " { year : 04d } - { month : 02d } - { day : 02d } "
# Handle YYYY-MM-DD format
if re . match ( r " \ d {4} - \ d {2} - \ d {2} " , date_str ) :
return date_str
# Handle MM/DD/YYYY format
if re . match ( r " \ d { 1,2}/ \ d { 1,2}/ \ d {4} " , date_str ) :
return datetime . strptime ( date_str , " % m/ %d / % Y " ) . strftime ( " % Y- % m- %d " )
# Handle MM/DD/YY format
if re . match ( r " \ d { 1,2}/ \ d { 1,2}/ \ d {2} " , date_str ) :
return datetime . strptime ( date_str , " % m/ %d / % y " ) . strftime ( " % Y- % m- %d " )
return None
except Exception :
return None