From a202abf5c0c5eb90bf118b812f8ae74b1083217b Mon Sep 17 00:00:00 2001
From: Iyeoluwa Akinrinola
Date: Thu, 3 Jul 2025 00:44:01 +0100
Subject: [PATCH] Fix date parsing for image import to handle various date formats

---
Note: this patch was reconstructed from a whitespace-collapsed copy. The
indentation of unchanged context lines is a best guess, so apply it with
`git apply --ignore-whitespace` if a hunk does not match. The blob ids on
the `index` lines are those of the original commit.

 document_processor.py | 70 ++++++++++++++++++++++++++++++++++++++++++-
 main.py               | 11 +++++--
 2 files changed, 77 insertions(+), 4 deletions(-)

diff --git a/document_processor.py b/document_processor.py
index 87337c2..4a36c50 100644
--- a/document_processor.py
+++ b/document_processor.py
@@ -328,4 +328,72 @@ class DocumentProcessor:
                 "extraction_success": False,
                 "error": f"JSON parsing error: {str(e)}",
                 "transactions": []
-            }
\ No newline at end of file
+            }
+
+    def _parse_date_to_iso(
+        self, date_str: str, default_year: "int | None" = None
+    ) -> "str | None":
+        """Convert a date in one of several common formats to YYYY-MM-DD.
+
+        Recognised inputs (case-insensitive, surrounding whitespace ignored):
+          - month name or abbreviation and day, with an optional 2- or
+            4-digit year: "MAY 22", "Jun 01", "May 22, 2024", "Sept 5 24"
+          - ISO dates: "2024-05-22"
+          - month-first numeric dates: "05/22/2024", "5/22/24"
+
+        The whole string must be a date: trailing text and impossible dates
+        such as "FEB 31" are rejected instead of being guessed at.
+
+        Args:
+            date_str: Raw date text.
+            default_year: Year assumed when the text has none; defaults to
+                the current local year.
+
+        Returns:
+            The date as "YYYY-MM-DD", or None if it cannot be parsed.
+        """
+        import re
+        from datetime import date, datetime
+
+        if not isinstance(date_str, str):
+            return None
+        text = date_str.strip().upper()
+
+        month_names = (
+            'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY',
+            'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',
+        )
+        named = re.fullmatch(
+            r'([A-Z]{3,9})\.?\s*(\d{1,2})(?:(?:\s*,\s*|\s+)(\d{4}|\d{2}))?',
+            text,
+        )
+        if named:
+            word, day, year = named.groups()
+            # Any prefix of 3+ letters identifies exactly one month.
+            month = next(
+                (i for i, name in enumerate(month_names, 1)
+                 if name.startswith(word)),
+                None,
+            )
+            if month is None:
+                return None
+            if year is None:
+                year = datetime.now().year if default_year is None else default_year
+            else:
+                year = int(year)
+                if year < 100:
+                    # Two-digit years are taken to be in the 2000s.
+                    year += 2000
+            try:
+                # date() validates the day against the month and year.
+                return date(year, month, int(day)).isoformat()
+            except ValueError:
+                return None
+
+        # Numeric layouts; strptime rejects impossible dates as well.
+        for fmt in ('%Y-%m-%d', '%m/%d/%Y', '%m/%d/%y'):
+            try:
+                return datetime.strptime(text, fmt).date().isoformat()
+            except ValueError:
+                continue
+        return None
\ No newline at end of file
diff --git a/main.py b/main.py
index ef12860..ba2c258 100644
--- a/main.py
+++ b/main.py
@@ -200,11 +200,16 @@ async def import_transactions_from_image(file: UploadFile = File(...)):
                 # Generate unique ID
                 txn_id = f"img_{file.filename}_{idx+1}"
 
-                # Parse date
-                txn_date = txn.get("date", "")
-                if not txn_date:
+                # Parse date - handle various formats
+                txn_date_raw = txn.get("date", "")
+                if not txn_date_raw:
                     raise ValueError("No date found in transaction")
 
+                # Convert date to YYYY-MM-DD format
+                txn_date = document_processor._parse_date_to_iso(txn_date_raw)
+                if not txn_date:
+                    raise ValueError(f"Could not parse date: {txn_date_raw}")
+
                 # Parse amount
                 amount_str = str(txn.get("amount", "0"))
                 amount = float(amount_str.replace('$', '').replace(',', '').strip())