# app/services/document_processor.py

import base64
import logging
import os
from datetime import datetime
from typing import Any, Dict, Optional

import aiofiles
import groq
import PyPDF2

from config import settings

logger = logging.getLogger(__name__)


class DocumentProcessor:
    def __init__(self):
        """Select the extraction model and build the Groq client from settings."""
        # Vision model
        self.model = "meta-llama/llama-4-scout-17b-16e-instruct"
        self.client = groq.Groq(api_key=settings.GROQ_API_KEY)

    async def process_file(self, file_path: str, file_type: str) -> Dict[str, Any]:
        """Process uploaded file and extract receipt data"""
        try:
            if file_type.lower() in ["jpg", "jpeg", "png", "gif", "bmp"]:
                return await self._process_image(file_path)
            elif file_type.lower() == "pdf":
                return await self._process_pdf(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_type}")
        except Exception as e:
            return {"error": str(e)}

    async def _process_image(self, image_path: str) -> Dict[str, Any]:
        """Extract data from image using Groq vision"""
        try:
            # Encode image to base64
            base64_image = self._encode_image(image_path)

            # Create Groq vision prompt
            prompt = """
            Analyze this receipt image and extract the following information in JSON format:
            {
                "vendor": "Store/company name",
                "description": "Detailed description of items/services purchased",
                "total_amount": 0.00,
                "tax_amount": 0.00,
                "date": "YYYY-MM-DD",
                "category": "Food/Transport/Office/Other",
                "confidence": 0.95,
                "currency": "USD"
            }
            
            Rules:
            - Extract vendor name as it appears on receipt
            - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
            - Total amount should be the final total including tax
            - Tax amount is separate tax line if available
            - Date should be the date on the receipt
            - Categorize based on vendor type (Starbucks=Food, Shell=Transport, etc.)
            - Confidence score 0-1 based on how clear the receipt is
            - Currency should be the currency used on the receipt (e.g., "USD", "EUR")

            Return only valid JSON.
            """

            # Call Groq vision API with correct format
            response = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                },
                            },
                        ],
                    }
                ],
                model=self.model,
                max_tokens=500,
                temperature=0.1,
            )

            # Parse response
            result_text = response.choices[0].message.content.strip()
            return self._parse_extraction_result(result_text)

        except Exception as e:
            return {"error": f"Image processing error: {str(e)}"}

    def _encode_image(self, image_path: str) -> str:
        """Encode image to base64 string"""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    async def _process_pdf(self, pdf_path: str) -> Dict[str, Any]:
        """Extract data from PDF by converting to image first"""
        try:
            # For now, extract text from PDF and process as text
            text_content = self._extract_text_from_pdf(pdf_path)
            return self._process_text_content(text_content)

        except Exception as e:
            return {"error": f"PDF processing error: {str(e)}"}

    def _extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract text from PDF"""
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PyPDF2.PdfReader(file)
                text = ""
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
                return text
        except Exception:
            return ""

    def _process_text_content(self, text_content: str) -> Dict[str, Any]:
        """Process text content using Groq (fallback for PDFs)"""
        try:
            prompt = f"""
            Analyze this receipt text and extract the following information in JSON format:
            
            Receipt Text:
            {text_content}
            
            Extract:
            {{
                "vendor": "Store/company name",
                "description": "Detailed description of items/services purchased",
                "total_amount": 0.00,
                "tax_amount": 0.00,
                "date": "YYYY-MM-DD",
                "category": "Food/Transport/Office/Other",
                "confidence": 0.95,
                "currency": "USD"
            }}
            
            Rules:
            - Extract vendor name as it appears on receipt
            - Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")
            - Total amount should be the final total including tax
            - Tax amount is separate tax line if available
            - Date should be the date on the receipt
            - Categorize based on vendor type
            - Confidence score 0-1 based on clarity
            - Currency should be the currency used on the receipt (e.g., "USD", "EUR")
            
            Return only valid JSON.
            """

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=500,
                temperature=0.1,
            )

            result_text = response.choices[0].message.content.strip()
            return self._parse_extraction_result(result_text)

        except Exception as e:
            return {"error": f"Text processing error: {str(e)}"}

    def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
        """Parse Groq response and extract JSON data"""
        try:
            # Clean up response and extract JSON
            import json
            import re

            # Find JSON in response - try multiple patterns
            json_match = re.search(r"\{.*\}", result_text, re.DOTALL)
            if json_match:
                json_str = json_match.group()

                # Clean up common JSON issues
                json_str = re.sub(
                    r",\s*([}\]])", r"\1", json_str
                )  # Remove trailing commas
                json_str = re.sub(
                    r"([{,])\s*([a-zA-Z_][a-zA-Z0-9_]*)\s*:", r'\1"\2":', json_str
                )  # Quote unquoted keys

                try:
                    data = json.loads(json_str)
                except json.JSONDecodeError as e:
                    # Try to fix common JSON issues
                    logger.warning(f"Initial JSON parsing failed: {e}")

                    # Try to extract individual fields using regex
                    vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
                    description_match = re.search(
                        r'"description"\s*:\s*"([^"]*)"', json_str
                    )
                    total_amount_match = re.search(
                        r'"total_amount"\s*:\s*([0-9.]+)', json_str
                    )
                    tax_amount_match = re.search(
                        r'"tax_amount"\s*:\s*([0-9.]+)', json_str
                    )
                    date_match = re.search(r'"date"\s*:\s*"([^"]*)"', json_str)
                    category_match = re.search(r'"category"\s*:\s*"([^"]*)"', json_str)
                    confidence_match = re.search(
                        r'"confidence"\s*:\s*([0-9.]+)', json_str
                    )
                    currency_match = re.search(
                        r'"currency"\s*:\s*"([^"]*)"', json_str
                    )

                    data = {
                        "vendor": vendor_match.group(1) if vendor_match else "",
                        "description": description_match.group(1)
                        if description_match
                        else "",
                        "total_amount": float(total_amount_match.group(1))
                        if total_amount_match
                        else 0.0,
                        "tax_amount": float(tax_amount_match.group(1))
                        if tax_amount_match
                        else 0.0,
                        "date": date_match.group(1) if date_match else "",
                        "category": category_match.group(1)
                        if category_match
                        else "Other",
                        "confidence": float(confidence_match.group(1))
                        if confidence_match
                        else 0.5,
                        "currency": currency_match.group(1) if currency_match else "CAD"
                    }

                # Validate and clean data
                return {
                    "vendor": str(data.get("vendor", "")).strip(),
                    "description": str(data.get("description", "")).strip(),
                    "total_amount": float(data.get("total_amount", 0)),
                    "tax_amount": float(data.get("tax_amount", 0)),
                    "date": str(data.get("date", "")).strip(),
                    "category": str(data.get("category", "Other")).strip(),
                    "confidence": float(data.get("confidence", 0.5)),
                    "extraction_success": True,
                    "currency": data.get("currency", "CAD").strip(),
                }
            else:
                # Try to extract fields from plain text
                logger.warning("No JSON found in response, attempting text extraction")
                return self._extract_from_plain_text(result_text)

        except Exception as e:
            logger.error(f"JSON parsing error: {str(e)}")
            return {
                "error": f"JSON parsing error: {str(e)}",
                "extraction_success": False,
            }

    def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:
        """Extract receipt data from plain text when JSON parsing fails"""
        try:
            import re

            # Extract vendor (look for common patterns)
            vendor_patterns = [
                r"(?:vendor|store|merchant|company)\s*[:\-]?\s*([A-Za-z0-9\s&.,]+)",
                r"([A-Z][A-Za-z0-9\s&.,]{3,30})",  # Capitalized words
            ]

            vendor = ""
            for pattern in vendor_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    vendor = match.group(1).strip()
                    break

            # Extract amount (look for currency patterns)
            amount_patterns = [
                r"\$?\s*([0-9,]+\.?[0-9]*)",
                r"(?:total|amount|sum)\s*[:\-]?\s*\$?\s*([0-9,]+\.?[0-9]*)",
            ]

            total_amount = 0.0
            for pattern in amount_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    try:
                        total_amount = float(match.group(1).replace(",", ""))
                        break
                    except ValueError:
                        continue

            # Extract date
            date_patterns = [
                r"(\d{4}-\d{2}-\d{2})",
                r"(\d{1,2}/\d{1,2}/\d{2,4})",
                r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},?\s+\d{4}",
            ]

            date = ""
            for pattern in date_patterns:
                match = re.search(pattern, text, re.IGNORECASE)
                if match:
                    date = match.group(0)
                    break

            return {
                "vendor": vendor or "Unknown",
                "total_amount": total_amount,
                "tax_amount": 0.0,
                "date": date or "",
                "category": "Other",
                "confidence": 0.3,  # Low confidence for text extraction
                "extraction_success": True,
            }

        except Exception as e:
            logger.error(f"Text extraction error: {str(e)}")
            return {
                "vendor": "Unknown",
                "total_amount": 0.0,
                "tax_amount": 0.0,
                "date": "",
                "category": "Other",
                "confidence": 0.1,
                "extraction_success": False,
                "error": f"Text extraction failed: {str(e)}",
            }

    async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
        """Write uploaded bytes into the ``uploads`` directory.

        Only the final path component of ``filename`` is used, so a
        client-supplied name such as ``"../../etc/passwd"`` cannot place the
        file outside the upload directory.

        Args:
            file_content: Raw bytes of the uploaded file.
            filename: Name supplied with the upload; treated as untrusted.

        Returns:
            The relative path of the saved file,
            ``uploads/<YYYYmmdd_HHMMSS>_<name with spaces as underscores>``.

        Raises:
            Exception: If the directory cannot be created or the file cannot
                be written; the original error is attached as ``__cause__``.
        """
        try:
            # Create uploads directory if it doesn't exist
            upload_dir = "uploads"
            os.makedirs(upload_dir, exist_ok=True)

            # Drop any directory part, treating "\" as a separator too so
            # Windows-style paths are handled on every platform.
            base_name = os.path.basename(filename.replace("\\", "/")) or "upload"

            # NOTE(review): the timestamp has one-second resolution, so two
            # uploads with the same name in the same second share a path.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            safe_filename = f"{timestamp}_{base_name.replace(' ', '_')}"
            file_path = os.path.join(upload_dir, safe_filename)

            # Save file
            async with aiofiles.open(file_path, "wb") as f:
                await f.write(file_content)

            return file_path

        except Exception as e:
            raise Exception(f"Failed to save file: {str(e)}") from e

    async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
        """Extract multiple transactions from an image (bank statement, credit card statement, etc.)"""
        try:
            # Encode image to base64
            base64_image = self._encode_image(image_path)

            # Create Groq vision prompt for transaction extraction
            prompt = """
            Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.
            
            Look for transaction lists, payment records, or any financial entries that show:
            - Date
            - Amount (positive or negative)
            - Vendor/Description/Payee name
            - Any additional notes or memo
            
            Return the transactions as a JSON array:
            {
                "extraction_success": true,
                "transactions": [
                    {
                        "date": "YYYY-MM-DD",
                        "amount": 0.00,
                        "vendor": "Vendor name",
                        "memo": "Additional notes"
                    },
                    {
                        "date": "YYYY-MM-DD", 
                        "amount": -0.00,
                        "vendor": "Another vendor",
                        "memo": "Payment or charge description"
                    }
                ]
            }
            
            Rules:
            - Extract ALL visible transactions
            - Include both positive (credits) and negative (debits) amounts
            - Use the actual date format from the document
            - Vendor should be the merchant/payee name
            - Memo can include transaction type, reference numbers, etc.
            - If no transactions found, return empty array but set extraction_success to true
            
            Return only valid JSON.
            """

            # Call Groq vision API
            response = self.client.chat.completions.create(
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                },
                            },
                        ],
                    }
                ],
                model=self.model,
                max_tokens=2000,  # Higher token limit for multiple transactions
                temperature=0.1,
            )

            # Parse response
            result_text = response.choices[0].message.content.strip()
            return self._parse_transaction_extraction_result(result_text)

        except Exception as e:
            return {
                "extraction_success": False,
                "error": f"Transaction extraction error: {str(e)}",
                "transactions": [],
            }

    def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:
        """Parse Groq response for transaction extraction"""
        try:
            import json
            import re

            # Find the first '{' and last '}'
            start = result_text.find("{")
            end = result_text.rfind("}")
            if start == -1 or end == -1 or end <= start:
                return {
                    "extraction_success": False,
                    "error": "Could not find JSON object in AI response",
                    "transactions": [],
                }
            json_str = result_text[start : end + 1]

            # Remove trailing commas before } or ]
            json_str = re.sub(r",\s*([}\]])", r"\1", json_str)

            try:
                data = json.loads(json_str)
            except Exception as e:
                import logging

                logging.error(f"JSON parsing error: {str(e)}")
                logging.error(f"Offending JSON string:\n{json_str}")
                return {
                    "extraction_success": False,
                    "error": f"JSON parsing error: {str(e)}",
                    "transactions": [],
                }

            # Validate and clean data
            transactions = data.get("transactions", [])
            cleaned_transactions = []
            for txn in transactions:
                try:
                    cleaned_txn = {
                        "date": str(txn.get("date", "")).strip(),
                        "amount": float(
                            str(txn.get("amount", 0)).replace("$", "").replace(",", "")
                        ),
                        "vendor": str(txn.get("vendor", "")).strip(),
                        "memo": str(txn.get("memo", "")).strip(),
                    }
                    cleaned_transactions.append(cleaned_txn)
                except Exception:
                    continue
            return {
                "extraction_success": data.get("extraction_success", True),
                "transactions": cleaned_transactions,
                "total_transactions": len(cleaned_transactions),
            }
        except Exception as e:
            import logging

            logging.error(f"JSON parsing error (outer): {str(e)}")
            return {
                "extraction_success": False,
                "error": f"JSON parsing error: {str(e)}",
                "transactions": [],
            }

    def _parse_date_to_iso(self, date_str: str) -> str:
        """Parse various date formats and convert to YYYY-MM-DD"""
        try:
            import re
            from datetime import datetime

            date_str = date_str.strip().upper()

            # Handle formats like "MAY 22", "JUN 01", "MAY 22, 2024"
            month_pattern = r"(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s+(\d{1,2})(?:,\s*(\d{4}))?"
            match = re.match(month_pattern, date_str)

            if match:
                month_abbr, day, year = match.groups()
                month_map = {
                    "JAN": 1,
                    "FEB": 2,
                    "MAR": 3,
                    "APR": 4,
                    "MAY": 5,
                    "JUN": 6,
                    "JUL": 7,
                    "AUG": 8,
                    "SEP": 9,
                    "OCT": 10,
                    "NOV": 11,
                    "DEC": 12,
                }

                month = month_map[month_abbr]
                day = int(day)
                year = int(year) if year else datetime.now().year

                # Handle 2-digit years
                if year < 100:
                    year += 2000

                return f"{year:04d}-{month:02d}-{day:02d}"

            # Handle YYYY-MM-DD format
            if re.match(r"\d{4}-\d{2}-\d{2}", date_str):
                return date_str

            # Handle MM/DD/YYYY format
            if re.match(r"\d{1,2}/\d{1,2}/\d{4}", date_str):
                return datetime.strptime(date_str, "%m/%d/%Y").strftime("%Y-%m-%d")

            # Handle MM/DD/YY format
            if re.match(r"\d{1,2}/\d{1,2}/\d{2}", date_str):
                return datetime.strptime(date_str, "%m/%d/%y").strftime("%Y-%m-%d")

            return None

        except Exception:
            return None
Add requirements.txt with essential dependencies for the project 2025-10-05 11:29:45 +00:00			`import base64`
			`import logging`
			`import os`
			`from datetime import datetime`
			`from typing import Any, Dict`

			`import aiofiles`
			`import groq`
			`import PyPDF2`

			`from config import settings`

			`logger = logging.getLogger(__name__)`


			`class DocumentProcessor:`
			`def __init__(self):`
			`self.client = groq.Groq(api_key=settings.GROQ_API_KEY)`
			`self.model = "meta-llama/llama-4-scout-17b-16e-instruct" # Vision model`

			`async def process_file(self, file_path: str, file_type: str) -> Dict[str, Any]:`
			`"""Process uploaded file and extract receipt data"""`
			`try:`
			`if file_type.lower() in ["jpg", "jpeg", "png", "gif", "bmp"]:`
			`return await self._process_image(file_path)`
			`elif file_type.lower() == "pdf":`
			`return await self._process_pdf(file_path)`
			`else:`
			`raise ValueError(f"Unsupported file type: {file_type}")`
			`except Exception as e:`
			`return {"error": str(e)}`

			`async def _process_image(self, image_path: str) -> Dict[str, Any]:`
			`"""Extract data from image using Groq vision"""`
			`try:`
			`# Encode image to base64`
			`base64_image = self._encode_image(image_path)`

			`# Create Groq vision prompt`
			`prompt = """`
			`Analyze this receipt image and extract the following information in JSON format:`
			`{`
			`"vendor": "Store/company name",`
			`"description": "Detailed description of items/services purchased",`
			`"total_amount": 0.00,`
			`"tax_amount": 0.00,`
			`"date": "YYYY-MM-DD",`
			`"category": "Food/Transport/Office/Other",`
			`"confidence": 0.95,`
			`"currency": "USD"`
			`}`

			`Rules:`
			`- Extract vendor name as it appears on receipt`
			`- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")`
			`- Total amount should be the final total including tax`
			`- Tax amount is separate tax line if available`
			`- Date should be the date on the receipt`
			`- Categorize based on vendor type (Starbucks=Food, Shell=Transport, etc.)`
			`- Confidence score 0-1 based on how clear the receipt is`
			`- Currency should be the currency used on the receipt (e.g., "USD", "EUR")`

			`Return only valid JSON.`
			`"""`

			`# Call Groq vision API with correct format`
			`response = self.client.chat.completions.create(`
			`messages=[`
			`{`
			`"role": "user",`
			`"content": [`
			`{"type": "text", "text": prompt},`
			`{`
			`"type": "image_url",`
			`"image_url": {`
			`"url": f"data:image/jpeg;base64,{base64_image}",`
			`},`
			`},`
			`],`
			`}`
			`],`
			`model=self.model,`
			`max_tokens=500,`
			`temperature=0.1,`
			`)`

			`# Parse response`
			`result_text = response.choices[0].message.content.strip()`
			`return self._parse_extraction_result(result_text)`

			`except Exception as e:`
			`return {"error": f"Image processing error: {str(e)}"}`

			`def _encode_image(self, image_path: str) -> str:`
			`"""Encode image to base64 string"""`
			`with open(image_path, "rb") as image_file:`
			`return base64.b64encode(image_file.read()).decode("utf-8")`

			`async def _process_pdf(self, pdf_path: str) -> Dict[str, Any]:`
			`"""Extract data from PDF by converting to image first"""`
			`try:`
			`# For now, extract text from PDF and process as text`
			`text_content = self._extract_text_from_pdf(pdf_path)`
			`return self._process_text_content(text_content)`

			`except Exception as e:`
			`return {"error": f"PDF processing error: {str(e)}"}`

			`def _extract_text_from_pdf(self, pdf_path: str) -> str:`
			`"""Extract text from PDF"""`
			`try:`
			`with open(pdf_path, "rb") as file:`
			`pdf_reader = PyPDF2.PdfReader(file)`
			`text = ""`
			`for page in pdf_reader.pages:`
			`text += page.extract_text() + "\n"`
			`return text`
			`except Exception:`
			`return ""`

			`def _process_text_content(self, text_content: str) -> Dict[str, Any]:`
			`"""Process text content using Groq (fallback for PDFs)"""`
			`try:`
			`prompt = f"""`
			`Analyze this receipt text and extract the following information in JSON format:`

			`Receipt Text:`
			`{text_content}`

			`Extract:`
			`{{`
			`"vendor": "Store/company name",`
			`"description": "Detailed description of items/services purchased",`
			`"total_amount": 0.00,`
			`"tax_amount": 0.00,`
			`"date": "YYYY-MM-DD",`
			`"category": "Food/Transport/Office/Other",`
			`"confidence": 0.95,`
			`"currency": "USD"`
			`}}`

			`Rules:`
			`- Extract vendor name as it appears on receipt`
			`- Extract description of items/services purchased (e.g., "Coffee and sandwich", "Gasoline", "Office supplies")`
			`- Total amount should be the final total including tax`
			`- Tax amount is separate tax line if available`
			`- Date should be the date on the receipt`
			`- Categorize based on vendor type`
			`- Confidence score 0-1 based on clarity`
			`- Currency should be the currency used on the receipt (e.g., "USD", "EUR")`

			`Return only valid JSON.`
			`"""`

			`response = self.client.chat.completions.create(`
			`model=self.model,`
			`messages=[{"role": "user", "content": prompt}],`
			`max_tokens=500,`
			`temperature=0.1,`
			`)`

			`result_text = response.choices[0].message.content.strip()`
			`return self._parse_extraction_result(result_text)`

			`except Exception as e:`
			`return {"error": f"Text processing error: {str(e)}"}`

			`def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:`
			`"""Parse Groq response and extract JSON data"""`
			`try:`
			`# Clean up response and extract JSON`
			`import json`
			`import re`

			`# Find JSON in response - try multiple patterns`
			`json_match = re.search(r"\{.*\}", result_text, re.DOTALL)`
			`if json_match:`
			`json_str = json_match.group()`

			`# Clean up common JSON issues`
			`json_str = re.sub(`
			`r",\s*([}\]])", r"\1", json_str`
			`) # Remove trailing commas`
			`json_str = re.sub(`
			`r"([{,])\s([a-zA-Z_][a-zA-Z0-9_])\s*:", r'\1"\2":', json_str`
			`) # Quote unquoted keys`

			`try:`
			`data = json.loads(json_str)`
			`except json.JSONDecodeError as e:`
			`# Try to fix common JSON issues`
			`logger.warning(f"Initial JSON parsing failed: {e}")`

			`# Try to extract individual fields using regex`
			`vendor_match = re.search(r'"vendor"\s:\s"([^"]*)"', json_str)`
			`description_match = re.search(`
			`r'"description"\s:\s"([^"]*)"', json_str`
			`)`
			`total_amount_match = re.search(`
			`r'"total_amount"\s:\s([0-9.]+)', json_str`
			`)`
			`tax_amount_match = re.search(`
			`r'"tax_amount"\s:\s([0-9.]+)', json_str`
			`)`
			`date_match = re.search(r'"date"\s:\s"([^"]*)"', json_str)`
			`category_match = re.search(r'"category"\s:\s"([^"]*)"', json_str)`
			`confidence_match = re.search(`
			`r'"confidence"\s:\s([0-9.]+)', json_str`
			`)`
			`currency_match = re.search(`
			`r'"currency"\s:\s"([^"]*)"', json_str`
			`)`

			`data = {`
			`"vendor": vendor_match.group(1) if vendor_match else "",`
			`"description": description_match.group(1)`
			`if description_match`
			`else "",`
			`"total_amount": float(total_amount_match.group(1))`
			`if total_amount_match`
			`else 0.0,`
			`"tax_amount": float(tax_amount_match.group(1))`
			`if tax_amount_match`
			`else 0.0,`
			`"date": date_match.group(1) if date_match else "",`
			`"category": category_match.group(1)`
			`if category_match`
			`else "Other",`
			`"confidence": float(confidence_match.group(1))`
			`if confidence_match`
			`else 0.5,`
			`"currency": currency_match.group(1) if currency_match else "CAD"`
			`}`

			`# Validate and clean data`
			`return {`
			`"vendor": str(data.get("vendor", "")).strip(),`
			`"description": str(data.get("description", "")).strip(),`
			`"total_amount": float(data.get("total_amount", 0)),`
			`"tax_amount": float(data.get("tax_amount", 0)),`
			`"date": str(data.get("date", "")).strip(),`
			`"category": str(data.get("category", "Other")).strip(),`
			`"confidence": float(data.get("confidence", 0.5)),`
			`"extraction_success": True,`
			`"currency": data.get("currency", "CAD").strip(),`
			`}`
			`else:`
			`# Try to extract fields from plain text`
			`logger.warning("No JSON found in response, attempting text extraction")`
			`return self._extract_from_plain_text(result_text)`

			`except Exception as e:`
			`logger.error(f"JSON parsing error: {str(e)}")`
			`return {`
			`"error": f"JSON parsing error: {str(e)}",`
			`"extraction_success": False,`
			`}`

			`def _extract_from_plain_text(self, text: str) -> Dict[str, Any]:`
			`"""Extract receipt data from plain text when JSON parsing fails"""`
			`try:`
			`import re`

			`# Extract vendor (look for common patterns)`
			`vendor_patterns = [`
			`r"(?:vendor\|store\|merchant\|company)\s[:\-]?\s([A-Za-z0-9\s&.,]+)",`
			`r"([A-Z][A-Za-z0-9\s&.,]{3,30})", # Capitalized words`
			`]`

			`vendor = ""`
			`for pattern in vendor_patterns:`
			`match = re.search(pattern, text, re.IGNORECASE)`
			`if match:`
			`vendor = match.group(1).strip()`
			`break`

			`# Extract amount (look for currency patterns)`
			`amount_patterns = [`
			`r"\$?\s([0-9,]+\.?[0-9])",`
			`r"(?:total\|amount\|sum)\s[:\-]?\s\$?\s([0-9,]+\.?[0-9])",`
			`]`

			`total_amount = 0.0`
			`for pattern in amount_patterns:`
			`match = re.search(pattern, text, re.IGNORECASE)`
			`if match:`
			`try:`
			`total_amount = float(match.group(1).replace(",", ""))`
			`break`
			`except ValueError:`
			`continue`

			`# Extract date`
			`date_patterns = [`
			`r"(\d{4}-\d{2}-\d{2})",`
			`r"(\d{1,2}/\d{1,2}/\d{2,4})",`
			`r"(Jan\|Feb\|Mar\|Apr\|May\|Jun\|Jul\|Aug\|Sep\|Oct\|Nov\|Dec)\s+\d{1,2},?\s+\d{4}",`
			`]`

			`date = ""`
			`for pattern in date_patterns:`
			`match = re.search(pattern, text, re.IGNORECASE)`
			`if match:`
			`date = match.group(0)`
			`break`

			`return {`
			`"vendor": vendor or "Unknown",`
			`"total_amount": total_amount,`
			`"tax_amount": 0.0,`
			`"date": date or "",`
			`"category": "Other",`
			`"confidence": 0.3, # Low confidence for text extraction`
			`"extraction_success": True,`
			`}`

			`except Exception as e:`
			`logger.error(f"Text extraction error: {str(e)}")`
			`return {`
			`"vendor": "Unknown",`
			`"total_amount": 0.0,`
			`"tax_amount": 0.0,`
			`"date": "",`
			`"category": "Other",`
			`"confidence": 0.1,`
			`"extraction_success": False,`
			`"error": f"Text extraction failed: {str(e)}",`
			`}`

			async def save_uploaded_file(self, file_content: bytes, filename: str) -> str:
			    """Save uploaded file content under the local ``uploads`` directory.

			    The stored name is ``<YYYYmmdd_HHMMSS>_<name>`` where ``<name>`` is the
			    final path component of ``filename`` with spaces replaced by
			    underscores. Directory components are discarded so that a
			    client-supplied name such as ``../../etc/passwd`` cannot place the file
			    outside the upload directory.

			    Args:
			        file_content: Raw bytes to write.
			        filename: Client-supplied file name (treated as untrusted input).

			    Returns:
			        Path of the written file, relative to the working directory.

			    Raises:
			        Exception: If the directory cannot be created or the file cannot
			            be written. The underlying error is chained as ``__cause__``.
			    """
			    try:
			        # Create uploads directory if it doesn't exist
			        upload_dir = "uploads"
			        os.makedirs(upload_dir, exist_ok=True)

			        # Keep only the last path component; normalise Windows separators
			        # first so "..\\..\\x" is handled the same way as "../../x".
			        base_name = os.path.basename(filename.replace("\\", "/"))
			        base_name = base_name.replace(" ", "_") or "upload"

			        # Timestamp prefix keeps names from different uploads apart
			        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
			        file_path = os.path.join(upload_dir, f"{timestamp}_{base_name}")

			        async with aiofiles.open(file_path, "wb") as f:
			            await f.write(file_content)

			        return file_path

			    except Exception as e:
			        # Same exception type and message as before; the cause is now kept.
			        raise Exception(f"Failed to save file: {str(e)}") from e

			async def extract_transactions_from_image(self, image_path: str) -> Dict[str, Any]:
			    """Extract multiple transactions from a statement image via the vision model.

			    The image is base64-encoded with ``self._encode_image``, sent together
			    with an extraction prompt to ``self.client`` using ``self.model``, and
			    the reply text is handed to ``_parse_transaction_extraction_result``.

			    Args:
			        image_path: Path of the image file (bank statement, credit card
			            statement, etc.).

			    Returns:
			        The dict produced by ``_parse_transaction_extraction_result`` on
			        success. On any failure, a dict with ``extraction_success`` set to
			        ``False``, an ``error`` message and an empty ``transactions`` list.
			        This method does not raise.
			    """
			    import mimetypes

			    try:
			        # Encode image to base64
			        base64_image = self._encode_image(image_path)

			        # Label the data URL with the real image type instead of always
			        # claiming JPEG; fall back to JPEG when the type cannot be guessed.
			        guessed_type, _ = mimetypes.guess_type(image_path)
			        if guessed_type and guessed_type.startswith("image/"):
			            mime_type = guessed_type
			        else:
			            mime_type = "image/jpeg"

			        # Create Groq vision prompt for transaction extraction
			        prompt = """
			        Analyze this financial document image (bank statement, credit card statement, etc.) and extract ALL transactions in JSON format.

			        Look for transaction lists, payment records, or any financial entries that show:
			        - Date
			        - Amount (positive or negative)
			        - Vendor/Description/Payee name
			        - Any additional notes or memo

			        Return the transactions as a JSON array:
			        {
			            "extraction_success": true,
			            "transactions": [
			                {
			                    "date": "YYYY-MM-DD",
			                    "amount": 0.00,
			                    "vendor": "Vendor name",
			                    "memo": "Additional notes"
			                },
			                {
			                    "date": "YYYY-MM-DD",
			                    "amount": -0.00,
			                    "vendor": "Another vendor",
			                    "memo": "Payment or charge description"
			                }
			            ]
			        }

			        Rules:
			        - Extract ALL visible transactions
			        - Include both positive (credits) and negative (debits) amounts
			        - Use the actual date format from the document
			        - Vendor should be the merchant/payee name
			        - Memo can include transaction type, reference numbers, etc.
			        - If no transactions found, return empty array but set extraction_success to true

			        Return only valid JSON.
			        """

			        # Call Groq vision API
			        # NOTE(review): this client call is synchronous, so it blocks the
			        # event loop for the duration of the request.
			        response = self.client.chat.completions.create(
			            messages=[
			                {
			                    "role": "user",
			                    "content": [
			                        {"type": "text", "text": prompt},
			                        {
			                            "type": "image_url",
			                            "image_url": {
			                                "url": f"data:{mime_type};base64,{base64_image}",
			                            },
			                        },
			                    ],
			                }
			            ],
			            model=self.model,
			            max_tokens=2000,  # Higher token limit for multiple transactions
			            temperature=0.1,
			        )

			        # Parse response; an empty/None content becomes "" so the parser
			        # reports "no JSON found" instead of an AttributeError message.
			        result_text = (response.choices[0].message.content or "").strip()
			        return self._parse_transaction_extraction_result(result_text)

			    except Exception as e:
			        logger.error(f"Transaction extraction error: {str(e)}")
			        return {
			            "extraction_success": False,
			            "error": f"Transaction extraction error: {str(e)}",
			            "transactions": [],
			        }

			`def _parse_transaction_extraction_result(self, result_text: str) -> Dict[str, Any]:`
			`"""Parse Groq response for transaction extraction"""`
			`try:`
			`import json`
			`import re`

			`# Find the first '{' and last '}'`
			`start = result_text.find("{")`
			`end = result_text.rfind("}")`
			`if start == -1 or end == -1 or end <= start:`
			`return {`
			`"extraction_success": False,`
			`"error": "Could not find JSON object in AI response",`
			`"transactions": [],`
			`}`
			`json_str = result_text[start : end + 1]`

			`# Remove trailing commas before } or ]`
			`json_str = re.sub(r",\s*([}\]])", r"\1", json_str)`

			`try:`
			`data = json.loads(json_str)`
			`except Exception as e:`
			`import logging`

			`logging.error(f"JSON parsing error: {str(e)}")`
			`logging.error(f"Offending JSON string:\n{json_str}")`
			`return {`
			`"extraction_success": False,`
			`"error": f"JSON parsing error: {str(e)}",`
			`"transactions": [],`
			`}`

			`# Validate and clean data`
			`transactions = data.get("transactions", [])`
			`cleaned_transactions = []`
			`for txn in transactions:`
			`try:`
			`cleaned_txn = {`
			`"date": str(txn.get("date", "")).strip(),`
			`"amount": float(`
			`str(txn.get("amount", 0)).replace("$", "").replace(",", "")`
			`),`
			`"vendor": str(txn.get("vendor", "")).strip(),`
			`"memo": str(txn.get("memo", "")).strip(),`
			`}`
			`cleaned_transactions.append(cleaned_txn)`
			`except Exception:`
			`continue`
			`return {`
			`"extraction_success": data.get("extraction_success", True),`
			`"transactions": cleaned_transactions,`
			`"total_transactions": len(cleaned_transactions),`
			`}`
			`except Exception as e:`
			`import logging`

			`logging.error(f"JSON parsing error (outer): {str(e)}")`
			`return {`
			`"extraction_success": False,`
			`"error": f"JSON parsing error: {str(e)}",`
			`"transactions": [],`
			`}`

			`def _parse_date_to_iso(self, date_str: str) -> str:`
			`"""Parse various date formats and convert to YYYY-MM-DD"""`
			`try:`
			`import re`
			`from datetime import datetime`

			`date_str = date_str.strip().upper()`

			`# Handle formats like "MAY 22", "JUN 01", "MAY 22, 2024"`
			`month_pattern = r"(JAN\|FEB\|MAR\|APR\|MAY\|JUN\|JUL\|AUG\|SEP\|OCT\|NOV\|DEC)\s+(\d{1,2})(?:,\s*(\d{4}))?"`
			`match = re.match(month_pattern, date_str)`

			`if match:`
			`month_abbr, day, year = match.groups()`
			`month_map = {`
			`"JAN": 1,`
			`"FEB": 2,`
			`"MAR": 3,`
			`"APR": 4,`
			`"MAY": 5,`
			`"JUN": 6,`
			`"JUL": 7,`
			`"AUG": 8,`
			`"SEP": 9,`
			`"OCT": 10,`
			`"NOV": 11,`
			`"DEC": 12,`
			`}`

			`month = month_map[month_abbr]`
			`day = int(day)`
			`year = int(year) if year else datetime.now().year`

			`# Handle 2-digit years`
			`if year < 100:`
			`year += 2000`

			`return f"{year:04d}-{month:02d}-{day:02d}"`

			`# Handle YYYY-MM-DD format`
			`if re.match(r"\d{4}-\d{2}-\d{2}", date_str):`
			`return date_str`

			`# Handle MM/DD/YYYY format`
			`if re.match(r"\d{1,2}/\d{1,2}/\d{4}", date_str):`
			`return datetime.strptime(date_str, "%m/%d/%Y").strftime("%Y-%m-%d")`

			`# Handle MM/DD/YY format`
			`if re.match(r"\d{1,2}/\d{1,2}/\d{2}", date_str):`
			`return datetime.strptime(date_str, "%m/%d/%y").strftime("%Y-%m-%d")`

			`return None`

			`except Exception:`
			`return None`