Add test script for JSON extraction functionality
This commit introduces a new test script, `test_json_extraction.py`, which verifies the correctness of the JSON extraction logic. The script includes a function to extract the first valid JSON object from raw input and a series of test cases covering various scenarios, such as clean JSON, JSON with extra text, nested JSON, and escaped quotes. The tests ensure that the extraction function behaves as expected and handles edge cases appropriately.
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
@@ -17,6 +18,59 @@ class DocumentProcessor:
|
||||
self.client = groq.Groq(api_key=settings.GROQ_API_KEY)
|
||||
self.model = "meta-llama/llama-4-scout-17b-16e-instruct" # Vision model
|
||||
|
||||
def _extract_first_json(self, raw: str) -> dict:
    """Extract the first valid JSON object from raw LLM output.

    Handles cases where the LLM returns extra text before or after the
    JSON, including stray braces in the surrounding prose.

    Args:
        raw: The raw text returned by the LLM.

    Returns:
        The parsed JSON. When ``raw`` is a complete JSON document it is
        returned exactly as ``json.loads`` produces it; otherwise the
        first ``{...}`` span that parses as a JSON object is returned.

    Raises:
        ValueError: If ``raw`` contains no ``{`` at all, or if no ``{``
            in ``raw`` starts a valid JSON object.
    """
    try:
        # Fast path: the whole payload is already valid JSON.
        return json.loads(raw)
    except json.JSONDecodeError:
        pass

    candidate = raw.find("{")
    if candidate == -1:
        raise ValueError("No JSON object found in LLM output")

    # Let the real JSON parser decide where an object ends instead of
    # counting braces by hand: it understands strings, escapes and nesting,
    # and it tells us when a "{" does not actually start valid JSON so we
    # can move on to the next candidate (e.g. "{placeholder}" in prose
    # preceding the real payload).
    decoder = json.JSONDecoder()
    last_error = None
    while candidate != -1:
        try:
            parsed, _ = decoder.raw_decode(raw[candidate:])
        except json.JSONDecodeError as exc:
            last_error = exc
            candidate = raw.find("{", candidate + 1)
        else:
            # Decoding started at "{", so the result is always a dict.
            return parsed

    raise ValueError("No valid JSON object found in LLM output") from last_error
|
||||
|
||||
async def process_file(
|
||||
self,
|
||||
file_path: str,
|
||||
@@ -145,6 +199,8 @@ class DocumentProcessor:
|
||||
* residual_value: Estimated value at end of life (typically 10% of purchase price for equipment, 20% for vehicles)
|
||||
- If is_depreciable is false, set name_of_asset, cca_rate, useful_life, and residual_value to null
|
||||
|
||||
CATEGORY RULES:
|
||||
- Assign the category based on all the details in the receipt
|
||||
Return only valid JSON.
|
||||
"""
|
||||
|
||||
@@ -334,11 +390,16 @@ class DocumentProcessor:
|
||||
def _parse_extraction_result(self, result_text: str) -> Dict[str, Any]:
|
||||
"""Parse Groq response and extract JSON data"""
|
||||
try:
|
||||
# Clean up response and extract JSON
|
||||
import json
|
||||
import re
|
||||
|
||||
# Find JSON in response - try multiple patterns
|
||||
# Try robust JSON extraction first (handles extra text)
|
||||
try:
|
||||
data = self._extract_first_json(result_text)
|
||||
return data
|
||||
except (json.JSONDecodeError, ValueError) as e:
|
||||
logger.warning(f"Robust JSON extraction failed: {e}. Trying fallback methods...")
|
||||
|
||||
# Fallback: Find JSON in response - try multiple patterns
|
||||
json_match = re.search(r"\{.*\}", result_text, re.DOTALL)
|
||||
if json_match:
|
||||
json_str = json_match.group()
|
||||
@@ -355,7 +416,7 @@ class DocumentProcessor:
|
||||
data = json.loads(json_str)
|
||||
except json.JSONDecodeError as e:
|
||||
# Try to fix common JSON issues
|
||||
logger.warning(f"Initial JSON parsing failed: {e}")
|
||||
logger.warning(f"Fallback JSON parsing also failed: {e}")
|
||||
|
||||
# Try to extract individual fields using regex
|
||||
vendor_match = re.search(r'"vendor"\s*:\s*"([^"]*)"', json_str)
|
||||
|
||||
Reference in New Issue
Block a user