feat: Refactor report generation to use async methods and improve error handling; enhance spirometry table extraction with better CSV formatting

This commit is contained in:
bolade
2025-10-04 10:35:02 +01:00
parent 358898b7db
commit 0a735d88c8
5 changed files with 123 additions and 38 deletions
+7 -3
View File
@@ -12,7 +12,6 @@ from pathlib import Path
from fastapi import FastAPI, File, Form, HTTPException, UploadFile from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from pydantic import BaseModel from pydantic import BaseModel
from services.report_generator import ReportGeneratorService from services.report_generator import ReportGeneratorService
app = FastAPI( app = FastAPI(
@@ -138,7 +137,7 @@ async def generate_report(
} }
# Generate report using the service # Generate report using the service
result = report_service.generate_report( result = await report_service.generate_report(
spirometry_pdf_path=str(spirometry_path), spirometry_pdf_path=str(spirometry_path),
pnoe_csv_path=str(pnoe_path), pnoe_csv_path=str(pnoe_path),
seca_excel_path=str(seca_path), seca_excel_path=str(seca_path),
@@ -153,9 +152,14 @@ async def generate_report(
) )
except Exception as e: except Exception as e:
import traceback
error_details = traceback.format_exc()
print(f"ERROR: {error_details}") # This will show in terminal
raise HTTPException( raise HTTPException(
status_code=500, status_code=500,
detail=f"Error generating report: {str(e)}", detail=f"Error generating report: {str(e)}\n{error_details}",
) )
finally: finally:
# Close file handles # Close file handles
+30 -24
View File
@@ -10,7 +10,7 @@ from typing import Any, Dict, List
import pandas as pd import pandas as pd
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from playwright.sync_api import sync_playwright from playwright.async_api import async_playwright
from services.context_generator import ContextGenerator from services.context_generator import ContextGenerator
from services.graph_generator import GraphGenerator from services.graph_generator import GraphGenerator
from services.spirometry_table_extractor import extract_spirometry_table_from_pdf from services.spirometry_table_extractor import extract_spirometry_table_from_pdf
@@ -265,7 +265,7 @@ class ReportGeneratorService:
return html_doc return html_doc
def html_to_pdf(self, html_content: str, pdf_path: str) -> None: async def html_to_pdf(self, html_content: str, pdf_path: str) -> None:
""" """
Convert HTML content to PDF file. Convert HTML content to PDF file.
@@ -273,14 +273,14 @@ class ReportGeneratorService:
html_content: HTML content as string html_content: HTML content as string
pdf_path: Path where PDF should be saved pdf_path: Path where PDF should be saved
""" """
with sync_playwright() as p: async with async_playwright() as p:
browser = p.chromium.launch() browser = await p.chromium.launch()
page = browser.new_page() page = await browser.new_page()
page.set_content(html_content) await page.set_content(html_content)
page.pdf(path=pdf_path, format="A4", print_background=True) await page.pdf(path=pdf_path, format="A4", print_background=True)
browser.close() await browser.close()
def generate_report( async def generate_report(
self, self,
spirometry_pdf_path: str, spirometry_pdf_path: str,
pnoe_csv_path: str, pnoe_csv_path: str,
@@ -309,19 +309,18 @@ class ReportGeneratorService:
Dictionary containing report path, graphs generated, and analysis data Dictionary containing report path, graphs generated, and analysis data
""" """
# Step 1: Extract spirometry table from PDF # Step 1: Extract spirometry table from PDF
spirometry_csv_path = self.data_dir / "extracted_spirometry_table.csv" print("Step 1: Extracting spirometry data from PDF...")
extract_spirometry_table_from_pdf(spirometry_pdf_path) spirometry_csv_path = extract_spirometry_table_from_pdf(
spirometry_pdf_path, output_dir=str(self.data_dir)
# The extraction saves to current directory, move it to data_dir )
import shutil print(f"Spirometry data saved to: {spirometry_csv_path}")
if Path("extracted_spirometry_table.csv").exists():
shutil.move("extracted_spirometry_table.csv", spirometry_csv_path)
# Step 2: Process Pnoe data # Step 2: Process Pnoe data
print("Step 2: Processing Pnoe data...")
df = self.process_pnoe_data(pnoe_csv_path) df = self.process_pnoe_data(pnoe_csv_path)
# Step 3: Generate all graphs # Step 3: Generate all graphs
print("Step 3: Generating graphs...")
graphs_generated = self.generate_graphs(df) graphs_generated = self.generate_graphs(df)
# Create graph dictionary with base64 encoded images # Create graph dictionary with base64 encoded images
@@ -370,13 +369,20 @@ class ReportGeneratorService:
graphs_dict["body_fat_percent"] = body_fat_b64 graphs_dict["body_fat_percent"] = body_fat_b64
# Generate spirometry chart # Generate spirometry chart
spirometry_df = pd.read_csv(spirometry_csv_path) print("Step 4: Generating spirometry chart...")
spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart( try:
spirometry_df, save_as_base64=True spirometry_df = pd.read_csv(spirometry_csv_path)
) print(f"Spirometry data loaded: {len(spirometry_df)} rows")
graphs_dict["spirometry_chart"] = spirometry_chart_b64 spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart(
spirometry_df, save_as_base64=True
)
graphs_dict["spirometry_chart"] = spirometry_chart_b64
except Exception as e:
print(f"Warning: Could not generate spirometry chart: {e}")
graphs_dict["spirometry_chart"] = ""
# Step 4: Generate context for all pages # Step 5: Generate context for all pages
print("Step 5: Generating page contexts...")
self.context_generator.load_data( self.context_generator.load_data(
pnoe_csv_path, str(spirometry_csv_path), seca_excel_path pnoe_csv_path, str(spirometry_csv_path), seca_excel_path
) )
@@ -401,7 +407,7 @@ class ReportGeneratorService:
report_path = self.reports_dir / output_filename report_path = self.reports_dir / output_filename
print(f"Generating PDF report at {report_path}") print(f"Generating PDF report at {report_path}")
self.html_to_pdf(html_content, str(report_path)) await self.html_to_pdf(html_content, str(report_path))
return { return {
"report_path": str(report_path), "report_path": str(report_path),
+86 -11
View File
@@ -13,7 +13,21 @@ def encode_pdf_to_base64(pdf_path):
return base64.b64encode(pdf_file.read()).decode("utf-8") return base64.b64encode(pdf_file.read()).decode("utf-8")
def extract_spirometry_table_from_pdf(pdf_path): def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"):
"""
Extract spirometry table from PDF using AI and save as clean CSV.
Args:
pdf_path: Path to the spirometry PDF file
output_dir: Directory to save the extracted CSV
Returns:
Path to the saved CSV file
"""
import csv
import re
from pathlib import Path
url = "https://openrouter.ai/api/v1/chat/completions" url = "https://openrouter.ai/api/v1/chat/completions"
headers = { headers = {
"Authorization": f"Bearer {API_KEY_REF}", "Authorization": f"Bearer {API_KEY_REF}",
@@ -30,10 +44,17 @@ def extract_spirometry_table_from_pdf(pdf_path):
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "Please extract the Spirometry table from the pdf and return the values in csv format, " "text": "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. "
"note that it is the unit of parameter that is beside it and it should not be a column. " "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
"The '-' Should be treated as empty values." "Rules:\n"
"do not add 'csv' at the start or end of the response", "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n"
"2. Do NOT include units in the data (units are part of parameter name)\n"
"3. Use empty string for missing values (not '-' or 'N/A')\n"
"4. Do NOT add 'csv' markers or code blocks\n"
"5. First line should be the header\n"
"Example format:\n"
"Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
"FVC,4.50,4.75,3.20,4.80,99,-0.10",
}, },
{ {
"type": "file", "type": "file",
@@ -54,11 +75,65 @@ def extract_spirometry_table_from_pdf(pdf_path):
if "choices" in response_data and len(response_data["choices"]) > 0: if "choices" in response_data and len(response_data["choices"]) > 0:
content = response_data["choices"][0]["message"]["content"] content = response_data["choices"][0]["message"]["content"]
# Save to a CSV file # Clean the content - remove markdown code blocks if present
output_file = "extracted_spirometry_table.csv" content = re.sub(r"```csv\n?", "", content)
with open(output_file, "w", encoding="utf-8") as f: content = re.sub(r"```\n?", "", content)
f.write(content) content = content.strip()
return f"Extracted table saved to {output_file}" # Parse and validate CSV
lines = content.split("\n")
if not lines:
raise ValueError("No data extracted from PDF")
# Ensure output directory exists
output_path = Path(output_dir)
output_path.mkdir(exist_ok=True)
output_file = output_path / "extracted_spirometry_table.csv"
# Write cleaned CSV with proper formatting
with open(output_file, "w", encoding="utf-8", newline="") as f:
# Parse the first line as header
header_line = lines[0].strip()
if "," in header_line:
header = [col.strip() for col in header_line.split(",")]
else:
# Default header if not provided
header = [
"Parameters",
"Pre",
"Best",
"LLN",
"Pred.",
"%Pred.",
"ZScore",
]
writer = csv.writer(f)
writer.writerow(header)
# Process data rows
for line in lines[1:]:
line = line.strip()
if not line:
continue
# Split by comma and clean each field
fields = [field.strip() for field in line.split(",")]
# Ensure we have the right number of fields
if len(fields) < len(header):
# Pad with empty strings
fields.extend([""] * (len(header) - len(fields)))
elif len(fields) > len(header):
# Take only the first N fields
fields = fields[: len(header)]
# Replace '-' or 'N/A' with empty string
fields = ["" if f in ["-", "N/A", "n/a", "NA"] else f for f in fields]
writer.writerow(fields)
return str(output_file)
else: else:
return "No content found in response" error_msg = response_data.get("error", {}).get("message", "Unknown error")
raise Exception(f"No content found in response: {error_msg}")