feat: Refactor report generation to use async methods and improve error handling; enhance spirometry table extraction with better CSV formatting
This commit is contained in:
Binary file not shown.
Binary file not shown.
@@ -10,7 +10,7 @@ from typing import Any, Dict, List
|
||||
|
||||
import pandas as pd
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from playwright.sync_api import sync_playwright
|
||||
from playwright.async_api import async_playwright
|
||||
from services.context_generator import ContextGenerator
|
||||
from services.graph_generator import GraphGenerator
|
||||
from services.spirometry_table_extractor import extract_spirometry_table_from_pdf
|
||||
@@ -265,7 +265,7 @@ class ReportGeneratorService:
|
||||
|
||||
return html_doc
|
||||
|
||||
def html_to_pdf(self, html_content: str, pdf_path: str) -> None:
|
||||
async def html_to_pdf(self, html_content: str, pdf_path: str) -> None:
|
||||
"""
|
||||
Convert HTML content to PDF file.
|
||||
|
||||
@@ -273,14 +273,14 @@ class ReportGeneratorService:
|
||||
html_content: HTML content as string
|
||||
pdf_path: Path where PDF should be saved
|
||||
"""
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch()
|
||||
page = browser.new_page()
|
||||
page.set_content(html_content)
|
||||
page.pdf(path=pdf_path, format="A4", print_background=True)
|
||||
browser.close()
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch()
|
||||
page = await browser.new_page()
|
||||
await page.set_content(html_content)
|
||||
await page.pdf(path=pdf_path, format="A4", print_background=True)
|
||||
await browser.close()
|
||||
|
||||
def generate_report(
|
||||
async def generate_report(
|
||||
self,
|
||||
spirometry_pdf_path: str,
|
||||
pnoe_csv_path: str,
|
||||
@@ -309,19 +309,18 @@ class ReportGeneratorService:
|
||||
Dictionary containing report path, graphs generated, and analysis data
|
||||
"""
|
||||
# Step 1: Extract spirometry table from PDF
|
||||
spirometry_csv_path = self.data_dir / "extracted_spirometry_table.csv"
|
||||
extract_spirometry_table_from_pdf(spirometry_pdf_path)
|
||||
|
||||
# The extraction saves to current directory, move it to data_dir
|
||||
import shutil
|
||||
|
||||
if Path("extracted_spirometry_table.csv").exists():
|
||||
shutil.move("extracted_spirometry_table.csv", spirometry_csv_path)
|
||||
print("Step 1: Extracting spirometry data from PDF...")
|
||||
spirometry_csv_path = extract_spirometry_table_from_pdf(
|
||||
spirometry_pdf_path, output_dir=str(self.data_dir)
|
||||
)
|
||||
print(f"Spirometry data saved to: {spirometry_csv_path}")
|
||||
|
||||
# Step 2: Process Pnoe data
|
||||
print("Step 2: Processing Pnoe data...")
|
||||
df = self.process_pnoe_data(pnoe_csv_path)
|
||||
|
||||
# Step 3: Generate all graphs
|
||||
print("Step 3: Generating graphs...")
|
||||
graphs_generated = self.generate_graphs(df)
|
||||
|
||||
# Create graph dictionary with base64 encoded images
|
||||
@@ -370,13 +369,20 @@ class ReportGeneratorService:
|
||||
graphs_dict["body_fat_percent"] = body_fat_b64
|
||||
|
||||
# Generate spirometry chart
|
||||
spirometry_df = pd.read_csv(spirometry_csv_path)
|
||||
spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart(
|
||||
spirometry_df, save_as_base64=True
|
||||
)
|
||||
graphs_dict["spirometry_chart"] = spirometry_chart_b64
|
||||
print("Step 4: Generating spirometry chart...")
|
||||
try:
|
||||
spirometry_df = pd.read_csv(spirometry_csv_path)
|
||||
print(f"Spirometry data loaded: {len(spirometry_df)} rows")
|
||||
spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart(
|
||||
spirometry_df, save_as_base64=True
|
||||
)
|
||||
graphs_dict["spirometry_chart"] = spirometry_chart_b64
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not generate spirometry chart: {e}")
|
||||
graphs_dict["spirometry_chart"] = ""
|
||||
|
||||
# Step 4: Generate context for all pages
|
||||
# Step 5: Generate context for all pages
|
||||
print("Step 5: Generating page contexts...")
|
||||
self.context_generator.load_data(
|
||||
pnoe_csv_path, str(spirometry_csv_path), seca_excel_path
|
||||
)
|
||||
@@ -401,7 +407,7 @@ class ReportGeneratorService:
|
||||
|
||||
report_path = self.reports_dir / output_filename
|
||||
print(f"Generating PDF report at {report_path}")
|
||||
self.html_to_pdf(html_content, str(report_path))
|
||||
await self.html_to_pdf(html_content, str(report_path))
|
||||
|
||||
return {
|
||||
"report_path": str(report_path),
|
||||
|
||||
@@ -13,7 +13,21 @@ def encode_pdf_to_base64(pdf_path):
|
||||
return base64.b64encode(pdf_file.read()).decode("utf-8")
|
||||
|
||||
|
||||
def extract_spirometry_table_from_pdf(pdf_path):
|
||||
def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"):
|
||||
"""
|
||||
Extract spirometry table from PDF using AI and save as clean CSV.
|
||||
|
||||
Args:
|
||||
pdf_path: Path to the spirometry PDF file
|
||||
output_dir: Directory to save the extracted CSV
|
||||
|
||||
Returns:
|
||||
Path to the saved CSV file
|
||||
"""
|
||||
import csv
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
url = "https://openrouter.ai/api/v1/chat/completions"
|
||||
headers = {
|
||||
"Authorization": f"Bearer {API_KEY_REF}",
|
||||
@@ -30,10 +44,17 @@ def extract_spirometry_table_from_pdf(pdf_path):
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Please extract the Spirometry table from the pdf and return the values in csv format, "
|
||||
"note that it is the unit of parameter that is beside it and it should not be a column. "
|
||||
"The '-' Should be treated as empty values."
|
||||
"do not add 'csv' at the start or end of the response",
|
||||
"text": "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. "
|
||||
"The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
|
||||
"Rules:\n"
|
||||
"1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n"
|
||||
"2. Do NOT include units in the data (units are part of parameter name)\n"
|
||||
"3. Use empty string for missing values (not '-' or 'N/A')\n"
|
||||
"4. Do NOT add 'csv' markers or code blocks\n"
|
||||
"5. First line should be the header\n"
|
||||
"Example format:\n"
|
||||
"Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
|
||||
"FVC,4.50,4.75,3.20,4.80,99,-0.10",
|
||||
},
|
||||
{
|
||||
"type": "file",
|
||||
@@ -54,11 +75,65 @@ def extract_spirometry_table_from_pdf(pdf_path):
|
||||
if "choices" in response_data and len(response_data["choices"]) > 0:
|
||||
content = response_data["choices"][0]["message"]["content"]
|
||||
|
||||
# Save to a CSV file
|
||||
output_file = "extracted_spirometry_table.csv"
|
||||
with open(output_file, "w", encoding="utf-8") as f:
|
||||
f.write(content)
|
||||
# Clean the content - remove markdown code blocks if present
|
||||
content = re.sub(r"```csv\n?", "", content)
|
||||
content = re.sub(r"```\n?", "", content)
|
||||
content = content.strip()
|
||||
|
||||
return f"Extracted table saved to {output_file}"
|
||||
# Parse and validate CSV
|
||||
lines = content.split("\n")
|
||||
if not lines:
|
||||
raise ValueError("No data extracted from PDF")
|
||||
|
||||
# Ensure output directory exists
|
||||
output_path = Path(output_dir)
|
||||
output_path.mkdir(exist_ok=True)
|
||||
output_file = output_path / "extracted_spirometry_table.csv"
|
||||
|
||||
# Write cleaned CSV with proper formatting
|
||||
with open(output_file, "w", encoding="utf-8", newline="") as f:
|
||||
# Parse the first line as header
|
||||
header_line = lines[0].strip()
|
||||
if "," in header_line:
|
||||
header = [col.strip() for col in header_line.split(",")]
|
||||
else:
|
||||
# Default header if not provided
|
||||
header = [
|
||||
"Parameters",
|
||||
"Pre",
|
||||
"Best",
|
||||
"LLN",
|
||||
"Pred.",
|
||||
"%Pred.",
|
||||
"ZScore",
|
||||
]
|
||||
|
||||
writer = csv.writer(f)
|
||||
writer.writerow(header)
|
||||
|
||||
# Process data rows
|
||||
for line in lines[1:]:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
# Split by comma and clean each field
|
||||
fields = [field.strip() for field in line.split(",")]
|
||||
|
||||
# Ensure we have the right number of fields
|
||||
if len(fields) < len(header):
|
||||
# Pad with empty strings
|
||||
fields.extend([""] * (len(header) - len(fields)))
|
||||
elif len(fields) > len(header):
|
||||
# Take only the first N fields
|
||||
fields = fields[: len(header)]
|
||||
|
||||
# Replace '-' or 'N/A' with empty string
|
||||
fields = ["" if f in ["-", "N/A", "n/a", "NA"] else f for f in fields]
|
||||
|
||||
writer.writerow(fields)
|
||||
|
||||
return str(output_file)
|
||||
else:
|
||||
return "No content found in response"
|
||||
error_msg = response_data.get("error", {}).get("message", "Unknown error")
|
||||
raise Exception(f"No content found in response: {error_msg}")
|
||||
|
||||
Reference in New Issue
Block a user