diff --git a/app/main.py b/app/main.py index ae2fcb0..c7ce9ee 100644 --- a/app/main.py +++ b/app/main.py @@ -12,7 +12,6 @@ from pathlib import Path from fastapi import FastAPI, File, Form, HTTPException, UploadFile from fastapi.responses import FileResponse from pydantic import BaseModel - from services.report_generator import ReportGeneratorService app = FastAPI( @@ -138,7 +137,7 @@ async def generate_report( } # Generate report using the service - result = report_service.generate_report( + result = await report_service.generate_report( spirometry_pdf_path=str(spirometry_path), pnoe_csv_path=str(pnoe_path), seca_excel_path=str(seca_path), @@ -153,9 +152,14 @@ async def generate_report( ) except Exception as e: + import traceback + + error_details = traceback.format_exc() + print(f"ERROR: {error_details}") # This will show in terminal + raise HTTPException( status_code=500, - detail=f"Error generating report: {str(e)}", + detail=f"Error generating report: {str(e)}\n{error_details}", ) finally: # Close file handles diff --git a/app/services/__pycache__/report_generator.cpython-312.pyc b/app/services/__pycache__/report_generator.cpython-312.pyc index 00b4f35..6618b06 100644 Binary files a/app/services/__pycache__/report_generator.cpython-312.pyc and b/app/services/__pycache__/report_generator.cpython-312.pyc differ diff --git a/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc b/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc index 3597aac..6e4ba6b 100644 Binary files a/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc and b/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc differ diff --git a/app/services/report_generator.py b/app/services/report_generator.py index 044c6f5..fac8b86 100644 --- a/app/services/report_generator.py +++ b/app/services/report_generator.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List import pandas as pd from jinja2 import Environment, FileSystemLoader -from playwright.sync_api import sync_playwright +from playwright.async_api import async_playwright from services.context_generator import ContextGenerator from services.graph_generator import GraphGenerator from services.spirometry_table_extractor import extract_spirometry_table_from_pdf @@ -265,7 +265,7 @@ class ReportGeneratorService: return html_doc - def html_to_pdf(self, html_content: str, pdf_path: str) -> None: + async def html_to_pdf(self, html_content: str, pdf_path: str) -> None: """ Convert HTML content to PDF file. @@ -273,14 +273,14 @@ class ReportGeneratorService: html_content: HTML content as string pdf_path: Path where PDF should be saved """ - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - page.set_content(html_content) - page.pdf(path=pdf_path, format="A4", print_background=True) - browser.close() + async with async_playwright() as p: + browser = await p.chromium.launch() + page = await browser.new_page() + await page.set_content(html_content) + await page.pdf(path=pdf_path, format="A4", print_background=True) + await browser.close() - def generate_report( + async def generate_report( self, spirometry_pdf_path: str, pnoe_csv_path: str, @@ -309,19 +309,18 @@ class ReportGeneratorService: Dictionary containing report path, graphs generated, and analysis data """ # Step 1: Extract spirometry table from PDF - spirometry_csv_path = self.data_dir / "extracted_spirometry_table.csv" - extract_spirometry_table_from_pdf(spirometry_pdf_path) - - # The extraction saves to current directory, move it to data_dir - import shutil - - if Path("extracted_spirometry_table.csv").exists(): - shutil.move("extracted_spirometry_table.csv", spirometry_csv_path) + print("Step 1: Extracting spirometry data from PDF...") + spirometry_csv_path = extract_spirometry_table_from_pdf( + spirometry_pdf_path, output_dir=str(self.data_dir) + ) + print(f"Spirometry data saved to: {spirometry_csv_path}") # Step 2: Process Pnoe data + print("Step 2: Processing Pnoe data...") df = self.process_pnoe_data(pnoe_csv_path) # Step 3: Generate all graphs + print("Step 3: Generating graphs...") graphs_generated = self.generate_graphs(df) # Create graph dictionary with base64 encoded images @@ -370,13 +369,20 @@ class ReportGeneratorService: graphs_dict["body_fat_percent"] = body_fat_b64 # Generate spirometry chart - spirometry_df = pd.read_csv(spirometry_csv_path) - spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart( - spirometry_df, save_as_base64=True - ) - graphs_dict["spirometry_chart"] = spirometry_chart_b64 + print("Step 4: Generating spirometry chart...") + try: + spirometry_df = pd.read_csv(spirometry_csv_path) + print(f"Spirometry data loaded: {len(spirometry_df)} rows") + spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart( + spirometry_df, save_as_base64=True + ) + graphs_dict["spirometry_chart"] = spirometry_chart_b64 + except Exception as e: + print(f"Warning: Could not generate spirometry chart: {e}") + graphs_dict["spirometry_chart"] = "" - # Step 4: Generate context for all pages + # Step 5: Generate context for all pages + print("Step 5: Generating page contexts...") self.context_generator.load_data( pnoe_csv_path, str(spirometry_csv_path), seca_excel_path ) @@ -401,7 +407,7 @@ class ReportGeneratorService: report_path = self.reports_dir / output_filename print(f"Generating PDF report at {report_path}") - self.html_to_pdf(html_content, str(report_path)) + await self.html_to_pdf(html_content, str(report_path)) return { "report_path": str(report_path), diff --git a/app/services/spirometry_table_extractor.py b/app/services/spirometry_table_extractor.py index 79f3901..f4d416c 100644 --- a/app/services/spirometry_table_extractor.py +++ b/app/services/spirometry_table_extractor.py @@ -13,7 +13,21 @@ def encode_pdf_to_base64(pdf_path): return base64.b64encode(pdf_file.read()).decode("utf-8") -def extract_spirometry_table_from_pdf(pdf_path): +def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"): + """ + Extract spirometry table from PDF using AI and save as clean CSV. + + Args: + pdf_path: Path to the spirometry PDF file + output_dir: Directory to save the extracted CSV + + Returns: + Path to the saved CSV file + """ + import csv + import re + from pathlib import Path + url = "https://openrouter.ai/api/v1/chat/completions" headers = { "Authorization": f"Bearer {API_KEY_REF}", @@ -30,10 +44,17 @@ def extract_spirometry_table_from_pdf(pdf_path): "content": [ { "type": "text", - "text": "Please extract the Spirometry table from the pdf and return the values in csv format, " - "note that it is the unit of parameter that is beside it and it should not be a column. " - "The '-' Should be treated as empty values." - "do not add 'csv' at the start or end of the response", + "text": "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. " + "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n" + "Rules:\n" + "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n" + "2. Do NOT include units in the data (units are part of parameter name)\n" + "3. Use empty string for missing values (not '-' or 'N/A')\n" + "4. Do NOT add 'csv' markers or code blocks\n" + "5. First line should be the header\n" + "Example format:\n" + "Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n" + "FVC,4.50,4.75,3.20,4.80,99,-0.10", }, { "type": "file", @@ -54,11 +75,65 @@ def extract_spirometry_table_from_pdf(pdf_path): if "choices" in response_data and len(response_data["choices"]) > 0: content = response_data["choices"][0]["message"]["content"] - # Save to a CSV file - output_file = "extracted_spirometry_table.csv" - with open(output_file, "w", encoding="utf-8") as f: - f.write(content) + # Clean the content - remove markdown code blocks if present + content = re.sub(r"```csv\n?", "", content) + content = re.sub(r"```\n?", "", content) + content = content.strip() - return f"Extracted table saved to {output_file}" + # Parse and validate CSV + lines = content.split("\n") + if not lines: + raise ValueError("No data extracted from PDF") + + # Ensure output directory exists + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + output_file = output_path / "extracted_spirometry_table.csv" + + # Write cleaned CSV with proper formatting + with open(output_file, "w", encoding="utf-8", newline="") as f: + # Parse the first line as header + header_line = lines[0].strip() + if "," in header_line: + header = [col.strip() for col in header_line.split(",")] + else: + # Default header if not provided + header = [ + "Parameters", + "Pre", + "Best", + "LLN", + "Pred.", + "%Pred.", + "ZScore", + ] + + writer = csv.writer(f) + writer.writerow(header) + + # Process data rows + for line in lines[1:]: + line = line.strip() + if not line: + continue + + # Split by comma and clean each field + fields = [field.strip() for field in line.split(",")] + + # Ensure we have the right number of fields + if len(fields) < len(header): + # Pad with empty strings + fields.extend([""] * (len(header) - len(fields))) + elif len(fields) > len(header): + # Take only the first N fields + fields = fields[: len(header)] + + # Replace '-' or 'N/A' with empty string + fields = ["" if f in ["-", "N/A", "n/a", "NA"] else f for f in fields] + + writer.writerow(fields) + + return str(output_file) else: - return "No content found in response" + error_msg = response_data.get("error", {}).get("message", "Unknown error") + raise Exception(f"No content found in response: {error_msg}")