feat: Refactor report generation to use async methods and improve error handling; enhance spirometry table extraction with better CSV formatting

2025-10-04 10:35:02 +01:00
parent 358898b7db
commit 0a735d88c8
5 changed files with 123 additions and 38 deletions
@@ -12,7 +12,6 @@ from pathlib import Path
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import FileResponse
 from pydantic import BaseModel
 from services.report_generator import ReportGeneratorService
 app = FastAPI(
@@ -138,7 +137,7 @@ async def generate_report(
            }
            # Generate report using the service
-            result = report_service.generate_report(
+            result = await report_service.generate_report(
                spirometry_pdf_path=str(spirometry_path),
                pnoe_csv_path=str(pnoe_path),
                seca_excel_path=str(seca_path),
@@ -153,9 +152,14 @@ async def generate_report(
            )
        except Exception as e:
            import traceback
            error_details = traceback.format_exc()
            print(f"ERROR: {error_details}")  # This will show in terminal
            raise HTTPException(
                status_code=500,
-                detail=f"Error generating report: {str(e)}",
+                detail=f"Error generating report: {str(e)}\n{error_details}",
            )
        finally:
            # Close file handles
@@ -10,7 +10,7 @@ from typing import Any, Dict, List
 import pandas as pd
 from jinja2 import Environment, FileSystemLoader
-from playwright.sync_api import sync_playwright
+from playwright.async_api import async_playwright
 from services.context_generator import ContextGenerator
 from services.graph_generator import GraphGenerator
 from services.spirometry_table_extractor import extract_spirometry_table_from_pdf
@@ -265,7 +265,7 @@ class ReportGeneratorService:
        return html_doc
-    def html_to_pdf(self, html_content: str, pdf_path: str) -> None:
+    async def html_to_pdf(self, html_content: str, pdf_path: str) -> None:
        """
        Convert HTML content to PDF file.
@@ -273,14 +273,14 @@ class ReportGeneratorService:
            html_content: HTML content as string
            pdf_path: Path where PDF should be saved
        """
-        with sync_playwright() as p:
+        async with async_playwright() as p:
-            browser = p.chromium.launch()
+            browser = await p.chromium.launch()
-            page = browser.new_page()
+            page = await browser.new_page()
-            page.set_content(html_content)
+            await page.set_content(html_content)
-            page.pdf(path=pdf_path, format="A4", print_background=True)
+            await page.pdf(path=pdf_path, format="A4", print_background=True)
-            browser.close()
+            await browser.close()
-    def generate_report(
+    async def generate_report(
        self,
        spirometry_pdf_path: str,
        pnoe_csv_path: str,
@@ -309,19 +309,18 @@ class ReportGeneratorService:
            Dictionary containing report path, graphs generated, and analysis data
        """
        # Step 1: Extract spirometry table from PDF
-        spirometry_csv_path = self.data_dir / "extracted_spirometry_table.csv"
+        print("Step 1: Extracting spirometry data from PDF...")
-        extract_spirometry_table_from_pdf(spirometry_pdf_path)
+        spirometry_csv_path = extract_spirometry_table_from_pdf(
-
+            spirometry_pdf_path, output_dir=str(self.data_dir)
-        # The extraction saves to current directory, move it to data_dir
+        )
-        import shutil
+        print(f"Spirometry data saved to: {spirometry_csv_path}")
        if Path("extracted_spirometry_table.csv").exists():
            shutil.move("extracted_spirometry_table.csv", spirometry_csv_path)
        # Step 2: Process Pnoe data
        print("Step 2: Processing Pnoe data...")
        df = self.process_pnoe_data(pnoe_csv_path)
        # Step 3: Generate all graphs
        print("Step 3: Generating graphs...")
        graphs_generated = self.generate_graphs(df)
        # Create graph dictionary with base64 encoded images
@@ -370,13 +369,20 @@ class ReportGeneratorService:
            graphs_dict["body_fat_percent"] = body_fat_b64
        # Generate spirometry chart
-        spirometry_df = pd.read_csv(spirometry_csv_path)
+        print("Step 4: Generating spirometry chart...")
-        spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart(
+        try:
-            spirometry_df, save_as_base64=True
+            spirometry_df = pd.read_csv(spirometry_csv_path)
-        )
+            print(f"Spirometry data loaded: {len(spirometry_df)} rows")
-        graphs_dict["spirometry_chart"] = spirometry_chart_b64
+            spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart(
                spirometry_df, save_as_base64=True
            )
            graphs_dict["spirometry_chart"] = spirometry_chart_b64
        except Exception as e:
            print(f"Warning: Could not generate spirometry chart: {e}")
            graphs_dict["spirometry_chart"] = ""
-        # Step 4: Generate context for all pages
+        # Step 5: Generate context for all pages
        print("Step 5: Generating page contexts...")
        self.context_generator.load_data(
            pnoe_csv_path, str(spirometry_csv_path), seca_excel_path
        )
@@ -401,7 +407,7 @@ class ReportGeneratorService:
        report_path = self.reports_dir / output_filename
        print(f"Generating PDF report at {report_path}")
-        self.html_to_pdf(html_content, str(report_path))
+        await self.html_to_pdf(html_content, str(report_path))
        return {
            "report_path": str(report_path),
@@ -13,7 +13,21 @@ def encode_pdf_to_base64(pdf_path):
        return base64.b64encode(pdf_file.read()).decode("utf-8")
-def extract_spirometry_table_from_pdf(pdf_path):
+def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"):
    """
    Extract spirometry table from PDF using AI and save as clean CSV.
    Args:
        pdf_path: Path to the spirometry PDF file
        output_dir: Directory to save the extracted CSV
    Returns:
        Path to the saved CSV file
    """
    import csv
    import re
    from pathlib import Path
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY_REF}",
@@ -30,10 +44,17 @@ def extract_spirometry_table_from_pdf(pdf_path):
            "content": [
                {
                    "type": "text",
-                    "text": "Please extract the Spirometry table from the pdf and return the values in csv format, "
+                    "text": "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. "
-                    "note that it is the unit of parameter that is beside it and it should not be a column. "
+                    "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
-                    "The '-' Should be treated as empty values."
+                    "Rules:\n"
-                    "do not add 'csv' at the start or end of the response",
+                    "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n"
                    "2. Do NOT include units in the data (units are part of parameter name)\n"
                    "3. Use empty string for missing values (not '-' or 'N/A')\n"
                    "4. Do NOT add 'csv' markers or code blocks\n"
                    "5. First line should be the header\n"
                    "Example format:\n"
                    "Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
                    "FVC,4.50,4.75,3.20,4.80,99,-0.10",
                },
                {
                    "type": "file",
@@ -54,11 +75,65 @@ def extract_spirometry_table_from_pdf(pdf_path):
    if "choices" in response_data and len(response_data["choices"]) > 0:
        content = response_data["choices"][0]["message"]["content"]
-        # Save to a CSV file
+        # Clean the content - remove markdown code blocks if present
-        output_file = "extracted_spirometry_table.csv"
+        content = re.sub(r"```csv\n?", "", content)
-        with open(output_file, "w", encoding="utf-8") as f:
+        content = re.sub(r"```\n?", "", content)
-            f.write(content)
+        content = content.strip()
-        return f"Extracted table saved to {output_file}"
+        # Parse and validate CSV
        lines = content.split("\n")
        if not lines:
            raise ValueError("No data extracted from PDF")
        # Ensure output directory exists
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)
        output_file = output_path / "extracted_spirometry_table.csv"
        # Write cleaned CSV with proper formatting
        with open(output_file, "w", encoding="utf-8", newline="") as f:
            # Parse the first line as header
            header_line = lines[0].strip()
            if "," in header_line:
                header = [col.strip() for col in header_line.split(",")]
            else:
                # Default header if not provided
                header = [
                    "Parameters",
                    "Pre",
                    "Best",
                    "LLN",
                    "Pred.",
                    "%Pred.",
                    "ZScore",
                ]
            writer = csv.writer(f)
            writer.writerow(header)
            # Process data rows
            for line in lines[1:]:
                line = line.strip()
                if not line:
                    continue
                # Split by comma and clean each field
                fields = [field.strip() for field in line.split(",")]
                # Ensure we have the right number of fields
                if len(fields) < len(header):
                    # Pad with empty strings
                    fields.extend([""] * (len(header) - len(fields)))
                elif len(fields) > len(header):
                    # Take only the first N fields
                    fields = fields[: len(header)]
                # Replace '-' or 'N/A' with empty string
                fields = ["" if f in ["-", "N/A", "n/a", "NA"] else f for f in fields]
                writer.writerow(fields)
        return str(output_file)
    else:
-        return "No content found in response"
+        error_msg = response_data.get("error", {}).get("message", "Unknown error")
        raise Exception(f"No content found in response: {error_msg}")