import base64
import csv
import os
import re
from pathlib import Path

import requests
from dotenv import load_dotenv

load_dotenv()

API_KEY_REF = os.getenv("OPENROUTER_API_KEY")

# Upper bound (seconds) on the API call; without a timeout ``requests`` waits
# indefinitely on a stalled connection.
_DEFAULT_TIMEOUT_SECONDS = 120

# Header written when the first line of the model reply is not a CSV header.
_DEFAULT_HEADER = ("Parameters", "Pre", "Best", "LLN", "Pred.", "%Pred.", "ZScore")

# Placeholder strings that are normalised to an empty CSV cell.
_MISSING_MARKERS = frozenset({"-", "N/A", "n/a", "NA"})

_EXTRACTION_PROMPT = (
    "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. "
    "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
    "Rules:\n"
    "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n"
    "2. Do NOT include units in the data (units are part of parameter name)\n"
    "3. Use empty string for missing values (not '-' or 'N/A')\n"
    "4. Do NOT add 'csv' markers or code blocks\n"
    "5. First line should be the header\n"
    "Example format:\n"
    "Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
    "FVC,4.50,4.75,3.20,4.80,99,-0.10"
)


def encode_pdf_to_base64(pdf_path):
    """Return the bytes of the file at *pdf_path* as a base64 text string."""
    with open(pdf_path, "rb") as pdf_file:
        return base64.b64encode(pdf_file.read()).decode("utf-8")


def _strip_code_fences(content):
    """Remove markdown code-fence markers and surrounding whitespace."""
    content = re.sub(r"```csv\n?", "", content)
    content = re.sub(r"```\n?", "", content)
    return content.strip()


def _split_csv_line(line):
    """Split one CSV line into stripped fields, honouring quoted commas.

    Each line is parsed on its own so that an unbalanced quote can only
    affect that line and never swallows the rows that follow it.
    """
    row = next(csv.reader([line], skipinitialspace=True), [])
    return [field.strip() for field in row]


def _parse_csv_table(content):
    """Turn the model's CSV text into ``(header, rows)``.

    The first line is used as the header when it contains a comma; otherwise
    the default spirometry header is used and that first line is dropped.
    Every data row is padded or truncated to the header width, and the
    placeholder values in ``_MISSING_MARKERS`` become empty strings.

    Raises:
        ValueError: If *content* holds no text once code fences are removed.
    """
    content = _strip_code_fences(content)
    if not content:
        raise ValueError("No data extracted from PDF")

    lines = [line.strip() for line in content.splitlines()]

    header_line = lines[0]
    if "," in header_line:
        header = _split_csv_line(header_line)
    else:
        header = list(_DEFAULT_HEADER)

    width = len(header)
    rows = []
    for line in lines[1:]:
        if not line:
            continue
        # Pad short rows with "" and cut long rows to the header width.
        fields = (_split_csv_line(line) + [""] * width)[:width]
        rows.append(
            ["" if value in _MISSING_MARKERS else value for value in fields]
        )
    return header, rows


def _error_message(response_data):
    """Extract a readable error message from an API error payload."""
    error = response_data.get("error")
    if isinstance(error, dict):
        return error.get("message", "Unknown error")
    # Some failures carry a plain string (or nothing) instead of an object.
    return str(error) if error else "Unknown error"


def extract_spirometry_table_from_pdf(
    pdf_path, output_dir="data", timeout=_DEFAULT_TIMEOUT_SECONDS
):
    """
    Extract spirometry table from PDF using AI and save as clean CSV.

    The PDF is sent base64-encoded to the OpenRouter chat-completions
    endpoint; the CSV text in the reply is cleaned and written to
    ``extracted_spirometry_table.csv`` inside *output_dir*.

    Args:
        pdf_path: Path to the spirometry PDF file
        output_dir: Directory to save the extracted CSV (created, including
            missing parents, if it does not exist)
        timeout: Seconds to wait for the API before ``requests`` gives up

    Returns:
        Path to the saved CSV file, as a string

    Raises:
        ValueError: If the reply contains no usable text.
        RuntimeError: If the reply contains no ``choices``; the message
            includes the API's error message when one is present.
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY_REF}",
        "Content-Type": "application/json",
    }

    # Read and encode the PDF as a data URL
    base64_pdf = encode_pdf_to_base64(pdf_path)
    data_url = f"data:application/pdf;base64,{base64_pdf}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": _EXTRACTION_PROMPT},
                {
                    "type": "file",
                    "file": {"filename": "document.pdf", "file_data": data_url},
                },
            ],
        }
    ]

    payload = {
        "model": "google/gemini-2.5-flash-lite",
        "messages": messages,
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response_data = response.json()

    choices = response_data.get("choices")
    if not choices:
        raise RuntimeError(
            f"No content found in response: {_error_message(response_data)}"
        )

    content = choices[0]["message"]["content"]
    if not isinstance(content, str):
        # A null or non-text reply cannot be parsed as CSV.
        raise ValueError("No data extracted from PDF")

    # Parse before touching the filesystem so a bad reply leaves no file behind.
    header, rows = _parse_csv_table(content)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / "extracted_spirometry_table.csv"

    with open(output_file, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(rows)

    return str(output_file)