140 lines
4.6 KiB
Python
140 lines
4.6 KiB
Python
import base64
|
|
import os
|
|
|
|
import requests
|
|
from dotenv import load_dotenv
|
|
|
|
# Run python-dotenv's loader before reading the environment so that the
# lookup below happens after any dotenv-provided variables are in place.
load_dotenv()

# Credential sent as the Bearer token to OpenRouter; None when the
# OPENROUTER_API_KEY variable is not set.
API_KEY_REF = os.environ.get("OPENROUTER_API_KEY")
|
|
|
|
|
|
def encode_pdf_to_base64(pdf_path):
    """Return the contents of the file at ``pdf_path`` as base64 text.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded as UTF-8 so the value can be embedded in a string.
    """
    with open(pdf_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
|
|
|
|
|
# Upper bound (seconds) on the HTTP call; without one, requests waits forever.
_REQUEST_TIMEOUT_SECONDS = 120

# Header used when the model's first line is not a comma-separated header.
_DEFAULT_HEADER = ("Parameters", "Pre", "Best", "LLN", "Pred.", "%Pred.", "ZScore")

# Placeholders the model may emit for a missing value; written out as "".
_MISSING_MARKERS = frozenset({"-", "N/A", "n/a", "NA"})


def _build_extraction_messages(data_url):
    """Return the chat messages asking the model for the spirometry CSV."""
    prompt = (
        "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. "
        "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
        "Rules:\n"
        "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n"
        "2. Do NOT include units in the data (units are part of parameter name)\n"
        "3. Use empty string for missing values (not '-' or 'N/A')\n"
        "4. Do NOT add 'csv' markers or code blocks\n"
        "5. First line should be the header\n"
        "Example format:\n"
        "Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n"
        "FVC,4.50,4.75,3.20,4.80,99,-0.10"
    )
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "file",
                    "file": {"filename": "document.pdf", "file_data": data_url},
                },
            ],
        }
    ]


def _extract_message_content(response_data):
    """Return the first choice's text, or raise with the API's error message."""
    is_mapping = isinstance(response_data, dict)
    choices = response_data.get("choices") if is_mapping else None
    if not choices:
        error = response_data.get("error") if is_mapping else None
        # The error payload is normally a dict, but tolerate a bare string.
        if isinstance(error, dict):
            error_msg = error.get("message", "Unknown error")
        else:
            error_msg = str(error) if error else "Unknown error"
        raise Exception(f"No content found in response: {error_msg}")

    content = choices[0]["message"]["content"]
    if not isinstance(content, str):
        # A null/structured content carries no CSV text to parse.
        raise ValueError("No data extracted from PDF")
    return content


def _parse_spirometry_csv(content):
    """Turn the model's reply into ``(header, rows)`` with uniform row width."""
    import csv
    import re

    # Remove markdown code fences if the model added them despite the prompt.
    cleaned = re.sub(r"```csv\n?", "", content)
    cleaned = re.sub(r"```\n?", "", cleaned)

    lines = [line.strip() for line in cleaned.splitlines() if line.strip()]
    if not lines:
        raise ValueError("No data extracted from PDF")

    # Parse each line on its own: quoted fields containing commas are kept
    # intact, and a stray quote cannot swallow the following lines.
    parsed = [
        [field.strip() for field in next(csv.reader([line], skipinitialspace=True))]
        for line in lines
    ]

    first_row, data_rows = parsed[0], parsed[1:]
    # As before: a first line without commas is not a header and is dropped.
    header = first_row if len(first_row) > 1 else list(_DEFAULT_HEADER)
    width = len(header)

    rows = []
    for fields in data_rows:
        # Pad short rows with "" and truncate long ones to the header width.
        fields = (fields + [""] * width)[:width]
        rows.append(["" if value in _MISSING_MARKERS else value for value in fields])
    return header, rows


def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"):
    """
    Extract spirometry table from PDF using AI and save as clean CSV.

    The PDF is sent base64-encoded to the OpenRouter chat-completions
    endpoint; the model's CSV reply is cleaned (code fences removed, rows
    padded/truncated to the header width, missing-value placeholders
    blanked) and written to ``extracted_spirometry_table.csv``.

    Args:
        pdf_path: Path to the spirometry PDF file
        output_dir: Directory to save the extracted CSV (created, including
            missing parents, if it does not exist)

    Returns:
        Path to the saved CSV file

    Raises:
        ValueError: If the model returned no usable text.
        Exception: If the response has no choices or is not valid JSON; the
            message includes the API's error text when available.
    """
    import csv
    from pathlib import Path

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY_REF}",
        "Content-Type": "application/json",
    }

    # Read and encode the PDF as a data URL for the "file" content part.
    base64_pdf = encode_pdf_to_base64(pdf_path)
    data_url = f"data:application/pdf;base64,{base64_pdf}"

    payload = {
        "model": "google/gemini-2.5-flash-lite",
        "messages": _build_extraction_messages(data_url),
    }

    response = requests.post(
        url, headers=headers, json=payload, timeout=_REQUEST_TIMEOUT_SECONDS
    )
    try:
        response_data = response.json()
    except ValueError as err:
        # Gateways/proxies can answer with HTML or an empty body on failure.
        raise Exception(
            "No content found in response: "
            f"HTTP {response.status_code} returned a non-JSON body"
        ) from err

    content = _extract_message_content(response_data)

    # Validate everything before touching the filesystem so a bad reply
    # never leaves a partial or header-only CSV behind.
    header, rows = _parse_spirometry_csv(content)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    output_file = output_path / "extracted_spirometry_table.csv"

    with open(output_file, "w", encoding="utf-8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)
        writer.writerows(rows)

    return str(output_file)
|