65 lines
1.9 KiB
Python
65 lines
1.9 KiB
Python
|
|
import base64
|
||
|
|
import os
|
||
|
|
|
||
|
|
import requests
|
||
|
|
from dotenv import load_dotenv
|
||
|
|
|
||
|
|
# Run python-dotenv's loader first so the environment lookup below can see
# any values it provides.
load_dotenv()

# Key sent as the Bearer token to OpenRouter; None when OPENROUTER_API_KEY
# is not present in the environment.
API_KEY_REF = os.environ.get("OPENROUTER_API_KEY")
|
||
|
|
|
||
|
|
|
||
|
|
def encode_pdf_to_base64(pdf_path):
    """Return the file at ``pdf_path`` as a base64-encoded text string.

    The file is opened in binary mode and read in full; the base64 bytes are
    then decoded as UTF-8 so the caller receives a ``str``.
    """
    with open(pdf_path, "rb") as source:
        raw_bytes = source.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
||
|
|
|
||
|
|
|
||
|
|
def extract_spirometry_table_from_pdf(
    pdf_path,
    *,
    output_file="extracted_spirometry_table.csv",
    model="google/gemini-2.5-flash-lite",
    timeout=120,
):
    """Ask an OpenRouter chat model to extract the spirometry table from a PDF.

    The PDF is base64-encoded into a ``data:`` URL and posted, together with
    a text prompt, to the OpenRouter chat-completions endpoint. The text of
    the first returned choice is written unchanged to ``output_file``.

    Args:
        pdf_path: Path of the PDF file to send.
        output_file: Destination for the returned CSV text (written as UTF-8).
        model: Model identifier placed in the request payload.
        timeout: Seconds passed to ``requests.post`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        ``"Extracted table saved to <output_file>"`` when the response
        carried non-empty text, otherwise ``"No content found in response"``.

    Raises:
        OSError: If the PDF cannot be read or the output file cannot be
            written.
        requests.RequestException: If the HTTP request fails or times out.
        ValueError: If the response body cannot be parsed as JSON.
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {API_KEY_REF}",
        "Content-Type": "application/json",
    }

    # Read and encode the PDF
    base64_pdf = encode_pdf_to_base64(pdf_path)
    data_url = f"data:application/pdf;base64,{base64_pdf}"

    # Adjacent literals are concatenated with no separator, so every
    # sentence except the last must end with its own trailing space.
    prompt = (
        "Please extract the Spirometry table from the pdf and return the values in csv format, "
        "note that it is the unit of parameter that is beside it and it should not be a column. "
        "The '-' Should be treated as empty values. "
        "do not add 'csv' at the start or end of the response"
    )

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "file",
                    "file": {"filename": "document.pdf", "file_data": data_url},
                },
            ],
        }
    ]

    payload = {
        "model": model,
        "messages": messages,
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    response_data = response.json()

    # Any missing or mis-shaped level (no "choices", empty list, no
    # "message"/"content") is reported as "no content" rather than raising.
    try:
        content = response_data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        return "No content found in response"

    # Validate before opening the file: opening in "w" mode truncates it, so
    # an unusable reply must not clobber an existing output file.
    if not isinstance(content, str) or not content.strip():
        return "No content found in response"

    # Save to a CSV file
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)

    return f"Extracted table saved to {output_file}"
|