{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b18c1027",
   "metadata": {},
   "outputs": [],
   "source": [
    "import base64\n",
    "import json\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import requests\n",
    "\n",
    "# The API key is read from the environment so the secret never lives in the notebook.\n",
    "# The key that used to be hard-coded in this cell is exposed and should be revoked.\n",
    "API_KEY_REF = os.environ.get(\"OPENROUTER_API_KEY\")\n",
    "if not API_KEY_REF:\n",
    "    raise RuntimeError(\"Set the OPENROUTER_API_KEY environment variable before running this cell.\")\n",
    "\n",
    "\n",
    "def encode_pdf_to_base64(pdf_path):\n",
    "    \"\"\"Read the file at pdf_path in binary mode and return its bytes as a base64 str.\"\"\"\n",
    "    with open(pdf_path, \"rb\") as pdf_file:\n",
    "        return base64.b64encode(pdf_file.read()).decode('utf-8')\n",
    "\n",
    "\n",
    "url = \"https://openrouter.ai/api/v1/chat/completions\"\n",
    "headers = {\n",
    "    \"Authorization\": f\"Bearer {API_KEY_REF}\",\n",
    "    \"Content-Type\": \"application/json\"\n",
    "}\n",
    "\n",
    "# Read and encode the PDF\n",
    "pdf_path = \"data/~Moran~K~19910201~Spirometry Exam~20250729~20250729032843.pdf\"\n",
    "base64_pdf = encode_pdf_to_base64(pdf_path)\n",
    "data_url = f\"data:application/pdf;base64,{base64_pdf}\"\n",
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\n",
    "                \"type\": \"text\",\n",
    "                \"text\": \"Please extract the Spirometry table from the pdf and return the values in csv format, \"\n",
    "                        \"note that it is the unit of parameter that is beside it and it should not be a column. \"\n",
    "                        \"The '-' Should be treated as empty values.\"\n",
    "                        \"do not add 'csv' at the start or end of the response\"\n",
    "            },\n",
    "            {\n",
    "                \"type\": \"file\",\n",
    "                \"file\": {\n",
    "                    \"filename\": \"document.pdf\",\n",
    "                    \"file_data\": data_url\n",
    "                }\n",
    "            },\n",
    "        ]\n",
    "    }\n",
    "]\n",
    "\n",
    "# Optional: Configure PDF processing engine\n",
    "# PDF parsing will still work even if the plugin is not explicitly set\n",
    "# NOTE: `plugins` is defined here but is NOT attached to `payload` below, so the\n",
    "# request is sent without it. Add \"plugins\": plugins to the payload to opt in.\n",
    "plugins = [\n",
    "    {\n",
    "        \"id\": \"file-parser\",\n",
    "        \"pdf\": {\n",
    "            \"engine\": \"pdf-text\"  # defaults to \"mistral-ocr\". See Pricing above\n",
    "        }\n",
    "    }\n",
    "]\n",
    "\n",
    "payload = {\n",
    "    \"model\": \"google/gemini-2.5-flash-lite\",\n",
    "    \"messages\": messages,\n",
    "}\n",
    "\n",
    "# A timeout keeps the cell from blocking the kernel forever if the API never answers.\n",
    "response = requests.post(url, headers=headers, json=payload, timeout=120)\n",
    "# Get the response content\n",
    "response_data = response.json()\n",
    "print(response_data)\n",
    "\n",
    "# Extract the content from the response\n",
    "if 'choices' in response_data and len(response_data['choices']) > 0:\n",
    "    content = response_data['choices'][0]['message']['content']\n",
    "\n",
    "    # Save to a CSV file\n",
    "    output_file = \"extracted_table.csv\"\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        f.write(content)\n",
    "\n",
    "    print(f\"Content saved to {output_file}\")\n",
    "else:\n",
    "    print(\"No content found in response\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56a9d655",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "spirometry_df = pd.read_csv(\"extracted_table.csv\")\n",
    "\n",
    "\n",
    "def spirometry_value(parameter, column):\n",
    "    \"\"\"Return the `column` entry of the first row whose 'Parameters' equals `parameter`.\n",
    "\n",
    "    The CSV is produced by an LLM, so its layout is not guaranteed; a missing row\n",
    "    raises an IndexError that names the parameter instead of an opaque one.\n",
    "    \"\"\"\n",
    "    matches = spirometry_df.loc[spirometry_df['Parameters'] == parameter, column]\n",
    "    if matches.empty:\n",
    "        raise IndexError(f\"No row with Parameters == {parameter!r} in extracted_table.csv\")\n",
    "    return matches.values[0]\n",
    "\n",
    "\n",
    "fvc_best = spirometry_value('FVC', 'Best')\n",
    "fvc_pred = spirometry_value('FVC', '%Pred.')\n",
    "\n",
    "fev1_best = spirometry_value('FEV1', 'Best')\n",
    "fev1_pred = spirometry_value('FEV1', '%Pred.')\n",
    "\n",
    "fev1_fevc_best = spirometry_value('FEV1/FVC%', 'Best')\n",
    "fev1_fevc_pred = spirometry_value('FEV1/FVC%', '%Pred.')\n",
    "\n",
    "print(f\"FVC Best: {fvc_best}, FVC Pred: {fvc_pred}\")\n",
    "print(f\"FEV1 Best: {fev1_best}, FEV1 Pred: {fev1_pred}\")\n",
    "print(f\"FEV1/FVC% Best: {fev1_fevc_best}, FEV1/FVC% Pred: {fev1_fevc_pred}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "990f4b4f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peak VT from the raw (unsmoothed) samples, and the HR recorded on that same row.\n",
    "df = pd.read_csv('data/Pnoe_20250729_1550-Moran_Keirstyn.csv', delimiter=';')\n",
    "peak_vt = df['VT(l)'].max()\n",
    "max_vt_row = df.loc[df['VT(l)'].idxmax()]\n",
    "print(f\"Peak VT: {peak_vt}\")\n",
    "hr = max_vt_row['HR(bpm)']\n",
    "print(f\"HR at Peak VT: {hr}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "041cbc3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('data/Pnoe_20250729_1550-Moran_Keirstyn.csv', delimiter=';')\n",
    "\n",
    "\n",
    "def _to_numeric_if_possible(column):\n",
    "    \"\"\"Return `column` converted by pd.to_numeric, or unchanged if it cannot be parsed.\"\"\"\n",
    "    try:\n",
    "        return pd.to_numeric(column)\n",
    "    except (ValueError, TypeError):\n",
    "        return column\n",
    "\n",
    "\n",
    "# Convert columns to numeric where possible; columns that cannot be parsed are left as-is.\n",
    "# This replaces the deprecated errors='ignore' option of pd.to_numeric.\n",
    "df = df.apply(_to_numeric_if_possible)\n",
    "df['VO2 Pulse'] = df['VO2(ml/min)'] / df['HR(bpm)']  # VO2 Pulse in mL/beat\n",
    "df['VO2 Breath'] = df['VO2(ml/min)'] / df['BF(bpm)']  # VO2 per Breath in mL/breath\n",
    "df['CHO'] = df['EE(kcal/min)'] * df['CARBS(%)']/100\n",
    "df['FAT'] = df['EE(kcal/min)'] * df['FAT(%)']/100\n",
    "# Smooth key columns using rolling window\n",
    "window_size = 10\n",
    "\n",
    "# List of columns to smooth\n",
    "columns_to_smooth = ['VO2(ml/min)', 'VCO2(ml/min)', 'HR(bpm)', 'VT(l)', 'BF(bpm)', 'VE(l/min)', 'VO2 Pulse', 'VO2 Breath', 'CHO', 'FAT']\n",
    "\n",
    "# Apply smoothing to each column\n",
    "for col in columns_to_smooth:\n",
    "    if col in df.columns:\n",
    "        df[f'{col}_smoothed'] = df[col].rolling(window=window_size).mean()\n",
    "\n",
    "# These assignments overwrite peak_vt, max_vt_row and hr from the previous cell\n",
    "# with values taken from the smoothed series.\n",
    "peak_vt = df['VT(l)_smoothed'].max()\n",
    "max_vt_row = df.loc[df['VT(l)_smoothed'].idxmax()]\n",
    "print(f\"Peak VT: {peak_vt}\")\n",
    "hr = max_vt_row['HR(bpm)_smoothed']\n",
    "print(f\"HR at Peak VT: {hr}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de7cadd1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uses whichever peak_vt was assigned last; when the notebook is run top to bottom\n",
    "# that is the smoothed peak from the previous cell.\n",
    "percent_fev = (peak_vt / fev1_best) * 100\n",
    "print(f\"Percent FEV: {percent_fev}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "cb972ed3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
| \n", " | MeasurementDate | \n", "Comment | \n", "ExternalDeviceId | \n", "ExternalPatientId | \n", "FirstName | \n", "LastName | \n", "BirthDate | \n", "Age | \n", "Ethnicity | \n", "Gender | \n", "... | \n", "Child_XC | \n", "Child_XC_Unit | \n", "Child_BIVA_ZRh | \n", "Child_BIVA_ZXcH | \n", "Child_PhA | \n", "Child_PhA_Unit | \n", "Child_REE_Kcal | \n", "Child_REE_MJ | \n", "Child_TEE_Kcal | \n", "Child_TEE_MJ | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 13 | \n", "2025-07-29T18:58:54.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "KM6479696509 | \n", "Keirstyn | \n", "Moran | \n", "1991-02-01T00:00:00.0000000Z | \n", "34 | \n", "Caucasian | \n", "Female | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
1 rows × 147 columns
\n", "