{ "cells": [
 { "cell_type": "code", "execution_count": 1, "id": "6eee3ddd", "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "\n",
  "import fitz\n",
  "import pandas as pd"
 ] },
 { "cell_type": "code", "execution_count": 2, "id": "7b50e3ea", "metadata": {}, "outputs": [], "source": [
  "# The document stays open: the cells below read its pages through `file`.\n",
  "file = fitz.open(\"data/~Moran~K~19910201~Spirometry Exam~20250729~20250729032843.pdf\")"
 ] },
 { "cell_type": "code", "execution_count": 4, "id": "b7e1c3ee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Found 3 image(s) on page 1\n", "Saved: page_1_image_1.png\n", "Saved: page_1_image_2.png\n", "Saved: page_1_image_3.png\n", "\n", "Total images extracted: 3\n", "Images saved in: extracted_images/\n" ] } ], "source": [
  "# Save every image embedded in the PDF to `output_dir` as\n",
  "# page_<page>_image_<n>.<ext>, numbering pages and images from 1.\n",
  "output_dir = \"extracted_images\"\n",
  "os.makedirs(output_dir, exist_ok=True)\n",
  "\n",
  "image_count = 0\n",
  "for page_num, page in enumerate(file, start=1):\n",
  "    image_list = page.get_images()\n",
  "    if not image_list:\n",
  "        print(f\"No images found on page {page_num}\")\n",
  "        continue\n",
  "\n",
  "    print(f\"Found {len(image_list)} image(s) on page {page_num}\")\n",
  "    for img_index, img in enumerate(image_list, start=1):\n",
  "        # The first field of each image entry is its xref; extract_image()\n",
  "        # returns a dict holding the raw bytes (\"image\") and extension (\"ext\").\n",
  "        base_image = file.extract_image(img[0])\n",
  "\n",
  "        image_filename = f\"page_{page_num}_image_{img_index}.{base_image['ext']}\"\n",
  "        image_path = os.path.join(output_dir, image_filename)\n",
  "        with open(image_path, \"wb\") as image_file:\n",
  "            image_file.write(base_image[\"image\"])\n",
  "\n",
  "        print(f\"Saved: {image_filename}\")\n",
  "        image_count += 1\n",
  "\n",
  "print(f\"\\nTotal images extracted: {image_count}\")\n",
  "print(f\"Images saved 
in: {output_dir}/\")" ] },
 { "cell_type": "code", "execution_count": null, "id": "e2af9631", "metadata": {}, "outputs": [], "source": [
  "# Extract text and tables from the PDF\n",
  "page_texts = []\n",
  "tables_data = []\n",
  "\n",
  "for page_num, page in enumerate(file, start=1):\n",
  "    # Collect each page's text under a page header\n",
  "    page_texts.append(f\"\\n--- Page {page_num} ---\\n\" + page.get_text())\n",
  "\n",
  "    # Table detection is best-effort: report a failure and move on to the next page\n",
  "    try:\n",
  "        # find_tables() returns a TableFinder, which does not support len();\n",
  "        # the detected tables are in its `.tables` list.\n",
  "        page_tables = page.find_tables().tables\n",
  "        if page_tables:\n",
  "            print(f\"Found {len(page_tables)} table(s) on page {page_num}\")\n",
  "        for i, table in enumerate(page_tables):\n",
  "            table_data = table.extract()\n",
  "            tables_data.append({\n",
  "                'page': page_num,\n",
  "                'table_index': i,\n",
  "                'data': table_data\n",
  "            })\n",
  "            print(f\"Table {i+1} on page {page_num}:\")\n",
  "            for row in table_data:\n",
  "                print(row)\n",
  "            print(\"-\" * 50)\n",
  "    except Exception as e:\n",
  "        print(f\"Error extracting tables from page {page_num}: {e}\")\n",
  "\n",
  "text_content = \"\".join(page_texts)\n",
  "\n",
  "print(f\"\\nExtracted text from {len(file)} pages\")\n",
  "print(f\"Found {len(tables_data)} tables total\")\n",
  "\n",
  "# Display first 1000 characters of text content to see what we have\n",
  "print(\"\\nFirst 1000 characters of extracted text:\")\n",
  "print(text_content[:1000])"
 ] },
 { "cell_type": "code", "execution_count": 4, "id": "d95cd8b1", "metadata": {}, "outputs": [], "source": [ "df = pd.read_excel('data/SECA body comp for all patients.xlsx')" ] },
 { "cell_type": "code", "execution_count": 5, "id": "6bbc907f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data shape: (63, 147)\n", "Columns: ['MeasurementDate', 'Comment', 'ExternalDeviceId', 'ExternalPatientId', 'FirstName', 'LastName', 'BirthDate', 'Age', 'Ethnicity', 'Gender', 'Height', 'Height_Unit', 'Weight', 
'Weight_Unit', 'WaistCircumference', 'WaistCircumference_Unit', 'PAL', 'Adult_BMI', 'Adult_BMI_Unit', 'Adult_FM', 'Adult_FM_Unit', 'Adult_FMP', 'Adult_FMP_Unit', 'Adult_FMI', 'Adult_FMI_Unit', 'Adult_ZFMI', 'Adult_FFM', 'Adult_FFM_Unit', 'Adult_FFMP', 'Adult_FFMP_Unit', 'Adult_FFMI', 'Adult_FFMI_Unit', 'Adult_TBW', 'Adult_TBW_Unit', 'Adult_TBWP', 'Adult_TBWP_Unit', 'Adult_ECW', 'Adult_ECW_Unit', 'Adult_ECWP', 'Adult_ECWP_Unit', 'Adult_ECWbyTBW', 'Adult_ECWbyTBW_Unit', 'Adult_SMM', 'Adult_SMM_Unit', 'Adult_SMMBMIIndependant', 'Adult_SMMBMIIndependant_Unit', 'Adult_SMMP', 'Adult_SMMP_Unit', 'Adult_SMI', 'Adult_SMI_Unit', 'Adult_SMoA', 'Adult_SMoA_Unit', 'Adult_SMoABMIIndependant', 'Adult_SMoABMIIndependant_Unit', 'Adult_ZSMI', 'Adult_SSMMRightArm', 'Adult_SSMMRightArm_Unit', 'Adult_SSMMLeftArm', 'Adult_SSMMLeftArm_Unit', 'Adult_SSMMRightLeg', 'Adult_SSMMRightLeg_Unit', 'Adult_SSMMLeftLeg', 'Adult_SSMMLeftLeg_Unit', 'Adult_SSMMTorso', 'Adult_SSMMTorso_Unit', 'Adult_SSMMRightArmBMIIndependant', 'Adult_SSMMRightArmBMIIndependant_Unit', 'Adult_SSMMLeftArmBMIIndependant', 'Adult_SSMMLeftArmBMIIndependant_Unit', 'Adult_SSMMRightLegBMIIndependant', 'Adult_SSMMRightLegBMIIndependant_Unit', 'Adult_SSMMLeftLegBMIIndependant', 'Adult_SSMMLeftLegBMIIndependant_Unit', 'Adult_SSMMTorsoBMIIndependant', 'Adult_SSMMTorsoBMIIndependant_Unit', 'Adult_ASMM', 'Adult_ASMM_Unit', 'Adult_ASMI', 'Adult_ASMI_Unit', 'Adult_ASMP', 'Adult_ASMP_Unit', 'Adult_R', 'Adult_R_Unit', 'Adult_XC', 'Adult_XC_Unit', 'Adult_BIVA_ZRh', 'Adult_BIVA_ZXcH', 'Adult_PhA', 'Adult_PhA_Unit', 'Adult_VAT', 'Adult_VAT_Unit', 'Adult_TBS', 'Adult_TBS_Unit', 'Adult_TBS_MuscleScore', 'Adult_TBS_MuscleScore_Unit', 'Adult_TBS_FatScore', 'Adult_TBS_FatScore_Unit', 'Adult_REE_Kcal', 'Adult_REE_MJ', 'Adult_TEE_Kcal', 'Adult_TEE_MJ', 'Child_BMI', 'Child_BMI_Unit', 'Child_FM', 'Child_FM_Unit', 'Child_FMP', 'Child_FMP_Unit', 'Child_FMI', 'Child_FMI_Unit', 'Child_ZFMI', 'Child_FFM', 'Child_FFM_Unit', 'Child_FFMP', 
'Child_FFMP_Unit', 'Child_FFMI', 'Child_FFMI_Unit', 'Child_ZFFMI', 'Child_TBW', 'Child_TBW_Unit', 'Child_TBWP', 'Child_TBWP_Unit', 'Child_SMMByKim', 'Child_SMMByKim_Unit', 'Child_SMMP', 'Child_SMMP_Unit', 'Child_SMI', 'Child_SMI_Unit', 'Child_LSTLeftArm', 'Child_LSTLeftArm_Unit', 'Child_LSTRightArm', 'Child_LSTRightArm_Unit', 'Child_LSTLeftLeg', 'Child_LSTLeftLeg_Unit', 'Child_LSTRightLeg', 'Child_LSTRightLeg_Unit', 'Child_R', 'Child_R_Unit', 'Child_XC', 'Child_XC_Unit', 'Child_BIVA_ZRh', 'Child_BIVA_ZXcH', 'Child_PhA', 'Child_PhA_Unit', 'Child_REE_Kcal', 'Child_REE_MJ', 'Child_TEE_Kcal', 'Child_TEE_MJ']\n", "\n", "First few rows:\n" ] }, { "data": { "text/html": [ "
| \n", " | MeasurementDate | \n", "Comment | \n", "ExternalDeviceId | \n", "ExternalPatientId | \n", "FirstName | \n", "LastName | \n", "BirthDate | \n", "Age | \n", "Ethnicity | \n", "Gender | \n", "... | \n", "Child_XC | \n", "Child_XC_Unit | \n", "Child_BIVA_ZRh | \n", "Child_BIVA_ZXcH | \n", "Child_PhA | \n", "Child_PhA_Unit | \n", "Child_REE_Kcal | \n", "Child_REE_MJ | \n", "Child_TEE_Kcal | \n", "Child_TEE_MJ | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "2025-09-05T14:56:27.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "LD5163301170 | \n", "Lucy | \n", "Dibenedetto | \n", "1997-08-28T00:00:00.0000000Z | \n", "28 | \n", "Caucasian | \n", "Female | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 1 | \n", "2025-09-03T13:16:22.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "NS6479273340 | \n", "Niyanta | \n", "Shah | \n", "1985-03-11T00:00:00.0000000Z | \n", "40 | \n", "Other | \n", "Female | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 2 | \n", "2025-09-03T13:14:23.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1985-03-11T00:00:00.0000000Z | \n", "40 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 3 | \n", "2025-08-27T20:57:32.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "1996-04-05T00:00:00.0000000Z | \n", "29 | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 4 | \n", "2025-08-20T14:01:13.0000000Z | \n", "NaN | \n", "10000001583275_0055003f5631501320313557 | \n", "MW4167267833 | \n", "Monica | \n", "Wong | \n", "1985-02-17T00:00:00.0000000Z | \n", "40 | \n", "Asian | \n", "Female | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
5 rows × 147 columns
\n", "