Add body fat analysis graph for page 1

This commit is contained in:
bolade
2025-09-24 09:57:15 +01:00
parent 4753276778
commit 845a7ca099
24 changed files with 4139 additions and 41 deletions
+302 -2
View File
@@ -2,12 +2,312 @@
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "6eee3ddd",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
"import pandas as pd\n",
"import fitz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "7b50e3ea",
"metadata": {},
"outputs": [],
"source": [
"# Open the source PDF once; the cells below read its pages through `file`.\n",
"# NOTE(review): the file name appears to embed patient identifiers - confirm it may be committed.\n",
"pdf_path = \"data/~Moran~K~19910201~Spirometry Exam~20250729~20250729032843.pdf\"\n",
"file = fitz.open(pdf_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b7e1c3ee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3 image(s) on page 1\n",
"Saved: page_1_image_1.png\n",
"Saved: page_1_image_2.png\n",
"Saved: page_1_image_3.png\n",
"\n",
"Total images extracted: 3\n",
"Images saved in: extracted_images/\n"
]
}
],
"source": [
"import os\n",
"\n",
"# Destination folder for the extracted images (created if missing).\n",
"output_dir = \"extracted_images\"\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"\n",
"# Walk the PDF page by page and write every listed image to disk.\n",
"image_count = 0\n",
"for page_num in range(len(file)):\n",
"    page = file[page_num]\n",
"    image_list = page.get_images()\n",
"\n",
"    # Guard clause: report pages without images and move on.\n",
"    if not image_list:\n",
"        print(f\"No images found on page {page_num + 1}\")\n",
"        continue\n",
"\n",
"    print(f\"Found {len(image_list)} image(s) on page {page_num + 1}\")\n",
"\n",
"    for img_index, img in enumerate(image_list):\n",
"        # The first field of each image entry is the xref handed to extract_image().\n",
"        xref = img[0]\n",
"        base_image = file.extract_image(xref)\n",
"        image_ext = base_image[\"ext\"]\n",
"\n",
"        # File names use 1-based page and image numbers plus the reported extension.\n",
"        image_filename = f\"page_{page_num + 1}_image_{img_index + 1}.{image_ext}\"\n",
"        image_path = os.path.join(output_dir, image_filename)\n",
"\n",
"        # Write the extracted bytes unchanged.\n",
"        with open(image_path, \"wb\") as out_handle:\n",
"            out_handle.write(base_image[\"image\"])\n",
"\n",
"        print(f\"Saved: {image_filename}\")\n",
"        image_count += 1\n",
"\n",
"print(f\"\\nTotal images extracted: {image_count}\")\n",
"print(f\"Images saved in: {output_dir}/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2af9631",
"metadata": {},
"outputs": [],
"source": [
"# Extract text and tables from the PDF\n",
"text_content = \"\"\n",
"tables_data = []\n",
"\n",
"for page_num in range(len(file)):\n",
"    page = file[page_num]\n",
"\n",
"    # Append the page's plain text under a 1-based page header.\n",
"    page_text = page.get_text()\n",
"    text_content += f\"\\n--- Page {page_num + 1} ---\\n\"\n",
"    text_content += page_text\n",
"\n",
"    # Table detection is best effort: a failure is reported and the loop moves on.\n",
"    try:\n",
"        # find_tables() returns a TableFinder, which does not support len();\n",
"        # the detected tables live in its `.tables` list, so count and iterate that.\n",
"        tables = page.find_tables()\n",
"        page_tables = tables.tables\n",
"        if page_tables:\n",
"            print(f\"Found {len(page_tables)} table(s) on page {page_num + 1}\")\n",
"        for i, table in enumerate(page_tables):\n",
"            # Keep each table's rows together with where they were found.\n",
"            table_data = table.extract()\n",
"            tables_data.append({\n",
"                'page': page_num + 1,\n",
"                'table_index': i,\n",
"                'data': table_data\n",
"            })\n",
"            print(f\"Table {i+1} on page {page_num + 1}:\")\n",
"            for row in table_data:\n",
"                print(row)\n",
"            print(\"-\" * 50)\n",
"    except Exception as e:\n",
"        print(f\"Error extracting tables from page {page_num + 1}: {e}\")\n",
"\n",
"print(f\"\\nExtracted text from {len(file)} pages\")\n",
"print(f\"Found {len(tables_data)} tables total\")\n",
"\n",
"# Display first 1000 characters of text content to see what we have\n",
"print(\"\\nFirst 1000 characters of extracted text:\")\n",
"print(text_content[:1000])"
]
},
{