From 0a735d88c8a7ee5625b5217f01ab5b858eeb303b Mon Sep 17 00:00:00 2001 From: bolade Date: Sat, 4 Oct 2025 10:35:02 +0100 Subject: [PATCH] feat: Refactor report generation to use async methods and improve error handling; enhance spirometry table extraction with better CSV formatting --- app/main.py | 10 +- .../report_generator.cpython-312.pyc | Bin 16106 -> 17177 bytes ...spirometry_table_extractor.cpython-312.pyc | Bin 2462 -> 4926 bytes app/services/report_generator.py | 54 +++++----- app/services/spirometry_table_extractor.py | 97 ++++++++++++++++-- 5 files changed, 123 insertions(+), 38 deletions(-) diff --git a/app/main.py b/app/main.py index ae2fcb0..c7ce9ee 100644 --- a/app/main.py +++ b/app/main.py @@ -12,7 +12,6 @@ from pathlib import Path from fastapi import FastAPI, File, Form, HTTPException, UploadFile from fastapi.responses import FileResponse from pydantic import BaseModel - from services.report_generator import ReportGeneratorService app = FastAPI( @@ -138,7 +137,7 @@ async def generate_report( } # Generate report using the service - result = report_service.generate_report( + result = await report_service.generate_report( spirometry_pdf_path=str(spirometry_path), pnoe_csv_path=str(pnoe_path), seca_excel_path=str(seca_path), @@ -153,9 +152,14 @@ async def generate_report( ) except Exception as e: + import traceback + + error_details = traceback.format_exc() + print(f"ERROR: {error_details}") # This will show in terminal + raise HTTPException( status_code=500, - detail=f"Error generating report: {str(e)}", + detail=f"Error generating report: {str(e)}\n{error_details}", ) finally: # Close file handles diff --git a/app/services/__pycache__/report_generator.cpython-312.pyc b/app/services/__pycache__/report_generator.cpython-312.pyc index 00b4f357199a0aa82e2dcc4584d1f891daddec95..6618b06921b5c2f79d82ff955443ea79b08b0221 100644 GIT binary patch delta 3184 zcmZuze@q+K9e?i*+h_ZH_!rp5#y(?0Y>a;pFc3pc14-!;L7HY-m32|-ab5yqK+l(v zu=D5=siIOgC4JhGs7afD)zTeOH*Sqh%=YQv}z74s5m`nzMpFjWWi&N>oJKS#}WlrC{4nJ${(P8m< zJf3|B0XPI8xC|uBRi9fMb2m5-1^7!EfZtP>>9;i+lxs`Wp9dQqBd-(ZFyq-S=b9MT*E`g z<7apr#v_4;4-ZQKbb|B5rViJp)X@txAPeUx2##TEVtlB_Y?#jj{EzExcZ4<9LkX;a{&1%(?=qA^(@XhBH4` zKWu@Q;80ZiE{(b0d4~q*yKpG3{ltN}`$jmlM|+=j622718Jz>eDW}>WHG*h`lTa;ZU+}Iiw|g91{5+g?1~nN1=TRO+tbh&<=CD2NWFEun%Y* z;SD-0M<+rr4=9cKSKOjI8xz0qjV6Ua+GrO5WGHE>ZGicA>l;Q?&=RHgM^0PoZh@Ay z?}6)-io&ioJ71`)No54v8_$_aEPgw@lU9Cp*-i9nheyus+`OTX2HB< zQNK#X)-0PRTUA7D&Q`Fzt+rMw6+EvVAb;I|4owLUfbob@{>+xMVsQh2*B zXh;9)aQGU8#!ZhKXTqjudo`-bAvjgcWQ6Q~U#>y6%C>pjJ%VclYsPVh zYdV#aYoCsHOZJQ!Q}zmNXe>5tVuju6i_7ukb@vB_(5A#Mw9EYDi0ZXA3t^!{Md6+3 zko}lOP$d>vsD1EJxn=udfO9=m^PzIGZ_63eqGqUi$JZ0H<(kCT$M>OS23HijMjr*} zT-=sPRxg+UkdkVYRv>zFo-J>+RrYMpPB5u&=@^cg^Zd$l5nkx8a&vGFRDI$r?x1pU z#qYi!|0iXO9+fh)d}oqBa7Gf-p2=ph7ygy$nT!I&uI4=opexCbWTrD> znuOs`is{)?S*#U*fU6WqyN#_<;!o!_rzLc!>-xfcn_|_y%_>{VV*FHEl!$ld&eW!6 z(n3bSiD`IoW-hyGu34Q>tmdPyrNwESMUo$$IXxxt(=!r(vYK0lucAAZQWZG%6i{+W zY|CXfeKNz3&rD01GZHBTz0v(vN1OO6DTgJ*L5P)*6d|t>LN2FOZRX5GR>~rCVm~^T zkSOs@^m)QpcLeLuSBXgduL(#eW@e|-;~CL{qCF|}W=}i(Bf8de)A%Fe=M6$WMhAO6 z+K&jkgr<6@7dr5%QT+pw1bQydrB;~1$KJs*>suPWUH2!`9aFI*wMJ7uL!NQ3X$i*M zo)4$;?Spyl#T8~>RmR0un0VP_DVYKVQ{cvE(X?lg+E8_@Fp;v`S8~S+?$~Xr=%ehlmZs#Y1^#dw~tF zu}XbiD@=EpVM~m+z<5hcu)qY%=v;EOiL;hCZ-Mh(k#Q_{G4eu@OO=>Zo=KtEe)qz0 zincU91(Y*JT_SHRXun|6}8QY_O7-9Tmk{C@An%EEX#Gdl?8+fZ2}Eh8htTMY7m)Km;lAu={0Ah)x<2Z zv9;#`n#L9rYe=A`rZ0VHTH7~cOm=0PB`A>^V`JmPR;rDC@|;};)Hsv*&pH49f8T#O zXU^phs3(%~804IY)GhH7-b-i|_1y@mo%&C`g|Ta5~i;uNdO6~YDpnvI6*D8Q-6yP_zQ zQQ)*H0$MvKHzKMS5!0Hc464HSV|?^_D5 zJDdyr$9%yN%{iiLj#y@J-Rdb=BROki&03u~xXx4*7+;R@EgsD?b$E4QXntt1cDenl zs?1QH8C++q7n1YI^M{|)nD03ueM-cOG=sx^*dOX|SKe<|_xEbiR^1R?>dJ4pKMMd| zG&I@Blq!G&nVhE&GcG`stRdC8Fg`^d=Ms%&A%de5CpaGL>3HeL-7Apb(|jg z8>_VjVv|hhP-84y0rAVxM8j8N@t=pUCK22wj^UV9cIR+_65b5Wd=0wSXjhu~THG3| zRZ-Gg#Cn+oTF|*ByVA%h&~THVDy_mMP9awzXDr#o#(!g|3N+W`44OI1uJ6tv+{M_- z?5vfu$v%8DIpPV)0jMvaTNlTGIwWG*?Om2*N{S$78iP$iGD&4sruj*c})+nmtF zxmMkBsgfJIp@H*|RuUgM&ApNDAT5g7pdC6&NLAS;d7uX)y#y%+Wq*EO>2N%0HmJR& zn6Y@Y#aqJd6e#(i595405xZsN-pKb74y8hjpd0Zp)s5bdM~x#qPV0>fBVp&wf! z!C{%7tA>?Vz$d#u1HN4VpG4tSduK%Uft7L&G8+ZN|FJ?n&annbtxmm?+%PApSoGFk zt&bFSfz&Bc7#kN;+-Q1sQaCo95{1*F;@F#0siyJt34ye6L^y;clJ;GP)#2@krMid(`#6ucM}KIf|JZ1 zBt+EHz>OWn)yH;7@Ci1)DA zFwc&Olc^bTbY^TiHTu@^sVU)eVqC!@O*(=ebqCQ>Ky|6-!02t9+gVWT2EoX+>C4mE zrjcA=B&+lOpo?Yedh@!zZ4J;nex^;?iufAcQlJw#I+3Tl*BQ%&;rZdLE0OI^=9!VA z1{f^`Hj-l_OUX61HKV{9U+C72o)^@B+k1_^OlPY)*IbDVyB>@bf^E5ATOpXt1(WNH zd%3Ep1fdaX8z@4Ctj>p__4_k))>f0#Me#z>Q;wZ23b)HJRX;E^kp=n(BbZU(}bT2H^BAwk;_a6CckM>5_^e))KgA1O{Vf z{9;>XVm|Q9M63e&I|U;aF}v_R`n7+EvEscveNvcwdqQkNeFGA;iS7)zs9(^-fta^> X6LfBZwoUNLbIqWJGW-sREV29z8YajY diff --git a/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc b/app/services/__pycache__/spirometry_table_extractor.cpython-312.pyc index 3597aac0c69c56b2b0ddc5434085d819f29df277..6e4ba6b79f1fd0de29489b94444d2c5ad25a8c20 100644 GIT binary patch literal 4926 zcmb6dT})fo`P}P&gE1I`Z6M(i%8vuKf&4Y3OG^SYNgK$Llq#%|V|*^y#`t>YUPJJ% zB~V4urm;6YbNL22Ua|SV0(JCXU97Ju}Xzb9S7C zo{j5f^y7L4v542-E^Sk1OiZP(ku3IDcbh4W#*JA29vU}cBR0IpjGN~fujyO5G~#8H zintKMK`cmoVpegCo*NoDKl;|iq4U9kbEkuE3|$JBh^VBD&{Bb){2H1SByF$BAogum z#X*NqhLJS2P)uKh?rdcyX4*L~jy07uv169!aWJc`hN(&5r*UP_1=Naqk;u%k47!9E z_|2oMx=Uz|@#;ntmP~9JQqrXFWv@Ze3rRkq7$-s^fB2wcnK*oqPlN@`D|*6*uwukK zoq1WsoWzsCWJrn>N0ZSwFH$u*ZtF@QBFyjsA)cBGiJ`(0-lLSacD{^Y#k zED9B*GDt$OC@rZ5_7(6a4nwnm9$IXRkyU-cvS$-9h5;t)erT_IU~kUbo3rL$+J0`k zX)oA&9@vlM?MLpK3ij8Y=_uMW75TB+Zi$rMPx}Tex<6SA0}fNT zPzC}KWmrsmCZmI17iVX583;_Q&ls>lGOPQb`5F3%g`hKi$YzWZ<)MBu(tMaPVY6(a zSQ#_6U@NY`l^?R$hO1;Vw!^6ULw1VAHPA8{3wFpBTnn_Ul1-x(tI|g0UN*=^*?Px$ zyA*?&3b`UyQ(o6bTD)Z{ah+T_id~rk9z@q_!|W6avzOuW%+$)YFQU85e&E!jj5AZ4&`%-! z(l3~dQ)Xi=n#AQ^uF*JT>g2juYk5T*$xaOew6DT#m~B9V=<+{6ih8;JMN-gpk@k2SjoIB+8&JkAyTQjbB|88oh@cwW4za;vPlAzaK<`TM zsT<(C`)ONF27Qpz!x(7Q~d|1J}TOcZ7_wR6y_BiHt*X70IcwY!X^47NZ0$jZStR7?Q-sAX8$$I-*3iB-hWKgH*>! z0w+cIoh&F_n)|kQ2q`I1s&1@o2c1eq*ALfV^%SNJ$ZaMAIrXU3pSSKmjEAF3u9-iK^Fa6eDKX*D2j;AnR=1(=|h%hH|ox@`%ySd?^vEE`EfO%Ut z$4gxHPB~@;!ol2#%2m;%4$qsHg^S$r_yrLE<`m9ULrmj06Tcyw-!;0)#N;I>O>?2R9QU)1W`a zqE#C=5f{SKqVvd|-;7i6)}HKK~)#WIQBBeDSD6>s7-HSRk&L zXLwNzP4OatOhsiyMNeZNNJRu%dn%?94xt4?+H&Q}6|m+ziB}brHo^o(TZ}c`Fd`J) zT0YO(O>+@WrkTGQ6{Vmso!+~R5G>zK#j4g1gcW9v(3+oWt(LXmbH$Y4=csqKm~O90 z*x@m%4>xk8Bh#7Ap%L; z-bzd-gt-J)obCl(N`?fNr{H>o^G0|v3D*VR&w1+=9hH;?)+So>$D1Tl-K}BH2v)U5j&;PQVOf* zf+|SEkm^Uq0FOS3#DBwWynvq6BZK`bz3pp#(}wlv!pV)seYcKfeK}9hx_kd4gU)VT zc>QT5a)k|kTGqv0bulep;^7~H|+E3mK72024cQj`$dB^^>-n_%N7R@_ezU#?5 zPA-~2qT0HZ_T~1~rYu=IQg9qtG_5Az52zbz+$*+a+bS+p zw=NpKv^yUd%Kiijm z=}zm~h2Ezp(4NkWPab^=gy&rb?(*r>p!}6AUk{W*k^VAjXkZ|v*4ljo71^H zeM`f@^BgnAeBl}Uv?a^r+K%VB<2Ux*8^|{dDWef{fd3ro;7}6-SMacM1R%= z=6ZUOLDjWO#|rl5oVj_!+48{ImUp&gr`AZp+4sPCIPX0CoB4Zzg7f^M zTaj}Q7OIAF=ArK%UtmzfY394fe$+4wT|H`en|XGYd5-p$_%fV=PaIum+I64&z;(uD z{Ihew1(UyYxz041|H=;dVDf3eex}9z>G3)k|IKBmZM)&jQ4=k*)cj*1^u@2Rf=CY2 zfwB0%Awpl%iV^N4_(u^r1QYcpR8QZp&=Z&=^~7$OUx6G1mBw*$8DR8=77s$Rsbd)C zD`fi$S-wH`uhHIrqKfxy>s7ABiEGjKqd9kXp~|;V@z}^R1I$zFPZ(yHA@9J({{Ziw B_45D# delta 1086 zcmZ8g&2JM&6rb76uJ^<1^#>n`>qJwziZL{)Qz;Nc+e1p5N=R@33Bd(zJmc8G-nC}N z&WCmt4n33#3D6?7Qq!J#h!ifpacU#3y%69KS*Svye}E$nNL=WQYjfbu&dhti_vXFd z%+CBd{aY^ex8qnqh<){C{jbFR)Htl&FD;we_u#s*0n7Ry52lnYC?kfVi&+)@^}0ZR z0vJXRD}3YyzkpvAtm3h4B~rMI8%PLthdCk@YoRKmqA{%F1U9g_tzZk=;TTQ|&DmB~ z6r2(xj5O?q8cr{WWIESI$yOzW4864G`fIZ~*^GV!s<;?DaF7Cw&0A5M%4&V$Gf?NxW=e?OlG zp*bo80Uoqn<$t%Z^PwdV<1*Ozs0ii&gKG)|*8mjlj=Qd218cBcxEOzHK2U5Jzc9br z;&p<$Ub90OYWS$ix~LY=md8Iue!vOhb&sQl_?S=$>i9wrYN+i|uSGba!vKbs32R^? zLp>kMiPeKn6U!{Zhdfjbnw^$Ei>`Y@>Ew+ZmB-Xe zvEelNqVrxk^+oR zy7TFZP|~8aoVe7tz(Ks}ele>~UVw+1p2_ypV6u2nKC@TfFI+m*RDooYz{~UxpUe5#K*r^cV-@$`zIHlO@F=TJ&mW+Q(MQ+{9wOuGWSmX%yzCmcjo&5 znv;(*2TAAd((R>tOV5*qH>!a8I>?^dIkz+O#9eq|EWC7Q-q;eB;co|QAE&d674>mN zTRfX6o79xMQUXRlk&2GRb@97YzL=iABJPGTiU1gZ%FWSA{7*J-offKmy6p2<$yKb^ LRt3(|c>(+fwigNt diff --git a/app/services/report_generator.py b/app/services/report_generator.py index 044c6f5..fac8b86 100644 --- a/app/services/report_generator.py +++ b/app/services/report_generator.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List import pandas as pd from jinja2 import Environment, FileSystemLoader -from playwright.sync_api import sync_playwright +from playwright.async_api import async_playwright from services.context_generator import ContextGenerator from services.graph_generator import GraphGenerator from services.spirometry_table_extractor import extract_spirometry_table_from_pdf @@ -265,7 +265,7 @@ class ReportGeneratorService: return html_doc - def html_to_pdf(self, html_content: str, pdf_path: str) -> None: + async def html_to_pdf(self, html_content: str, pdf_path: str) -> None: """ Convert HTML content to PDF file. @@ -273,14 +273,14 @@ class ReportGeneratorService: html_content: HTML content as string pdf_path: Path where PDF should be saved """ - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - page.set_content(html_content) - page.pdf(path=pdf_path, format="A4", print_background=True) - browser.close() + async with async_playwright() as p: + browser = await p.chromium.launch() + page = await browser.new_page() + await page.set_content(html_content) + await page.pdf(path=pdf_path, format="A4", print_background=True) + await browser.close() - def generate_report( + async def generate_report( self, spirometry_pdf_path: str, pnoe_csv_path: str, @@ -309,19 +309,18 @@ class ReportGeneratorService: Dictionary containing report path, graphs generated, and analysis data """ # Step 1: Extract spirometry table from PDF - spirometry_csv_path = self.data_dir / "extracted_spirometry_table.csv" - extract_spirometry_table_from_pdf(spirometry_pdf_path) - - # The extraction saves to current directory, move it to data_dir - import shutil - - if Path("extracted_spirometry_table.csv").exists(): - shutil.move("extracted_spirometry_table.csv", spirometry_csv_path) + print("Step 1: Extracting spirometry data from PDF...") + spirometry_csv_path = extract_spirometry_table_from_pdf( + spirometry_pdf_path, output_dir=str(self.data_dir) + ) + print(f"Spirometry data saved to: {spirometry_csv_path}") # Step 2: Process Pnoe data + print("Step 2: Processing Pnoe data...") df = self.process_pnoe_data(pnoe_csv_path) # Step 3: Generate all graphs + print("Step 3: Generating graphs...") graphs_generated = self.generate_graphs(df) # Create graph dictionary with base64 encoded images @@ -370,13 +369,20 @@ class ReportGeneratorService: graphs_dict["body_fat_percent"] = body_fat_b64 # Generate spirometry chart - spirometry_df = pd.read_csv(spirometry_csv_path) - spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart( - spirometry_df, save_as_base64=True - ) - graphs_dict["spirometry_chart"] = spirometry_chart_b64 + print("Step 4: Generating spirometry chart...") + try: + spirometry_df = pd.read_csv(spirometry_csv_path) + print(f"Spirometry data loaded: {len(spirometry_df)} rows") + spirometry_chart_b64 = self.graph_generator.generate_spirometry_chart( + spirometry_df, save_as_base64=True + ) + graphs_dict["spirometry_chart"] = spirometry_chart_b64 + except Exception as e: + print(f"Warning: Could not generate spirometry chart: {e}") + graphs_dict["spirometry_chart"] = "" - # Step 4: Generate context for all pages + # Step 5: Generate context for all pages + print("Step 5: Generating page contexts...") self.context_generator.load_data( pnoe_csv_path, str(spirometry_csv_path), seca_excel_path ) @@ -401,7 +407,7 @@ class ReportGeneratorService: report_path = self.reports_dir / output_filename print(f"Generating PDF report at {report_path}") - self.html_to_pdf(html_content, str(report_path)) + await self.html_to_pdf(html_content, str(report_path)) return { "report_path": str(report_path), diff --git a/app/services/spirometry_table_extractor.py b/app/services/spirometry_table_extractor.py index 79f3901..f4d416c 100644 --- a/app/services/spirometry_table_extractor.py +++ b/app/services/spirometry_table_extractor.py @@ -13,7 +13,21 @@ def encode_pdf_to_base64(pdf_path): return base64.b64encode(pdf_file.read()).decode("utf-8") -def extract_spirometry_table_from_pdf(pdf_path): +def extract_spirometry_table_from_pdf(pdf_path, output_dir="data"): + """ + Extract spirometry table from PDF using AI and save as clean CSV. + + Args: + pdf_path: Path to the spirometry PDF file + output_dir: Directory to save the extracted CSV + + Returns: + Path to the saved CSV file + """ + import csv + import re + from pathlib import Path + url = "https://openrouter.ai/api/v1/chat/completions" headers = { "Authorization": f"Bearer {API_KEY_REF}", @@ -30,10 +44,17 @@ def extract_spirometry_table_from_pdf(pdf_path): "content": [ { "type": "text", - "text": "Please extract the Spirometry table from the pdf and return the values in csv format, " - "note that it is the unit of parameter that is beside it and it should not be a column. " - "The '-' Should be treated as empty values." - "do not add 'csv' at the start or end of the response", + "text": "Please extract the Spirometry table from the pdf and return ONLY the values in CSV format. " + "The CSV should have these columns: Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n" + "Rules:\n" + "1. Include ONLY the data rows (FVC, FEV1, FEV1/FVC%, etc.)\n" + "2. Do NOT include units in the data (units are part of parameter name)\n" + "3. Use empty string for missing values (not '-' or 'N/A')\n" + "4. Do NOT add 'csv' markers or code blocks\n" + "5. First line should be the header\n" + "Example format:\n" + "Parameters,Pre,Best,LLN,Pred.,%Pred.,ZScore\n" + "FVC,4.50,4.75,3.20,4.80,99,-0.10", }, { "type": "file", @@ -54,11 +75,65 @@ def extract_spirometry_table_from_pdf(pdf_path): if "choices" in response_data and len(response_data["choices"]) > 0: content = response_data["choices"][0]["message"]["content"] - # Save to a CSV file - output_file = "extracted_spirometry_table.csv" - with open(output_file, "w", encoding="utf-8") as f: - f.write(content) + # Clean the content - remove markdown code blocks if present + content = re.sub(r"```csv\n?", "", content) + content = re.sub(r"```\n?", "", content) + content = content.strip() - return f"Extracted table saved to {output_file}" + # Parse and validate CSV + lines = content.split("\n") + if not lines: + raise ValueError("No data extracted from PDF") + + # Ensure output directory exists + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + output_file = output_path / "extracted_spirometry_table.csv" + + # Write cleaned CSV with proper formatting + with open(output_file, "w", encoding="utf-8", newline="") as f: + # Parse the first line as header + header_line = lines[0].strip() + if "," in header_line: + header = [col.strip() for col in header_line.split(",")] + else: + # Default header if not provided + header = [ + "Parameters", + "Pre", + "Best", + "LLN", + "Pred.", + "%Pred.", + "ZScore", + ] + + writer = csv.writer(f) + writer.writerow(header) + + # Process data rows + for line in lines[1:]: + line = line.strip() + if not line: + continue + + # Split by comma and clean each field + fields = [field.strip() for field in line.split(",")] + + # Ensure we have the right number of fields + if len(fields) < len(header): + # Pad with empty strings + fields.extend([""] * (len(header) - len(fields))) + elif len(fields) > len(header): + # Take only the first N fields + fields = fields[: len(header)] + + # Replace '-' or 'N/A' with empty string + fields = ["" if f in ["-", "N/A", "n/a", "NA"] else f for f in fields] + + writer.writerow(fields) + + return str(output_file) else: - return "No content found in response" + error_msg = response_data.get("error", {}).get("message", "Unknown error") + raise Exception(f"No content found in response: {error_msg}")