From c53455cc06e54daba58b956bdfa555b560030fd1 Mon Sep 17 00:00:00 2001 From: michael Date: Mon, 27 Oct 2025 19:15:47 +0000 Subject: [PATCH] feat: Enhance compatibility scoring and report generation with new methods and models --- app/routers/investors.py | 33 ++++-- app/routers/report_route.py | 2 +- app/services/compatibility_score.py | 178 ++++++++++++++++++++++------ app/services/report_gen.py | 31 ++++- investors.db | Bin 29949952 -> 29958144 bytes 5 files changed, 194 insertions(+), 50 deletions(-) diff --git a/app/routers/investors.py b/app/routers/investors.py index 207f5a7..4448c60 100644 --- a/app/routers/investors.py +++ b/app/routers/investors.py @@ -12,7 +12,11 @@ from schemas.router_schemas import ( PaginatedResponse, SectorMinimal, ) -from services.compatibility_score import calculate_project_investor_compatibility +from services.compatibility_score import ( + calculate_project_investor_compatibility, + _calculate_project_fund_compatibility, + _calculate_project_investor_direct_compatibility, +) from sqlalchemy.orm import Session, selectinload router = APIRouter(tags=["Investor Routes"]) @@ -95,13 +99,6 @@ def read_investors( # Transform to InvestmentResponse format (one row per investor-fund combination) investment_responses = [] for investor in investors: - # Calculate compatibility score if project provided - compatibility_score = 1.0 - if project is not None: - compatibility_score = calculate_project_investor_compatibility( - project=project, investor=investor, use_funds=True - ) - # Get top 3 portfolio companies (id and name only) portfolio_companies = [ CompanyMinimal(id=company.id, name=company.name) @@ -111,6 +108,13 @@ def read_investors( # If investor has funds, create one entry per fund if investor.funds: for fund in investor.funds: + # Calculate compatibility score for this specific fund + compatibility_score = 1.0 + if project is not None: + compatibility_score = _calculate_project_fund_compatibility( + project=project, fund=fund + ) + # Get stage focus as comma-separated string stage_focus = ( ", ".join([stage.name for stage in fund.investment_stages]) @@ -141,6 +145,13 @@ def read_investors( investment_responses.append(investment_response) else: # If no funds, create one entry with null fund fields + # Calculate compatibility using investor-level data + compatibility_score = 1.0 + if project is not None: + compatibility_score = _calculate_project_investor_direct_compatibility( + project=project, investor=investor + ) + investment_response = InvestmentResponse( id=investor.id, name=investor.name, @@ -255,11 +266,11 @@ def filter_investors( for fund in funds: investor = fund.investor - # Calculate compatibility score if project provided + # Calculate compatibility score for this specific fund compatibility_score = 1.0 if project is not None: - compatibility_score = calculate_project_investor_compatibility( - project=project, investor=investor, use_funds=True + compatibility_score = _calculate_project_fund_compatibility( + project=project, fund=fund ) # Get top 3 portfolio companies (id and name only) diff --git a/app/routers/report_route.py b/app/routers/report_route.py index 6f422ac..9e00ea3 100644 --- a/app/routers/report_route.py +++ b/app/routers/report_route.py @@ -106,7 +106,7 @@ async def generate_investor_report( # Generate PDF report report_generator = ReportGenerator() pdf_bytes = await report_generator.generate_investor_report( - investor_data, project_data + investor_data, project_data, investor_model=investor, project_model=project ) # Return PDF as downloadable file diff --git a/app/services/compatibility_score.py b/app/services/compatibility_score.py index 6821a1a..1576fdb 100644 --- a/app/services/compatibility_score.py +++ b/app/services/compatibility_score.py @@ -6,6 +6,7 @@ The scoring system evaluates multiple dimensions to determine how well a project matches with an investor's investment criteria. """ +from difflib import SequenceMatcher from typing import List, Optional, Tuple from db.models import FundTable, InvestorTable, ProjectTable @@ -99,12 +100,16 @@ def _calculate_project_fund_compatibility( else str(project.stage) ) - if project_stage_name in fund_stage_names: + # Normalize both for case-insensitive comparison + project_stage_normalized = project_stage_name.upper().strip() + fund_stages_normalized = {name.upper().strip() for name in fund_stage_names} + + if project_stage_normalized in fund_stages_normalized: stage_score = 30 else: # Partial credit for adjacent stages stage_score = _calculate_stage_proximity( - project_stage_name, fund_stage_names + project_stage_normalized, fund_stages_normalized ) total_score += stage_score @@ -112,22 +117,53 @@ def _calculate_project_fund_compatibility( # 2. Sector Overlap (30 points) sector_score = 0 if project.sector and fund.sectors: - project_sector_ids = {sector.id for sector in project.sector} - fund_sector_ids = {sector.id for sector in fund.sectors} - - if project_sector_ids and fund_sector_ids: - common_sectors = project_sector_ids.intersection(fund_sector_ids) - # Score based on what percentage of project sectors are covered by fund - overlap_ratio = len(common_sectors) / len(project_sector_ids) - sector_score = int(30 * overlap_ratio) + project_sectors = [s for s in project.sector if hasattr(s, 'name')] + fund_sectors = [s for s in fund.sectors if hasattr(s, 'name')] + + if project_sectors and fund_sectors: + # Use fuzzy matching to account for similar but not identical sector names + match_count = 0 + total_matches = 0 + + for proj_sector in project_sectors: + best_match_score = 0 + proj_name = proj_sector.name.lower().strip() + + for fund_sector in fund_sectors: + fund_name = fund_sector.name.lower().strip() + + # Exact match + if proj_name == fund_name: + best_match_score = 1.0 + break + + # Fuzzy match using sequence matcher + similarity = SequenceMatcher(None, proj_name, fund_name).ratio() + + # Also check if one contains the other (substring match) + if proj_name in fund_name or fund_name in proj_name: + similarity = max(similarity, 0.8) + + best_match_score = max(best_match_score, similarity) + + # Count matches with threshold + # Perfect match (1.0), strong match (>0.75), partial match (>0.6) + if best_match_score >= 0.6: + total_matches += best_match_score + match_count += 1 + + if match_count > 0: + # Calculate overlap ratio based on fuzzy matches + overlap_ratio = total_matches / len(project_sectors) + sector_score = int(30 * overlap_ratio) total_score += sector_score # 3. Geographic Match (20 points) geo_score = 0 if project.location and fund.geographic_focus: - project_location_lower = project.location.lower() - fund_geo_lower = (fund.geographic_focus or "").lower() + project_location_lower = project.location.lower().strip() + fund_geo_lower = (fund.geographic_focus or "").lower().strip() # Exact match if project_location_lower == fund_geo_lower: @@ -137,10 +173,10 @@ def _calculate_project_fund_compatibility( project_location_lower in fund_geo_lower or fund_geo_lower in project_location_lower ): - geo_score = 10 - # Check for common geographic terms + geo_score = 15 + # Check for common geographic terms or regional overlap elif _check_geographic_overlap(project_location_lower, fund_geo_lower): - geo_score = 5 + geo_score = 12 total_score += geo_score @@ -209,13 +245,44 @@ def _calculate_project_investor_direct_compatibility( # 2. Sector Overlap (30 points) sector_score = 0 if project.sector and investor.sectors: - project_sector_ids = {sector.id for sector in project.sector} - investor_sector_ids = {sector.id for sector in investor.sectors} - - if project_sector_ids and investor_sector_ids: - common_sectors = project_sector_ids.intersection(investor_sector_ids) - overlap_ratio = len(common_sectors) / len(project_sector_ids) - sector_score = int(30 * overlap_ratio) + project_sectors = [s for s in project.sector if hasattr(s, 'name')] + investor_sectors = [s for s in investor.sectors if hasattr(s, 'name')] + + if project_sectors and investor_sectors: + # Use fuzzy matching to account for similar but not identical sector names + match_count = 0 + total_matches = 0 + + for proj_sector in project_sectors: + best_match_score = 0 + proj_name = proj_sector.name.lower().strip() + + for inv_sector in investor_sectors: + inv_name = inv_sector.name.lower().strip() + + # Exact match + if proj_name == inv_name: + best_match_score = 1.0 + break + + # Fuzzy match using sequence matcher + similarity = SequenceMatcher(None, proj_name, inv_name).ratio() + + # Also check if one contains the other (substring match) + if proj_name in inv_name or inv_name in proj_name: + similarity = max(similarity, 0.8) + + best_match_score = max(best_match_score, similarity) + + # Count matches with threshold + if best_match_score >= 0.6: + total_matches += best_match_score + match_count += 1 + + if match_count > 0: + # Calculate overlap ratio based on fuzzy matches + overlap_ratio = total_matches / len(project_sectors) + sector_score = int(30 * overlap_ratio) total_score += sector_score @@ -278,8 +345,11 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int: """ stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"] + # Normalize project stage for comparison + project_stage_normalized = project_stage.upper().strip() + try: - project_idx = stage_order.index(project_stage) + project_idx = stage_order.index(project_stage_normalized) except ValueError: return 0 @@ -290,8 +360,10 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int: if project_idx < len(stage_order) - 1: adjacent_stages.append(stage_order[project_idx + 1]) + # Normalize fund stages and check for matches for stage in fund_stages: - if stage in adjacent_stages: + stage_normalized = stage.upper().strip() + if stage_normalized in adjacent_stages: return 15 # Half credit for adjacent stage return 0 @@ -305,24 +377,62 @@ def _check_geographic_overlap(location1: str, location2: str) -> bool: - "San Francisco, CA" and "California" -> True - "New York" and "USA" -> True (if both contain USA/US) - "London, UK" and "United Kingdom" -> True + - "Germany" and "Europe" -> True """ - # Common geographic groupings + # Normalize inputs + loc1 = location1.lower().strip() + loc2 = location2.lower().strip() + + # Common geographic groupings with broader regional mappings geo_groups = [ - ["usa", "us", "united states", "america"], - ["uk", "united kingdom", "britain"], - ["california", "ca"], - ["new york", "ny"], + # North America + ["usa", "us", "united states", "america", "u.s.", "u.s.a"], + ["canada", "canadian"], + ["mexico", "mexican"], + + # Europe and countries + ["europe", "european", "eu", "germany", "france", "uk", "united kingdom", + "britain", "spain", "italy", "netherlands", "belgium", "sweden", "denmark", + "norway", "finland", "poland", "portugal", "austria", "switzerland", + "ireland", "greece", "czech", "romania"], + + # UK specific + ["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"], + + # US states + ["california", "ca", "san francisco", "los angeles", "silicon valley"], + ["new york", "ny", "nyc"], ["texas", "tx"], - ["europe", "eu"], - ["asia", "asian"], - ["africa", "african"], + ["massachusetts", "ma", "boston"], + ["washington", "seattle"], + + # Asia + ["asia", "asian", "china", "japan", "korea", "singapore", "hong kong", + "india", "indonesia", "thailand", "vietnam", "malaysia", "philippines"], + + # Middle East + ["middle east", "israel", "uae", "dubai", "saudi arabia"], + + # Latin America + ["latin america", "brazil", "argentina", "chile", "colombia", "mexico"], + + # Africa + ["africa", "african", "south africa", "nigeria", "kenya", "egypt"], + + # Oceania + ["australia", "australian", "new zealand"], ] + # Check if both locations match any group for group in geo_groups: - found_in_1 = any(term in location1 for term in group) - found_in_2 = any(term in location2 for term in group) + found_in_1 = any(term in loc1 for term in group) + found_in_2 = any(term in loc2 for term in group) if found_in_1 and found_in_2: return True + + # Check for direct substring match (one contains the other) + if loc1 in loc2 or loc2 in loc1: + return True return False diff --git a/app/services/report_gen.py b/app/services/report_gen.py index fcfe220..b529168 100644 --- a/app/services/report_gen.py +++ b/app/services/report_gen.py @@ -4,6 +4,10 @@ from typing import Any, Dict, List, Optional from jinja2 import Environment, FileSystemLoader from playwright.async_api import async_playwright +# Import database models and compatibility score service +from db.models import InvestorTable, ProjectTable +from services.compatibility_score import calculate_project_investor_compatibility + class ReportGenerator: """Service for generating PDF reports from HTML templates""" @@ -17,6 +21,8 @@ class ReportGenerator: self, investor_data: Dict[str, Any], project_data: Optional[Dict[str, Any]] = None, + investor_model: Optional[InvestorTable] = None, + project_model: Optional[ProjectTable] = None, ) -> bytes: """ Generate a PDF report for an investor profile. @@ -24,12 +30,16 @@ class ReportGenerator: Args: investor_data: Dictionary containing investor information project_data: Optional dictionary containing project information for compatibility analysis + investor_model: Optional database model for investor (used for compatibility scoring) + project_model: Optional database model for project (used for compatibility scoring) Returns: bytes: PDF file content """ # Prepare template context - context = self._prepare_context(investor_data, project_data) + context = self._prepare_context( + investor_data, project_data, investor_model, project_model + ) # Render HTML from template template = self.env.get_template("report.html") @@ -43,6 +53,8 @@ class ReportGenerator: self, investor_data: Dict[str, Any], project_data: Optional[Dict[str, Any]] = None, + investor_model: Optional[InvestorTable] = None, + project_model: Optional[ProjectTable] = None, ) -> Dict[str, Any]: """Prepare the context dictionary for template rendering""" context = { @@ -55,9 +67,20 @@ class ReportGenerator: # If project data is provided, calculate compatibility if project_data: - context["compatibility_score"] = self._calculate_compatibility_score( - investor_data, project_data - ) + # Use the compatibility_score service if models are provided + if investor_model and project_model: + # Calculate using the standardized compatibility score service + # Returns score between 0 and 1, convert to percentage (0-100) + score_decimal = calculate_project_investor_compatibility( + project=project_model, investor=investor_model, use_funds=True + ) + context["compatibility_score"] = int(score_decimal * 100) + else: + # Fallback to old calculation method if models not provided + context["compatibility_score"] = self._calculate_compatibility_score( + investor_data, project_data + ) + context["match_criteria"] = self._generate_match_criteria( investor_data, project_data ) diff --git a/investors.db b/investors.db index c29bad880c437a03d34d3f27f7e46516d4448280..0f3e8ea86b19898d190c31a14a1cf45b60ae927a 100644 GIT binary patch delta 6611 zcmaKx4SW;j9mn%DY11@qk3!$l7C8_ogSIvep-_iV1QGEC2FQyJxF*-+AiWEBNlOte zD3cjqUwF}NMTE8BTUFe^m#N!ypyEKC1E);qaQpo)sV&IHhwqnsF3IKY zdH!z+t5$8zUe&cVyR9hAnQgO`Y`l}dgdSDgG=Eocvzd!y&dfD7?w;(-?ANnbXRgWG zkkv7mfAg&*Vxwo>s%GbVQdtpnJkN1wQ@mAMV`g)(CL!*5) zesArx)edK-ZLmGhGpe@MXS;BQsw@!$G2UCrO`1p_H6^YpQE^sGP^}9Q_Nz1f(((^nXx(pB9s~n=wPdrNa+>`&$ zbN61*=pS5Bjem6FY}^oACM`Lww3d=-8-zs|Osihpe$kFm?K`&@w{O_tEb9BA{+dcJ z)^d^{FU94xuin1JVYAs{?ZbDxpX21-v2p+APID)@Pq>e`_qmRouJ(r_{2$RF`97K?c`py}=#iUf?=&Dkrk-?hgLQ%+vNPM~0obQd3=a z#!~IZL5_{Z`QJ>u?=$WgcZB;R-Vwc>opyHcGwy%cd4a=ecQLyo4NkGA+Fh+)nG~fj3zy&gZ8)SkkkPSG{59EOUU;xMkc_1GYfI{E_MW7gz zfPvrwFbMEqFc<1FFGTFb>p! zi@}eibxJ@_eD2o{0GKmc_h0D?dS^&kYoKmtp^QV;8r%$S0c*g|z^z~{Xa(!QZD2jn!R_D_y`;WAA`Sw7NO4jlsfo0l)J$4Ix`DKkbR+2|QVVGnX*KC)(k-Mlq@R&)C9Ng3lGc%K zBdsUtq}xe%kT#IoNOzJplG;gkk?tnlLo!JBlI|njPx?9QL}_CEcV^kSQ|Q|z8@X@M zug$|RZDyZS5nk| zQKtWr>5nn}ai%}P^e36Vo#{K6zLV*@nEn*gpJw{6nEq>~|Ay%uOyAA)XPCZ+>CZBK zFVpui{kKf-WO^6V_cQ$f)1PDd^Gtt%=`S+vn7y~zl-NyCGM_J}I-*mIQOM{{i;pMHX2+tK62lrx zawEPtG~jYIUR)j!#UL%k2RA=4)?djtNRf!7$Yp$=8^qjc4`d%})yuFDPIcwFJ z*dS@<6$qxHbSo$*BKCbe(=$>l<5jUff?%XO75F+;5rT-M20>jaVvnsDX$p&~XnH@M z$_mmclC`)RGNbQqVtMAcueup;ta@#kBbRmr-@jyp5w>1S#P{A$PZl!2vCy%zU`Saa z$^>4^QjdTWBId4@3t6tATz$Z0n(G&_3}aMb)Ky-~YY0*-SU`|t2=Ex< zK<1S?O;j7P*7TJR5s{2sW7C3v6q@}a9CJ9Ms-z%ljDEg9=U!liID)#_iX9F29aVb1 z!d3ar$IoDOjGIo|UB+jjuUswoW?7M3Ir@Y?dy5t56WDw-A~a*gtQ!~kIn(+2I1&dR zB!ol?rQ}+c@d2eFDq8DoT?)biZ$rFcEhH_B8%eAn(pzkZ8s?k3i;N@JrRPoVjdxr} zL|RHIkNBz+DPEO`wFELB5d|w+RV5-?+d!8=n~DUIgT|?i$BK>dyW7%@C+8f?FveED zn%9C22}^-6(x_3yhKVxni}sdI%*SO3)k5YQg-9HUugItylr(rvPYCg$h*F2NNdyZ* zsn2@-zs!hGnhGIf{gOld%)`5{M_u^f+io_(c;>5A_k;-QjusH2;#mnXwx+VW#y6&K zLL?8hPH<7sSFgPhL7%rL5%iu}Q6V7C76bgHLR6yMKC1+$Q^06=iFwJg3t7H9abH$M znE43lA^GQ2SpP7q?3n$KLjw)-foDHM*0s2_cTI4d5LBGQV)<%;G@ zQ#;t80zMG2ia(04ETNBB$twrV{hlFgfH|ysE#tC!QrXI3(z z#46IJ_SPmgGEt(92cNbVI(k=Vlocq5KXG{!MHqd^3N5T{&`P{mxVJ1Drt^1&#_9L6 zv(3MK*Ui$+qo201g6Ta0MQI?&NUt(pl<@}`PGZz5$0g%z(U5$p=31mdyh$0Z&=Df#i&+c!p&ht>^G&#Pw_MB{c7w&0i+TSPLX^JRWB;|{0Au1TGt+3h{ONRK=$3i? z=GIQL?yp{wslO$wtX1gxmN;K!xgT>v@Jgi~NF)9A^vZrH+J2(bAY+{;SDTp3_cNcSkM&NY6$e( z9?dk?G|*>F0!lm*L}?0ztl#yFnP0!1Z$!5&a2qM7GjgXj5V z)uYe3ZCa&rRgB<+5EZ>Gzhau{4Mk{M{xOAIFTWI!oTEP$foZPbuNYCl$5a`XZml2mGy@tC>lRtL*# z;l*GGsX48_Z_pyQGzkNefEiS==D4Vc83$Jq>*-ije+8T=guJ0~mS)rBK%hk>j8T+N z*2X8h3gUvI`K(z{k}tM~I6fS&!`Txn+tZp+N@Msjwoy?LOqh{Ff+S-WPPScCNwWgm z>+xGTd{%R*t{IJHGVbiewGPpg#oR?UcSla;nO9Of_(Qi$O?9sexRK85yz8#=8Q!DQ z2Nu%Jp- zhMdM_Cl2Q3_M~ADGe)e5=7OPqW5?EY9`o9Nu41`n)V+pf8Vi?n=b3{aAAx~wSKcGV z=9dWl0%Jw%o{S+BWs{n5=~U4u9Q%BEh+^KvkbbJ{O3%1Ns*20QQXRIgZ$|u}7u3>N zAoO%|t8W>1r!C^lMkH%#qG_2c#<*FLdF8$amT9h>HG!q4NHKF)?qc+cSH~^K!SCkz z8D|V!dfaKuciqbwM^0>UnJafI_-WZnX7lLqTQJprw^3$JXR@H0`>&eD@>`PaD=~rf zy}PrUn3XD4FejQ$Pcy?J&`u@O5q0mZ`$6hcZKdqS2b?droryV4GreefMILH;y@RI~ zMkh;Tt9i0Y#17Y_og&DmK2ogo6S!!Xz|8#u2=e3o@dai=Dp);@LwT5|M$7j Gd+5J+io?tR delta 2342 zcmYM$XH*nd9ER~-SeEKuv3D2j9lO|j@4Z*-9lN4}y@R4+LoC=E*w((-Nn)Cs7?Rjc zVlXNaH8Gl)=Z)vY55H&b%(>^xe3|)AWJ;nVGBVK-lh1B{Yq1oFS?R2+Rr$$_lEdN? z92Sek=7_SGwI*g%0%Nddu>gRIa>u}m5HtOJ3Teye6CARpXI{`M6ZZYr7 zYm;HpU4bEK&DphO(kicnl=qvgwjv zJ8~c=av?YJATRPEKMJ5A3ZXEZ2tqK5peTx=I7*--N})8$pe)LvJSrdr6;TP5Q3X{| z4b@Qtp{R*ksEsj0T_ru2*Y3u!B7mtaE!o6jKXM)!B~vLc!XmDCSnpMV+y8X8m40g zA}|xPFdLDG!W_)SJj}-eEW{!##u6;WGAzdmL?Z?(u?n$RjWt+{IIP2ZY`{irLOeDj z0b8&Y+pryp*nyqch27YLz1WBSIDmsVgd`;6Fpl6Tj^Q|5IDwNmh0{2Lvp5GgQg9v@ za1obq8CP%>srUq+;xk;s=lB9&;wyZO>$riN_y*tN7QRCozQ+&v5w~#%cX1E*@c<9; z2tVOx{DQ~$73p|_-|!UA@EjRL_)UdP;qzfznWEq%>BVC{2}SN^_-!(o$)qv{u?EZIyOPd!>WYQR$>~R=Ox%m2OIR zrH9f}>812m`Y3&seoB93fHF`Sq=YGhl_APdWtcKt8KI0+Mk%9}G0IqFoHAYsS0*SE zl}XBEWr{LYnWjuvW+)NLOl6ibTZvSnlsU>=Wu7u$S)eRb7AcFBCCXA|nX+72p+qY& z%1ULG6059M)+lS0IAxu(UfG~*w7U5;S+gbiSPes)shvE|%hMhBFx8qKkcabVPd_f( z&25W$XkMDz=9zhH+(9W}R$sfO>3QD|6FK4j^G((w#%}KXKeidl2@R6Mt??hH?kn4e z-O&%2;v`<#tRyH~l(WxJB7>`-iEpI#RbM> RRMYQGsLdVmGSxcI|6dqCMrQy3