feat: Enhance compatibility scoring and report generation with new methods and models

This commit is contained in:
2025-10-27 19:15:47 +00:00
parent 02c8bb816f
commit c53455cc06
5 changed files with 194 additions and 50 deletions
+22 -11
View File
@@ -12,7 +12,11 @@ from schemas.router_schemas import (
PaginatedResponse, PaginatedResponse,
SectorMinimal, SectorMinimal,
) )
from services.compatibility_score import calculate_project_investor_compatibility from services.compatibility_score import (
calculate_project_investor_compatibility,
_calculate_project_fund_compatibility,
_calculate_project_investor_direct_compatibility,
)
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
router = APIRouter(tags=["Investor Routes"]) router = APIRouter(tags=["Investor Routes"])
@@ -95,13 +99,6 @@ def read_investors(
# Transform to InvestmentResponse format (one row per investor-fund combination) # Transform to InvestmentResponse format (one row per investor-fund combination)
investment_responses = [] investment_responses = []
for investor in investors: for investor in investors:
# Calculate compatibility score if project provided
compatibility_score = 1.0
if project is not None:
compatibility_score = calculate_project_investor_compatibility(
project=project, investor=investor, use_funds=True
)
# Get top 3 portfolio companies (id and name only) # Get top 3 portfolio companies (id and name only)
portfolio_companies = [ portfolio_companies = [
CompanyMinimal(id=company.id, name=company.name) CompanyMinimal(id=company.id, name=company.name)
@@ -111,6 +108,13 @@ def read_investors(
# If investor has funds, create one entry per fund # If investor has funds, create one entry per fund
if investor.funds: if investor.funds:
for fund in investor.funds: for fund in investor.funds:
# Calculate compatibility score for this specific fund
compatibility_score = 1.0
if project is not None:
compatibility_score = _calculate_project_fund_compatibility(
project=project, fund=fund
)
# Get stage focus as comma-separated string # Get stage focus as comma-separated string
stage_focus = ( stage_focus = (
", ".join([stage.name for stage in fund.investment_stages]) ", ".join([stage.name for stage in fund.investment_stages])
@@ -141,6 +145,13 @@ def read_investors(
investment_responses.append(investment_response) investment_responses.append(investment_response)
else: else:
# If no funds, create one entry with null fund fields # If no funds, create one entry with null fund fields
# Calculate compatibility using investor-level data
compatibility_score = 1.0
if project is not None:
compatibility_score = _calculate_project_investor_direct_compatibility(
project=project, investor=investor
)
investment_response = InvestmentResponse( investment_response = InvestmentResponse(
id=investor.id, id=investor.id,
name=investor.name, name=investor.name,
@@ -255,11 +266,11 @@ def filter_investors(
for fund in funds: for fund in funds:
investor = fund.investor investor = fund.investor
# Calculate compatibility score if project provided # Calculate compatibility score for this specific fund
compatibility_score = 1.0 compatibility_score = 1.0
if project is not None: if project is not None:
compatibility_score = calculate_project_investor_compatibility( compatibility_score = _calculate_project_fund_compatibility(
project=project, investor=investor, use_funds=True project=project, fund=fund
) )
# Get top 3 portfolio companies (id and name only) # Get top 3 portfolio companies (id and name only)
+1 -1
View File
@@ -106,7 +106,7 @@ async def generate_investor_report(
# Generate PDF report # Generate PDF report
report_generator = ReportGenerator() report_generator = ReportGenerator()
pdf_bytes = await report_generator.generate_investor_report( pdf_bytes = await report_generator.generate_investor_report(
investor_data, project_data investor_data, project_data, investor_model=investor, project_model=project
) )
# Return PDF as downloadable file # Return PDF as downloadable file
+144 -34
View File
@@ -6,6 +6,7 @@ The scoring system evaluates multiple dimensions to determine how well a project
matches with an investor's investment criteria. matches with an investor's investment criteria.
""" """
from difflib import SequenceMatcher
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
from db.models import FundTable, InvestorTable, ProjectTable from db.models import FundTable, InvestorTable, ProjectTable
@@ -99,12 +100,16 @@ def _calculate_project_fund_compatibility(
else str(project.stage) else str(project.stage)
) )
if project_stage_name in fund_stage_names: # Normalize both for case-insensitive comparison
project_stage_normalized = project_stage_name.upper().strip()
fund_stages_normalized = {name.upper().strip() for name in fund_stage_names}
if project_stage_normalized in fund_stages_normalized:
stage_score = 30 stage_score = 30
else: else:
# Partial credit for adjacent stages # Partial credit for adjacent stages
stage_score = _calculate_stage_proximity( stage_score = _calculate_stage_proximity(
project_stage_name, fund_stage_names project_stage_normalized, fund_stages_normalized
) )
total_score += stage_score total_score += stage_score
@@ -112,22 +117,53 @@ def _calculate_project_fund_compatibility(
# 2. Sector Overlap (30 points) # 2. Sector Overlap (30 points)
sector_score = 0 sector_score = 0
if project.sector and fund.sectors: if project.sector and fund.sectors:
project_sector_ids = {sector.id for sector in project.sector} project_sectors = [s for s in project.sector if hasattr(s, 'name')]
fund_sector_ids = {sector.id for sector in fund.sectors} fund_sectors = [s for s in fund.sectors if hasattr(s, 'name')]
if project_sector_ids and fund_sector_ids: if project_sectors and fund_sectors:
common_sectors = project_sector_ids.intersection(fund_sector_ids) # Use fuzzy matching to account for similar but not identical sector names
# Score based on what percentage of project sectors are covered by fund match_count = 0
overlap_ratio = len(common_sectors) / len(project_sector_ids) total_matches = 0
sector_score = int(30 * overlap_ratio)
for proj_sector in project_sectors:
best_match_score = 0
proj_name = proj_sector.name.lower().strip()
for fund_sector in fund_sectors:
fund_name = fund_sector.name.lower().strip()
# Exact match
if proj_name == fund_name:
best_match_score = 1.0
break
# Fuzzy match using sequence matcher
similarity = SequenceMatcher(None, proj_name, fund_name).ratio()
# Also check if one contains the other (substring match)
if proj_name in fund_name or fund_name in proj_name:
similarity = max(similarity, 0.8)
best_match_score = max(best_match_score, similarity)
# Count matches with threshold
# A best match of at least 0.6 counts and is weighted by its similarity (1.0 = exact name match)
if best_match_score >= 0.6:
total_matches += best_match_score
match_count += 1
if match_count > 0:
# Calculate overlap ratio based on fuzzy matches
overlap_ratio = total_matches / len(project_sectors)
sector_score = int(30 * overlap_ratio)
total_score += sector_score total_score += sector_score
# 3. Geographic Match (20 points) # 3. Geographic Match (20 points)
geo_score = 0 geo_score = 0
if project.location and fund.geographic_focus: if project.location and fund.geographic_focus:
project_location_lower = project.location.lower() project_location_lower = project.location.lower().strip()
fund_geo_lower = (fund.geographic_focus or "").lower() fund_geo_lower = (fund.geographic_focus or "").lower().strip()
# Exact match # Exact match
if project_location_lower == fund_geo_lower: if project_location_lower == fund_geo_lower:
@@ -137,10 +173,10 @@ def _calculate_project_fund_compatibility(
project_location_lower in fund_geo_lower project_location_lower in fund_geo_lower
or fund_geo_lower in project_location_lower or fund_geo_lower in project_location_lower
): ):
geo_score = 10 geo_score = 15
# Check for common geographic terms # Check for common geographic terms or regional overlap
elif _check_geographic_overlap(project_location_lower, fund_geo_lower): elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
geo_score = 5 geo_score = 12
total_score += geo_score total_score += geo_score
@@ -209,13 +245,44 @@ def _calculate_project_investor_direct_compatibility(
# 2. Sector Overlap (30 points) # 2. Sector Overlap (30 points)
sector_score = 0 sector_score = 0
if project.sector and investor.sectors: if project.sector and investor.sectors:
project_sector_ids = {sector.id for sector in project.sector} project_sectors = [s for s in project.sector if hasattr(s, 'name')]
investor_sector_ids = {sector.id for sector in investor.sectors} investor_sectors = [s for s in investor.sectors if hasattr(s, 'name')]
if project_sector_ids and investor_sector_ids: if project_sectors and investor_sectors:
common_sectors = project_sector_ids.intersection(investor_sector_ids) # Use fuzzy matching to account for similar but not identical sector names
overlap_ratio = len(common_sectors) / len(project_sector_ids) match_count = 0
sector_score = int(30 * overlap_ratio) total_matches = 0
for proj_sector in project_sectors:
best_match_score = 0
proj_name = proj_sector.name.lower().strip()
for inv_sector in investor_sectors:
inv_name = inv_sector.name.lower().strip()
# Exact match
if proj_name == inv_name:
best_match_score = 1.0
break
# Fuzzy match using sequence matcher
similarity = SequenceMatcher(None, proj_name, inv_name).ratio()
# Also check if one contains the other (substring match)
if proj_name in inv_name or inv_name in proj_name:
similarity = max(similarity, 0.8)
best_match_score = max(best_match_score, similarity)
# Count matches with threshold
if best_match_score >= 0.6:
total_matches += best_match_score
match_count += 1
if match_count > 0:
# Calculate overlap ratio based on fuzzy matches
overlap_ratio = total_matches / len(project_sectors)
sector_score = int(30 * overlap_ratio)
total_score += sector_score total_score += sector_score
@@ -278,8 +345,11 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
""" """
stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"] stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]
# Normalize project stage for comparison
project_stage_normalized = project_stage.upper().strip()
try: try:
project_idx = stage_order.index(project_stage) project_idx = stage_order.index(project_stage_normalized)
except ValueError: except ValueError:
return 0 return 0
@@ -290,8 +360,10 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
if project_idx < len(stage_order) - 1: if project_idx < len(stage_order) - 1:
adjacent_stages.append(stage_order[project_idx + 1]) adjacent_stages.append(stage_order[project_idx + 1])
# Normalize fund stages and check for matches
for stage in fund_stages: for stage in fund_stages:
if stage in adjacent_stages: stage_normalized = stage.upper().strip()
if stage_normalized in adjacent_stages:
return 15 # Half credit for adjacent stage return 15 # Half credit for adjacent stage
return 0 return 0
@@ -305,24 +377,62 @@ def _check_geographic_overlap(location1: str, location2: str) -> bool:
- "San Francisco, CA" and "California" -> True - "San Francisco, CA" and "California" -> True
- "New York, USA" and "USA" -> True (both contain USA/US) - "New York, USA" and "USA" -> True (both contain USA/US)
- "London, UK" and "United Kingdom" -> True - "London, UK" and "United Kingdom" -> True
- "Germany" and "Europe" -> True
""" """
# Common geographic groupings # Normalize inputs
loc1 = location1.lower().strip()
loc2 = location2.lower().strip()
# Common geographic groupings with broader regional mappings
geo_groups = [ geo_groups = [
["usa", "us", "united states", "america"], # North America
["uk", "united kingdom", "britain"], ["usa", "us", "united states", "america", "u.s.", "u.s.a"],
["california", "ca"], ["canada", "canadian"],
["new york", "ny"], ["mexico", "mexican"],
# Europe and countries
["europe", "european", "eu", "germany", "france", "uk", "united kingdom",
"britain", "spain", "italy", "netherlands", "belgium", "sweden", "denmark",
"norway", "finland", "poland", "portugal", "austria", "switzerland",
"ireland", "greece", "czech", "romania"],
# UK specific
["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"],
# US states
["california", "ca", "san francisco", "los angeles", "silicon valley"],
["new york", "ny", "nyc"],
["texas", "tx"], ["texas", "tx"],
["europe", "eu"], ["massachusetts", "ma", "boston"],
["asia", "asian"], ["washington", "seattle"],
["africa", "african"],
# Asia
["asia", "asian", "china", "japan", "korea", "singapore", "hong kong",
"india", "indonesia", "thailand", "vietnam", "malaysia", "philippines"],
# Middle East
["middle east", "israel", "uae", "dubai", "saudi arabia"],
# Latin America
["latin america", "brazil", "argentina", "chile", "colombia", "mexico"],
# Africa
["africa", "african", "south africa", "nigeria", "kenya", "egypt"],
# Oceania
["australia", "australian", "new zealand"],
] ]
# Check whether both locations contain a term from the same group
for group in geo_groups: for group in geo_groups:
found_in_1 = any(term in location1 for term in group) found_in_1 = any(term in loc1 for term in group)
found_in_2 = any(term in location2 for term in group) found_in_2 = any(term in loc2 for term in group)
if found_in_1 and found_in_2: if found_in_1 and found_in_2:
return True return True
# Check for direct substring match (one contains the other)
if loc1 in loc2 or loc2 in loc1:
return True
return False return False
+27 -4
View File
@@ -4,6 +4,10 @@ from typing import Any, Dict, List, Optional
from jinja2 import Environment, FileSystemLoader from jinja2 import Environment, FileSystemLoader
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
# Import database models and compatibility score service
from db.models import InvestorTable, ProjectTable
from services.compatibility_score import calculate_project_investor_compatibility
class ReportGenerator: class ReportGenerator:
"""Service for generating PDF reports from HTML templates""" """Service for generating PDF reports from HTML templates"""
@@ -17,6 +21,8 @@ class ReportGenerator:
self, self,
investor_data: Dict[str, Any], investor_data: Dict[str, Any],
project_data: Optional[Dict[str, Any]] = None, project_data: Optional[Dict[str, Any]] = None,
investor_model: Optional[InvestorTable] = None,
project_model: Optional[ProjectTable] = None,
) -> bytes: ) -> bytes:
""" """
Generate a PDF report for an investor profile. Generate a PDF report for an investor profile.
@@ -24,12 +30,16 @@ class ReportGenerator:
Args: Args:
investor_data: Dictionary containing investor information investor_data: Dictionary containing investor information
project_data: Optional dictionary containing project information for compatibility analysis project_data: Optional dictionary containing project information for compatibility analysis
investor_model: Optional database model for investor (used for compatibility scoring)
project_model: Optional database model for project (used for compatibility scoring)
Returns: Returns:
bytes: PDF file content bytes: PDF file content
""" """
# Prepare template context # Prepare template context
context = self._prepare_context(investor_data, project_data) context = self._prepare_context(
investor_data, project_data, investor_model, project_model
)
# Render HTML from template # Render HTML from template
template = self.env.get_template("report.html") template = self.env.get_template("report.html")
@@ -43,6 +53,8 @@ class ReportGenerator:
self, self,
investor_data: Dict[str, Any], investor_data: Dict[str, Any],
project_data: Optional[Dict[str, Any]] = None, project_data: Optional[Dict[str, Any]] = None,
investor_model: Optional[InvestorTable] = None,
project_model: Optional[ProjectTable] = None,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Prepare the context dictionary for template rendering""" """Prepare the context dictionary for template rendering"""
context = { context = {
@@ -55,9 +67,20 @@ class ReportGenerator:
# If project data is provided, calculate compatibility # If project data is provided, calculate compatibility
if project_data: if project_data:
context["compatibility_score"] = self._calculate_compatibility_score( # Use the compatibility_score service if models are provided
investor_data, project_data if investor_model and project_model:
) # Calculate using the standardized compatibility score service
# Returns score between 0 and 1, convert to percentage (0-100)
score_decimal = calculate_project_investor_compatibility(
project=project_model, investor=investor_model, use_funds=True
)
context["compatibility_score"] = int(score_decimal * 100)
else:
# Fallback to old calculation method if models not provided
context["compatibility_score"] = self._calculate_compatibility_score(
investor_data, project_data
)
context["match_criteria"] = self._generate_match_criteria( context["match_criteria"] = self._generate_match_criteria(
investor_data, project_data investor_data, project_data
) )
BIN
View File
Binary file not shown.