feat: Enhance compatibility scoring and report generation with new methods and models

2025-10-27 19:15:47 +00:00
parent 02c8bb816f
commit c53455cc06
5 changed files with 194 additions and 50 deletions
@@ -12,7 +12,11 @@ from schemas.router_schemas import (
    PaginatedResponse,
    SectorMinimal,
 )
-from services.compatibility_score import calculate_project_investor_compatibility
+from services.compatibility_score import (
+    calculate_project_investor_compatibility,
+    _calculate_project_fund_compatibility,
+    _calculate_project_investor_direct_compatibility,
+)
 from sqlalchemy.orm import Session, selectinload

 router = APIRouter(tags=["Investor Routes"])
@@ -95,13 +99,6 @@ def read_investors(
    # Transform to InvestmentResponse format (one row per investor-fund combination)
    investment_responses = []
    for investor in investors:
-        # Calculate compatibility score if project provided
-        compatibility_score = 1.0
-        if project is not None:
-            compatibility_score = calculate_project_investor_compatibility(
-                project=project, investor=investor, use_funds=True
-            )
-
        # Get top 3 portfolio companies (id and name only)
        portfolio_companies = [
            CompanyMinimal(id=company.id, name=company.name)
@@ -111,6 +108,13 @@ def read_investors(
        # If investor has funds, create one entry per fund
        if investor.funds:
            for fund in investor.funds:
+                # Calculate compatibility score for this specific fund
+                compatibility_score = 1.0
+                if project is not None:
+                    compatibility_score = _calculate_project_fund_compatibility(
+                        project=project, fund=fund
+                    )
+
                # Get stage focus as comma-separated string
                stage_focus = (
                    ", ".join([stage.name for stage in fund.investment_stages])
@@ -141,6 +145,13 @@ def read_investors(
                investment_responses.append(investment_response)
        else:
            # If no funds, create one entry with null fund fields
+            # Calculate compatibility using investor-level data
+            compatibility_score = 1.0
+            if project is not None:
+                compatibility_score = _calculate_project_investor_direct_compatibility(
+                    project=project, investor=investor
+                )
+
            investment_response = InvestmentResponse(
                id=investor.id,
                name=investor.name,
@@ -255,11 +266,11 @@ def filter_investors(
    for fund in funds:
        investor = fund.investor

-        # Calculate compatibility score if project provided
+        # Calculate compatibility score for this specific fund
        compatibility_score = 1.0
        if project is not None:
-            compatibility_score = calculate_project_investor_compatibility(
-                project=project, investor=investor, use_funds=True
+            compatibility_score = _calculate_project_fund_compatibility(
+                project=project, fund=fund
            )

        # Get top 3 portfolio companies (id and name only)
@@ -106,7 +106,7 @@ async def generate_investor_report(
    # Generate PDF report
    report_generator = ReportGenerator()
    pdf_bytes = await report_generator.generate_investor_report(
-        investor_data, project_data
+        investor_data, project_data, investor_model=investor, project_model=project
    )

    # Return PDF as downloadable file
@@ -6,6 +6,7 @@ The scoring system evaluates multiple dimensions to determine how well a project
 matches with an investor's investment criteria.
 """

+from difflib import SequenceMatcher
 from typing import List, Optional, Tuple

 from db.models import FundTable, InvestorTable, ProjectTable
@@ -99,12 +100,16 @@ def _calculate_project_fund_compatibility(
            else str(project.stage)
        )

-        if project_stage_name in fund_stage_names:
+        # Normalize both for case-insensitive comparison
+        project_stage_normalized = project_stage_name.upper().strip()
+        fund_stages_normalized = {name.upper().strip() for name in fund_stage_names}
+
+        if project_stage_normalized in fund_stages_normalized:
            stage_score = 30
        else:
            # Partial credit for adjacent stages
            stage_score = _calculate_stage_proximity(
-                project_stage_name, fund_stage_names
+                project_stage_normalized, fund_stages_normalized
            )

    total_score += stage_score
@@ -112,22 +117,53 @@ def _calculate_project_fund_compatibility(
    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and fund.sectors:
-        project_sector_ids = {sector.id for sector in project.sector}
-        fund_sector_ids = {sector.id for sector in fund.sectors}
-
-        if project_sector_ids and fund_sector_ids:
-            common_sectors = project_sector_ids.intersection(fund_sector_ids)
-            # Score based on what percentage of project sectors are covered by fund
-            overlap_ratio = len(common_sectors) / len(project_sector_ids)
-            sector_score = int(30 * overlap_ratio)
+        project_sectors = [s for s in project.sector if hasattr(s, 'name')]
+        fund_sectors = [s for s in fund.sectors if hasattr(s, 'name')]
+        
+        if project_sectors and fund_sectors:
+            # Use fuzzy matching to account for similar but not identical sector names
+            match_count = 0
+            total_matches = 0
+            
+            for proj_sector in project_sectors:
+                best_match_score = 0
+                proj_name = proj_sector.name.lower().strip()
+                
+                for fund_sector in fund_sectors:
+                    fund_name = fund_sector.name.lower().strip()
+                    
+                    # Exact match
+                    if proj_name == fund_name:
+                        best_match_score = 1.0
+                        break
+                    
+                    # Fuzzy match using sequence matcher
+                    similarity = SequenceMatcher(None, proj_name, fund_name).ratio()
+                    
+                    # Also check if one contains the other (substring match)
+                    if proj_name in fund_name or fund_name in proj_name:
+                        similarity = max(similarity, 0.8)
+                    
+                    best_match_score = max(best_match_score, similarity)
+                
+                # Count matches with threshold
+                # A best match >= 0.6 counts, weighted by its similarity (no separate tiers)
+                if best_match_score >= 0.6:
+                    total_matches += best_match_score
+                    match_count += 1
+            
+            if match_count > 0:
+                # Calculate overlap ratio based on fuzzy matches
+                overlap_ratio = total_matches / len(project_sectors)
+                sector_score = int(30 * overlap_ratio)

    total_score += sector_score

    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and fund.geographic_focus:
-        project_location_lower = project.location.lower()
-        fund_geo_lower = (fund.geographic_focus or "").lower()
+        project_location_lower = project.location.lower().strip()
+        fund_geo_lower = (fund.geographic_focus or "").lower().strip()

        # Exact match
        if project_location_lower == fund_geo_lower:
@@ -137,10 +173,10 @@ def _calculate_project_fund_compatibility(
            project_location_lower in fund_geo_lower
            or fund_geo_lower in project_location_lower
        ):
-            geo_score = 10
-        # Check for common geographic terms
+            geo_score = 15
+        # Check for common geographic terms or regional overlap
        elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
-            geo_score = 5
+            geo_score = 12

    total_score += geo_score

@@ -209,13 +245,44 @@ def _calculate_project_investor_direct_compatibility(
    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and investor.sectors:
-        project_sector_ids = {sector.id for sector in project.sector}
-        investor_sector_ids = {sector.id for sector in investor.sectors}
-
-        if project_sector_ids and investor_sector_ids:
-            common_sectors = project_sector_ids.intersection(investor_sector_ids)
-            overlap_ratio = len(common_sectors) / len(project_sector_ids)
-            sector_score = int(30 * overlap_ratio)
+        project_sectors = [s for s in project.sector if hasattr(s, 'name')]
+        investor_sectors = [s for s in investor.sectors if hasattr(s, 'name')]
+        
+        if project_sectors and investor_sectors:
+            # Use fuzzy matching to account for similar but not identical sector names
+            match_count = 0
+            total_matches = 0
+            
+            for proj_sector in project_sectors:
+                best_match_score = 0
+                proj_name = proj_sector.name.lower().strip()
+                
+                for inv_sector in investor_sectors:
+                    inv_name = inv_sector.name.lower().strip()
+                    
+                    # Exact match
+                    if proj_name == inv_name:
+                        best_match_score = 1.0
+                        break
+                    
+                    # Fuzzy match using sequence matcher
+                    similarity = SequenceMatcher(None, proj_name, inv_name).ratio()
+                    
+                    # Also check if one contains the other (substring match)
+                    if proj_name in inv_name or inv_name in proj_name:
+                        similarity = max(similarity, 0.8)
+                    
+                    best_match_score = max(best_match_score, similarity)
+                
+                # Count matches with threshold
+                if best_match_score >= 0.6:
+                    total_matches += best_match_score
+                    match_count += 1
+            
+            if match_count > 0:
+                # Calculate overlap ratio based on fuzzy matches
+                overlap_ratio = total_matches / len(project_sectors)
+                sector_score = int(30 * overlap_ratio)

    total_score += sector_score

@@ -278,8 +345,11 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    """
    stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]

+    # Normalize project stage for comparison
+    project_stage_normalized = project_stage.upper().strip()
+
    try:
-        project_idx = stage_order.index(project_stage)
+        project_idx = stage_order.index(project_stage_normalized)
    except ValueError:
        return 0

@@ -290,8 +360,10 @@ def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    if project_idx < len(stage_order) - 1:
        adjacent_stages.append(stage_order[project_idx + 1])

+    # Normalize fund stages and check for matches
    for stage in fund_stages:
-        if stage in adjacent_stages:
+        stage_normalized = stage.upper().strip()
+        if stage_normalized in adjacent_stages:
            return 15  # Half credit for adjacent stage

    return 0
@@ -305,24 +377,62 @@ def _check_geographic_overlap(location1: str, location2: str) -> bool:
        - "San Francisco, CA" and "California" -> True
        - "New York" and "USA" -> True (if both contain USA/US)
        - "London, UK" and "United Kingdom" -> True
+        - "Germany" and "Europe" -> True
    """
-    # Common geographic groupings
+    # Normalize inputs
+    loc1 = location1.lower().strip()
+    loc2 = location2.lower().strip()
+    
+    # Common geographic groupings with broader regional mappings
    geo_groups = [
-        ["usa", "us", "united states", "america"],
-        ["uk", "united kingdom", "britain"],
-        ["california", "ca"],
-        ["new york", "ny"],
+        # North America
+        ["usa", "us", "united states", "america", "u.s.", "u.s.a"],
+        ["canada", "canadian"],
+        ["mexico", "mexican"],
+        
+        # Europe and European countries
+        ["europe", "european", "eu", "germany", "france", "uk", "united kingdom", 
+         "britain", "spain", "italy", "netherlands", "belgium", "sweden", "denmark",
+         "norway", "finland", "poland", "portugal", "austria", "switzerland", 
+         "ireland", "greece", "czech", "romania"],
+        
+        # UK specific
+        ["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"],
+        
+        # US states with their major cities/regions
+        ["california", "ca", "san francisco", "los angeles", "silicon valley"],
+        ["new york", "ny", "nyc"],
        ["texas", "tx"],
-        ["europe", "eu"],
-        ["asia", "asian"],
-        ["africa", "african"],
+        ["massachusetts", "ma", "boston"],
+        ["washington", "seattle"],
+        
+        # Asia
+        ["asia", "asian", "china", "japan", "korea", "singapore", "hong kong", 
+         "india", "indonesia", "thailand", "vietnam", "malaysia", "philippines"],
+        
+        # Middle East
+        ["middle east", "israel", "uae", "dubai", "saudi arabia"],
+        
+        # Latin America
+        ["latin america", "brazil", "argentina", "chile", "colombia", "mexico"],
+        
+        # Africa
+        ["africa", "african", "south africa", "nigeria", "kenya", "egypt"],
+        
+        # Oceania
+        ["australia", "australian", "new zealand"],
    ]

+    # Both must contain a term from one group (substring test: "us" also hits "austria")
    for group in geo_groups:
-        found_in_1 = any(term in location1 for term in group)
-        found_in_2 = any(term in location2 for term in group)
+        found_in_1 = any(term in loc1 for term in group)
+        found_in_2 = any(term in loc2 for term in group)
        if found_in_1 and found_in_2:
            return True
+    
+    # Check for direct substring match (one contains the other)
+    if loc1 in loc2 or loc2 in loc1:
+        return True

    return False

@@ -4,6 +4,10 @@ from typing import Any, Dict, List, Optional
 from jinja2 import Environment, FileSystemLoader
 from playwright.async_api import async_playwright

+# Import database models and compatibility score service
+from db.models import InvestorTable, ProjectTable
+from services.compatibility_score import calculate_project_investor_compatibility
+

 class ReportGenerator:
    """Service for generating PDF reports from HTML templates"""
@@ -17,6 +21,8 @@ class ReportGenerator:
        self,
        investor_data: Dict[str, Any],
        project_data: Optional[Dict[str, Any]] = None,
+        investor_model: Optional[InvestorTable] = None,
+        project_model: Optional[ProjectTable] = None,
    ) -> bytes:
        """
        Generate a PDF report for an investor profile.
@@ -24,12 +30,16 @@ class ReportGenerator:
        Args:
            investor_data: Dictionary containing investor information
            project_data: Optional dictionary containing project information for compatibility analysis
+            investor_model: Optional database model for investor (used for compatibility scoring)
+            project_model: Optional database model for project (used for compatibility scoring)

        Returns:
            bytes: PDF file content
        """
        # Prepare template context
-        context = self._prepare_context(investor_data, project_data)
+        context = self._prepare_context(
+            investor_data, project_data, investor_model, project_model
+        )

        # Render HTML from template
        template = self.env.get_template("report.html")
@@ -43,6 +53,8 @@ class ReportGenerator:
        self,
        investor_data: Dict[str, Any],
        project_data: Optional[Dict[str, Any]] = None,
+        investor_model: Optional[InvestorTable] = None,
+        project_model: Optional[ProjectTable] = None,
    ) -> Dict[str, Any]:
        """Prepare the context dictionary for template rendering"""
        context = {
@@ -55,9 +67,20 @@ class ReportGenerator:

        # If project data is provided, calculate compatibility
        if project_data:
-            context["compatibility_score"] = self._calculate_compatibility_score(
-                investor_data, project_data
-            )
+            # Use the compatibility_score service if models are provided
+            if investor_model and project_model:
+                # Calculate using the standardized compatibility score service
+                # Returns score between 0 and 1, convert to percentage (0-100)
+                score_decimal = calculate_project_investor_compatibility(
+                    project=project_model, investor=investor_model, use_funds=True
+                )
+                context["compatibility_score"] = int(score_decimal * 100)
+            else:
+                # Fallback to old calculation method if models not provided
+                context["compatibility_score"] = self._calculate_compatibility_score(
+                    investor_data, project_data
+                )
+            
            context["match_criteria"] = self._generate_match_criteria(
                investor_data, project_data
            )