""" Compatibility Score Service This module calculates compatibility scores between projects and investors. The scoring system evaluates multiple dimensions to determine how well a project matches with an investor's investment criteria. """ from difflib import SequenceMatcher from typing import List, Optional, Tuple from db.models import FundTable, InvestorTable, ProjectTable def calculate_project_investor_compatibility( project: ProjectTable, investor: InvestorTable, use_funds: bool = True ) -> float: """ Calculate compatibility score between a project and an investor. Args: project: The project to evaluate investor: The investor to compare against use_funds: If True, evaluates against investor's funds. If False, uses investor-level data. Returns: A score between 0 and 1, where 1 is perfect match Scoring breakdown (out of 100 points): - Investment Stage Match: 30 points - Sector Overlap: 30 points - Geographic Match: 20 points - Valuation/Check Size Fit: 20 points """ if use_funds and investor.funds: # Calculate score for each fund and return the highest max_score = 0.0 for fund in investor.funds: fund_score = _calculate_project_fund_compatibility(project, fund) max_score = max(max_score, fund_score) return max_score else: # Use investor-level data (fallback) return _calculate_project_investor_direct_compatibility(project, investor) def calculate_project_investors_compatibility( project: ProjectTable, investors: List[InvestorTable], use_funds: bool = True ) -> List[Tuple[InvestorTable, float]]: """ Calculate compatibility scores between a project and multiple investors. Args: project: The project to evaluate investors: List of investors to compare against use_funds: If True, evaluates against investors' funds. If False, uses investor-level data. Returns: List of tuples (investor, score) sorted by score descending """ scored_investors = [] for investor in investors: score = calculate_project_investor_compatibility(project, investor, use_funds) scored_investors.append((investor, score)) # Sort by score descending scored_investors.sort(key=lambda x: x[1], reverse=True) return scored_investors def _calculate_project_fund_compatibility( project: ProjectTable, fund: FundTable ) -> float: """ Calculate compatibility score between a project and a specific fund. Scoring breakdown: - Investment Stage Match: 30 points (all or nothing if stage exists) - Sector Overlap: 30 points (proportional to overlap) - Geographic Match: 20 points (exact=20, partial=10, none=0) - Valuation/Check Size Fit: 20 points (proportional to fit) Returns: A score between 0 and 1 """ total_score = 0 max_score = 100 # 1. Investment Stage Match (30 points) stage_score = 0 if project.stage and fund.investment_stages: # Check if project stage matches any of the fund's investment stages fund_stage_names = {stage.name for stage in fund.investment_stages} # Convert project.stage enum to string for comparison project_stage_name = ( project.stage.value if hasattr(project.stage, "value") else str(project.stage) ) # Normalize both for case-insensitive comparison project_stage_normalized = project_stage_name.upper().strip() fund_stages_normalized = {name.upper().strip() for name in fund_stage_names} if project_stage_normalized in fund_stages_normalized: stage_score = 30 else: # Partial credit for adjacent stages stage_score = _calculate_stage_proximity( project_stage_normalized, fund_stages_normalized ) total_score += stage_score # 2. Sector Overlap (30 points) sector_score = 0 if project.sector and fund.sectors: project_sectors = [s for s in project.sector if hasattr(s, "name")] fund_sectors = [s for s in fund.sectors if hasattr(s, "name")] if project_sectors and fund_sectors: # Use fuzzy matching to account for similar but not identical sector names match_count = 0 total_matches = 0 for proj_sector in project_sectors: best_match_score = 0 proj_name = proj_sector.name.lower().strip() for fund_sector in fund_sectors: fund_name = fund_sector.name.lower().strip() # Exact match if proj_name == fund_name: best_match_score = 1.0 break # Fuzzy match using sequence matcher similarity = SequenceMatcher(None, proj_name, fund_name).ratio() # Also check if one contains the other (substring match) if proj_name in fund_name or fund_name in proj_name: similarity = max(similarity, 0.8) best_match_score = max(best_match_score, similarity) # Count matches with threshold # Perfect match (1.0), strong match (>0.75), partial match (>0.6) if best_match_score >= 0.6: total_matches += best_match_score match_count += 1 if match_count > 0: # Calculate overlap ratio based on fuzzy matches overlap_ratio = total_matches / len(project_sectors) sector_score = int(30 * overlap_ratio) total_score += sector_score # 3. Geographic Match (20 points) geo_score = 0 if project.location and fund.geographic_focus: project_location_lower = project.location.lower().strip() fund_geo_lower = (fund.geographic_focus or "").lower().strip() # Exact match if project_location_lower == fund_geo_lower: geo_score = 20 # Partial match (one contains the other) elif ( project_location_lower in fund_geo_lower or fund_geo_lower in project_location_lower ): geo_score = 15 # Check for common geographic terms or regional overlap (continent/country matching) elif _check_geographic_overlap(project_location_lower, fund_geo_lower): # Give higher score for continent/country matches (e.g., Germany -> Europe) geo_score = 18 total_score += geo_score # 4. Valuation/Check Size Fit (20 points) valuation_score = 0 if project.valuation and fund.check_size_lower and fund.check_size_upper: # Check if project valuation falls within or near the check size range # Typically, check size is a fraction of valuation (e.g., 10-20%) # We'll assume check size represents potential investment amount if fund.check_size_lower <= project.valuation <= fund.check_size_upper: # Valuation is within the check size range (might be too small) valuation_score = 10 else: # Check if the check size is reasonable for this valuation # Typical investment is 10-30% of valuation reasonable_valuation_min = fund.check_size_lower * 3 # Investing ~33% reasonable_valuation_max = fund.check_size_upper * 10 # Investing ~10% if ( reasonable_valuation_min <= project.valuation <= reasonable_valuation_max ): # Perfect fit valuation_score = 20 elif project.valuation < reasonable_valuation_min: # Project might be too small ratio = ( project.valuation / reasonable_valuation_min if reasonable_valuation_min > 0 else 0 ) valuation_score = int(10 * ratio) else: # Project might be too large ratio = ( reasonable_valuation_max / project.valuation if project.valuation > 0 else 0 ) valuation_score = int(10 * ratio) total_score += valuation_score # Convert to 0-1 scale return total_score / max_score def _calculate_project_investor_direct_compatibility( project: ProjectTable, investor: InvestorTable ) -> float: """ Calculate compatibility using investor-level data (fallback when no funds available). Uses the same scoring system but with investor-level attributes. """ total_score = 0 max_score = 100 # 1. Investment Stage - Skip this since investors don't have a direct stage field # We could add 30 points to other categories, but for consistency, we'll leave it as 0 stage_score = 0 total_score += stage_score # 2. Sector Overlap (30 points) sector_score = 0 if project.sector and investor.sectors: project_sectors = [s for s in project.sector if hasattr(s, "name")] investor_sectors = [s for s in investor.sectors if hasattr(s, "name")] if project_sectors and investor_sectors: # Use fuzzy matching to account for similar but not identical sector names match_count = 0 total_matches = 0 for proj_sector in project_sectors: best_match_score = 0 proj_name = proj_sector.name.lower().strip() for inv_sector in investor_sectors: inv_name = inv_sector.name.lower().strip() # Exact match if proj_name == inv_name: best_match_score = 1.0 break # Fuzzy match using sequence matcher similarity = SequenceMatcher(None, proj_name, inv_name).ratio() # Also check if one contains the other (substring match) if proj_name in inv_name or inv_name in proj_name: similarity = max(similarity, 0.8) best_match_score = max(best_match_score, similarity) # Count matches with threshold if best_match_score >= 0.6: total_matches += best_match_score match_count += 1 if match_count > 0: # Calculate overlap ratio based on fuzzy matches overlap_ratio = total_matches / len(project_sectors) sector_score = int(30 * overlap_ratio) total_score += sector_score # 3. Geographic Match (20 points) geo_score = 0 if project.location and investor.geographic_focus: project_location_lower = project.location.lower() investor_geo_lower = (investor.geographic_focus or "").lower() if project_location_lower == investor_geo_lower: geo_score = 20 elif ( project_location_lower in investor_geo_lower or investor_geo_lower in project_location_lower ): geo_score = 15 elif _check_geographic_overlap(project_location_lower, investor_geo_lower): # Give higher score for continent/country matches (e.g., Germany -> Europe) geo_score = 18 total_score += geo_score # 4. Valuation/Check Size Fit (20 points) valuation_score = 0 if project.valuation and investor.check_size_lower and investor.check_size_upper: reasonable_valuation_min = investor.check_size_lower * 3 reasonable_valuation_max = investor.check_size_upper * 10 if reasonable_valuation_min <= project.valuation <= reasonable_valuation_max: valuation_score = 20 elif project.valuation < reasonable_valuation_min: ratio = ( project.valuation / reasonable_valuation_min if reasonable_valuation_min > 0 else 0 ) valuation_score = int(10 * ratio) else: ratio = ( reasonable_valuation_max / project.valuation if project.valuation > 0 else 0 ) valuation_score = int(10 * ratio) total_score += valuation_score # Convert to 0-1 scale return total_score / max_score def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int: """ Calculate proximity score between project stage and fund stages. Awards partial credit for adjacent investment stages. Stage progression: SEED -> SERIES_A -> SERIES_B -> SERIES_C -> GROWTH -> LATE_STAGE Returns: Score from 0-15 (half credit for adjacent stages) """ stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"] # Normalize project stage for comparison project_stage_normalized = project_stage.upper().strip() try: project_idx = stage_order.index(project_stage_normalized) except ValueError: return 0 # Check for adjacent stages adjacent_stages = [] if project_idx > 0: adjacent_stages.append(stage_order[project_idx - 1]) if project_idx < len(stage_order) - 1: adjacent_stages.append(stage_order[project_idx + 1]) # Normalize fund stages and check for matches for stage in fund_stages: stage_normalized = stage.upper().strip() if stage_normalized in adjacent_stages: return 15 # Half credit for adjacent stage return 0 def _check_geographic_overlap(location1: str, location2: str) -> bool: """ Check for common geographic terms between two locations. Examples: - "San Francisco, CA" and "California" -> True - "New York" and "USA" -> True (if both contain USA/US) - "London, UK" and "United Kingdom" -> True - "Germany" and "Europe" -> True """ # Normalize inputs loc1 = location1.lower().strip() loc2 = location2.lower().strip() # Common geographic groupings with broader regional mappings geo_groups = [ # North America ["usa", "us", "united states", "america", "u.s.", "u.s.a"], ["canada", "canadian"], ["mexico", "mexican"], # Europe and countries [ "europe", "european", "eu", "germany", "france", "uk", "united kingdom", "britain", "spain", "italy", "netherlands", "belgium", "sweden", "denmark", "norway", "finland", "poland", "portugal", "austria", "switzerland", "ireland", "greece", "czech", "romania", ], # UK specific ["uk", "united kingdom", "britain", "england", "scotland", "wales", "london"], # US states ["california", "ca", "san francisco", "los angeles", "silicon valley"], ["new york", "ny", "nyc"], ["texas", "tx"], ["massachusetts", "ma", "boston"], ["washington", "seattle"], # Asia [ "asia", "asian", "china", "japan", "korea", "singapore", "hong kong", "india", "indonesia", "thailand", "vietnam", "malaysia", "philippines", ], # Middle East ["middle east", "israel", "uae", "dubai", "saudi arabia"], # Latin America ["latin america", "brazil", "argentina", "chile", "colombia", "mexico"], # Africa ["africa", "african", "south africa", "nigeria", "kenya", "egypt"], # Oceania ["australia", "australian", "new zealand"], ] # Check if both locations match any group for group in geo_groups: found_in_1 = any(term in loc1 for term in group) found_in_2 = any(term in loc2 for term in group) if found_in_1 and found_in_2: return True # Check for direct substring match (one contains the other) if loc1 in loc2 or loc2 in loc1: return True return False def get_top_compatible_investors( project: ProjectTable, investors: List[InvestorTable], limit: int = 10, min_score: float = 0.0, use_funds: bool = True, ) -> List[Tuple[InvestorTable, float]]: """ Get the top N most compatible investors for a project. Args: project: The project to find investors for investors: List of all available investors limit: Maximum number of investors to return min_score: Minimum compatibility score threshold (0-1) use_funds: If True, evaluates against investors' funds Returns: List of tuples (investor, score) sorted by score descending, limited to 'limit' items and filtered by min_score """ scored_investors = calculate_project_investors_compatibility( project, investors, use_funds ) # Filter by minimum score filtered_investors = [ (investor, score) for investor, score in scored_investors if score >= min_score ] # Return top N return filtered_investors[:limit] def get_compatibility_score_breakdown( project: ProjectTable, investor: InvestorTable, fund: Optional[FundTable] = None ) -> dict: """ Get a detailed breakdown of the compatibility score components. Useful for debugging or showing users why a particular score was calculated. Returns: Dictionary with score components and explanations """ if fund: total_score = 0 # Stage score stage_score = 0 stage_match = False if project.stage and fund.investment_stages: fund_stage_names = {stage.name for stage in fund.investment_stages} project_stage_name = ( project.stage.value if hasattr(project.stage, "value") else str(project.stage) ) if project_stage_name in fund_stage_names: stage_score = 30 stage_match = True else: stage_score = _calculate_stage_proximity( project_stage_name, fund_stage_names ) # Sector score sector_score = 0 matching_sectors = [] if project.sector and fund.sectors: project_sector_ids = {sector.id for sector in project.sector} fund_sector_ids = {sector.id for sector in fund.sectors} if project_sector_ids and fund_sector_ids: common_sectors = project_sector_ids.intersection(fund_sector_ids) matching_sectors = [ s.name for s in fund.sectors if s.id in common_sectors ] overlap_ratio = len(common_sectors) / len(project_sector_ids) sector_score = int(30 * overlap_ratio) # Geographic score geo_score = 0 geo_match_type = "none" if project.location and fund.geographic_focus: project_location_lower = project.location.lower() fund_geo_lower = fund.geographic_focus.lower() if project_location_lower == fund_geo_lower: geo_score = 20 geo_match_type = "exact" elif ( project_location_lower in fund_geo_lower or fund_geo_lower in project_location_lower ): geo_score = 10 geo_match_type = "partial" elif _check_geographic_overlap(project_location_lower, fund_geo_lower): geo_score = 5 geo_match_type = "regional" # Valuation score valuation_score = 0 valuation_fit = "unknown" if project.valuation and fund.check_size_lower and fund.check_size_upper: reasonable_valuation_min = fund.check_size_lower * 3 reasonable_valuation_max = fund.check_size_upper * 10 if ( reasonable_valuation_min <= project.valuation <= reasonable_valuation_max ): valuation_score = 20 valuation_fit = "perfect" elif project.valuation < reasonable_valuation_min: ratio = ( project.valuation / reasonable_valuation_min if reasonable_valuation_min > 0 else 0 ) valuation_score = int(10 * ratio) valuation_fit = "too_small" else: ratio = ( reasonable_valuation_max / project.valuation if project.valuation > 0 else 0 ) valuation_score = int(10 * ratio) valuation_fit = "too_large" total_score = stage_score + sector_score + geo_score + valuation_score return { "total_score": total_score / 100, "breakdown": { "stage": { "score": stage_score, "max_score": 30, "match": stage_match, "project_stage": project.stage.value if project.stage else None, "fund_stages": [s.name for s in fund.investment_stages] if fund.investment_stages else [], }, "sector": { "score": sector_score, "max_score": 30, "matching_sectors": matching_sectors, "project_sectors": [s.name for s in project.sector] if project.sector else [], "fund_sectors": [s.name for s in fund.sectors] if fund.sectors else [], }, "geography": { "score": geo_score, "max_score": 20, "match_type": geo_match_type, "project_location": project.location, "fund_geography": fund.geographic_focus, }, "valuation": { "score": valuation_score, "max_score": 20, "fit": valuation_fit, "project_valuation": project.valuation, "fund_check_size_range": f"{fund.check_size_lower}-{fund.check_size_upper}" if fund.check_size_lower else None, }, }, } else: # Investor-level breakdown (simplified) return { "total_score": _calculate_project_investor_direct_compatibility( project, investor ), "note": "Using investor-level data (no specific fund selected)", } def generate_compatibility_explanation( project: ProjectTable, investor: InvestorTable, score: float, use_funds: bool = True ) -> str: """ Generate a detailed, natural language explanation of the compatibility score. Args: project: The project being evaluated investor: The investor being compared against score: The calculated compatibility score (0-1) use_funds: Whether fund-level data was used Returns: A formatted string with the compatibility score and detailed explanation """ score_percentage = int(score * 100) # Determine match quality if score_percentage >= 80: match_level = "Excellent match" elif score_percentage >= 65: match_level = "Strong match" elif score_percentage >= 50: match_level = "Good match" elif score_percentage >= 35: match_level = "Moderate match" else: match_level = "Limited match" # Collect alignment factors alignment_factors = [] recommendations = [] # Get the best matching fund if using funds best_fund = None if use_funds and investor.funds: best_score = 0 for fund in investor.funds: fund_score = _calculate_project_fund_compatibility(project, fund) if fund_score > best_score: best_score = fund_score best_fund = fund # Analyze sector alignment if project.sector: project_sectors = [s.name for s in project.sector if hasattr(s, "name")] if best_fund and best_fund.sectors: fund_sectors = {s.name for s in best_fund.sectors if hasattr(s, "name")} common_sectors = set(project_sectors) & fund_sectors if common_sectors: sectors_str = ", ".join(list(common_sectors)[:2]) alignment_factors.append(f"{sectors_str} sector focus") elif project_sectors: recommendations.append( f"Consider emphasizing any {project_sectors[0]} industry connections" ) elif investor.sectors: investor_sectors = {s.name for s in investor.sectors if hasattr(s, "name")} common_sectors = set(project_sectors) & investor_sectors if common_sectors: sectors_str = ", ".join(list(common_sectors)[:2]) alignment_factors.append(f"{sectors_str} sector focus") # Analyze stage alignment if project.stage: stage_name = ( project.stage.value if hasattr(project.stage, "value") else str(project.stage) ) stage_display = stage_name.replace("_", " ").title() if best_fund and best_fund.investment_stages: fund_stage_names = { s.name for s in best_fund.investment_stages if hasattr(s, "name") } if stage_name in fund_stage_names: alignment_factors.append(f"{stage_display} stage") else: recommendations.append( "Investor typically focuses on different stages; highlight your traction and growth metrics" ) if not best_fund: alignment_factors.append(f"{stage_display} stage") # Analyze geographic alignment if project.location: if best_fund and best_fund.geographic_focus: if ( project.location.lower() in best_fund.geographic_focus.lower() or best_fund.geographic_focus.lower() in project.location.lower() ): alignment_factors.append(f"{project.location} presence") elif investor.headquarters: if ( project.location.lower() in investor.headquarters.lower() or investor.headquarters.lower() in project.location.lower() ): alignment_factors.append(f"{project.location} market presence") # Analyze valuation/check size fit if project.valuation: if best_fund and best_fund.check_size_lower and best_fund.check_size_upper: reasonable_min = best_fund.check_size_lower * 3 reasonable_max = best_fund.check_size_upper * 10 if reasonable_min <= project.valuation <= reasonable_max: alignment_factors.append("appropriate funding stage") elif project.valuation < reasonable_min: recommendations.append( "You may be early for this investor; consider approaching at a later stage" ) else: recommendations.append( "Consider highlighting your growth trajectory and market opportunity" ) # Build the explanation explanation_parts = [f"Based on your startup profile: {score_percentage}% match"] if alignment_factors: alignment_text = ", ".join(alignment_factors) explanation_parts.append(f"{match_level}: {alignment_text}.") else: explanation_parts.append(f"{match_level}.") if recommendations: rec_text = recommendations[0] # Show the most important recommendation explanation_parts.append(rec_text + ".") return " ".join(explanation_parts)