# Anton_wireframe/app/services/compatibility_score.py

"""
Compatibility Score Service

This module calculates compatibility scores between projects and investors.
The scoring system evaluates multiple dimensions to determine how well a project
matches with an investor's investment criteria.
"""

from typing import List, Optional, Tuple

from db.models import FundTable, InvestorTable, ProjectTable


def calculate_project_investor_compatibility(
    project: ProjectTable, investor: InvestorTable, use_funds: bool = True
) -> float:
    """
    Score how well a single investor fits a project.

    When ``use_funds`` is true and the investor has at least one fund, each
    fund is scored separately and the best result is returned (floored at
    0.0). In every other case the investor-level attributes are scored
    directly.

    Args:
        project: The project to evaluate
        investor: The investor to compare against
        use_funds: If True, evaluates against investor's funds. If False, uses investor-level data.

    Returns:
        The earned points divided by the 100 points available.

    Point allocation (100 in total):
        - Investment Stage Match: 30 points
        - Sector Overlap: 30 points
        - Geographic Match: 20 points
        - Valuation/Check Size Fit: 20 points
    """
    # Fallback path: fund scoring disabled, or the investor has no funds.
    if not (use_funds and investor.funds):
        return _calculate_project_investor_direct_compatibility(project, investor)

    per_fund_scores = [
        _calculate_project_fund_compatibility(project, fund)
        for fund in investor.funds
    ]
    # The leading 0.0 keeps the result from dropping below zero.
    return max([0.0, *per_fund_scores])


def calculate_project_investors_compatibility(
    project: ProjectTable, investors: List[InvestorTable], use_funds: bool = True
) -> List[Tuple[InvestorTable, float]]:
    """
    Score a project against every investor in a list and rank the results.

    Args:
        project: The project to evaluate
        investors: List of investors to compare against
        use_funds: Forwarded to calculate_project_investor_compatibility for
            each investor.

    Returns:
        A new list of (investor, score) tuples, highest score first. Investors
        with equal scores keep their relative input order.
    """
    scored_pairs = [
        (
            candidate,
            calculate_project_investor_compatibility(project, candidate, use_funds),
        )
        for candidate in investors
    ]
    return sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)


def _calculate_project_fund_compatibility(
    project: ProjectTable, fund: FundTable
) -> float:
    """
    Calculate compatibility score between a project and a specific fund.

    Scoring breakdown:
        - Investment Stage Match: 30 points (all or nothing if stage exists)
        - Sector Overlap: 30 points (proportional to overlap)
        - Geographic Match: 20 points (exact=20, partial=10, none=0)
        - Valuation/Check Size Fit: 20 points (proportional to fit)

    Returns:
        A score between 0 and 1
    """
    total_score = 0
    max_score = 100

    # 1. Investment Stage Match (30 points)
    stage_score = 0
    if project.stage and fund.investment_stages:
        # Check if project stage matches any of the fund's investment stages
        fund_stage_names = {stage.name for stage in fund.investment_stages}
        # Convert project.stage enum to string for comparison
        project_stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )

        if project_stage_name in fund_stage_names:
            stage_score = 30
        else:
            # Partial credit for adjacent stages
            stage_score = _calculate_stage_proximity(
                project_stage_name, fund_stage_names
            )

    total_score += stage_score

    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and fund.sectors:
        project_sector_ids = {sector.id for sector in project.sector}
        fund_sector_ids = {sector.id for sector in fund.sectors}

        if project_sector_ids and fund_sector_ids:
            common_sectors = project_sector_ids.intersection(fund_sector_ids)
            # Score based on what percentage of project sectors are covered by fund
            overlap_ratio = len(common_sectors) / len(project_sector_ids)
            sector_score = int(30 * overlap_ratio)

    total_score += sector_score

    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and fund.geographic_focus:
        project_location_lower = project.location.lower()
        fund_geo_lower = (fund.geographic_focus or "").lower()

        # Exact match
        if project_location_lower == fund_geo_lower:
            geo_score = 20
        # Partial match (one contains the other)
        elif (
            project_location_lower in fund_geo_lower
            or fund_geo_lower in project_location_lower
        ):
            geo_score = 10
        # Check for common geographic terms
        elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
            geo_score = 5

    total_score += geo_score

    # 4. Valuation/Check Size Fit (20 points)
    valuation_score = 0
    if project.valuation and fund.check_size_lower and fund.check_size_upper:
        # Check if project valuation falls within or near the check size range
        # Typically, check size is a fraction of valuation (e.g., 10-20%)
        # We'll assume check size represents potential investment amount

        if fund.check_size_lower <= project.valuation <= fund.check_size_upper:
            # Valuation is within the check size range (might be too small)
            valuation_score = 10
        else:
            # Check if the check size is reasonable for this valuation
            # Typical investment is 10-30% of valuation
            reasonable_valuation_min = fund.check_size_lower * 3  # Investing ~33%
            reasonable_valuation_max = fund.check_size_upper * 10  # Investing ~10%

            if (
                reasonable_valuation_min
                <= project.valuation
                <= reasonable_valuation_max
            ):
                # Perfect fit
                valuation_score = 20
            elif project.valuation < reasonable_valuation_min:
                # Project might be too small
                ratio = (
                    project.valuation / reasonable_valuation_min
                    if reasonable_valuation_min > 0
                    else 0
                )
                valuation_score = int(10 * ratio)
            else:
                # Project might be too large
                ratio = (
                    reasonable_valuation_max / project.valuation
                    if project.valuation > 0
                    else 0
                )
                valuation_score = int(10 * ratio)

    total_score += valuation_score

    # Convert to 0-1 scale
    return total_score / max_score


def _calculate_project_investor_direct_compatibility(
    project: ProjectTable, investor: InvestorTable
) -> float:
    """
    Calculate compatibility using investor-level data (fallback when no funds available).

    Uses the same scoring system but with investor-level attributes.
    """
    total_score = 0
    max_score = 100

    # 1. Investment Stage - Skip this since investors don't have a direct stage field
    # We could add 30 points to other categories, but for consistency, we'll leave it as 0
    stage_score = 0
    total_score += stage_score

    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and investor.sectors:
        project_sector_ids = {sector.id for sector in project.sector}
        investor_sector_ids = {sector.id for sector in investor.sectors}

        if project_sector_ids and investor_sector_ids:
            common_sectors = project_sector_ids.intersection(investor_sector_ids)
            overlap_ratio = len(common_sectors) / len(project_sector_ids)
            sector_score = int(30 * overlap_ratio)

    total_score += sector_score

    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and investor.geographic_focus:
        project_location_lower = project.location.lower()
        investor_geo_lower = (investor.geographic_focus or "").lower()

        if project_location_lower == investor_geo_lower:
            geo_score = 20
        elif (
            project_location_lower in investor_geo_lower
            or investor_geo_lower in project_location_lower
        ):
            geo_score = 10
        elif _check_geographic_overlap(project_location_lower, investor_geo_lower):
            geo_score = 5

    total_score += geo_score

    # 4. Valuation/Check Size Fit (20 points)
    valuation_score = 0
    if project.valuation and investor.check_size_lower and investor.check_size_upper:
        reasonable_valuation_min = investor.check_size_lower * 3
        reasonable_valuation_max = investor.check_size_upper * 10

        if reasonable_valuation_min <= project.valuation <= reasonable_valuation_max:
            valuation_score = 20
        elif project.valuation < reasonable_valuation_min:
            ratio = (
                project.valuation / reasonable_valuation_min
                if reasonable_valuation_min > 0
                else 0
            )
            valuation_score = int(10 * ratio)
        else:
            ratio = (
                reasonable_valuation_max / project.valuation
                if project.valuation > 0
                else 0
            )
            valuation_score = int(10 * ratio)

    total_score += valuation_score

    # Convert to 0-1 scale
    return total_score / max_score


def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    """
    Calculate proximity score between project stage and fund stages.
    Awards partial credit for adjacent investment stages.

    Stage progression: SEED -> SERIES_A -> SERIES_B -> SERIES_C -> GROWTH -> LATE_STAGE

    Returns:
        Score from 0-15 (half credit for adjacent stages)
    """
    stage_order = ["SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE"]

    try:
        project_idx = stage_order.index(project_stage)
    except ValueError:
        return 0

    # Check for adjacent stages
    adjacent_stages = []
    if project_idx > 0:
        adjacent_stages.append(stage_order[project_idx - 1])
    if project_idx < len(stage_order) - 1:
        adjacent_stages.append(stage_order[project_idx + 1])

    for stage in fund_stages:
        if stage in adjacent_stages:
            return 15  # Half credit for adjacent stage

    return 0


def _check_geographic_overlap(location1: str, location2: str) -> bool:
    """
    Check for common geographic terms between two locations.

    Examples:
        - "San Francisco, CA" and "California" -> True
        - "New York" and "USA" -> True (if both contain USA/US)
        - "London, UK" and "United Kingdom" -> True
    """
    # Common geographic groupings
    geo_groups = [
        ["usa", "us", "united states", "america"],
        ["uk", "united kingdom", "britain"],
        ["california", "ca"],
        ["new york", "ny"],
        ["texas", "tx"],
        ["europe", "eu"],
        ["asia", "asian"],
        ["africa", "african"],
    ]

    for group in geo_groups:
        found_in_1 = any(term in location1 for term in group)
        found_in_2 = any(term in location2 for term in group)
        if found_in_1 and found_in_2:
            return True

    return False


def get_top_compatible_investors(
    project: ProjectTable,
    investors: List[InvestorTable],
    limit: int = 10,
    min_score: float = 0.0,
    use_funds: bool = True,
) -> List[Tuple[InvestorTable, float]]:
    """
    Rank investors for a project and keep the best-scoring ones.

    Args:
        project: The project to find investors for
        investors: List of all available investors
        limit: Slice bound applied to the ranked, filtered list
        min_score: Scores below this threshold are dropped
        use_funds: Forwarded to calculate_project_investors_compatibility

    Returns:
        (investor, score) tuples in descending score order, containing only
        entries with score >= min_score, cut to the first 'limit' items.
    """
    ranked = calculate_project_investors_compatibility(project, investors, use_funds)

    # Threshold first, then truncate, so 'limit' counts qualifying entries only.
    shortlisted = []
    for candidate, candidate_score in ranked:
        if candidate_score >= min_score:
            shortlisted.append((candidate, candidate_score))

    return shortlisted[:limit]


def get_compatibility_score_breakdown(
    project: ProjectTable, investor: InvestorTable, fund: Optional[FundTable] = None
) -> dict:
    """
    Get a detailed breakdown of the compatibility score components.

    Useful for debugging or showing users why a particular score was calculated.
    The fund-level rules mirror _calculate_project_fund_compatibility, including
    its 10-point tier for a valuation that lies inside the check-size range, so
    "total_score" matches the score that function produces for the same fund.

    Args:
        project: The project being evaluated.
        investor: The investor; only used when no fund is given.
        fund: Fund to break the score down for. When omitted (or falsy) only
            the investor-level total is returned.

    Returns:
        With a fund: {"total_score": points / 100, "breakdown": {...}} where
        "breakdown" has the keys "stage", "sector", "geography" and
        "valuation", each with "score", "max_score" and supporting details.
        "valuation"["fit"] is one of "unknown", "within_check_size",
        "perfect", "too_small" or "too_large".
        Without a fund: {"total_score": ..., "note": ...}.
    """
    if not fund:
        # Investor-level breakdown (simplified)
        return {
            "total_score": _calculate_project_investor_direct_compatibility(
                project, investor
            ),
            "note": "Using investor-level data (no specific fund selected)",
        }

    # Resolve the stage label once. The stage may be an enum-like object with
    # ``value`` or a plain string; both must work for the scoring and for the
    # "project_stage" field reported below.
    project_stage_name = None
    if project.stage:
        project_stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )

    # Stage score
    stage_score = 0
    stage_match = False
    if project.stage and fund.investment_stages:
        fund_stage_names = {stage.name for stage in fund.investment_stages}
        if project_stage_name in fund_stage_names:
            stage_score = 30
            stage_match = True
        else:
            stage_score = _calculate_stage_proximity(
                project_stage_name, fund_stage_names
            )

    # Sector score
    sector_score = 0
    matching_sectors = []
    if project.sector and fund.sectors:
        project_sector_ids = {sector.id for sector in project.sector}
        fund_sector_ids = {sector.id for sector in fund.sectors}
        if project_sector_ids and fund_sector_ids:
            common_sectors = project_sector_ids.intersection(fund_sector_ids)
            matching_sectors = [
                s.name for s in fund.sectors if s.id in common_sectors
            ]
            overlap_ratio = len(common_sectors) / len(project_sector_ids)
            sector_score = int(30 * overlap_ratio)

    # Geographic score
    geo_score = 0
    geo_match_type = "none"
    if project.location and fund.geographic_focus:
        project_location_lower = project.location.lower()
        fund_geo_lower = fund.geographic_focus.lower()
        if project_location_lower == fund_geo_lower:
            geo_score = 20
            geo_match_type = "exact"
        elif (
            project_location_lower in fund_geo_lower
            or fund_geo_lower in project_location_lower
        ):
            geo_score = 10
            geo_match_type = "partial"
        elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
            geo_score = 5
            geo_match_type = "regional"

    # Valuation score
    valuation_score = 0
    valuation_fit = "unknown"
    if project.valuation and fund.check_size_lower and fund.check_size_upper:
        reasonable_valuation_min = fund.check_size_lower * 3
        reasonable_valuation_max = fund.check_size_upper * 10
        if fund.check_size_lower <= project.valuation <= fund.check_size_upper:
            # Same tier as the fund scorer: a valuation no larger than a
            # single check earns 10 points even if it also falls inside the
            # "reasonable" band.
            valuation_score = 10
            valuation_fit = "within_check_size"
        elif (
            reasonable_valuation_min
            <= project.valuation
            <= reasonable_valuation_max
        ):
            valuation_score = 20
            valuation_fit = "perfect"
        elif project.valuation < reasonable_valuation_min:
            ratio = (
                project.valuation / reasonable_valuation_min
                if reasonable_valuation_min > 0
                else 0
            )
            valuation_score = int(10 * ratio)
            valuation_fit = "too_small"
        else:
            ratio = (
                reasonable_valuation_max / project.valuation
                if project.valuation > 0
                else 0
            )
            valuation_score = int(10 * ratio)
            valuation_fit = "too_large"

    total_score = stage_score + sector_score + geo_score + valuation_score

    # Only report a range when both bounds are present, so the string never
    # reads "<lower>-None".
    check_size_range = None
    if fund.check_size_lower and fund.check_size_upper:
        check_size_range = f"{fund.check_size_lower}-{fund.check_size_upper}"

    return {
        "total_score": total_score / 100,
        "breakdown": {
            "stage": {
                "score": stage_score,
                "max_score": 30,
                "match": stage_match,
                "project_stage": project_stage_name,
                "fund_stages": [s.name for s in fund.investment_stages]
                if fund.investment_stages
                else [],
            },
            "sector": {
                "score": sector_score,
                "max_score": 30,
                "matching_sectors": matching_sectors,
                "project_sectors": [s.name for s in project.sector]
                if project.sector
                else [],
                "fund_sectors": [s.name for s in fund.sectors]
                if fund.sectors
                else [],
            },
            "geography": {
                "score": geo_score,
                "max_score": 20,
                "match_type": geo_match_type,
                "project_location": project.location,
                "fund_geography": fund.geographic_focus,
            },
            "valuation": {
                "score": valuation_score,
                "max_score": 20,
                "fit": valuation_fit,
                "project_valuation": project.valuation,
                "fund_check_size_range": check_size_range,
            },
        },
    }


def generate_compatibility_explanation(
    project: ProjectTable, investor: InvestorTable, score: float, use_funds: bool = True
) -> str:
    """
    Generate a detailed, natural language explanation of the compatibility score.

    The text has up to three parts joined by single spaces: the percentage
    headline, the match level (followed by the alignment factors found, if
    any), and at most one recommendation.

    Args:
        project: The project being evaluated
        investor: The investor being compared against
        score: The calculated compatibility score (0-1)
        use_funds: Whether fund-level data was used. When true and the
            investor has funds, the highest-scoring fund (the first one on
            ties) supplies the sector, stage, geography and check-size data.

    Returns:
        A formatted string with the compatibility score and detailed explanation
    """
    # round() rather than int(): scores are produced as points / 100, and
    # e.g. 0.29 * 100 evaluates to 28.999999999999996, which int() would
    # truncate to 28.
    score_percentage = round(score * 100)

    # Determine match quality
    if score_percentage >= 80:
        match_level = "Excellent match"
    elif score_percentage >= 65:
        match_level = "Strong match"
    elif score_percentage >= 50:
        match_level = "Good match"
    elif score_percentage >= 35:
        match_level = "Moderate match"
    else:
        match_level = "Limited match"

    # Collect alignment factors
    alignment_factors = []
    recommendations = []

    # Get the best matching fund if using funds. max() keeps the first of
    # equally scored funds and still selects a fund when every fund scores 0,
    # so a fund-based score is never explained with investor-level data.
    best_fund = None
    if use_funds and investor.funds:
        best_fund = max(
            investor.funds,
            key=lambda fund: _calculate_project_fund_compatibility(project, fund),
        )

    # Analyze sector alignment
    if project.sector:
        project_sectors = [s.name for s in project.sector if hasattr(s, "name")]

        if best_fund and best_fund.sectors:
            fund_sectors = {s.name for s in best_fund.sectors if hasattr(s, "name")}
            # Keep the project's own sector order (deduplicated) so the
            # wording does not depend on set iteration order.
            common_sectors = list(
                dict.fromkeys(
                    name for name in project_sectors if name in fund_sectors
                )
            )

            if common_sectors:
                sectors_str = ", ".join(common_sectors[:2])
                alignment_factors.append(f"{sectors_str} sector focus")
            elif project_sectors:
                recommendations.append(
                    f"Consider emphasizing any {project_sectors[0]} industry connections"
                )
        elif investor.sectors:
            investor_sectors = {s.name for s in investor.sectors if hasattr(s, "name")}
            common_sectors = list(
                dict.fromkeys(
                    name for name in project_sectors if name in investor_sectors
                )
            )

            if common_sectors:
                sectors_str = ", ".join(common_sectors[:2])
                alignment_factors.append(f"{sectors_str} sector focus")

    # Analyze stage alignment
    if project.stage:
        stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )
        stage_display = stage_name.replace("_", " ").title()

        if best_fund and best_fund.investment_stages:
            fund_stage_names = {
                s.name for s in best_fund.investment_stages if hasattr(s, "name")
            }
            if stage_name in fund_stage_names:
                alignment_factors.append(f"{stage_display} stage")
            else:
                recommendations.append(
                    "Investor typically focuses on different stages; highlight your traction and growth metrics"
                )

        # NOTE(review): without a fund the stage is listed as an alignment
        # factor although nothing is compared against it -- confirm intended.
        if not best_fund:
            alignment_factors.append(f"{stage_display} stage")

    # Analyze geographic alignment
    if project.location:
        if best_fund and best_fund.geographic_focus:
            if (
                project.location.lower() in best_fund.geographic_focus.lower()
                or best_fund.geographic_focus.lower() in project.location.lower()
            ):
                alignment_factors.append(f"{project.location} presence")
        elif investor.headquarters:
            if (
                project.location.lower() in investor.headquarters.lower()
                or investor.headquarters.lower() in project.location.lower()
            ):
                alignment_factors.append(f"{project.location} market presence")

    # Analyze valuation/check size fit
    if project.valuation:
        if best_fund and best_fund.check_size_lower and best_fund.check_size_upper:
            reasonable_min = best_fund.check_size_lower * 3
            reasonable_max = best_fund.check_size_upper * 10

            if reasonable_min <= project.valuation <= reasonable_max:
                alignment_factors.append("appropriate funding stage")
            elif project.valuation < reasonable_min:
                recommendations.append(
                    "You may be early for this investor; consider approaching at a later stage"
                )
            else:
                recommendations.append(
                    "Consider highlighting your growth trajectory and market opportunity"
                )

    # Build the explanation
    explanation_parts = [f"Based on your startup profile: {score_percentage}% match"]

    if alignment_factors:
        alignment_text = ", ".join(alignment_factors)
        explanation_parts.append(f"{match_level}: {alignment_text}.")
    else:
        explanation_parts.append(f"{match_level}.")

    if recommendations:
        rec_text = recommendations[0]  # Show the most important recommendation
        explanation_parts.append(rec_text + ".")

    return " ".join(explanation_parts)