# Anton_wireframe/app/services/compatibility_score.py

"""
Compatibility Score Service

This module calculates compatibility scores between projects and investors.
The scoring system evaluates multiple dimensions to determine how well a project
matches with an investor's investment criteria.
"""

import re
from difflib import SequenceMatcher
from typing import List, Optional, Tuple

from db.models import FundTable, InvestorTable, ProjectTable


def calculate_project_investor_compatibility(
    project: ProjectTable, investor: InvestorTable, use_funds: bool = True
) -> float:
    """
    Score how well a project fits an investor.

    When ``use_funds`` is True and the investor has at least one fund, each
    fund is scored on its own and the best fund score is returned. In every
    other case the investor-level attributes are scored directly.

    Args:
        project: The project to evaluate
        investor: The investor to compare against
        use_funds: If True, evaluates against investor's funds. If False (or
            the investor has no funds), uses investor-level data.

    Returns:
        A score on a nominal 0-1 scale, where 1 is a perfect match. On the
        fund path the result is never below 0.0.

    Scoring breakdown (out of 100 points, scaled to 0-1 by the helpers):
        - Investment Stage Match: 30 points
        - Sector Overlap: 30 points
        - Geographic Match: 20 points
        - Valuation/Check Size Fit: 20 points
    """
    if not (use_funds and investor.funds):
        # Fallback: fund data was not requested or is not available
        return _calculate_project_investor_direct_compatibility(project, investor)

    fund_scores = [
        _calculate_project_fund_compatibility(project, fund)
        for fund in investor.funds
    ]
    # The leading 0.0 is the floor the best score can never drop below
    return max([0.0, *fund_scores])


def calculate_project_investors_compatibility(
    project: ProjectTable, investors: List[InvestorTable], use_funds: bool = True
) -> List[Tuple[InvestorTable, float]]:
    """
    Score one project against a collection of investors and rank the results.

    Args:
        project: The project to evaluate
        investors: Investors to compare against
        use_funds: Forwarded to the per-investor scoring; True evaluates the
            investors' funds, False uses investor-level data.

    Returns:
        A new list of (investor, score) tuples ordered from highest to lowest
        score. Investors with equal scores keep their input order.
    """
    ranked = [
        (
            candidate,
            calculate_project_investor_compatibility(project, candidate, use_funds),
        )
        for candidate in investors
    ]

    # sorted() is stable, so ties stay in the order the investors were given
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)


def _calculate_project_fund_compatibility(
    project: ProjectTable, fund: FundTable
) -> float:
    """
    Calculate compatibility score between a project and a specific fund.

    Scoring breakdown (100 points in total):
        - Investment Stage Match: 30 points
          (exact stage=30, adjacent stage=15, otherwise 0)
        - Sector Overlap: 30 points
          (proportional to the fuzzy-matched share of the project's sectors)
        - Geographic Match: 20 points
          (exact=20, one value contains the other=15, regional overlap=18,
          otherwise 0; the checks run in that order)
        - Valuation/Check Size Fit: 20 points
          (valuation inside the check-size range itself=10, inside the
          expected valuation band=20, otherwise up to 10 scaled by how far
          the valuation lies outside the band)

    A dimension contributes 0 points when either side has no data for it.

    Args:
        project: The project to evaluate
        fund: The fund whose criteria the project is compared against

    Returns:
        The summed points divided by 100 (1.0 is a perfect match)
    """
    total_score = 0
    max_score = 100

    # 1. Investment Stage Match (30 points)
    stage_score = 0
    if project.stage and fund.investment_stages:
        # project.stage may be an enum; fall back to str() for plain values
        project_stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )

        # Normalize both sides for case-insensitive comparison
        project_stage_normalized = project_stage_name.upper().strip()
        fund_stages_normalized = {
            stage.name.upper().strip() for stage in fund.investment_stages
        }

        if project_stage_normalized in fund_stages_normalized:
            stage_score = 30
        else:
            # Partial credit for adjacent stages
            stage_score = _calculate_stage_proximity(
                project_stage_normalized, fund_stages_normalized
            )

    total_score += stage_score

    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and fund.sectors:
        project_sectors = [s for s in project.sector if hasattr(s, "name")]
        fund_sectors = [s for s in fund.sectors if hasattr(s, "name")]

        if project_sectors and fund_sectors:
            # Sum of the best similarity of every project sector that clears
            # the 0.6 threshold; fuzzy matching tolerates similar but not
            # identical sector names
            total_matches = 0

            for proj_sector in project_sectors:
                best_match_score = 0
                proj_name = proj_sector.name.lower().strip()

                for fund_sector in fund_sectors:
                    fund_name = fund_sector.name.lower().strip()

                    # Exact match cannot be beaten, stop searching
                    if proj_name == fund_name:
                        best_match_score = 1.0
                        break

                    similarity = SequenceMatcher(None, proj_name, fund_name).ratio()

                    # A substring relationship counts as at least a 0.8 match
                    if proj_name in fund_name or fund_name in proj_name:
                        similarity = max(similarity, 0.8)

                    best_match_score = max(best_match_score, similarity)

                if best_match_score >= 0.6:
                    total_matches += best_match_score

            overlap_ratio = total_matches / len(project_sectors)
            sector_score = int(30 * overlap_ratio)

    total_score += sector_score

    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and fund.geographic_focus:
        project_location_lower = project.location.lower().strip()
        fund_geo_lower = fund.geographic_focus.lower().strip()

        # Whitespace-only values are truthy but empty once stripped; since ""
        # is a substring of every string they must not reach the checks below
        if project_location_lower and fund_geo_lower:
            if project_location_lower == fund_geo_lower:
                geo_score = 20
            elif (
                project_location_lower in fund_geo_lower
                or fund_geo_lower in project_location_lower
            ):
                geo_score = 15
            elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
                # Continent/country matches (e.g., Germany -> Europe)
                geo_score = 18

    total_score += geo_score

    # 4. Valuation/Check Size Fit (20 points)
    valuation_score = 0
    if project.valuation and fund.check_size_lower and fund.check_size_upper:
        if fund.check_size_lower <= project.valuation <= fund.check_size_upper:
            # Valuation is within the check size range (might be too small)
            valuation_score = 10
        else:
            # A typical investment is assumed to be 10-33% of the valuation
            reasonable_valuation_min = fund.check_size_lower * 3  # Investing ~33%
            reasonable_valuation_max = fund.check_size_upper * 10  # Investing ~10%

            if (
                reasonable_valuation_min
                <= project.valuation
                <= reasonable_valuation_max
            ):
                valuation_score = 20
            elif project.valuation < reasonable_valuation_min:
                # Project might be too small
                ratio = (
                    project.valuation / reasonable_valuation_min
                    if reasonable_valuation_min > 0
                    else 0
                )
                valuation_score = int(10 * ratio)
            else:
                # Project might be too large
                ratio = (
                    reasonable_valuation_max / project.valuation
                    if project.valuation > 0
                    else 0
                )
                valuation_score = int(10 * ratio)

    total_score += valuation_score

    # Convert to 0-1 scale
    return total_score / max_score


def _calculate_project_investor_direct_compatibility(
    project: ProjectTable, investor: InvestorTable
) -> float:
    """
    Calculate compatibility using investor-level data (fallback when no funds available).

    Uses the fund-level point scale with investor-level attributes:
        - Investment Stage Match: always 0 (no stage data is read from the
          investor here), so the highest reachable result is 0.7
        - Sector Overlap: up to 30 points (fuzzy-matched share of the
          project's sectors)
        - Geographic Match: exact=20, one value contains the other=15,
          regional overlap=18, otherwise 0 (checked in that order)
        - Valuation/Check Size Fit: 20 inside the expected valuation band,
          otherwise up to 10 scaled by the distance from the band

    Args:
        project: The project to evaluate
        investor: The investor whose own attributes are compared

    Returns:
        The summed points divided by 100
    """
    total_score = 0
    max_score = 100

    # 1. Investment Stage - skipped: no investor-level stage field is used.
    # The 30 points are deliberately not redistributed to other categories.
    stage_score = 0
    total_score += stage_score

    # 2. Sector Overlap (30 points)
    sector_score = 0
    if project.sector and investor.sectors:
        project_sectors = [s for s in project.sector if hasattr(s, "name")]
        investor_sectors = [s for s in investor.sectors if hasattr(s, "name")]

        if project_sectors and investor_sectors:
            # Sum of the best similarity of every project sector that clears
            # the 0.6 threshold
            total_matches = 0

            for proj_sector in project_sectors:
                best_match_score = 0
                proj_name = proj_sector.name.lower().strip()

                for inv_sector in investor_sectors:
                    inv_name = inv_sector.name.lower().strip()

                    # Exact match cannot be beaten, stop searching
                    if proj_name == inv_name:
                        best_match_score = 1.0
                        break

                    similarity = SequenceMatcher(None, proj_name, inv_name).ratio()

                    # A substring relationship counts as at least a 0.8 match
                    if proj_name in inv_name or inv_name in proj_name:
                        similarity = max(similarity, 0.8)

                    best_match_score = max(best_match_score, similarity)

                if best_match_score >= 0.6:
                    total_matches += best_match_score

            overlap_ratio = total_matches / len(project_sectors)
            sector_score = int(30 * overlap_ratio)

    total_score += sector_score

    # 3. Geographic Match (20 points)
    geo_score = 0
    if project.location and investor.geographic_focus:
        # Strip as the fund-level scoring does, so padded values such as
        # " Berlin " still count as an exact match
        project_location_lower = project.location.lower().strip()
        investor_geo_lower = investor.geographic_focus.lower().strip()

        # Whitespace-only values are truthy but empty once stripped; since ""
        # is a substring of every string they must not reach the checks below
        if project_location_lower and investor_geo_lower:
            if project_location_lower == investor_geo_lower:
                geo_score = 20
            elif (
                project_location_lower in investor_geo_lower
                or investor_geo_lower in project_location_lower
            ):
                geo_score = 15
            elif _check_geographic_overlap(
                project_location_lower, investor_geo_lower
            ):
                # Continent/country matches (e.g., Germany -> Europe)
                geo_score = 18

    total_score += geo_score

    # 4. Valuation/Check Size Fit (20 points)
    valuation_score = 0
    if project.valuation and investor.check_size_lower and investor.check_size_upper:
        # A typical investment is assumed to be 10-33% of the valuation
        reasonable_valuation_min = investor.check_size_lower * 3
        reasonable_valuation_max = investor.check_size_upper * 10

        if reasonable_valuation_min <= project.valuation <= reasonable_valuation_max:
            valuation_score = 20
        elif project.valuation < reasonable_valuation_min:
            # Project might be too small
            ratio = (
                project.valuation / reasonable_valuation_min
                if reasonable_valuation_min > 0
                else 0
            )
            valuation_score = int(10 * ratio)
        else:
            # Project might be too large
            ratio = (
                reasonable_valuation_max / project.valuation
                if project.valuation > 0
                else 0
            )
            valuation_score = int(10 * ratio)

    total_score += valuation_score

    # Convert to 0-1 scale
    return total_score / max_score


def _calculate_stage_proximity(project_stage: str, fund_stages: set) -> int:
    """
    Award partial credit when a fund invests in a stage next to the project's.

    Stages are ordered SEED -> SERIES_A -> SERIES_B -> SERIES_C -> GROWTH ->
    LATE_STAGE. Names are compared upper-cased and stripped of surrounding
    whitespace. A fund stage equal to the project's own stage does not count
    as adjacent.

    Args:
        project_stage: Stage name of the project
        fund_stages: Stage names the fund invests in

    Returns:
        15 if any fund stage directly precedes or follows the project's stage,
        otherwise 0 (including when the project's stage is not a known one)
    """
    ladder = ("SEED", "SERIES_A", "SERIES_B", "SERIES_C", "GROWTH", "LATE_STAGE")

    wanted = project_stage.upper().strip()
    if wanted not in ladder:
        return 0

    position = ladder.index(wanted)
    # Neighbours one step down and one step up, clipped to the ladder's ends
    neighbours = {
        ladder[i] for i in (position - 1, position + 1) if 0 <= i < len(ladder)
    }

    if any(candidate.upper().strip() in neighbours for candidate in fund_stages):
        return 15  # Half of the 30 stage points

    return 0


# Groups of lower-case terms that are treated as the same geographic area.
# A region's group also lists the countries counted as part of it, so
# "germany" and "europe" overlap.
_GEO_GROUPS = (
    # North America
    ("usa", "us", "united states", "america", "u.s.", "u.s.a"),
    ("canada", "canadian"),
    ("mexico", "mexican"),
    # Europe and countries
    (
        "europe",
        "european",
        "eu",
        "germany",
        "france",
        "uk",
        "united kingdom",
        "britain",
        "spain",
        "italy",
        "netherlands",
        "belgium",
        "sweden",
        "denmark",
        "norway",
        "finland",
        "poland",
        "portugal",
        "austria",
        "switzerland",
        "ireland",
        "greece",
        "czech",
        "romania",
    ),
    # UK specific
    ("uk", "united kingdom", "britain", "england", "scotland", "wales", "london"),
    # US states
    ("california", "ca", "san francisco", "los angeles", "silicon valley"),
    ("new york", "ny", "nyc"),
    ("texas", "tx"),
    ("massachusetts", "ma", "boston"),
    ("washington", "seattle"),
    # Asia
    (
        "asia",
        "asian",
        "china",
        "japan",
        "korea",
        "singapore",
        "hong kong",
        "india",
        "indonesia",
        "thailand",
        "vietnam",
        "malaysia",
        "philippines",
    ),
    # Middle East
    ("middle east", "israel", "uae", "dubai", "saudi arabia"),
    # Latin America
    ("latin america", "brazil", "argentina", "chile", "colombia", "mexico"),
    # Africa
    ("africa", "african", "south africa", "nigeria", "kenya", "egypt"),
    # Oceania
    ("australia", "australian", "new zealand"),
)


def _compile_geo_group(terms):
    """Build a pattern matching any term of a group as a whole word or phrase."""
    alternatives = "|".join(re.escape(term) for term in terms)
    # The lookarounds reject hits inside a longer word ("ny" in "germany")
    return re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")


# Compiled once at import time instead of rebuilding the table on every call
_GEO_GROUP_PATTERNS = tuple(_compile_geo_group(group) for group in _GEO_GROUPS)


def _check_geographic_overlap(location1: str, location2: str) -> bool:
    """
    Check whether two locations refer to the same geographic group.

    Both inputs are lower-cased and stripped. They overlap when each contains,
    as a whole word or phrase, at least one term of the same group, or when
    one location string contains the other. Terms are never matched inside a
    longer word, so "ny" does not match "germany" and "us" does not match
    "australia".

    Examples:
        - "San Francisco, CA" and "California" -> True
        - "London, UK" and "United Kingdom" -> True
        - "Germany" and "Europe" -> True
        - "Germany" and "New York" -> False
        - "New York" and "USA" -> False (no group links a state to the country)

    Args:
        location1: First location text
        location2: Second location text

    Returns:
        True if the locations overlap, False otherwise. An empty or
        whitespace-only location never overlaps with anything.
    """
    loc1 = location1.lower().strip()
    loc2 = location2.lower().strip()

    # "" is a substring of every string, so reject empty input explicitly
    if not loc1 or not loc2:
        return False

    for pattern in _GEO_GROUP_PATTERNS:
        if pattern.search(loc1) and pattern.search(loc2):
            return True

    # Direct substring match (one contains the other)
    return loc1 in loc2 or loc2 in loc1


def get_top_compatible_investors(
    project: ProjectTable,
    investors: List[InvestorTable],
    limit: int = 10,
    min_score: float = 0.0,
    use_funds: bool = True,
) -> List[Tuple[InvestorTable, float]]:
    """
    Return the best-scoring investors for a project.

    Args:
        project: The project to find investors for
        investors: All investors to consider
        limit: Maximum number of results (applied as a list slice)
        min_score: Lowest score (0-1) an investor may have and still be kept
        use_funds: If True, evaluates against investors' funds

    Returns:
        (investor, score) tuples in descending score order, keeping only
        scores of at least ``min_score`` and at most ``limit`` entries
    """
    ranked = calculate_project_investors_compatibility(project, investors, use_funds)

    # The ranking is already sorted, so filtering keeps the descending order
    qualifying = [pair for pair in ranked if pair[1] >= min_score]

    return qualifying[:limit]


def get_compatibility_score_breakdown(
    project: ProjectTable, investor: InvestorTable, fund: Optional[FundTable] = None
) -> dict:
    """
    Get a detailed breakdown of the compatibility score components.

    Useful for debugging or showing users why a particular score was calculated.

    With a fund, every dimension follows the rules of the fund-level scoring
    in `_calculate_project_fund_compatibility`: case-insensitive stage
    comparison, fuzzy sector-name matching, 20/15/18 geography points and the
    check-size-aware valuation fit. Without a fund only the investor-level
    total is reported.

    Args:
        project: The project being evaluated
        investor: The investor; only used when no fund is given
        fund: Optional fund to break the score down for

    Returns:
        With a fund: ``{"total_score": <0-1 float>, "breakdown": {...}}`` where
        the breakdown has "stage", "sector", "geography" and "valuation"
        entries, each with its "score", "max_score" and supporting details.
        Without a fund: ``{"total_score": ..., "note": ...}``.
    """
    if not fund:
        # Investor-level breakdown (simplified)
        return {
            "total_score": _calculate_project_investor_direct_compatibility(
                project, investor
            ),
            "note": "Using investor-level data (no specific fund selected)",
        }

    # project.stage may be an enum; fall back to str() for plain values
    project_stage_name = None
    if project.stage:
        project_stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )

    # Stage score (30 points)
    stage_score = 0
    stage_match = False
    if project.stage and fund.investment_stages:
        # Normalize both sides for case-insensitive comparison
        project_stage_normalized = project_stage_name.upper().strip()
        fund_stages_normalized = {
            stage.name.upper().strip() for stage in fund.investment_stages
        }
        if project_stage_normalized in fund_stages_normalized:
            stage_score = 30
            stage_match = True
        else:
            # Partial credit for adjacent stages
            stage_score = _calculate_stage_proximity(
                project_stage_normalized, fund_stages_normalized
            )

    # Sector score (30 points)
    sector_score = 0
    matching_sectors = []
    if project.sector and fund.sectors:
        project_sectors = [s for s in project.sector if hasattr(s, "name")]
        fund_sectors = [s for s in fund.sectors if hasattr(s, "name")]

        if project_sectors and fund_sectors:
            total_matches = 0

            for proj_sector in project_sectors:
                proj_name = proj_sector.name.lower().strip()
                best_similarity = 0
                best_fund_sector_name = None

                for fund_sector in fund_sectors:
                    fund_name = fund_sector.name.lower().strip()

                    if proj_name == fund_name:
                        similarity = 1.0
                    else:
                        similarity = SequenceMatcher(
                            None, proj_name, fund_name
                        ).ratio()
                        # A substring relationship counts as at least 0.8
                        if proj_name in fund_name or fund_name in proj_name:
                            similarity = max(similarity, 0.8)

                    if similarity > best_similarity:
                        best_similarity = similarity
                        best_fund_sector_name = fund_sector.name

                    # An exact match cannot be beaten, stop searching
                    if proj_name == fund_name:
                        break

                # Only matches clearing the 0.6 threshold count
                if best_similarity >= 0.6:
                    total_matches += best_similarity
                    if best_fund_sector_name not in matching_sectors:
                        matching_sectors.append(best_fund_sector_name)

            overlap_ratio = total_matches / len(project_sectors)
            sector_score = int(30 * overlap_ratio)

    # Geographic score (20 points)
    geo_score = 0
    geo_match_type = "none"
    if project.location and fund.geographic_focus:
        project_location_lower = project.location.lower().strip()
        fund_geo_lower = fund.geographic_focus.lower().strip()

        # Whitespace-only values are empty once stripped; since "" is a
        # substring of every string they must not reach the checks below
        if project_location_lower and fund_geo_lower:
            if project_location_lower == fund_geo_lower:
                geo_score = 20
                geo_match_type = "exact"
            elif (
                project_location_lower in fund_geo_lower
                or fund_geo_lower in project_location_lower
            ):
                geo_score = 15
                geo_match_type = "partial"
            elif _check_geographic_overlap(project_location_lower, fund_geo_lower):
                geo_score = 18
                geo_match_type = "regional"

    # Valuation score (20 points)
    valuation_score = 0
    valuation_fit = "unknown"
    if project.valuation and fund.check_size_lower and fund.check_size_upper:
        if fund.check_size_lower <= project.valuation <= fund.check_size_upper:
            # Valuation is within the check size range (might be too small)
            valuation_score = 10
            valuation_fit = "too_small"
        else:
            # A typical investment is assumed to be 10-33% of the valuation
            reasonable_valuation_min = fund.check_size_lower * 3
            reasonable_valuation_max = fund.check_size_upper * 10
            if (
                reasonable_valuation_min
                <= project.valuation
                <= reasonable_valuation_max
            ):
                valuation_score = 20
                valuation_fit = "perfect"
            elif project.valuation < reasonable_valuation_min:
                ratio = (
                    project.valuation / reasonable_valuation_min
                    if reasonable_valuation_min > 0
                    else 0
                )
                valuation_score = int(10 * ratio)
                valuation_fit = "too_small"
            else:
                ratio = (
                    reasonable_valuation_max / project.valuation
                    if project.valuation > 0
                    else 0
                )
                valuation_score = int(10 * ratio)
                valuation_fit = "too_large"

    total_score = stage_score + sector_score + geo_score + valuation_score

    return {
        "total_score": total_score / 100,
        "breakdown": {
            "stage": {
                "score": stage_score,
                "max_score": 30,
                "match": stage_match,
                "project_stage": project_stage_name,
                "fund_stages": [s.name for s in fund.investment_stages]
                if fund.investment_stages
                else [],
            },
            "sector": {
                "score": sector_score,
                "max_score": 30,
                "matching_sectors": matching_sectors,
                "project_sectors": [s.name for s in project.sector]
                if project.sector
                else [],
                "fund_sectors": [s.name for s in fund.sectors]
                if fund.sectors
                else [],
            },
            "geography": {
                "score": geo_score,
                "max_score": 20,
                "match_type": geo_match_type,
                "project_location": project.location,
                "fund_geography": fund.geographic_focus,
            },
            "valuation": {
                "score": valuation_score,
                "max_score": 20,
                "fit": valuation_fit,
                "project_valuation": project.valuation,
                "fund_check_size_range": f"{fund.check_size_lower}-{fund.check_size_upper}"
                if fund.check_size_lower
                else None,
            },
        },
    }


def generate_compatibility_explanation(
    project: ProjectTable, investor: InvestorTable, score: float, use_funds: bool = True
) -> str:
    """
    Generate a detailed, natural language explanation of the compatibility score.

    The text consists of the score as a percentage, a match-quality label
    followed by the alignment factors that were found (sector, stage,
    geography, valuation), and at most one recommendation.

    Args:
        project: The project being evaluated
        investor: The investor being compared against
        score: The calculated compatibility score (0-1)
        use_funds: Whether fund-level data was used; when True the investor's
            best-scoring fund (if any scores above 0) drives the explanation

    Returns:
        A formatted string with the compatibility score and detailed explanation
    """
    # round() rather than int(): 0.57 * 100 is 56.99999999999999 in binary
    # floating point and would otherwise be displayed as 56%
    score_percentage = round(score * 100)

    # Determine match quality
    if score_percentage >= 80:
        match_level = "Excellent match"
    elif score_percentage >= 65:
        match_level = "Strong match"
    elif score_percentage >= 50:
        match_level = "Good match"
    elif score_percentage >= 35:
        match_level = "Moderate match"
    else:
        match_level = "Limited match"

    alignment_factors = []
    recommendations = []

    # Pick the best matching fund; stays None when no fund scores above 0
    best_fund = None
    if use_funds and investor.funds:
        best_score = 0
        for fund in investor.funds:
            fund_score = _calculate_project_fund_compatibility(project, fund)
            if fund_score > best_score:
                best_score = fund_score
                best_fund = fund

    # Analyze sector alignment
    if project.sector:
        project_sectors = [s.name for s in project.sector if hasattr(s, "name")]

        if best_fund and best_fund.sectors:
            fund_sectors = {s.name for s in best_fund.sectors if hasattr(s, "name")}
            # Keep the project's own sector order so the text is deterministic
            common_sectors = list(
                dict.fromkeys(n for n in project_sectors if n in fund_sectors)
            )

            if common_sectors:
                sectors_str = ", ".join(common_sectors[:2])
                alignment_factors.append(f"{sectors_str} sector focus")
            elif project_sectors:
                recommendations.append(
                    f"Consider emphasizing any {project_sectors[0]} industry connections"
                )
        elif investor.sectors:
            investor_sectors = {s.name for s in investor.sectors if hasattr(s, "name")}
            common_sectors = list(
                dict.fromkeys(n for n in project_sectors if n in investor_sectors)
            )

            if common_sectors:
                sectors_str = ", ".join(common_sectors[:2])
                alignment_factors.append(f"{sectors_str} sector focus")

    # Analyze stage alignment
    if project.stage:
        stage_name = (
            project.stage.value
            if hasattr(project.stage, "value")
            else str(project.stage)
        )
        stage_display = stage_name.replace("_", " ").title()

        if best_fund and best_fund.investment_stages:
            # Compare case-insensitively, as the fund-level scoring does, so
            # the text never contradicts a stage match that was scored
            fund_stage_names = {
                s.name.upper().strip()
                for s in best_fund.investment_stages
                if hasattr(s, "name")
            }
            if stage_name.upper().strip() in fund_stage_names:
                alignment_factors.append(f"{stage_display} stage")
            else:
                recommendations.append(
                    "Investor typically focuses on different stages; highlight your traction and growth metrics"
                )

        # NOTE(review): without a fund the stage is always listed as an
        # alignment factor although no stage comparison happens - confirm
        # this is intended
        if not best_fund:
            alignment_factors.append(f"{stage_display} stage")

    # Analyze geographic alignment
    if project.location:
        if best_fund and best_fund.geographic_focus:
            if (
                project.location.lower() in best_fund.geographic_focus.lower()
                or best_fund.geographic_focus.lower() in project.location.lower()
            ):
                alignment_factors.append(f"{project.location} presence")
        elif investor.headquarters:
            if (
                project.location.lower() in investor.headquarters.lower()
                or investor.headquarters.lower() in project.location.lower()
            ):
                alignment_factors.append(f"{project.location} market presence")

    # Analyze valuation/check size fit
    if project.valuation:
        if best_fund and best_fund.check_size_lower and best_fund.check_size_upper:
            # A typical investment is assumed to be 10-33% of the valuation
            reasonable_min = best_fund.check_size_lower * 3
            reasonable_max = best_fund.check_size_upper * 10

            if reasonable_min <= project.valuation <= reasonable_max:
                alignment_factors.append("appropriate funding stage")
            elif project.valuation < reasonable_min:
                recommendations.append(
                    "You may be early for this investor; consider approaching at a later stage"
                )
            else:
                recommendations.append(
                    "Consider highlighting your growth trajectory and market opportunity"
                )

    # Build the explanation
    explanation_parts = [f"Based on your startup profile: {score_percentage}% match"]

    if alignment_factors:
        alignment_text = ", ".join(alignment_factors)
        explanation_parts.append(f"{match_level}: {alignment_text}.")
    else:
        explanation_parts.append(f"{match_level}.")

    if recommendations:
        rec_text = recommendations[0]  # Only the first recommendation is shown
        explanation_parts.append(rec_text + ".")

    return " ".join(explanation_parts)