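Update .gitignore to exclude the /preprocessor directory. Refactor find_similar_investors to replace the AI-agent (QueryProcessor) lookup with similarity scoring based on investor characteristics (stage focus, geographic focus, check size, AUM, and sectors), and add a limit query parameter to cap the number of results returned.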

2025-10-01 23:29:29 +01:00
parent 17bc5acbc8
commit 3842171549
2 changed files with 98 additions and 27 deletions
@@ -14,3 +14,5 @@

 *.cypython

+/preprocessor
+
@@ -5,7 +5,6 @@ from db.models import InvestorTable, SectorTable
 from fastapi import APIRouter, Depends, HTTPException, Query
 from pydantic import BaseModel
 from schemas.router_schemas import InvestmentStage, InvestorData
-from services.querying import QueryProcessor
 from sqlalchemy.orm import Session, selectinload

 router = APIRouter(tags=["Investor Routes"])
@@ -235,10 +234,14 @@ def delete_investor(investor_id: int, db: Session = Depends(get_db)):


@router.get("/investors/{investor_id}/similar", response_model=List[InvestorData])
-def find_similar_investors(investor_id: int, db: Session = Depends(get_db)):
-    """Find investors similar to a given investor using AI agent"""
+def find_similar_investors(
+    investor_id: int,
+    limit: int = Query(10, description="Maximum number of similar investors to return"),
+    db: Session = Depends(get_db),
+):
+    """Find investors similar to a given investor based on characteristics"""

-    # First, get the target investor to build the AI query
+    # Get the target investor
    target_investor = (
        db.query(InvestorTable)
        .options(
@@ -253,29 +256,95 @@ def find_similar_investors(investor_id: int, db: Session = Depends(get_db)):
    if not target_investor:
        raise HTTPException(status_code=404, detail="Investor not found")

-    # Build a descriptive query for the AI agent based on target investor characteristics
-    target_sectors = [sector.name for sector in target_investor.sectors]
-    sectors_text = ", ".join(target_sectors) if target_sectors else "any sector"
+    # Get target investor's sector IDs for comparison
+    target_sector_ids = {sector.id for sector in target_investor.sectors}

-    ai_query = f"""
-    Find investors similar to investor ID {investor_id} with the following characteristics:
-    - Stage focus: {target_investor.stage_focus.value if target_investor.stage_focus else "any stage"}
-    - Geographic focus: {target_investor.geographic_focus or "any geography"}
-    - Check size range: ${target_investor.check_size_lower or 0:,} to ${target_investor.check_size_upper or 0:,}
-    - AUM (Assets Under Management): ${target_investor.aum or 0:,}
-    - Sectors: {sectors_text}
-    
-    Find investors with similar characteristics but exclude investor ID {investor_id}.
-    Look for investors with:
-    - Same or similar stage focus
-    - Similar geographic regions
-    - Overlapping check size ranges
-    - Similar AUM levels (within a reasonable range)
-    - Common sector interests
-    """
+    # Query all other investors with their relationships
+    candidates = (
+        db.query(InvestorTable)
+        .options(
+            selectinload(InvestorTable.portfolio_companies),
+            selectinload(InvestorTable.team_members),
+            selectinload(InvestorTable.sectors),
+        )
+        .filter(InvestorTable.id != investor_id)
+        .all()
+    )

-    # Use the AI agent to find similar investors
-    query_processor = QueryProcessor()
-    result = query_processor.process_query(ai_query)
+    # Calculate similarity scores
+    scored_investors = []
+    for candidate in candidates:
+        score = 0

-    return result.investors
+        # Stage focus match (30 points)
+        if candidate.stage_focus == target_investor.stage_focus:
+            score += 30
+
+        # Geographic focus match (20 points for exact, 10 for partial)
+        if candidate.geographic_focus and target_investor.geographic_focus:
+            if (
+                candidate.geographic_focus.lower()
+                == target_investor.geographic_focus.lower()
+            ):
+                score += 20
+            elif (
+                candidate.geographic_focus.lower()
+                in target_investor.geographic_focus.lower()
+                or target_investor.geographic_focus.lower()
+                in candidate.geographic_focus.lower()
+            ):
+                score += 10
+
+        # Check size overlap (20 points max)
+        if (
+            candidate.check_size_lower
+            and candidate.check_size_upper
+            and target_investor.check_size_lower
+            and target_investor.check_size_upper
+        ):
+            # Calculate overlap percentage
+            overlap_start = max(
+                candidate.check_size_lower, target_investor.check_size_lower
+            )
+            overlap_end = min(
+                candidate.check_size_upper, target_investor.check_size_upper
+            )
+            if overlap_end > overlap_start:
+                overlap = overlap_end - overlap_start
+                target_range = (
+                    target_investor.check_size_upper - target_investor.check_size_lower
+                )
+                overlap_ratio = overlap / target_range if target_range > 0 else 0
+                score += int(20 * overlap_ratio)
+
+        # AUM similarity (15 points max)
+        if candidate.aum and target_investor.aum:
+            aum_diff = abs(candidate.aum - target_investor.aum)
+            max_aum = max(candidate.aum, target_investor.aum)
+            similarity_ratio = 1 - (aum_diff / max_aum) if max_aum > 0 else 0
+            score += int(15 * similarity_ratio)
+
+        # Sector overlap (30 points max)
+        candidate_sector_ids = {sector.id for sector in candidate.sectors}
+        if target_sector_ids and candidate_sector_ids:
+            common_sectors = target_sector_ids.intersection(candidate_sector_ids)
+            overlap_ratio = len(common_sectors) / len(target_sector_ids)
+            score += int(30 * overlap_ratio)
+
+        if score > 0:  # Only include investors with some similarity
+            scored_investors.append((score, candidate))
+
+    # Sort by score (descending) and take top N
+    scored_investors.sort(key=lambda x: x[0], reverse=True)
+    similar_investors = [inv for score, inv in scored_investors[:limit]]
+
+    # Transform to InvestorData format
+    return [
+        InvestorData(
+            investor=inv,
+            portfolio_companies=inv.portfolio_companies,
+            team_members=inv.team_members,
+            sectors=inv.sectors,
+        )
+        for inv in similar_investors
+    ]