Implement find_similar_investors endpoint to enhance investor similarity search; refactor update_investor logic and improve scoring mechanism for better results.

2025-10-01 23:31:48 +01:00
22 changed files with 166 additions and 26007 deletions
@@ -10,7 +10,8 @@

 *__pycache__

+/*.db

 *.cypython

-
+/preprocessor
@@ -1,5 +1,4 @@
 import os
-from pathlib import Path
 from typing import Annotated

 from fastapi import Depends
@@ -10,11 +9,7 @@ from sqlalchemy.orm import Session, sessionmaker
 Base = declarative_base()

 # Database configuration
-# Use the preprocessor's database for consistency
-# Get absolute path to the preprocessor database
-# APP_DIR = Path(__file__).parent.parent
-# PREPROCESSOR_DB = APP_DIR.parent / "preprocessor" / "version_two.db"
-DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./version_two.db")
+DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")

 # Create engine
 engine = create_engine(DATABASE_URL, echo=False)
@@ -43,7 +38,6 @@ def get_session_sync() -> Session:
    """Get a database session for synchronous operations"""
    return SessionLocal()

-
 def get_db_session():
    """Get a database session for direct use."""
    return SessionLocal()
@@ -2,7 +2,7 @@ import enum

 from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Table, Text, func
 from sqlalchemy.orm import declarative_mixin, relationship
-from sqlalchemy.types import JSON, Enum
+from sqlalchemy.types import Enum

 from db.db import Base

@@ -70,22 +70,6 @@ project_company_association = Table(
    Column("company_id", Integer, ForeignKey("companies.id")),
 )

-# Association table for fund-stage many-to-many
-fund_investment_stages_association = Table(
-    "fund_investment_stages",
-    Base.metadata,
-    Column("fund_id", Integer, ForeignKey("funds.id")),
-    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
-)
-
-# Association table for fund-sector many-to-many
-fund_sectors_association = Table(
-    "fund_sectors",
-    Base.metadata,
-    Column("fund_id", Integer, ForeignKey("funds.id")),
-    Column("sector_id", Integer, ForeignKey("sectors.id")),
-)
-

 class InvestorTable(Base, TimestampMixin):
    __tablename__ = "investors"
@@ -93,47 +77,14 @@ class InvestorTable(Base, TimestampMixin):
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)
-
-    # Basic investor info
-    website = Column(String, nullable=True)
-    headquarters = Column(String, nullable=True)
-
-    # AUM fields
-    aum = Column(Integer, nullable=True)  # Store as integer for numerical filtering
-    aum_as_of_date = Column(String, nullable=True)
-    aum_source_url = Column(String, nullable=True)
-
-    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
-    check_size_lower = Column(Integer, nullable=True)
-    check_size_upper = Column(Integer, nullable=True)
-
-    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
+    aum = Column(Integer, nullable=True)  # Assets Under Management
+    check_size_lower = Column(Integer, nullable=True)  # Lower bound of check size range
+    check_size_upper = Column(Integer, nullable=True)  # Upper bound of check size range
    geographic_focus = Column(String, nullable=True)
-
-    # Investment thesis and portfolio
-    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
-    portfolio_highlights = Column(
-        JSON, nullable=True
-    )  # Array of portfolio company names
-    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
-
-    # Research metadata
-    researcher_notes = Column(Text, nullable=True)
-    missing_important_fields = Column(
-        JSON, nullable=True
-    )  # Array of missing field names
-    sources = Column(JSON, nullable=True)  # JSON object with source URLs
-
-    # Portfolio info
+    stage_focus = Column(Enum(InvestmentStage), nullable=True)
    number_of_investments = Column(Integer, default=0, nullable=True)

-    # Relationships
-    team_members = relationship(
-        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
-    )
-    funds = relationship(
-        "FundTable", back_populates="investor", cascade="all, delete-orphan"
-    )
+    team_members = relationship("InvestorMember", back_populates="investor")

    # Relationship to portfolio companies
    portfolio_companies = relationship(
@@ -160,51 +111,12 @@ class InvestorMember(Base, TimestampMixin):
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    role = Column(String, nullable=True)
-    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
-    source_url = Column(String, nullable=True)  # URL where member info was found

    investor_id = Column(Integer, ForeignKey("investors.id"))
    investor = relationship("InvestorTable", back_populates="team_members")


-class FundTable(Base, TimestampMixin):
-    __tablename__ = "funds"
-
-    id = Column(Integer, primary_key=True, index=True)
-    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
-
-    # Fund details
-    fund_name = Column(String, nullable=True)
-    fund_size = Column(
-        Integer, nullable=True
-    )  # Store as integer for numerical filtering
-    fund_size_source_url = Column(String, nullable=True)
-
-    # Check size range (parsed from estimated_investment_size by LLM)
-    check_size_lower = Column(Integer, nullable=True)
-    check_size_upper = Column(Integer, nullable=True)
-
-    source_url = Column(String, nullable=True)
-    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
-
-    # Geographic focus as simple string
-    geographic_focus = Column(String, nullable=True)
-
-    # Relationships
-    investor = relationship("InvestorTable", back_populates="funds")
-    investment_stages = relationship(
-        "InvestmentStageTable",
-        secondary=fund_investment_stages_association,
-        back_populates="funds",
-    )
-    sectors = relationship(
-        "SectorTable",
-        secondary=fund_sectors_association,
-        back_populates="funds",
-    )
-
-
 class CompanyTable(Base, TimestampMixin):
    __tablename__ = "companies"

@@ -216,9 +128,7 @@ class CompanyTable(Base, TimestampMixin):
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)

-    members = relationship(
-        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
-    )
+    members = relationship("CompanyMember", back_populates="company")
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
@@ -248,43 +158,26 @@ class CompanyMember(Base, TimestampMixin):
    company = relationship("CompanyTable", back_populates="members")


-class InvestmentStageTable(Base, TimestampMixin):
-    __tablename__ = "investment_stages"
-
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, nullable=False, unique=True)
-
-    # Relationships
-    funds = relationship(
-        "FundTable",
-        secondary=fund_investment_stages_association,
-        back_populates="investment_stages",
-    )
-
-
 class SectorTable(Base, TimestampMixin):
    __tablename__ = "sectors"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)

-    # Relationships
+    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_sector_association,
        back_populates="sectors",
    )
+
    companies = relationship(
        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
    )
+
    projects = relationship(
        "ProjectTable", secondary=project_sector_association, back_populates="sector"
    )
-    funds = relationship(
-        "FundTable",
-        secondary=fund_sectors_association,
-        back_populates="sectors",
-    )


 class ProjectTable(Base, TimestampMixin):
@@ -44,27 +44,6 @@ def health():
 async def parse_csv(
    db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)
 ):
-    """
-    Parse and import CSV data into the database.
-
-    **For investors:**
-    - Expected columns: Name, Website, Final Investor Profile, Final Profile sourcing
-    - Manually parses JSON profiles for efficiency
-    - Uses LLM only for currency conversion to USD
-    - Handles AUM, fund sizes, and check sizes as integers
-
-    **For companies:**
-    - Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
-    - 100% manual JSON parsing - no LLM needed
-    - Extracts company details, executives, investors, and client categories
-    - Automatically links companies to investors in database
-
-    **Benefits:**
-    - Fast processing (5-10s per record)
-    - Low cost (minimal or no LLM usage)
-    - Accurate data extraction
-    - Automatic database persistence
-    """
    # Read uploaded CSV with pandas
    content = await file.read()
    df = pd.read_csv(io.StringIO(content.decode("utf-8")))
@@ -73,15 +52,12 @@ async def parse_csv(
    processor = InvestorProcessor()

    if is_investor == 1:
-        # Manual parser with LLM currency conversion
-        results = await processor.parse_investors(df, save_to_db=True)
-        # Results are already dicts from the new parser
-        return results
+        results = await processor.parse_investors(df)
    else:
-        # Manual parser for companies (no LLM needed)
-        results = await processor.parse_companies(df, save_to_db=True)
-        # Results are already dicts from the new parser
-        return results
+        results = await processor.parse_companies(df)
+
+    # Convert Pydantic objects to dictionaries
+    return [r.model_dump() for r in results]


@app.post("/query", response_model=InvestorList, tags=["Querying"])
@@ -4,11 +4,8 @@ from db.db import get_db
 from db.models import InvestorTable, SectorTable
 from fastapi import APIRouter, Depends, HTTPException, Query
 from pydantic import BaseModel
-from schemas.router_schemas import (
-    InvestmentStage,
-    InvestorData,
-    InvestorFundData,
-)
+from schemas.router_schemas import InvestmentStage, InvestorData
+from services.querying import QueryProcessor
 from sqlalchemy.orm import Session, selectinload

 router = APIRouter(tags=["Investor Routes"])
@@ -37,95 +34,34 @@ class InvestorUpdate(BaseModel):
    number_of_investments: Optional[int] = None


-@router.get("/investors", response_model=List[InvestorFundData])
+@router.get("/investors", response_model=List[InvestorData])
 def read_investors(db: Session = Depends(get_db)):
-    """Get all investors with their funds as separate entries
-
-    Each investor-fund combination is returned as a separate row.
-    An investor with 3 funds will appear as 3 entries.
-    """
+    """Get all investors with their related data"""
    investors = (
        db.query(InvestorTable)
        .options(
            selectinload(InvestorTable.portfolio_companies),
            selectinload(InvestorTable.team_members),
            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
        )
        .all()
    )

-    # Transform to InvestorFundData format (one row per investor-fund combination)
-    investor_fund_list = []
+    # Transform InvestorTable objects to InvestorData format
+    investor_data_list = []
    for investor in investors:
-        # If investor has funds, create one entry per fund
-        if investor.funds:
-            for fund in investor.funds:
-                investor_fund_data = InvestorFundData(
-                    # Investor fields
-                    investor_id=investor.id,
-                    investor_name=investor.name,
-                    investor_description=investor.description,
-                    investor_website=investor.website,
-                    investor_headquarters=investor.headquarters,
-                    aum=investor.aum,
-                    aum_as_of_date=investor.aum_as_of_date,
-                    aum_source_url=investor.aum_source_url,
-                    investment_thesis=investor.investment_thesis,
-                    portfolio_highlights=investor.portfolio_highlights,
-                    number_of_investments=investor.number_of_investments,
-                    # Fund fields
-                    fund_id=fund.id,
-                    fund_name=fund.fund_name,
-                    fund_size=fund.fund_size,
-                    fund_size_source_url=fund.fund_size_source_url,
-                    check_size_lower=fund.check_size_lower,
-                    check_size_upper=fund.check_size_upper,
-                    geographic_focus=fund.geographic_focus,
-                    fund_investment_stages=fund.investment_stages,  # Now a relationship
-                    fund_sectors=fund.sectors,  # Now a relationship
-                    # Related data (same for all funds of this investor)
-                    portfolio_companies=investor.portfolio_companies,
-                    team_members=investor.team_members,
-                    sectors=investor.sectors,
-                )
-                investor_fund_list.append(investor_fund_data)
-        else:
-            # If no funds, create one entry with null fund fields
-            investor_fund_data = InvestorFundData(
-                # Investor fields
-                investor_id=investor.id,
-                investor_name=investor.name,
-                investor_description=investor.description,
-                investor_website=investor.website,
-                investor_headquarters=investor.headquarters,
-                aum=investor.aum,
-                aum_as_of_date=investor.aum_as_of_date,
-                aum_source_url=investor.aum_source_url,
-                investment_thesis=investor.investment_thesis,
-                portfolio_highlights=investor.portfolio_highlights,
-                number_of_investments=investor.number_of_investments,
-                # Fund fields (null)
-                fund_id=None,
-                fund_name=None,
-                fund_size=None,
-                fund_size_source_url=None,
-                check_size_lower=None,
-                check_size_upper=None,
-                geographic_focus=None,
-                fund_investment_stages=None,
-                fund_sectors=None,
-                # Related data
-                portfolio_companies=investor.portfolio_companies,
-                team_members=investor.team_members,
-                sectors=investor.sectors,
-            )
-            investor_fund_list.append(investor_fund_data)
+        investor_data = InvestorData(
+            investor=investor,  # This maps to InvestorSchema
+            portfolio_companies=investor.portfolio_companies,
+            team_members=investor.team_members,
+            sectors=investor.sectors,
+        )
+        investor_data_list.append(investor_data)

-    return investor_fund_list
+    return investor_data_list


-@router.get("/investors/filter", response_model=List[InvestorFundData])
+@router.get("/investors/filter", response_model=List[InvestorData])
 def filter_investors(
    stage: Optional[InvestmentStage] = Query(
        None, description="Filter by investment stage"
@@ -140,18 +76,13 @@ def filter_investors(
    max_aum: Optional[int] = Query(None, description="Maximum AUM"),
    db: Session = Depends(get_db),
 ):
-    """Filter investors based on various criteria
-
-    Returns investor-fund combinations as separate rows.
-    An investor with 3 funds will appear as 3 entries.
-    """
+    """Filter investors based on various criteria"""

    # Start with base query
    query = db.query(InvestorTable).options(
        selectinload(InvestorTable.portfolio_companies),
        selectinload(InvestorTable.team_members),
        selectinload(InvestorTable.sectors),
-        selectinload(InvestorTable.funds),
    )

    # Apply filters
@@ -181,86 +112,29 @@ def filter_investors(

    investors = query.all()

-    # Transform to InvestorFundData format (one row per investor-fund combination)
-    investor_fund_list = []
+    # Transform to InvestorData format
+    investor_data_list = []
    for investor in investors:
-        # If investor has funds, create one entry per fund
-        if investor.funds:
-            for fund in investor.funds:
-                investor_fund_data = InvestorFundData(
-                    # Investor fields
-                    investor_id=investor.id,
-                    investor_name=investor.name,
-                    investor_description=investor.description,
-                    investor_website=investor.website,
-                    investor_headquarters=investor.headquarters,
-                    aum=investor.aum,
-                    aum_as_of_date=investor.aum_as_of_date,
-                    aum_source_url=investor.aum_source_url,
-                    investment_thesis=investor.investment_thesis,
-                    portfolio_highlights=investor.portfolio_highlights,
-                    number_of_investments=investor.number_of_investments,
-                    # Fund fields
-                    fund_id=fund.id,
-                    fund_name=fund.fund_name,
-                    fund_size=fund.fund_size,
-                    fund_size_source_url=fund.fund_size_source_url,
-                    check_size_lower=fund.check_size_lower,
-                    check_size_upper=fund.check_size_upper,
-                    geographic_focus=fund.geographic_focus,
-                    fund_investment_stages=fund.investment_stages,  # Now a relationship
-                    fund_sectors=fund.sectors,  # Now a relationship
-                    # Related data
-                    portfolio_companies=investor.portfolio_companies,
-                    team_members=investor.team_members,
-                    sectors=investor.sectors,
-                )
-                investor_fund_list.append(investor_fund_data)
-        else:
-            # If no funds, create one entry with null fund fields
-            investor_fund_data = InvestorFundData(
-                # Investor fields
-                investor_id=investor.id,
-                investor_name=investor.name,
-                investor_description=investor.description,
-                investor_website=investor.website,
-                investor_headquarters=investor.headquarters,
-                aum=investor.aum,
-                aum_as_of_date=investor.aum_as_of_date,
-                aum_source_url=investor.aum_source_url,
-                investment_thesis=investor.investment_thesis,
-                portfolio_highlights=investor.portfolio_highlights,
-                number_of_investments=investor.number_of_investments,
-                # Fund fields (null)
-                fund_id=None,
-                fund_name=None,
-                fund_size=None,
-                fund_size_source_url=None,
-                check_size_lower=None,
-                check_size_upper=None,
-                geographic_focus=None,
-                fund_investment_stages=None,
-                fund_sectors=None,
-                # Related data
-                portfolio_companies=investor.portfolio_companies,
-                team_members=investor.team_members,
-                sectors=investor.sectors,
-            )
-            investor_fund_list.append(investor_fund_data)
+        investor_data = InvestorData(
+            investor=investor,
+            portfolio_companies=investor.portfolio_companies,
+            team_members=investor.team_members,
+            sectors=investor.sectors,
+        )
+        investor_data_list.append(investor_data)

-    return investor_fund_list
+    return investor_data_list


@router.get("/investors/{investor_id}", response_model=InvestorData)
 def read_investor(investor_id: int, db: Session = Depends(get_db)):
-    """Get a specific investor by ID with all their funds"""
+    """Get a specific investor by ID"""
    investor = (
        db.query(InvestorTable)
        .options(
            selectinload(InvestorTable.portfolio_companies),
            selectinload(InvestorTable.team_members),
            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
        )
        .filter(InvestorTable.id == investor_id)
        .first()
@@ -269,13 +143,12 @@ def read_investor(investor_id: int, db: Session = Depends(get_db)):
    if not investor:
        raise HTTPException(status_code=404, detail="Investor not found")

-    # Transform to InvestorData format (includes funds array)
+    # Transform to InvestorData format
    return InvestorData(
        investor=investor,
        portfolio_companies=investor.portfolio_companies,
        team_members=investor.team_members,
        sectors=investor.sectors,
-        funds=investor.funds,
    )


@@ -294,7 +167,6 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
            selectinload(InvestorTable.portfolio_companies),
            selectinload(InvestorTable.team_members),
            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
        )
        .filter(InvestorTable.id == db_investor.id)
        .first()
@@ -306,76 +178,17 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
        portfolio_companies=investor_with_relations.portfolio_companies,
        team_members=investor_with_relations.team_members,
        sectors=investor_with_relations.sectors,
-        funds=investor_with_relations.funds,
    )


-@router.put("/investors/{investor_id}", response_model=InvestorData)
-def update_investor(
-    investor_id: int, investor: InvestorUpdate, db: Session = Depends(get_db)
-):
-    """Update an existing investor"""
-    db_investor = (
-        db.query(InvestorTable).filter(InvestorTable.id == investor_id).first()
-    )
-    if not db_investor:
-        raise HTTPException(status_code=404, detail="Investor not found")
-
-    update_data = investor.dict(exclude_unset=True)
-    for field, value in update_data.items():
-        setattr(db_investor, field, value)
-
-    db.commit()
-    db.refresh(db_investor)
-
-    # Reload with relationships
-    investor_with_relations = (
-        db.query(InvestorTable)
-        .options(
-            selectinload(InvestorTable.portfolio_companies),
-            selectinload(InvestorTable.team_members),
-            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
-        )
-        .filter(InvestorTable.id == investor_id)
-        .first()
-    )
-
-    # Transform to InvestorData format
-    return InvestorData(
-        investor=investor_with_relations,
-        portfolio_companies=investor_with_relations.portfolio_companies,
-        team_members=investor_with_relations.team_members,
-        sectors=investor_with_relations.sectors,
-        funds=investor_with_relations.funds,
-    )
-
-
-@router.delete("/investors/{investor_id}")
-def delete_investor(investor_id: int, db: Session = Depends(get_db)):
-    """Delete an investor"""
-    db_investor = (
-        db.query(InvestorTable).filter(InvestorTable.id == investor_id).first()
-    )
-    if not db_investor:
-        raise HTTPException(status_code=404, detail="Investor not found")
-
-    db.delete(db_investor)
-    db.commit()
-    return {"message": "Investor deleted successfully"}
-
-
-@router.get("/investors/{investor_id}/similar", response_model=List[InvestorFundData])
+@router.get("/investors/{investor_id}/similar", response_model=List[InvestorData])
 def find_similar_investors(
-    investor_id: int,
+    investor_id: int, 
    limit: int = Query(10, description="Maximum number of similar investors to return"),
-    db: Session = Depends(get_db),
+    db: Session = Depends(get_db)
 ):
-    """Find investors similar to a given investor based on characteristics
-
-    Returns investor-fund combinations as separate rows.
-    """
-
+    """Find investors similar to a given investor based on characteristics"""
+    
    # Get the target investor
    target_investor = (
        db.query(InvestorTable)
@@ -383,7 +196,6 @@ def find_similar_investors(
            selectinload(InvestorTable.portfolio_companies),
            selectinload(InvestorTable.team_members),
            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
        )
        .filter(InvestorTable.id == investor_id)
        .first()
@@ -402,7 +214,6 @@ def find_similar_investors(
            selectinload(InvestorTable.portfolio_companies),
            selectinload(InvestorTable.team_members),
            selectinload(InvestorTable.sectors),
-            selectinload(InvestorTable.funds),
        )
        .filter(InvestorTable.id != investor_id)
        .all()
@@ -412,134 +223,59 @@ def find_similar_investors(
    scored_investors = []
    for candidate in candidates:
        score = 0
-
+        
        # Stage focus match (30 points)
        if candidate.stage_focus == target_investor.stage_focus:
            score += 30
-
+        
        # Geographic focus match (20 points for exact, 10 for partial)
        if candidate.geographic_focus and target_investor.geographic_focus:
-            if (
-                candidate.geographic_focus.lower()
-                == target_investor.geographic_focus.lower()
-            ):
+            if candidate.geographic_focus.lower() == target_investor.geographic_focus.lower():
                score += 20
-            elif (
-                candidate.geographic_focus.lower()
-                in target_investor.geographic_focus.lower()
-                or target_investor.geographic_focus.lower()
-                in candidate.geographic_focus.lower()
-            ):
+            elif (candidate.geographic_focus.lower() in target_investor.geographic_focus.lower() or
+                  target_investor.geographic_focus.lower() in candidate.geographic_focus.lower()):
                score += 10
-
+        
        # Check size overlap (20 points max)
-        if (
-            candidate.check_size_lower
-            and candidate.check_size_upper
-            and target_investor.check_size_lower
-            and target_investor.check_size_upper
-        ):
+        if (candidate.check_size_lower and candidate.check_size_upper and 
+            target_investor.check_size_lower and target_investor.check_size_upper):
            # Calculate overlap percentage
-            overlap_start = max(
-                candidate.check_size_lower, target_investor.check_size_lower
-            )
-            overlap_end = min(
-                candidate.check_size_upper, target_investor.check_size_upper
-            )
+            overlap_start = max(candidate.check_size_lower, target_investor.check_size_lower)
+            overlap_end = min(candidate.check_size_upper, target_investor.check_size_upper)
            if overlap_end > overlap_start:
                overlap = overlap_end - overlap_start
-                target_range = (
-                    target_investor.check_size_upper - target_investor.check_size_lower
-                )
+                target_range = target_investor.check_size_upper - target_investor.check_size_lower
                overlap_ratio = overlap / target_range if target_range > 0 else 0
                score += int(20 * overlap_ratio)
-
+        
        # AUM similarity (15 points max)
        if candidate.aum and target_investor.aum:
            aum_diff = abs(candidate.aum - target_investor.aum)
            max_aum = max(candidate.aum, target_investor.aum)
            similarity_ratio = 1 - (aum_diff / max_aum) if max_aum > 0 else 0
            score += int(15 * similarity_ratio)
-
+        
        # Sector overlap (30 points max)
        candidate_sector_ids = {sector.id for sector in candidate.sectors}
        if target_sector_ids and candidate_sector_ids:
            common_sectors = target_sector_ids.intersection(candidate_sector_ids)
            overlap_ratio = len(common_sectors) / len(target_sector_ids)
            score += int(30 * overlap_ratio)
-
+        
        if score > 0:  # Only include investors with some similarity
            scored_investors.append((score, candidate))
-
+    
    # Sort by score (descending) and take top N
    scored_investors.sort(key=lambda x: x[0], reverse=True)
    similar_investors = [inv for score, inv in scored_investors[:limit]]
-
-    # Transform to InvestorFundData format (one row per investor-fund combination)
-    investor_fund_list = []
-    for investor in similar_investors:
-        # If investor has funds, create one entry per fund
-        if investor.funds:
-            for fund in investor.funds:
-                investor_fund_data = InvestorFundData(
-                    # Investor fields
-                    investor_id=investor.id,
-                    investor_name=investor.name,
-                    investor_description=investor.description,
-                    investor_website=investor.website,
-                    investor_headquarters=investor.headquarters,
-                    aum=investor.aum,
-                    aum_as_of_date=investor.aum_as_of_date,
-                    aum_source_url=investor.aum_source_url,
-                    investment_thesis=investor.investment_thesis,
-                    portfolio_highlights=investor.portfolio_highlights,
-                    number_of_investments=investor.number_of_investments,
-                    # Fund fields
-                    fund_id=fund.id,
-                    fund_name=fund.fund_name,
-                    fund_size=fund.fund_size,
-                    fund_size_source_url=fund.fund_size_source_url,
-                    check_size_lower=fund.check_size_lower,
-                    check_size_upper=fund.check_size_upper,
-                    geographic_focus=fund.geographic_focus,
-                    fund_investment_stages=fund.investment_stages,  # Now a relationship
-                    fund_sectors=fund.sectors,  # Now a relationship
-                    # Related data
-                    portfolio_companies=investor.portfolio_companies,
-                    team_members=investor.team_members,
-                    sectors=investor.sectors,
-                )
-                investor_fund_list.append(investor_fund_data)
-        else:
-            # If no funds, create one entry with null fund fields
-            investor_fund_data = InvestorFundData(
-                # Investor fields
-                investor_id=investor.id,
-                investor_name=investor.name,
-                investor_description=investor.description,
-                investor_website=investor.website,
-                investor_headquarters=investor.headquarters,
-                aum=investor.aum,
-                aum_as_of_date=investor.aum_as_of_date,
-                aum_source_url=investor.aum_source_url,
-                investment_thesis=investor.investment_thesis,
-                portfolio_highlights=investor.portfolio_highlights,
-                number_of_investments=investor.number_of_investments,
-                # Fund fields (null)
-                fund_id=None,
-                fund_name=None,
-                fund_size=None,
-                fund_size_source_url=None,
-                check_size_lower=None,
-                check_size_upper=None,
-                geographic_focus=None,
-                fund_investment_stages=None,
-                fund_sectors=None,
-                # Related data
-                portfolio_companies=investor.portfolio_companies,
-                team_members=investor.team_members,
-                sectors=investor.sectors,
-            )
-            investor_fund_list.append(investor_fund_data)
-
-    return investor_fund_list
+    
+    # Transform to InvestorData format
+    return [
+        InvestorData(
+            investor=inv,
+            portfolio_companies=inv.portfolio_companies,
+            team_members=inv.team_members,
+            sectors=inv.sectors,
+        )
+        for inv in similar_investors
+    ]
@@ -258,6 +258,10 @@ class InvestorSchema(BaseModel):
        default=None,
        description="Geographic investment focus. Do not return any special characters, Just locations separated by commas. Leave empty if not clearly identifiable.",
    )
+    stage_focus: InvestmentStage = Field(
+        default=InvestmentStage.SEED,
+        description="Investment stage focus. Use SEED as default if uncertain.",
+    )
    number_of_investments: Optional[int] = Field(
        default=None,
        ge=0,
@@ -22,14 +22,6 @@ class SectorSchema(BaseModel):
        from_attributes = True


-class InvestmentStageSchema(BaseModel):
-    id: int
-    name: str
-
-    class Config:
-        from_attributes = True
-
-
 class InvestorMemberSchema(BaseModel):
    id: int
    name: str
@@ -40,25 +32,6 @@ class InvestorMemberSchema(BaseModel):
        from_attributes = True


-class FundSchema(BaseModel):
-    id: int
-    fund_name: str | None
-    fund_size: int | None  # Changed to int for numerical filtering
-    fund_size_source_url: str | None
-    check_size_lower: int | None  # NEW: Lower bound of check size range
-    check_size_upper: int | None  # NEW: Upper bound of check size range
-    source_url: str | None
-    source_provider: str | None
-    geographic_focus: str | None  # Changed from List[str] to string
-    investment_stages: List[InvestmentStageSchema] | None  # Changed to relationship
-    sectors: List[SectorSchema] | None  # Changed to relationship
-    created_at: Optional[datetime] = None
-    updated_at: Optional[datetime] = None
-
-    class Config:
-        from_attributes = True
-
-
 class CompanyMemberSchema(BaseModel):
    id: int
    name: Optional[str]
@@ -103,55 +76,12 @@ class InvestorSchema(BaseModel):


 class InvestorData(BaseModel):
-    """Comprehensive investor data schema - used for individual investor requests"""
+    """Comprehensive investor data schema for LLM processing"""

    investor: InvestorSchema
    portfolio_companies: List[CompanySchema]
    team_members: List[InvestorMemberSchema]
    sectors: List[SectorSchema]
-    funds: List[FundSchema]
-
-    class Config:
-        from_attributes = True
-
-
-class InvestorFundData(BaseModel):
-    """Investor-Fund combined data - used for list/filter requests
-
-    Each row represents one investor-fund combination.
-    An investor with 3 funds will appear as 3 separate entries.
-    """
-
-    # Investor fields
-    investor_id: int
-    investor_name: str
-    investor_description: Optional[str]
-    investor_website: Optional[str]
-    investor_headquarters: Optional[str]
-    aum: int | None
-    aum_as_of_date: str | None
-    aum_source_url: str | None
-    investment_thesis: List[str] | None
-    portfolio_highlights: List[str] | None
-    number_of_investments: int | None
-
-    # Fund fields
-    fund_id: int | None
-    fund_name: str | None
-    fund_size: int | None  # Changed to int for numerical filtering
-    fund_size_source_url: str | None
-    check_size_lower: int | None  # NEW: Lower bound of check size range
-    check_size_upper: int | None  # NEW: Upper bound of check size range
-    geographic_focus: str | None  # Changed from List[str] to string
-    fund_investment_stages: (
-        List[InvestmentStageSchema] | None
-    )  # Changed to relationship
-    fund_sectors: List[SectorSchema] | None  # Changed to relationship
-
-    # Related data
-    portfolio_companies: List[CompanySchema]
-    team_members: List[InvestorMemberSchema]
-    sectors: List[SectorSchema]

    class Config:
        from_attributes = True
@@ -169,9 +99,3 @@ class CompanyData(BaseModel):  # Renamed from CompaniesData for consistency

 class InvestorList(BaseModel):
    investors: List[InvestorData]
-
-
-class InvestorFundList(BaseModel):
-    """List of investor-fund combinations"""
-
-    investor_funds: List[InvestorFundData]
@@ -1,6 +1,5 @@
-import json
+import asyncio
 import os
-import re
 from typing import Optional

 import pandas as pd
@@ -8,35 +7,15 @@ from db.db import get_db_session
 from db.models import (
    CompanyMember,
    CompanyTable,
-    FundTable,
-    InvestmentStageTable,
    InvestorMember,
    InvestorTable,
    SectorTable,
 )
 from langchain_openai import ChatOpenAI
-from pydantic import BaseModel
 from schemas.py_schemas import CompanyData, InvestorData
 from sqlalchemy.orm import Session


-class CurrencyConversion(BaseModel):
-    """Schema for LLM currency conversion responses"""
-
-    amount_usd: int = 0
-    confidence: str = "high"  # high, medium, low
-    notes: str = ""
-
-
-class CheckSizeRange(BaseModel):
-    """Schema for LLM check size range parsing from estimated investment size"""
-
-    lower_bound_usd: int = 0
-    upper_bound_usd: int = 0
-    confidence: str = "high"  # high, medium, low
-    notes: str = ""
-
-
 class InvestorProcessor:
    def __init__(self):
        self.llm = ChatOpenAI(
@@ -46,508 +25,9 @@ class InvestorProcessor:
            temperature=0,
        )

-        # Structured LLMs for specific parsing tasks
-        self.currency_converter_llm = self.llm.with_structured_output(
-            CurrencyConversion
-        )
-        self.check_size_parser_llm = self.llm.with_structured_output(CheckSizeRange)
-
-        # Keep legacy structured LLMs for backward compatibility
        self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
        self.company_structured_llm = self.llm.with_structured_output(CompanyData)

-    async def convert_to_usd(self, amount_str: str) -> Optional[int]:
-        """
-        Use LLM to convert currency amounts to USD integers.
-        Handles formats like:
-        - "EUR 850,000,000"
-        - "$5M"
-        - "GBP 10-20 million"
-        - "Approximately EUR 100 million"
-        """
-        if not amount_str or amount_str == "Not Available" or amount_str == "0":
-            return None
-
-        try:
-            prompt = f"""Convert this amount to USD as an integer (whole number, no decimals).
-If it's a range, use the midpoint. If already in USD, just extract the number.
-Remove all commas and convert millions/billions to actual numbers.
-
-Amount: {amount_str}
-
-Examples:
- "EUR 850,000,000" -> 935000000 (assuming EUR to USD rate ~1.10)
- "$5M" -> 5000000
- "GBP 10-20 million" -> 18000000 (midpoint 15M * 1.20 rate)
- "Approximately EUR 100 million" -> 110000000
-
-Return only the USD integer amount with current exchange rates."""
-
-            result = await self.currency_converter_llm.ainvoke(prompt)
-            return result.amount_usd if result.amount_usd > 0 else None
-        except Exception as e:
-            print(f"Error converting currency '{amount_str}': {e}")
-            return None
-
-    async def parse_check_size_range(
-        self, estimated_investment_str: str
-    ) -> tuple[Optional[int], Optional[int]]:
-        """
-        Use LLM to parse check size range from estimated investment size string.
-        Returns tuple of (lower_bound_usd, upper_bound_usd).
-
-        Handles formats like:
-        - "EUR 1,000 to 2,000"
-        - "$100K-$500K"
-        - "Between $1M and $5M"
-        - "Up to EUR 10 million"
-        - "$2M typical"
-        """
-        if (
-            not estimated_investment_str
-            or estimated_investment_str == "Not Available"
-            or estimated_investment_str == "0"
-        ):
-            return None, None
-
-        try:
-            prompt = f"""Parse this check size/investment range into lower and upper bounds in USD as integers.
-
-Input: {estimated_investment_str}
-
-Instructions:
- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds
- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper
- If it says "up to X", use 0 as lower and X as upper
- Convert all currencies to USD using current exchange rates
- Return integers (whole numbers, no decimals)
-
-Examples:
- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200
- "$100K-$500K" -> lower: 100000, upper: 500000
- "Between $1M and $5M" -> lower: 1000000, upper: 5000000
- "Up to EUR 10 million" -> lower: 0, upper: 11000000
- "$2M typical" -> lower: 2000000, upper: 2000000
- "GBP 500K-2M" -> lower: 600000, upper: 2400000
-
-Return the lower and upper bounds in USD."""
-
-            result = await self.check_size_parser_llm.ainvoke(prompt)
-            lower = result.lower_bound_usd if result.lower_bound_usd > 0 else None
-            upper = result.upper_bound_usd if result.upper_bound_usd > 0 else None
-            return lower, upper
-        except Exception as e:
-            print(f"Error parsing check size range '{estimated_investment_str}': {e}")
-            return None, None
-
-    def parse_json_profile(self, json_str: str) -> Optional[dict]:
-        """
-        Manually parse the JSON profile from the CSV.
-        Returns a cleaned dictionary with the investor profile data.
-        """
-        if not json_str or pd.isna(json_str):
-            return None
-
-        try:
-            # Parse JSON string
-            profile = json.loads(json_str)
-            return profile
-        except json.JSONDecodeError as e:
-            print(f"Error parsing JSON: {e}")
-            return None
-
-    async def process_investor_profile(
-        self, name: str, website: str, profile_json: str
-    ) -> Optional[dict]:
-        """
-        Process investor profile from CSV data.
-        Manually extracts fields and uses LLM only for currency conversion.
-        """
-        profile = self.parse_json_profile(profile_json)
-        if not profile:
-            return None
-
-        try:
-            # Extract basic info
-            investor_data = {
-                "name": name.strip() if name else None,
-                "website": website.strip() if website else None,
-                "headquarters": profile.get("headquarters"),
-                "description": profile.get("investorDescription"),
-                "aum": None,
-                "aum_as_of_date": None,
-                "aum_source_url": None,
-                "investment_thesis": profile.get("investmentThesisFocus", []),
-                "portfolio_highlights": profile.get("portfolioHighlights", []),
-                "linked_documents": profile.get("linkedDocuments", []),
-                "researcher_notes": profile.get("researcherNotes"),
-                "missing_important_fields": profile.get("missingImportantFields", []),
-                "sources": profile.get("sources", {}),
-                "team_members": [],
-                "funds": [],
-            }
-
-            # Process AUM
-            aum_data = profile.get("overallAssetsUnderManagement", {})
-            if aum_data and isinstance(aum_data, dict):
-                aum_amount = aum_data.get("aumAmount")
-                if aum_amount and aum_amount != "Not Available":
-                    # Convert AUM to USD integer
-                    aum_usd = await self.convert_to_usd(aum_amount)
-                    investor_data["aum"] = aum_usd
-                    investor_data["aum_as_of_date"] = aum_data.get("asOfDate")
-                    investor_data["aum_source_url"] = aum_data.get("sourceUrl")
-
-            # Process senior leadership
-            senior_leadership = profile.get("seniorLeadership", [])
-            for member in senior_leadership:
-                if isinstance(member, dict) and member.get("name"):
-                    investor_data["team_members"].append(
-                        {
-                            "name": member.get("name"),
-                            "title": member.get("title"),
-                            "role": member.get("title"),  # Use title as role
-                            "email": None,
-                            "source_url": member.get("sourceUrl"),
-                        }
-                    )
-
-            # Process funds
-            funds = profile.get("funds", [])
-            for fund in funds:
-                if isinstance(fund, dict):
-                    fund_data = {
-                        "fund_name": fund.get("fundName"),
-                        "fund_size": None,
-                        "fund_size_source_url": fund.get("fundSizeSourceUrl"),
-                        "check_size_lower": None,
-                        "check_size_upper": None,
-                        "source_url": fund.get("sourceUrl"),
-                        "source_provider": fund.get("sourceProvider"),
-                        "geographic_focus": None,  # Will be converted to string
-                        "investment_stage_names": fund.get("investmentStageFocus", []),
-                        "sector_names": fund.get("sectorFocus", []),
-                    }
-
-                    # Convert geographic focus from array to comma-separated string
-                    geo_focus = fund.get("geographicFocus", [])
-                    if geo_focus and isinstance(geo_focus, list):
-                        fund_data["geographic_focus"] = ", ".join(geo_focus)
-
-                    # Convert fund size to USD integer
-                    fund_size_str = fund.get("fundSize")
-                    if fund_size_str and fund_size_str != "Not Available":
-                        fund_size_usd = await self.convert_to_usd(fund_size_str)
-                        if fund_size_usd:
-                            fund_data["fund_size"] = fund_size_usd  # Store as integer
-
-                    # Parse check size range from estimated investment size
-                    est_size_str = fund.get("estimatedInvestmentSize")
-                    if est_size_str and est_size_str != "Not Available":
-                        check_lower, check_upper = await self.parse_check_size_range(
-                            est_size_str
-                        )
-                        if check_lower is not None:
-                            fund_data["check_size_lower"] = check_lower
-                        if check_upper is not None:
-                            fund_data["check_size_upper"] = check_upper
-
-                    investor_data["funds"].append(fund_data)
-
-            return investor_data
-
-        except Exception as e:
-            print(f"Error processing investor profile for {name}: {e}")
-            return None
-
-    async def process_company_profile(
-        self, name: str, website: str, profile_json: str, investor_names: str = None
-    ) -> Optional[dict]:
-        """
-        Process company profile from CSV data.
-        Manually extracts fields without using LLM.
-        """
-        profile = self.parse_json_profile(profile_json)
-        if not profile:
-            return None
-
-        try:
-            # Extract basic info
-            company_data = {
-                "name": name.strip() if name else None,
-                "website": website.strip() if website else None,
-                "description": profile.get("companyDescription"),
-                "location": profile.get("geographicFocus"),
-                "industry": profile.get("sectorDescription"),
-                "founded_year": None,  # Not typically in the company JSON
-                "key_executives": [],
-                "client_categories": profile.get("clientCategories", []),
-                "product_description": profile.get("productDescription"),
-                "linked_documents": profile.get("linkedDocuments", []),
-                "researcher_notes": profile.get("researcherNotes"),
-                "missing_important_fields": profile.get("missingImportantFields", []),
-                "sources": profile.get("sources", {}),
-                "investor_names": [],
-            }
-
-            # Parse investor names from the Investor column
-            if investor_names and pd.notna(investor_names):
-                # Split by comma and clean
-                investors = [inv.strip() for inv in str(investor_names).split(",")]
-                company_data["investor_names"] = [inv for inv in investors if inv]
-
-            # Process key executives/leadership
-            key_executives = profile.get("keyExecutives", [])
-            if not key_executives:
-                # Try alternative field names
-                key_executives = profile.get("seniorLeadership", [])
-
-            for exec_member in key_executives:
-                if isinstance(exec_member, dict) and exec_member.get("name"):
-                    company_data["key_executives"].append(
-                        {
-                            "name": exec_member.get("name"),
-                            "title": exec_member.get("title"),
-                            "source_url": exec_member.get("sourceUrl"),
-                        }
-                    )
-
-            # Try to extract founding year from description
-            description = company_data.get("description", "")
-            if description:
-                # Look for patterns like "founded in 2020", "Gegründet 2020", "founded 2020"
-                year_patterns = [
-                    r"founded in (\d{4})",
-                    r"founded (\d{4})",
-                    r"Gegründet (\d{4})",
-                    r"established in (\d{4})",
-                    r"since (\d{4})",
-                    r"\((\d{4})\)",  # Year in parentheses
-                ]
-                for pattern in year_patterns:
-                    match = re.search(pattern, description, re.IGNORECASE)
-                    if match:
-                        try:
-                            year = int(match.group(1))
-                            if 1900 <= year <= 2025:  # Sanity check
-                                company_data["founded_year"] = year
-                                break
-                        except Exception:
-                            continue
-
-            return company_data
-
-        except Exception as e:
-            print(f"Error processing company profile for {name}: {e}")
-            return None
-
-    def _save_parsed_company_to_db(
-        self, db: Session, company_data: dict
-    ) -> Optional[CompanyTable]:
-        """Save manually parsed company data to database"""
-        try:
-            # Check if company already exists
-            existing_company = (
-                db.query(CompanyTable).filter_by(name=company_data["name"]).first()
-            )
-
-            if existing_company:
-                # Update existing company
-                company = existing_company
-                company.website = company_data.get("website") or company.website
-                company.location = company_data.get("location") or company.location
-                company.description = (
-                    company_data.get("description") or company.description
-                )
-                company.industry = company_data.get("industry") or company.industry
-                if company_data.get("founded_year"):
-                    company.founded_year = company_data["founded_year"]
-            else:
-                # Create new company
-                company = CompanyTable(
-                    name=company_data["name"],
-                    website=company_data.get("website"),
-                    location=company_data.get("location"),
-                    description=company_data.get("description"),
-                    industry=company_data.get("industry"),
-                    founded_year=company_data.get("founded_year"),
-                )
-                db.add(company)
-                db.flush()
-
-            # Add/update company members (key executives)
-            # First, remove existing members if updating
-            if existing_company:
-                db.query(CompanyMember).filter_by(company_id=company.id).delete()
-
-            for exec_data in company_data.get("key_executives", []):
-                member = CompanyMember(
-                    name=exec_data.get("name"),
-                    role=exec_data.get("title"),
-                    linkedin=exec_data.get(
-                        "source_url"
-                    ),  # Store source URL in linkedin field
-                    company_id=company.id,
-                )
-                db.add(member)
-
-            # Link to investors if provided
-            for investor_name in company_data.get("investor_names", []):
-                # Find investor in database
-                investor = (
-                    db.query(InvestorTable)
-                    .filter_by(name=investor_name.strip())
-                    .first()
-                )
-                if investor:
-                    # Add company to investor's portfolio if not already there
-                    if company not in investor.portfolio_companies:
-                        investor.portfolio_companies.append(company)
-
-            return company
-
-        except Exception as e:
-            print(f"Error saving company to database: {e}")
-            db.rollback()
-            return None
-
-    def _save_parsed_investor_to_db(
-        self, db: Session, investor_data: dict
-    ) -> Optional[InvestorTable]:
-        """Save manually parsed investor data to database"""
-        try:
-            # Check if investor already exists
-            existing_investor = (
-                db.query(InvestorTable).filter_by(name=investor_data["name"]).first()
-            )
-
-            if existing_investor:
-                # Update existing investor
-                investor = existing_investor
-                investor.website = investor_data.get("website") or investor.website
-                investor.headquarters = (
-                    investor_data.get("headquarters") or investor.headquarters
-                )
-                investor.description = (
-                    investor_data.get("description") or investor.description
-                )
-                investor.aum = investor_data.get("aum") or investor.aum
-                investor.aum_as_of_date = (
-                    investor_data.get("aum_as_of_date") or investor.aum_as_of_date
-                )
-                investor.aum_source_url = (
-                    investor_data.get("aum_source_url") or investor.aum_source_url
-                )
-                investor.investment_thesis = (
-                    investor_data.get("investment_thesis") or investor.investment_thesis
-                )
-                investor.portfolio_highlights = (
-                    investor_data.get("portfolio_highlights")
-                    or investor.portfolio_highlights
-                )
-                investor.linked_documents = (
-                    investor_data.get("linked_documents") or investor.linked_documents
-                )
-                investor.researcher_notes = (
-                    investor_data.get("researcher_notes") or investor.researcher_notes
-                )
-                investor.missing_important_fields = (
-                    investor_data.get("missing_important_fields")
-                    or investor.missing_important_fields
-                )
-                investor.sources = investor_data.get("sources") or investor.sources
-            else:
-                # Create new investor
-                investor = InvestorTable(
-                    name=investor_data["name"],
-                    website=investor_data.get("website"),
-                    headquarters=investor_data.get("headquarters"),
-                    description=investor_data.get("description"),
-                    aum=investor_data.get("aum"),
-                    aum_as_of_date=investor_data.get("aum_as_of_date"),
-                    aum_source_url=investor_data.get("aum_source_url"),
-                    investment_thesis=investor_data.get("investment_thesis"),
-                    portfolio_highlights=investor_data.get("portfolio_highlights"),
-                    linked_documents=investor_data.get("linked_documents"),
-                    researcher_notes=investor_data.get("researcher_notes"),
-                    missing_important_fields=investor_data.get(
-                        "missing_important_fields"
-                    ),
-                    sources=investor_data.get("sources"),
-                )
-                db.add(investor)
-                db.flush()
-
-            # Add/update team members
-            # First, remove existing team members if updating
-            if existing_investor:
-                db.query(InvestorMember).filter_by(investor_id=investor.id).delete()
-
-            for member_data in investor_data.get("team_members", []):
-                member = InvestorMember(
-                    name=member_data.get("name"),
-                    role=member_data.get("role"),
-                    title=member_data.get("title"),
-                    email=member_data.get("email"),
-                    source_url=member_data.get("source_url"),
-                    investor_id=investor.id,
-                )
-                db.add(member)
-
-            # Add/update funds
-            # First, remove existing funds if updating
-            if existing_investor:
-                db.query(FundTable).filter_by(investor_id=investor.id).delete()
-
-            for fund_data in investor_data.get("funds", []):
-                fund = FundTable(
-                    investor_id=investor.id,
-                    fund_name=fund_data.get("fund_name"),
-                    fund_size=fund_data.get("fund_size"),  # Now an integer
-                    fund_size_source_url=fund_data.get("fund_size_source_url"),
-                    check_size_lower=fund_data.get("check_size_lower"),
-                    check_size_upper=fund_data.get("check_size_upper"),
-                    source_url=fund_data.get("source_url"),
-                    source_provider=fund_data.get("source_provider"),
-                    geographic_focus=fund_data.get("geographic_focus"),  # Now a string
-                )
-                db.add(fund)
-                db.flush()  # Get the fund ID
-
-                # Add investment stages (many-to-many)
-                for stage_name in fund_data.get("investment_stage_names", []):
-                    stage = self._get_or_create_investment_stage(db, stage_name)
-                    fund.investment_stages.append(stage)
-
-                # Add sectors (many-to-many)
-                for sector_name in fund_data.get("sector_names", []):
-                    sector = self._get_or_create_sector(db, sector_name)
-                    fund.sectors.append(sector)
-
-            return investor
-
-        except Exception as e:
-            print(f"Error saving investor to database: {e}")
-            db.rollback()
-            return None
-
-    def _get_or_create_investment_stage(
-        self, db: Session, stage_name: str
-    ) -> InvestmentStageTable:
-        """Get existing investment stage or create new one"""
-        from db.models import InvestmentStageTable
-
-        stage = (
-            db.query(InvestmentStageTable)
-            .filter(InvestmentStageTable.name == stage_name)
-            .first()
-        )
-        if not stage:
-            stage = InvestmentStageTable(name=stage_name)
-            db.add(stage)
-            db.flush()  # Get the ID without committing
-        return stage
-
    def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable:
        """Get existing sector or create new one"""
        sector = db.query(SectorTable).filter(SectorTable.name == sector_name).first()
@@ -569,6 +49,7 @@ Return the lower and upper bounds in USD."""
            check_size_lower=investor_data.investor.check_size_lower,
            check_size_upper=investor_data.investor.check_size_upper,
            geographic_focus=investor_data.investor.geographic_focus,
+            stage_focus=investor_data.investor.stage_focus,
            number_of_investments=investor_data.investor.number_of_investments,
        )
        db.add(investor)
@@ -692,219 +173,141 @@ Return the lower and upper bounds in USD."""
            print(f"Error processing row {row_idx + 1}: {e}")
            return None

-    async def parse_investors(self, df: pd.DataFrame, save_to_db: bool = True):
-        """
-        Parse investors from DataFrame using manual JSON parsing and LLM for currency conversion.
-        Expected CSV columns: Name, Website, Final Investor Profile, Final Profile sourcing
-        """
-        results = []
+    async def parse_investors(self, df, save_to_db: bool = True):
+        """Parse investors from DataFrame and optionally save to database"""
+        investors = []
+        df = df[20:]  # NOTE(review): slices off the first 20 rows before processing; looks like a debugging/resume leftover -- confirm it is intended
        db = None
        if save_to_db:
            db = get_db_session()

        try:
-            total_rows = len(df)
-            print(f"\n🚀 Starting to process {total_rows} investors...")
+            # Process rows in batches asynchronously
+            batch_size = 20  # Adjust batch size as needed
+            rows = [(idx, row) for idx, row in df.iterrows()]

-            for idx, row in df.iterrows():
-                try:
-                    name = (
-                        row.get("Name", "").strip()
-                        if pd.notna(row.get("Name"))
-                        else None
-                    )
-                    website = (
-                        row.get("Website", "").strip()
-                        if pd.notna(row.get("Website"))
-                        else None
-                    )
-                    profile_json = (
-                        row.get("Final Investor Profile", "")
-                        if pd.notna(row.get("Final Investor Profile"))
-                        else None
-                    )
+            for i in range(0, len(rows), batch_size):
+                batch = rows[i : i + batch_size]

-                    if not name or not profile_json:
-                        print(f"⚠️  Row {idx + 1}: Skipping - missing name or profile")
+                # Process batch asynchronously
+                tasks = [
+                    self._process_row(row, idx, is_investor=True) for idx, row in batch
+                ]
+
+                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Handle results from batch
+                for (idx, row), result in zip(batch, batch_results):
+                    if isinstance(result, Exception):
+                        print(f"Error processing row {idx}: {result}")
+                        if db:
+                            db.rollback()
                        continue

-                    print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}")
+                    if result:
+                        # Convert dict to InvestorData if needed
+                        if isinstance(result, dict):
+                            investor_data = InvestorData(**result)
+                        else:
+                            investor_data = result

-                    # Process the investor profile
-                    investor_data = await self.process_investor_profile(
-                        name, website, profile_json
-                    )
+                        investors.append(investor_data)

-                    if investor_data:
-                        results.append(investor_data)
-                        print("   ✓ Parsed successfully")
-                        print(f"   - HQ: {investor_data.get('headquarters')}")
-                        print(
-                            f"   - AUM: ${investor_data.get('aum'):,}"
-                            if investor_data.get("aum")
-                            else "   - AUM: Not Available"
-                        )
-                        print(f"   - Funds: {len(investor_data.get('funds', []))}")
-                        print(
-                            f"   - Team: {len(investor_data.get('team_members', []))}"
-                        )
-
-                        # Save to database
+                        # Save to database if requested
                        if save_to_db and db:
                            try:
-                                saved_investor = self._save_parsed_investor_to_db(
+                                saved_investor = self._save_investor_to_db(
                                    db, investor_data
                                )
-                                if saved_investor:
-                                    db.commit()
-                                    print(
-                                        f"   ✅ Saved to database (ID: {saved_investor.id})"
-                                    )
-                                else:
-                                    print("   ❌ Failed to save to database")
+                                db.commit()
+                                print(
+                                    f"✅ Saved investor '{saved_investor.name}' to database"
+                                )
                            except Exception as e:
                                db.rollback()
-                                print(f"   ❌ Database error: {e}")
-                    else:
-                        print("   ⚠️  Failed to process profile")
+                                print(f"❌ Failed to save investor to database: {e}")

-                    # Commit every 10 investors to avoid memory issues
-                    if save_to_db and db and (idx + 1) % 10 == 0:
-                        db.commit()
-                        print(f"\n💾 Committed batch at row {idx + 1}")
-
-                except Exception as e:
-                    print(f"❌ Error processing row {idx + 1}: {e}")
-                    if db:
-                        db.rollback()
-                    continue
-
-            # Final commit
-            if save_to_db and db:
-                db.commit()
-                print("\n✅ Final commit completed")
+                print(
+                    f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
+                )

        except Exception as e:
-            print(f"❌ Fatal error in parse_investors: {e}")
+            print(f"Error in batch processing: {e}")
            if db:
                db.rollback()
        finally:
            if db:
                db.close()

-        print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} investors")
-        return results
+        return investors

-    async def parse_companies(self, df: pd.DataFrame, save_to_db: bool = True):
-        """
-        Parse companies from DataFrame using manual JSON parsing.
-        Expected CSV columns: Name, Website, Investor, Final Investor Profile (actually company profile)
-        """
-        results = []
+    async def parse_companies(self, df, save_to_db: bool = True):
+        """Parse companies from DataFrame and optionally save to database"""
+        companies = []
+        df = df[20:]  # NOTE(review): slices off the first 20 rows before processing; looks like a debugging/resume leftover -- confirm it is intended
        db = None
        if save_to_db:
            db = get_db_session()

        try:
-            total_rows = len(df)
-            print(f"\n🚀 Starting to process {total_rows} companies...")
+            # Process rows in batches asynchronously
+            batch_size = 20  # Adjust batch size as needed
+            rows = [(idx, row) for idx, row in df.iterrows()]

-            for idx, row in df.iterrows():
-                try:
-                    name = (
-                        row.get("Name", "").strip()
-                        if pd.notna(row.get("Name"))
-                        else None
-                    )
-                    website = (
-                        row.get("Website", "").strip()
-                        if pd.notna(row.get("Website"))
-                        else None
-                    )
-                    investor_names = (
-                        row.get("Investor", "").strip()
-                        if pd.notna(row.get("Investor"))
-                        else None
-                    )
-                    profile_json = (
-                        row.get("Final Investor Profile", "")
-                        if pd.notna(row.get("Final Investor Profile"))
-                        else None
-                    )
+            for i in range(0, len(rows), batch_size):
+                batch = rows[i : i + batch_size]

-                    if not name or not profile_json:
-                        print(f"⚠️  Row {idx + 1}: Skipping - missing name or profile")
+                # Process batch asynchronously
+                tasks = [
+                    self._process_row(row, idx, is_investor=False) for idx, row in batch
+                ]
+
+                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Handle results from batch
+                for (idx, row), result in zip(batch, batch_results):
+                    if isinstance(result, Exception):
+                        print(f"Error processing row {idx}: {result}")
+                        if db:
+                            db.rollback()
                        continue

-                    print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}")
+                    if result:
+                        # Convert dict to CompanyData if needed
+                        if isinstance(result, dict):
+                            company_data = CompanyData(**result)
+                        else:
+                            company_data = result

-                    # Process the company profile
-                    company_data = await self.process_company_profile(
-                        name, website, profile_json, investor_names
-                    )
+                        companies.append(company_data)

-                    if company_data:
-                        results.append(company_data)
-                        print("   ✓ Parsed successfully")
-                        print(f"   - Location: {company_data.get('location')}")
-                        print(f"   - Industry: {company_data.get('industry')}")
-                        print(
-                            f"   - Founded: {company_data.get('founded_year')}"
-                            if company_data.get("founded_year")
-                            else "   - Founded: Unknown"
-                        )
-                        print(
-                            f"   - Executives: {len(company_data.get('key_executives', []))}"
-                        )
-                        print(
-                            f"   - Investors: {len(company_data.get('investor_names', []))}"
-                        )
-
-                        # Save to database
+                        # Save to database if requested
                        if save_to_db and db:
                            try:
-                                saved_company = self._save_parsed_company_to_db(
+                                saved_company = self._save_company_to_db(
                                    db, company_data
                                )
-                                if saved_company:
-                                    db.commit()
-                                    print(
-                                        f"   ✅ Saved to database (ID: {saved_company.id})"
-                                    )
-                                else:
-                                    print("   ❌ Failed to save to database")
+                                db.commit()
+                                print(
+                                    f"✅ Saved company '{saved_company.name}' to database"
+                                )
                            except Exception as e:
                                db.rollback()
-                                print(f"   ❌ Database error: {e}")
-                    else:
-                        print("   ⚠️  Failed to process profile")
+                                print(f"❌ Failed to save company to database: {e}")

-                    # Commit every 10 companies to avoid memory issues
-                    if save_to_db and db and (idx + 1) % 10 == 0:
-                        db.commit()
-                        print(f"\n💾 Committed batch at row {idx + 1}")
-
-                except Exception as e:
-                    print(f"❌ Error processing row {idx + 1}: {e}")
-                    if db:
-                        db.rollback()
-                    continue
-
-            # Final commit
-            if save_to_db and db:
-                db.commit()
-                print("\n✅ Final commit completed")
+                print(  # one progress line per batch: sits in the batch loop, not the per-row loop
+                    f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
+                )

        except Exception as e:
-            print(f"❌ Fatal error in parse_companies: {e}")
+            print(f"Error in batch processing: {e}")  # batch-level handler: `idx` may be unbound here, so it must not be formatted
            if db:
                db.rollback()
        finally:
            if db:
                db.close()

-        print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} companies")
-        return results
+        return companies


 # async def main():
@@ -95,7 +95,6 @@ class QueryProcessor:
                    selectinload(InvestorTable.portfolio_companies),
                    selectinload(InvestorTable.team_members),
                    selectinload(InvestorTable.sectors),
-                    selectinload(InvestorTable.funds),
                )
                .filter(InvestorTable.id.in_(investor_ids))
            )
@@ -110,7 +109,6 @@ class QueryProcessor:
                    portfolio_companies=investor.portfolio_companies,
                    team_members=investor.team_members,
                    sectors=investor.sectors,
-                    funds=investor.funds,
                )
                investor_data_list.append(investor_data)

@@ -1,315 +0,0 @@
-import logging
-import re
-import unicodedata
-
-import pandas as pd
-from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
-from sqlalchemy.orm import sessionmaker
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-# Import the schema
-init_database()
-
-
-# ===================== Ingesting Original Data =====================#
-def parse_investor_names(investor_names_str):
-    """Parse comma-separated investor names and return a list"""
-    if pd.isna(investor_names_str) or investor_names_str == "":
-        return []
-
-    # Split by comma and clean whitespace
-    # investors = [name.strip() for name in str(investor_names_str).split(",")]
-    investors = [
-        clean_name(name.strip()) for name in str(investor_names_str).split(",")
-    ]
-    return [investor for investor in investors if investor]
-
-
-def parse_industries(industries_str):
-    """Parse comma-separated industries and return a list"""
-    if pd.isna(industries_str) or industries_str == "":
-        return []
-
-    # Split by comma and clean whitespace
-    industries = [industry.strip() for industry in str(industries_str).split(",")]
-    return [industry for industry in industries if industry]
-
-
-def clean_special_characters(text):
-    """Clean special characters from text, converting to ASCII equivalents"""
-    if not text:
-        return text
-
-    # First remove ellipses and other problematic patterns
-    text = str(text).replace("...", "").replace("..", "")
-
-    # Normalize unicode characters to their closest ASCII equivalents
-    normalized = unicodedata.normalize("NFKD", text)
-
-    # Remove accents and convert to ASCII
-    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
-
-    # Remove any remaining non-alphanumeric characters except spaces, hyphens, and periods
-    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_text)
-
-    # Clean up multiple spaces
-    cleaned = re.sub(r"\s+", " ", cleaned).strip()
-
-    return cleaned
-
-
-def clean_string(value):
-    """Clean string values, converting empty/null/nan/0 to None and removing special characters"""
-    if (
-        pd.isna(value)
-        or value == ""
-        or str(value).lower() in ["nan", "null", "none", "0", "0.0"]
-    ):
-        return None
-
-    # First clean special characters
-    cleaned = clean_special_characters(str(value).strip())
-
-    # Check if result is just "0" after cleaning
-    if cleaned in ["0", "0.0", "null", "nan", "none"]:
-        return None
-
-    return cleaned if cleaned else None
-
-
-def clean_name(value):
-    """Clean names (companies, investors) with special character handling"""
-    if (
-        pd.isna(value)
-        or value == ""
-        or str(value).lower() in ["nan", "null", "none", "0", "0.0"]
-    ):
-        return None
-
-    # Clean special characters but be more permissive for names
-    text = str(value).strip()
-    # First remove ellipses and other problematic patterns
-    # text = text.replace("...", "").replace("..", "")
-
-    # Normalize unicode characters
-    normalized = unicodedata.normalize("NFKD", text)
-
-    # Convert to ASCII but keep more characters for business names
-    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
-
-    # Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands
-    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)
-
-    # Clean up multiple spaces
-    cleaned = re.sub(r"\s+", " ", cleaned).strip()
-
-    # Remove any trailing or leading periods
-    cleaned = cleaned.strip(".")
-
-    cleaned = cleaned.replace("..", "").replace("...", "")
-    # Check if result is just "0" after cleaning
-    if cleaned in ["0", "0.0", "null", "nan", "none"]:
-        return None
-
-    return cleaned if cleaned else None
-
-
-def clean_integer(value):
-    """Clean integer values, converting empty/null/nan/0 to None"""
-    if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:
-        return None
-    try:
-        cleaned_val = int(float(value))
-        return cleaned_val if cleaned_val > 0 else None
-    except (ValueError, TypeError):
-        return None
-
-
-def parse_website(website_str: str):
-    try:
-        _, end = website_str.split(":")
-
-        if end == "0":
-            return None
-        return "https:" + end
-    except Exception:
-        return None
-
-
-def ingest_data():
-    # Create database engine and session
-    Session = sessionmaker(bind=engine)
-    session = Session()
-
-    # Load CSV files
-    print("Loading CSV files...")
-    companies_df = pd.read_csv("companies.csv")
-    investors_df = pd.read_csv("investors.csv")
-
-    print(f"📊 Companies CSV: {len(companies_df)} rows")
-    print(f"📊 Investors CSV: {len(investors_df)} rows")
-
-    # Step 1: Ingest Investors
-    print("\n🔄 Step 1: Ingesting Investors...")
-    investors_processed = 0
-
-    for index, row in investors_df.iterrows():
-        try:
-            investor_name = clean_name(row.get("Filtered investor names", ""))
-
-            if investor_name:
-                # Check if investor already exists
-                existing_investor = (
-                    session.query(InvestorTable).filter_by(name=investor_name).first()
-                )
-                if not existing_investor:
-                    investor = InvestorTable(
-                        name=investor_name,
-                        description=clean_string(row.get("Business model", "")),
-                        headquarters=clean_string(row.get("HQ", "")),
-                        website=parse_website(str(row.get("Website", "")).strip()),
-                        number_of_investments=clean_integer(
-                            row.get("Number of investments")
-                        ),
-                    )
-                    session.add(investor)
-                    investors_processed += 1
-
-                    if investors_processed % 1000 == 0:
-                        session.commit()
-                        print(f"  Committed {investors_processed} investors")
-
-        except Exception as e:
-            logger.error(f"Error processing investor {index}: {e}")
-            continue
-
-    session.commit()
-    print(f"✅ Investors completed: {investors_processed} processed")
-
-    # Step 2: Ingest Companies and Rounds
-    print("\n🔄 Step 2: Ingesting Companies and Sectors...")
-    companies_processed = 0
-    sectors_created = set()
-
-    for index, row in companies_df.iterrows():
-        try:
-            # Process company
-            company_name = clean_name(row.get("Organization Name", ""))
-            if not company_name:
-                continue
-
-            # Check if company already exists
-            existing_company = (
-                session.query(CompanyTable).filter_by(name=company_name).first()
-            )
-            if existing_company:
-                company = existing_company
-            else:
-                # Create company
-                company = CompanyTable(
-                    name=company_name,
-                    description=clean_string(row.get("Organization Description", "")),
-                    location=clean_string(row.get("Organization Location", "")),
-                    industry=clean_string(row.get("Organization Industries", "")),
-                    website=clean_string(row.get("Organization Website", "")),
-                )
-                session.add(company)
-                session.flush()  # Get the company ID
-                companies_processed += 1
-
-            # Process investor relationships
-            investor_names_str = row.get("Investor Names", "")
-            if pd.notna(investor_names_str) and investor_names_str:
-                investor_names = parse_investor_names(investor_names_str)
-
-                for investor_name in investor_names:
-                    # Find investor in database
-                    investor = (
-                        session.query(InvestorTable)
-                        .filter_by(name=investor_name.strip())
-                        .first()
-                    )
-
-                    if investor:
-                        # Add investor-company relationship
-                        if company not in investor.portfolio_companies:
-                            investor.portfolio_companies.append(company)
-                    else:
-                        print("This company has an investor not in DB:", investor_name)
-
-            # Process sectors/industries
-            industries_str = row.get("Organization Industries", "")
-            if pd.notna(industries_str) and industries_str:
-                industries = parse_industries(industries_str)
-
-                for industry_name in industries:
-                    industry_name = industry_name.strip()
-                    if industry_name:
-                        # Check if sector exists
-                        sector = (
-                            session.query(SectorTable)
-                            .filter_by(name=industry_name)
-                            .first()
-                        )
-                        if not sector:
-                            sector = SectorTable(name=industry_name)
-                            session.add(sector)
-                            session.flush()
-                            sectors_created.add(industry_name)
-
-                        # Add company-sector relationship
-                        if sector not in company.sectors:
-                            company.sectors.append(sector)
-
-            # Commit every 100 companies
-            if companies_processed % 100 == 0 and companies_processed > 0:
-                session.commit()
-                print(f"  Processed {companies_processed} companies...")
-
-        except Exception as e:
-            logger.error(f"Error processing company {index}: {e}")
-            session.rollback()
-            continue
-
-    # Step 3: Link investors to sectors based on portfolio companies
-    print("\n🔄 Step 3: Linking Investors to Sectors...")
-    investors_linked_to_sectors = 0
-    all_investors = session.query(InvestorTable).all()
-    for investor in all_investors:
-        sectors = set()
-        for company in investor.portfolio_companies:
-            for sector in company.sectors:
-                sectors.add(sector)
-        # Add sectors to investor if not already present
-        for sector in sectors:
-            if sector not in investor.sectors:
-                investor.sectors.append(sector)
-        if sectors:
-            investors_linked_to_sectors += 1
-    session.commit()
-    print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")
-
-    # Final commit
-    session.commit()
-
-    # Final counts
-    final_investors = session.query(InvestorTable).count()
-    final_companies = session.query(CompanyTable).count()
-    final_sectors = session.query(SectorTable).count()
-
-    print("\n🎉 Ingestion Complete!")
-    print(f"   Investors: {final_investors}")
-    print(f"   Companies: {final_companies}")
-    print(f"   Sectors: {final_sectors}")
-
-    session.close()
-
-
-if __name__ == "__main__":
-    ingest_data()
-    # print(clean_name("A... Energi"))
-    # print(clean_name("B.. Tech"))
-    # print(clean_name("A... Energi"))
@@ -1,381 +0,0 @@
-import enum
-from typing import Annotated
-
-from fastapi import Depends
-from sqlalchemy import (
-    Column,
-    DateTime,
-    ForeignKey,
-    Integer,
-    String,
-    Table,
-    Text,
-    create_engine,
-    func,
-)
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import Session, declarative_mixin, relationship, sessionmaker
-from sqlalchemy.types import JSON, Enum
-
-Base = declarative_base()
-
-# Database configuration
-# DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")
-
-# Create engine
-engine = create_engine("sqlite:///./investors.db", echo=False)
-
-# Create session factory
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-
-
-def get_db():
-    db = SessionLocal()
-    try:
-        yield db
-    finally:
-        db.close()
-
-
-db_dependency = Annotated[Session, Depends(get_db)]
-
-
-def init_database():
-    """Initialize the database by creating all tables"""
-    Base.metadata.create_all(bind=engine)
-
-
-def get_session_sync() -> Session:
-    """Get a database session for synchronous operations"""
-    return SessionLocal()
-
-
-def get_db_session():
-    """Get a database session for direct use."""
-    return SessionLocal()
-
-
-@declarative_mixin
-class TimestampMixin:
-    created_at = Column(
-        DateTime(timezone=True), server_default=func.now(), nullable=False
-    )
-    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
-
-
-class InvestmentStage(enum.Enum):
-    SEED = "SEED"
-    SERIES_A = "SERIES_A"
-    SERIES_B = "SERIES_B"
-    SERIES_C = "SERIES_C"
-    GROWTH = "GROWTH"
-    LATE_STAGE = "LATE_STAGE"
-
-
-# Association table for many-to-many relationship between investors and companies
-investor_company_association = Table(
-    "investor_companies",
-    Base.metadata,
-    Column("investor_id", Integer, ForeignKey("investors.id")),
-    Column("company_id", Integer, ForeignKey("companies.id")),
-)
-
-
-# Association table for investor-sector many-to-many
-investor_sector_association = Table(
-    "investor_sectors",
-    Base.metadata,
-    Column("investor_id", Integer, ForeignKey("investors.id")),
-    Column("sector_id", Integer, ForeignKey("sectors.id")),
-)
-
-
-company_sector_association = Table(
-    "company_sector",
-    Base.metadata,
-    Column("company_id", Integer, ForeignKey("companies.id")),
-    Column("sector_id", Integer, ForeignKey("sectors.id")),
-)
-
-project_sector_association = Table(
-    "project_sector",
-    Base.metadata,
-    Column("project_id", Integer, ForeignKey("projects.id")),
-    Column("sector_id", Integer, ForeignKey("sectors.id")),
-)
-
-project_investor_association = Table(
-    "project_investors",
-    Base.metadata,
-    Column("project_id", Integer, ForeignKey("projects.id")),
-    Column("investor_id", Integer, ForeignKey("investors.id")),
-)
-
-project_company_association = Table(
-    "project_companies",
-    Base.metadata,
-    Column("project_id", Integer, ForeignKey("projects.id")),
-    Column("company_id", Integer, ForeignKey("companies.id")),
-)
-
-# Association table for investor-stage many-to-many
-investor_stage_association = Table(
-    "investor_stages",
-    Base.metadata,
-    Column("investor_id", Integer, ForeignKey("investors.id")),
-    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
-)
-
-# Association table for fund-stage many-to-many
-fund_investment_stages_association = Table(
-    "fund_investment_stages",
-    Base.metadata,
-    Column("fund_id", Integer, ForeignKey("funds.id")),
-    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
-)
-
-# Association table for fund-sector many-to-many
-fund_sectors_association = Table(
-    "fund_sectors",
-    Base.metadata,
-    Column("fund_id", Integer, ForeignKey("funds.id")),
-    Column("sector_id", Integer, ForeignKey("sectors.id")),
-)
-
-
-class InvestorTable(Base, TimestampMixin):
-    __tablename__ = "investors"
-
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, nullable=False)
-    description = Column(Text, nullable=True)
-
-    # Basic investor info
-    website = Column(String, nullable=True)
-    headquarters = Column(String, nullable=True)
-
-    # AUM fields
-    aum = Column(Integer, nullable=True)  # Store as integer for numerical filtering
-    aum_as_of_date = Column(String, nullable=True)
-    aum_source_url = Column(String, nullable=True)
-
-    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
-    check_size_lower = Column(Integer, nullable=True)
-    check_size_upper = Column(Integer, nullable=True)
-
-    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
-    geographic_focus = Column(String, nullable=True)
-
-    # Investment thesis and portfolio
-    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
-    portfolio_highlights = Column(
-        JSON, nullable=True
-    )  # Array of portfolio company names
-    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
-
-    # Research metadata
-    researcher_notes = Column(Text, nullable=True)
-    missing_important_fields = Column(
-        JSON, nullable=True
-    )  # Array of missing field names
-    sources = Column(JSON, nullable=True)  # JSON object with source URLs
-
-    # Portfolio info
-    number_of_investments = Column(Integer, nullable=True)
-
-    # Relationships
-    team_members = relationship(
-        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
-    )
-    funds = relationship(
-        "FundTable", back_populates="investor", cascade="all, delete-orphan"
-    )
-
-    # Many-to-many relationship with investment stages
-    investment_stages = relationship(
-        "InvestmentStageTable",
-        secondary=investor_stage_association,
-        back_populates="investors",
-    )
-
-    # Relationship to portfolio companies
-    portfolio_companies = relationship(
-        "CompanyTable",
-        secondary=investor_company_association,
-        back_populates="investors",
-    )
-
-    sectors = relationship(
-        "SectorTable",
-        secondary=investor_sector_association,
-        back_populates="investors",
-    )
-
-    projects = relationship(
-        "ProjectTable",
-        secondary=project_investor_association,
-        back_populates="investors",
-    )
-
-
-class InvestorMember(Base, TimestampMixin):
-    """ORM model for one row of the ``investor_members`` table.
-
-    A member points at its investor through ``investor_id`` and is the
-    counterpart of ``InvestorTable.team_members`` (linked via
-    ``back_populates``). ``TimestampMixin`` is defined outside this excerpt;
-    presumably it contributes timestamp columns -- confirm at its definition.
-    """
-    __tablename__ = "investor_members"
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, nullable=False)
-    role = Column(String, nullable=True)
-    title = Column(String, nullable=True)  # Alternative to role
-    email = Column(String, nullable=True)
-    source_url = Column(String, nullable=True)  # URL where member info was found
-
-    # No ``nullable=`` is passed, so SQLAlchemy's default for a non-primary-key
-    # column applies and the foreign key may be NULL.
-    investor_id = Column(Integer, ForeignKey("investors.id"))
-    # Other side of InvestorTable.team_members.
-    investor = relationship("InvestorTable", back_populates="team_members")
-
-
-class FundTable(Base, TimestampMixin):
-    """ORM model for one row of the ``funds`` table.
-
-    Every fund requires an owning investor (``investor_id`` is a non-null
-    foreign key to ``investors.id``). Stages and sectors are attached through
-    the ``fund_investment_stages_association`` and ``fund_sectors_association``
-    secondary tables, which are declared elsewhere in this module.
-    """
-    __tablename__ = "funds"
-
-    id = Column(Integer, primary_key=True, index=True)
-    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
-
-    # Fund details
-    fund_name = Column(String, nullable=True)
-    fund_size = Column(
-        Integer, nullable=True
-    )  # Store as integer for numerical filtering
-    fund_size_source_url = Column(String, nullable=True)
-
-    # Check size range (parsed from estimated_investment_size by LLM)
-    check_size_lower = Column(Integer, nullable=True)
-    check_size_upper = Column(Integer, nullable=True)
-
-    source_url = Column(String, nullable=True)
-    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
-
-    # Geographic focus as simple string
-    geographic_focus = Column(String, nullable=True)
-
-    # Relationships
-    # Other side of InvestorTable.funds.
-    investor = relationship("InvestorTable", back_populates="funds")
-    # Linked through a secondary table; mirrored by InvestmentStageTable.funds.
-    investment_stages = relationship(
-        "InvestmentStageTable",
-        secondary=fund_investment_stages_association,
-        back_populates="funds",
-    )
-    # Linked through a secondary table; mirrored by SectorTable.funds.
-    sectors = relationship(
-        "SectorTable",
-        secondary=fund_sectors_association,
-        back_populates="funds",
-    )
-
-
-class InvestmentStageTable(Base, TimestampMixin):
-    """ORM model for one row of the ``investment_stages`` table.
-
-    A stage is identified by a required, unique ``name`` and is linked to
-    investors and to funds through two separate secondary tables.
-    """
-    __tablename__ = "investment_stages"
-
-    id = Column(Integer, primary_key=True, index=True)
-    # unique=True: two rows cannot share the same stage name.
-    name = Column(String, nullable=False, unique=True)
-
-    # Relationships
-    # Mirrored by InvestorTable.investment_stages.
-    investors = relationship(
-        "InvestorTable",
-        secondary=investor_stage_association,
-        back_populates="investment_stages",
-    )
-    # Mirrored by FundTable.investment_stages.
-    funds = relationship(
-        "FundTable",
-        secondary=fund_investment_stages_association,
-        back_populates="investment_stages",
-    )
-
-
-class CompanyTable(Base, TimestampMixin):
-    """ORM model for one row of the ``companies`` table.
-
-    Only ``name`` is required; the remaining descriptive columns are nullable.
-    A company owns its ``members`` and is linked to investors, sectors and
-    projects through secondary tables.
-    """
-    __tablename__ = "companies"
-
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, nullable=False)
-    industry = Column(String, nullable=True)
-    location = Column(String, nullable=True)
-    description = Column(String, nullable=True)
-    founded_year = Column(Integer, nullable=True)
-    website = Column(String, nullable=True)
-
-    # "all, delete-orphan" cascade: session operations on the company propagate
-    # to its members, and a member removed from this collection is deleted.
-    members = relationship(
-        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
-    )
-    # Relationship back to investors
-    investors = relationship(
-        "InvestorTable",
-        secondary=investor_company_association,
-        back_populates="portfolio_companies",
-    )
-
-    # Mirrored by SectorTable.companies.
-    sectors = relationship(
-        "SectorTable", secondary=company_sector_association, back_populates="companies"
-    )
-
-    # Mirrored by ProjectTable.companies.
-    projects = relationship(
-        "ProjectTable",
-        secondary=project_company_association,
-        back_populates="companies",
-    )
-
-
-class CompanyMember(Base, TimestampMixin):
-    """ORM model for one row of the ``company_members`` table.
-
-    Each member must reference a company (``company_id`` is a non-null
-    foreign key to ``companies.id``) and is the counterpart of
-    ``CompanyTable.members``.
-    """
-    __tablename__ = "company_members"
-    id = Column(Integer, primary_key=True)
-    # No ``nullable=`` is passed, so SQLAlchemy's default applies and the
-    # name may be NULL.
-    name = Column(String)
-    linkedin = Column(String, nullable=True)
-    role = Column(String, nullable=True)
-    company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
-
-    # Other side of CompanyTable.members.
-    company = relationship("CompanyTable", back_populates="members")
-
-
-class SectorTable(Base, TimestampMixin):
-    """ORM model for one row of the ``sectors`` table.
-
-    A sector has a required ``name`` and is linked to investors, companies,
-    projects and funds, each through its own secondary table.
-    """
-    __tablename__ = "sectors"
-
-    id = Column(Integer, primary_key=True, index=True)
-    # Required, but no unique constraint is declared on this column.
-    name = Column(String, nullable=False)
-
-    # Relationships
-    # Mirrored by InvestorTable.sectors.
-    investors = relationship(
-        "InvestorTable",
-        secondary=investor_sector_association,
-        back_populates="sectors",
-    )
-    # Mirrored by CompanyTable.sectors.
-    companies = relationship(
-        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
-    )
-    # The matching attribute on ProjectTable is named ``sector`` (singular),
-    # hence back_populates="sector".
-    projects = relationship(
-        "ProjectTable", secondary=project_sector_association, back_populates="sector"
-    )
-    # Mirrored by FundTable.sectors.
-    funds = relationship(
-        "FundTable",
-        secondary=fund_sectors_association,
-        back_populates="sectors",
-    )
-
-
-class ProjectTable(Base, TimestampMixin):
-    """ORM model for one row of the ``projects`` table.
-
-    Only ``name`` is required. The attributes visible here link a project to
-    sectors, investors and companies through secondary tables.
-    """
-    __tablename__ = "projects"
-
-    id = Column(Integer, primary_key=True, index=True)
-    name = Column(String, nullable=False)
-    valuation = Column(Integer, nullable=True)
-
-    # Stored as a SQLAlchemy Enum over ``InvestmentStage``, which is defined
-    # outside this excerpt (presumably a Python enum -- confirm there).
-    stage = Column(Enum(InvestmentStage), nullable=True)
-    location = Column(String, nullable=True)
-    description = Column(Text, nullable=True)
-    start_date = Column(DateTime, nullable=True)
-    end_date = Column(DateTime, nullable=True)
-
-    # Singular name, but configured with a secondary table; mirrored by
-    # SectorTable.projects.
-    sector = relationship(
-        "SectorTable", secondary=project_sector_association, back_populates="projects"
-    )
-    # Mirrored by InvestorTable.projects.
-    investors = relationship(
-        "InvestorTable",
-        secondary=project_investor_association,
-        back_populates="projects",
-    )
-    # Mirrored by CompanyTable.projects.
-    companies = relationship(
-        "CompanyTable", secondary=project_company_association, back_populates="projects"
-    )