Added funds table

2025-10-05 19:16:03 +01:00
parent 3842171549
commit a2b3ceedbe
18 changed files with 27404 additions and 9 deletions
@@ -10,9 +10,7 @@
 *__pycache__
 /*.db
 *.cypython
 /preprocessor
@@ -2,7 +2,7 @@ import enum
 from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Table, Text, func
 from sqlalchemy.orm import declarative_mixin, relationship
-from sqlalchemy.types import Enum
+from sqlalchemy.types import JSON, Enum
 from db.db import Base
@@ -77,14 +77,52 @@ class InvestorTable(Base, TimestampMixin):
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)
-    aum = Column(Integer, nullable=True)  # Assets Under Management
+
-    check_size_lower = Column(Integer, nullable=True)  # Lower bound
+    # Basic investor info
-    check_size_upper = Column(Integer, nullable=True)  # Upper bound
+    website = Column(String, nullable=True)
    headquarters = Column(String, nullable=True)
    # AUM fields
    aum = Column(
        String, nullable=True
    )  # Store as string to preserve currency (e.g., "EUR 850,000,000")
    aum_as_of_date = Column(String, nullable=True)
    aum_source_url = Column(String, nullable=True)
    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
    check_size_lower = Column(Integer, nullable=True)
    check_size_upper = Column(Integer, nullable=True)
    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
    geographic_focus = Column(String, nullable=True)
-    stage_focus = Column(Enum(InvestmentStage), nullable=True)
+    stage_focus = Column(
        Enum(InvestmentStage), nullable=True
    )  # Deprecated in favor of fund-level
    # Investment thesis and portfolio
    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
    portfolio_highlights = Column(
        JSON, nullable=True
    )  # Array of portfolio company names
    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
    # Research metadata
    researcher_notes = Column(Text, nullable=True)
    missing_important_fields = Column(
        JSON, nullable=True
    )  # Array of missing field names
    sources = Column(JSON, nullable=True)  # JSON object with source URLs
    # Portfolio info
    number_of_investments = Column(Integer, default=0, nullable=True)
-    team_members = relationship("InvestorMember", back_populates="investor")
+    # Relationships
    team_members = relationship(
        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
    )
    funds = relationship(
        "FundTable", back_populates="investor", cascade="all, delete-orphan"
    )
    # Relationship to portfolio companies
    portfolio_companies = relationship(
@@ -111,12 +149,39 @@ class InvestorMember(Base, TimestampMixin):
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    role = Column(String, nullable=True)
    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
    source_url = Column(String, nullable=True)  # URL where member info was found
    investor_id = Column(Integer, ForeignKey("investors.id"))
    investor = relationship("InvestorTable", back_populates="team_members")
 class FundTable(Base, TimestampMixin):
    """ORM model for the ``funds`` table: one row per fund belonging to an investor.

    Every fund must reference an investor (``investor_id`` is non-nullable);
    all descriptive columns are optional.
    """

    __tablename__ = "funds"
    id = Column(Integer, primary_key=True, index=True)
    # Owning investor; required, references investors.id
    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
    # Fund details
    fund_name = Column(String, nullable=True)
    fund_size = Column(String, nullable=True)  # Store as string to preserve currency
    fund_size_source_url = Column(String, nullable=True)
    estimated_investment_size = Column(
        String, nullable=True
    )  # e.g., "EUR 1,000 to 2,000"
    source_url = Column(String, nullable=True)
    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
    # JSON array fields
    geographic_focus = Column(JSON, nullable=True)  # Array of regions/countries
    investment_stage_focus = Column(JSON, nullable=True)  # Array of stages
    sector_focus = Column(JSON, nullable=True)  # Array of sectors
    # Relationships
    # Paired with the "funds" relationship declared on InvestorTable.
    investor = relationship("InvestorTable", back_populates="funds")
 class CompanyTable(Base, TimestampMixin):
    __tablename__ = "companies"
@@ -128,7 +193,9 @@ class CompanyTable(Base, TimestampMixin):
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)
-    members = relationship("CompanyMember", back_populates="company")
+    members = relationship(
        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
    )
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
@@ -0,0 +1,255 @@
 # Database Schema Update - Enriched Investor Data & Funds
 ## Overview
 Updated the database schema to support enriched investor data with multiple funds per investor.
 ## Key Changes
 ### 1. **InvestorTable - New Fields**
 #### Basic Info
 -   `headquarters` - Investor headquarters location
 -   `website` - Investor website URL (new nullable column)
 #### AUM (Assets Under Management)
 -   `aum` - Changed from Integer to String to preserve currency (e.g., "EUR 850,000,000")
 -   `aum_as_of_date` - Date when AUM was measured
 -   `aum_source_url` - Source URL for AUM information
 #### Investment Information
 -   `investment_thesis` - JSON array of thesis statements
 -   `portfolio_highlights` - JSON array of notable portfolio companies
 -   `linked_documents` - JSON array of document URLs
 #### Research Metadata
 -   `researcher_notes` - Free-text notes from research
 -   `missing_important_fields` - JSON array of field names that are missing
 -   `sources` - JSON object mapping field names to source URLs
 #### Deprecated Fields (kept for backward compatibility)
 -   `check_size_lower/upper` - Now handled at fund level
 -   `geographic_focus` - Now handled at fund level
 -   `stage_focus` - Now handled at fund level
 ### 2. **FundTable - NEW TABLE**
 Represents individual funds managed by an investor. One investor can have multiple funds.
 **Fields:**
 -   `id` - Primary key
 -   `investor_id` - Foreign key to InvestorTable
 -   `fund_name` - Name of the fund
 -   `fund_size` - Size of fund (string to preserve currency)
 -   `fund_size_source_url` - Source URL for fund size
 -   `estimated_investment_size` - Typical investment range (e.g., "EUR 1,000 to 2,000")
 -   `source_url` - Source URL for fund information
 -   `source_provider` - Provider of information (e.g., "Perplexity")
 -   `geographic_focus` - JSON array of regions/countries
 -   `investment_stage_focus` - JSON array of investment stages
 -   `sector_focus` - JSON array of sectors
 **Relationship:**
 -   Many-to-One with InvestorTable
 -   Cascade delete (deleting investor deletes all funds)
 ### 3. **InvestorMember - Enhanced**
 Added fields for senior leadership data:
 -   `title` - Alternative to role field
 -   `source_url` - URL where member info was found
 ## Data Model
 ```
 InvestorTable (1) -----> (Many) FundTable
     |
     |-----> (Many) InvestorMember
     |-----> (Many) CompanyTable (portfolio_companies)
     |-----> (Many) SectorTable
     |-----> (Many) InvestmentStageTable
 ```
 ## Frontend Strategy
 ### Flattened Response
 The frontend will receive a **flattened** view where each fund appears as a separate investor entry:
 ```
 Investor A + Fund 1 → Row 1
 Investor A + Fund 2 → Row 2
 Investor A + Fund 3 → Row 3
 Investor B + Fund 1 → Row 4
 ```
 ### Benefits:
 1. ✅ No frontend schema changes needed
 2. ✅ Each row represents a distinct investment opportunity
 3. ✅ Filtering and querying work naturally
 4. ✅ Compatibility scoring can be done per fund
 5. ✅ Backend maintains proper normalization
 ## Files Modified
 ### Preprocessor
 -   `preprocessor/models.py` - Updated schema with all new fields and FundTable
 -   `preprocessor/enrich_investors.py` - **NEW** Script to ingest enriched data
 ### App
 -   `app/db/models.py` - Updated schema to match preprocessor
 ## Usage
 ### 1. Run Initial Data Ingestion (if not done)
 ```bash
 cd preprocessor
 python main.py
 ```
 ### 2. Run Enrichment
 ```bash
 cd preprocessor
 python enrich_investors.py enriched_investors.csv investor_name enriched_data
 ```
 **CSV Format:**
 | investor_name | enriched_data |
 |---------------|---------------|
 | Anaxago | {"funds": [...], "headquarters": "...", ...} |
 | VC Firm B | {...} |
 ### 3. Reinitialize Database (if needed)
 ```bash
 # Backup first!
 cp version_two.db version_two.db.backup
 # Delete and reinitialize
 rm version_two.db
 python main.py  # Run initial ingestion
 python enrich_investors.py enriched_investors.csv  # Run enrichment
 ```
 ## Enrichment Script Features
 ✅ **Upsert Logic** - Creates new investors or updates existing ones
 ✅ **Duplicate Prevention** - Won't create duplicate team members or duplicate named funds (funds without a `fundName` cannot be matched and are created again on each run)
 ✅ **Flexible Matching** - Matches by name or website
 ✅ **Batch Commits** - Commits every 10 investors for performance
 ✅ **Error Handling** - Continues on errors, reports at end
 ✅ **Detailed Logging** - Shows progress and summary
 ## Next Steps
 ### 1. Create Compatibility Scorer Service
 See the design doc for the `CompatibilityScorer` service that will:
 -   Calculate match scores for both filtered and queried results
 -   Provide detailed breakdown of scoring
 -   Work with fund-level criteria
 ### 2. Update API Endpoints
 -   Modify `GET /investors` to flatten funds
 -   Update `GET /investors/filter` to query funds table
 -   Enhance `/query` endpoint to extract parameters and score
 ### 3. Update Frontend Schemas (Pydantic)
 Add optional fields to response schemas:
 -   `compatibility_score: Optional[float]`
 -   `match_details: Optional[dict]`
 -   Fund-related fields in `InvestorData`
 ## Example Enriched JSON
 ```json
 {
    "websiteURL": "http://www.anaxago.com",
    "headquarters": "Paris, France",
    "investorDescription": "Anaxago is an investment group...",
    "overallAssetsUnderManagement": {
        "aumAmount": "EUR 850,000,000",
        "asOfDate": "Not Available",
        "sourceUrl": "http://www.anaxago.com"
    },
    "investmentThesisFocus": ["Sustainable real estate", "Climate tech"],
    "portfolioHighlights": ["Tilak Healthcare", "Innovorder"],
    "funds": [
        {
            "fundName": "Crowdfunding Immobilier",
            "fundSize": "Not Available",
            "estimatedInvestmentSize": "EUR 1,000 to 2,000",
            "geographicFocus": ["France"],
            "investmentStageFocus": ["Seed", "Early Stage"],
            "sectorFocus": ["Real Estate"],
            "sourceUrl": "http://www.anaxago.com/investissement"
        }
    ],
    "seniorLeadership": [
        {
            "name": "Joachim Dupont",
            "title": "Co-fondateur et président",
            "sourceUrl": "https://capital.anaxago.com/equipe"
        }
    ],
    "researcherNotes": "No explicit official fund sizes found",
    "missingImportantFields": ["fundSize"],
    "sources": {
        "funds": "http://www.anaxago.com/investissement",
        "headquarters": "http://www.anaxago.com/contact"
    }
 }
 ```
 ## Database Migration
 If you have existing data:
 ```python
 # Migration script (if needed)
 from models import InvestorTable, engine
 from sqlalchemy import text
 with engine.connect() as conn:
    # Add new columns (SQLAlchemy will handle this with create_all)
    # But if you need manual migration:
    # Convert AUM from Integer to String
    conn.execute(text("ALTER TABLE investors ADD COLUMN aum_new TEXT"))
    conn.execute(text("UPDATE investors SET aum_new = CAST(aum AS TEXT) WHERE aum IS NOT NULL"))
    conn.execute(text("ALTER TABLE investors DROP COLUMN aum"))
    conn.execute(text("ALTER TABLE investors RENAME COLUMN aum_new TO aum"))
    conn.commit()
 ```
 ## Questions?
 -   **Q: What if an investor has no funds?**
    A: They'll appear once with all fund fields as NULL
 -   **Q: How do we handle fund updates?**
    A: Enrichment script updates existing funds by fund_name + investor_id
 -   **Q: Can we query by fund criteria?**
    A: Yes! Join InvestorTable with FundTable and filter on fund fields
 -   **Q: How does compatibility scoring work?**
    A: See the separate `CompatibilityScorer` service design
@@ -0,0 +1,285 @@
 # Quick Start Guide - Enriched Investor Data
 ## 🚀 Setup
 ### 1. Backup Your Database
 ```bash
 cd preprocessor
 cp version_two.db version_two.db.backup
 ```
 ### 2. Run Migration (for existing databases)
 ```bash
 python migrate_database.py version_two.db
 # Type 'yes' when prompted
 ```
 ### 3. Verify Schema
 ```bash
 python3 -c "from models import init_database; init_database(); print('✅ Schema OK!')"
 ```
 ## 📊 Enriching Investor Data
 ### CSV Format
 Your enriched CSV should have these columns:
 -   `investor_name` - Name of the investor (used to match existing records)
 -   `enriched_data` - JSON string with enriched data
 **Example:**
 ```csv
 investor_name,enriched_data
 Anaxago,"{""websiteURL"": ""http://www.anaxago.com"", ""headquarters"": ""Paris, France"", ""funds"": [...]}"
 VC Firm B,"{...}"
 ```
 ### Run Enrichment
 ```bash
 python enrich_investors.py enriched_investors.csv
 ```
 **With custom column names:**
 ```bash
 python enrich_investors.py myfile.csv name_column data_column
 ```
 ### What Gets Updated
 **Investor Level:**
 -   ✅ Description
 -   ✅ Website
 -   ✅ Headquarters
 -   ✅ AUM (amount, date, source)
 -   ✅ Investment thesis
 -   ✅ Portfolio highlights
 -   ✅ Linked documents
 -   ✅ Researcher notes
 -   ✅ Missing fields metadata
 -   ✅ Sources
 **Fund Level (creates new records):**
 -   ✅ Fund name
 -   ✅ Fund size
 -   ✅ Estimated investment size
 -   ✅ Geographic focus (array)
 -   ✅ Investment stages (array)
 -   ✅ Sector focus (array)
 -   ✅ Source URL and provider
 **Team Members (creates new records):**
 -   ✅ Name
 -   ✅ Title/Role
 -   ✅ Source URL
 ## 📋 JSON Structure
 ```json
 {
  "websiteURL": "http://www.example.com",
  "headquarters": "San Francisco, CA",
  "investorDescription": "Leading VC firm...",
  "overallAssetsUnderManagement": {
    "aumAmount": "USD 1,500,000,000",
    "asOfDate": "2024-Q4",
    "sourceUrl": "http://source.com"
  },
  "investmentThesisFocus": [
    "AI and Machine Learning",
    "Climate Tech"
  ],
  "portfolioHighlights": [
    "Company A",
    "Company B"
  ],
  "linkedDocuments": [
    "http://doc1.com",
    "http://doc2.com"
  ],
  "funds": [
    {
      "fundName": "Fund I",
      "fundSize": "USD 500,000,000",
      "fundSizeSourceUrl": "http://source.com",
      "estimatedInvestmentSize": "USD 5M to 15M",
      "geographicFocus": ["North America", "Europe"],
      "investmentStageFocus": ["Series A", "Series B"],
      "sectorFocus": ["AI", "SaaS"],
      "sourceUrl": "http://fund-info.com",
      "sourceProvider": "Crunchbase"
    },
    {
      "fundName": "Fund II",
      "fundSize": "USD 750,000,000",
      ...
    }
  ],
  "seniorLeadership": [
    {
      "name": "John Doe",
      "title": "Managing Partner",
      "sourceUrl": "http://linkedin.com/johndoe"
    }
  ],
  "researcherNotes": "Notes about this investor...",
  "missingImportantFields": ["fundSize", "checkSize"],
  "sources": {
    "funds": "http://source1.com",
    "headquarters": "http://source2.com"
  }
 }
 ```
 ## 🔍 Querying
 ### Check Funds Created
 ```python
 from models import InvestorTable, FundTable, get_db_session
 session = get_db_session()
 # Get investor with funds
 investor = session.query(InvestorTable).filter_by(name="Anaxago").first()
 print(f"Investor: {investor.name}")
 print(f"Funds: {len(investor.funds)}")
 for fund in investor.funds:
    print(f"  - {fund.fund_name}: {fund.fund_size}")
    print(f"    Geographic: {fund.geographic_focus}")
    print(f"    Stages: {fund.investment_stage_focus}")
    print(f"    Sectors: {fund.sector_focus}")
 session.close()
 ```
 ### Get All Funds
 ```python
 funds = session.query(FundTable).all()
 print(f"Total funds: {len(funds)}")
 for fund in funds:
    print(f"{fund.investor.name} - {fund.fund_name}")
 ```
 ## 🎯 Next Steps
 ### 1. Update API to Flatten Funds
 ```python
 # In app/routers/investors.py
@router.get("/investors")
 def get_investors(db: Session = Depends(get_db)):
    investors = db.query(InvestorTable).all()
    flattened = []
    for investor in investors:
        if investor.funds:
            for fund in investor.funds:
                flattened.append({
                    "id": f"{investor.id}_fund_{fund.id}",
                    "name": investor.name,
                    "description": investor.description,
                    # ... investor fields ...
                    "fund_name": fund.fund_name,
                    "fund_size": fund.fund_size,
                    "geographic_focus": fund.geographic_focus,
                    # ... fund fields ...
                })
        else:
            # Investor with no funds
            flattened.append({...})
    return flattened
 ```
 ### 2. Create Compatibility Scorer
 See the "Next Steps" section of `DATABASE_SCHEMA_UPDATE.md` for the planned responsibilities of the `CompatibilityScorer` service.
 ### 3. Test the Enrichment
 ```python
 # Quick test
 from models import InvestorTable, FundTable, get_db_session
 session = get_db_session()
 # Count investors with funds
 investors_with_funds = session.query(InvestorTable).join(FundTable).distinct().count()
 total_investors = session.query(InvestorTable).count()
 total_funds = session.query(FundTable).count()
 print(f"Investors: {total_investors}")
 print(f"Investors with funds: {investors_with_funds}")
 print(f"Total funds: {total_funds}")
 print(f"Avg funds per investor: {total_funds / investors_with_funds if investors_with_funds > 0 else 0:.2f}")
 session.close()
 ```
 ## ❓ Troubleshooting
 ### "No module named 'models'"
 ```bash
 # Make sure you're in the preprocessor directory
 cd preprocessor
 python enrich_investors.py ...
 ```
 ### "Duplicate fund entries"
 The script matches funds by `fund_name + investor_id`. If you run enrichment twice with the same data, named funds will be updated, not duplicated. Funds without a `fundName` cannot be matched and will be inserted again on every run.
 ### "Investor not found"
 The script tries to match by:
 1. Investor name
 2. Website URL
 If neither matches, the investor will be created as new.
 ### Check Logs
 The enrichment script provides detailed logging:
 -   ✅ Successes
 -   ⚠️ Warnings (missing data)
 -   ❌ Errors (with row numbers)
 ## 📚 Resources
 -   **Schema Documentation**: `DATABASE_SCHEMA_UPDATE.md`
 -   **Migration Script**: `migrate_database.py`
 -   **Enrichment Script**: `enrich_investors.py`
 -   **Models**: `models.py`
 ## 🎉 Success Indicators
 After enrichment, you should see:
 -   ✅ New `funds` table populated
 -   ✅ Investor fields updated with enriched data
 -   ✅ Team members added
 -   ✅ No duplicate funds for same investor
 -   ✅ JSON fields properly stored
@@ -0,0 +1,287 @@
 import json
 import logging
 import pandas as pd
 from models import FundTable, InvestorMember, InvestorTable, engine, init_database
 from sqlalchemy.orm import sessionmaker
 # Set up logging: configure the root logger at INFO and create this module's logger.
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Initialize database (create tables if they don't exist).
 # NOTE: this call sits at module level, so it runs whenever this module is
 # imported, not only when the script is executed directly.
 init_database()
 def clean_value(value):
    """Normalize "missing" markers to ``None`` and pass everything else through.

    Args:
        value: Any value taken from a CSV cell or from a decoded JSON payload.

    Returns:
        ``None`` when ``value`` is a missing scalar (``None``, NaN, ``pd.NA``,
        ``pd.NaT``) or a string that, once stripped, is one of the placeholder
        markers below; otherwise ``value`` unchanged.  Containers such as lists
        and dicts are returned as-is.
    """
    if isinstance(value, str):
        # Placeholder strings emitted by the research pipeline.  Note that the
        # literal string "0" is also treated as "no value".
        if value.strip() in ["Not Available", "null", "None", "", "0", "N/A"]:
            return None
        return value
    # pd.isna() answers element-wise for list-likes and returns an array whose
    # truth value is ambiguous (ValueError), so only ask it about scalars.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return None
    return value
 def parse_json_safely(json_str):
    """Decode a JSON cell, yielding ``None`` when it is blank or malformed.

    Missing values (as judged by ``pd.isna``) and empty strings give ``None``;
    a ``dict`` is handed back untouched; anything else goes through
    ``json.loads``.  JSON decode errors and type errors are logged and also
    give ``None``.
    """
    try:
        is_blank = pd.isna(json_str) or json_str == ""
        if is_blank:
            return None
        already_parsed = isinstance(json_str, dict)
        return json_str if already_parsed else json.loads(json_str)
    except (TypeError, json.JSONDecodeError) as err:
        logger.error(f"Error parsing JSON: {err}")
        return None
 def _find_investor(session, investor_name, website):
    """Return the existing investor matching the name or, failing that, the website.

    ``website`` must already be cleaned.  A missing website is never used as a
    lookup key: ``filter_by(website=None)`` compares against NULL and would
    return an unrelated investor that simply has no website stored.
    """
    investor = None
    if investor_name:
        investor = session.query(InvestorTable).filter_by(name=investor_name).first()
    if investor is None and website:
        investor = session.query(InvestorTable).filter_by(website=website).first()
    return investor


 def _update_investor_fields(investor, investor_data):
    """Copy investor-level values from the enriched payload onto ``investor``."""
    # Scalar fields: keep the stored value when the payload has nothing usable.
    investor.description = (
        clean_value(investor_data.get("investorDescription")) or investor.description
    )
    investor.website = clean_value(investor_data.get("websiteURL")) or investor.website
    investor.headquarters = (
        clean_value(investor_data.get("headquarters")) or investor.headquarters
    )

    # AUM amount, as-of date and source are replaced together as one unit.
    # Anything that is not a non-empty object (e.g. a placeholder string) is
    # ignored instead of failing the whole row.
    aum_data = investor_data.get("overallAssetsUnderManagement")
    if isinstance(aum_data, dict) and aum_data:
        investor.aum = clean_value(aum_data.get("aumAmount"))
        investor.aum_as_of_date = clean_value(aum_data.get("asOfDate"))
        investor.aum_source_url = clean_value(aum_data.get("sourceUrl"))

    # JSON-typed columns are only overwritten by non-empty payload values.
    json_fields = (
        ("investment_thesis", "investmentThesisFocus"),
        ("portfolio_highlights", "portfolioHighlights"),
        ("linked_documents", "linkedDocuments"),
        ("missing_important_fields", "missingImportantFields"),
        ("sources", "sources"),
    )
    for attribute, payload_key in json_fields:
        payload_value = investor_data.get(payload_key)
        if payload_value:
            setattr(investor, attribute, payload_value)

    notes = investor_data.get("researcherNotes")
    if notes:
        investor.researcher_notes = clean_value(notes)


 def _sync_team_members(session, investor, leadership):
    """Add leadership entries not yet stored for ``investor``; return how many were added."""
    created = 0
    # The key may be present with a null value, hence ``or []``.
    for member_data in leadership or []:
        member_name = clean_value(member_data.get("name"))
        if not member_name:
            continue
        existing_member = (
            session.query(InvestorMember)
            .filter_by(investor_id=investor.id, name=member_name)
            .first()
        )
        if existing_member:
            continue
        title = clean_value(member_data.get("title"))
        session.add(
            InvestorMember(
                investor_id=investor.id,
                name=member_name,
                title=title,
                role=title,  # Use title as role
                source_url=clean_value(member_data.get("sourceUrl")),
            )
        )
        created += 1
    return created


 def _sync_funds(session, investor, funds):
    """Create or update the funds of ``investor``; return how many were created.

    Funds are matched on ``investor_id`` + ``fund_name``.  A fund without a
    usable name cannot be matched and is therefore always inserted as new.
    """
    created = 0
    # The key may be present with a null value, hence ``or []``.
    for fund_data in funds or []:
        fund_name = clean_value(fund_data.get("fundName"))
        fund = None
        if fund_name:
            fund = (
                session.query(FundTable)
                .filter_by(investor_id=investor.id, fund_name=fund_name)
                .first()
            )
        if fund is None:
            fund = FundTable(investor_id=investor.id)
            session.add(fund)
            created += 1

        fund.fund_name = fund_name
        fund.fund_size = clean_value(fund_data.get("fundSize"))
        fund.fund_size_source_url = clean_value(fund_data.get("fundSizeSourceUrl"))
        fund.estimated_investment_size = clean_value(
            fund_data.get("estimatedInvestmentSize")
        )
        fund.source_url = clean_value(fund_data.get("sourceUrl"))
        fund.source_provider = clean_value(fund_data.get("sourceProvider"))
        fund.geographic_focus = fund_data.get("geographicFocus")
        fund.investment_stage_focus = fund_data.get("investmentStageFocus")
        fund.sector_focus = fund_data.get("sectorFocus")
    return created


 def enrich_investors(
    csv_file_path: str,
    investor_name_column: str = "investor_name",
    enriched_data_column: str = "enriched_data",
 ) -> None:
    """
    Enrich investors from CSV containing enriched JSON data.

    Each CSV row is matched to an existing investor by name, then by website;
    when neither matches and a name is available, a new investor is created.
    Investor fields, team members and funds are then updated from the row's
    JSON payload.  Every row is committed on its own, so a failing row is
    rolled back, recorded and skipped without discarding rows processed
    before it.  A summary is logged at the end.

    Args:
        csv_file_path: Path to CSV file with enriched investor data
        investor_name_column: Column name containing investor name
        enriched_data_column: Column name containing JSON data

    Raises:
        Whatever ``pd.read_csv`` raises when the file cannot be read; errors
        in individual rows are logged and do not propagate.
    """
    Session = sessionmaker(bind=engine)
    session = Session()

    investors_updated = 0
    investors_created = 0
    funds_created = 0
    team_members_created = 0
    errors = []

    try:
        # Load enriched data
        logger.info(f"Loading enriched investors from: {csv_file_path}")
        enriched_df = pd.read_csv(csv_file_path)
        logger.info(f"📊 Enriched Investors CSV: {len(enriched_df)} rows")

        for index, row in enriched_df.iterrows():
            try:
                # Parse the JSON data column; only a non-empty object is usable.
                investor_data = parse_json_safely(row.get(enriched_data_column))
                if not investor_data or not isinstance(investor_data, dict):
                    logger.warning(f"Row {index}: No valid JSON data")
                    continue

                # A blank CSV cell arrives as NaN, which is truthy; map it to
                # None so it is never used as an investor name.
                raw_name = row.get(investor_name_column)
                investor_name = None if pd.isna(raw_name) else raw_name
                website = clean_value(investor_data.get("websiteURL"))

                # Find or create investor
                investor = _find_investor(session, investor_name, website)
                is_new = investor is None
                if is_new:
                    if not investor_name:
                        logger.warning(f"Row {index}: No investor name found, skipping")
                        continue
                    investor = InvestorTable(name=investor_name)
                    session.add(investor)
                    session.flush()  # Get ID for new investor
                    logger.info(f"Created new investor: {investor_name}")

                _update_investor_fields(investor, investor_data)
                new_members = _sync_team_members(
                    session, investor, investor_data.get("seniorLeadership")
                )
                new_funds = _sync_funds(session, investor, investor_data.get("funds"))

                # Commit per row: rolling back a later failing row must not
                # throw away investors that were processed successfully.
                session.commit()
            except Exception as e:
                logger.error(f"Error processing row {index}: {e}")
                session.rollback()
                errors.append({"row": index, "error": str(e)})
                continue

            # Counters only reflect rows that were actually committed.
            if is_new:
                investors_created += 1
            else:
                investors_updated += 1
            team_members_created += new_members
            funds_created += new_funds

            processed = investors_updated + investors_created
            if processed % 10 == 0:
                logger.info(
                    f"  Processed {processed} investors, "
                    f"created {funds_created} funds, {team_members_created} team members"
                )

        # Print summary
        logger.info("\n" + "=" * 60)
        logger.info("🎉 ENRICHMENT COMPLETE!")
        logger.info("=" * 60)
        logger.info(f"   Investors Updated: {investors_updated}")
        logger.info(f"   Investors Created: {investors_created}")
        logger.info(f"   Funds Created: {funds_created}")
        logger.info(f"   Team Members Created: {team_members_created}")
        logger.info(f"   Errors: {len(errors)}")
        if errors:
            logger.info(f"\n❌ Errors encountered ({len(errors)}):")
            for error in errors[:5]:  # Show first 5
                logger.info(f"   Row {error['row']}: {error['error']}")
            if len(errors) > 5:
                logger.info(f"   ... and {len(errors) - 5} more errors")
        logger.info("=" * 60)
    finally:
        # Release the connection even when loading the CSV fails.
        session.close()
 if __name__ == "__main__":
    # Command-line entry point: <csv_file_path> [investor_name_column] [enriched_data_column]
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        # No CSV path given: show usage and exit with a failure status.
        print(
            "Usage: python enrich_investors.py <csv_file_path> [investor_name_column] [enriched_data_column]"
        )
        print("\nExample:")
        print("  python enrich_investors.py enriched_investors.csv")
        print("  python enrich_investors.py enriched_investors.csv 'name' 'data'")
        sys.exit(1)

    # Optional column names fall back to the function's documented defaults.
    csv_file = cli_args[0]
    investor_col = cli_args[1] if len(cli_args) > 1 else "investor_name"
    data_col = cli_args[2] if len(cli_args) > 2 else "enriched_data"
    enrich_investors(csv_file, investor_col, data_col)
@@ -0,0 +1,513 @@
 # Investor: 212
 {
  "investor": {
    "id": null,
    "name": "212",
    "description": "Growth-oriented venture capital firm investing in B2B technology across Turkey, Central and Eastern Europe, and the MENA region. Operates multiple funds (including 212 NexT and Simya-related funds) and pursues multi-stage opportunities (seed to growth).",
    "aum": 80000000,
    "check_size_lower": 500000,
    "check_size_upper": 3000000,
    "geographic_focus": "Turkey, Central and Eastern Europe (CEE), Middle East & North Africa (MENA) including UAE, Europe",
    "number_of_investments": 57
  },
  "portfolio_companies": [
    {
      "id": null,
      "name": "RemotePass",
      "industry": "Fintech / HRTech",
      "location": "UAE",
      "description": "Onboards, manages, and pays remote staff across 150+ countries; offers multi-currency payroll and related HR tools.",
      "founded_year": 2020,
      "website": "https://remotepass.com/"
    },
    {
      "id": null,
      "name": "Flow48",
      "industry": "Fintech / SME lending",
      "location": "UAE",
      "description": "SME working capital financing platform using ERP, payment gateway and ecommerce data for risk assessment.",
      "founded_year": 2021,
      "website": null
    },
    {
      "id": null,
      "name": "Getmobil",
      "industry": "Marketplace / E-commerce",
      "location": "Istanbul, Türkiye",
      "description": "Marketplace for buying/selling second-hand electronics; renewal center certified by Turkish Ministry of Trade.",
      "founded_year": 2018,
      "website": "https://getmobil.com/"
    },
    {
      "id": null,
      "name": "SOCRadar",
      "industry": "Cybersecurity",
      "location": "Istanbul, Türkiye",
      "description": "Extended Threat Intelligence (XTI) platform combining EASM, DRPS and CTI for security operations.",
      "founded_year": 2019,
      "website": "https://socradar.io/"
    },
    {
      "id": null,
      "name": "Trio Mobil",
      "industry": "Industrial IoT / AI",
      "location": "Istanbul, Türkiye",
      "description": "AI-driven Industrial IoT platform enabling real-time analytics and safety improvements in facilities.",
      "founded_year": 2021,
      "website": "https://www.triomobil.com/"
    },
    {
      "id": null,
      "name": "PhilosopherKing",
      "industry": "Gaming / AI",
      "location": "Las Vegas, US",
      "description": "AI-powered gaming platform delivering dynamic, real-time interactive storytelling.",
      "founded_year": 2023,
      "website": "https://philosopherking.ai"
    },
    {
      "id": null,
      "name": "OneFive",
      "industry": "Materials / Packaging AI",
      "location": "Germany",
      "description": "AI-driven biomaterials platform to replace single-use plastics in packaging.",
      "founded_year": 2020,
      "website": "https://www.one-five.com"
    },
    {
      "id": null,
      "name": "EverDye",
      "industry": "Textile / Green Tech",
      "location": "France",
      "description": "Bio-based pigment technology enabling low-energy, low-emission dyeing processes.",
      "founded_year": 2021,
      "website": "https://everdye.fr"
    },
    {
      "id": null,
      "name": "Eluvium",
      "industry": "AI / Data Analytics",
      "location": "London, UK",
      "description": "AI-driven data agents to transform unstructured information into actionable insights for manufacturing and procurement.",
      "founded_year": 2024,
      "website": "https://www.eluvium.ai/"
    },
    {
      "id": null,
      "name": "Khenda",
      "industry": "Manufacturing / AI",
      "location": "Ann Arbor, Michigan, USA",
      "description": "AI-powered video analytics to extract production metrics from existing security camera footage.",
      "founded_year": 2021,
      "website": "https://www.khenda.com/"
    },
    {
      "id": null,
      "name": "Fazla",
      "industry": "Waste / Sustainability SaaS",
      "location": "Türkiye",
      "description": "Technology-based solutions to reduce waste and emissions across value chains.",
      "founded_year": 2021,
      "website": null
    }
  ],
  "team_members": [
    {
      "id": null,
      "name": "Ali H. Karabey",
      "role": "Founding Partner, Growth Funds",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Ali Naci Temel",
      "role": "Operations & Investment I, 212 NexT",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Barbaros Ozbugutu",
      "role": "Experts | Leadership Management",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Cagdas Yildiz",
      "role": "Investment | Simya VC",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Caglar Urcan",
      "role": "Investment I, 212 NexT",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Can Deniz Tokman",
      "role": "Investment I, Growth Funds",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Emin Taha Celik",
      "role": "Investment I, Growth Funds",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Cenk Sezginsoy",
      "role": "Experts | Venture Partner",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Can Abacigil",
      "role": "Experts | Product Development",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Doğukan Kara",
      "role": "Operations | Finance",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Ebru Elmas Gürses",
      "role": "Operations | Finance",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Eren Baydemir",
      "role": "Experts | Product Management",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Erim Hayretci",
      "role": "Operations | Venture Fellow",
      "email": null,
      "investor_id": null
    }
  ],
  "sectors": [
    {
      "id": null,
      "name": "Artificial Intelligence"
    },
    {
      "id": null,
      "name": "Cybersecurity"
    },
    {
      "id": null,
      "name": "Fintech"
    },
    {
      "id": null,
      "name": "Industrial IoT"
    },
    {
      "id": null,
      "name": "E-commerce / Marketplace"
    },
    {
      "id": null,
      "name": "Gaming / Entertainment"
    },
    {
      "id": null,
      "name": "Sustainability / Green Tech"
    },
    {
      "id": null,
      "name": "Data & Analytics"
    },
    {
      "id": null,
      "name": "Enterprise Software"
    }
  ],
  "investment_stages": [
    {
      "id": null,
      "stage": "SEED"
    },
    {
      "id": null,
      "stage": "SERIES_A"
    },
    {
      "id": null,
      "stage": "SERIES_B"
    },
    {
      "id": null,
      "stage": "SERIES_C"
    },
    {
      "id": null,
      "stage": "GROWTH"
    },
    {
      "id": null,
      "stage": "LATE_STAGE"
    }
  ]
 }
 # Investor: 301
 {
  "investor": {
    "id": null,
    "name": "301 INC",
    "description": "The venture capital arm of General Mills. We invest in driven and passionate founders across the food ecosystem and partner with founder teams to help realize their ambitions.",
    "aum": null,
    "check_size_lower": null,
    "check_size_upper": null,
    "geographic_focus": "United States",
    "number_of_investments": 21
  },
  "team_members": [
    {
      "id": null,
      "name": "Kristen Harvey",
      "role": "Managing Director, 301 INC",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Miles Swammi",
      "role": "Sr. Principal, Business Development, 301 INC",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Taylor Sankovich",
      "role": "Sr. Principal, Commercial Partnerships, 301 INC",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Steven Schweiger",
      "role": "Principal, Investments, 301 INC",
      "email": null,
      "investor_id": null
    }
  ],
  "sectors": [
    {
      "id": null,
      "name": "Food & Beverage"
    },
    {
      "id": null,
      "name": "Foodtech"
    },
    {
      "id": null,
      "name": "CPG"
    },
    {
      "id": null,
      "name": "Consumer Goods"
    }
  ],
  "investment_stages": [
    {
      "id": null,
      "stage": "SEED"
    },
    {
      "id": null,
      "stage": "SERIES_A"
    }
  ]
 }
 # Investor: 2050
 {
  "investor": {
    "id": null,
    "name": "2050",
    "description": "An ecosystemic venture fund backing mission-driven founders advancing a sustainable economy. Operates via an evergreen model including 2050.do (management company), 2050.ventures (Article 9 SFDR evergreen fund) and 2050.commons. Emphasizes aligned ecosystems, open strategic resources, and portfolio-wide social/environmental impact aligned with the UN SDGs (the Five Essentials).",
    "aum": 130000000,
    "check_size_lower": null,
    "check_size_upper": null,
    "geographic_focus": "Europe, Africa",
    "number_of_investments": 13
  },
  "team_members": [
    {
      "id": null,
      "name": "Marie Ekeland",
      "role": "Founder & CEO",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Olivier Mathiot",
      "role": "General Manager",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Aude Duprat",
      "role": "General Secretary",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Guillaume Bregeras",
      "role": "Chief Knowledge Officer & General Manager",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Charly Berthet",
      "role": "Investor",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Meyha Camara",
      "role": "Communication Manager",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Diana Krantz",
      "role": "Investor",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Matthieu Scetbun",
      "role": "Chief Financial Officer",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Sindre Østgård",
      "role": "Chief Aligner",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Éric Carreel",
      "role": "Co-founder & Chairman",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Kimo Paula",
      "role": "Co-founder & CCO",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Christian Couturier",
      "role": "Director, Solagro",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Marieke van Iperen",
      "role": "Co-founder & CEO, Settly",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Laura Beaulier",
      "role": "CEO, Climate Dividends",
      "email": null,
      "investor_id": null
    },
    {
      "id": null,
      "name": "Arnaud Le Rodallec",
      "role": "Co-founder & CPO/CTO, Fifteen",
      "email": null,
      "investor_id": null
    }
  ],
  "sectors": [
    {
      "id": null,
      "name": "Climate & Sustainability"
    },
    {
      "id": null,
      "name": "Ocean / Maritime"
    },
    {
      "id": null,
      "name": "Food & Agriculture"
    },
    {
      "id": null,
      "name": "Education & Learning"
    },
    {
      "id": null,
      "name": "Human & Social Impact"
    },
    {
      "id": null,
      "name": "Climate Finance & Ecosystem Alignment"
    }
  ],
  "investment_stages": [
    {
      "id": null,
      "stage": "SEED"
    },
    {
      "id": null,
      "stage": "SERIES_A"
    },
    {
      "id": null,
      "stage": "SERIES_B"
    },
    {
      "id": null,
      "stage": "SERIES_C"
    },
    {
      "id": null,
      "stage": "GROWTH"
    }
  ]
 }
@@ -0,0 +1,314 @@
 import logging
 import re
 import unicodedata
 import pandas as pd
 from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
 from sqlalchemy.orm import sessionmaker
 # Configure root logging at INFO level and create this module's logger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Runs at import time. init_database is imported from models.
 # NOTE(review): presumably this creates any missing tables — confirm in models
 init_database()
 #===================== Ingesting Original Data =====================#
 def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into a list of cleaned names.

    A value that ``pd.isna`` reports as missing, or the empty string, gives
    ``[]``. Each comma-separated piece is stripped and passed through
    ``clean_name``; pieces for which ``clean_name`` returns a falsy value are
    left out.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []
    parsed_names = []
    for raw_piece in str(investor_names_str).split(","):
        cleaned_piece = clean_name(raw_piece.strip())
        if cleaned_piece:
            parsed_names.append(cleaned_piece)
    return parsed_names
 def parse_industries(industries_str):
    """Split a comma-separated industries string into a list of names.

    A value that ``pd.isna`` reports as missing, or the empty string, gives
    ``[]``. Pieces are whitespace-stripped and empty pieces are dropped.
    """
    if pd.isna(industries_str) or industries_str == "":
        return []
    raw_pieces = str(industries_str).split(",")
    # filter(None, ...) discards the pieces that are empty after stripping
    return list(filter(None, map(str.strip, raw_pieces)))
 def clean_special_characters(text):
    """Reduce text to ASCII letters, digits, spaces, hyphens and periods.

    Falsy input is returned unchanged. Otherwise the value is converted to
    ``str``, "..." and ".." sequences are removed, accented characters are
    folded to ASCII via NFKD normalisation, every other character is dropped,
    and runs of whitespace are collapsed to a single space.
    """
    if not text:
        return text
    without_ellipses = str(text).replace("...", "").replace("..", "")
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode with "ignore" then drops the marks (and anything else
    # that has no ASCII form)
    ascii_only = (
        unicodedata.normalize("NFKD", without_ellipses)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    allowed_only = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_only)
    return re.sub(r"\s+", " ", allowed_only).strip()
 def clean_string(value):
    """Return a cleaned string, or ``None`` for empty / placeholder values.

    Missing values (per ``pd.isna``), the empty string, and the placeholders
    "nan", "null", "none", "0" and "0.0" (compared case-insensitively) give
    ``None``. Anything else is stripped and passed through
    ``clean_special_characters``; if the result is empty or is one of the
    placeholders (compared exactly), ``None`` is returned as well.
    """
    null_tokens = ("nan", "null", "none", "0", "0.0")
    if pd.isna(value) or value == "" or str(value).lower() in null_tokens:
        return None
    cleaned = clean_special_characters(str(value).strip())
    # Cleaning can reduce a value to a placeholder, so check again
    if cleaned in null_tokens:
        return None
    return cleaned or None
 def clean_name(value):
    """Normalise a company or investor name to a restricted ASCII form.

    Missing values (per ``pd.isna``), the empty string, and the placeholders
    "nan", "null", "none", "0" and "0.0" (compared case-insensitively) give
    ``None``. Otherwise the value is converted to ``str`` and:

    1. accented characters are folded to ASCII via NFKD normalisation;
    2. everything except letters, digits, whitespace, hyphens, periods,
       parentheses and ampersands is removed;
    3. runs of two or more periods (ellipses) are removed;
    4. whitespace runs are collapsed and leading/trailing periods and spaces
       are stripped.

    Returns the cleaned name, or ``None`` if nothing meaningful is left.
    """
    null_tokens = ("nan", "null", "none", "0", "0.0")
    if pd.isna(value) or value == "" or str(value).lower() in null_tokens:
        return None
    # Clean special characters but be more permissive for names
    text = str(value).strip()
    # Normalize unicode characters
    normalized = unicodedata.normalize("NFKD", text)
    # Convert to ASCII but keep more characters for business names
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
    # Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)
    # Remove any run of two or more periods in one pass. Chaining
    # replace("..") before replace("...") left a stray period behind for
    # odd-length runs ("A... Energi" became "A. Energi").
    cleaned = re.sub(r"\.{2,}", "", cleaned)
    # Collapse whitespace only after the periods are gone, so removing an
    # ellipsis between two spaces cannot leave a double space behind
    cleaned = re.sub(r"\s+", " ", cleaned)
    # Remove leading/trailing periods together with any spaces next to them
    cleaned = cleaned.strip(". ")
    # Cleaning can reduce a value to a placeholder, so check again
    if cleaned in null_tokens:
        return None
    return cleaned or None
 def clean_integer(value):
    """Convert a value to a positive ``int``, or ``None`` if that is not possible.

    Missing values (per ``pd.isna``) and the placeholders "nan", "null",
    "none", "", "0" and "0.0" (compared case-insensitively) give ``None``.
    Other values are parsed with ``int(float(value))``, so fractional values
    are truncated. Results that are zero or negative, and values that cannot
    be parsed (including infinities), also give ``None``.
    """
    if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:
        return None
    try:
        cleaned_val = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() cannot represent float("inf") / float("-inf")
        return None
    return cleaned_val if cleaned_val > 0 else None
 def parse_website(website_str: str):
    """Return the website with its scheme forced to ``https``, or ``None``.

    The value is split at the first colon only, so colons later in the URL
    (for example a port number) are preserved. ``None`` is returned when the
    input is not a string, contains no colon, or has nothing, or only the
    literal "0", after the colon.

    Args:
        website_str: Raw website value, e.g. "http://example.com".

    Returns:
        "https:" followed by everything after the first colon, or ``None``.
    """
    if not isinstance(website_str, str):
        return None
    # partition() splits on the first colon only; split(":") with two-target
    # unpacking failed on any URL containing a second colon and dropped it
    _scheme, colon, remainder = website_str.partition(":")
    if not colon or remainder in ("", "0"):
        return None
    return "https:" + remainder
 def ingest_data():
    """Load ``companies.csv`` and ``investors.csv`` into the database.

    Works in three steps on one session bound to the module-level ``engine``:

    1. For every row of ``investors.csv``, create an ``InvestorTable`` row
       for the cleaned "Filtered investor names" value unless an investor
       with that name already exists. Commits after every 1000 new investors.
    2. For every row of ``companies.csv``, create or reuse a ``CompanyTable``
       row for the cleaned "Organization Name", link it to each investor in
       "Investor Names" that is found in the database, and create/link a
       ``SectorTable`` row for each entry in "Organization Industries".
       Commits whenever the created-company counter is a positive multiple
       of 100.
    3. Add to every investor the sectors of its portfolio companies.

    Row-level errors are logged and the row is skipped. The session is
    closed on every exit path, including when reading a CSV file or a
    commit raises.
    """
    # Function-scope import: needed only for the error handling in step 1
    from sqlalchemy.exc import SQLAlchemyError

    # Create database engine and session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Load CSV files
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")
        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")
        # Step 1: Ingest Investors
        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = 0
        for index, row in investors_df.iterrows():
            try:
                investor_name = clean_name(row.get("Filtered investor names", ""))
                if investor_name:
                    # Check if investor already exists
                    existing_investor = (
                        session.query(InvestorTable)
                        .filter_by(name=investor_name)
                        .first()
                    )
                    if not existing_investor:
                        investor = InvestorTable(
                            name=investor_name,
                            # description=clean_string(row.get("Business model", "")),
                            # geographic_focus=clean_string(row.get("HQ", "")),
                            website=parse_website(
                                str(row.get("Website", "")).strip()
                            ),
                            number_of_investments=clean_integer(
                                row.get("Number of investments")
                            ),
                        )
                        session.add(investor)
                        investors_processed += 1
                        if investors_processed % 1000 == 0:
                            session.commit()
                            print(f"  Committed {investors_processed} investors")
            except Exception as e:
                logger.error(f"Error processing investor {index}: {e}")
                # After a database error the session rejects further work
                # until it is rolled back; without this every later row
                # would fail as well. Pure parsing errors leave the session
                # untouched, so the pending batch is kept for those.
                # NOTE: the rollback discards the uncommitted investors of
                # the current batch, so investors_processed may overcount.
                if isinstance(e, SQLAlchemyError):
                    session.rollback()
                continue
        session.commit()
        print(f"✅ Investors completed: {investors_processed} processed")
        # Step 2: Ingest Companies and Rounds
        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        companies_processed = 0
        sectors_created = set()
        for index, row in companies_df.iterrows():
            try:
                # Process company
                company_name = clean_name(row.get("Organization Name", ""))
                if not company_name:
                    continue
                # Check if company already exists
                existing_company = (
                    session.query(CompanyTable).filter_by(name=company_name).first()
                )
                if existing_company:
                    company = existing_company
                else:
                    # Create company
                    company = CompanyTable(
                        name=company_name,
                        description=clean_string(
                            row.get("Organization Description", "")
                        ),
                        location=clean_string(row.get("Organization Location", "")),
                        industry=clean_string(row.get("Organization Industries", "")),
                        website=clean_string(row.get("Organization Website", "")),
                    )
                    session.add(company)
                    session.flush()  # Get the company ID
                    companies_processed += 1
                # Process investor relationships
                investor_names_str = row.get("Investor Names", "")
                if pd.notna(investor_names_str) and investor_names_str:
                    investor_names = parse_investor_names(investor_names_str)
                    for investor_name in investor_names:
                        # Find investor in database
                        investor = (
                            session.query(InvestorTable)
                            .filter_by(name=investor_name.strip())
                            .first()
                        )
                        if investor:
                            # Add investor-company relationship
                            if company not in investor.portfolio_companies:
                                investor.portfolio_companies.append(company)
                        else:
                            print(
                                "This company has an investor not in DB:",
                                investor_name,
                            )
                # Process sectors/industries
                industries_str = row.get("Organization Industries", "")
                if pd.notna(industries_str) and industries_str:
                    industries = parse_industries(industries_str)
                    for industry_name in industries:
                        industry_name = industry_name.strip()
                        if industry_name:
                            # Check if sector exists
                            sector = (
                                session.query(SectorTable)
                                .filter_by(name=industry_name)
                                .first()
                            )
                            if not sector:
                                sector = SectorTable(name=industry_name)
                                session.add(sector)
                                session.flush()
                                sectors_created.add(industry_name)
                            # Add company-sector relationship
                            if sector not in company.sectors:
                                company.sectors.append(sector)
                # Commit every 100 companies
                if companies_processed % 100 == 0 and companies_processed > 0:
                    session.commit()
                    print(f"  Processed {companies_processed} companies...")
            except Exception as e:
                logger.error(f"Error processing company {index}: {e}")
                # NOTE: this also discards earlier rows of the current batch
                # that have not been committed yet
                session.rollback()
                continue
        # Step 3: Link investors to sectors based on portfolio companies
        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = 0
        all_investors = session.query(InvestorTable).all()
        for investor in all_investors:
            sectors = set()
            for company in investor.portfolio_companies:
                for sector in company.sectors:
                    sectors.add(sector)
            # Add sectors to investor if not already present
            for sector in sectors:
                if sector not in investor.sectors:
                    investor.sectors.append(sector)
            if sectors:
                investors_linked_to_sectors += 1
        # Final commit: persists the tail of step 2 together with step 3
        session.commit()
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")
        # Final counts
        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()
        print("\n🎉 Ingestion Complete!")
        print(f"   Investors: {final_investors}")
        print(f"   Companies: {final_companies}")
        print(f"   Sectors: {final_sectors}")
    finally:
        session.close()
 if __name__ == "__main__":
    # Run the full ingestion only when this file is executed as a script
    ingest_data()
    # Disabled manual spot checks of clean_name on names containing runs of periods:
    # print(clean_name("A... Energi"))
    # print(clean_name("B.. Tech"))
    # print(clean_name("A... Energi"))
@@ -0,0 +1,131 @@
 """
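 Migration script that updates an existing SQLite database to the new schema.
 Converts investors.aum from INTEGER to TEXT, adds the new columns to the
 investors and investor_members tables, and creates the funds table if missing.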
 """
 import logging
 import sqlite3
 # Configure root logging at INFO level; the migration reports progress via logger.info
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 def migrate_database(db_path="version_two.db"):
    """Migrate an existing SQLite database to the new schema in one transaction.

    Steps, all executed inside a single explicit transaction:

    1. If ``investors.aum`` is declared ``INTEGER``, replace it with a
       ``TEXT`` column holding the old values cast to text.
    2. Add each of the new ``investors`` columns that is missing.
    3. Add ``title`` and ``source_url`` to ``investor_members`` if missing.
    4. Create the ``funds`` table if it does not exist.

    After the commit, column and row counts are logged as a summary.

    Args:
        db_path: Path of the SQLite database file to migrate.

    Raises:
        Exception: Any error raised during the migration is logged and
            re-raised after the transaction has been rolled back, leaving
            the schema as it was before the call.
    """
    conn = sqlite3.connect(db_path)
    # With the sqlite3 module's default transaction handling a transaction is
    # opened implicitly only before INSERT/UPDATE/DELETE/REPLACE. The ALTER
    # TABLE statements ahead of the first UPDATE would therefore run outside
    # any transaction and survive the rollback below, leaving a half-migrated
    # schema (aum_old plus an empty aum) that a re-run no longer detects.
    # Disable the implicit handling and open the transaction explicitly.
    conn.isolation_level = None
    cursor = conn.cursor()
    logger.info("Starting database migration...")
    try:
        cursor.execute("BEGIN;")
        # Check current schema: maps column name -> declared type
        cursor.execute("PRAGMA table_info(investors);")
        columns = {col[1]: col[2] for col in cursor.fetchall()}
        # 1. Convert AUM from INTEGER to TEXT
        if "aum" in columns and columns["aum"] == "INTEGER":
            logger.info("Converting AUM from INTEGER to TEXT...")
            cursor.execute("ALTER TABLE investors RENAME COLUMN aum TO aum_old;")
            cursor.execute("ALTER TABLE investors ADD COLUMN aum TEXT;")
            cursor.execute(
                "UPDATE investors SET aum = CAST(aum_old AS TEXT) WHERE aum_old IS NOT NULL;"
            )
            cursor.execute("ALTER TABLE investors DROP COLUMN aum_old;")
            logger.info("✅ AUM converted to TEXT")
        # 2. Add new columns if they don't exist
        new_columns = {
            "headquarters": "TEXT",
            "aum_as_of_date": "TEXT",
            "aum_source_url": "TEXT",
            "investment_thesis": "JSON",
            "portfolio_highlights": "JSON",
            "linked_documents": "JSON",
            "researcher_notes": "TEXT",
            "missing_important_fields": "JSON",
            "sources": "JSON",
        }
        for col_name, col_type in new_columns.items():
            if col_name not in columns:
                logger.info(f"Adding column: {col_name} ({col_type})")
                # Identifiers cannot be bound as parameters; both values come
                # from the hard-coded dict above, never from external input
                cursor.execute(
                    f"ALTER TABLE investors ADD COLUMN {col_name} {col_type};"
                )
        # 3. Add new columns to investor_members if they don't exist
        cursor.execute("PRAGMA table_info(investor_members);")
        member_columns = {col[1]: col[2] for col in cursor.fetchall()}
        if "title" not in member_columns:
            logger.info("Adding 'title' to investor_members")
            cursor.execute("ALTER TABLE investor_members ADD COLUMN title TEXT;")
        if "source_url" not in member_columns:
            logger.info("Adding 'source_url' to investor_members")
            cursor.execute("ALTER TABLE investor_members ADD COLUMN source_url TEXT;")
        # 4. Check if funds table exists
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='funds';"
        )
        if not cursor.fetchone():
            logger.info("Creating funds table...")
            cursor.execute("""
                CREATE TABLE funds (
                    id INTEGER NOT NULL PRIMARY KEY,
                    investor_id INTEGER NOT NULL,
                    fund_name VARCHAR,
                    fund_size VARCHAR,
                    fund_size_source_url VARCHAR,
                    estimated_investment_size VARCHAR,
                    source_url VARCHAR,
                    source_provider VARCHAR,
                    geographic_focus JSON,
                    investment_stage_focus JSON,
                    sector_focus JSON,
                    created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
                    updated_at DATETIME,
                    FOREIGN KEY(investor_id) REFERENCES investors (id)
                );
            """)
            logger.info("✅ Funds table created")
        # Ends the explicit transaction opened by BEGIN above
        conn.commit()
        logger.info("\n🎉 Migration completed successfully!")
        # Show summary
        cursor.execute("PRAGMA table_info(investors);")
        investor_cols = cursor.fetchall()
        logger.info(f"\nInvestors table now has {len(investor_cols)} columns")
        cursor.execute("SELECT COUNT(*) FROM investors;")
        investor_count = cursor.fetchone()[0]
        logger.info(f"Investors in database: {investor_count}")
        cursor.execute("SELECT COUNT(*) FROM funds;")
        fund_count = cursor.fetchone()[0]
        logger.info(f"Funds in database: {fund_count}")
    except Exception as e:
        logger.error(f"Migration failed: {e}")
        # Undoes every schema and data change made since BEGIN; does nothing
        # when no transaction is open (e.g. a failure after the commit)
        conn.rollback()
        raise
    finally:
        conn.close()
 if __name__ == "__main__":
    import sys

    # Optional first argument overrides the default database file
    db_file = "version_two.db" if len(sys.argv) <= 1 else sys.argv[1]
    print(f"Migrating database: {db_file}")
    print("⚠️  This will modify your database. Make sure you have a backup!")
    response = input("Continue? (yes/no): ")
    # Only an explicit yes/y (any letter case) starts the migration
    if response.lower() not in ("yes", "y"):
        print("Migration cancelled")
    else:
        migrate_database(db_file)
@@ -0,0 +1,347 @@
 import enum
 from typing import Annotated
 from fastapi import Depends
 from sqlalchemy import (
    Column,
    DateTime,
    ForeignKey,
    Integer,
    String,
    Table,
    Text,
    create_engine,
    func,
 )
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, declarative_mixin, relationship, sessionmaker
 from sqlalchemy.types import JSON, Enum
 # Declarative base for ORM model classes; the tables below register on Base.metadata
 Base = declarative_base()
 # Database configuration
 # NOTE: the environment-driven URL below is disabled; the SQLite URL is hard-coded instead
 # DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")
 # Create engine for the SQLite file version_two.db (relative path); echo=False
 # turns off SQL statement logging
 engine = create_engine("sqlite:///./version_two.db", echo=False)
 # Create session factory bound to the engine, with autoflush disabled
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 def get_db():
    """Yield a new ``SessionLocal`` session and close it afterwards.

    Written as a generator: the session is handed to the consumer at the
    ``yield`` and closed when the generator is resumed or finalised, whether
    or not an error occurred in between.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
 # Annotated type alias: a Session parameter whose value FastAPI obtains from get_db
 db_dependency = Annotated[Session, Depends(get_db)]
 def init_database():
    """Create all tables registered on ``Base.metadata`` using ``engine``."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
 def get_session_sync() -> Session:
    """Return a new session created by ``SessionLocal``.

    The session is not closed here; that is left to the caller.
    """
    session = SessionLocal()
    return session
 def get_db_session():
    """Return a new ``SessionLocal`` session for direct use.

    The session is not closed here; that is left to the caller.
    """
    new_session = SessionLocal()
    return new_session
@declarative_mixin
 class TimestampMixin:
    """Mixin that adds ``created_at`` / ``updated_at`` columns to a model."""
    # Filled in by the database (server default) when the row is inserted.
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Only has an ``onupdate`` hook and no default, so it is not set at insert time.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
 class InvestmentStage(enum.Enum):
    """Closed set of funding stages; each member's value equals its name."""
    SEED = "SEED"
    SERIES_A = "SERIES_A"
    SERIES_B = "SERIES_B"
    SERIES_C = "SERIES_C"
    GROWTH = "GROWTH"
    LATE_STAGE = "LATE_STAGE"
 # NOTE: none of the association tables below declares a primary key or a unique
 # constraint, so nothing at the schema level prevents the same pair being stored twice.
 # Association table for many-to-many relationship between investors and companies
 investor_company_association = Table(
    "investor_companies",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
 )
 # Association table for investor-sector many-to-many
 investor_sector_association = Table(
    "investor_sectors",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for company-sector many-to-many
 company_sector_association = Table(
    "company_sector",
    Base.metadata,
    Column("company_id", Integer, ForeignKey("companies.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for project-sector many-to-many
 project_sector_association = Table(
    "project_sector",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for project-investor many-to-many
 project_investor_association = Table(
    "project_investors",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("investor_id", Integer, ForeignKey("investors.id")),
 )
 # Association table for project-company many-to-many
 project_company_association = Table(
    "project_companies",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
 )
 # Association table for investor-stage many-to-many
 investor_stage_association = Table(
    "investor_stages",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
 )
 class InvestorTable(Base, TimestampMixin):
    """ORM model for the ``investors`` table.

    AUM values are kept as strings and list-like research data lives in JSON
    columns. ``team_members`` and ``funds`` cascade all operations (including
    delete-orphan) from the investor; stages, portfolio companies, sectors and
    projects are many-to-many links through association tables.
    """
    __tablename__ = "investors"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)
    # Basic investor info
    website = Column(String, nullable=True)
    headquarters = Column(String, nullable=True)
    # AUM fields
    aum = Column(
        String, nullable=True
    )  # Store as string to preserve currency (e.g., "EUR 850,000,000")
    aum_as_of_date = Column(String, nullable=True)
    aum_source_url = Column(String, nullable=True)
    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
    check_size_lower = Column(Integer, nullable=True)
    check_size_upper = Column(Integer, nullable=True)
    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
    geographic_focus = Column(String, nullable=True)
    # Investment thesis and portfolio
    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
    portfolio_highlights = Column(
        JSON, nullable=True
    )  # Array of portfolio company names
    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
    # Research metadata
    researcher_notes = Column(Text, nullable=True)
    missing_important_fields = Column(
        JSON, nullable=True
    )  # Array of missing field names
    sources = Column(JSON, nullable=True)  # JSON object with source URLs
    # Portfolio info
    number_of_investments = Column(Integer, nullable=True)
    # Relationships
    # One-to-many children; the cascade removes them together with the investor.
    team_members = relationship(
        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
    )
    funds = relationship(
        "FundTable", back_populates="investor", cascade="all, delete-orphan"
    )
    # Many-to-many relationship with investment stages
    investment_stages = relationship(
        "InvestmentStageTable",
        secondary=investor_stage_association,
        back_populates="investors",
    )
    # Relationship to portfolio companies
    portfolio_companies = relationship(
        "CompanyTable",
        secondary=investor_company_association,
        back_populates="investors",
    )
    # Many-to-many link to sectors
    sectors = relationship(
        "SectorTable",
        secondary=investor_sector_association,
        back_populates="investors",
    )
    # Many-to-many link to projects
    projects = relationship(
        "ProjectTable",
        secondary=project_investor_association,
        back_populates="investors",
    )
 class InvestorMember(Base, TimestampMixin):
    """ORM model for the ``investor_members`` table: a person attached to an investor."""
    __tablename__ = "investor_members"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    role = Column(String, nullable=True)
    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
    source_url = Column(String, nullable=True)  # URL where member info was found
    # No ``nullable=False`` here, so the foreign key column accepts NULL.
    investor_id = Column(Integer, ForeignKey("investors.id"))
    investor = relationship("InvestorTable", back_populates="team_members")
 class FundTable(Base, TimestampMixin):
    """ORM model for the ``funds`` table: one fund belonging to exactly one investor.

    Sizes are kept as strings so the currency survives; focus areas are JSON arrays.
    """
    __tablename__ = "funds"
    id = Column(Integer, primary_key=True, index=True)
    # Required: a fund row cannot exist without an owning investor.
    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
    # Fund details
    fund_name = Column(String, nullable=True)
    fund_size = Column(String, nullable=True)  # Store as string to preserve currency
    fund_size_source_url = Column(String, nullable=True)
    estimated_investment_size = Column(
        String, nullable=True
    )  # e.g., "EUR 1,000 to 2,000"
    source_url = Column(String, nullable=True)
    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
    # JSON array fields
    geographic_focus = Column(JSON, nullable=True)  # Array of regions/countries
    investment_stage_focus = Column(JSON, nullable=True)  # Array of stages
    sector_focus = Column(JSON, nullable=True)  # Array of sectors
    # Relationships
    investor = relationship("InvestorTable", back_populates="funds")
 class InvestmentStageTable(Base, TimestampMixin):
    """ORM model for the ``investment_stages`` table."""
    __tablename__ = "investment_stages"
    id = Column(Integer, primary_key=True, index=True)
    # ``unique=True`` allows at most one row per InvestmentStage value.
    stage = Column(Enum(InvestmentStage), nullable=False, unique=True)
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_stage_association,
        back_populates="investment_stages",
    )
 class CompanyTable(Base, TimestampMixin):
    """ORM model for the ``companies`` table.

    ``members`` cascade from the company (including delete-orphan); investors,
    sectors and projects are many-to-many links through association tables.
    """
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    industry = Column(String, nullable=True)
    location = Column(String, nullable=True)
    description = Column(String, nullable=True)
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)
    # One-to-many children; the cascade removes them together with the company.
    members = relationship(
        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
    )
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_company_association,
        back_populates="portfolio_companies",
    )
    # Many-to-many link to sectors
    sectors = relationship(
        "SectorTable", secondary=company_sector_association, back_populates="companies"
    )
    # Many-to-many link to projects
    projects = relationship(
        "ProjectTable",
        secondary=project_company_association,
        back_populates="companies",
    )
 class CompanyMember(Base, TimestampMixin):
    """ORM model for the ``company_members`` table: a person attached to a company."""
    __tablename__ = "company_members"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    linkedin = Column(String, nullable=True)
    role = Column(String, nullable=True)
    # Required: a member row cannot exist without its company.
    company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
    company = relationship("CompanyTable", back_populates="members")
 class SectorTable(Base, TimestampMixin):
    """ORM model for the ``sectors`` table.

    A sector is linked many-to-many to investors, companies and projects through
    the corresponding association tables.
    """
    __tablename__ = "sectors"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    # Add relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_sector_association,
        back_populates="sectors",
    )
    # Many-to-many link to companies
    companies = relationship(
        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
    )
    # ``back_populates`` must name the attribute on ProjectTable that points back
    # here. That attribute is called ``sector`` (singular); naming ``projects``
    # instead breaks mapper configuration.
    projects = relationship(
        "ProjectTable", secondary=project_sector_association, back_populates="sector"
    )
 class ProjectTable(Base, TimestampMixin):
    """ORM model for the ``projects`` table.

    Sectors, investors and companies are all many-to-many links through
    association tables.
    """
    __tablename__ = "projects"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    valuation = Column(Integer, nullable=True)
    stage = Column(Enum(InvestmentStage), nullable=True)
    location = Column(String, nullable=True)
    description = Column(Text, nullable=True)
    start_date = Column(DateTime, nullable=True)
    end_date = Column(DateTime, nullable=True)
    # Named ``sector`` (singular) although it is a many-to-many collection; it is
    # paired with the ``projects`` attribute on SectorTable.
    sector = relationship(
        "SectorTable", secondary=project_sector_association, back_populates="projects"
    )
    # Many-to-many link to investors
    investors = relationship(
        "InvestorTable",
        secondary=project_investor_association,
        back_populates="projects",
    )
    # Many-to-many link to companies
    companies = relationship(
        "CompanyTable", secondary=project_company_association, back_populates="projects"
    )
@@ -0,0 +1,367 @@
 import enum
 from typing import Annotated
 from fastapi import Depends
 # The import list had class definitions pasted into the middle of it, which made
 # the module unparseable. It is restored here; InvestorMember and FundTable are
 # defined once, further down in this module.
 from sqlalchemy import (
    Column,
    DateTime,
    ForeignKey,
    Integer,
    String,
    Table,
    Text,
    create_engine,
    func,
 )
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, declarative_mixin, relationship, sessionmaker
 from sqlalchemy.types import JSON, Enum
 # Declarative base class that every ORM model in this module inherits from.
 Base = declarative_base()
 # Database configuration
 # DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")
 # Create engine
 # NOTE: the SQLite file path is hard-coded; the env-driven DATABASE_URL above is disabled.
 engine = create_engine("sqlite:///./version_two.db", echo=False)
 # Create session factory
 # Sessions are configured with autocommit and autoflush both turned off.
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
 def get_db():
    """Yield a fresh session and close it once the consumer is done with it."""
    session = SessionLocal()
    try:
        yield session
    finally:
        # Always release the session, including when the consumer raised.
        session.close()
 # Annotated alias combining the Session type with the get_db dependency.
 db_dependency = Annotated[Session, Depends(get_db)]
 def init_database() -> None:
    """Initialize the database by creating all tables.

    Calls ``create_all`` for everything registered on ``Base.metadata`` using the
    module-level ``engine``.
    """
    Base.metadata.create_all(bind=engine)
 def get_session_sync() -> Session:
    """Get a database session for synchronous operations.

    The session is returned open; this function does not close it, so the caller
    owns its lifetime.
    """
    return SessionLocal()
 def get_db_session() -> Session:
    """Get a database session for direct use.

    Same behaviour as ``get_session_sync``: returns a new ``SessionLocal()`` that
    the caller must close.
    """
    return SessionLocal()
@declarative_mixin
 class TimestampMixin:
    """Mixin that adds ``created_at`` / ``updated_at`` columns to a model."""
    # Filled in by the database (server default) when the row is inserted.
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Only has an ``onupdate`` hook and no default, so it is not set at insert time.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
 class InvestmentStage(enum.Enum):
    """Closed set of funding stages; each member's value equals its name."""
    SEED = "SEED"
    SERIES_A = "SERIES_A"
    SERIES_B = "SERIES_B"
    SERIES_C = "SERIES_C"
    GROWTH = "GROWTH"
    LATE_STAGE = "LATE_STAGE"
 # NOTE: none of the association tables below declares a primary key or a unique
 # constraint, so nothing at the schema level prevents the same pair being stored twice.
 # Association table for many-to-many relationship between investors and companies
 investor_company_association = Table(
    "investor_companies",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
 )
 # Association table for investor-sector many-to-many
 investor_sector_association = Table(
    "investor_sectors",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for company-sector many-to-many
 company_sector_association = Table(
    "company_sector",
    Base.metadata,
    Column("company_id", Integer, ForeignKey("companies.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for project-sector many-to-many
 project_sector_association = Table(
    "project_sector",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
 )
 # Association table for project-investor many-to-many
 project_investor_association = Table(
    "project_investors",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("investor_id", Integer, ForeignKey("investors.id")),
 )
 # Association table for project-company many-to-many
 project_company_association = Table(
    "project_companies",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
 )
 # Association table for investor-stage many-to-many
 investor_stage_association = Table(
    "investor_stages",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
 )
 class InvestorTable(Base, TimestampMixin):
    """ORM model for the ``investors`` table.

    AUM values are kept as strings and list-like research data lives in JSON
    columns. ``funds`` cascade from the investor (including delete-orphan);
    ``team_members`` declares no cascade. Stages, portfolio companies, sectors and
    projects are many-to-many links through association tables.
    """
    __tablename__ = "investors"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)
    # Basic investor info
    website = Column(String, nullable=True)
    headquarters = Column(String, nullable=True)
    # AUM fields
    aum = Column(String, nullable=True)  # Store as string to preserve currency (e.g., "EUR 850,000,000")
    aum_as_of_date = Column(String, nullable=True)
    aum_source_url = Column(String, nullable=True)
    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
    check_size_lower = Column(Integer, nullable=True)
    check_size_upper = Column(Integer, nullable=True)
    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
    geographic_focus = Column(String, nullable=True)
    # Investment thesis and portfolio
    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
    portfolio_highlights = Column(JSON, nullable=True)  # Array of portfolio company names
    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
    # Research metadata
    researcher_notes = Column(Text, nullable=True)
    missing_important_fields = Column(JSON, nullable=True)  # Array of missing field names
    sources = Column(JSON, nullable=True)  # JSON object with source URLs
    # Portfolio info
    number_of_investments = Column(Integer, nullable=True)
    # Relationships
    # NOTE(review): unlike ``funds``, no cascade is configured for team members —
    # confirm that this difference is intended.
    team_members = relationship("InvestorMember", back_populates="investor")
    funds = relationship("FundTable", back_populates="investor", cascade="all, delete-orphan")
    # Many-to-many relationship with investment stages
    investment_stages = relationship(
        "InvestmentStageTable",
        secondary=investor_stage_association,
        back_populates="investors",
    )
    # Relationship to portfolio companies
    portfolio_companies = relationship(
        "CompanyTable",
        secondary=investor_company_association,
        back_populates="investors",
    )
    # Many-to-many link to sectors
    sectors = relationship(
        "SectorTable",
        secondary=investor_sector_association,
        back_populates="investors",
    )
    # Many-to-many link to projects
    projects = relationship(
        "ProjectTable",
        secondary=project_investor_association,
        back_populates="investors",
    )
 class InvestorMember(Base, TimestampMixin):
    """ORM model for the ``investor_members`` table: a person attached to an investor."""
    __tablename__ = "investor_members"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    role = Column(String, nullable=True)
    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
    source_url = Column(String, nullable=True)  # URL where member info was found
    # No ``nullable=False`` here, so the foreign key column accepts NULL.
    investor_id = Column(Integer, ForeignKey("investors.id"))
    investor = relationship("InvestorTable", back_populates="team_members")
 class FundTable(Base, TimestampMixin):
    """ORM model for the ``funds`` table: one fund belonging to exactly one investor.

    Sizes are kept as strings so the currency survives; focus areas are JSON arrays.
    """
    __tablename__ = "funds"
    id = Column(Integer, primary_key=True, index=True)
    # Required: a fund row cannot exist without an owning investor.
    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
    # Fund details
    fund_name = Column(String, nullable=True)
    fund_size = Column(String, nullable=True)  # Store as string to preserve currency
    fund_size_source_url = Column(String, nullable=True)
    estimated_investment_size = Column(String, nullable=True)  # e.g., "EUR 1,000 to 2,000"
    source_url = Column(String, nullable=True)
    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
    # JSON array fields
    geographic_focus = Column(JSON, nullable=True)  # Array of regions/countries
    investment_stage_focus = Column(JSON, nullable=True)  # Array of stages
    sector_focus = Column(JSON, nullable=True)  # Array of sectors
    # Relationships
    investor = relationship("InvestorTable", back_populates="funds")
 class InvestmentStageTable(Base, TimestampMixin):
    """ORM model for the ``investment_stages`` table."""
    __tablename__ = "investment_stages"
    id = Column(Integer, primary_key=True, index=True)
    # ``unique=True`` allows at most one row per InvestmentStage value.
    stage = Column(Enum(InvestmentStage), nullable=False, unique=True)
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_stage_association,
        back_populates="investment_stages",
    )
 class CompanyTable(Base, TimestampMixin):
    """ORM model for the ``companies`` table.

    ``members`` are owned children of the company; investors, sectors and
    projects are many-to-many links through association tables.
    """
    __tablename__ = "companies"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    industry = Column(String, nullable=True)
    location = Column(String, nullable=True)
    description = Column(String, nullable=True)
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)
    # ``company_members.company_id`` is NOT NULL. Without a delete cascade the ORM
    # tries to null that column when a company (or a member link) is removed, and
    # the flush fails. The cascade deletes the member rows instead.
    members = relationship(
        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
    )
    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_company_association,
        back_populates="portfolio_companies",
    )
    # Many-to-many link to sectors
    sectors = relationship(
        "SectorTable", secondary=company_sector_association, back_populates="companies"
    )
    # Many-to-many link to projects
    projects = relationship(
        "ProjectTable",
        secondary=project_company_association,
        back_populates="companies",
    )
 class CompanyMember(Base, TimestampMixin):
    """ORM model for the ``company_members`` table: a person attached to a company."""
    __tablename__ = "company_members"
    id = Column(Integer, primary_key=True)
    name = Column(String)
    linkedin = Column(String, nullable=True)
    role = Column(String, nullable=True)
    # Required: a member row cannot exist without its company.
    company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
    company = relationship("CompanyTable", back_populates="members")
 class SectorTable(Base, TimestampMixin):
    """ORM model for the ``sectors`` table.

    A sector is linked many-to-many to investors, companies and projects through
    the corresponding association tables.
    """
    __tablename__ = "sectors"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    # Add relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_sector_association,
        back_populates="sectors",
    )
    # Many-to-many link to companies
    companies = relationship(
        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
    )
    # Paired with ProjectTable's ``sector`` attribute (singular name).
    projects = relationship(
        "ProjectTable", secondary=project_sector_association, back_populates="sector"
    )
 class ProjectTable(Base, TimestampMixin):
    """ORM model for the ``projects`` table.

    Sectors, investors and companies are all many-to-many links through
    association tables.
    """
    __tablename__ = "projects"
    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    valuation = Column(Integer, nullable=True)
    stage = Column(Enum(InvestmentStage), nullable=True)
    location = Column(String, nullable=True)
    description = Column(Text, nullable=True)
    start_date = Column(DateTime, nullable=True)
    end_date = Column(DateTime, nullable=True)
    # Named ``sector`` (singular) although it is a many-to-many collection; it is
    # paired with the ``projects`` attribute on SectorTable.
    sector = relationship(
        "SectorTable", secondary=project_sector_association, back_populates="projects"
    )
    # Many-to-many link to investors
    investors = relationship(
        "InvestorTable",
        secondary=project_investor_association,
        back_populates="projects",
    )
    # Many-to-many link to companies
    companies = relationship(
        "CompanyTable", secondary=project_company_association, back_populates="projects"
    )
@@ -0,0 +1,349 @@
 import asyncio
 import logging
 import os
 from typing import Optional
 from crawl4ai import AsyncWebCrawler
 from web_crawler_schemas import InvestorDataScrape
 from ddgs import DDGS
 from dotenv import load_dotenv
 from langchain_openai import ChatOpenAI
 from langgraph.prebuilt import create_react_agent
 from models import (
    CompanyTable,
    InvestmentStageTable,
    InvestorMember,
    InvestorTable,
    SectorTable,
    engine,
 )
 from sqlalchemy.orm import sessionmaker
 # A single session is created at import time and shared by everything in this script.
 Session = sessionmaker(bind=engine)
 session = Session()
 # ------------------------------------------------------------------
 # Logging setup
 # ------------------------------------------------------------------
 logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
 )
 logger = logging.getLogger("web_search_agent")
 # ------------------------------------------------------------------
 # Environment
 # ------------------------------------------------------------------
 # Load variables from a .env file before reading the API key.
 load_dotenv()
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
 # A missing key is only warned about here; nothing is raised at import time.
 if not OPENROUTER_API_KEY:
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")
 class QueryProcessor:
    """Builds the enrichment prompt and exposes the tools used by the ReAct agent.

    The agent is given two tools, ``crawl`` and ``web_search``, and is asked to
    answer in the ``InvestorDataScrape`` response format.
    """

    def __init__(self, sql_session: Optional[object] = None):
        """Create the LLM client, the agent and the search client.

        Args:
            sql_session: Optional database session kept on the instance for callers.
        """
        self.sql_session = sql_session
        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )
        self.ddg_search = DDGS()

    @staticmethod
    def _value_or(value, fallback):
        """Return *fallback* when *value* is None or a blank string, else *value*."""
        if value is None:
            return fallback
        if isinstance(value, str) and not value.strip():
            return fallback
        return value

    async def fill_investor(self, investor: InvestorTable):
        """Build the enrichment prompt for one investor row.

        Args:
            investor: Row whose mapped column values are embedded in the prompt.

        Returns:
            The prompt text to send to the agent.
        """
        inv_dict = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }
        # Every column name is a key of ``inv_dict``, so ``dict.get(key, default)``
        # never falls back: empty columns would be rendered as "None" and the
        # "No Website" sentinel that ``crawl`` looks for would never be produced.
        # Substitute the placeholder explicitly for None/blank values; a genuine
        # 0 is kept as-is.
        website = self._value_or(inv_dict.get("website"), "No Website")
        name = self._value_or(inv_dict.get("name"), "Unknown")
        description = self._value_or(inv_dict.get("description"), "No description")
        aum = self._value_or(inv_dict.get("aum"), "Unknown")
        check_size_lower = self._value_or(inv_dict.get("check_size_lower"), "Unknown")
        check_size_upper = self._value_or(inv_dict.get("check_size_upper"), "Unknown")
        geographic_focus = self._value_or(inv_dict.get("geographic_focus"), "Unknown")
        number_of_investments = self._value_or(
            inv_dict.get("number_of_investments"), "Unknown"
        )
        prompt = f"""
        You are a crawler agent. You will be provided with information about a venture capital investor and their website.
        Your task is to navigate the website to find and enrich the existing information.
        If the website is not available, use the `web_search` tool to google the name of the investor company.
        Use the `crawl` tool to visit web pages and extract information.
        Current investor information:
        - Name: {name}
        - Website: {website}
        - Description: {description}
        - Assets Under Management: {aum}
        - Check Size Lower: {check_size_lower}
        - Check Size Upper: {check_size_upper}
        - Geographic Focus: {geographic_focus}
        - Number of Investments: {number_of_investments}
        IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
        - "Seed to Series A" = [SEED, SERIES_A]
        - "Early stage" = [SEED, SERIES_A]  
        - "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
        - "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
        - "Late stage" = [GROWTH, LATE_STAGE]
        - "Series A and B" = [SERIES_A, SERIES_B]
        IMPORTANT: Additional guidance for AUM and Check Size
        - "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
        - "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
        - If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
        - Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
        - Example: "fund size €200M", "typical tickets $1–5M", "raised £1 billion".
        Follow these steps:
        1. Use the `crawl` tool with the main website URL to get the initial content.
        2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
        3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
        4. If AUM or check size are still missing, immediately perform 1–2 `web_search` queries such as:
        - "{name} fund size site:techcrunch.com"
        - "{name} ticket size site:eu-startups.com"
        - "{name} raises fund site:prnewswire.com"
        5. Continue this process, exploring relevant pages, until you have gathered all the required information.
        6. Extract and update the following information:
        - investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
        - team_members: List of key members with name, role, and email/LinkedIn
        - sectors: List of investment sectors they focus on
        - investment_stages: List of ALL investment stages they focus on (can be multiple!)
        7. If any information is not available or cannot be improved, leave it as null or use existing data.
        Stop crawling/searching once you have found the missing information or confirmed it is not available online.
        Website: {website}
        """
        return prompt

    async def crawl(self, url: str):
        """Tool to search the web using a web crawler. given the url"""
        # The docstring above is left untouched on purpose: this method is handed
        # to the agent as a tool.
        print(f"🕷️ Crawling: {url}")
        try:
            # Missing URL or the "No Website" placeholder from the prompt: skip crawling.
            if url == "No Website" or not url or url.strip() == "":
                return "No website provided for this investor. Please use web_search to find information."
            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(url)
                return results.markdown[:5000]  # Limit content to avoid token limits
        except Exception as e:
            # Failures are reported back to the agent as text instead of raising.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Tool to search the web using google"""
        # The docstring above is left untouched on purpose: this method is handed
        # to the agent as a tool.
        print(f"🔍 Searching: {query}")
        try:
            hits = self.ddg_search.text(query, max_results=10, backend="google")
            # Format results for better LLM consumption
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", ""),
                }
                for hit in hits
            ]
        except Exception as e:
            # Failures are reported back to the agent as text instead of raising.
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"
 def needs_enrichment(investor: InvestorTable) -> bool:
    """Report whether any of the tracked investor fields is still empty.

    Prints the list of missing fields when there is at least one.
    """
    # (label, is_missing) pairs, in the order they are reported.
    checks = (
        ("description", not investor.description),
        ("aum", not investor.aum),
        ("check_size", not investor.check_size_lower or not investor.check_size_upper),
        ("geographic_focus", not investor.geographic_focus),
        ("investment_stages", not investor.investment_stages),
        ("team_members", not investor.team_members),
    )
    gaps = [label for label, is_missing in checks if is_missing]
    if not gaps:
        return False
    print(f"Investor {investor.name} missing: {', '.join(gaps)}")
    return True
 # Scalar investor attributes copied from the scrape result when it has a truthy value.
 _CORE_FIELDS = (
    "description",
    "aum",
    "check_size_lower",
    "check_size_upper",
    "geographic_focus",
    "number_of_investments",
 )


 def _stage_key(stage) -> str:
    """Return a comparable name for a stage given as an enum member or a plain string."""
    return getattr(stage, "name", None) or str(stage)


 def _sync_investment_stages(session, investor, stages) -> None:
    """Link the investor to every scraped stage it is not already linked to."""
    # Compare by name: the scraped value and the stored value may be members of
    # different enum classes, and such members never compare equal as objects.
    linked = {_stage_key(row.stage) for row in investor.investment_stages}
    for stage_data in stages:
        key = _stage_key(stage_data.stage)
        if key in linked:
            continue
        stage_row = (
            session.query(InvestmentStageTable)
            .filter_by(stage=stage_data.stage)
            .first()
        )
        if stage_row is None:
            stage_row = InvestmentStageTable(stage=stage_data.stage)
            session.add(stage_row)
            session.flush()  # Get the ID
        investor.investment_stages.append(stage_row)
        # Remember the link so a stage repeated in the scrape is not added twice.
        linked.add(key)


 def _sync_team_members(session, investor, members) -> None:
    """Update existing team members by name and create the ones that are new."""
    # Index and lookup use the same normalisation (strip + lower).
    by_name = {m.name.strip().lower(): m for m in investor.team_members if m.name}
    for incoming in members:
        if not incoming.name:
            continue
        key = incoming.name.strip().lower()
        if not key:
            # Whitespace-only name: nothing usable to store or match on.
            continue
        member = by_name.get(key)
        if member is None:
            member = InvestorMember(
                name=incoming.name.strip(),
                role=incoming.role,
                email=incoming.email,
                investor=investor,
            )
            session.add(member)
            # Index the new member so a repeated name in the scrape updates it.
            by_name[key] = member
            continue
        if incoming.role:
            member.role = incoming.role
        if incoming.email:
            member.email = incoming.email


 def _sync_sectors(session, investor, sectors) -> None:
    """Link the investor to every scraped sector, creating sector rows as needed."""
    for sector_data in sectors:
        if not sector_data.name:
            continue
        sector_row = (
            session.query(SectorTable).filter_by(name=sector_data.name).first()
        )
        if sector_row is None:
            sector_row = SectorTable(name=sector_data.name)
            session.add(sector_row)
            session.flush()  # Get the ID
        if sector_row not in investor.sectors:
            investor.sectors.append(sector_row)


 def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Update an InvestorTable row with extracted data and commit the change.

    Existing values are only overwritten by truthy scraped values. Stages, team
    members and sectors are merged into the investor's current collections.
    Portfolio companies are not synced.

    Args:
        session: Database session used for lookups, inserts and the commit.
        investor: The row to update.
        data: Scrape result with ``investor``, ``investment_stages``,
            ``team_members`` and ``sectors``.

    Returns:
        The same ``investor`` object.

    Raises:
        Exception: Whatever the update or commit raised; the session is rolled
            back first so it stays usable for the next investor.
    """
    try:
        # --- Core investor info ---
        for field in _CORE_FIELDS:
            value = getattr(data.investor, field)
            if value:
                setattr(investor, field, value)
        # --- Investment Stages ---
        if data.investment_stages:
            _sync_investment_stages(session, investor, data.investment_stages)
        # --- Team Members ---
        if data.team_members:
            _sync_team_members(session, investor, data.team_members)
        # --- Sectors ---
        if data.sectors:
            _sync_sectors(session, investor, data.sectors)
        session.add(investor)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until it is rolled back.
        session.rollback()
        raise
    return investor
 # ------------------------------------------------------------------
 # Main
 # ------------------------------------------------------------------
 async def main(limit: int = 10):
    """Enrich investors that have missing fields and persist the results.

    For each selected investor the agent is invoked, its structured response is
    appended to ``enriched_investors.json`` as a backup, and the row is updated.
    A failure for one investor is logged and the loop moves on to the next.

    Args:
        limit: Maximum number of investors processed in one run.
    """
    qp = QueryProcessor(sql_session=session)
    all_investors = qp.sql_session.query(InvestorTable).all() if qp.sql_session else []
    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    for inv in investors_to_enrich[:limit]:
        try:
            print(f"\n🔄 Processing investor: {inv.name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", f"{prompt}")]})
            extracted = ai_response["structured_response"]
            # Save JSON backup. The encoding is explicit so the output does not
            # depend on the platform default.
            with open("enriched_investors.json", "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv.name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")
            # Update database
            update_investor(session, inv, extracted)
            print(f"✅ Updated investor {inv.name} (id={inv.id})")
        except Exception as e:
            logger.error(f"Failed to enrich investor {getattr(inv, 'id', None)}: {e}")
            # The session is shared across iterations; roll it back so a failed
            # flush/commit does not break every following investor.
            session.rollback()
            continue
 if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,408 @@
 from enum import Enum
 from typing import List, Optional
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 class InvestmentStage(str, Enum):
    # Closed set of investment-stage labels. Because the enum mixes in ``str``,
    # every member is itself a string equal to its value
    # (InvestmentStage.SEED == "SEED").
    # NOTE(review): documented with comments rather than a docstring on
    # purpose - Pydantic uses an enum's docstring as its JSON-schema description.
    SEED = "SEED"
    SERIES_A = "SERIES_A"
    SERIES_B = "SERIES_B"
    SERIES_C = "SERIES_C"
    GROWTH = "GROWTH"
    LATE_STAGE = "LATE_STAGE"
 class SectorSchema(BaseModel):
    """
    Expert parser: Only extract sector information if clearly identifiable.
    Leave name empty if uncertain about the sector classification.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Sector ID, must be 0 or greater. Use 0 if uncertain.",
    )
    name: Optional[str] = Field(
        default=None,
        description="Sector name. Leave empty string if not clearly identifiable from the data.",
    )
    @field_validator("name", mode="before")
    @classmethod
    def empty_string_to_none(cls, v):
        """Return None for an empty or whitespace-only string, else ``v`` unchanged."""
        if v == "" or (isinstance(v, str) and v.strip() == ""):
            return None
        return v
    @field_validator("id", mode="before")
    @classmethod
    def zero_to_none(cls, v):
        """Return None when ``v`` equals 0 (the "uncertain" marker), else ``v`` unchanged."""
        if v == 0:
            return None
        return v
 class InvestorMemberSchema(BaseModel):
    """
    Expert parser: Only extract team member information if clearly identifiable.
    Leave fields empty if uncertain about the member details.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Member ID, must be 0 or greater. Use 0 if uncertain.",
    )
    name: Optional[str] = Field(
        default=None,
        description="Team member name. Leave empty string if not clearly identifiable.",
    )
    role: Optional[str] = Field(
        default=None,
        description="Team member role/title. Leave empty string if not clearly identifiable.",
    )
    email: Optional[str] = Field(
        default=None,
        description="Team member email. Leave empty string if not clearly identifiable or not provided.",
    )
    investor_id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Investor ID, must be 0 or greater. Use 0 if uncertain.",
    )
    @field_validator("name", "role", "email", mode="before")
    @classmethod
    def empty_string_to_none(cls, v):
        """Return None for an empty or whitespace-only string, else ``v`` unchanged."""
        if v == "" or (isinstance(v, str) and v.strip() == ""):
            return None
        return v
    @field_validator("id", "investor_id", mode="before")
    @classmethod
    def zero_to_none(cls, v):
        """Return None when ``v`` equals 0 (the "uncertain" marker), else ``v`` unchanged."""
        if v == 0:
            return None
        return v
 class CompanyMemberSchema(BaseModel):
    """
    Expert parser: Only extract company member information if clearly identifiable.
    Leave fields empty if uncertain about the member details.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Member ID, must be 0 or greater. Use 0 if uncertain.",
    )
    name: Optional[str] = Field(
        default=None,
        description="Company member name. Leave empty if not clearly identifiable.",
    )
    linkedin: Optional[str] = Field(
        default=None,
        description="LinkedIn profile URL. Leave empty if not provided or uncertain.",
    )
    role: Optional[str] = Field(
        default=None,
        description="Company member role/title. Leave empty if not clearly identifiable.",
    )
    company_id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Company ID, must be 0 or greater. Use 0 if uncertain.",
    )
    @field_validator("name", "linkedin", "role", mode="before")
    @classmethod
    def empty_string_to_none(cls, v):
        """Return None for an empty or whitespace-only string, else ``v`` unchanged."""
        if v == "" or (isinstance(v, str) and v.strip() == ""):
            return None
        return v
    @field_validator("id", "company_id", mode="before")
    @classmethod
    def zero_to_none(cls, v):
        """Return None when ``v`` equals 0 (the "uncertain" marker), else ``v`` unchanged."""
        if v == 0:
            return None
        return v
 class CompanySchema(BaseModel):
    """
    Expert parser: Only extract company information if clearly identifiable.
    Leave optional fields empty if uncertain. Integer values must be 0 or greater.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Company ID, must be 0 or greater. Use 0 if uncertain.",
    )
    name: Optional[str] = Field(
        default=None,
        description="Company name. Leave empty string if not clearly identifiable.",
    )
    industry: Optional[str] = Field(
        default=None,
        description="Company industry/sector. Leave empty string if not clearly identifiable.",
    )
    location: Optional[str] = Field(
        default=None,
        description="Company location/address. Leave empty string if not clearly identifiable.",
    )
    description: Optional[str] = Field(
        default=None,
        description="Company description. Leave empty if not clearly available or uncertain.",
    )
    founded_year: Optional[int] = Field(
        default=None,
        ge=0,
        description="Year company was founded, must be 0 or greater. Leave None if not clearly identifiable or uncertain.",
    )
    website: Optional[str] = Field(
        default=None,
        description="Company website URL. Leave empty if not provided or uncertain.",
    )
    @field_validator(
        "name", "industry", "location", "description", "website", mode="before"
    )
    @classmethod
    def empty_string_to_none(cls, v):
        """Return None for an empty or whitespace-only string, else ``v`` unchanged."""
        if v == "" or (isinstance(v, str) and v.strip() == ""):
            return None
        return v
    @field_validator("id", "founded_year", mode="before")
    @classmethod
    def zero_to_none(cls, v):
        """Return None when ``id`` or ``founded_year`` equals 0, else ``v`` unchanged."""
        if v == 0:
            return None
        return v
    @field_validator("founded_year", mode="before")
    @classmethod
    def validate_founded_year(cls, v):
        """Normalise a raw founding year to a positive int, or None if unusable.

        Accepts ints, integer-valued floats (1999.0) and numeric strings
        ("1999"). Placeholders such as "Unknown" or "Not Available", empty
        strings, booleans, zero, negative numbers and any other type all
        become None. The result does not depend on whether ``zero_to_none``
        ran before or after this validator.
        """
        # bool is a subclass of int; True/False is never a meaningful year.
        if v is None or isinstance(v, bool):
            return None
        if isinstance(v, str):
            try:
                year = int(v)
            except ValueError:
                # Covers "", "Unknown", "Not Available" and any other non-numeric text.
                return None
        elif isinstance(v, float):
            # Keep whole-number floats; reject fractions, NaN and infinities.
            if not v.is_integer():
                return None
            year = int(v)
        elif isinstance(v, int):
            year = v
        else:
            return None
        # Zero is the schema's "uncertain" marker, so treat it like a missing
        # value for every input form, not just for the integer 0.
        return year if year > 0 else None
 class InvestmentStageSchema(BaseModel):
    """
    Investment stage schema for many-to-many relationship.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # - from_attributes: allow validation from attribute-bearing objects.
    # - use_enum_values: store the enum's value (the plain string) on the
    #   model instead of the InvestmentStage member.
    model_config = ConfigDict(from_attributes=True, use_enum_values=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Stage ID, must be 0 or greater. Use 0 if uncertain.",
    )
    stage: InvestmentStage = Field(
        description="Investment stage enum value. Must be one of: SEED, SERIES_A, SERIES_B, SERIES_C, GROWTH, LATE_STAGE"
    )
    @field_validator("id", mode="before")
    @classmethod
    def validate_id(cls, v):
        """Return None when ``v`` equals 0 (the "uncertain" marker), else ``v`` unchanged."""
        if v == 0:
            return None
        return v
 class InvestorSchema(BaseModel):
    """
    Expert parser: Only extract investor information if clearly identifiable.
    Leave optional fields empty if uncertain. All numeric values must be 0 or greater.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    id: Optional[int] = Field(
        default=None,
        ge=0,
        description="Investor ID, must be 0 or greater. Use 0 if uncertain.",
    )
    name: Optional[str] = Field(
        default=None,
        description="Investor name. Do not return any special characters, Just the name as a string.",
    )
    description: Optional[str] = Field(
        default=None,
        description="Investor description. Leave empty if not clearly available or uncertain.",
    )
    aum: Optional[int] = Field(
        default=None,
        ge=0,
        description="Assets Under Management in USD, must be 0 or greater. Use 0 if not clearly identifiable or uncertain.",
    )
    check_size_lower: Optional[int] = Field(
        default=None,
        ge=0,
        description="Lower bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
    )
    check_size_upper: Optional[int] = Field(
        default=None,
        ge=0,
        description="Upper bound of typical investment check size in USD, must be 0 or greater. Use 0 if not clearly identifiable.",
    )
    geographic_focus: Optional[str] = Field(
        default=None,
        description="Geographic investment focus. Do not return any special characters, Just locations separated by commas. Leave empty if not clearly identifiable.",
    )
    number_of_investments: Optional[int] = Field(
        default=None,
        ge=0,
        description="Total number of investments made, must be 0 or greater. Use 0 if not clearly identifiable.",
    )
    @field_validator("name", "description", "geographic_focus", mode="before")
    @classmethod
    def empty_string_to_none(cls, v):
        """Return None for an empty or whitespace-only string, else ``v`` unchanged."""
        if v == "" or (isinstance(v, str) and v.strip() == ""):
            return None
        return v
    @field_validator(
        "id",
        "aum",
        "check_size_lower",
        "check_size_upper",
        "number_of_investments",
        mode="before",
    )
    @classmethod
    def zero_to_none(cls, v):
        """Return None when ``v`` equals 0 (the "uncertain" marker), else ``v`` unchanged."""
        if v == 0:
            return None
        return v
 class InvestorData(BaseModel):
    """
    Expert parser: Comprehensive investor data schema for LLM processing.
    Only populate fields with clearly identifiable information. Leave lists empty if uncertain.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    # ``investor`` is the only required field; every list below defaults to
    # empty. ``default=[]`` is safe here because Pydantic copies mutable
    # field defaults for each instance.
    investor: InvestorSchema = Field(
        description="Core investor information. Only populate with clearly identifiable data."
    )
    portfolio_companies: List[CompanySchema] = Field(
        default=[],
        description="List of portfolio companies. Leave empty if not clearly identifiable.",
    )
    team_members: List[InvestorMemberSchema] = Field(
        default=[],
        description="List of team members. Leave empty if not clearly identifiable.",
    )
    sectors: List[SectorSchema] = Field(
        default=[],
        description="List of investment sectors. Leave empty if not clearly identifiable.",
    )
    investment_stages: List[InvestmentStageSchema] = Field(
        default=[],
        description="List of investment stages the investor focuses on (can be multiple). Look for terms like 'seed to series A', 'early stage', 'multi-stage', etc. Leave empty if not clearly identifiable.",
    )
 class InvestorDataScrape(BaseModel):
    """
    Expert parser: Comprehensive investor data schema for LLM processing.
    Only populate fields with clearly identifiable information. Leave lists empty if uncertain.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Same field set as InvestorData minus ``portfolio_companies``.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    # ``investor`` is the only required field; every list below defaults to
    # empty. ``default=[]`` is safe here because Pydantic copies mutable
    # field defaults for each instance.
    investor: InvestorSchema = Field(
        description="Core investor information. Only populate with clearly identifiable data."
    )
    team_members: List[InvestorMemberSchema] = Field(
        default=[],
        description="List of team members. Leave empty if not clearly identifiable.",
    )
    sectors: List[SectorSchema] = Field(
        default=[],
        description="List of investment sectors. Leave empty if not clearly identifiable.",
    )
    investment_stages: List[InvestmentStageSchema] = Field(
        default=[],
        description="List of investment stages the investor focuses on (can be multiple). Look for terms like 'seed to series A', 'early stage', 'multi-stage', etc. Leave empty if not clearly identifiable.",
    )
 class CompanyData(BaseModel):
    """
    Expert parser: Comprehensive company data schema for LLM processing.
    Only populate fields with clearly identifiable information. Leave lists empty if uncertain.
    """
    # NOTE: Pydantic copies the class docstring and every Field description
    # into the generated JSON schema, so treat that text as runtime data.
    # Pydantic v2 configuration (replaces the deprecated nested ``class Config``):
    # from_attributes lets the model be validated from attribute-bearing objects.
    model_config = ConfigDict(from_attributes=True)
    # ``company`` is the only required field; every list below defaults to
    # empty. ``default=[]`` is safe here because Pydantic copies mutable
    # field defaults for each instance.
    company: CompanySchema = Field(
        description="Core company information. Only populate with clearly identifiable data."
    )
    sectors: List[SectorSchema] = Field(
        default=[],
        description="List of company sectors. Leave empty if not clearly identifiable.",
    )
    members: List[CompanyMemberSchema] = Field(
        default=[],
        description="List of company members. Leave empty if not clearly identifiable.",
    )
    investors: List[InvestorSchema] = Field(
        default=[],
        description="List of investors. Leave empty if not clearly identifiable.",
    )
 class InvestorList(BaseModel):
    """Expert parser: List of investors with clearly identifiable information only."""
    investors: List[InvestorData] = Field(
        default=[],
        description="List of investors. Leave empty if no clearly identifiable investors.",
    )