feat: Implement database ingestion for investors and companies

- Added main ingestion logic in main.py to process CSV files for investors and companies.
- Implemented data cleaning functions for names, strings, integers, and websites.
- Established relationships between investors, companies, and sectors using SQLAlchemy ORM.
- Created models for investors, companies, sectors, and their relationships in models.py.
- Set up logging for error tracking during data processing.
- Initialized database and created necessary tables.
2025-10-07 20:01:19 +01:00
parent a9589e54f3
commit 84e3c7b72a
32 changed files with 4 additions and 33994 deletions
@@ -0,0 +1,315 @@
+import logging
+import re
+import unicodedata
+
+import pandas as pd
+from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
+from sqlalchemy.orm import sessionmaker
+
+# Set up logging: configure the root logger at INFO level as soon as this
+# module is imported, and create the module-level logger used for row errors.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Create the database tables before any ingestion code runs. This is an
+# import-time side effect: it happens even when the module is only imported.
+init_database()
+
+
+# ===================== Ingesting Original Data =====================#
+def parse_investor_names(investor_names_str):
+    """Split a comma-separated investor string into a list of cleaned names.
+
+    Every comma-delimited piece is whitespace-stripped and passed through
+    ``clean_name``; pieces that clean down to nothing are left out. A missing
+    (NaN/None) or empty input yields an empty list.
+    """
+    if pd.isna(investor_names_str) or investor_names_str == "":
+        return []
+
+    cleaned_names = []
+    for raw_name in str(investor_names_str).split(","):
+        name = clean_name(raw_name.strip())
+        # clean_name returns None for placeholders and empty results
+        if name:
+            cleaned_names.append(name)
+    return cleaned_names
+
+
+def parse_industries(industries_str):
+    """Split a comma-separated industries string into a list of names.
+
+    Each piece is whitespace-stripped and empty pieces are dropped. A missing
+    (NaN/None) or empty input yields an empty list.
+    """
+    if pd.isna(industries_str) or industries_str == "":
+        return []
+
+    # Strip lazily, then keep only the pieces that still have content
+    stripped_pieces = (piece.strip() for piece in str(industries_str).split(","))
+    return [piece for piece in stripped_pieces if piece]
+
+
+def clean_special_characters(text):
+    """Reduce *text* to ASCII letters, digits, spaces, hyphens and periods.
+
+    Runs of two or three periods are removed, accented characters are folded
+    to their ASCII base letters, every other character outside the allowed
+    set is dropped, and whitespace is collapsed to single spaces. Falsy input
+    (``None``, ``""``, ``0``) is returned unchanged.
+    """
+    if not text:
+        return text
+
+    # "..." has to be removed before ".." so a triple does not leave a lone "."
+    without_ellipses = str(text).replace("...", "").replace("..", "")
+
+    # NFKD separates base letters from their accents; encoding to ASCII with
+    # errors="ignore" then discards the accents and any other non-ASCII char.
+    ascii_only = (
+        unicodedata.normalize("NFKD", without_ellipses)
+        .encode("ascii", "ignore")
+        .decode("ascii")
+    )
+
+    # Keep only alphanumerics, whitespace, hyphens and periods
+    allowed_only = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_only)
+
+    # Collapse whitespace runs and trim the ends
+    return re.sub(r"\s+", " ", allowed_only).strip()
+
+
+def clean_string(value):
+    """Return a cleaned string, or ``None`` when the value carries no data.
+
+    Missing values (NaN/None), empty strings and the placeholder tokens
+    "nan", "null", "none", "0" and "0.0" are treated as "no data". The
+    placeholder check ignores case and surrounding whitespace, and is applied
+    both before and after special-character cleaning, so padded or
+    upper-case placeholders such as " NULL " are not stored as real values.
+
+    Returns:
+        The text after ``clean_special_characters``, or ``None``.
+    """
+    if pd.isna(value):
+        return None
+
+    placeholders = ("", "nan", "null", "none", "0", "0.0")
+
+    text = str(value).strip()
+    if text.lower() in placeholders:
+        return None
+
+    # Remove special characters, then re-check: cleaning can strip a value
+    # down to a placeholder (or to nothing at all).
+    cleaned = clean_special_characters(text)
+    if not cleaned or cleaned.lower() in placeholders:
+        return None
+
+    return cleaned
+
+
+def clean_name(value):
+    """Clean a company or investor name, or return ``None`` if it is empty.
+
+    Missing values (NaN/None), empty strings and the placeholder tokens
+    "nan", "null", "none", "0" and "0.0" (any case, surrounding whitespace
+    ignored) give ``None``. Otherwise the name is folded to ASCII, restricted
+    to letters, digits, whitespace, hyphens, periods, parentheses and
+    ampersands, stripped of ellipsis-like runs of periods, whitespace-
+    collapsed, and trimmed of leading/trailing periods and spaces.
+
+    Returns:
+        The cleaned name, or ``None`` when nothing meaningful remains.
+    """
+    if pd.isna(value):
+        return None
+
+    placeholders = ("", "nan", "null", "none", "0", "0.0")
+
+    text = str(value).strip()
+    if text.lower() in placeholders:
+        return None
+
+    # Normalize unicode characters; NFKD also expands the single-character
+    # ellipsis into "...", so it is caught by the run removal below.
+    normalized = unicodedata.normalize("NFKD", text)
+
+    # Convert to ASCII but keep more characters for business names
+    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
+
+    # Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands
+    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)
+
+    # Remove every run of two or more periods in one pass. Replacing ".."
+    # before "..." (as chained str.replace calls would) leaves a stray "."
+    # behind for each triple.
+    cleaned = re.sub(r"\.{2,}", "", cleaned)
+
+    # Collapse whitespace only after the removals above, then trim periods and
+    # spaces together so neither can expose the other at the ends.
+    cleaned = re.sub(r"\s+", " ", cleaned).strip(" .")
+
+    # Cleaning can reduce a value to a placeholder or to nothing
+    if cleaned.lower() in placeholders:
+        return None
+
+    return cleaned
+
+
+def clean_integer(value):
+    """Convert *value* to a positive ``int``, or return ``None``.
+
+    Missing values (NaN/None), the placeholder tokens "nan", "null", "none",
+    "", "0" and "0.0" (any case), anything that cannot be parsed as a number,
+    infinities, and results that are zero or negative all give ``None``.
+    Fractional inputs are truncated toward zero ("3.9" -> 3).
+    """
+    if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:
+        return None
+    try:
+        # Go through float so strings such as "12.0" or "1e3" are accepted
+        cleaned_val = int(float(value))
+    except (ValueError, TypeError, OverflowError):
+        # OverflowError: int() of an infinite float ("inf", "-inf")
+        return None
+    return cleaned_val if cleaned_val > 0 else None
+
+
+def parse_website(website_str: str):
+    """Normalize a website value to an ``https:`` URL, or return ``None``.
+
+    Everything after the first ":" is kept and re-prefixed with "https:", so
+    "http://example.com" becomes "https://example.com". Only the first colon
+    separates the scheme, which keeps URLs that contain further colons
+    (for example a port number) intact.
+
+    Returns:
+        The rewritten URL, or ``None`` when the input is not a string, has no
+        ":" at all (e.g. "nan", "0"), or has the placeholder "0" after it.
+    """
+    if not isinstance(website_str, str):
+        return None
+
+    # partition splits on the first ":" only; `separator` is "" when the
+    # input contains no colon.
+    _, separator, end = website_str.partition(":")
+    if not separator or end == "0":
+        return None
+    return "https:" + end
+
+
+def ingest_data():
+    """Load ``companies.csv`` and ``investors.csv`` into the database.
+
+    The work happens in three passes over a single session:
+
+    1. Insert an investor row for every cleaned investor name that is not
+       stored yet (committed in batches of 1000).
+    2. Insert companies, link each one to the investors named in its row that
+       already exist in the database, and create/link its sectors (committed
+       roughly every 100 new companies).
+    3. Give every investor the union of the sectors of its portfolio
+       companies.
+
+    Row-level failures are logged and the row is skipped. Whenever the
+    session is rolled back, rows added since the last batch commit are
+    discarded together with the failing row. Progress and final table counts
+    are printed to stdout. The session is closed even if ingestion aborts
+    part-way (for example when a CSV file is missing).
+    """
+    # Local import: only this function needs the exception type.
+    from sqlalchemy.exc import SQLAlchemyError
+
+    # Create database engine and session
+    Session = sessionmaker(bind=engine)
+    session = Session()
+
+    try:
+        # Load CSV files
+        print("Loading CSV files...")
+        companies_df = pd.read_csv("companies.csv")
+        investors_df = pd.read_csv("investors.csv")
+
+        print(f"📊 Companies CSV: {len(companies_df)} rows")
+        print(f"📊 Investors CSV: {len(investors_df)} rows")
+
+        # Step 1: Ingest Investors
+        print("\n🔄 Step 1: Ingesting Investors...")
+        investors_processed = 0
+
+        for index, row in investors_df.iterrows():
+            try:
+                investor_name = clean_name(row.get("Filtered investor names", ""))
+                if not investor_name:
+                    continue
+
+                # Check if investor already exists
+                existing_investor = (
+                    session.query(InvestorTable).filter_by(name=investor_name).first()
+                )
+                if existing_investor:
+                    continue
+
+                investor = InvestorTable(
+                    name=investor_name,
+                    description=clean_string(row.get("Business model", "")),
+                    headquarters=clean_string(row.get("HQ", "")),
+                    website=parse_website(str(row.get("Website", "")).strip()),
+                    number_of_investments=clean_integer(
+                        row.get("Number of investments")
+                    ),
+                )
+                session.add(investor)
+                investors_processed += 1
+
+                if investors_processed % 1000 == 0:
+                    session.commit()
+                    print(f"  Committed {investors_processed} investors")
+
+            except SQLAlchemyError as e:
+                # After a failed flush/commit the session refuses further work
+                # until it is rolled back; without this every later row would
+                # fail as well. The rollback also drops investors added since
+                # the last batch commit, so the printed count may overstate
+                # what was stored.
+                logger.error(f"Error processing investor {index}: {e}")
+                session.rollback()
+                continue
+            except Exception as e:
+                # Non-database failure (e.g. in a cleaning helper): skip the
+                # row but keep the pending batch.
+                logger.error(f"Error processing investor {index}: {e}")
+                continue
+
+        session.commit()
+        print(f"✅ Investors completed: {investors_processed} processed")
+
+        # Step 2: Ingest Companies and Sectors
+        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
+        companies_processed = 0
+        sectors_created = set()
+
+        for index, row in companies_df.iterrows():
+            try:
+                # Process company
+                company_name = clean_name(row.get("Organization Name", ""))
+                if not company_name:
+                    continue
+
+                # Check if company already exists
+                existing_company = (
+                    session.query(CompanyTable).filter_by(name=company_name).first()
+                )
+                if existing_company:
+                    company = existing_company
+                else:
+                    # Create company
+                    company = CompanyTable(
+                        name=company_name,
+                        description=clean_string(
+                            row.get("Organization Description", "")
+                        ),
+                        location=clean_string(row.get("Organization Location", "")),
+                        industry=clean_string(row.get("Organization Industries", "")),
+                        website=clean_string(row.get("Organization Website", "")),
+                    )
+                    session.add(company)
+                    session.flush()  # Get the company ID
+                    companies_processed += 1
+
+                # Process investor relationships
+                investor_names_str = row.get("Investor Names", "")
+                if pd.notna(investor_names_str) and investor_names_str:
+                    investor_names = parse_investor_names(investor_names_str)
+
+                    for investor_name in investor_names:
+                        # Find investor in database
+                        investor = (
+                            session.query(InvestorTable)
+                            .filter_by(name=investor_name.strip())
+                            .first()
+                        )
+
+                        if investor:
+                            # Add investor-company relationship
+                            if company not in investor.portfolio_companies:
+                                investor.portfolio_companies.append(company)
+                        else:
+                            print(
+                                "This company has an investor not in DB:",
+                                investor_name,
+                            )
+
+                # Process sectors/industries
+                industries_str = row.get("Organization Industries", "")
+                if pd.notna(industries_str) and industries_str:
+                    industries = parse_industries(industries_str)
+
+                    for industry_name in industries:
+                        industry_name = industry_name.strip()
+                        if industry_name:
+                            # Check if sector exists
+                            sector = (
+                                session.query(SectorTable)
+                                .filter_by(name=industry_name)
+                                .first()
+                            )
+                            if not sector:
+                                sector = SectorTable(name=industry_name)
+                                session.add(sector)
+                                session.flush()
+                                sectors_created.add(industry_name)
+
+                            # Add company-sector relationship
+                            if sector not in company.sectors:
+                                company.sectors.append(sector)
+
+                # Commit every 100 companies
+                if companies_processed % 100 == 0 and companies_processed > 0:
+                    session.commit()
+                    print(f"  Processed {companies_processed} companies...")
+
+            except Exception as e:
+                logger.error(f"Error processing company {index}: {e}")
+                session.rollback()
+                continue
+
+        # Step 3: Link investors to sectors based on portfolio companies
+        print("\n🔄 Step 3: Linking Investors to Sectors...")
+        investors_linked_to_sectors = 0
+        all_investors = session.query(InvestorTable).all()
+        for investor in all_investors:
+            sectors = set()
+            for company in investor.portfolio_companies:
+                for sector in company.sectors:
+                    sectors.add(sector)
+            # Add sectors to investor if not already present
+            for sector in sectors:
+                if sector not in investor.sectors:
+                    investor.sectors.append(sector)
+            if sectors:
+                investors_linked_to_sectors += 1
+
+        # This commit also persists whatever was still pending from step 2
+        session.commit()
+        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")
+
+        # Final counts
+        final_investors = session.query(InvestorTable).count()
+        final_companies = session.query(CompanyTable).count()
+        final_sectors = session.query(SectorTable).count()
+
+        print("\n🎉 Ingestion Complete!")
+        print(f"   Investors: {final_investors}")
+        print(f"   Companies: {final_companies}")
+        print(f"   Sectors: {final_sectors}")
+    finally:
+        # Release the connection whether ingestion finished or raised
+        session.close()
+
+
+# Run the full ingestion only when this file is executed as a script;
+# importing the module does not call ingest_data().
+if __name__ == "__main__":
+    ingest_data()
+    # Disabled ad-hoc checks of how clean_name handles runs of periods:
+    # print(clean_name("A... Energi"))
+    # print(clean_name("B.. Tech"))
+    # print(clean_name("A... Energi"))
@@ -0,0 +1,381 @@
+import enum
+from typing import Annotated
+
+from fastapi import Depends
+from sqlalchemy import (
+    Column,
+    DateTime,
+    ForeignKey,
+    Integer,
+    String,
+    Table,
+    Text,
+    create_engine,
+    func,
+)
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import Session, declarative_mixin, relationship, sessionmaker
+from sqlalchemy.types import JSON, Enum
+
+# Declarative base class; every ORM model in this module inherits from it and
+# registers its table on Base.metadata.
+Base = declarative_base()
+
+# Database configuration
+# NOTE(review): the environment-variable override below is disabled, so the
+# URL is hard-coded in create_engine(). `os` does not appear in the visible
+# import block, so re-enabling this line would also need `import os`.
+# DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")
+
+# Create engine: SQLite file "investors.db" addressed relative to the current
+# working directory; echo=False turns off SQL statement logging.
+engine = create_engine("sqlite:///./investors.db", echo=False)
+
+# Create session factory: sessions produced here are bound to `engine` and do
+# not autoflush before queries.
+SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+
+
+def get_db():
+    """Yield a new database session and close it once the consumer is done.
+
+    Written as a generator so it can be used as a dependency (see
+    ``db_dependency``): the session is closed when the generator is resumed
+    or torn down, including when an exception is raised at the ``yield``.
+    """
+    # Leaving the `with` block closes the session, on success and on error
+    with SessionLocal() as db:
+        yield db
+
+
+# FastAPI dependency alias: annotating a parameter with this type makes
+# FastAPI supply a Session obtained from get_db().
+db_dependency = Annotated[Session, Depends(get_db)]
+
+
+def init_database():
+    """Create the tables of every model registered on ``Base`` using ``engine``."""
+    metadata = Base.metadata
+    metadata.create_all(bind=engine)
+
+
+def get_session_sync() -> Session:
+    """Open and return a new session from ``SessionLocal``.
+
+    The session is not closed here; closing it is left to the caller.
+    """
+    session = SessionLocal()
+    return session
+
+
+def get_db_session() -> Session:
+    """Open and return a new session from ``SessionLocal`` for direct use.
+
+    Behaves exactly like ``get_session_sync``; the return annotation is
+    declared here as well so both helpers expose the same signature. The
+    session is not closed here; closing it is left to the caller.
+    """
+    return SessionLocal()
+
+
+@declarative_mixin
+class TimestampMixin:
+    """Mixin that adds ``created_at`` and ``updated_at`` columns to a model."""
+
+    # Required column; the value comes from a server-side default (func.now())
+    # when the INSERT does not provide one.
+    created_at = Column(
+        DateTime(timezone=True), server_default=func.now(), nullable=False
+    )
+    # Only an onupdate hook is declared (no default), so this stays NULL
+    # until the row is updated, unless a value is set explicitly.
+    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
+
+
+class InvestmentStage(enum.Enum):
+    """Closed set of funding stages; each member's value equals its name.
+
+    Used as the type of the ``ProjectTable.stage`` column.
+    """
+
+    SEED = "SEED"
+    SERIES_A = "SERIES_A"
+    SERIES_B = "SERIES_B"
+    SERIES_C = "SERIES_C"
+    GROWTH = "GROWTH"
+    LATE_STAGE = "LATE_STAGE"
+
+
+# Association table for many-to-many relationship between investors and companies
+# NOTE(review): none of the association tables in this group declares a
+# primary key or unique constraint, so the schema itself does not prevent
+# duplicate link rows — confirm whether that is intended.
+investor_company_association = Table(
+    "investor_companies",
+    Base.metadata,
+    Column("investor_id", Integer, ForeignKey("investors.id")),
+    Column("company_id", Integer, ForeignKey("companies.id")),
+)
+
+
+# Association table for investor-sector many-to-many
+investor_sector_association = Table(
+    "investor_sectors",
+    Base.metadata,
+    Column("investor_id", Integer, ForeignKey("investors.id")),
+    Column("sector_id", Integer, ForeignKey("sectors.id")),
+)
+
+
+# Association table for company-sector many-to-many
+company_sector_association = Table(
+    "company_sector",
+    Base.metadata,
+    Column("company_id", Integer, ForeignKey("companies.id")),
+    Column("sector_id", Integer, ForeignKey("sectors.id")),
+)
+
+# Association table for project-sector many-to-many
+project_sector_association = Table(
+    "project_sector",
+    Base.metadata,
+    Column("project_id", Integer, ForeignKey("projects.id")),
+    Column("sector_id", Integer, ForeignKey("sectors.id")),
+)
+
+# Association table for project-investor many-to-many
+project_investor_association = Table(
+    "project_investors",
+    Base.metadata,
+    Column("project_id", Integer, ForeignKey("projects.id")),
+    Column("investor_id", Integer, ForeignKey("investors.id")),
+)
+
+# Association table for project-company many-to-many
+project_company_association = Table(
+    "project_companies",
+    Base.metadata,
+    Column("project_id", Integer, ForeignKey("projects.id")),
+    Column("company_id", Integer, ForeignKey("companies.id")),
+)
+
+# Association table for investor-stage many-to-many
+investor_stage_association = Table(
+    "investor_stages",
+    Base.metadata,
+    Column("investor_id", Integer, ForeignKey("investors.id")),
+    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
+)
+
+# Association table for fund-stage many-to-many
+fund_investment_stages_association = Table(
+    "fund_investment_stages",
+    Base.metadata,
+    Column("fund_id", Integer, ForeignKey("funds.id")),
+    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
+)
+
+# Association table for fund-sector many-to-many
+fund_sectors_association = Table(
+    "fund_sectors",
+    Base.metadata,
+    Column("fund_id", Integer, ForeignKey("funds.id")),
+    Column("sector_id", Integer, ForeignKey("sectors.id")),
+)
+
+
+class InvestorTable(Base, TimestampMixin):
+    """ORM model for the ``investors`` table.
+
+    Holds the investor's profile fields and research metadata, owns its team
+    members and funds (both deleted with the investor via the delete-orphan
+    cascade), and is linked many-to-many to investment stages, portfolio
+    companies, sectors and projects.
+    """
+
+    __tablename__ = "investors"
+
+    id = Column(Integer, primary_key=True, index=True)
+    # NOTE(review): no unique constraint is declared on name, so uniqueness
+    # is not enforced by the schema.
+    name = Column(String, nullable=False)
+    description = Column(Text, nullable=True)
+
+    # Basic investor info
+    website = Column(String, nullable=True)
+    headquarters = Column(String, nullable=True)
+
+    # AUM fields
+    aum = Column(Integer, nullable=True)  # Store as integer for numerical filtering
+    aum_as_of_date = Column(String, nullable=True)
+    aum_source_url = Column(String, nullable=True)
+
+    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
+    check_size_lower = Column(Integer, nullable=True)
+    check_size_upper = Column(Integer, nullable=True)
+
+    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
+    geographic_focus = Column(String, nullable=True)
+
+    # Investment thesis and portfolio
+    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
+    portfolio_highlights = Column(
+        JSON, nullable=True
+    )  # Array of portfolio company names
+    linked_documents = Column(JSON, nullable=True)  # Array of document URLs
+
+    # Research metadata
+    researcher_notes = Column(Text, nullable=True)
+    missing_important_fields = Column(
+        JSON, nullable=True
+    )  # Array of missing field names
+    sources = Column(JSON, nullable=True)  # JSON object with source URLs
+
+    # Portfolio info
+    number_of_investments = Column(Integer, nullable=True)
+
+    # Relationships
+    # One-to-many children; "all, delete-orphan" removes them together with
+    # the investor or when they are detached from it.
+    team_members = relationship(
+        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
+    )
+    funds = relationship(
+        "FundTable", back_populates="investor", cascade="all, delete-orphan"
+    )
+
+    # Many-to-many relationship with investment stages
+    investment_stages = relationship(
+        "InvestmentStageTable",
+        secondary=investor_stage_association,
+        back_populates="investors",
+    )
+
+    # Relationship to portfolio companies
+    portfolio_companies = relationship(
+        "CompanyTable",
+        secondary=investor_company_association,
+        back_populates="investors",
+    )
+
+    # Many-to-many relationship with sectors
+    sectors = relationship(
+        "SectorTable",
+        secondary=investor_sector_association,
+        back_populates="investors",
+    )
+
+    # Many-to-many relationship with projects
+    projects = relationship(
+        "ProjectTable",
+        secondary=project_investor_association,
+        back_populates="investors",
+    )
+
+
+class InvestorMember(Base, TimestampMixin):
+    """ORM model for the ``investor_members`` table: a person on an investor's team."""
+
+    __tablename__ = "investor_members"
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String, nullable=False)
+    role = Column(String, nullable=True)
+    title = Column(String, nullable=True)  # Alternative to role
+    email = Column(String, nullable=True)
+    source_url = Column(String, nullable=True)  # URL where member info was found
+
+    # NOTE(review): nullable=False is not set here (unlike FundTable.investor_id),
+    # so the schema allows a member row without an investor — confirm intent.
+    investor_id = Column(Integer, ForeignKey("investors.id"))
+    # Parent side of InvestorTable.team_members
+    investor = relationship("InvestorTable", back_populates="team_members")
+
+
+class FundTable(Base, TimestampMixin):
+    """ORM model for the ``funds`` table: one fund belonging to an investor.
+
+    Every fund must reference an investor; stages and sectors are linked
+    many-to-many through their association tables.
+    """
+
+    __tablename__ = "funds"
+
+    id = Column(Integer, primary_key=True, index=True)
+    # Owning investor (required)
+    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
+
+    # Fund details
+    fund_name = Column(String, nullable=True)
+    fund_size = Column(
+        Integer, nullable=True
+    )  # Store as integer for numerical filtering
+    fund_size_source_url = Column(String, nullable=True)
+
+    # Check size range (parsed from estimated_investment_size by LLM)
+    check_size_lower = Column(Integer, nullable=True)
+    check_size_upper = Column(Integer, nullable=True)
+
+    source_url = Column(String, nullable=True)
+    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"
+
+    # Geographic focus as simple string
+    geographic_focus = Column(String, nullable=True)
+
+    # Relationships
+    # Parent side of InvestorTable.funds
+    investor = relationship("InvestorTable", back_populates="funds")
+    investment_stages = relationship(
+        "InvestmentStageTable",
+        secondary=fund_investment_stages_association,
+        back_populates="funds",
+    )
+    sectors = relationship(
+        "SectorTable",
+        secondary=fund_sectors_association,
+        back_populates="funds",
+    )
+
+
+class InvestmentStageTable(Base, TimestampMixin):
+    """ORM model for the ``investment_stages`` table.
+
+    A uniquely named stage that investors and funds link to many-to-many.
+    """
+
+    __tablename__ = "investment_stages"
+
+    id = Column(Integer, primary_key=True, index=True)
+    # Stage names are unique at the schema level
+    name = Column(String, nullable=False, unique=True)
+
+    # Relationships
+    investors = relationship(
+        "InvestorTable",
+        secondary=investor_stage_association,
+        back_populates="investment_stages",
+    )
+    funds = relationship(
+        "FundTable",
+        secondary=fund_investment_stages_association,
+        back_populates="investment_stages",
+    )
+
+
+class CompanyTable(Base, TimestampMixin):
+    """ORM model for the ``companies`` table.
+
+    Owns its members (deleted with the company via the delete-orphan cascade)
+    and is linked many-to-many to investors, sectors and projects.
+    """
+
+    __tablename__ = "companies"
+
+    id = Column(Integer, primary_key=True, index=True)
+    # NOTE(review): no unique constraint is declared on name, so uniqueness
+    # is not enforced by the schema.
+    name = Column(String, nullable=False)
+    # Free-text industry value, stored in addition to the `sectors` links below
+    industry = Column(String, nullable=True)
+    location = Column(String, nullable=True)
+    # NOTE(review): String here, while the other models declare description
+    # as Text — confirm whether the difference is intended.
+    description = Column(String, nullable=True)
+    founded_year = Column(Integer, nullable=True)
+    website = Column(String, nullable=True)
+
+    # One-to-many children, removed together with the company
+    members = relationship(
+        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
+    )
+    # Relationship back to investors
+    investors = relationship(
+        "InvestorTable",
+        secondary=investor_company_association,
+        back_populates="portfolio_companies",
+    )
+
+    # Many-to-many relationship with sectors
+    sectors = relationship(
+        "SectorTable", secondary=company_sector_association, back_populates="companies"
+    )
+
+    # Many-to-many relationship with projects
+    projects = relationship(
+        "ProjectTable",
+        secondary=project_company_association,
+        back_populates="companies",
+    )
+
+
+class CompanyMember(Base, TimestampMixin):
+    """ORM model for the ``company_members`` table: a person attached to a company."""
+
+    __tablename__ = "company_members"
+    id = Column(Integer, primary_key=True)
+    # NOTE(review): nullable is not specified here, so a member may be stored
+    # without a name (InvestorMember.name is nullable=False) — confirm intent.
+    name = Column(String)
+    linkedin = Column(String, nullable=True)
+    role = Column(String, nullable=True)
+    # Owning company (required)
+    company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
+
+    # Parent side of CompanyTable.members
+    company = relationship("CompanyTable", back_populates="members")
+
+
+class SectorTable(Base, TimestampMixin):
+    """ORM model for the ``sectors`` table.
+
+    A named sector that investors, companies, projects and funds link to
+    many-to-many.
+    """
+
+    __tablename__ = "sectors"
+
+    id = Column(Integer, primary_key=True, index=True)
+    # NOTE(review): unlike InvestmentStageTable.name, no unique constraint is
+    # declared, so duplicate sector names are possible at the schema level.
+    name = Column(String, nullable=False)
+
+    # Relationships
+    investors = relationship(
+        "InvestorTable",
+        secondary=investor_sector_association,
+        back_populates="sectors",
+    )
+    companies = relationship(
+        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
+    )
+    # The matching attribute on ProjectTable is named `sector` (singular)
+    projects = relationship(
+        "ProjectTable", secondary=project_sector_association, back_populates="sector"
+    )
+    funds = relationship(
+        "FundTable",
+        secondary=fund_sectors_association,
+        back_populates="sectors",
+    )
+
+
+class ProjectTable(Base, TimestampMixin):
+    """ORM model for the ``projects`` table.
+
+    A named project with optional valuation, stage, location, description
+    and start/end dates, linked many-to-many to sectors, investors and
+    companies.
+    """
+
+    __tablename__ = "projects"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String, nullable=False)
+    valuation = Column(Integer, nullable=True)
+
+    # Restricted to the members of the InvestmentStage enum
+    stage = Column(Enum(InvestmentStage), nullable=True)
+    location = Column(String, nullable=True)
+    description = Column(Text, nullable=True)
+    # Declared without timezone=True, unlike the TimestampMixin columns
+    start_date = Column(DateTime, nullable=True)
+    end_date = Column(DateTime, nullable=True)
+
+    # Singular name, but this is a many-to-many collection of sectors
+    # (it goes through project_sector_association).
+    sector = relationship(
+        "SectorTable", secondary=project_sector_association, back_populates="projects"
+    )
+    investors = relationship(
+        "InvestorTable",
+        secondary=project_investor_association,
+        back_populates="projects",
+    )
+    companies = relationship(
+        "CompanyTable", secondary=project_company_association, back_populates="projects"
+    )