import json
from typing import List, Optional

from pydantic import BaseModel
from sqlalchemy import JSON, Column, DateTime, Integer, String, Text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql import func

Base = declarative_base()


class Investor(Base):
    """ORM model mapped to the ``investors`` table."""

    __tablename__ = "investors"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(500), nullable=False)
    website = Column(String(1000))

    # Core investment information
    investor_description = Column(Text)
    investment_thesis_focus = Column(JSON)  # List of focus areas
    headquarters = Column(String(1000))

    # AUM information (kept as free-form strings, not numeric/date types)
    aum_amount = Column(String(200))
    aum_as_of_date = Column(String(100))
    aum_source_url = Column(String(1000))

    # Fund information
    funds_info = Column(JSON)  # Complex fund data

    # Raw data columns for reference
    crunchbase_urls = Column(Text)
    crunchbase_extract = Column(Text)
    linkedin_profile = Column(Text)
    source_truth_profile = Column(Text)

    # Metadata
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    # Only ``onupdate`` is configured: no insert-time default is set here, so
    # this column stays NULL until the row is first updated.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    def __repr__(self) -> str:
        """Return a short debug representation with the primary key and name."""
        return f"<Investor(id={self.id}, name={self.name!r})>"


# Pydantic models for data validation and parsing
class AUMInfo(BaseModel):
    """Assets-under-management details; every field is optional."""

    aumAmount: Optional[str] = None
    asOfDate: Optional[str] = None
    sourceUrl: Optional[str] = None


class FundInfo(BaseModel):
    """Description of a single fund; every field is optional."""

    fundName: Optional[str] = None
    fundSize: Optional[str] = None
    vintage: Optional[str] = None
    status: Optional[str] = None
    description: Optional[str] = None


class InvestorProfile(BaseModel):
    """Structured investor profile.

    The ``investorDescription`` and ``investmentThesisFocus`` field names match
    the keys that ``CSVRow`` reads from its ``investment_firm_profile`` JSON.
    NOTE(review): presumably the remaining fields mirror that same JSON
    payload -- confirm against the data source.
    """

    websiteURL: Optional[str] = None
    investorDescription: Optional[str] = None
    investmentThesisFocus: Optional[List[str]] = None
    headquarters: Optional[str] = None
    overallAssetsUnderManagement: Optional[AUMInfo] = None
    funds: Optional[List[FundInfo]] = None


def _parse_profile_json(raw: Optional[str]) -> dict:
    """Decode a JSON profile string into a dict.

    Returns an empty dict when *raw* is empty, is not valid JSON, or decodes
    to anything other than a JSON object, so callers can use ``.get`` freely.
    """
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # Best-effort parsing: a malformed profile is treated as "no profile".
        return {}
    return parsed if isinstance(parsed, dict) else {}


class CSVRow(BaseModel):
    """One input row; only ``name`` is required."""

    name: str
    website: Optional[str] = None
    investment_firm_profile: Optional[str] = None  # parsed as JSON by the methods below
    crunchbase_linkedin_urls: Optional[str] = None
    crunchbase_firm_extract: Optional[str] = None
    linkedin_investment_profile: Optional[str] = None
    source_of_truth_profile: Optional[str] = None

    def get_combined_description(self) -> str:
        """Combine all description fields for vector embedding.

        The parts are joined with a single space, in this order: the
        ``investorDescription`` value from the profile JSON, the Crunchbase
        extract, the LinkedIn profile, and the source-of-truth profile.
        Missing or empty parts are skipped.

        Returns:
            The joined text, or an empty string when no part is available.
        """
        parts: List[str] = []

        profile = _parse_profile_json(self.investment_firm_profile)
        description = profile.get("investorDescription", "")
        # Guard the type: a non-string JSON value (number, list, object) would
        # otherwise make the final ``str.join`` raise TypeError.
        if isinstance(description, str) and description:
            parts.append(description)

        parts.extend(
            text
            for text in (
                self.crunchbase_firm_extract,
                self.linkedin_investment_profile,
                self.source_of_truth_profile,
            )
            if text
        )
        return " ".join(parts)

    def get_investment_focus(self) -> List[str]:
        """Extract investment thesis focus.

        Returns:
            The ``investmentThesisFocus`` list from the profile JSON, or an
            empty list when the profile is missing, unparsable, not a JSON
            object, or the value is not a list. List elements are returned
            as-is, without per-element validation.
        """
        profile = _parse_profile_json(self.investment_firm_profile)
        focus = profile.get("investmentThesisFocus", [])
        return focus if isinstance(focus, list) else []