116 lines
3.7 KiB
Python
116 lines
3.7 KiB
Python
|
|
import json
|
||
|
|
from typing import List, Optional
|
||
|
|
|
||
|
|
from pydantic import BaseModel
|
||
|
|
from sqlalchemy import JSON, Column, DateTime, Integer, String, Text
|
||
|
|
from sqlalchemy.ext.declarative import declarative_base
|
||
|
|
from sqlalchemy.sql import func
|
||
|
|
|
||
|
|
# Declarative base class; ORM models in this module (e.g. Investor) subclass
# it so their tables are registered on Base.metadata.
Base = declarative_base()
|
||
|
|
|
||
|
|
|
||
|
|
class Investor(Base):
    """ORM mapping for one row of the ``investors`` table.

    Scalar facts about the firm are stored in string/text columns, while
    list- or object-shaped data (thesis focus, funds) is stored in JSON
    columns.
    """

    __tablename__ = "investors"

    # Identity
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(500), nullable=False)
    website = Column(String(1000))

    # Core investment information
    investor_description = Column(Text)
    investment_thesis_focus = Column(JSON)  # list of focus areas
    headquarters = Column(String(1000))

    # Assets under management; amount and date are kept as strings,
    # not as numeric/date columns.
    aum_amount = Column(String(200))
    aum_as_of_date = Column(String(100))
    aum_source_url = Column(String(1000))

    # Fund information (complex, nested fund data)
    funds_info = Column(JSON)

    # Raw source columns, kept for reference
    crunchbase_urls = Column(Text)
    crunchbase_extract = Column(Text)
    linkedin_profile = Column(Text)
    source_truth_profile = Column(Text)

    # Row metadata. created_at is filled by the database default on insert.
    # updated_at only has an onupdate hook and no default, so it stays NULL
    # until the row is first updated.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    def __repr__(self):
        """Return a short debug string showing the name and website."""
        return "<Investor(name='{}', website='{}')>".format(
            self.name, self.website
        )
|
||
|
|
|
||
|
|
|
||
|
|
# Pydantic models for data validation and parsing
|
||
|
|
class AUMInfo(BaseModel):
    # Assets-under-management figure; every field is optional and defaults
    # to None. Documented with comments rather than a docstring so the
    # model's generated schema description is left unchanged.
    aumAmount: Optional[str] = None  # amount, kept as a string
    asOfDate: Optional[str] = None  # date the amount refers to, as a string
    sourceUrl: Optional[str] = None  # URL the figure was taken from
|
||
|
|
|
||
|
|
|
||
|
|
class FundInfo(BaseModel):
    # Description of a single fund; every field is an optional string that
    # defaults to None. Documented with comments rather than a docstring so
    # the model's generated schema description is left unchanged.
    fundName: Optional[str] = None
    fundSize: Optional[str] = None
    vintage: Optional[str] = None
    status: Optional[str] = None
    description: Optional[str] = None
|
||
|
|
|
||
|
|
|
||
|
|
class InvestorProfile(BaseModel):
    # Structured investor profile; all fields are optional and default to
    # None. Documented with comments rather than a docstring so the model's
    # generated schema description is left unchanged.
    # NOTE(review): presumably the schema of the JSON stored in
    # CSVRow.investment_firm_profile (key names match) - confirm.
    websiteURL: Optional[str] = None
    investorDescription: Optional[str] = None
    investmentThesisFocus: Optional[List[str]] = None  # list of focus areas
    headquarters: Optional[str] = None
    overallAssetsUnderManagement: Optional[AUMInfo] = None
    funds: Optional[List[FundInfo]] = None
|
||
|
|
|
||
|
|
|
||
|
|
class CSVRow(BaseModel):
    # One investor record as read from the CSV input. Only ``name`` is
    # required; the remaining columns are optional strings.
    # ``investment_firm_profile`` is expected to hold a JSON document, which
    # the helper methods below decode on demand.
    name: str
    website: Optional[str] = None
    investment_firm_profile: Optional[str] = None
    crunchbase_linkedin_urls: Optional[str] = None
    crunchbase_firm_extract: Optional[str] = None
    linkedin_investment_profile: Optional[str] = None
    source_of_truth_profile: Optional[str] = None

    def _load_profile(self) -> dict:
        """Decode ``investment_firm_profile`` into a dict.

        Returns:
            The decoded JSON object, or an empty dict when the field is
            empty, is not valid JSON, or decodes to anything other than a
            JSON object.
        """
        if not self.investment_firm_profile:
            return {}
        try:
            decoded = json.loads(self.investment_firm_profile)
        except (json.JSONDecodeError, TypeError):
            # Best effort: a malformed profile is treated as absent.
            return {}
        return decoded if isinstance(decoded, dict) else {}

    def get_combined_description(self) -> str:
        """Combine all description fields for vector embedding.

        The parts are, in order: the ``investorDescription`` value from the
        JSON profile, the Crunchbase extract, the LinkedIn profile and the
        source-of-truth profile. Empty or missing parts are skipped.

        Returns:
            The remaining parts joined by single spaces, or ``""`` when
            there are none.
        """
        descriptions = []

        profile_description = self._load_profile().get("investorDescription")
        # The JSON value is not validated, so it may be a number, list or
        # object; only non-empty strings are kept so the join below cannot
        # raise TypeError.
        if isinstance(profile_description, str) and profile_description:
            descriptions.append(profile_description)

        for text in (
            self.crunchbase_firm_extract,
            self.linkedin_investment_profile,
            self.source_of_truth_profile,
        ):
            if text:
                descriptions.append(text)

        return " ".join(descriptions)

    def get_investment_focus(self) -> List[str]:
        """Extract the investment thesis focus from the JSON profile.

        Returns:
            The ``investmentThesisFocus`` value when the profile decodes to
            a JSON object and that value is a list; otherwise an empty list.
        """
        focus = self._load_profile().get("investmentThesisFocus", [])
        return focus if isinstance(focus, list) else []
|