# Anton_wireframe/app/services/schema.py

from sqlalchemy import Column, Integer, String, Text, DateTime, JSON, Float
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.sql import func
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import json

Base = declarative_base()

class Investor(Base):
    """ORM model mapped to the ``investors`` table.

    Holds an investor's structured profile fields (description, thesis focus,
    headquarters, AUM, funds) alongside raw text columns kept for reference,
    plus created/updated timestamps.
    """

    __tablename__ = 'investors'

    # Auto-incrementing integer primary key.
    id = Column(Integer, primary_key=True, autoincrement=True)
    # The only non-key column declared NOT NULL.
    name = Column(String(500), nullable=False)
    website = Column(String(1000))

    # Core investment information
    investor_description = Column(Text)
    investment_thesis_focus = Column(JSON)  # List of focus areas
    headquarters = Column(String(1000))

    # AUM information
    # All three are plain String columns, so the amount and the as-of date are
    # stored as text; the column types impose no numeric or date format.
    aum_amount = Column(String(200))
    aum_as_of_date = Column(String(100))
    aum_source_url = Column(String(1000))

    # Fund information
    funds_info = Column(JSON)  # Complex fund data

    # Raw data columns for reference
    crunchbase_urls = Column(Text)
    crunchbase_extract = Column(Text)
    linkedin_profile = Column(Text)
    source_truth_profile = Column(Text)

    # Metadata
    # created_at is filled by the database via a server-side default of now().
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    # NOTE(review): updated_at has only ``onupdate`` and no default or
    # server_default, so it presumably stays NULL until the row is first
    # updated — confirm that is intended.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    def __repr__(self):
        """Return a short debug string showing the investor's name and website."""
        return f"<Investor(name='{self.name}', website='{self.website}')>"

# Pydantic models for data validation and parsing
class AUMInfo(BaseModel):
    """Assets-under-management figure together with its date and source.

    Every field is an optional string that defaults to ``None``.
    """

    # AUM amount kept as text (no numeric parsing is done by this model).
    aumAmount: Optional[str] = None
    # Date the amount refers to, kept as text.
    asOfDate: Optional[str] = None
    # URL the figure was taken from.
    sourceUrl: Optional[str] = None

class FundInfo(BaseModel):
    """Description of a single fund.

    Every field is an optional string that defaults to ``None``; size and
    vintage are kept as text rather than as numbers or dates.
    """

    fundName: Optional[str] = None
    fundSize: Optional[str] = None
    vintage: Optional[str] = None
    status: Optional[str] = None
    description: Optional[str] = None

class InvestorProfile(BaseModel):
    """Structured investor profile with nested AUM and fund details.

    All fields are optional and default to ``None``. The camelCase names
    ``investorDescription`` and ``investmentThesisFocus`` are the same keys
    that ``CSVRow`` reads out of its JSON ``investment_firm_profile`` string,
    so this model presumably describes that JSON document — confirm against
    the data producer.
    """

    websiteURL: Optional[str] = None
    investorDescription: Optional[str] = None
    investmentThesisFocus: Optional[List[str]] = None
    headquarters: Optional[str] = None
    # Nested AUM block; see AUMInfo for its fields.
    overallAssetsUnderManagement: Optional[AUMInfo] = None
    # Zero or more funds; see FundInfo for the per-fund fields.
    funds: Optional[List[FundInfo]] = None

class CSVRow(BaseModel):
    """One row of investor input data.

    Only ``name`` is required. ``investment_firm_profile`` is treated as a
    JSON-encoded object by the helper methods below; the other optional
    fields are used as plain text.
    """

    name: str
    website: Optional[str] = None
    investment_firm_profile: Optional[str] = None
    crunchbase_linkedin_urls: Optional[str] = None
    crunchbase_firm_extract: Optional[str] = None
    linkedin_investment_profile: Optional[str] = None
    source_of_truth_profile: Optional[str] = None

    def _profile_dict(self) -> Dict[str, Any]:
        """Decode ``investment_firm_profile`` into a dict.

        Returns:
            The decoded JSON object, or an empty dict when the field is
            empty, is not valid JSON, or decodes to something other than
            an object (list, string, number, ...).
        """
        if not self.investment_firm_profile:
            return {}
        try:
            decoded = json.loads(self.investment_firm_profile)
        except (json.JSONDecodeError, TypeError):
            # Best effort: a malformed profile is treated as "no profile".
            return {}
        return decoded if isinstance(decoded, dict) else {}

    def get_combined_description(self) -> str:
        """Combine all description fields for vector embedding.

        The pieces are joined with single spaces in this order: the
        ``investorDescription`` value from the JSON profile, the Crunchbase
        extract, the LinkedIn profile, and the source-of-truth profile.
        Empty or missing pieces are skipped.

        Returns:
            The joined text, or ``""`` when no piece is available.
        """
        parts: List[str] = []

        summary = self._profile_dict().get('investorDescription')
        # Only accept real text: a truthy non-string JSON value (number,
        # object, list) would otherwise make ``" ".join`` raise TypeError.
        if isinstance(summary, str) and summary:
            parts.append(summary)

        free_text_fields = (
            self.crunchbase_firm_extract,
            self.linkedin_investment_profile,
            self.source_of_truth_profile,
        )
        parts.extend(text for text in free_text_fields if text)

        return " ".join(parts)

    def get_investment_focus(self) -> List[str]:
        """Extract investment thesis focus.

        Returns:
            The ``investmentThesisFocus`` list from the JSON profile, or an
            empty list when the profile is missing/invalid or the value is
            not a list. The list items themselves are returned unchanged.
        """
        focus = self._profile_dict().get('investmentThesisFocus', [])
        return focus if isinstance(focus, list) else []