1 Commits

22 changed files with 166 additions and 26007 deletions
+2 -1
View File
@@ -10,7 +10,8 @@
*__pycache__ *__pycache__
/*.db
*.cypython *.cypython
/preprocessor
Binary file not shown.
Binary file not shown.
Binary file not shown.
+1 -7
View File
@@ -1,5 +1,4 @@
import os import os
from pathlib import Path
from typing import Annotated from typing import Annotated
from fastapi import Depends from fastapi import Depends
@@ -10,11 +9,7 @@ from sqlalchemy.orm import Session, sessionmaker
Base = declarative_base() Base = declarative_base()
# Database configuration # Database configuration
# Use the preprocessor's database for consistency DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")
# Get absolute path to the preprocessor database
# APP_DIR = Path(__file__).parent.parent
# PREPROCESSOR_DB = APP_DIR.parent / "preprocessor" / "version_two.db"
DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./version_two.db")
# Create engine # Create engine
engine = create_engine(DATABASE_URL, echo=False) engine = create_engine(DATABASE_URL, echo=False)
@@ -43,7 +38,6 @@ def get_session_sync() -> Session:
"""Get a database session for synchronous operations""" """Get a database session for synchronous operations"""
return SessionLocal() return SessionLocal()
def get_db_session(): def get_db_session():
"""Get a database session for direct use.""" """Get a database session for direct use."""
return SessionLocal() return SessionLocal()
+10 -117
View File
@@ -2,7 +2,7 @@ import enum
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Table, Text, func from sqlalchemy import Column, DateTime, ForeignKey, Integer, String, Table, Text, func
from sqlalchemy.orm import declarative_mixin, relationship from sqlalchemy.orm import declarative_mixin, relationship
from sqlalchemy.types import JSON, Enum from sqlalchemy.types import Enum
from db.db import Base from db.db import Base
@@ -70,22 +70,6 @@ project_company_association = Table(
Column("company_id", Integer, ForeignKey("companies.id")), Column("company_id", Integer, ForeignKey("companies.id")),
) )
# Association table for fund-stage many-to-many
fund_investment_stages_association = Table(
"fund_investment_stages",
Base.metadata,
Column("fund_id", Integer, ForeignKey("funds.id")),
Column("stage_id", Integer, ForeignKey("investment_stages.id")),
)
# Association table for fund-sector many-to-many
fund_sectors_association = Table(
"fund_sectors",
Base.metadata,
Column("fund_id", Integer, ForeignKey("funds.id")),
Column("sector_id", Integer, ForeignKey("sectors.id")),
)
class InvestorTable(Base, TimestampMixin): class InvestorTable(Base, TimestampMixin):
__tablename__ = "investors" __tablename__ = "investors"
@@ -93,47 +77,14 @@ class InvestorTable(Base, TimestampMixin):
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
description = Column(Text, nullable=True) description = Column(Text, nullable=True)
aum = Column(Integer, nullable=True) # Assets Under Management
# Basic investor info check_size_lower = Column(Integer, nullable=True) # Lower bound
website = Column(String, nullable=True) check_size_upper = Column(Integer, nullable=True) # Upper bound
headquarters = Column(String, nullable=True)
# AUM fields
aum = Column(Integer, nullable=True) # Store as integer for numerical filtering
aum_as_of_date = Column(String, nullable=True)
aum_source_url = Column(String, nullable=True)
# Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
check_size_lower = Column(Integer, nullable=True)
check_size_upper = Column(Integer, nullable=True)
# Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
geographic_focus = Column(String, nullable=True) geographic_focus = Column(String, nullable=True)
stage_focus = Column(Enum(InvestmentStage), nullable=True)
# Investment thesis and portfolio
investment_thesis = Column(JSON, nullable=True) # Array of thesis statements
portfolio_highlights = Column(
JSON, nullable=True
) # Array of portfolio company names
linked_documents = Column(JSON, nullable=True) # Array of document URLs
# Research metadata
researcher_notes = Column(Text, nullable=True)
missing_important_fields = Column(
JSON, nullable=True
) # Array of missing field names
sources = Column(JSON, nullable=True) # JSON object with source URLs
# Portfolio info
number_of_investments = Column(Integer, default=0, nullable=True) number_of_investments = Column(Integer, default=0, nullable=True)
# Relationships team_members = relationship("InvestorMember", back_populates="investor")
team_members = relationship(
"InvestorMember", back_populates="investor", cascade="all, delete-orphan"
)
funds = relationship(
"FundTable", back_populates="investor", cascade="all, delete-orphan"
)
# Relationship to portfolio companies # Relationship to portfolio companies
portfolio_companies = relationship( portfolio_companies = relationship(
@@ -160,51 +111,12 @@ class InvestorMember(Base, TimestampMixin):
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
role = Column(String, nullable=True) role = Column(String, nullable=True)
title = Column(String, nullable=True) # Alternative to role
email = Column(String, nullable=True) email = Column(String, nullable=True)
source_url = Column(String, nullable=True) # URL where member info was found
investor_id = Column(Integer, ForeignKey("investors.id")) investor_id = Column(Integer, ForeignKey("investors.id"))
investor = relationship("InvestorTable", back_populates="team_members") investor = relationship("InvestorTable", back_populates="team_members")
class FundTable(Base, TimestampMixin):
__tablename__ = "funds"
id = Column(Integer, primary_key=True, index=True)
investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)
# Fund details
fund_name = Column(String, nullable=True)
fund_size = Column(
Integer, nullable=True
) # Store as integer for numerical filtering
fund_size_source_url = Column(String, nullable=True)
# Check size range (parsed from estimated_investment_size by LLM)
check_size_lower = Column(Integer, nullable=True)
check_size_upper = Column(Integer, nullable=True)
source_url = Column(String, nullable=True)
source_provider = Column(String, nullable=True) # e.g., "Perplexity"
# Geographic focus as simple string
geographic_focus = Column(String, nullable=True)
# Relationships
investor = relationship("InvestorTable", back_populates="funds")
investment_stages = relationship(
"InvestmentStageTable",
secondary=fund_investment_stages_association,
back_populates="funds",
)
sectors = relationship(
"SectorTable",
secondary=fund_sectors_association,
back_populates="funds",
)
class CompanyTable(Base, TimestampMixin): class CompanyTable(Base, TimestampMixin):
__tablename__ = "companies" __tablename__ = "companies"
@@ -216,9 +128,7 @@ class CompanyTable(Base, TimestampMixin):
founded_year = Column(Integer, nullable=True) founded_year = Column(Integer, nullable=True)
website = Column(String, nullable=True) website = Column(String, nullable=True)
members = relationship( members = relationship("CompanyMember", back_populates="company")
"CompanyMember", back_populates="company", cascade="all, delete-orphan"
)
# Relationship back to investors # Relationship back to investors
investors = relationship( investors = relationship(
"InvestorTable", "InvestorTable",
@@ -248,43 +158,26 @@ class CompanyMember(Base, TimestampMixin):
company = relationship("CompanyTable", back_populates="members") company = relationship("CompanyTable", back_populates="members")
class InvestmentStageTable(Base, TimestampMixin):
__tablename__ = "investment_stages"
id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False, unique=True)
# Relationships
funds = relationship(
"FundTable",
secondary=fund_investment_stages_association,
back_populates="investment_stages",
)
class SectorTable(Base, TimestampMixin): class SectorTable(Base, TimestampMixin):
__tablename__ = "sectors" __tablename__ = "sectors"
id = Column(Integer, primary_key=True, index=True) id = Column(Integer, primary_key=True, index=True)
name = Column(String, nullable=False) name = Column(String, nullable=False)
# Relationships # Add relationship back to investors
investors = relationship( investors = relationship(
"InvestorTable", "InvestorTable",
secondary=investor_sector_association, secondary=investor_sector_association,
back_populates="sectors", back_populates="sectors",
) )
companies = relationship( companies = relationship(
"CompanyTable", secondary=company_sector_association, back_populates="sectors" "CompanyTable", secondary=company_sector_association, back_populates="sectors"
) )
projects = relationship( projects = relationship(
"ProjectTable", secondary=project_sector_association, back_populates="sector" "ProjectTable", secondary=project_sector_association, back_populates="sector"
) )
funds = relationship(
"FundTable",
secondary=fund_sectors_association,
back_populates="sectors",
)
class ProjectTable(Base, TimestampMixin): class ProjectTable(Base, TimestampMixin):
+5 -29
View File
@@ -44,27 +44,6 @@ def health():
async def parse_csv( async def parse_csv(
db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...) db: db_dependency, file: UploadFile = File(...), is_investor: int = Form(...)
): ):
"""
Parse and import CSV data into the database.
**For investors:**
- Expected columns: Name, Website, Final Investor Profile, Final Profile sourcing
- Manually parses JSON profiles for efficiency
- Uses LLM only for currency conversion to USD
- Handles AUM, fund sizes, and check sizes as integers
**For companies:**
- Expected columns: Name, Website, Investor, Final Investor Profile (company profile)
- 100% manual JSON parsing - no LLM needed
- Extracts company details, executives, investors, and client categories
- Automatically links companies to investors in database
**Benefits:**
- Fast processing (5-10s per record)
- Low cost (minimal or no LLM usage)
- Accurate data extraction
- Automatic database persistence
"""
# Read uploaded CSV with pandas # Read uploaded CSV with pandas
content = await file.read() content = await file.read()
df = pd.read_csv(io.StringIO(content.decode("utf-8"))) df = pd.read_csv(io.StringIO(content.decode("utf-8")))
@@ -73,15 +52,12 @@ async def parse_csv(
processor = InvestorProcessor() processor = InvestorProcessor()
if is_investor == 1: if is_investor == 1:
# Manual parser with LLM currency conversion results = await processor.parse_investors(df)
results = await processor.parse_investors(df, save_to_db=True)
# Results are already dicts from the new parser
return results
else: else:
# Manual parser for companies (no LLM needed) results = await processor.parse_companies(df)
results = await processor.parse_companies(df, save_to_db=True)
# Results are already dicts from the new parser # Convert Pydantic objects to dictionaries
return results return [r.model_dump() for r in results]
@app.post("/query", response_model=InvestorList, tags=["Querying"]) @app.post("/query", response_model=InvestorList, tags=["Querying"])
Binary file not shown.
+49 -313
View File
@@ -4,11 +4,8 @@ from db.db import get_db
from db.models import InvestorTable, SectorTable from db.models import InvestorTable, SectorTable
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel from pydantic import BaseModel
from schemas.router_schemas import ( from schemas.router_schemas import InvestmentStage, InvestorData
InvestmentStage, from services.querying import QueryProcessor
InvestorData,
InvestorFundData,
)
from sqlalchemy.orm import Session, selectinload from sqlalchemy.orm import Session, selectinload
router = APIRouter(tags=["Investor Routes"]) router = APIRouter(tags=["Investor Routes"])
@@ -37,95 +34,34 @@ class InvestorUpdate(BaseModel):
number_of_investments: Optional[int] = None number_of_investments: Optional[int] = None
@router.get("/investors", response_model=List[InvestorFundData]) @router.get("/investors", response_model=List[InvestorData])
def read_investors(db: Session = Depends(get_db)): def read_investors(db: Session = Depends(get_db)):
"""Get all investors with their funds as separate entries """Get all investors with their related data"""
Each investor-fund combination is returned as a separate row.
An investor with 3 funds will appear as 3 entries.
"""
investors = ( investors = (
db.query(InvestorTable) db.query(InvestorTable)
.options( .options(
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.all() .all()
) )
# Transform to InvestorFundData format (one row per investor-fund combination) # Transform InvestorTable objects to InvestorData format
investor_fund_list = [] investor_data_list = []
for investor in investors: for investor in investors:
# If investor has funds, create one entry per fund investor_data = InvestorData(
if investor.funds: investor=investor, # This maps to InvestorSchema
for fund in investor.funds: portfolio_companies=investor.portfolio_companies,
investor_fund_data = InvestorFundData( team_members=investor.team_members,
# Investor fields sectors=investor.sectors,
investor_id=investor.id, )
investor_name=investor.name, investor_data_list.append(investor_data)
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
fund_investment_stages=fund.investment_stages, # Now a relationship
fund_sectors=fund.sectors, # Now a relationship
# Related data (same for all funds of this investor)
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
fund_investment_stages=None,
fund_sectors=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_fund_list return investor_data_list
@router.get("/investors/filter", response_model=List[InvestorFundData]) @router.get("/investors/filter", response_model=List[InvestorData])
def filter_investors( def filter_investors(
stage: Optional[InvestmentStage] = Query( stage: Optional[InvestmentStage] = Query(
None, description="Filter by investment stage" None, description="Filter by investment stage"
@@ -140,18 +76,13 @@ def filter_investors(
max_aum: Optional[int] = Query(None, description="Maximum AUM"), max_aum: Optional[int] = Query(None, description="Maximum AUM"),
db: Session = Depends(get_db), db: Session = Depends(get_db),
): ):
"""Filter investors based on various criteria """Filter investors based on various criteria"""
Returns investor-fund combinations as separate rows.
An investor with 3 funds will appear as 3 entries.
"""
# Start with base query # Start with base query
query = db.query(InvestorTable).options( query = db.query(InvestorTable).options(
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
# Apply filters # Apply filters
@@ -181,86 +112,29 @@ def filter_investors(
investors = query.all() investors = query.all()
# Transform to InvestorFundData format (one row per investor-fund combination) # Transform to InvestorData format
investor_fund_list = [] investor_data_list = []
for investor in investors: for investor in investors:
# If investor has funds, create one entry per fund investor_data = InvestorData(
if investor.funds: investor=investor,
for fund in investor.funds: portfolio_companies=investor.portfolio_companies,
investor_fund_data = InvestorFundData( team_members=investor.team_members,
# Investor fields sectors=investor.sectors,
investor_id=investor.id, )
investor_name=investor.name, investor_data_list.append(investor_data)
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
fund_investment_stages=fund.investment_stages, # Now a relationship
fund_sectors=fund.sectors, # Now a relationship
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
fund_investment_stages=None,
fund_sectors=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_fund_list return investor_data_list
@router.get("/investors/{investor_id}", response_model=InvestorData) @router.get("/investors/{investor_id}", response_model=InvestorData)
def read_investor(investor_id: int, db: Session = Depends(get_db)): def read_investor(investor_id: int, db: Session = Depends(get_db)):
"""Get a specific investor by ID with all their funds""" """Get a specific investor by ID"""
investor = ( investor = (
db.query(InvestorTable) db.query(InvestorTable)
.options( .options(
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.filter(InvestorTable.id == investor_id) .filter(InvestorTable.id == investor_id)
.first() .first()
@@ -269,13 +143,12 @@ def read_investor(investor_id: int, db: Session = Depends(get_db)):
if not investor: if not investor:
raise HTTPException(status_code=404, detail="Investor not found") raise HTTPException(status_code=404, detail="Investor not found")
# Transform to InvestorData format (includes funds array) # Transform to InvestorData format
return InvestorData( return InvestorData(
investor=investor, investor=investor,
portfolio_companies=investor.portfolio_companies, portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members, team_members=investor.team_members,
sectors=investor.sectors, sectors=investor.sectors,
funds=investor.funds,
) )
@@ -294,7 +167,6 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.filter(InvestorTable.id == db_investor.id) .filter(InvestorTable.id == db_investor.id)
.first() .first()
@@ -306,75 +178,16 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
portfolio_companies=investor_with_relations.portfolio_companies, portfolio_companies=investor_with_relations.portfolio_companies,
team_members=investor_with_relations.team_members, team_members=investor_with_relations.team_members,
sectors=investor_with_relations.sectors, sectors=investor_with_relations.sectors,
funds=investor_with_relations.funds,
) )
@router.put("/investors/{investor_id}", response_model=InvestorData) @router.get("/investors/{investor_id}/similar", response_model=List[InvestorData])
def update_investor(
investor_id: int, investor: InvestorUpdate, db: Session = Depends(get_db)
):
"""Update an existing investor"""
db_investor = (
db.query(InvestorTable).filter(InvestorTable.id == investor_id).first()
)
if not db_investor:
raise HTTPException(status_code=404, detail="Investor not found")
update_data = investor.dict(exclude_unset=True)
for field, value in update_data.items():
setattr(db_investor, field, value)
db.commit()
db.refresh(db_investor)
# Reload with relationships
investor_with_relations = (
db.query(InvestorTable)
.options(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id == investor_id)
.first()
)
# Transform to InvestorData format
return InvestorData(
investor=investor_with_relations,
portfolio_companies=investor_with_relations.portfolio_companies,
team_members=investor_with_relations.team_members,
sectors=investor_with_relations.sectors,
funds=investor_with_relations.funds,
)
@router.delete("/investors/{investor_id}")
def delete_investor(investor_id: int, db: Session = Depends(get_db)):
"""Delete an investor"""
db_investor = (
db.query(InvestorTable).filter(InvestorTable.id == investor_id).first()
)
if not db_investor:
raise HTTPException(status_code=404, detail="Investor not found")
db.delete(db_investor)
db.commit()
return {"message": "Investor deleted successfully"}
@router.get("/investors/{investor_id}/similar", response_model=List[InvestorFundData])
def find_similar_investors( def find_similar_investors(
investor_id: int, investor_id: int,
limit: int = Query(10, description="Maximum number of similar investors to return"), limit: int = Query(10, description="Maximum number of similar investors to return"),
db: Session = Depends(get_db), db: Session = Depends(get_db)
): ):
"""Find investors similar to a given investor based on characteristics """Find investors similar to a given investor based on characteristics"""
Returns investor-fund combinations as separate rows.
"""
# Get the target investor # Get the target investor
target_investor = ( target_investor = (
@@ -383,7 +196,6 @@ def find_similar_investors(
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.filter(InvestorTable.id == investor_id) .filter(InvestorTable.id == investor_id)
.first() .first()
@@ -402,7 +214,6 @@ def find_similar_investors(
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.filter(InvestorTable.id != investor_id) .filter(InvestorTable.id != investor_id)
.all() .all()
@@ -419,38 +230,21 @@ def find_similar_investors(
# Geographic focus match (20 points for exact, 10 for partial) # Geographic focus match (20 points for exact, 10 for partial)
if candidate.geographic_focus and target_investor.geographic_focus: if candidate.geographic_focus and target_investor.geographic_focus:
if ( if candidate.geographic_focus.lower() == target_investor.geographic_focus.lower():
candidate.geographic_focus.lower()
== target_investor.geographic_focus.lower()
):
score += 20 score += 20
elif ( elif (candidate.geographic_focus.lower() in target_investor.geographic_focus.lower() or
candidate.geographic_focus.lower() target_investor.geographic_focus.lower() in candidate.geographic_focus.lower()):
in target_investor.geographic_focus.lower()
or target_investor.geographic_focus.lower()
in candidate.geographic_focus.lower()
):
score += 10 score += 10
# Check size overlap (20 points max) # Check size overlap (20 points max)
if ( if (candidate.check_size_lower and candidate.check_size_upper and
candidate.check_size_lower target_investor.check_size_lower and target_investor.check_size_upper):
and candidate.check_size_upper
and target_investor.check_size_lower
and target_investor.check_size_upper
):
# Calculate overlap percentage # Calculate overlap percentage
overlap_start = max( overlap_start = max(candidate.check_size_lower, target_investor.check_size_lower)
candidate.check_size_lower, target_investor.check_size_lower overlap_end = min(candidate.check_size_upper, target_investor.check_size_upper)
)
overlap_end = min(
candidate.check_size_upper, target_investor.check_size_upper
)
if overlap_end > overlap_start: if overlap_end > overlap_start:
overlap = overlap_end - overlap_start overlap = overlap_end - overlap_start
target_range = ( target_range = target_investor.check_size_upper - target_investor.check_size_lower
target_investor.check_size_upper - target_investor.check_size_lower
)
overlap_ratio = overlap / target_range if target_range > 0 else 0 overlap_ratio = overlap / target_range if target_range > 0 else 0
score += int(20 * overlap_ratio) score += int(20 * overlap_ratio)
@@ -475,71 +269,13 @@ def find_similar_investors(
scored_investors.sort(key=lambda x: x[0], reverse=True) scored_investors.sort(key=lambda x: x[0], reverse=True)
similar_investors = [inv for score, inv in scored_investors[:limit]] similar_investors = [inv for score, inv in scored_investors[:limit]]
# Transform to InvestorFundData format (one row per investor-fund combination) # Transform to InvestorData format
investor_fund_list = [] return [
for investor in similar_investors: InvestorData(
# If investor has funds, create one entry per fund investor=inv,
if investor.funds: portfolio_companies=inv.portfolio_companies,
for fund in investor.funds: team_members=inv.team_members,
investor_fund_data = InvestorFundData( sectors=inv.sectors,
# Investor fields )
investor_id=investor.id, for inv in similar_investors
investor_name=investor.name, ]
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
fund_investment_stages=fund.investment_stages, # Now a relationship
fund_sectors=fund.sectors, # Now a relationship
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
fund_investment_stages=None,
fund_sectors=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_fund_list
Binary file not shown.
+4
View File
@@ -258,6 +258,10 @@ class InvestorSchema(BaseModel):
default=None, default=None,
description="Geographic investment focus. Do not return any special characters, Just locations separated by commas. Leave empty if not clearly identifiable.", description="Geographic investment focus. Do not return any special characters, Just locations separated by commas. Leave empty if not clearly identifiable.",
) )
stage_focus: InvestmentStage = Field(
default=InvestmentStage.SEED,
description="Investment stage focus. Use SEED as default if uncertain.",
)
number_of_investments: Optional[int] = Field( number_of_investments: Optional[int] = Field(
default=None, default=None,
ge=0, ge=0,
+1 -77
View File
@@ -22,14 +22,6 @@ class SectorSchema(BaseModel):
from_attributes = True from_attributes = True
class InvestmentStageSchema(BaseModel):
id: int
name: str
class Config:
from_attributes = True
class InvestorMemberSchema(BaseModel): class InvestorMemberSchema(BaseModel):
id: int id: int
name: str name: str
@@ -40,25 +32,6 @@ class InvestorMemberSchema(BaseModel):
from_attributes = True from_attributes = True
class FundSchema(BaseModel):
id: int
fund_name: str | None
fund_size: int | None # Changed to int for numerical filtering
fund_size_source_url: str | None
check_size_lower: int | None # NEW: Lower bound of check size range
check_size_upper: int | None # NEW: Upper bound of check size range
source_url: str | None
source_provider: str | None
geographic_focus: str | None # Changed from List[str] to string
investment_stages: List[InvestmentStageSchema] | None # Changed to relationship
sectors: List[SectorSchema] | None # Changed to relationship
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
from_attributes = True
class CompanyMemberSchema(BaseModel): class CompanyMemberSchema(BaseModel):
id: int id: int
name: Optional[str] name: Optional[str]
@@ -103,55 +76,12 @@ class InvestorSchema(BaseModel):
class InvestorData(BaseModel): class InvestorData(BaseModel):
"""Comprehensive investor data schema - used for individual investor requests""" """Comprehensive investor data schema for LLM processing"""
investor: InvestorSchema investor: InvestorSchema
portfolio_companies: List[CompanySchema] portfolio_companies: List[CompanySchema]
team_members: List[InvestorMemberSchema] team_members: List[InvestorMemberSchema]
sectors: List[SectorSchema] sectors: List[SectorSchema]
funds: List[FundSchema]
class Config:
from_attributes = True
class InvestorFundData(BaseModel):
"""Investor-Fund combined data - used for list/filter requests
Each row represents one investor-fund combination.
An investor with 3 funds will appear as 3 separate entries.
"""
# Investor fields
investor_id: int
investor_name: str
investor_description: Optional[str]
investor_website: Optional[str]
investor_headquarters: Optional[str]
aum: int | None
aum_as_of_date: str | None
aum_source_url: str | None
investment_thesis: List[str] | None
portfolio_highlights: List[str] | None
number_of_investments: int | None
# Fund fields
fund_id: int | None
fund_name: str | None
fund_size: int | None # Changed to int for numerical filtering
fund_size_source_url: str | None
check_size_lower: int | None # NEW: Lower bound of check size range
check_size_upper: int | None # NEW: Upper bound of check size range
geographic_focus: str | None # Changed from List[str] to string
fund_investment_stages: (
List[InvestmentStageSchema] | None
) # Changed to relationship
fund_sectors: List[SectorSchema] | None # Changed to relationship
# Related data
portfolio_companies: List[CompanySchema]
team_members: List[InvestorMemberSchema]
sectors: List[SectorSchema]
class Config: class Config:
from_attributes = True from_attributes = True
@@ -169,9 +99,3 @@ class CompanyData(BaseModel): # Renamed from CompaniesData for consistency
class InvestorList(BaseModel): class InvestorList(BaseModel):
investors: List[InvestorData] investors: List[InvestorData]
class InvestorFundList(BaseModel):
    """Wrapper holding a list of investor-fund combinations."""

    investor_funds: List[InvestorFundData]
Binary file not shown.
Binary file not shown.
+84 -681
View File
@@ -1,6 +1,5 @@
import json import asyncio
import os import os
import re
from typing import Optional from typing import Optional
import pandas as pd import pandas as pd
@@ -8,35 +7,15 @@ from db.db import get_db_session
from db.models import ( from db.models import (
CompanyMember, CompanyMember,
CompanyTable, CompanyTable,
FundTable,
InvestmentStageTable,
InvestorMember, InvestorMember,
InvestorTable, InvestorTable,
SectorTable, SectorTable,
) )
from langchain_openai import ChatOpenAI from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from schemas.py_schemas import CompanyData, InvestorData from schemas.py_schemas import CompanyData, InvestorData
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
class CurrencyConversion(BaseModel):
    """Structured-output schema for the LLM's currency conversion answer."""

    # Converted amount in whole US dollars; 0 is the default.
    amount_usd: int = 0
    # One of: high, medium, low.
    confidence: str = "high"
    notes: str = ""
class CheckSizeRange(BaseModel):
    """Structured-output schema for a check-size range parsed by the LLM
    from an estimated-investment-size string."""

    # Both bounds are whole US dollars; 0 is the default.
    lower_bound_usd: int = 0
    upper_bound_usd: int = 0
    # One of: high, medium, low.
    confidence: str = "high"
    notes: str = ""
class InvestorProcessor: class InvestorProcessor:
def __init__(self): def __init__(self):
self.llm = ChatOpenAI( self.llm = ChatOpenAI(
@@ -46,508 +25,9 @@ class InvestorProcessor:
temperature=0, temperature=0,
) )
# Structured LLMs for specific parsing tasks
self.currency_converter_llm = self.llm.with_structured_output(
CurrencyConversion
)
self.check_size_parser_llm = self.llm.with_structured_output(CheckSizeRange)
# Keep legacy structured LLMs for backward compatibility
self.investor_structured_llm = self.llm.with_structured_output(InvestorData) self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
self.company_structured_llm = self.llm.with_structured_output(CompanyData) self.company_structured_llm = self.llm.with_structured_output(CompanyData)
async def convert_to_usd(self, amount_str: str) -> Optional[int]:
    """Convert a free-text money amount to whole US dollars via the LLM.

    Handles inputs such as ``"EUR 850,000,000"``, ``"$5M"``,
    ``"GBP 10-20 million"`` or ``"Approximately EUR 100 million"``.

    Args:
        amount_str: Raw amount text taken from the profile.

    Returns:
        The positive USD integer reported by the model, or ``None`` when the
        input is empty / a placeholder, the model reports a non-positive
        amount, or the call fails.
    """
    # Compare placeholders on a trimmed, case-folded copy so that values such
    # as " not available " or " 0 " never reach the model (which could
    # otherwise invent an amount for them).
    marker = str(amount_str).strip().lower() if amount_str else ""
    if marker in ("", "not available", "0"):
        return None

    try:
        prompt = f"""Convert this amount to USD as an integer (whole number, no decimals).
If it's a range, use the midpoint. If already in USD, just extract the number.
Remove all commas and convert millions/billions to actual numbers.
Amount: {amount_str}
Examples:
- "EUR 850,000,000" -> 935000000 (assuming EUR to USD rate ~1.10)
- "$5M" -> 5000000
- "GBP 10-20 million" -> 18000000 (midpoint 15M * 1.20 rate)
- "Approximately EUR 100 million" -> 110000000
Return only the USD integer amount with current exchange rates."""
        result = await self.currency_converter_llm.ainvoke(prompt)
        # The schema defaults to 0, so non-positive means "no usable amount".
        return result.amount_usd if result.amount_usd > 0 else None
    except Exception as e:
        # Best effort: a failed conversion must not abort the whole profile.
        print(f"Error converting currency '{amount_str}': {e}")
        return None
async def parse_check_size_range(
    self, estimated_investment_str: str
) -> tuple[Optional[int], Optional[int]]:
    """Parse an estimated-investment-size string into USD bounds via the LLM.

    Handles inputs such as ``"EUR 1,000 to 2,000"``, ``"$100K-$500K"``,
    ``"Between $1M and $5M"``, ``"Up to EUR 10 million"`` or ``"$2M typical"``.

    Args:
        estimated_investment_str: Raw check-size text taken from the profile.

    Returns:
        ``(lower_bound_usd, upper_bound_usd)``. A bound is ``None`` when the
        model reports a non-positive value for it; both are ``None`` when the
        input is empty / a placeholder or the call fails.
    """
    # Trimmed, case-folded placeholder check so padded or differently-cased
    # markers are rejected before spending an LLM call on them.
    marker = (
        str(estimated_investment_str).strip().lower()
        if estimated_investment_str
        else ""
    )
    if marker in ("", "not available", "0"):
        return None, None

    try:
        prompt = f"""Parse this check size/investment range into lower and upper bounds in USD as integers.
Input: {estimated_investment_str}
Instructions:
- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds
- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper
- If it says "up to X", use 0 as lower and X as upper
- Convert all currencies to USD using current exchange rates
- Return integers (whole numbers, no decimals)
Examples:
- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200
- "$100K-$500K" -> lower: 100000, upper: 500000
- "Between $1M and $5M" -> lower: 1000000, upper: 5000000
- "Up to EUR 10 million" -> lower: 0, upper: 11000000
- "$2M typical" -> lower: 2000000, upper: 2000000
- "GBP 500K-2M" -> lower: 600000, upper: 2400000
Return the lower and upper bounds in USD."""
        result = await self.check_size_parser_llm.ainvoke(prompt)
        # The schema defaults to 0, so non-positive means "bound not known".
        lower = result.lower_bound_usd if result.lower_bound_usd > 0 else None
        upper = result.upper_bound_usd if result.upper_bound_usd > 0 else None
        # Guard against the model returning the bounds in the wrong order.
        if lower is not None and upper is not None and lower > upper:
            lower, upper = upper, lower
        return lower, upper
    except Exception as e:
        # Best effort: a failed parse must not abort the whole profile.
        print(f"Error parsing check size range '{estimated_investment_str}': {e}")
        return None, None
def parse_json_profile(self, json_str: str) -> Optional[dict]:
    """Decode the JSON profile cell from the CSV into a dictionary.

    Args:
        json_str: Raw cell value. Anything that is not a non-empty string
            (``None``, NaN, numbers, ...) is treated as "no profile".

    Returns:
        The decoded JSON object, or ``None`` when the cell is empty, is not
        valid JSON, or decodes to something other than an object.
    """
    # Missing cells arrive as None or a float NaN; neither is text, so one
    # type check covers them and also keeps json.loads from raising TypeError.
    if not isinstance(json_str, (str, bytes, bytearray)) or not json_str:
        return None

    try:
        profile = json.loads(json_str)
    except ValueError as e:  # JSONDecodeError and bad byte encodings
        print(f"Error parsing JSON: {e}")
        return None

    # Callers use ``profile.get(...)``; a JSON list or scalar is not a profile.
    if not isinstance(profile, dict):
        print(f"Error parsing JSON: expected an object, got {type(profile).__name__}")
        return None
    return profile
async def process_investor_profile(
    self, name: str, website: str, profile_json: str
) -> Optional[dict]:
    """Build a flat investor dictionary from one CSV row.

    Fields are copied from the decoded profile by hand; the LLM helpers are
    awaited only for AUM, fund size and check-size conversion.

    Args:
        name: Investor name from the CSV (surrounding whitespace is removed).
        website: Investor website from the CSV (surrounding whitespace is
            removed).
        profile_json: Raw JSON text of the investor profile.

    Returns:
        The investor dictionary (including ``team_members`` and ``funds``
        lists), or ``None`` when the profile cannot be decoded or processing
        fails.
    """
    profile = self.parse_json_profile(profile_json)
    if not profile:
        return None

    try:
        investor_data = {
            "name": name.strip() if name else None,
            "website": website.strip() if website else None,
            "headquarters": profile.get("headquarters"),
            "description": profile.get("investorDescription"),
            "aum": None,
            "aum_as_of_date": None,
            "aum_source_url": None,
            "investment_thesis": profile.get("investmentThesisFocus", []),
            "portfolio_highlights": profile.get("portfolioHighlights", []),
            "linked_documents": profile.get("linkedDocuments", []),
            "researcher_notes": profile.get("researcherNotes"),
            "missing_important_fields": profile.get("missingImportantFields", []),
            "sources": profile.get("sources", {}),
            "team_members": [],
            "funds": [],
        }

        # Assets under management: the amount goes through the LLM converter,
        # the date and source are copied verbatim.
        aum_data = profile.get("overallAssetsUnderManagement", {})
        if aum_data and isinstance(aum_data, dict):
            aum_amount = aum_data.get("aumAmount")
            if aum_amount and aum_amount != "Not Available":
                investor_data["aum"] = await self.convert_to_usd(aum_amount)
            investor_data["aum_as_of_date"] = aum_data.get("asOfDate")
            investor_data["aum_source_url"] = aum_data.get("sourceUrl")

        # ``or []`` rather than a .get default: the key may be present with an
        # explicit JSON null, which must not abort the whole profile.
        for member in profile.get("seniorLeadership") or []:
            if isinstance(member, dict) and member.get("name"):
                investor_data["team_members"].append(
                    {
                        "name": member.get("name"),
                        "title": member.get("title"),
                        "role": member.get("title"),  # title doubles as role
                        "email": None,
                        "source_url": member.get("sourceUrl"),
                    }
                )

        for fund in profile.get("funds") or []:
            if not isinstance(fund, dict):
                continue

            fund_data = {
                "fund_name": fund.get("fundName"),
                "fund_size": None,
                "fund_size_source_url": fund.get("fundSizeSourceUrl"),
                "check_size_lower": None,
                "check_size_upper": None,
                "source_url": fund.get("sourceUrl"),
                "source_provider": fund.get("sourceProvider"),
                "geographic_focus": None,
                # Normalised to lists so consumers can always iterate them.
                "investment_stage_names": fund.get("investmentStageFocus") or [],
                "sector_names": fund.get("sectorFocus") or [],
            }

            # Geographic focus is flattened to one comma-separated string;
            # str() keeps a stray non-string entry from breaking the join.
            geo_focus = fund.get("geographicFocus", [])
            if geo_focus and isinstance(geo_focus, list):
                fund_data["geographic_focus"] = ", ".join(
                    str(region) for region in geo_focus
                )

            fund_size_str = fund.get("fundSize")
            if fund_size_str and fund_size_str != "Not Available":
                fund_size_usd = await self.convert_to_usd(fund_size_str)
                if fund_size_usd:
                    fund_data["fund_size"] = fund_size_usd

            est_size_str = fund.get("estimatedInvestmentSize")
            if est_size_str and est_size_str != "Not Available":
                check_lower, check_upper = await self.parse_check_size_range(
                    est_size_str
                )
                if check_lower is not None:
                    fund_data["check_size_lower"] = check_lower
                if check_upper is not None:
                    fund_data["check_size_upper"] = check_upper

            investor_data["funds"].append(fund_data)

        return investor_data
    except Exception as e:
        # Best effort per row: report and let the caller move on.
        print(f"Error processing investor profile for {name}: {e}")
        return None
async def process_company_profile(
    self,
    name: str,
    website: str,
    profile_json: str,
    investor_names: Optional[str] = None,
) -> Optional[dict]:
    """Build a flat company dictionary from one CSV row (no LLM involved).

    Args:
        name: Company name from the CSV (surrounding whitespace is removed).
        website: Company website from the CSV (surrounding whitespace is
            removed).
        profile_json: Raw JSON text of the company profile.
        investor_names: Optional comma-separated investor names.

    Returns:
        The company dictionary, or ``None`` when the profile cannot be decoded
        or processing fails.
    """
    from datetime import date

    profile = self.parse_json_profile(profile_json)
    if not profile:
        return None

    try:
        company_data = {
            "name": name.strip() if name else None,
            "website": website.strip() if website else None,
            "description": profile.get("companyDescription"),
            "location": profile.get("geographicFocus"),
            "industry": profile.get("sectorDescription"),
            "founded_year": None,  # only filled from the description below
            "key_executives": [],
            "client_categories": profile.get("clientCategories", []),
            "product_description": profile.get("productDescription"),
            "linked_documents": profile.get("linkedDocuments", []),
            "researcher_notes": profile.get("researcherNotes"),
            "missing_important_fields": profile.get("missingImportantFields", []),
            "sources": profile.get("sources", {}),
            "investor_names": [],
        }

        # Investor column: comma-separated, blanks dropped.
        if investor_names and pd.notna(investor_names):
            pieces = (piece.strip() for piece in str(investor_names).split(","))
            company_data["investor_names"] = [piece for piece in pieces if piece]

        # Prefer keyExecutives, fall back to seniorLeadership. ``or`` also
        # covers keys that are present with an explicit JSON null.
        key_executives = (
            profile.get("keyExecutives") or profile.get("seniorLeadership") or []
        )
        for exec_member in key_executives:
            if isinstance(exec_member, dict) and exec_member.get("name"):
                company_data["key_executives"].append(
                    {
                        "name": exec_member.get("name"),
                        "title": exec_member.get("title"),
                        "source_url": exec_member.get("sourceUrl"),
                    }
                )

        # Founding year: first pattern (in priority order) whose year passes
        # the sanity window wins.
        description = company_data.get("description")
        if description and isinstance(description, str):
            year_patterns = (
                r"founded in (\d{4})",
                r"founded (\d{4})",
                r"Gegründet (\d{4})",
                r"established in (\d{4})",
                r"since (\d{4})",
                r"\((\d{4})\)",  # bare year in parentheses
            )
            # Upper bound follows the clock instead of a fixed year that
            # silently goes stale.
            latest_year = date.today().year
            for pattern in year_patterns:
                match = re.search(pattern, description, re.IGNORECASE)
                if not match:
                    continue
                year = int(match.group(1))  # \d{4} always converts
                if 1900 <= year <= latest_year:
                    company_data["founded_year"] = year
                    break

        return company_data
    except Exception as e:
        # Best effort per row: report and let the caller move on.
        print(f"Error processing company profile for {name}: {e}")
        return None
def _save_parsed_company_to_db(
    self, db: Session, company_data: dict
) -> Optional[CompanyTable]:
    """Insert or update one parsed company together with its executives and
    investor links.

    Companies are matched by name. The caller owns the commit; on any error
    the session is rolled back and ``None`` is returned.
    """
    try:
        company = db.query(CompanyTable).filter_by(name=company_data["name"]).first()

        if company:
            # Existing row: a new value wins only when it is truthy.
            for column in ("website", "location", "description", "industry"):
                setattr(
                    company,
                    column,
                    company_data.get(column) or getattr(company, column),
                )
            founded_year = company_data.get("founded_year")
            if founded_year:
                company.founded_year = founded_year
            # Executives are replaced wholesale on update.
            db.query(CompanyMember).filter_by(company_id=company.id).delete()
        else:
            company = CompanyTable(
                name=company_data["name"],
                website=company_data.get("website"),
                location=company_data.get("location"),
                description=company_data.get("description"),
                industry=company_data.get("industry"),
                founded_year=company_data.get("founded_year"),
            )
            db.add(company)
            db.flush()  # assigns company.id for the member rows below

        for executive in company_data.get("key_executives", []):
            db.add(
                CompanyMember(
                    name=executive.get("name"),
                    role=executive.get("title"),
                    # The source URL is kept in the linkedin column.
                    linkedin=executive.get("source_url"),
                    company_id=company.id,
                )
            )

        # Attach the company to every named investor that is already stored.
        for investor_name in company_data.get("investor_names", []):
            backer = (
                db.query(InvestorTable).filter_by(name=investor_name.strip()).first()
            )
            if backer and company not in backer.portfolio_companies:
                backer.portfolio_companies.append(company)

        return company
    except Exception as e:
        print(f"Error saving company to database: {e}")
        db.rollback()
        return None
def _save_parsed_investor_to_db(
    self, db: Session, investor_data: dict
) -> Optional[InvestorTable]:
    """Insert or update one parsed investor with its team members and funds.

    Investors are matched by name. On update, scalar fields are overwritten
    only by truthy new values, while team members and funds are replaced
    wholesale. The caller owns the commit; on any error the session is rolled
    back and ``None`` is returned.

    Args:
        db: Open SQLAlchemy session.
        investor_data: Dictionary with the investor fields plus
            ``team_members`` and ``funds`` lists.

    Returns:
        The persisted ``InvestorTable`` row, or ``None`` on failure.
    """
    # Data keys that map one-to-one onto InvestorTable columns.
    scalar_fields = (
        "website",
        "headquarters",
        "description",
        "aum",
        "aum_as_of_date",
        "aum_source_url",
        "investment_thesis",
        "portfolio_highlights",
        "linked_documents",
        "researcher_notes",
        "missing_important_fields",
        "sources",
    )

    try:
        existing_investor = (
            db.query(InvestorTable).filter_by(name=investor_data["name"]).first()
        )

        if existing_investor:
            investor = existing_investor
            # Keep the stored value whenever the new one is missing or empty.
            for field in scalar_fields:
                setattr(
                    investor,
                    field,
                    investor_data.get(field) or getattr(investor, field),
                )
        else:
            investor = InvestorTable(
                name=investor_data["name"],
                **{field: investor_data.get(field) for field in scalar_fields},
            )
            db.add(investor)
            db.flush()  # assigns investor.id for the child rows below

        if existing_investor:
            # Team members have no dependants, so a bulk delete is enough.
            db.query(InvestorMember).filter_by(investor_id=investor.id).delete()

            # Funds are deleted one object at a time: a bulk Query.delete()
            # skips ORM relationship handling and would leave the fund's
            # stage/sector association rows behind.
            # NOTE(review): assumes FundTable.investment_stages / .sectors are
            # many-to-many relationships using ``secondary`` tables - confirm.
            stale_funds = db.query(FundTable).filter_by(investor_id=investor.id).all()
            for stale_fund in stale_funds:
                db.delete(stale_fund)
            db.flush()

        for member_data in investor_data.get("team_members") or []:
            db.add(
                InvestorMember(
                    name=member_data.get("name"),
                    role=member_data.get("role"),
                    title=member_data.get("title"),
                    email=member_data.get("email"),
                    source_url=member_data.get("source_url"),
                    investor_id=investor.id,
                )
            )

        for fund_data in investor_data.get("funds") or []:
            fund = FundTable(
                investor_id=investor.id,
                fund_name=fund_data.get("fund_name"),
                fund_size=fund_data.get("fund_size"),  # integer
                fund_size_source_url=fund_data.get("fund_size_source_url"),
                check_size_lower=fund_data.get("check_size_lower"),
                check_size_upper=fund_data.get("check_size_upper"),
                source_url=fund_data.get("source_url"),
                source_provider=fund_data.get("source_provider"),
                geographic_focus=fund_data.get("geographic_focus"),  # string
            )
            db.add(fund)
            db.flush()  # assigns fund.id before linking stages and sectors

            # dict.fromkeys de-duplicates while keeping order, so a repeated
            # name cannot link the same stage or sector twice; ``or []``
            # tolerates an explicit None.
            stage_names = dict.fromkeys(fund_data.get("investment_stage_names") or [])
            for stage_name in stage_names:
                fund.investment_stages.append(
                    self._get_or_create_investment_stage(db, stage_name)
                )

            sector_names = dict.fromkeys(fund_data.get("sector_names") or [])
            for sector_name in sector_names:
                fund.sectors.append(self._get_or_create_sector(db, sector_name))

        return investor
    except Exception as e:
        print(f"Error saving investor to database: {e}")
        db.rollback()
        return None
def _get_or_create_investment_stage(
    self, db: Session, stage_name: str
) -> InvestmentStageTable:
    """Return the investment stage named ``stage_name``, creating it if absent.

    A newly created stage is added to the session and flushed (not committed)
    so that it has an id; committing is left to the caller.
    """
    # No function-scope import: the return annotation above already requires
    # InvestmentStageTable to be importable at module level.
    stage = (
        db.query(InvestmentStageTable)
        .filter(InvestmentStageTable.name == stage_name)
        .first()
    )
    if not stage:
        stage = InvestmentStageTable(name=stage_name)
        db.add(stage)
        db.flush()  # obtain the id without committing
    return stage
def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable: def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable:
"""Get existing sector or create new one""" """Get existing sector or create new one"""
sector = db.query(SectorTable).filter(SectorTable.name == sector_name).first() sector = db.query(SectorTable).filter(SectorTable.name == sector_name).first()
@@ -569,6 +49,7 @@ Return the lower and upper bounds in USD."""
check_size_lower=investor_data.investor.check_size_lower, check_size_lower=investor_data.investor.check_size_lower,
check_size_upper=investor_data.investor.check_size_upper, check_size_upper=investor_data.investor.check_size_upper,
geographic_focus=investor_data.investor.geographic_focus, geographic_focus=investor_data.investor.geographic_focus,
stage_focus=investor_data.investor.stage_focus,
number_of_investments=investor_data.investor.number_of_investments, number_of_investments=investor_data.investor.number_of_investments,
) )
db.add(investor) db.add(investor)
@@ -692,219 +173,141 @@ Return the lower and upper bounds in USD."""
print(f"Error processing row {row_idx + 1}: {e}") print(f"Error processing row {row_idx + 1}: {e}")
return None return None
async def parse_investors(self, df: pd.DataFrame, save_to_db: bool = True): async def parse_investors(self, df, save_to_db: bool = True):
""" """Parse investors from DataFrame and optionally save to database"""
Parse investors from DataFrame using manual JSON parsing and LLM for currency conversion. investors = []
Expected CSV columns: Name, Website, Final Investor Profile, Final Profile sourcing df = df[20:]
"""
results = []
db = None db = None
if save_to_db: if save_to_db:
db = get_db_session() db = get_db_session()
try: try:
total_rows = len(df) # Process rows in batches asynchronously
print(f"\n🚀 Starting to process {total_rows} investors...") batch_size = 20 # Adjust batch size as needed
rows = [(idx, row) for idx, row in df.iterrows()]
for idx, row in df.iterrows(): for i in range(0, len(rows), batch_size):
try: batch = rows[i : i + batch_size]
name = (
row.get("Name", "").strip()
if pd.notna(row.get("Name"))
else None
)
website = (
row.get("Website", "").strip()
if pd.notna(row.get("Website"))
else None
)
profile_json = (
row.get("Final Investor Profile", "")
if pd.notna(row.get("Final Investor Profile"))
else None
)
if not name or not profile_json: # Process batch asynchronously
print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile") tasks = [
self._process_row(row, idx, is_investor=True) for idx, row in batch
]
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle results from batch
for (idx, row), result in zip(batch, batch_results):
if isinstance(result, Exception):
print(f"Error processing row {idx}: {result}")
if db:
db.rollback()
continue continue
print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}") if result:
# Convert dict to InvestorData if needed
if isinstance(result, dict):
investor_data = InvestorData(**result)
else:
investor_data = result
# Process the investor profile investors.append(investor_data)
investor_data = await self.process_investor_profile(
name, website, profile_json
)
if investor_data: # Save to database if requested
results.append(investor_data)
print(" ✓ Parsed successfully")
print(f" - HQ: {investor_data.get('headquarters')}")
print(
f" - AUM: ${investor_data.get('aum'):,}"
if investor_data.get("aum")
else " - AUM: Not Available"
)
print(f" - Funds: {len(investor_data.get('funds', []))}")
print(
f" - Team: {len(investor_data.get('team_members', []))}"
)
# Save to database
if save_to_db and db: if save_to_db and db:
try: try:
saved_investor = self._save_parsed_investor_to_db( saved_investor = self._save_investor_to_db(
db, investor_data db, investor_data
) )
if saved_investor: db.commit()
db.commit() print(
print( f"✅ Saved investor '{saved_investor.name}' to database"
f" ✅ Saved to database (ID: {saved_investor.id})" )
)
else:
print(" ❌ Failed to save to database")
except Exception as e: except Exception as e:
db.rollback() db.rollback()
print(f" ❌ Database error: {e}") print(f"❌ Failed to save investor to database: {e}")
else:
print(" ⚠️ Failed to process profile")
# Commit every 10 investors to avoid memory issues print(
if save_to_db and db and (idx + 1) % 10 == 0: f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
db.commit() )
print(f"\n💾 Committed batch at row {idx + 1}")
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
if db:
db.rollback()
continue
# Final commit
if save_to_db and db:
db.commit()
print("\n✅ Final commit completed")
except Exception as e: except Exception as e:
print(f"❌ Fatal error in parse_investors: {e}") print(f"Error in batch processing: {e}")
if db: if db:
db.rollback() db.rollback()
finally: finally:
if db: if db:
db.close() db.close()
print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} investors") return investors
return results
async def parse_companies(self, df: pd.DataFrame, save_to_db: bool = True): async def parse_companies(self, df, save_to_db: bool = True):
""" """Parse companies from DataFrame and optionally save to database"""
Parse companies from DataFrame using manual JSON parsing. companies = []
Expected CSV columns: Name, Website, Investor, Final Investor Profile (actually company profile) df = df[20:]
"""
results = []
db = None db = None
if save_to_db: if save_to_db:
db = get_db_session() db = get_db_session()
try: try:
total_rows = len(df) # Process rows in batches asynchronously
print(f"\n🚀 Starting to process {total_rows} companies...") batch_size = 20 # Adjust batch size as needed
rows = [(idx, row) for idx, row in df.iterrows()]
for idx, row in df.iterrows(): for i in range(0, len(rows), batch_size):
try: batch = rows[i : i + batch_size]
name = (
row.get("Name", "").strip()
if pd.notna(row.get("Name"))
else None
)
website = (
row.get("Website", "").strip()
if pd.notna(row.get("Website"))
else None
)
investor_names = (
row.get("Investor", "").strip()
if pd.notna(row.get("Investor"))
else None
)
profile_json = (
row.get("Final Investor Profile", "")
if pd.notna(row.get("Final Investor Profile"))
else None
)
if not name or not profile_json: # Process batch asynchronously
print(f"⚠️ Row {idx + 1}: Skipping - missing name or profile") tasks = [
self._process_row(row, idx, is_investor=False) for idx, row in batch
]
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle results from batch
for (idx, row), result in zip(batch, batch_results):
if isinstance(result, Exception):
print(f"Error processing row {idx}: {result}")
if db:
db.rollback()
continue continue
print(f"\n📊 Processing {idx + 1}/{total_rows}: {name}") if result:
# Convert dict to CompanyData if needed
if isinstance(result, dict):
company_data = CompanyData(**result)
else:
company_data = result
# Process the company profile companies.append(company_data)
company_data = await self.process_company_profile(
name, website, profile_json, investor_names
)
if company_data: # Save to database if requested
results.append(company_data)
print(" ✓ Parsed successfully")
print(f" - Location: {company_data.get('location')}")
print(f" - Industry: {company_data.get('industry')}")
print(
f" - Founded: {company_data.get('founded_year')}"
if company_data.get("founded_year")
else " - Founded: Unknown"
)
print(
f" - Executives: {len(company_data.get('key_executives', []))}"
)
print(
f" - Investors: {len(company_data.get('investor_names', []))}"
)
# Save to database
if save_to_db and db: if save_to_db and db:
try: try:
saved_company = self._save_parsed_company_to_db( saved_company = self._save_company_to_db(
db, company_data db, company_data
) )
if saved_company: db.commit()
db.commit() print(
print( f"✅ Saved company '{saved_company.name}' to database"
f" ✅ Saved to database (ID: {saved_company.id})" )
)
else:
print(" ❌ Failed to save to database")
except Exception as e: except Exception as e:
db.rollback() db.rollback()
print(f" ❌ Database error: {e}") print(f"❌ Failed to save company to database: {e}")
else:
print(" ⚠️ Failed to process profile")
# Commit every 10 companies to avoid memory issues print(
if save_to_db and db and (idx + 1) % 10 == 0: f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
db.commit() )
print(f"\n💾 Committed batch at row {idx + 1}")
except Exception as e:
print(f"❌ Error processing row {idx + 1}: {e}")
if db:
db.rollback()
continue
# Final commit
if save_to_db and db:
db.commit()
print("\n✅ Final commit completed")
except Exception as e: except Exception as e:
print(f"❌ Fatal error in parse_companies: {e}") print(f"Error processing row {idx}: {e}")
if db: if db:
db.rollback() db.rollback()
finally: finally:
if db: if db:
db.close() db.close()
print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} companies") return companies
return results
# async def main(): # async def main():
-2
View File
@@ -95,7 +95,6 @@ class QueryProcessor:
selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members), selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors), selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
) )
.filter(InvestorTable.id.in_(investor_ids)) .filter(InvestorTable.id.in_(investor_ids))
) )
@@ -110,7 +109,6 @@ class QueryProcessor:
portfolio_companies=investor.portfolio_companies, portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members, team_members=investor.team_members,
sectors=investor.sectors, sectors=investor.sectors,
funds=investor.funds,
) )
investor_data_list.append(investor_data) investor_data_list.append(investor_data)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
Binary file not shown.
-315
View File
@@ -1,315 +0,0 @@
import logging
import re
import unicodedata
import pandas as pd
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
from sqlalchemy.orm import sessionmaker
# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# NOTE(review): runs at import time; init_database comes from models and
# presumably creates the tables used below - confirm.
init_database()
# ===================== Ingesting Original Data =====================#
def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into a list of cleaned names.

    Returns an empty list for NaN or empty input. Every piece is stripped and
    run through ``clean_name``; pieces that clean to nothing are dropped.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []

    cleaned_names = []
    for raw_name in str(investor_names_str).split(","):
        name = clean_name(raw_name.strip())
        if name:
            cleaned_names.append(name)
    return cleaned_names
def parse_industries(industries_str):
    """Split a comma-separated industry string into a list of trimmed names.

    Returns an empty list for NaN or empty input; blank pieces are dropped.
    """
    if pd.isna(industries_str) or industries_str == "":
        return []

    trimmed = (piece.strip() for piece in str(industries_str).split(","))
    return [piece for piece in trimmed if piece]
def clean_special_characters(text):
    """Reduce ``text`` to plain ASCII letters, digits, spaces, hyphens and
    periods.

    Falsy input is returned unchanged. Otherwise ellipses are removed,
    accented characters are decomposed and their marks dropped, every other
    disallowed character is deleted, and whitespace runs collapse to a single
    space.
    """
    if not text:
        return text

    # Strip "..." before ".." so a three-dot ellipsis disappears completely.
    without_ellipses = str(text).replace("...", "").replace("..", "")

    # NFKD splits accented letters into base + mark; the ASCII round-trip
    # then discards whatever has no ASCII form.
    ascii_only = (
        unicodedata.normalize("NFKD", without_ellipses)
        .encode("ascii", "ignore")
        .decode("ascii")
    )

    allowed_only = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_only)
    return re.sub(r"\s+", " ", allowed_only).strip()
def clean_string(value):
    """Return a cleaned ASCII version of ``value``, or ``None`` when it is a
    null marker.

    NaN, the empty string and the markers ``nan``, ``null``, ``none``, ``0``
    and ``0.0`` count as missing. Both marker checks ignore case and
    surrounding whitespace, so padded or mixed-case markers such as
    ``" NaN "`` are rejected too.
    """
    null_markers = {"", "nan", "null", "none", "0", "0.0"}

    if pd.isna(value) or str(value).strip().lower() in null_markers:
        return None

    cleaned = clean_special_characters(str(value).strip())

    # Cleaning may strip the value down to nothing or to a bare marker.
    if not cleaned or cleaned.lower() in null_markers:
        return None
    return cleaned
def clean_name(value):
    """Return a cleaned ASCII company/investor name, or ``None`` when the
    value is a null marker.

    Compared with free-text cleaning this keeps parentheses and ampersands,
    which are common in business names. Ellipses are removed, whitespace runs
    collapse to one space, and leading/trailing periods and spaces are
    trimmed.
    """
    null_markers = {"", "nan", "null", "none", "0", "0.0"}

    if pd.isna(value) or str(value).strip().lower() in null_markers:
        return None

    # NFKD splits accented letters into base + mark; the ASCII round-trip
    # then discards whatever has no ASCII form.
    normalized = unicodedata.normalize("NFKD", str(value).strip())
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # Allow alphanumerics, whitespace, hyphens, periods, parentheses and "&".
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)

    # "..." must go before "..": the reverse order leaves a stray "." behind.
    # Done before whitespace collapsing so "a ... b" does not keep two spaces.
    cleaned = cleaned.replace("...", "").replace("..", "")
    cleaned = re.sub(r"\s+", " ", cleaned)

    # Trim periods and spaces together so "Acme ." does not end in a space.
    cleaned = cleaned.strip(". ")

    if not cleaned or cleaned.lower() in null_markers:
        return None
    return cleaned
def clean_integer(value):
    """Convert a cell to a positive ``int``, or ``None`` when it is not one.

    Missing values (``pd.isna``) and the placeholder spellings
    ``nan``/``null``/``none``/``""``/``0``/``0.0`` map to ``None``.  Other
    values are parsed through ``float`` and truncated toward zero, so
    ``"12.7"`` becomes ``12``.  Zero, negative, unparseable and infinite
    values all yield ``None``.
    """
    if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:
        return None
    try:
        cleaned_val = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) cannot be represented as an int.
        return None
    return cleaned_val if cleaned_val > 0 else None
def parse_website(website_str: str):
    """Return the website as an ``https:`` URL, or ``None`` if there is none.

    The input is split at its first colon; whatever follows is re-prefixed
    with ``"https:"``, so the original scheme is discarded.  ``None`` is
    returned when the input is not a string, contains no colon, or has
    nothing (or only the placeholder ``"0"``) after the colon.

    Splitting only at the first colon keeps URLs that contain further colons,
    such as an explicit port (``http://host:8080/``), which were previously
    rejected.
    """
    if not isinstance(website_str, str):
        return None
    _scheme, separator, remainder = website_str.partition(":")
    if not separator or remainder in ("", "0"):
        return None
    return "https:" + remainder
def ingest_data():
    """Load ``companies.csv`` and ``investors.csv`` into the database.

    Runs three steps on one session bound to the module-level ``engine``:

    1. Insert an ``InvestorTable`` row for every investor name not already
       present, committing every 1000 new investors.
    2. Insert ``CompanyTable`` rows, link each company to investors that
       already exist in the database and to its sectors (creating
       ``SectorTable`` rows on demand), committing every 100 new companies.
    3. Give every investor the union of its portfolio companies' sectors.

    Row-level failures are logged and skipped.  The session is always closed,
    including when an unexpected error (for example a missing CSV file)
    aborts the run.
    """
    # Create database engine and session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # Load CSV files
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")
        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        # Step 1: Ingest Investors
        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = 0
        for index, row in investors_df.iterrows():
            try:
                investor_name = clean_name(row.get("Filtered investor names", ""))
                if investor_name:
                    # Check if investor already exists
                    existing_investor = (
                        session.query(InvestorTable).filter_by(name=investor_name).first()
                    )
                    if not existing_investor:
                        investor = InvestorTable(
                            name=investor_name,
                            description=clean_string(row.get("Business model", "")),
                            headquarters=clean_string(row.get("HQ", "")),
                            website=parse_website(str(row.get("Website", "")).strip()),
                            number_of_investments=clean_integer(
                                row.get("Number of investments")
                            ),
                        )
                        session.add(investor)
                        investors_processed += 1
                        if investors_processed % 1000 == 0:
                            session.commit()
                            print(f"  Committed {investors_processed} investors")
            except Exception as e:
                logger.error(f"Error processing investor {index}: {e}")
                continue
        session.commit()
        print(f"✅ Investors completed: {investors_processed} processed")

        # Step 2: Ingest Companies and Rounds
        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        companies_processed = 0
        sectors_created = set()
        for index, row in companies_df.iterrows():
            try:
                # Process company
                company_name = clean_name(row.get("Organization Name", ""))
                if not company_name:
                    continue
                # Check if company already exists
                existing_company = (
                    session.query(CompanyTable).filter_by(name=company_name).first()
                )
                if existing_company:
                    company = existing_company
                else:
                    # Create company
                    company = CompanyTable(
                        name=company_name,
                        description=clean_string(row.get("Organization Description", "")),
                        location=clean_string(row.get("Organization Location", "")),
                        industry=clean_string(row.get("Organization Industries", "")),
                        website=clean_string(row.get("Organization Website", "")),
                    )
                    session.add(company)
                    session.flush()  # Get the company ID
                    companies_processed += 1

                # Process investor relationships
                investor_names_str = row.get("Investor Names", "")
                if pd.notna(investor_names_str) and investor_names_str:
                    investor_names = parse_investor_names(investor_names_str)
                    for investor_name in investor_names:
                        # Find investor in database
                        investor = (
                            session.query(InvestorTable)
                            .filter_by(name=investor_name.strip())
                            .first()
                        )
                        if investor:
                            # Add investor-company relationship
                            if company not in investor.portfolio_companies:
                                investor.portfolio_companies.append(company)
                        else:
                            print("This company has an investor not in DB:", investor_name)

                # Process sectors/industries
                industries_str = row.get("Organization Industries", "")
                if pd.notna(industries_str) and industries_str:
                    industries = parse_industries(industries_str)
                    for industry_name in industries:
                        industry_name = industry_name.strip()
                        if industry_name:
                            # Check if sector exists
                            sector = (
                                session.query(SectorTable)
                                .filter_by(name=industry_name)
                                .first()
                            )
                            if not sector:
                                sector = SectorTable(name=industry_name)
                                session.add(sector)
                                session.flush()
                                sectors_created.add(industry_name)
                            # Add company-sector relationship
                            if sector not in company.sectors:
                                company.sectors.append(sector)

                # Commit every 100 companies
                if companies_processed % 100 == 0 and companies_processed > 0:
                    session.commit()
                    print(f"  Processed {companies_processed} companies...")
            except Exception as e:
                logger.error(f"Error processing company {index}: {e}")
                # NOTE(review): this rolls back everything since the last
                # commit, not only the failing row.
                session.rollback()
                continue

        # Step 3: Link investors to sectors based on portfolio companies
        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = 0
        all_investors = session.query(InvestorTable).all()
        for investor in all_investors:
            sectors = set()
            for company in investor.portfolio_companies:
                for sector in company.sectors:
                    sectors.add(sector)
            # Add sectors to investor if not already present
            for sector in sectors:
                if sector not in investor.sectors:
                    investor.sectors.append(sector)
            if sectors:
                investors_linked_to_sectors += 1
        # Also persists any companies left over from the last partial batch
        # of step 2.
        session.commit()
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        # Final counts
        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()
        print("\n🎉 Ingestion Complete!")
        print(f"   Investors: {final_investors}")
        print(f"   Companies: {final_companies}")
        print(f"   Sectors: {final_sectors}")
    finally:
        # Release the connection even if loading or ingestion raised.
        session.close()
if __name__ == "__main__":
    # Run the full CSV-to-database ingestion when executed as a script.
    ingest_data()
    # Disabled manual spot-checks of clean_name on names containing ellipses:
    # print(clean_name("A... Energi"))
    # print(clean_name("B.. Tech"))
    # print(clean_name("A... Energi"))
-381
View File
@@ -1,381 +0,0 @@
import enum
from typing import Annotated
from fastapi import Depends
from sqlalchemy import (
Column,
DateTime,
ForeignKey,
Integer,
String,
Table,
Text,
create_engine,
func,
)
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import Session, declarative_mixin, relationship, sessionmaker
from sqlalchemy.types import JSON, Enum
# Declarative base class shared by every ORM model in this module.
Base = declarative_base()

# Database configuration
# NOTE: the environment-variable override below is disabled, so the SQLite
# path passed to create_engine is hard-coded.
# DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db")

# Create engine
engine = create_engine("sqlite:///./investors.db", echo=False)

# Create session factory
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
def get_db():
    """Yield a session created from ``SessionLocal`` and close it afterwards.

    Written as a generator so the session is closed once the consumer has
    finished with it, whether or not an error occurred in between.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
# Annotated type alias: a Session parameter resolved through Depends(get_db).
db_dependency = Annotated[Session, Depends(get_db)]
def init_database() -> None:
    """Initialize the database by creating all tables registered on ``Base.metadata``."""
    Base.metadata.create_all(bind=engine)
def get_session_sync() -> Session:
    """Get a database session for synchronous operations.

    Returns a new session from ``SessionLocal``; nothing here closes it, so
    the caller has to.
    """
    return SessionLocal()
def get_db_session() -> Session:
    """Get a database session for direct use.

    Returns a new session from ``SessionLocal``; nothing here closes it, so
    the caller has to.
    """
    return SessionLocal()
@declarative_mixin
class TimestampMixin:
    """Mixin adding ``created_at`` and ``updated_at`` columns to a model."""

    # Non-nullable; the value comes from the database via server_default=func.now().
    created_at = Column(
        DateTime(timezone=True), server_default=func.now(), nullable=False
    )
    # Only an onupdate hook is declared (no default), and the column is nullable.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
class InvestmentStage(enum.Enum):
    """Investment stage labels; each member's value equals its name."""

    SEED = "SEED"
    SERIES_A = "SERIES_A"
    SERIES_B = "SERIES_B"
    SERIES_C = "SERIES_C"
    GROWTH = "GROWTH"
    LATE_STAGE = "LATE_STAGE"
# NOTE(review): none of the association tables below declares a primary key or
# unique constraint on its column pair.

# Association table for many-to-many relationship between investors and companies
investor_company_association = Table(
    "investor_companies",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
)

# Association table for investor-sector many-to-many
investor_sector_association = Table(
    "investor_sectors",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
)

# Association table for company-sector many-to-many
company_sector_association = Table(
    "company_sector",
    Base.metadata,
    Column("company_id", Integer, ForeignKey("companies.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
)

# Association table for project-sector many-to-many
project_sector_association = Table(
    "project_sector",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
)

# Association table for project-investor many-to-many
project_investor_association = Table(
    "project_investors",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("investor_id", Integer, ForeignKey("investors.id")),
)

# Association table for project-company many-to-many
project_company_association = Table(
    "project_companies",
    Base.metadata,
    Column("project_id", Integer, ForeignKey("projects.id")),
    Column("company_id", Integer, ForeignKey("companies.id")),
)

# Association table for investor-stage many-to-many
investor_stage_association = Table(
    "investor_stages",
    Base.metadata,
    Column("investor_id", Integer, ForeignKey("investors.id")),
    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
)

# Association table for fund-stage many-to-many
fund_investment_stages_association = Table(
    "fund_investment_stages",
    Base.metadata,
    Column("fund_id", Integer, ForeignKey("funds.id")),
    Column("stage_id", Integer, ForeignKey("investment_stages.id")),
)

# Association table for fund-sector many-to-many
fund_sectors_association = Table(
    "fund_sectors",
    Base.metadata,
    Column("fund_id", Integer, ForeignKey("funds.id")),
    Column("sector_id", Integer, ForeignKey("sectors.id")),
)
class InvestorTable(Base, TimestampMixin):
    """ORM model for the ``investors`` table.

    Owns its team members and funds (both cascade ``all, delete-orphan``) and
    is linked many-to-many to investment stages, portfolio companies, sectors
    and projects through the module's association tables.
    """

    __tablename__ = "investors"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): no unique constraint is declared on name.
    name = Column(String, nullable=False)
    description = Column(Text, nullable=True)

    # Basic investor info
    website = Column(String, nullable=True)
    headquarters = Column(String, nullable=True)

    # AUM fields
    aum = Column(Integer, nullable=True)  # Store as integer for numerical filtering
    aum_as_of_date = Column(String, nullable=True)
    aum_source_url = Column(String, nullable=True)

    # Check size (deprecated in favor of fund-level data, but keeping for backward compatibility)
    check_size_lower = Column(Integer, nullable=True)
    check_size_upper = Column(Integer, nullable=True)

    # Geographic focus (deprecated in favor of fund-level, but keeping for backward compatibility)
    geographic_focus = Column(String, nullable=True)

    # Investment thesis and portfolio
    investment_thesis = Column(JSON, nullable=True)  # Array of thesis statements
    portfolio_highlights = Column(
        JSON, nullable=True
    )  # Array of portfolio company names
    linked_documents = Column(JSON, nullable=True)  # Array of document URLs

    # Research metadata
    researcher_notes = Column(Text, nullable=True)
    missing_important_fields = Column(
        JSON, nullable=True
    )  # Array of missing field names
    sources = Column(JSON, nullable=True)  # JSON object with source URLs

    # Portfolio info
    number_of_investments = Column(Integer, nullable=True)

    # Relationships
    # One-to-many; members and funds are deleted with the investor or when orphaned.
    team_members = relationship(
        "InvestorMember", back_populates="investor", cascade="all, delete-orphan"
    )
    funds = relationship(
        "FundTable", back_populates="investor", cascade="all, delete-orphan"
    )

    # Many-to-many relationship with investment stages
    investment_stages = relationship(
        "InvestmentStageTable",
        secondary=investor_stage_association,
        back_populates="investors",
    )

    # Relationship to portfolio companies
    portfolio_companies = relationship(
        "CompanyTable",
        secondary=investor_company_association,
        back_populates="investors",
    )

    # Many-to-many relationship with sectors
    sectors = relationship(
        "SectorTable",
        secondary=investor_sector_association,
        back_populates="investors",
    )

    # Many-to-many relationship with projects
    projects = relationship(
        "ProjectTable",
        secondary=project_investor_association,
        back_populates="investors",
    )
class InvestorMember(Base, TimestampMixin):
    """ORM model for the ``investor_members`` table: a person attached to one investor."""

    __tablename__ = "investor_members"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    role = Column(String, nullable=True)
    title = Column(String, nullable=True)  # Alternative to role
    email = Column(String, nullable=True)
    source_url = Column(String, nullable=True)  # URL where member info was found

    # Foreign key to the owning investor (nullable is not set explicitly here).
    investor_id = Column(Integer, ForeignKey("investors.id"))
    investor = relationship("InvestorTable", back_populates="team_members")
class FundTable(Base, TimestampMixin):
    """ORM model for the ``funds`` table: a fund belonging to one investor.

    Linked many-to-many to investment stages and sectors.
    """

    __tablename__ = "funds"

    id = Column(Integer, primary_key=True, index=True)
    # Every fund must reference an investor (nullable=False).
    investor_id = Column(Integer, ForeignKey("investors.id"), nullable=False)

    # Fund details
    fund_name = Column(String, nullable=True)
    fund_size = Column(
        Integer, nullable=True
    )  # Store as integer for numerical filtering
    fund_size_source_url = Column(String, nullable=True)

    # Check size range (parsed from estimated_investment_size by LLM)
    check_size_lower = Column(Integer, nullable=True)
    check_size_upper = Column(Integer, nullable=True)
    source_url = Column(String, nullable=True)
    source_provider = Column(String, nullable=True)  # e.g., "Perplexity"

    # Geographic focus as simple string
    geographic_focus = Column(String, nullable=True)

    # Relationships
    investor = relationship("InvestorTable", back_populates="funds")
    investment_stages = relationship(
        "InvestmentStageTable",
        secondary=fund_investment_stages_association,
        back_populates="funds",
    )
    sectors = relationship(
        "SectorTable",
        secondary=fund_sectors_association,
        back_populates="funds",
    )
class InvestmentStageTable(Base, TimestampMixin):
    """ORM model for the ``investment_stages`` table: a uniquely named stage.

    Linked many-to-many to investors and funds.
    """

    __tablename__ = "investment_stages"

    id = Column(Integer, primary_key=True, index=True)
    # Stage names are unique across the table.
    name = Column(String, nullable=False, unique=True)

    # Relationships
    investors = relationship(
        "InvestorTable",
        secondary=investor_stage_association,
        back_populates="investment_stages",
    )
    funds = relationship(
        "FundTable",
        secondary=fund_investment_stages_association,
        back_populates="investment_stages",
    )
class CompanyTable(Base, TimestampMixin):
    """ORM model for the ``companies`` table.

    Owns its members (cascade ``all, delete-orphan``) and is linked
    many-to-many to investors, sectors and projects.
    """

    __tablename__ = "companies"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): no unique constraint is declared on name.
    name = Column(String, nullable=False)
    industry = Column(String, nullable=True)
    location = Column(String, nullable=True)
    # Declared as String here, whereas InvestorTable.description uses Text.
    description = Column(String, nullable=True)
    founded_year = Column(Integer, nullable=True)
    website = Column(String, nullable=True)

    # One-to-many; members are deleted with the company or when orphaned.
    members = relationship(
        "CompanyMember", back_populates="company", cascade="all, delete-orphan"
    )

    # Relationship back to investors
    investors = relationship(
        "InvestorTable",
        secondary=investor_company_association,
        back_populates="portfolio_companies",
    )

    # Many-to-many relationship with sectors
    sectors = relationship(
        "SectorTable", secondary=company_sector_association, back_populates="companies"
    )

    # Many-to-many relationship with projects
    projects = relationship(
        "ProjectTable",
        secondary=project_company_association,
        back_populates="companies",
    )
class CompanyMember(Base, TimestampMixin):
    """ORM model for the ``company_members`` table: a person attached to one company."""

    __tablename__ = "company_members"

    id = Column(Integer, primary_key=True)
    # No nullable argument is given for name, unlike the other name columns.
    name = Column(String)
    linkedin = Column(String, nullable=True)
    role = Column(String, nullable=True)

    # Every member must reference a company (nullable=False).
    company_id = Column(Integer, ForeignKey("companies.id"), nullable=False)
    company = relationship("CompanyTable", back_populates="members")
class SectorTable(Base, TimestampMixin):
    """ORM model for the ``sectors`` table.

    Linked many-to-many to investors, companies, projects and funds.
    """

    __tablename__ = "sectors"

    id = Column(Integer, primary_key=True, index=True)
    # NOTE(review): no unique constraint is declared on name.
    name = Column(String, nullable=False)

    # Relationships
    investors = relationship(
        "InvestorTable",
        secondary=investor_sector_association,
        back_populates="sectors",
    )
    companies = relationship(
        "CompanyTable", secondary=company_sector_association, back_populates="sectors"
    )
    # The reverse attribute on ProjectTable is named "sector" (singular).
    projects = relationship(
        "ProjectTable", secondary=project_sector_association, back_populates="sector"
    )
    funds = relationship(
        "FundTable",
        secondary=fund_sectors_association,
        back_populates="sectors",
    )
class ProjectTable(Base, TimestampMixin):
    """ORM model for the ``projects`` table.

    Linked many-to-many to sectors, investors and companies.
    """

    __tablename__ = "projects"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    valuation = Column(Integer, nullable=True)
    # Stored as an Enum column over the InvestmentStage members.
    stage = Column(Enum(InvestmentStage), nullable=True)
    location = Column(String, nullable=True)
    description = Column(Text, nullable=True)
    # Unlike the mixin timestamps, these DateTime columns do not set timezone=True.
    start_date = Column(DateTime, nullable=True)
    end_date = Column(DateTime, nullable=True)

    # Singular name, but this is a many-to-many collection via project_sector.
    sector = relationship(
        "SectorTable", secondary=project_sector_association, back_populates="projects"
    )
    investors = relationship(
        "InvestorTable",
        secondary=project_investor_association,
        back_populates="projects",
    )
    companies = relationship(
        "CompanyTable", secondary=project_company_association, back_populates="projects"
    )