Refactor investor and fund schemas to support new check size range

- Removed deprecated `stage_focus` column from `InvestorTable` and `InvestorSchema`.
- Updated `FundTable` to change `fund_size` from VARCHAR to INTEGER and added `check_size_lower` and `check_size_upper` columns.
- Modified API routes to return investor-fund combinations as separate entries.
- Created new `InvestorFundData` schema for combined investor-fund responses.
- Implemented LLM parsing for check size range from estimated investment size.
- Updated database migration script to reflect schema changes and ensure data integrity.
- Removed obsolete verification and test scripts related to the old schema.
This commit is contained in:
bolade
2025-10-07 15:24:36 +01:00
parent c0fbbdd917
commit d341cacb9a
12 changed files with 556 additions and 884 deletions
+8 -4
View File
@@ -160,11 +160,15 @@ class FundTable(Base, TimestampMixin):
# Fund details
fund_name = Column(String, nullable=True)
fund_size = Column(String, nullable=True) # Store as string to preserve currency
fund_size = Column(
Integer, nullable=True
) # Store as integer for numerical filtering
fund_size_source_url = Column(String, nullable=True)
estimated_investment_size = Column(
String, nullable=True
) # e.g., "EUR 1,000 to 2,000"
# Check size range (parsed from estimated_investment_size by LLM)
check_size_lower = Column(Integer, nullable=True)
check_size_upper = Column(Integer, nullable=True)
source_url = Column(String, nullable=True)
source_provider = Column(String, nullable=True) # e.g., "Perplexity"
+234 -39
View File
@@ -4,7 +4,11 @@ from db.db import get_db
from db.models import InvestorTable, SectorTable
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from schemas.router_schemas import InvestmentStage, InvestorData
from schemas.router_schemas import (
InvestmentStage,
InvestorData,
InvestorFundData,
)
from sqlalchemy.orm import Session, selectinload
router = APIRouter(tags=["Investor Routes"])
@@ -33,34 +37,95 @@ class InvestorUpdate(BaseModel):
number_of_investments: Optional[int] = None
@router.get("/investors", response_model=List[InvestorData])
@router.get("/investors", response_model=List[InvestorFundData])
def read_investors(db: Session = Depends(get_db)):
"""Get all investors with their related data"""
"""Get all investors with their funds as separate entries
Each investor-fund combination is returned as a separate row.
An investor with 3 funds will appear as 3 entries.
"""
investors = (
db.query(InvestorTable)
.options(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.all()
)
# Transform InvestorTable objects to InvestorData format
investor_data_list = []
# Transform to InvestorFundData format (one row per investor-fund combination)
investor_fund_list = []
for investor in investors:
investor_data = InvestorData(
investor=investor, # This maps to InvestorSchema
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_data_list.append(investor_data)
# If investor has funds, create one entry per fund
if investor.funds:
for fund in investor.funds:
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
investment_stage_focus=fund.investment_stage_focus,
sector_focus=fund.sector_focus,
# Related data (same for all funds of this investor)
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
investment_stage_focus=None,
sector_focus=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_data_list
return investor_fund_list
@router.get("/investors/filter", response_model=List[InvestorData])
@router.get("/investors/filter", response_model=List[InvestorFundData])
def filter_investors(
stage: Optional[InvestmentStage] = Query(
None, description="Filter by investment stage"
@@ -75,13 +140,18 @@ def filter_investors(
max_aum: Optional[int] = Query(None, description="Maximum AUM"),
db: Session = Depends(get_db),
):
"""Filter investors based on various criteria"""
"""Filter investors based on various criteria
Returns investor-fund combinations as separate rows.
An investor with 3 funds will appear as 3 entries.
"""
# Start with base query
query = db.query(InvestorTable).options(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
# Apply filters
@@ -111,29 +181,86 @@ def filter_investors(
investors = query.all()
# Transform to InvestorData format
investor_data_list = []
# Transform to InvestorFundData format (one row per investor-fund combination)
investor_fund_list = []
for investor in investors:
investor_data = InvestorData(
investor=investor,
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_data_list.append(investor_data)
# If investor has funds, create one entry per fund
if investor.funds:
for fund in investor.funds:
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
investment_stage_focus=fund.investment_stage_focus,
sector_focus=fund.sector_focus,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
investment_stage_focus=None,
sector_focus=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_data_list
return investor_fund_list
@router.get("/investors/{investor_id}", response_model=InvestorData)
def read_investor(investor_id: int, db: Session = Depends(get_db)):
"""Get a specific investor by ID"""
"""Get a specific investor by ID with all their funds"""
investor = (
db.query(InvestorTable)
.options(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id == investor_id)
.first()
@@ -142,12 +269,13 @@ def read_investor(investor_id: int, db: Session = Depends(get_db)):
if not investor:
raise HTTPException(status_code=404, detail="Investor not found")
# Transform to InvestorData format
# Transform to InvestorData format (includes funds array)
return InvestorData(
investor=investor,
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
funds=investor.funds,
)
@@ -166,6 +294,7 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id == db_investor.id)
.first()
@@ -177,6 +306,7 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)):
portfolio_companies=investor_with_relations.portfolio_companies,
team_members=investor_with_relations.team_members,
sectors=investor_with_relations.sectors,
funds=investor_with_relations.funds,
)
@@ -205,6 +335,7 @@ def update_investor(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id == investor_id)
.first()
@@ -216,6 +347,7 @@ def update_investor(
portfolio_companies=investor_with_relations.portfolio_companies,
team_members=investor_with_relations.team_members,
sectors=investor_with_relations.sectors,
funds=investor_with_relations.funds,
)
@@ -233,13 +365,16 @@ def delete_investor(investor_id: int, db: Session = Depends(get_db)):
return {"message": "Investor deleted successfully"}
@router.get("/investors/{investor_id}/similar", response_model=List[InvestorData])
@router.get("/investors/{investor_id}/similar", response_model=List[InvestorFundData])
def find_similar_investors(
investor_id: int,
limit: int = Query(10, description="Maximum number of similar investors to return"),
db: Session = Depends(get_db),
):
"""Find investors similar to a given investor based on characteristics"""
"""Find investors similar to a given investor based on characteristics
Returns investor-fund combinations as separate rows.
"""
# Get the target investor
target_investor = (
@@ -248,6 +383,7 @@ def find_similar_investors(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id == investor_id)
.first()
@@ -266,6 +402,7 @@ def find_similar_investors(
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id != investor_id)
.all()
@@ -338,13 +475,71 @@ def find_similar_investors(
scored_investors.sort(key=lambda x: x[0], reverse=True)
similar_investors = [inv for score, inv in scored_investors[:limit]]
# Transform to InvestorData format
return [
InvestorData(
investor=inv,
portfolio_companies=inv.portfolio_companies,
team_members=inv.team_members,
sectors=inv.sectors,
)
for inv in similar_investors
]
# Transform to InvestorFundData format (one row per investor-fund combination)
investor_fund_list = []
for investor in similar_investors:
# If investor has funds, create one entry per fund
if investor.funds:
for fund in investor.funds:
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields
fund_id=fund.id,
fund_name=fund.fund_name,
fund_size=fund.fund_size,
fund_size_source_url=fund.fund_size_source_url,
check_size_lower=fund.check_size_lower,
check_size_upper=fund.check_size_upper,
geographic_focus=fund.geographic_focus,
investment_stage_focus=fund.investment_stage_focus,
sector_focus=fund.sector_focus,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
else:
# If no funds, create one entry with null fund fields
investor_fund_data = InvestorFundData(
# Investor fields
investor_id=investor.id,
investor_name=investor.name,
investor_description=investor.description,
investor_website=investor.website,
investor_headquarters=investor.headquarters,
aum=investor.aum,
aum_as_of_date=investor.aum_as_of_date,
aum_source_url=investor.aum_source_url,
investment_thesis=investor.investment_thesis,
portfolio_highlights=investor.portfolio_highlights,
number_of_investments=investor.number_of_investments,
# Fund fields (null)
fund_id=None,
fund_name=None,
fund_size=None,
fund_size_source_url=None,
check_size_lower=None,
check_size_upper=None,
geographic_focus=None,
investment_stage_focus=None,
sector_focus=None,
# Related data
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
)
investor_fund_list.append(investor_fund_data)
return investor_fund_list
+67 -1
View File
@@ -32,6 +32,25 @@ class InvestorMemberSchema(BaseModel):
from_attributes = True
class FundSchema(BaseModel):
id: int
fund_name: str | None
fund_size: int | None # Changed to int for numerical filtering
fund_size_source_url: str | None
check_size_lower: int | None # NEW: Lower bound of check size range
check_size_upper: int | None # NEW: Upper bound of check size range
source_url: str | None
source_provider: str | None
geographic_focus: List[str] | None
investment_stage_focus: List[str] | None
sector_focus: List[str] | None
created_at: Optional[datetime] = None
updated_at: Optional[datetime] = None
class Config:
from_attributes = True
class CompanyMemberSchema(BaseModel):
id: int
name: Optional[str]
@@ -76,12 +95,53 @@ class InvestorSchema(BaseModel):
class InvestorData(BaseModel):
"""Comprehensive investor data schema for LLM processing"""
"""Comprehensive investor data schema - used for individual investor requests"""
investor: InvestorSchema
portfolio_companies: List[CompanySchema]
team_members: List[InvestorMemberSchema]
sectors: List[SectorSchema]
funds: List[FundSchema]
class Config:
from_attributes = True
class InvestorFundData(BaseModel):
"""Investor-Fund combined data - used for list/filter requests
Each row represents one investor-fund combination.
An investor with 3 funds will appear as 3 separate entries.
"""
# Investor fields
investor_id: int
investor_name: str
investor_description: Optional[str]
investor_website: Optional[str]
investor_headquarters: Optional[str]
aum: int | None
aum_as_of_date: str | None
aum_source_url: str | None
investment_thesis: List[str] | None
portfolio_highlights: List[str] | None
number_of_investments: int | None
# Fund fields
fund_id: int | None
fund_name: str | None
fund_size: int | None # Changed to int for numerical filtering
fund_size_source_url: str | None
check_size_lower: int | None # NEW: Lower bound of check size range
check_size_upper: int | None # NEW: Upper bound of check size range
geographic_focus: List[str] | None
investment_stage_focus: List[str] | None
sector_focus: List[str] | None
# Related data
portfolio_companies: List[CompanySchema]
team_members: List[InvestorMemberSchema]
sectors: List[SectorSchema]
class Config:
from_attributes = True
@@ -99,3 +159,9 @@ class CompanyData(BaseModel): # Renamed from CompaniesData for consistency
class InvestorList(BaseModel):
investors: List[InvestorData]
class InvestorFundList(BaseModel):
"""List of investor-fund combinations"""
investor_funds: List[InvestorFundData]
+78 -12
View File
@@ -27,6 +27,15 @@ class CurrencyConversion(BaseModel):
notes: str = ""
class CheckSizeRange(BaseModel):
"""Schema for LLM check size range parsing from estimated investment size"""
lower_bound_usd: int = 0
upper_bound_usd: int = 0
confidence: str = "high" # high, medium, low
notes: str = ""
class InvestorProcessor:
def __init__(self):
self.llm = ChatOpenAI(
@@ -36,10 +45,12 @@ class InvestorProcessor:
temperature=0,
)
# Only use structured LLM for currency conversion
# Structured LLMs for specific parsing tasks
self.currency_converter_llm = self.llm.with_structured_output(
CurrencyConversion
)
self.check_size_parser_llm = self.llm.with_structured_output(CheckSizeRange)
# Keep legacy structured LLMs for backward compatibility
self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
self.company_structured_llm = self.llm.with_structured_output(CompanyData)
@@ -77,6 +88,57 @@ Return only the USD integer amount with current exchange rates."""
print(f"Error converting currency '{amount_str}': {e}")
return None
async def parse_check_size_range(
self, estimated_investment_str: str
) -> tuple[Optional[int], Optional[int]]:
"""
Use LLM to parse check size range from estimated investment size string.
Returns tuple of (lower_bound_usd, upper_bound_usd).
Handles formats like:
- "EUR 1,000 to 2,000"
- "$100K-$500K"
- "Between $1M and $5M"
- "Up to EUR 10 million"
- "$2M typical"
"""
if (
not estimated_investment_str
or estimated_investment_str == "Not Available"
or estimated_investment_str == "0"
):
return None, None
try:
prompt = f"""Parse this check size/investment range into lower and upper bounds in USD as integers.
Input: {estimated_investment_str}
Instructions:
- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds
- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper
- If it says "up to X", use 0 as lower and X as upper
- Convert all currencies to USD using current exchange rates
- Return integers (whole numbers, no decimals)
Examples:
- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200
- "$100K-$500K" -> lower: 100000, upper: 500000
- "Between $1M and $5M" -> lower: 1000000, upper: 5000000
- "Up to EUR 10 million" -> lower: 0, upper: 11000000
- "$2M typical" -> lower: 2000000, upper: 2000000
- "GBP 500K-2M" -> lower: 600000, upper: 2400000
Return the lower and upper bounds in USD."""
result = await self.check_size_parser_llm.ainvoke(prompt)
lower = result.lower_bound_usd if result.lower_bound_usd > 0 else None
upper = result.upper_bound_usd if result.upper_bound_usd > 0 else None
return lower, upper
except Exception as e:
print(f"Error parsing check size range '{estimated_investment_str}': {e}")
return None, None
def parse_json_profile(self, json_str: str) -> Optional[dict]:
"""
Manually parse the JSON profile from the CSV.
@@ -157,7 +219,8 @@ Return only the USD integer amount with current exchange rates."""
"fund_name": fund.get("fundName"),
"fund_size": None,
"fund_size_source_url": fund.get("fundSizeSourceUrl"),
"estimated_investment_size": None,
"check_size_lower": None,
"check_size_upper": None,
"source_url": fund.get("sourceUrl"),
"source_provider": fund.get("sourceProvider"),
"geographic_focus": fund.get("geographicFocus", []),
@@ -165,19 +228,23 @@ Return only the USD integer amount with current exchange rates."""
"sector_focus": fund.get("sectorFocus", []),
}
# Convert fund size to USD
# Convert fund size to USD integer
fund_size_str = fund.get("fundSize")
if fund_size_str and fund_size_str != "Not Available":
fund_size_usd = await self.convert_to_usd(fund_size_str)
if fund_size_usd:
fund_data["fund_size"] = str(fund_size_usd)
fund_data["fund_size"] = fund_size_usd # Store as integer
# Convert estimated investment size
# Parse check size range from estimated investment size
est_size_str = fund.get("estimatedInvestmentSize")
if est_size_str and est_size_str != "Not Available":
est_size_usd = await self.convert_to_usd(est_size_str)
if est_size_usd:
fund_data["estimated_investment_size"] = str(est_size_usd)
check_lower, check_upper = await self.parse_check_size_range(
est_size_str
)
if check_lower is not None:
fund_data["check_size_lower"] = check_lower
if check_upper is not None:
fund_data["check_size_upper"] = check_upper
investor_data["funds"].append(fund_data)
@@ -430,11 +497,10 @@ Return only the USD integer amount with current exchange rates."""
fund = FundTable(
investor_id=investor.id,
fund_name=fund_data.get("fund_name"),
fund_size=fund_data.get("fund_size"),
fund_size=fund_data.get("fund_size"), # Now an integer
fund_size_source_url=fund_data.get("fund_size_source_url"),
estimated_investment_size=fund_data.get(
"estimated_investment_size"
),
check_size_lower=fund_data.get("check_size_lower"), # NEW
check_size_upper=fund_data.get("check_size_upper"), # NEW
source_url=fund_data.get("source_url"),
source_provider=fund_data.get("source_provider"),
geographic_focus=fund_data.get("geographic_focus"),
+2
View File
@@ -95,6 +95,7 @@ class QueryProcessor:
selectinload(InvestorTable.portfolio_companies),
selectinload(InvestorTable.team_members),
selectinload(InvestorTable.sectors),
selectinload(InvestorTable.funds),
)
.filter(InvestorTable.id.in_(investor_ids))
)
@@ -109,6 +110,7 @@ class QueryProcessor:
portfolio_companies=investor.portfolio_companies,
team_members=investor.team_members,
sectors=investor.sectors,
funds=investor.funds,
)
investor_data_list.append(investor_data)