Refactor investor and fund schemas to support new check size range

- Removed deprecated `stage_focus` column from `InvestorTable` and `InvestorSchema`.
- Updated `FundTable` to change `fund_size` from VARCHAR to INTEGER and added `check_size_lower` and `check_size_upper` columns.
- Modified API routes to return investor-fund combinations as separate entries.
- Created new `InvestorFundData` schema for combined investor-fund responses.
- Implemented LLM parsing for check size range from estimated investment size.
- Updated database migration script to reflect schema changes and ensure data integrity.
- Removed obsolete verification and test scripts related to the old schema.
This commit is contained in:
bolade
2025-10-07 15:24:36 +01:00
parent c0fbbdd917
commit d341cacb9a
12 changed files with 556 additions and 884 deletions
+78 -12
View File
@@ -27,6 +27,15 @@ class CurrencyConversion(BaseModel):
notes: str = ""
# Structured-output target for the check-size parser: InvestorProcessor binds
# this model to the LLM via `self.llm.with_structured_output(CheckSizeRange)`.
# NOTE(review): pydantic normally exposes the class docstring as the schema
# description, which is presumably forwarded to the LLM — treat edits to the
# docstring or field names as prompt changes, not cosmetic ones.
class CheckSizeRange(BaseModel):
"""Schema for LLM check size range parsing from estimated investment size"""
# Lower bound of the range as a whole-USD integer. Defaults to 0;
# parse_check_size_range maps any value <= 0 to None.
lower_bound_usd: int = 0
# Upper bound of the range as a whole-USD integer. Defaults to 0;
# parse_check_size_range maps any value <= 0 to None.
upper_bound_usd: int = 0
# Plain string (not validated against a fixed set here); defaults to "high".
confidence: str = "high" # high, medium, low
# Free-text remarks; defaults to the empty string.
notes: str = ""
class InvestorProcessor:
def __init__(self):
self.llm = ChatOpenAI(
@@ -36,10 +45,12 @@ class InvestorProcessor:
temperature=0,
)
# Only use structured LLM for currency conversion
# Structured LLMs for specific parsing tasks
self.currency_converter_llm = self.llm.with_structured_output(
CurrencyConversion
)
self.check_size_parser_llm = self.llm.with_structured_output(CheckSizeRange)
# Keep legacy structured LLMs for backward compatibility
self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
self.company_structured_llm = self.llm.with_structured_output(CompanyData)
@@ -77,6 +88,57 @@ Return only the USD integer amount with current exchange rates."""
print(f"Error converting currency '{amount_str}': {e}")
return None
async def parse_check_size_range(
    self, estimated_investment_str: str
) -> tuple[Optional[int], Optional[int]]:
    """
    Use LLM to parse check size range from estimated investment size string.

    Returns tuple of (lower_bound_usd, upper_bound_usd).

    Handles formats like:
    - "EUR 1,000 to 2,000"
    - "$100K-$500K"
    - "Between $1M and $5M"
    - "Up to EUR 10 million"
    - "$2M typical"

    Args:
        estimated_investment_str: Raw "estimated investment size" text.
            Empty / whitespace-only values, "Not Available" and "0" are
            treated as "no data" and never reach the LLM.

    Returns:
        ``(lower, upper)`` as positive USD integers. A bound the LLM reports
        as zero or negative is returned as ``None`` (so "up to X" yields
        ``(None, X)``). When both bounds are present they are ordered so
        that ``lower <= upper``. ``(None, None)`` is returned when there is
        no usable input or when the LLM call fails.

    Raises:
        Nothing: any exception from the LLM call is printed and converted
        into ``(None, None)`` (best-effort parsing).
    """
    # Normalise before testing the sentinels so padded values such as
    # " Not Available " do not trigger a pointless LLM round-trip.
    cleaned = str(estimated_investment_str).strip() if estimated_investment_str else ""
    if not cleaned or cleaned in ("Not Available", "0"):
        return None, None

    try:
        prompt = f"""Parse this check size/investment range into lower and upper bounds in USD as integers.
Input: {estimated_investment_str}
Instructions:
- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds
- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper
- If it says "up to X", use 0 as lower and X as upper
- Convert all currencies to USD using current exchange rates
- Return integers (whole numbers, no decimals)
Examples:
- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200
- "$100K-$500K" -> lower: 100000, upper: 500000
- "Between $1M and $5M" -> lower: 1000000, upper: 5000000
- "Up to EUR 10 million" -> lower: 0, upper: 11000000
- "$2M typical" -> lower: 2000000, upper: 2000000
- "GBP 500K-2M" -> lower: 600000, upper: 2400000
Return the lower and upper bounds in USD."""
        result = await self.check_size_parser_llm.ainvoke(prompt)

        # Non-positive values are the schema's "unknown" marker.
        lower = result.lower_bound_usd if result.lower_bound_usd > 0 else None
        upper = result.upper_bound_usd if result.upper_bound_usd > 0 else None

        # The model occasionally returns the bounds reversed; never hand an
        # inverted range to the caller (it would break range filtering).
        if lower is not None and upper is not None and lower > upper:
            lower, upper = upper, lower

        return lower, upper
    except Exception as e:
        # Deliberate best-effort: a failed parse must not abort the import.
        print(f"Error parsing check size range '{estimated_investment_str}': {e}")
        return None, None
def parse_json_profile(self, json_str: str) -> Optional[dict]:
"""
Manually parse the JSON profile from the CSV.
@@ -157,7 +219,8 @@ Return only the USD integer amount with current exchange rates."""
"fund_name": fund.get("fundName"),
"fund_size": None,
"fund_size_source_url": fund.get("fundSizeSourceUrl"),
"estimated_investment_size": None,
"check_size_lower": None,
"check_size_upper": None,
"source_url": fund.get("sourceUrl"),
"source_provider": fund.get("sourceProvider"),
"geographic_focus": fund.get("geographicFocus", []),
@@ -165,19 +228,23 @@ Return only the USD integer amount with current exchange rates."""
"sector_focus": fund.get("sectorFocus", []),
}
# Convert fund size to USD
# Convert fund size to USD integer
fund_size_str = fund.get("fundSize")
if fund_size_str and fund_size_str != "Not Available":
fund_size_usd = await self.convert_to_usd(fund_size_str)
if fund_size_usd:
fund_data["fund_size"] = str(fund_size_usd)
fund_data["fund_size"] = fund_size_usd # Store as integer
# Convert estimated investment size
# Parse check size range from estimated investment size
est_size_str = fund.get("estimatedInvestmentSize")
if est_size_str and est_size_str != "Not Available":
est_size_usd = await self.convert_to_usd(est_size_str)
if est_size_usd:
fund_data["estimated_investment_size"] = str(est_size_usd)
check_lower, check_upper = await self.parse_check_size_range(
est_size_str
)
if check_lower is not None:
fund_data["check_size_lower"] = check_lower
if check_upper is not None:
fund_data["check_size_upper"] = check_upper
investor_data["funds"].append(fund_data)
@@ -430,11 +497,10 @@ Return only the USD integer amount with current exchange rates."""
fund = FundTable(
investor_id=investor.id,
fund_name=fund_data.get("fund_name"),
fund_size=fund_data.get("fund_size"),
fund_size=fund_data.get("fund_size"), # Now an integer
fund_size_source_url=fund_data.get("fund_size_source_url"),
estimated_investment_size=fund_data.get(
"estimated_investment_size"
),
check_size_lower=fund_data.get("check_size_lower"), # NEW
check_size_upper=fund_data.get("check_size_upper"), # NEW
source_url=fund_data.get("source_url"),
source_provider=fund_data.get("source_provider"),
geographic_focus=fund_data.get("geographic_focus"),