a9589e54f3
- Updated FundTable to replace JSON fields for investment stages and sectors with relationships. - Introduced InvestmentStageTable and fund_investment_stages association table. - Created fund_sectors association table for many-to-many relationship with sectors. - Changed geographic_focus from JSON array to a simple string. - Migrated existing data to new schema, ensuring data integrity and normalization. - Updated related schemas, routers, and services to reflect new structure. - Added migration script to handle data transformation and schema updates. - Implemented tests to verify new relationships and data integrity.
935 lines
37 KiB
Python
935 lines
37 KiB
Python
import json
|
|
import os
|
|
import re
|
|
from typing import Optional
|
|
|
|
import pandas as pd
|
|
from db.db import get_db_session
|
|
from db.models import (
|
|
CompanyMember,
|
|
CompanyTable,
|
|
FundTable,
|
|
InvestmentStageTable,
|
|
InvestorMember,
|
|
InvestorTable,
|
|
SectorTable,
|
|
)
|
|
from langchain_openai import ChatOpenAI
|
|
from pydantic import BaseModel
|
|
from schemas.py_schemas import CompanyData, InvestorData
|
|
from sqlalchemy.orm import Session
|
|
|
|
|
|
class CurrencyConversion(BaseModel):
    """Schema for LLM currency conversion responses"""

    # NOTE: the class docstring above is left untouched on purpose; structured-output
    # wrappers typically forward it to the model as the schema description.

    # Converted amount as a whole number of US dollars; 0 is the default and is
    # treated as "no usable amount" by convert_to_usd.
    amount_usd: int = 0

    # Self-reported certainty of the conversion. Free-form string; the intended
    # values are "high", "medium" and "low" (not enforced by the schema).
    confidence: str = "high"

    # Optional free-text remarks from the model.
    notes: str = ""
|
|
|
|
|
|
class CheckSizeRange(BaseModel):
    """Schema for LLM check size range parsing from estimated investment size"""

    # NOTE: the class docstring above is left untouched on purpose; structured-output
    # wrappers typically forward it to the model as the schema description.

    # Smallest cheque in whole US dollars; defaults to 0.
    lower_bound_usd: int = 0

    # Largest cheque in whole US dollars; defaults to 0.
    upper_bound_usd: int = 0

    # Self-reported certainty of the parse. Free-form string; the intended
    # values are "high", "medium" and "low" (not enforced by the schema).
    confidence: str = "high"

    # Optional free-text remarks from the model.
    notes: str = ""
|
|
|
|
|
|
class InvestorProcessor:
|
|
def __init__(self):
    """Create the base chat model and the structured-output wrappers built on it.

    The base client targets the OpenRouter endpoint with the API key read from
    the ``OPENROUTER_API_KEY`` environment variable, and runs at temperature 0.
    """
    base_llm = ChatOpenAI(
        api_key=os.getenv("OPENROUTER_API_KEY"),
        base_url="https://openrouter.ai/api/v1",
        model="openai/gpt-4o-mini",
        temperature=0,
    )
    self.llm = base_llm

    # Wrappers used by the manual-parsing path (amounts and cheque ranges only).
    self.currency_converter_llm = base_llm.with_structured_output(CurrencyConversion)
    self.check_size_parser_llm = base_llm.with_structured_output(CheckSizeRange)

    # Wrappers used by the older row-to-model path (_process_row).
    self.investor_structured_llm = base_llm.with_structured_output(InvestorData)
    self.company_structured_llm = base_llm.with_structured_output(CompanyData)
|
|
|
|
async def convert_to_usd(self, amount_str: str) -> Optional[int]:
    """
    Ask the currency-conversion LLM to turn a free-text amount into whole USD.

    Typical inputs are strings such as "EUR 850,000,000", "$5M",
    "GBP 10-20 million" or "Approximately EUR 100 million".

    Returns the positive integer reported by the model, or None when the
    input is empty / "Not Available" / "0", when the model reports a
    non-positive amount, or when the call fails (the error is printed).
    """
    # Missing values and the two "no data" sentinels never reach the LLM.
    if not amount_str:
        return None
    if amount_str in ("Not Available", "0"):
        return None

    try:
        request = f"""Convert this amount to USD as an integer (whole number, no decimals).
If it's a range, use the midpoint. If already in USD, just extract the number.
Remove all commas and convert millions/billions to actual numbers.

Amount: {amount_str}

Examples:
- "EUR 850,000,000" -> 935000000 (assuming EUR to USD rate ~1.10)
- "$5M" -> 5000000
- "GBP 10-20 million" -> 18000000 (midpoint 15M * 1.20 rate)
- "Approximately EUR 100 million" -> 110000000

Return only the USD integer amount with current exchange rates."""

        conversion = await self.currency_converter_llm.ainvoke(request)
        if conversion.amount_usd > 0:
            return conversion.amount_usd
        # 0 (the schema default) or a negative value means "nothing usable".
        return None
    except Exception as e:
        print(f"Error converting currency '{amount_str}': {e}")
        return None
|
|
|
|
async def parse_check_size_range(
    self, estimated_investment_str: str
) -> tuple[Optional[int], Optional[int]]:
    """
    Ask the check-size LLM to split a free-text investment size into USD bounds.

    Typical inputs are strings such as "EUR 1,000 to 2,000", "$100K-$500K",
    "Between $1M and $5M", "Up to EUR 10 million" or "$2M typical".

    Returns ``(lower_bound_usd, upper_bound_usd)``. A bound the model reports
    as zero or negative comes back as None, so an "up to X" answer that
    follows the prompt yields ``(None, X)``. Empty / "Not Available" / "0"
    input and any failure of the call yield ``(None, None)``.
    """
    if not estimated_investment_str:
        return None, None
    if estimated_investment_str in ("Not Available", "0"):
        return None, None

    def positive_or_none(value):
        # Non-positive numbers carry no information for a cheque size.
        return value if value > 0 else None

    try:
        request = f"""Parse this check size/investment range into lower and upper bounds in USD as integers.

Input: {estimated_investment_str}

Instructions:
- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds
- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper
- If it says "up to X", use 0 as lower and X as upper
- Convert all currencies to USD using current exchange rates
- Return integers (whole numbers, no decimals)

Examples:
- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200
- "$100K-$500K" -> lower: 100000, upper: 500000
- "Between $1M and $5M" -> lower: 1000000, upper: 5000000
- "Up to EUR 10 million" -> lower: 0, upper: 11000000
- "$2M typical" -> lower: 2000000, upper: 2000000
- "GBP 500K-2M" -> lower: 600000, upper: 2400000

Return the lower and upper bounds in USD."""

        parsed = await self.check_size_parser_llm.ainvoke(request)
        low = positive_or_none(parsed.lower_bound_usd)
        high = positive_or_none(parsed.upper_bound_usd)
        return low, high
    except Exception as e:
        print(f"Error parsing check size range '{estimated_investment_str}': {e}")
        return None, None
|
|
|
|
def parse_json_profile(self, json_str: str) -> Optional[dict]:
    """
    Decode the JSON profile cell of a CSV row.

    Returns the decoded object when it is a JSON object (dict). Returns None
    when the cell is empty or NaN, when it is not valid JSON, when it is not
    a type ``json.loads`` accepts, or when the document is valid JSON but not
    an object (callers use ``.get`` on the result, so a list or a number is
    unusable). Failures are reported with ``print``.
    """
    if not json_str or pd.isna(json_str):
        return None

    try:
        profile = json.loads(json_str)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers cells that pandas decoded as a non-string value.
        print(f"Error parsing JSON: {e}")
        return None

    if not isinstance(profile, dict):
        print(
            f"Error parsing JSON: expected an object, got {type(profile).__name__}"
        )
        return None
    return profile
|
|
|
|
async def process_investor_profile(
    self, name: str, website: str, profile_json: str
) -> Optional[dict]:
    """
    Build a plain-dict investor record from one CSV row.

    Fields are copied from the decoded JSON profile by hand; the LLM helpers
    are used only to convert the AUM and fund sizes to USD
    (``convert_to_usd``) and to split the estimated investment size into
    cheque bounds (``parse_check_size_range``).

    List-valued profile fields are tolerated in three shapes: a JSON array,
    ``null`` (treated as empty) or a single scalar/object (treated as a
    one-element list). Stage and sector names are reduced to distinct,
    non-blank strings so they can be used directly as lookup keys.

    Returns the record, or None when the profile cannot be decoded or an
    unexpected error occurs (the error is printed).
    """

    def as_list(value) -> list:
        # null -> [], array -> list, anything else -> single-element list.
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def clean_names(value) -> list:
        # Distinct, stripped, non-empty strings in first-seen order.
        names = []
        for item in as_list(value):
            if not isinstance(item, str):
                continue
            label = item.strip()
            if label and label not in names:
                names.append(label)
        return names

    profile = self.parse_json_profile(profile_json)
    if not profile:
        return None

    try:
        investor_data = {
            "name": name.strip() if name else None,
            "website": website.strip() if website else None,
            "headquarters": profile.get("headquarters"),
            "description": profile.get("investorDescription"),
            "aum": None,
            "aum_as_of_date": None,
            "aum_source_url": None,
            "investment_thesis": profile.get("investmentThesisFocus", []),
            "portfolio_highlights": profile.get("portfolioHighlights", []),
            "linked_documents": profile.get("linkedDocuments", []),
            "researcher_notes": profile.get("researcherNotes"),
            "missing_important_fields": profile.get("missingImportantFields", []),
            "sources": profile.get("sources", {}),
            "team_members": [],
            "funds": [],
        }

        # Assets under management: date and source are kept even when the
        # amount itself could not be converted.
        aum_data = profile.get("overallAssetsUnderManagement", {})
        if aum_data and isinstance(aum_data, dict):
            aum_amount = aum_data.get("aumAmount")
            if aum_amount and aum_amount != "Not Available":
                investor_data["aum"] = await self.convert_to_usd(aum_amount)
                investor_data["aum_as_of_date"] = aum_data.get("asOfDate")
                investor_data["aum_source_url"] = aum_data.get("sourceUrl")

        # Senior leadership: entries without a name are ignored.
        for member in as_list(profile.get("seniorLeadership")):
            if isinstance(member, dict) and member.get("name"):
                investor_data["team_members"].append(
                    {
                        "name": member.get("name"),
                        "title": member.get("title"),
                        "role": member.get("title"),  # the title doubles as the role
                        "email": None,
                        "source_url": member.get("sourceUrl"),
                    }
                )

        for fund in as_list(profile.get("funds")):
            if not isinstance(fund, dict):
                continue

            fund_data = {
                "fund_name": fund.get("fundName"),
                "fund_size": None,
                "fund_size_source_url": fund.get("fundSizeSourceUrl"),
                "check_size_lower": None,
                "check_size_upper": None,
                "source_url": fund.get("sourceUrl"),
                "source_provider": fund.get("sourceProvider"),
                "geographic_focus": None,
                "investment_stage_names": clean_names(
                    fund.get("investmentStageFocus")
                ),
                "sector_names": clean_names(fund.get("sectorFocus")),
            }

            # Geographic focus is stored as one comma-separated string; a
            # bare string in the profile is kept as is.
            regions = [
                region
                for region in as_list(fund.get("geographicFocus"))
                if isinstance(region, str) and region
            ]
            if regions:
                fund_data["geographic_focus"] = ", ".join(regions)

            fund_size_str = fund.get("fundSize")
            if fund_size_str and fund_size_str != "Not Available":
                fund_size_usd = await self.convert_to_usd(fund_size_str)
                if fund_size_usd:
                    fund_data["fund_size"] = fund_size_usd

            est_size_str = fund.get("estimatedInvestmentSize")
            if est_size_str and est_size_str != "Not Available":
                check_lower, check_upper = await self.parse_check_size_range(
                    est_size_str
                )
                if check_lower is not None:
                    fund_data["check_size_lower"] = check_lower
                if check_upper is not None:
                    fund_data["check_size_upper"] = check_upper

            investor_data["funds"].append(fund_data)

        return investor_data

    except Exception as e:
        print(f"Error processing investor profile for {name}: {e}")
        return None
|
|
|
|
async def process_company_profile(
    self,
    name: str,
    website: str,
    profile_json: str,
    investor_names: Optional[str] = None,
) -> Optional[dict]:
    """
    Build a plain-dict company record from one CSV row, without any LLM call.

    ``investor_names`` is the raw comma-separated "Investor" cell; blank
    entries are dropped. Executives are taken from ``keyExecutives`` and fall
    back to ``seniorLeadership``; ``null`` or a single object are accepted in
    place of an array. The founding year is guessed from the description with
    a few regular expressions and is accepted only when it lies between 1900
    and the current calendar year.

    Returns the record, or None when the profile cannot be decoded or an
    unexpected error occurs (the error is printed).
    """
    from datetime import date

    profile = self.parse_json_profile(profile_json)
    if not profile:
        return None

    try:
        company_data = {
            "name": name.strip() if name else None,
            "website": website.strip() if website else None,
            "description": profile.get("companyDescription"),
            "location": profile.get("geographicFocus"),
            "industry": profile.get("sectorDescription"),
            "founded_year": None,  # filled in below when the description mentions it
            "key_executives": [],
            "client_categories": profile.get("clientCategories", []),
            "product_description": profile.get("productDescription"),
            "linked_documents": profile.get("linkedDocuments", []),
            "researcher_notes": profile.get("researcherNotes"),
            "missing_important_fields": profile.get("missingImportantFields", []),
            "sources": profile.get("sources", {}),
            "investor_names": [],
        }

        if investor_names and pd.notna(investor_names):
            pieces = (part.strip() for part in str(investor_names).split(","))
            company_data["investor_names"] = [part for part in pieces if part]

        # Leadership may live under either key; tolerate null and a lone object.
        executives = (
            profile.get("keyExecutives") or profile.get("seniorLeadership") or []
        )
        if isinstance(executives, dict):
            executives = [executives]
        elif not isinstance(executives, (list, tuple)):
            executives = []

        for exec_member in executives:
            if isinstance(exec_member, dict) and exec_member.get("name"):
                company_data["key_executives"].append(
                    {
                        "name": exec_member.get("name"),
                        "title": exec_member.get("title"),
                        "source_url": exec_member.get("sourceUrl"),
                    }
                )

        description = company_data.get("description")
        if isinstance(description, str) and description:
            latest_year = date.today().year
            year_patterns = (
                r"founded in (\d{4})",
                r"founded (\d{4})",
                r"Gegründet (\d{4})",
                r"established in (\d{4})",
                r"since (\d{4})",
                r"\((\d{4})\)",  # bare year in parentheses
            )
            for pattern in year_patterns:
                match = re.search(pattern, description, re.IGNORECASE)
                if match is None:
                    continue
                year = int(match.group(1))
                # An implausible year falls through to the next pattern.
                if 1900 <= year <= latest_year:
                    company_data["founded_year"] = year
                    break

        return company_data

    except Exception as e:
        print(f"Error processing company profile for {name}: {e}")
        return None
|
|
|
|
def _save_parsed_company_to_db(
    self, db: Session, company_data: dict
) -> Optional[CompanyTable]:
    """
    Upsert one manually parsed company, its executives and its investor links.

    A company is matched by exact name. For an existing row, each scalar
    column is overwritten only when the parsed record has a truthy value,
    and its members are replaced wholesale. Investors are linked only when a
    row with the same (stripped) name already exists.

    Nothing is committed here. On any error the session is rolled back, the
    error is printed and None is returned.
    """
    try:
        company = db.query(CompanyTable).filter_by(name=company_data["name"]).first()
        is_update = bool(company)

        if is_update:
            for column in ("website", "location", "description", "industry"):
                incoming = company_data.get(column)
                setattr(company, column, incoming or getattr(company, column))
            if company_data.get("founded_year"):
                company.founded_year = company_data["founded_year"]
        else:
            company = CompanyTable(
                name=company_data["name"],
                website=company_data.get("website"),
                location=company_data.get("location"),
                description=company_data.get("description"),
                industry=company_data.get("industry"),
                founded_year=company_data.get("founded_year"),
            )
            db.add(company)
            db.flush()  # assigns company.id for the member rows below

        # Executives are rebuilt from scratch on every import.
        if is_update:
            db.query(CompanyMember).filter_by(company_id=company.id).delete()

        db.add_all(
            [
                CompanyMember(
                    name=person.get("name"),
                    role=person.get("title"),
                    # the model has no dedicated column, so the source URL
                    # is stored in the linkedin field
                    linkedin=person.get("source_url"),
                    company_id=company.id,
                )
                for person in company_data.get("key_executives", [])
            ]
        )

        for raw_name in company_data.get("investor_names", []):
            backer = (
                db.query(InvestorTable).filter_by(name=raw_name.strip()).first()
            )
            if backer and company not in backer.portfolio_companies:
                backer.portfolio_companies.append(company)

        return company

    except Exception as e:
        print(f"Error saving company to database: {e}")
        db.rollback()
        return None
|
|
|
|
def _save_parsed_investor_to_db(
    self, db: Session, investor_data: dict
) -> Optional[InvestorTable]:
    """
    Upsert one manually parsed investor with its team members and funds.

    An investor is matched by exact name. For an existing row, each column is
    overwritten only when the parsed record has a truthy value; team members
    and funds are replaced wholesale.

    Existing funds are removed through the session (``db.delete``) rather
    than with a bulk ``Query.delete()`` so that the ORM also clears their
    rows in the stage/sector association tables. Stage and sector names are
    reduced to distinct, non-blank strings before linking, so one fund is
    never linked to the same stage or sector twice.

    Nothing is committed here. On any error the session is rolled back, the
    error is printed and None is returned.
    """

    def unique_names(raw) -> list:
        # Accept a list, a lone string or nothing; keep first-seen order.
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            raw = []
        names = []
        for item in raw:
            if not isinstance(item, str):
                continue
            label = item.strip()
            if label and label not in names:
                names.append(label)
        return names

    columns = (
        "website",
        "headquarters",
        "description",
        "aum",
        "aum_as_of_date",
        "aum_source_url",
        "investment_thesis",
        "portfolio_highlights",
        "linked_documents",
        "researcher_notes",
        "missing_important_fields",
        "sources",
    )

    try:
        existing_investor = (
            db.query(InvestorTable).filter_by(name=investor_data["name"]).first()
        )

        if existing_investor:
            investor = existing_investor
            for column in columns:
                incoming = investor_data.get(column)
                setattr(investor, column, incoming or getattr(investor, column))
        else:
            investor = InvestorTable(
                name=investor_data["name"],
                **{column: investor_data.get(column) for column in columns},
            )
            db.add(investor)
            db.flush()  # assigns investor.id for the child rows below

        # Team members carry no further relationships here, so a bulk delete
        # is sufficient.
        if existing_investor:
            db.query(InvestorMember).filter_by(investor_id=investor.id).delete()

        for member_data in investor_data.get("team_members", []):
            db.add(
                InvestorMember(
                    name=member_data.get("name"),
                    role=member_data.get("role"),
                    title=member_data.get("title"),
                    email=member_data.get("email"),
                    source_url=member_data.get("source_url"),
                    investor_id=investor.id,
                )
            )

        # Funds own many-to-many links; delete them object by object so the
        # unit of work removes the association rows as well.
        if existing_investor:
            stale_funds = (
                db.query(FundTable).filter_by(investor_id=investor.id).all()
            )
            for stale_fund in stale_funds:
                db.delete(stale_fund)
            db.flush()

        for fund_data in investor_data.get("funds", []):
            fund = FundTable(
                investor_id=investor.id,
                fund_name=fund_data.get("fund_name"),
                fund_size=fund_data.get("fund_size"),  # integer USD or None
                fund_size_source_url=fund_data.get("fund_size_source_url"),
                check_size_lower=fund_data.get("check_size_lower"),
                check_size_upper=fund_data.get("check_size_upper"),
                source_url=fund_data.get("source_url"),
                source_provider=fund_data.get("source_provider"),
                geographic_focus=fund_data.get("geographic_focus"),  # plain string
            )
            db.add(fund)
            db.flush()  # assigns fund.id before the links are created

            for stage_name in unique_names(fund_data.get("investment_stage_names")):
                fund.investment_stages.append(
                    self._get_or_create_investment_stage(db, stage_name)
                )

            for sector_name in unique_names(fund_data.get("sector_names")):
                fund.sectors.append(self._get_or_create_sector(db, sector_name))

        return investor

    except Exception as e:
        print(f"Error saving investor to database: {e}")
        db.rollback()
        return None
|
|
|
|
def _get_or_create_investment_stage(
    self, db: Session, stage_name: str
) -> InvestmentStageTable:
    """
    Return the investment stage row with this exact name, creating it if needed.

    A newly created row is added to the session and flushed (so it has an
    id) but not committed; the caller owns the transaction.
    """
    # InvestmentStageTable comes from the module-level import; no local
    # re-import is needed.
    stage = (
        db.query(InvestmentStageTable)
        .filter(InvestmentStageTable.name == stage_name)
        .first()
    )
    if not stage:
        stage = InvestmentStageTable(name=stage_name)
        db.add(stage)
        db.flush()
    return stage
|
|
|
|
def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable:
    """
    Return the sector row with this exact name, creating it if needed.

    A newly created row is added to the session and flushed (so it has an
    id) but not committed; the caller owns the transaction.
    """
    by_name = db.query(SectorTable).filter(SectorTable.name == sector_name)
    found = by_name.first()
    if found:
        return found

    created = SectorTable(name=sector_name)
    db.add(created)
    db.flush()
    return created
|
|
|
|
def _save_investor_to_db(
    self, db: Session, investor_data: InvestorData
) -> InvestorTable:
    """
    Insert an investor described by an ``InvestorData`` model.

    Always creates a new investor row (no lookup for an existing one), then
    adds its team members, links its sectors and links its portfolio
    companies, each of which is saved through ``_save_company_to_db`` with
    investor linking skipped. Nothing is committed and no exception is
    handled here.
    """
    details = investor_data.investor
    investor = InvestorTable(
        name=details.name,
        description=details.description,
        aum=details.aum,
        check_size_lower=details.check_size_lower,
        check_size_upper=details.check_size_upper,
        geographic_focus=details.geographic_focus,
        number_of_investments=details.number_of_investments,
    )
    db.add(investor)
    db.flush()  # assigns investor.id for the member rows below

    for person in investor_data.team_members:
        db.add(
            InvestorMember(
                name=person.name,
                role=person.role,
                email=person.email,
                investor_id=investor.id,
            )
        )

    for sector_schema in investor_data.sectors:
        investor.sectors.append(self._get_or_create_sector(db, sector_schema.name))

    for company_schema in investor_data.portfolio_companies:
        # Portfolio entries carry only the company itself, so the wrapper
        # gets empty sector/member/investor lists.
        wrapped = CompanyData(
            company=company_schema,
            sectors=[],
            members=[],
            investors=[],
        )
        saved = self._save_company_to_db(db, wrapped, skip_investors=True)
        investor.portfolio_companies.append(saved)

    return investor
|
|
|
|
def _save_company_to_db(
    self, db: Session, company_data: CompanyData, skip_investors: bool = False
) -> CompanyTable:
    """
    Insert a company described by a ``CompanyData`` model, or reuse an existing one.

    When a company with the same name already exists it is returned
    unchanged. Otherwise the row is created together with its named members
    and its sectors. Unless ``skip_investors`` is set, the company is also
    linked to every listed investor that already exists by name. Nothing is
    committed and no exception is handled here.
    """
    info = company_data.company

    found = db.query(CompanyTable).filter(CompanyTable.name == info.name).first()
    if found:
        return found

    company = CompanyTable(
        name=info.name,
        industry=info.industry,
        location=info.location,
        description=info.description,
        founded_year=info.founded_year,
        website=info.website,
    )
    db.add(company)
    db.flush()  # assigns company.id for the member rows below

    for person in company_data.members:
        if not person.name:
            continue  # nameless entries are not stored
        db.add(
            CompanyMember(
                name=person.name,
                linkedin=person.linkedin,
                role=person.role,
                company_id=company.id,
            )
        )

    for sector_schema in company_data.sectors:
        company.sectors.append(self._get_or_create_sector(db, sector_schema.name))

    # Callers that are themselves saving an investor skip this step.
    if skip_investors:
        return company

    for investor_schema in company_data.investors:
        match = (
            db.query(InvestorTable)
            .filter(InvestorTable.name == investor_schema.name)
            .first()
        )
        if match:
            company.investors.append(match)

    return company
|
|
|
|
async def _process_row(
    self, row: pd.Series, row_idx: int, is_investor: bool = True
) -> Optional[dict]:
    """
    Send one raw CSV row to the structured LLM and return the result as a dict.

    Non-null cells are stringified; line breaks and tabs become spaces and
    every other control character below U+0020 is removed. The cells are
    then joined as ``"column: value, column: value"`` and passed to the
    investor or company wrapper depending on ``is_investor``.

    Returns ``result.model_dump()``, or None when the model returns nothing
    or the call fails (the error is printed).
    """
    # One translation table does both steps: \n, \r, \t -> space, the rest
    # of the C0 control range -> deleted.
    control_map = {code: None for code in range(32)}
    control_map.update({ord(ch): " " for ch in "\n\r\t"})

    cleaned_row = {
        key: str(value).translate(control_map)
        for key, value in row.items()
        if pd.notna(value)
    }
    row_str = ", ".join(f"{key}: {value}" for key, value in cleaned_row.items())

    try:
        print(f"Processing row {row_idx + 1}...")
        if is_investor:
            structured_llm = self.investor_structured_llm
        else:
            structured_llm = self.company_structured_llm
        result = await structured_llm.ainvoke(row_str)
        return result.model_dump() if result else None
    except Exception as e:
        print(f"Error processing row {row_idx + 1}: {e}")
        return None
|
|
|
|
async def parse_investors(self, df: pd.DataFrame, save_to_db: bool = True):
    """
    Parse every investor row of a DataFrame and optionally persist the result.

    Columns read: ``Name``, ``Website`` and ``Final Investor Profile`` (the
    JSON profile). Rows without a name or a profile are skipped. Each row is
    handled independently: a failure is printed, the session is rolled back
    and processing continues with the next row.

    Rows are numbered by their position in the frame (1-based), not by their
    index label, so progress output and batch commits stay correct for
    filtered frames and non-integer indexes.

    Returns the list of parsed investor dicts, including those that could
    not be saved to the database.
    """

    def text_cell(row, column):
        # Stripped text of a cell, or None when the cell is missing or NaN.
        value = row.get(column)
        if pd.isna(value):
            return None
        return str(value).strip()

    results = []
    total_rows = 0  # defined up front so the summary line can always print
    db = None
    if save_to_db:
        db = get_db_session()

    try:
        total_rows = len(df)
        print(f"\n🚀 Starting to process {total_rows} investors...")

        for position, (_, row) in enumerate(df.iterrows(), start=1):
            try:
                name = text_cell(row, "Name")
                website = text_cell(row, "Website")
                raw_profile = row.get("Final Investor Profile")
                profile_json = raw_profile if pd.notna(raw_profile) else None

                if not name or not profile_json:
                    print(f"⚠️ Row {position}: Skipping - missing name or profile")
                    continue

                print(f"\n📊 Processing {position}/{total_rows}: {name}")

                investor_data = await self.process_investor_profile(
                    name, website, profile_json
                )

                if investor_data:
                    results.append(investor_data)
                    print(" ✓ Parsed successfully")
                    print(f" - HQ: {investor_data.get('headquarters')}")
                    print(
                        f" - AUM: ${investor_data.get('aum'):,}"
                        if investor_data.get("aum")
                        else " - AUM: Not Available"
                    )
                    print(f" - Funds: {len(investor_data.get('funds', []))}")
                    print(
                        f" - Team: {len(investor_data.get('team_members', []))}"
                    )

                    if save_to_db and db:
                        try:
                            saved_investor = self._save_parsed_investor_to_db(
                                db, investor_data
                            )
                            if saved_investor:
                                db.commit()
                                print(
                                    f" ✅ Saved to database (ID: {saved_investor.id})"
                                )
                            else:
                                print(" ❌ Failed to save to database")
                        except Exception as e:
                            db.rollback()
                            print(f" ❌ Database error: {e}")
                else:
                    print(" ⚠️ Failed to process profile")

                # Periodic commit every 10 rows, on top of the per-row commit.
                if save_to_db and db and position % 10 == 0:
                    db.commit()
                    print(f"\n💾 Committed batch at row {position}")

            except Exception as e:
                print(f"❌ Error processing row {position}: {e}")
                if db:
                    db.rollback()
                continue

        if save_to_db and db:
            db.commit()
            print("\n✅ Final commit completed")

    except Exception as e:
        print(f"❌ Fatal error in parse_investors: {e}")
        if db:
            db.rollback()
    finally:
        if db:
            db.close()

    print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} investors")
    return results
|
|
|
|
async def parse_companies(self, df: pd.DataFrame, save_to_db: bool = True):
    """
    Parse every company row of a DataFrame and optionally persist the result.

    Columns read: ``Name``, ``Website``, ``Investor`` (comma-separated
    investor names) and ``Final Investor Profile``, which for this sheet
    holds the company's JSON profile. Rows without a name or a profile are
    skipped. Each row is handled independently: a failure is printed, the
    session is rolled back and processing continues with the next row.

    Rows are numbered by their position in the frame (1-based), not by their
    index label, so progress output and batch commits stay correct for
    filtered frames and non-integer indexes.

    Returns the list of parsed company dicts, including those that could not
    be saved to the database.
    """

    def text_cell(row, column):
        # Stripped text of a cell, or None when the cell is missing or NaN.
        value = row.get(column)
        if pd.isna(value):
            return None
        return str(value).strip()

    results = []
    total_rows = 0  # defined up front so the summary line can always print
    db = None
    if save_to_db:
        db = get_db_session()

    try:
        total_rows = len(df)
        print(f"\n🚀 Starting to process {total_rows} companies...")

        for position, (_, row) in enumerate(df.iterrows(), start=1):
            try:
                name = text_cell(row, "Name")
                website = text_cell(row, "Website")
                investor_names = text_cell(row, "Investor")
                raw_profile = row.get("Final Investor Profile")
                profile_json = raw_profile if pd.notna(raw_profile) else None

                if not name or not profile_json:
                    print(f"⚠️ Row {position}: Skipping - missing name or profile")
                    continue

                print(f"\n📊 Processing {position}/{total_rows}: {name}")

                company_data = await self.process_company_profile(
                    name, website, profile_json, investor_names
                )

                if company_data:
                    results.append(company_data)
                    print(" ✓ Parsed successfully")
                    print(f" - Location: {company_data.get('location')}")
                    print(f" - Industry: {company_data.get('industry')}")
                    print(
                        f" - Founded: {company_data.get('founded_year')}"
                        if company_data.get("founded_year")
                        else " - Founded: Unknown"
                    )
                    print(
                        f" - Executives: {len(company_data.get('key_executives', []))}"
                    )
                    print(
                        f" - Investors: {len(company_data.get('investor_names', []))}"
                    )

                    if save_to_db and db:
                        try:
                            saved_company = self._save_parsed_company_to_db(
                                db, company_data
                            )
                            if saved_company:
                                db.commit()
                                print(
                                    f" ✅ Saved to database (ID: {saved_company.id})"
                                )
                            else:
                                print(" ❌ Failed to save to database")
                        except Exception as e:
                            db.rollback()
                            print(f" ❌ Database error: {e}")
                else:
                    print(" ⚠️ Failed to process profile")

                # Periodic commit every 10 rows, on top of the per-row commit.
                if save_to_db and db and position % 10 == 0:
                    db.commit()
                    print(f"\n💾 Committed batch at row {position}")

            except Exception as e:
                print(f"❌ Error processing row {position}: {e}")
                if db:
                    db.rollback()
                continue

        if save_to_db and db:
            db.commit()
            print("\n✅ Final commit completed")

    except Exception as e:
        print(f"❌ Fatal error in parse_companies: {e}")
        if db:
            db.rollback()
    finally:
        if db:
            db.close()

    print(f"\n🎉 Completed! Processed {len(results)}/{total_rows} companies")
    return results
|
|
|
|
|
|
# async def main():
|
|
# """Main execution function"""
|
|
# # Initialize database tables
|
|
# print("🔧 Initializing database...")
|
|
# init_database()
|
|
|
|
# # Create processor
|
|
# processor = InvestorProcessor()
|
|
|
|
# print("📊 Processing companies...")
|
|
# companies = await processor.parse_companies(
|
|
# "data/19 Companies data.csv", save_to_db=True
|
|
# )
|
|
# print(f"Processed {len(companies)} companies")
|
|
|
|
# print("\n💰 Processing investors...")
|
|
# investors = await processor.parse_investors(
|
|
# "data/19 Investors data.csv", save_to_db=True
|
|
# )
|
|
# print(f"Processed {len(investors)} investors")
|
|
# print("\n✨ Processing complete!")
|
|
|
|
|
|
# if __name__ == "__main__":
|
|
# asyncio.run(main())
|