Implement LLM-powered Investor Parser with CSV processing, SQL and vector database integration
- Added FastAPI application with a simple root endpoint.
- Developed LLMInvestorParser class for processing investor data from CSV files.
- Integrated OpenAI API for LLM enhancements and JSON cleaning.
- Implemented structured data extraction and saving to SQL database.
- Added functionality to save investor descriptions to ChromaDB for vector similarity search.
- Created command-line interface for processing files and searching investors.
- Added schema definitions for Investor and related data models using SQLAlchemy and Pydantic.
- Implemented logging for better traceability and error handling.
- Included requirements.txt for dependency management.
This commit is contained in:
@@ -0,0 +1,115 @@
|
||||
import json
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import JSON, Column, DateTime, Integer, String, Text
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.sql import func
|
||||
|
||||
# Shared declarative base class; the ORM model(s) in this module subclass it.
# NOTE(review): `sqlalchemy.ext.declarative.declarative_base` is deprecated
# since SQLAlchemy 1.4 in favor of `sqlalchemy.orm.declarative_base` -- confirm
# the pinned SQLAlchemy version before changing the import.
Base = declarative_base()
|
||||
|
||||
|
||||
class Investor(Base):
    """ORM model for one row of the ``investors`` table.

    Besides the structured fields (description, thesis focus, headquarters,
    AUM, funds), the row keeps several raw text columns so the original
    source material stays available for reference.
    """

    __tablename__ = "investors"

    # --- Identity ---------------------------------------------------------
    id = Column(Integer, autoincrement=True, primary_key=True)
    name = Column(String(length=500), nullable=False)  # the only required field
    website = Column(String(length=1000))

    # --- Core investment information --------------------------------------
    investor_description = Column(Text)
    investment_thesis_focus = Column(JSON)  # list of focus areas
    headquarters = Column(String(length=1000))

    # --- Assets under management ------------------------------------------
    # All three are stored as free-form strings, not numeric/date types.
    aum_amount = Column(String(length=200))
    aum_as_of_date = Column(String(length=100))
    aum_source_url = Column(String(length=1000))

    # --- Fund information -------------------------------------------------
    funds_info = Column(JSON)  # complex, nested fund data

    # --- Raw source columns, kept for reference ---------------------------
    crunchbase_urls = Column(Text)
    crunchbase_extract = Column(Text)
    linkedin_profile = Column(Text)
    source_truth_profile = Column(Text)

    # --- Bookkeeping ------------------------------------------------------
    # created_at is filled in by the database server at INSERT time.
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    # updated_at only declares an on-update value; no insert-time default is
    # declared here.
    updated_at = Column(DateTime(timezone=True), onupdate=func.now())

    def __repr__(self):
        """Return a short debug representation showing name and website."""
        return "<Investor(name='{}', website='{}')>".format(
            self.name, self.website
        )
|
||||
|
||||
|
||||
# Pydantic models for data validation and parsing
|
||||
class AUMInfo(BaseModel):
    """Assets-under-management details; every field is optional.

    All values are plain strings and default to ``None`` when absent.
    """

    # Amount, kept as text (no numeric parsing is done by this model).
    aumAmount: Optional[str] = None
    # Date the amount refers to, kept as text.
    asOfDate: Optional[str] = None
    # URL of the source the figure was taken from.
    sourceUrl: Optional[str] = None
|
||||
|
||||
|
||||
class FundInfo(BaseModel):
    """Description of a single fund; every field is optional.

    All values are plain strings and default to ``None`` when absent.
    """

    # Name of the fund.
    fundName: Optional[str] = None
    # Size of the fund, kept as text.
    fundSize: Optional[str] = None
    # Vintage, kept as text.
    vintage: Optional[str] = None
    # Status of the fund, kept as text.
    status: Optional[str] = None
    # Free-form description of the fund.
    description: Optional[str] = None
|
||||
|
||||
|
||||
class InvestorProfile(BaseModel):
    """Structured investor profile; every field is optional.

    Nested AUM and fund data are validated through ``AUMInfo`` and
    ``FundInfo`` respectively.
    """

    # Investor's website URL.
    websiteURL: Optional[str] = None
    # Free-form description of the investor.
    investorDescription: Optional[str] = None
    # Investment thesis focus areas, one string per area.
    investmentThesisFocus: Optional[List[str]] = None
    # Headquarters, kept as text.
    headquarters: Optional[str] = None
    # Overall assets under management.
    overallAssetsUnderManagement: Optional[AUMInfo] = None
    # Funds associated with the investor.
    funds: Optional[List[FundInfo]] = None
|
||||
|
||||
|
||||
def _load_profile_json(raw: Optional[str]) -> dict:
    """Parse *raw* as a JSON object, returning ``{}`` when that is not possible.

    An empty/missing value, malformed JSON, and valid JSON whose top level is
    not an object all yield an empty dict, so callers can use ``.get``
    unconditionally.
    """
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # Best effort: an unparsable profile is treated as "no profile".
        return {}
    return parsed if isinstance(parsed, dict) else {}


class CSVRow(BaseModel):
    """One input row: a required ``name`` plus optional raw text columns.

    ``investment_firm_profile`` is read as a JSON document by the helper
    methods below; the remaining text columns are used verbatim.
    """

    name: str
    website: Optional[str] = None
    investment_firm_profile: Optional[str] = None
    crunchbase_linkedin_urls: Optional[str] = None
    crunchbase_firm_extract: Optional[str] = None
    linkedin_investment_profile: Optional[str] = None
    source_of_truth_profile: Optional[str] = None

    def get_combined_description(self) -> str:
        """Combine all description fields for vector embedding.

        Returns:
            The ``investorDescription`` from the JSON profile (when it is a
            non-empty string), followed by the Crunchbase extract, the
            LinkedIn profile and the source-of-truth profile, joined with
            single spaces. Empty or missing pieces are skipped; the result is
            ``""`` when nothing is available.
        """
        parts = []

        profile = _load_profile_json(self.investment_firm_profile)
        description = profile.get("investorDescription", "")
        # The JSON value may be any type; only a string can be joined below,
        # so anything else (list, dict, number, ...) is skipped rather than
        # allowed to raise TypeError in str.join.
        if isinstance(description, str) and description:
            parts.append(description)

        raw_texts = (
            self.crunchbase_firm_extract,
            self.linkedin_investment_profile,
            self.source_of_truth_profile,
        )
        parts.extend(text for text in raw_texts if text)

        return " ".join(parts)

    def get_investment_focus(self) -> List[str]:
        """Extract investment thesis focus.

        Returns:
            The ``investmentThesisFocus`` value from the JSON profile when it
            is a list; otherwise (missing profile, invalid JSON, non-object
            JSON, or a non-list value) an empty list.
        """
        profile = _load_profile_json(self.investment_firm_profile)
        focus = profile.get("investmentThesisFocus", [])
        return focus if isinstance(focus, list) else []
|
||||
Reference in New Issue
Block a user