made version 2

This commit is contained in:
bolade
2025-09-25 17:00:38 +01:00
parent b1b1c5ea1e
commit 0f7beca5e1
42 changed files with 660 additions and 2036 deletions
+298 -329
View File
@@ -1,368 +1,337 @@
import json
import logging
import asyncio
import os
from typing import Any, Dict, Optional
from typing import Optional
import chromadb
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from db import get_session, init_database
from py_schemas import CSVRow, Investor
# Load environment variables
load_dotenv()
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
from db.db import get_db_session
from db.models import (
CompanyMember,
CompanyTable,
InvestorMember,
InvestorTable,
SectorTable,
)
from langchain_openai import ChatOpenAI
from schemas.py_schemas import CompanyData, InvestorData
from sqlalchemy.orm import Session
class LLMInvestorParser:
class InvestorProcessor:
def __init__(self):
# Initialize OpenAI client
self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Initialize ChromaDB
self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
self.collection = self.chroma_client.get_or_create_collection(
name="investor_descriptions",
metadata={
"description": "Investor descriptions and investment thesis focus"
},
self.llm = ChatOpenAI(
api_key=os.getenv("OPENROUTER_API_KEY"),
base_url="https://openrouter.ai/api/v1",
model="openai/gpt-5-nano",
temperature=0,
)
# Initialize database
init_database()
self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
self.company_structured_llm = self.llm.with_structured_output(CompanyData)
def parse_json_field(self, json_str: str) -> Dict[str, Any]:
"""Safely parse JSON string with LLM assistance if needed"""
if not json_str or json_str.strip() == "":
return {}
def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable:
    """Return the sector row named ``sector_name``, inserting it if absent.

    Args:
        db: Session used for the lookup and, when needed, the insert.
        sector_name: Value matched against ``SectorTable.name``.

    Returns:
        The first matching ``SectorTable`` row, or a newly added one. A new
        row is flushed but not committed; committing is left to the caller.
    """
    lookup = db.query(SectorTable).filter(SectorTable.name == sector_name)
    existing = lookup.first()
    if existing:
        return existing

    created = SectorTable(name=sector_name)
    db.add(created)
    # Flush so the new row is sent to the database (and gets its ID)
    # while staying inside the caller's transaction.
    db.flush()
    return created
try:
# Try direct JSON parsing first
return json.loads(json_str)
except json.JSONDecodeError:
# If direct parsing fails, use LLM to clean and parse
logger.info("Direct JSON parsing failed, using LLM to clean JSON")
return self._llm_clean_json(json_str)
def _save_investor_to_db(
self, db: Session, investor_data: InvestorData
) -> InvestorTable:
"""Save investor data to database"""
# Create investor record
investor = InvestorTable(
name=investor_data.investor.name,
description=investor_data.investor.description,
aum=investor_data.investor.aum,
check_size_lower=investor_data.investor.check_size_lower,
check_size_upper=investor_data.investor.check_size_upper,
geographic_focus=investor_data.investor.geographic_focus,
stage_focus=investor_data.investor.stage_focus,
number_of_investments=investor_data.investor.number_of_investments,
)
db.add(investor)
db.flush() # Get the ID
def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
"""Use LLM to clean and parse malformed JSON"""
try:
prompt = f"""
The following text appears to be malformed JSON. Please clean it up and return valid JSON.
If it's not possible to create valid JSON, return an empty object {{}}.
Original text:
{malformed_json[:2000]} # Limit length for API
Return only the cleaned JSON, no explanations:
"""
response = self.openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
temperature=0,
# Add team members
for member_data in investor_data.team_members:
member = InvestorMember(
name=member_data.name,
role=member_data.role,
email=member_data.email,
investor_id=investor.id,
)
db.add(member)
cleaned_json = response.choices[0].message.content.strip()
return json.loads(cleaned_json)
# Add sectors
for sector_data in investor_data.sectors:
sector = self._get_or_create_sector(db, sector_data.name)
investor.sectors.append(sector)
except Exception as e:
logger.error(f"LLM JSON cleaning failed: {e}")
return {}
def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
    """Flatten one CSV row and its embedded firm profile into a plain dict.

    The ``investment_firm_profile`` column is decoded with
    ``self.parse_json_field`` and selected keys of that profile are merged
    with the row's own columns.

    Args:
        csv_row: Row object exposing ``name``, ``website``,
            ``investment_firm_profile``, ``crunchbase_linkedin_urls``,
            ``crunchbase_firm_extract``, ``linkedin_investment_profile``
            and ``source_of_truth_profile``.

    Returns:
        Dict with the keys ``name``, ``website``, ``investor_description``,
        ``investment_thesis_focus``, ``headquarters``, ``aum_info``,
        ``funds_info``, ``crunchbase_urls``, ``crunchbase_extract``,
        ``linkedin_profile`` and ``source_truth_profile``. Missing profile
        entries fall back to empty strings, lists or dicts.
    """
    profile_data = {}
    if csv_row.investment_firm_profile:
        parsed = self.parse_json_field(csv_row.investment_firm_profile)
        # JSON decoding can yield a list, string or number as well as an
        # object; only an object supports the key lookups below, so any
        # other shape is treated as "no profile" instead of crashing on .get.
        if isinstance(parsed, dict):
            profile_data = parsed

    return {
        "name": csv_row.name,
        # The row's own website column wins; the profile URL is the fallback.
        "website": csv_row.website or profile_data.get("websiteURL"),
        "investor_description": profile_data.get("investorDescription", ""),
        "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
        "headquarters": profile_data.get("headquarters", ""),
        "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
        "funds_info": profile_data.get("funds", []),
        "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
        "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
        "linkedin_profile": csv_row.linkedin_investment_profile or "",
        "source_truth_profile": csv_row.source_of_truth_profile or "",
    }
def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
"""Use LLM to enhance and standardize investor data"""
try:
# Combine all available text for context
context_text = " ".join(
[
investor_data.get("investor_description", ""),
investor_data.get("crunchbase_extract", ""),
investor_data.get("linkedin_profile", ""),
investor_data.get("source_truth_profile", ""),
]
# Add portfolio companies
for company_schema in investor_data.portfolio_companies:
# Convert CompanySchema to CompanyData format
company_data = CompanyData(
company=company_schema,
sectors=[], # Will be empty for portfolio companies
members=[], # Will be empty for portfolio companies
investors=[], # Will be empty for portfolio companies
)
company = self._save_company_to_db(db, company_data, skip_investors=True)
investor.portfolio_companies.append(company)
if not context_text.strip():
return investor_data
return investor
prompt = f"""
Based on the following information about an investor, please extract and standardize:
1. A concise investor description (2-3 sentences)
2. Investment thesis focus areas (list of specific focus areas)
3. Headquarters location (city, country format)
Investor: {investor_data["name"]}
Context: {context_text[:3000]} # Limit for API
Return in JSON format:
{{
"enhanced_description": "concise description here",
"standardized_focus": ["focus area 1", "focus area 2", ...],
"standardized_headquarters": "City, Country"
}}
"""
def _save_company_to_db(
self, db: Session, company_data: CompanyData, skip_investors: bool = False
) -> CompanyTable:
"""Save company data to database"""
# Check if company already exists
existing_company = (
db.query(CompanyTable)
.filter(CompanyTable.name == company_data.company.name)
.first()
)
if existing_company:
return existing_company
response = self.openai_client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": prompt}],
temperature=0.3,
)
# Create company record
company = CompanyTable(
name=company_data.company.name,
industry=company_data.company.industry,
location=company_data.company.location,
description=company_data.company.description,
founded_year=company_data.company.founded_year,
website=company_data.company.website,
)
db.add(company)
db.flush() # Get the ID
enhanced_data = json.loads(response.choices[0].message.content)
# Add company members
for member_data in company_data.members:
if member_data.name: # Only add members with names
member = CompanyMember(
name=member_data.name,
linkedin=member_data.linkedin,
role=member_data.role,
company_id=company.id,
)
db.add(member)
# Update investor data with enhanced information
if enhanced_data.get("enhanced_description"):
investor_data["enhanced_description"] = enhanced_data[
"enhanced_description"
]
# Add sectors
for sector_data in company_data.sectors:
sector = self._get_or_create_sector(db, sector_data.name)
company.sectors.append(sector)
if enhanced_data.get("standardized_focus"):
investor_data["standardized_focus"] = enhanced_data[
"standardized_focus"
]
if enhanced_data.get("standardized_headquarters"):
investor_data["standardized_headquarters"] = enhanced_data[
"standardized_headquarters"
]
return investor_data
except Exception as e:
logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
return investor_data
def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
"""Save investor data to SQL database"""
try:
with get_session() as session:
# Check if investor already exists
existing = (
session.query(Investor)
.filter_by(name=investor_data["name"])
# Add investors (if not skipping to avoid circular references)
if not skip_investors:
for investor_data in company_data.investors:
# Look for existing investor by name
existing_investor = (
db.query(InvestorTable)
.filter(InvestorTable.name == investor_data.name)
.first()
)
if existing_investor:
company.investors.append(existing_investor)
if existing:
logger.info(f"Updating existing investor: {investor_data['name']}")
investor = existing
else:
logger.info(f"Creating new investor: {investor_data['name']}")
investor = Investor()
return company
# Map data to investor object
investor.name = investor_data["name"]
investor.website = investor_data.get("website")
investor.investor_description = investor_data.get(
"enhanced_description"
) or investor_data.get("investor_description")
investor.investment_thesis_focus = investor_data.get(
"standardized_focus"
) or investor_data.get("investment_thesis_focus")
investor.headquarters = investor_data.get(
"standardized_headquarters"
) or investor_data.get("headquarters")
# AUM information
aum_info = investor_data.get("aum_info", {})
investor.aum_amount = aum_info.get("aumAmount")
investor.aum_as_of_date = aum_info.get("asOfDate")
investor.aum_source_url = aum_info.get("sourceUrl")
# Fund information
investor.funds_info = investor_data.get("funds_info", [])
# Raw data
investor.crunchbase_urls = investor_data.get("crunchbase_urls")
investor.crunchbase_extract = investor_data.get("crunchbase_extract")
investor.linkedin_profile = investor_data.get("linkedin_profile")
investor.source_truth_profile = investor_data.get(
"source_truth_profile"
async def _process_row(
self, row: pd.Series, row_idx: int, is_investor: bool = True
) -> Optional[InvestorData | CompanyData]:
"""Process a single row of data"""
# Clean values to remove control characters
cleaned_row = {}
for key, value in row.items():
if pd.notna(value):
# Convert to string and clean control characters
clean_value = (
str(value).replace("\n", " ").replace("\r", " ").replace("\t", " ")
)
# Remove other control characters
clean_value = "".join(
char
for char in clean_value
if ord(char) >= 32 or char in ["\n", "\r", "\t"]
)
cleaned_row[key] = clean_value
if not existing:
session.add(investor)
session.flush() # Get the ID
return investor.id
except Exception as e:
logger.error(f"Failed to save to SQL: {e}")
raise
def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
"""Save investor description and focus to ChromaDB"""
row_str = ", ".join([f"{key}: {value}" for key, value in cleaned_row.items()])
try:
# Prepare text for embedding
description_text = investor_data.get(
"enhanced_description"
) or investor_data.get("investor_description", "")
focus_areas = investor_data.get("standardized_focus") or investor_data.get(
"investment_thesis_focus", []
)
if isinstance(focus_areas, list):
focus_text = " ".join(focus_areas)
print(f"Processing row {row_idx + 1}...")
if is_investor:
result = await self.investor_structured_llm.ainvoke(row_str)
else:
focus_text = str(focus_areas)
# Combine description and focus for embedding
combined_text = f"{description_text} {focus_text}".strip()
if not combined_text:
logger.warning(f"No text to embed for investor {investor_data['name']}")
return
# Create metadata
metadata = {
"investor_id": investor_id,
"name": investor_data["name"],
"website": investor_data.get("website", ""),
"headquarters": investor_data.get("standardized_headquarters")
or investor_data.get("headquarters", ""),
"focus_areas_count": len(focus_areas)
if isinstance(focus_areas, list)
else 0,
}
# Add to ChromaDB
self.collection.add(
documents=[combined_text],
metadatas=[metadata],
ids=[f"investor_{investor_id}"],
)
logger.info(f"Added investor {investor_data['name']} to vector database")
result = await self.company_structured_llm.ainvoke(row_str)
if result:
return result.model_dump()
return None
except Exception as e:
logger.error(f"Failed to save to vector DB: {e}")
def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
    """Run every row of a CSV file through the investor pipeline.

    Each row is wrapped in a ``CSVRow``, passed through
    ``extract_structured_data`` and ``enhance_with_llm``, then stored with
    ``save_to_sql`` and ``save_to_vector_db``. A failure in any step is
    logged and counted, and processing continues with the next row.

    Args:
        csv_file_path: Path of the CSV file to read. A ``Name`` column is
            required; the other columns are read with ``row.get``.
        limit: When not ``None``, only the first ``limit`` rows are
            processed (``0`` processes nothing).

    Returns:
        Tuple ``(processed_count, error_count)``.
    """

    def _optional(row, column):
        # read_csv represents empty cells as NaN, which is a truthy float.
        # Downstream code relies on `or` fallbacks and string methods, so
        # hand it None for "no value" instead.
        value = row.get(column)
        return None if pd.isna(value) else value

    logger.info(f"Starting to process CSV file: {csv_file_path}")

    df = pd.read_csv(csv_file_path)
    logger.info(f"Loaded {len(df)} rows from CSV")

    # Compare against None so an explicit limit of 0 means "no rows"
    # rather than silently processing the whole file.
    if limit is not None:
        df = df.head(limit)
        logger.info(f"Processing limited to {limit} rows")

    processed_count = 0
    error_count = 0

    for index, row in df.iterrows():
        try:
            logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")

            csv_row = CSVRow(
                name=row["Name"],
                website=_optional(row, "Website"),
                investment_firm_profile=_optional(row, "Investment Firm Profile"),
                crunchbase_linkedin_urls=_optional(row, "Crunchbase & LinkedIn URLs"),
                crunchbase_firm_extract=_optional(row, "Crunchbase Firm Extract"),
                linkedin_investment_profile=_optional(
                    row, "LinkedIn Investment Profile"
                ),
                source_of_truth_profile=_optional(row, "Source of Truth Profile"),
            )

            structured_data = self.extract_structured_data(csv_row)
            enhanced_data = self.enhance_with_llm(structured_data)

            # The SQL row must exist first: its ID keys the vector entry.
            investor_id = self.save_to_sql(enhanced_data)
            self.save_to_vector_db(investor_id, enhanced_data)

            processed_count += 1

            # Progress update every 10 rows
            if (index + 1) % 10 == 0:
                logger.info(
                    f"Processed {processed_count} rows successfully, {error_count} errors"
                )
        except Exception as e:
            # Best effort: one bad row must not abort the whole import.
            error_count += 1
            logger.error(
                f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
            )
            continue

    logger.info(
        f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
    )
    return processed_count, error_count
def search_investors(self, query: str, limit: int = 5):
"""Search investors using vector similarity"""
try:
results = self.collection.query(query_texts=[query], n_results=limit)
return results
except Exception as e:
logger.error(f"Search failed: {e}")
print(f"Error processing row {row_idx + 1}: {e}")
return None
async def parse_investors(self, df, save_to_db: bool = True):
"""Parse investors from DataFrame and optionally save to database"""
investors = []
def main():
"""Main function to run the parser"""
parser = LLMInvestorParser()
db = None
if save_to_db:
db = get_db_session()
# Process the CSV file
csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
try:
# Process rows in batches asynchronously
batch_size = 15 # Adjust batch size as needed
rows = [(idx, row) for idx, row in df.iterrows()]
# Start with a small sample for testing
processed, errors = parser.process_csv_file(csv_file, limit=5)
for i in range(0, len(rows), batch_size):
batch = rows[i : i + batch_size]
print("\nProcessing complete!")
print(f"Successfully processed: {processed} investors")
print(f"Errors encountered: {errors}")
# Process batch asynchronously
tasks = [
self._process_row(row, idx, is_investor=True) for idx, row in batch
]
# Test search functionality
print("\nTesting search functionality...")
results = parser.search_investors("bioeconomy circular economy")
if results:
print(f"Found {len(results['documents'][0])} similar investors")
for i, doc in enumerate(results["documents"][0]):
print(f" {i + 1}. {results['metadatas'][0][i]['name']}")
batch_results = await asyncio.gather(*tasks, return_exceptions=True)
# Handle results from batch
for (idx, row), result in zip(batch, batch_results):
if isinstance(result, Exception):
print(f"Error processing row {idx}: {result}")
if db:
db.rollback()
continue
if result:
# Convert dict to InvestorData if needed
if isinstance(result, dict):
investor_data = InvestorData(**result)
else:
investor_data = result
investors.append(investor_data)
# Save to database if requested
if save_to_db and db:
try:
saved_investor = self._save_investor_to_db(
db, investor_data
)
db.commit()
print(
f"✅ Saved investor '{saved_investor.name}' to database"
)
except Exception as e:
db.rollback()
print(f"❌ Failed to save investor to database: {e}")
print(
f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
)
except Exception as e:
print(f"Error in batch processing: {e}")
if db:
db.rollback()
finally:
if db:
db.close()
return investors
async def parse_companies(self, df, save_to_db: bool = True):
    """Parse companies from a DataFrame and optionally save them.

    Rows are handed to ``self._process_row`` (with ``is_investor=False``)
    in concurrent batches. Each usable result is collected and, when
    ``save_to_db`` is true, saved and committed on its own so that a later
    failure does not undo companies that were already stored.

    Args:
        df: Object exposing ``iterrows()`` (a pandas DataFrame), one
            company per row.
        save_to_db: When true, open a session with ``get_db_session()`` and
            store each company through ``self._save_company_to_db``.

    Returns:
        List of parsed ``CompanyData`` objects in row order. Row-level and
        batch-level failures are printed and skipped rather than raised.
    """
    companies = []
    db = get_db_session() if save_to_db else None

    try:
        batch_size = 15  # Rows processed concurrently; adjust as needed
        rows = list(df.iterrows())
        total_batches = (len(rows) + batch_size - 1) // batch_size

        for batch_no, start in enumerate(range(0, len(rows), batch_size), start=1):
            batch = rows[start : start + batch_size]

            tasks = [
                self._process_row(row, idx, is_investor=False) for idx, row in batch
            ]
            # return_exceptions=True keeps one failing row from cancelling
            # the rest of its batch; failures come back as result values.
            batch_results = await asyncio.gather(*tasks, return_exceptions=True)

            for (idx, _row), result in zip(batch, batch_results):
                if isinstance(result, Exception):
                    print(f"Error processing row {idx}: {result}")
                    if db:
                        db.rollback()
                    continue

                if not result:
                    continue

                # _process_row returns a plain dict (model_dump), so
                # rebuild the schema object before using it.
                if isinstance(result, dict):
                    company_data = CompanyData(**result)
                else:
                    company_data = result
                companies.append(company_data)

                if db:
                    try:
                        saved_company = self._save_company_to_db(db, company_data)
                        db.commit()
                        print(f"✅ Saved company '{saved_company.name}' to database")
                    except Exception as e:
                        db.rollback()
                        print(f"❌ Failed to save company to database: {e}")

            print(f"Completed batch {batch_no} of {total_batches}")

    except Exception as e:
        # This handler can run before any row index exists (for example
        # when iterating `df` itself fails), so the message must not refer
        # to a row; doing so raised UnboundLocalError and hid the real error.
        print(f"Error in batch processing: {e}")
        if db:
            db.rollback()
    finally:
        if db:
            db.close()

    return companies
if __name__ == "__main__":
main()
# async def main():
# """Main execution function"""
# # Initialize database tables
# print("🔧 Initializing database...")
# init_database()
# # Create processor
# processor = InvestorProcessor()
# print("📊 Processing companies...")
# companies = await processor.parse_companies(
# "data/19 Companies data.csv", save_to_db=True
# )
# print(f"Processed {len(companies)} companies")
# print("\n💰 Processing investors...")
# investors = await processor.parse_investors(
# "data/19 Investors data.csv", save_to_db=True
# )
# print(f"Processed {len(investors)} investors")
# print("\n✨ Processing complete!")
# if __name__ == "__main__":
# asyncio.run(main())