Remove deprecated demo, ingest, schema, and test parser files; add new LLM parser implementation and settings configuration
This commit is contained in:
@@ -1,82 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Quick demonstration of the LLM Investor Parser functionality.
|
|
||||||
This script shows how to use the system programmatically.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from sqlalchemy import func, select
|
|
||||||
|
|
||||||
from db import get_session
|
|
||||||
from investor_parser import InvestorParser
|
|
||||||
from schema import Investor
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
print("🚀 LLM Investor Parser Demo")
|
|
||||||
print("=" * 50)
|
|
||||||
|
|
||||||
# Initialize parser (without LLM for demo)
|
|
||||||
parser = InvestorParser(use_llm=False)
|
|
||||||
|
|
||||||
# Show current database stats
|
|
||||||
with get_session() as session:
|
|
||||||
count = session.scalar(select(func.count(Investor.id)))
|
|
||||||
print(f"📊 Current database: {count} investors")
|
|
||||||
|
|
||||||
# Demonstrate search functionality
|
|
||||||
print("\n🔍 Search Examples:")
|
|
||||||
|
|
||||||
search_queries = [
|
|
||||||
"circular bioeconomy sustainable",
|
|
||||||
"venture capital early stage",
|
|
||||||
"fintech financial technology",
|
|
||||||
"healthcare biotechnology",
|
|
||||||
"climate sustainability",
|
|
||||||
]
|
|
||||||
|
|
||||||
for query in search_queries:
|
|
||||||
print(f"\n🔎 Searching for: '{query}'")
|
|
||||||
results = parser.search_investors(query, limit=3)
|
|
||||||
|
|
||||||
if results and results["documents"][0]:
|
|
||||||
for i, metadata in enumerate(results["metadatas"][0]):
|
|
||||||
score = results["distances"][0][i]
|
|
||||||
print(f" {i + 1}. {metadata['name']} (score: {score:.3f})")
|
|
||||||
else:
|
|
||||||
print(" No results found")
|
|
||||||
|
|
||||||
# Show detailed investor information
|
|
||||||
print("\n📋 Detailed Investor Sample:")
|
|
||||||
|
|
||||||
with get_session() as session:
|
|
||||||
investor = session.execute(
|
|
||||||
select(Investor).where(Investor.investor_description.isnot(None)).limit(1)
|
|
||||||
).scalar_one_or_none()
|
|
||||||
|
|
||||||
if investor:
|
|
||||||
print(f"\n🏢 {investor.name}")
|
|
||||||
print(f"🌐 Website: {investor.website}")
|
|
||||||
print(f"📍 HQ: {investor.headquarters or 'Not specified'}")
|
|
||||||
print(f"📝 Description: {investor.investor_description[:200]}...")
|
|
||||||
|
|
||||||
if investor.investment_thesis_focus:
|
|
||||||
print(
|
|
||||||
f"\n🎯 Investment Focus ({len(investor.investment_thesis_focus)} areas):"
|
|
||||||
)
|
|
||||||
for i, focus in enumerate(investor.investment_thesis_focus[:3], 1):
|
|
||||||
print(f" {i}. {focus}")
|
|
||||||
if len(investor.investment_thesis_focus) > 3:
|
|
||||||
print(f" ... and {len(investor.investment_thesis_focus) - 3} more")
|
|
||||||
|
|
||||||
if investor.aum_amount:
|
|
||||||
print(f"\n💰 AUM: {investor.aum_amount}")
|
|
||||||
|
|
||||||
print("\n✅ Demo complete!")
|
|
||||||
print("\nTo run the full parser:")
|
|
||||||
print(" python investor_parser.py --file 'your_file.csv' --limit 50")
|
|
||||||
print("\nTo search investors:")
|
|
||||||
print(" python investor_parser.py --search 'your search query'")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,368 +0,0 @@
|
|||||||
import json
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
import chromadb
|
|
||||||
import pandas as pd
|
|
||||||
from dotenv import load_dotenv
|
|
||||||
from openai import OpenAI
|
|
||||||
|
|
||||||
from db import get_session, init_database
|
|
||||||
from schema import CSVRow, Investor
|
|
||||||
|
|
||||||
# Load environment variables
|
|
||||||
load_dotenv()
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class LLMInvestorParser:
|
|
||||||
def __init__(self):
|
|
||||||
# Initialize OpenAI client
|
|
||||||
self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
|
||||||
|
|
||||||
# Initialize ChromaDB
|
|
||||||
self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
|
|
||||||
self.collection = self.chroma_client.get_or_create_collection(
|
|
||||||
name="investor_descriptions",
|
|
||||||
metadata={
|
|
||||||
"description": "Investor descriptions and investment thesis focus"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize database
|
|
||||||
init_database()
|
|
||||||
|
|
||||||
def parse_json_field(self, json_str: str) -> Dict[str, Any]:
|
|
||||||
"""Safely parse JSON string with LLM assistance if needed"""
|
|
||||||
if not json_str or json_str.strip() == "":
|
|
||||||
return {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Try direct JSON parsing first
|
|
||||||
return json.loads(json_str)
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
# If direct parsing fails, use LLM to clean and parse
|
|
||||||
logger.info("Direct JSON parsing failed, using LLM to clean JSON")
|
|
||||||
return self._llm_clean_json(json_str)
|
|
||||||
|
|
||||||
def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
|
|
||||||
"""Use LLM to clean and parse malformed JSON"""
|
|
||||||
try:
|
|
||||||
prompt = f"""
|
|
||||||
The following text appears to be malformed JSON. Please clean it up and return valid JSON.
|
|
||||||
If it's not possible to create valid JSON, return an empty object {{}}.
|
|
||||||
|
|
||||||
Original text:
|
|
||||||
{malformed_json[:2000]} # Limit length for API
|
|
||||||
|
|
||||||
Return only the cleaned JSON, no explanations:
|
|
||||||
"""
|
|
||||||
|
|
||||||
response = self.openai_client.chat.completions.create(
|
|
||||||
model="gpt-3.5-turbo",
|
|
||||||
messages=[{"role": "user", "content": prompt}],
|
|
||||||
temperature=0,
|
|
||||||
)
|
|
||||||
|
|
||||||
cleaned_json = response.choices[0].message.content.strip()
|
|
||||||
return json.loads(cleaned_json)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"LLM JSON cleaning failed: {e}")
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
|
|
||||||
"""Extract and structure data from CSV row using LLM"""
|
|
||||||
# Parse the investment firm profile
|
|
||||||
profile_data = {}
|
|
||||||
if csv_row.investment_firm_profile:
|
|
||||||
profile_data = self.parse_json_field(csv_row.investment_firm_profile)
|
|
||||||
|
|
||||||
# Create structured output
|
|
||||||
structured_data = {
|
|
||||||
"name": csv_row.name,
|
|
||||||
"website": csv_row.website or profile_data.get("websiteURL"),
|
|
||||||
"investor_description": profile_data.get("investorDescription", ""),
|
|
||||||
"investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
|
|
||||||
"headquarters": profile_data.get("headquarters", ""),
|
|
||||||
"aum_info": profile_data.get("overallAssetsUnderManagement", {}),
|
|
||||||
"funds_info": profile_data.get("funds", []),
|
|
||||||
"crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
|
|
||||||
"crunchbase_extract": csv_row.crunchbase_firm_extract or "",
|
|
||||||
"linkedin_profile": csv_row.linkedin_investment_profile or "",
|
|
||||||
"source_truth_profile": csv_row.source_of_truth_profile or "",
|
|
||||||
}
|
|
||||||
|
|
||||||
return structured_data
|
|
||||||
|
|
||||||
def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
|
|
||||||
"""Use LLM to enhance and standardize investor data"""
|
|
||||||
try:
|
|
||||||
# Combine all available text for context
|
|
||||||
context_text = " ".join(
|
|
||||||
[
|
|
||||||
investor_data.get("investor_description", ""),
|
|
||||||
investor_data.get("crunchbase_extract", ""),
|
|
||||||
investor_data.get("linkedin_profile", ""),
|
|
||||||
investor_data.get("source_truth_profile", ""),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
if not context_text.strip():
|
|
||||||
return investor_data
|
|
||||||
|
|
||||||
prompt = f"""
|
|
||||||
Based on the following information about an investor, please extract and standardize:
|
|
||||||
1. A concise investor description (2-3 sentences)
|
|
||||||
2. Investment thesis focus areas (list of specific focus areas)
|
|
||||||
3. Headquarters location (city, country format)
|
|
||||||
|
|
||||||
Investor: {investor_data["name"]}
|
|
||||||
Context: {context_text[:3000]} # Limit for API
|
|
||||||
|
|
||||||
Return in JSON format:
|
|
||||||
{{
|
|
||||||
"enhanced_description": "concise description here",
|
|
||||||
"standardized_focus": ["focus area 1", "focus area 2", ...],
|
|
||||||
"standardized_headquarters": "City, Country"
|
|
||||||
}}
|
|
||||||
"""
|
|
||||||
|
|
||||||
response = self.openai_client.chat.completions.create(
|
|
||||||
model="gpt-3.5-turbo",
|
|
||||||
messages=[{"role": "user", "content": prompt}],
|
|
||||||
temperature=0.3,
|
|
||||||
)
|
|
||||||
|
|
||||||
enhanced_data = json.loads(response.choices[0].message.content)
|
|
||||||
|
|
||||||
# Update investor data with enhanced information
|
|
||||||
if enhanced_data.get("enhanced_description"):
|
|
||||||
investor_data["enhanced_description"] = enhanced_data[
|
|
||||||
"enhanced_description"
|
|
||||||
]
|
|
||||||
|
|
||||||
if enhanced_data.get("standardized_focus"):
|
|
||||||
investor_data["standardized_focus"] = enhanced_data[
|
|
||||||
"standardized_focus"
|
|
||||||
]
|
|
||||||
|
|
||||||
if enhanced_data.get("standardized_headquarters"):
|
|
||||||
investor_data["standardized_headquarters"] = enhanced_data[
|
|
||||||
"standardized_headquarters"
|
|
||||||
]
|
|
||||||
|
|
||||||
return investor_data
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
|
|
||||||
return investor_data
|
|
||||||
|
|
||||||
def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
|
|
||||||
"""Save investor data to SQL database"""
|
|
||||||
try:
|
|
||||||
with get_session() as session:
|
|
||||||
# Check if investor already exists
|
|
||||||
existing = (
|
|
||||||
session.query(Investor)
|
|
||||||
.filter_by(name=investor_data["name"])
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing:
|
|
||||||
logger.info(f"Updating existing investor: {investor_data['name']}")
|
|
||||||
investor = existing
|
|
||||||
else:
|
|
||||||
logger.info(f"Creating new investor: {investor_data['name']}")
|
|
||||||
investor = Investor()
|
|
||||||
|
|
||||||
# Map data to investor object
|
|
||||||
investor.name = investor_data["name"]
|
|
||||||
investor.website = investor_data.get("website")
|
|
||||||
investor.investor_description = investor_data.get(
|
|
||||||
"enhanced_description"
|
|
||||||
) or investor_data.get("investor_description")
|
|
||||||
investor.investment_thesis_focus = investor_data.get(
|
|
||||||
"standardized_focus"
|
|
||||||
) or investor_data.get("investment_thesis_focus")
|
|
||||||
investor.headquarters = investor_data.get(
|
|
||||||
"standardized_headquarters"
|
|
||||||
) or investor_data.get("headquarters")
|
|
||||||
|
|
||||||
# AUM information
|
|
||||||
aum_info = investor_data.get("aum_info", {})
|
|
||||||
investor.aum_amount = aum_info.get("aumAmount")
|
|
||||||
investor.aum_as_of_date = aum_info.get("asOfDate")
|
|
||||||
investor.aum_source_url = aum_info.get("sourceUrl")
|
|
||||||
|
|
||||||
# Fund information
|
|
||||||
investor.funds_info = investor_data.get("funds_info", [])
|
|
||||||
|
|
||||||
# Raw data
|
|
||||||
investor.crunchbase_urls = investor_data.get("crunchbase_urls")
|
|
||||||
investor.crunchbase_extract = investor_data.get("crunchbase_extract")
|
|
||||||
investor.linkedin_profile = investor_data.get("linkedin_profile")
|
|
||||||
investor.source_truth_profile = investor_data.get(
|
|
||||||
"source_truth_profile"
|
|
||||||
)
|
|
||||||
|
|
||||||
if not existing:
|
|
||||||
session.add(investor)
|
|
||||||
|
|
||||||
session.flush() # Get the ID
|
|
||||||
return investor.id
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to save to SQL: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
|
|
||||||
"""Save investor description and focus to ChromaDB"""
|
|
||||||
try:
|
|
||||||
# Prepare text for embedding
|
|
||||||
description_text = investor_data.get(
|
|
||||||
"enhanced_description"
|
|
||||||
) or investor_data.get("investor_description", "")
|
|
||||||
focus_areas = investor_data.get("standardized_focus") or investor_data.get(
|
|
||||||
"investment_thesis_focus", []
|
|
||||||
)
|
|
||||||
|
|
||||||
if isinstance(focus_areas, list):
|
|
||||||
focus_text = " ".join(focus_areas)
|
|
||||||
else:
|
|
||||||
focus_text = str(focus_areas)
|
|
||||||
|
|
||||||
# Combine description and focus for embedding
|
|
||||||
combined_text = f"{description_text} {focus_text}".strip()
|
|
||||||
|
|
||||||
if not combined_text:
|
|
||||||
logger.warning(f"No text to embed for investor {investor_data['name']}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Create metadata
|
|
||||||
metadata = {
|
|
||||||
"investor_id": investor_id,
|
|
||||||
"name": investor_data["name"],
|
|
||||||
"website": investor_data.get("website", ""),
|
|
||||||
"headquarters": investor_data.get("standardized_headquarters")
|
|
||||||
or investor_data.get("headquarters", ""),
|
|
||||||
"focus_areas_count": len(focus_areas)
|
|
||||||
if isinstance(focus_areas, list)
|
|
||||||
else 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add to ChromaDB
|
|
||||||
self.collection.add(
|
|
||||||
documents=[combined_text],
|
|
||||||
metadatas=[metadata],
|
|
||||||
ids=[f"investor_{investor_id}"],
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"Added investor {investor_data['name']} to vector database")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to save to vector DB: {e}")
|
|
||||||
|
|
||||||
def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
|
|
||||||
"""Process the entire CSV file"""
|
|
||||||
logger.info(f"Starting to process CSV file: {csv_file_path}")
|
|
||||||
|
|
||||||
# Read CSV
|
|
||||||
df = pd.read_csv(csv_file_path)
|
|
||||||
logger.info(f"Loaded {len(df)} rows from CSV")
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
df = df.head(limit)
|
|
||||||
logger.info(f"Processing limited to {limit} rows")
|
|
||||||
|
|
||||||
processed_count = 0
|
|
||||||
error_count = 0
|
|
||||||
|
|
||||||
for index, row in df.iterrows():
|
|
||||||
try:
|
|
||||||
logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
|
|
||||||
|
|
||||||
# Create CSVRow object
|
|
||||||
csv_row = CSVRow(
|
|
||||||
name=row["Name"],
|
|
||||||
website=row.get("Website"),
|
|
||||||
investment_firm_profile=row.get("Investment Firm Profile"),
|
|
||||||
crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
|
|
||||||
crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
|
|
||||||
linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
|
|
||||||
source_of_truth_profile=row.get("Source of Truth Profile"),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract structured data
|
|
||||||
structured_data = self.extract_structured_data(csv_row)
|
|
||||||
|
|
||||||
# Enhance with LLM
|
|
||||||
enhanced_data = self.enhance_with_llm(structured_data)
|
|
||||||
|
|
||||||
# Save to SQL database
|
|
||||||
investor_id = self.save_to_sql(enhanced_data)
|
|
||||||
|
|
||||||
# Save to vector database
|
|
||||||
self.save_to_vector_db(investor_id, enhanced_data)
|
|
||||||
|
|
||||||
processed_count += 1
|
|
||||||
|
|
||||||
# Progress update every 10 rows
|
|
||||||
if (index + 1) % 10 == 0:
|
|
||||||
logger.info(
|
|
||||||
f"Processed {processed_count} rows successfully, {error_count} errors"
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
error_count += 1
|
|
||||||
logger.error(
|
|
||||||
f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
|
|
||||||
)
|
|
||||||
return processed_count, error_count
|
|
||||||
|
|
||||||
def search_investors(self, query: str, limit: int = 5):
|
|
||||||
"""Search investors using vector similarity"""
|
|
||||||
try:
|
|
||||||
results = self.collection.query(query_texts=[query], n_results=limit)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Search failed: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main function to run the parser"""
|
|
||||||
parser = LLMInvestorParser()
|
|
||||||
|
|
||||||
# Process the CSV file
|
|
||||||
csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
|
|
||||||
|
|
||||||
# Start with a small sample for testing
|
|
||||||
processed, errors = parser.process_csv_file(csv_file, limit=5)
|
|
||||||
|
|
||||||
print("\nProcessing complete!")
|
|
||||||
print(f"Successfully processed: {processed} investors")
|
|
||||||
print(f"Errors encountered: {errors}")
|
|
||||||
|
|
||||||
# Test search functionality
|
|
||||||
print("\nTesting search functionality...")
|
|
||||||
results = parser.search_investors("bioeconomy circular economy")
|
|
||||||
if results:
|
|
||||||
print(f"Found {len(results['documents'][0])} similar investors")
|
|
||||||
for i, doc in enumerate(results["documents"][0]):
|
|
||||||
print(f" {i + 1}. {results['metadatas'][0][i]['name']}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
import asyncio
|
||||||
|
import csv
|
||||||
|
|
||||||
|
from openai import AsyncOpenAI
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
|
class RowSchema(BaseModel):
|
||||||
|
section: str
|
||||||
|
explanation: str
|
||||||
|
|
||||||
|
client = AsyncOpenAI()
|
||||||
|
|
||||||
|
async def process_row(row):
|
||||||
|
resp = await client.chat.completions.create(
|
||||||
|
model="gpt-4o-mini",
|
||||||
|
messages=[{"role": "user", "content": f"Extract relevant section:\n{row}"}],
|
||||||
|
response_format={"type": "json_object"} # ensures JSON output
|
||||||
|
)
|
||||||
|
return RowSchema.model_validate_json(resp.choices[0].message.content)
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
with open("data.csv") as f:
|
||||||
|
reader = csv.DictReader(f)
|
||||||
|
tasks = [process_row(row) for row in reader]
|
||||||
|
return await asyncio.gather(*tasks)
|
||||||
|
|
||||||
|
results = asyncio.run(main())
|
||||||
@@ -1,109 +0,0 @@
|
|||||||
from sqlalchemy import Column, Integer, String, Text, DateTime, JSON, Float
|
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
from sqlalchemy.sql import func
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from typing import List, Optional, Dict, Any
|
|
||||||
import json
|
|
||||||
|
|
||||||
Base = declarative_base()
|
|
||||||
|
|
||||||
class Investor(Base):
|
|
||||||
__tablename__ = 'investors'
|
|
||||||
|
|
||||||
id = Column(Integer, primary_key=True, autoincrement=True)
|
|
||||||
name = Column(String(500), nullable=False)
|
|
||||||
website = Column(String(1000))
|
|
||||||
|
|
||||||
# Core investment information
|
|
||||||
investor_description = Column(Text)
|
|
||||||
investment_thesis_focus = Column(JSON) # List of focus areas
|
|
||||||
headquarters = Column(String(1000))
|
|
||||||
|
|
||||||
# AUM information
|
|
||||||
aum_amount = Column(String(200))
|
|
||||||
aum_as_of_date = Column(String(100))
|
|
||||||
aum_source_url = Column(String(1000))
|
|
||||||
|
|
||||||
# Fund information
|
|
||||||
funds_info = Column(JSON) # Complex fund data
|
|
||||||
|
|
||||||
# Raw data columns for reference
|
|
||||||
crunchbase_urls = Column(Text)
|
|
||||||
crunchbase_extract = Column(Text)
|
|
||||||
linkedin_profile = Column(Text)
|
|
||||||
source_truth_profile = Column(Text)
|
|
||||||
|
|
||||||
# Metadata
|
|
||||||
created_at = Column(DateTime(timezone=True), server_default=func.now())
|
|
||||||
updated_at = Column(DateTime(timezone=True), onupdate=func.now())
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return f"<Investor(name='{self.name}', website='{self.website}')>"
|
|
||||||
|
|
||||||
# Pydantic models for data validation and parsing
|
|
||||||
class AUMInfo(BaseModel):
|
|
||||||
aumAmount: Optional[str] = None
|
|
||||||
asOfDate: Optional[str] = None
|
|
||||||
sourceUrl: Optional[str] = None
|
|
||||||
|
|
||||||
class FundInfo(BaseModel):
|
|
||||||
fundName: Optional[str] = None
|
|
||||||
fundSize: Optional[str] = None
|
|
||||||
vintage: Optional[str] = None
|
|
||||||
status: Optional[str] = None
|
|
||||||
description: Optional[str] = None
|
|
||||||
|
|
||||||
class InvestorProfile(BaseModel):
|
|
||||||
websiteURL: Optional[str] = None
|
|
||||||
investorDescription: Optional[str] = None
|
|
||||||
investmentThesisFocus: Optional[List[str]] = None
|
|
||||||
headquarters: Optional[str] = None
|
|
||||||
overallAssetsUnderManagement: Optional[AUMInfo] = None
|
|
||||||
funds: Optional[List[FundInfo]] = None
|
|
||||||
|
|
||||||
class CSVRow(BaseModel):
|
|
||||||
name: str
|
|
||||||
website: Optional[str] = None
|
|
||||||
investment_firm_profile: Optional[str] = None
|
|
||||||
crunchbase_linkedin_urls: Optional[str] = None
|
|
||||||
crunchbase_firm_extract: Optional[str] = None
|
|
||||||
linkedin_investment_profile: Optional[str] = None
|
|
||||||
source_of_truth_profile: Optional[str] = None
|
|
||||||
|
|
||||||
def get_combined_description(self) -> str:
|
|
||||||
"""Combine all description fields for vector embedding"""
|
|
||||||
descriptions = []
|
|
||||||
|
|
||||||
if self.investment_firm_profile:
|
|
||||||
try:
|
|
||||||
profile_data = json.loads(self.investment_firm_profile)
|
|
||||||
if isinstance(profile_data, dict):
|
|
||||||
desc = profile_data.get('investorDescription', '')
|
|
||||||
if desc:
|
|
||||||
descriptions.append(desc)
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
if self.crunchbase_firm_extract:
|
|
||||||
descriptions.append(self.crunchbase_firm_extract)
|
|
||||||
|
|
||||||
if self.linkedin_investment_profile:
|
|
||||||
descriptions.append(self.linkedin_investment_profile)
|
|
||||||
|
|
||||||
if self.source_of_truth_profile:
|
|
||||||
descriptions.append(self.source_of_truth_profile)
|
|
||||||
|
|
||||||
return " ".join(descriptions)
|
|
||||||
|
|
||||||
def get_investment_focus(self) -> List[str]:
|
|
||||||
"""Extract investment thesis focus"""
|
|
||||||
if self.investment_firm_profile:
|
|
||||||
try:
|
|
||||||
profile_data = json.loads(self.investment_firm_profile)
|
|
||||||
if isinstance(profile_data, dict):
|
|
||||||
focus = profile_data.get('investmentThesisFocus', [])
|
|
||||||
if isinstance(focus, list):
|
|
||||||
return focus
|
|
||||||
except (json.JSONDecodeError, TypeError):
|
|
||||||
pass
|
|
||||||
return []
|
|
||||||
@@ -1,260 +0,0 @@
|
|||||||
import json
|
|
||||||
import logging
|
|
||||||
from typing import Any, Dict, Optional
|
|
||||||
|
|
||||||
import chromadb
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from db import get_session, init_database
|
|
||||||
from schema import CSVRow, Investor
|
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleInvestorParser:
|
|
||||||
"""Simplified parser that works without OpenAI API for testing"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
# Initialize ChromaDB
|
|
||||||
self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
|
|
||||||
self.collection = self.chroma_client.get_or_create_collection(
|
|
||||||
name="investor_descriptions",
|
|
||||||
metadata={
|
|
||||||
"description": "Investor descriptions and investment thesis focus"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialize database
|
|
||||||
init_database()
|
|
||||||
|
|
||||||
def parse_json_field(self, json_str: str) -> Dict[str, Any]:
|
|
||||||
"""Safely parse JSON string"""
|
|
||||||
if not json_str or json_str.strip() == "":
|
|
||||||
return {}
|
|
||||||
|
|
||||||
try:
|
|
||||||
return json.loads(json_str)
|
|
||||||
except json.JSONDecodeError as e:
|
|
||||||
logger.warning(f"JSON parsing failed: {e}")
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
|
|
||||||
"""Extract and structure data from CSV row"""
|
|
||||||
# Parse the investment firm profile
|
|
||||||
profile_data = {}
|
|
||||||
if csv_row.investment_firm_profile:
|
|
||||||
profile_data = self.parse_json_field(csv_row.investment_firm_profile)
|
|
||||||
|
|
||||||
# Create structured output
|
|
||||||
structured_data = {
|
|
||||||
"name": csv_row.name,
|
|
||||||
"website": csv_row.website or profile_data.get("websiteURL"),
|
|
||||||
"investor_description": profile_data.get("investorDescription", ""),
|
|
||||||
"investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
|
|
||||||
"headquarters": profile_data.get("headquarters", ""),
|
|
||||||
"aum_info": profile_data.get("overallAssetsUnderManagement", {}),
|
|
||||||
"funds_info": profile_data.get("funds", []),
|
|
||||||
"crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
|
|
||||||
"crunchbase_extract": csv_row.crunchbase_firm_extract or "",
|
|
||||||
"linkedin_profile": csv_row.linkedin_investment_profile or "",
|
|
||||||
"source_truth_profile": csv_row.source_of_truth_profile or "",
|
|
||||||
}
|
|
||||||
|
|
||||||
return structured_data
|
|
||||||
|
|
||||||
def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
|
|
||||||
"""Save investor data to SQL database"""
|
|
||||||
try:
|
|
||||||
with get_session() as session:
|
|
||||||
# Check if investor already exists
|
|
||||||
existing = (
|
|
||||||
session.query(Investor)
|
|
||||||
.filter_by(name=investor_data["name"])
|
|
||||||
.first()
|
|
||||||
)
|
|
||||||
|
|
||||||
if existing:
|
|
||||||
logger.info(f"Updating existing investor: {investor_data['name']}")
|
|
||||||
investor = existing
|
|
||||||
else:
|
|
||||||
logger.info(f"Creating new investor: {investor_data['name']}")
|
|
||||||
investor = Investor()
|
|
||||||
|
|
||||||
# Map data to investor object
|
|
||||||
investor.name = investor_data["name"]
|
|
||||||
investor.website = investor_data.get("website")
|
|
||||||
investor.investor_description = investor_data.get(
|
|
||||||
"investor_description"
|
|
||||||
)
|
|
||||||
investor.investment_thesis_focus = investor_data.get(
|
|
||||||
"investment_thesis_focus"
|
|
||||||
)
|
|
||||||
investor.headquarters = investor_data.get("headquarters")
|
|
||||||
|
|
||||||
# AUM information
|
|
||||||
aum_info = investor_data.get("aum_info") or {}
|
|
||||||
investor.aum_amount = aum_info.get("aumAmount")
|
|
||||||
investor.aum_as_of_date = aum_info.get("asOfDate")
|
|
||||||
investor.aum_source_url = aum_info.get("sourceUrl")
|
|
||||||
|
|
||||||
# Fund information
|
|
||||||
investor.funds_info = investor_data.get("funds_info", [])
|
|
||||||
|
|
||||||
# Raw data
|
|
||||||
investor.crunchbase_urls = investor_data.get("crunchbase_urls")
|
|
||||||
investor.crunchbase_extract = investor_data.get("crunchbase_extract")
|
|
||||||
investor.linkedin_profile = investor_data.get("linkedin_profile")
|
|
||||||
investor.source_truth_profile = investor_data.get(
|
|
||||||
"source_truth_profile"
|
|
||||||
)
|
|
||||||
|
|
||||||
if not existing:
|
|
||||||
session.add(investor)
|
|
||||||
|
|
||||||
session.flush() # Get the ID
|
|
||||||
return investor.id
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to save to SQL: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
|
|
||||||
"""Save investor description and focus to ChromaDB"""
|
|
||||||
try:
|
|
||||||
# Prepare text for embedding
|
|
||||||
description_text = investor_data.get("investor_description", "")
|
|
||||||
focus_areas = investor_data.get("investment_thesis_focus", [])
|
|
||||||
|
|
||||||
if isinstance(focus_areas, list):
|
|
||||||
focus_text = " ".join(focus_areas)
|
|
||||||
else:
|
|
||||||
focus_text = str(focus_areas)
|
|
||||||
|
|
||||||
# Combine description and focus for embedding
|
|
||||||
combined_text = f"{description_text} {focus_text}".strip()
|
|
||||||
|
|
||||||
if not combined_text:
|
|
||||||
logger.warning(f"No text to embed for investor {investor_data['name']}")
|
|
||||||
return
|
|
||||||
|
|
||||||
# Create metadata
|
|
||||||
metadata = {
|
|
||||||
"investor_id": investor_id,
|
|
||||||
"name": investor_data["name"],
|
|
||||||
"website": investor_data.get("website") or "",
|
|
||||||
"headquarters": investor_data.get("headquarters") or "",
|
|
||||||
"focus_areas_count": len(focus_areas)
|
|
||||||
if isinstance(focus_areas, list)
|
|
||||||
else 0,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add to ChromaDB
|
|
||||||
self.collection.add(
|
|
||||||
documents=[combined_text],
|
|
||||||
metadatas=[metadata],
|
|
||||||
ids=[f"investor_{investor_id}"],
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"Added investor {investor_data['name']} to vector database")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Failed to save to vector DB: {e}")
|
|
||||||
|
|
||||||
def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
|
|
||||||
"""Process the entire CSV file"""
|
|
||||||
logger.info(f"Starting to process CSV file: {csv_file_path}")
|
|
||||||
|
|
||||||
# Read CSV
|
|
||||||
df = pd.read_csv(csv_file_path)
|
|
||||||
logger.info(f"Loaded {len(df)} rows from CSV")
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
df = df.head(limit)
|
|
||||||
logger.info(f"Processing limited to {limit} rows")
|
|
||||||
|
|
||||||
processed_count = 0
|
|
||||||
error_count = 0
|
|
||||||
|
|
||||||
for index, row in df.iterrows():
|
|
||||||
try:
|
|
||||||
logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
|
|
||||||
|
|
||||||
# Create CSVRow object
|
|
||||||
csv_row = CSVRow(
|
|
||||||
name=row["Name"],
|
|
||||||
website=row.get("Website"),
|
|
||||||
investment_firm_profile=row.get("Investment Firm Profile"),
|
|
||||||
crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
|
|
||||||
crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
|
|
||||||
linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
|
|
||||||
source_of_truth_profile=row.get("Source of Truth Profile"),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Extract structured data
|
|
||||||
structured_data = self.extract_structured_data(csv_row)
|
|
||||||
|
|
||||||
# Save to SQL database
|
|
||||||
investor_id = self.save_to_sql(structured_data)
|
|
||||||
|
|
||||||
# Save to vector database
|
|
||||||
self.save_to_vector_db(investor_id, structured_data)
|
|
||||||
|
|
||||||
processed_count += 1
|
|
||||||
|
|
||||||
# Progress update every 10 rows
|
|
||||||
if (index + 1) % 10 == 0:
|
|
||||||
logger.info(
|
|
||||||
f"Processed {processed_count} rows successfully, {error_count} errors"
|
|
||||||
)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
error_count += 1
|
|
||||||
logger.error(
|
|
||||||
f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
|
|
||||||
)
|
|
||||||
return processed_count, error_count
|
|
||||||
|
|
||||||
def search_investors(self, query: str, limit: int = 5):
|
|
||||||
"""Search investors using vector similarity"""
|
|
||||||
try:
|
|
||||||
results = self.collection.query(query_texts=[query], n_results=limit)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Search failed: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main function to run the parser"""
|
|
||||||
parser = SimpleInvestorParser()
|
|
||||||
|
|
||||||
# Process the CSV file
|
|
||||||
csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
|
|
||||||
|
|
||||||
# Start with a small sample for testing
|
|
||||||
processed, errors = parser.process_csv_file(csv_file, limit=5)
|
|
||||||
|
|
||||||
print("Processing complete!")
|
|
||||||
print(f"Successfully processed: {processed} investors")
|
|
||||||
print(f"Errors encountered: {errors}")
|
|
||||||
|
|
||||||
# Test search functionality
|
|
||||||
print("\nTesting search functionality...")
|
|
||||||
results = parser.search_investors("bioeconomy circular economy")
|
|
||||||
if results:
|
|
||||||
print(f"Found {len(results['documents'][0])} similar investors")
|
|
||||||
for i, doc in enumerate(results["documents"][0]):
|
|
||||||
print(f" {i + 1}. {results['metadatas'][0][i]['name']}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
|
||||||
|
api_key: str
|
||||||
|
db_url: str
|
||||||
|
|
||||||
|
class Config:
|
||||||
|
env_file = ".env"
|
||||||
|
|
||||||
|
settings = Settings()
|
||||||
Reference in New Issue
Block a user