#!/usr/bin/env python3
"""
LLM-powered Investor Parser

A comprehensive parser that processes investor CSV data and saves it to both
SQL and vector databases. Supports both simple parsing and LLM-enhanced
parsing for better data quality.

Usage:
    python investor_parser.py --help
    python investor_parser.py --file="path/to/csv" --limit=10
    python investor_parser.py --file="path/to/csv" --use-llm --limit=50
    python investor_parser.py --search="bioeconomy circular"
"""

import argparse
import json
import logging
import os
from typing import Any, Dict, Optional

import chromadb
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from db import get_session, init_database
from schema import CSVRow, Investor

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


class InvestorParser:
    """Complete investor parser with optional LLM enhancement.

    Reads investor rows from a CSV file, extracts structured fields from the
    embedded JSON profile, optionally asks an OpenAI chat model to clean or
    standardize the data, and stores the result in the SQL database (via
    ``get_session``) and in a ChromaDB collection.
    """

    def __init__(
        self,
        use_llm: bool = False,
        chroma_path: str = "./chroma_db",
        llm_model: str = "gpt-3.5-turbo",
    ):
        """Set up the OpenAI client (optional), ChromaDB and the SQL database.

        Args:
            use_llm: Request LLM-assisted parsing. Silently downgraded to
                ``False`` (with a warning) when ``OPENAI_API_KEY`` is not set.
            chroma_path: Directory used by the persistent ChromaDB client.
            llm_model: Chat model name passed to the OpenAI API.
        """
        self.use_llm = use_llm
        self.llm_model = llm_model
        # Always define the attribute so it exists even when the LLM is off.
        self.openai_client: Optional[OpenAI] = None

        # Initialize OpenAI client if using LLM
        if self.use_llm:
            api_key = os.getenv("OPENAI_API_KEY")
            if not api_key:
                logger.warning(
                    "OpenAI API key not found. LLM features will be disabled."
                )
                self.use_llm = False
            else:
                self.openai_client = OpenAI(api_key=api_key)
                logger.info("LLM enhancement enabled")

        # Initialize ChromaDB
        self.chroma_client = chromadb.PersistentClient(path=chroma_path)
        self.collection = self.chroma_client.get_or_create_collection(
            name="investor_descriptions",
            metadata={
                "description": "Investor descriptions and investment thesis focus"
            },
        )

        # Initialize database
        init_database()

    @staticmethod
    def _clean_cell(value: Any) -> Any:
        """Return ``None`` for a missing pandas cell, otherwise the value.

        Empty CSV cells are read by pandas as NaN, which is a truthy float;
        normalising them to ``None`` keeps later ``value or fallback``
        expressions and string handling working.
        """
        return None if pd.isna(value) else value

    @staticmethod
    def _strip_code_fences(text: str) -> str:
        """Remove a surrounding Markdown code fence from *text*, if present."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped

        inner = stripped[3:]
        if inner.endswith("```"):
            inner = inner[:-3]

        # The opening fence may carry a language tag ("```json"); drop that
        # first line only when it holds nothing but the tag.
        first_line, newline, rest = inner.partition("\n")
        if newline and (not first_line.strip() or first_line.strip().isalpha()):
            inner = rest
        return inner.strip()

    @staticmethod
    def _ensure_dict(value: Any, origin: str) -> Dict[str, Any]:
        """Return *value* if it is a dict, otherwise log and return ``{}``."""
        if isinstance(value, dict):
            return value
        logger.warning(
            f"{origin} produced {type(value).__name__} instead of a JSON object; ignoring it"
        )
        return {}

    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
        """Safely parse JSON string with optional LLM assistance.

        Args:
            json_str: Raw JSON text. Anything that is not a non-blank string
                (``None``, NaN, ``""``) yields an empty dict.

        Returns:
            The parsed JSON object, or ``{}`` when the text is empty, cannot
            be parsed (and no LLM is available or the LLM fails), or does not
            decode to a JSON object.
        """
        if not isinstance(json_str, str) or not json_str.strip():
            return {}

        try:
            parsed = json.loads(json_str)
        except json.JSONDecodeError as e:
            logger.warning(f"JSON parsing failed: {e}")

            # Use LLM to clean JSON if available
            if self.use_llm:
                return self._llm_clean_json(json_str)
            return {}

        return self._ensure_dict(parsed, "JSON field")

    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
        """Use LLM to clean and parse malformed JSON.

        Returns ``{}`` when the API call fails or the reply is not a JSON
        object.
        """
        try:
            # Limit length for API
            snippet = malformed_json[:2000]
            prompt = f"""
            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
            If it's not possible to create valid JSON, return an empty object {{}}.

            Original text:
            {snippet}

            Return only the cleaned JSON, no explanations:
            """

            response = self.openai_client.chat.completions.create(
                model=self.llm_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )

            # The message content can be None; treat that as an empty reply.
            reply = response.choices[0].message.content or ""
            cleaned_json = self._strip_code_fences(reply)
            return self._ensure_dict(json.loads(cleaned_json), "LLM JSON cleaning")

        except Exception as e:
            logger.error(f"LLM JSON cleaning failed: {e}")
            return {}

    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
        """Extract and structure data from CSV row.

        Args:
            csv_row: Row object carrying the raw CSV columns.

        Returns:
            A dict with the investor name, website, description, focus areas,
            headquarters, AUM and fund info taken from the JSON profile, plus
            the raw Crunchbase / LinkedIn / source-of-truth text columns.
        """
        # Parse the investment firm profile
        profile_data: Dict[str, Any] = {}
        if csv_row.investment_firm_profile:
            profile_data = self.parse_json_field(csv_row.investment_firm_profile)

        # Create structured output
        return {
            "name": csv_row.name,
            "website": csv_row.website or profile_data.get("websiteURL"),
            "investor_description": profile_data.get("investorDescription", ""),
            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
            "headquarters": profile_data.get("headquarters", ""),
            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
            "funds_info": profile_data.get("funds", []),
            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
            "linkedin_profile": csv_row.linkedin_investment_profile or "",
            "source_truth_profile": csv_row.source_of_truth_profile or "",
        }

    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
        """Use LLM to enhance and standardize investor data.

        Adds ``enhanced_description``, ``standardized_focus`` and
        ``standardized_headquarters`` to *investor_data* (in place) when the
        model returns non-empty values for them.

        Returns:
            The same ``investor_data`` dict. It is returned unchanged when the
            LLM is disabled, there is no context text, or the call fails.
        """
        if not self.use_llm:
            return investor_data

        try:
            # Combine all available text for context. Values may be None or
            # non-strings when the source JSON carried nulls or nested data.
            context_fields = (
                "investor_description",
                "crunchbase_extract",
                "linkedin_profile",
                "source_truth_profile",
            )
            context_text = " ".join(
                str(investor_data[field])
                for field in context_fields
                if investor_data.get(field)
            )

            if not context_text.strip():
                return investor_data

            investor_name = investor_data["name"]
            # Limit for API
            context_snippet = context_text[:3000]
            prompt = f"""
            Based on the following information about an investor, please extract and standardize:

            1. A concise investor description (2-3 sentences)
            2. Investment thesis focus areas (list of specific focus areas)
            3. Headquarters location (city, country format)

            Investor: {investor_name}
            Context: {context_snippet}

            Return in JSON format:
            {{
                "enhanced_description": "concise description here",
                "standardized_focus": ["focus area 1", "focus area 2", ...],
                "standardized_headquarters": "City, Country"
            }}
            """

            response = self.openai_client.chat.completions.create(
                model=self.llm_model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,
            )

            reply = response.choices[0].message.content or ""
            enhanced_data = json.loads(self._strip_code_fences(reply))
            if not isinstance(enhanced_data, dict):
                raise ValueError("LLM reply is not a JSON object")

            # Update investor data with enhanced information
            for key in (
                "enhanced_description",
                "standardized_focus",
                "standardized_headquarters",
            ):
                if enhanced_data.get(key):
                    investor_data[key] = enhanced_data[key]

            return investor_data

        except Exception as e:
            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
            return investor_data

    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
        """Save investor data to SQL database.

        Looks up an ``Investor`` by name and updates it, or creates a new one.
        LLM-enhanced values take precedence over the raw profile values.

        Returns:
            The ``id`` of the saved investor row.

        Raises:
            Exception: Any error from the session is logged and re-raised.
        """
        try:
            with get_session() as session:
                # Check if investor already exists
                existing = (
                    session.query(Investor)
                    .filter_by(name=investor_data["name"])
                    .first()
                )

                if existing:
                    logger.info(f"Updating existing investor: {investor_data['name']}")
                    investor = existing
                else:
                    logger.info(f"Creating new investor: {investor_data['name']}")
                    investor = Investor()

                # Map data to investor object
                investor.name = investor_data["name"]
                investor.website = investor_data.get("website")
                investor.investor_description = investor_data.get(
                    "enhanced_description"
                ) or investor_data.get("investor_description")
                investor.investment_thesis_focus = investor_data.get(
                    "standardized_focus"
                ) or investor_data.get("investment_thesis_focus")
                investor.headquarters = investor_data.get(
                    "standardized_headquarters"
                ) or investor_data.get("headquarters")

                # AUM information
                aum_info = investor_data.get("aum_info") or {}
                investor.aum_amount = aum_info.get("aumAmount")
                investor.aum_as_of_date = aum_info.get("asOfDate")
                investor.aum_source_url = aum_info.get("sourceUrl")

                # Fund information
                investor.funds_info = investor_data.get("funds_info", [])

                # Raw data
                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
                investor.linkedin_profile = investor_data.get("linkedin_profile")
                investor.source_truth_profile = investor_data.get(
                    "source_truth_profile"
                )

                if not existing:
                    session.add(investor)

                session.flush()  # Get the ID
                return investor.id

        except Exception as e:
            logger.error(f"Failed to save to SQL: {e}")
            raise

    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
        """Save investor description and focus to ChromaDB.

        The document id is ``investor_<investor_id>``. The entry is upserted
        so that re-processing an existing investor refreshes its document
        instead of colliding with the previously stored id. Failures are
        logged and not re-raised.
        """
        try:
            # Prepare text for embedding
            description_text = investor_data.get(
                "enhanced_description"
            ) or investor_data.get("investor_description", "")

            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
                "investment_thesis_focus", []
            )
            if isinstance(focus_areas, list):
                # Items are not guaranteed to be strings (they come from JSON).
                focus_text = " ".join(str(area) for area in focus_areas)
            else:
                focus_text = str(focus_areas)

            # Combine description and focus for embedding
            combined_text = f"{description_text} {focus_text}".strip()

            if not combined_text:
                logger.warning(f"No text to embed for investor {investor_data['name']}")
                return

            # Create metadata
            headquarters = (
                investor_data.get("standardized_headquarters")
                or investor_data.get("headquarters")
                or ""
            )
            metadata = {
                "investor_id": investor_id,
                "name": investor_data["name"],
                "website": str(investor_data.get("website") or ""),
                "headquarters": str(headquarters),
                "focus_areas_count": len(focus_areas)
                if isinstance(focus_areas, list)
                else 0,
            }

            # Add to ChromaDB (insert or replace by id)
            self.collection.upsert(
                documents=[combined_text],
                metadatas=[metadata],
                ids=[f"investor_{investor_id}"],
            )

            logger.info(f"Added investor {investor_data['name']} to vector database")

        except Exception as e:
            logger.error(f"Failed to save to vector DB: {e}")

    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
        """Process the entire CSV file.

        Args:
            csv_file_path: Path of the CSV file to read with pandas.
            limit: Process only the first ``limit`` rows; ``None`` means all
                rows.

        Returns:
            A ``(processed_count, error_count)`` tuple. A row that raises is
            logged, counted as an error and skipped.
        """
        logger.info(f"Starting to process CSV file: {csv_file_path}")

        # Read CSV
        df = pd.read_csv(csv_file_path)
        logger.info(f"Loaded {len(df)} rows from CSV")

        if limit is not None:
            df = df.head(limit)
            logger.info(f"Processing limited to {limit} rows")

        total_rows = len(df)
        processed_count = 0
        error_count = 0

        # Count positions explicitly: the DataFrame index is a label and is
        # not guaranteed to be a zero-based integer.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            try:
                logger.info(f"Processing row {position}/{total_rows}: {row['Name']}")

                name = self._clean_cell(row["Name"])
                if name is None:
                    raise ValueError("row has no Name")

                # Create CSVRow object (missing cells become None, not NaN)
                csv_row = CSVRow(
                    name=name,
                    website=self._clean_cell(row.get("Website")),
                    investment_firm_profile=self._clean_cell(
                        row.get("Investment Firm Profile")
                    ),
                    crunchbase_linkedin_urls=self._clean_cell(
                        row.get("Crunchbase & LinkedIn URLs")
                    ),
                    crunchbase_firm_extract=self._clean_cell(
                        row.get("Crunchbase Firm Extract")
                    ),
                    linkedin_investment_profile=self._clean_cell(
                        row.get("LinkedIn Investment Profile")
                    ),
                    source_of_truth_profile=self._clean_cell(
                        row.get("Source of Truth Profile")
                    ),
                )

                # Extract structured data
                structured_data = self.extract_structured_data(csv_row)

                # Enhance with LLM if enabled
                enhanced_data = self.enhance_with_llm(structured_data)

                # Save to SQL database
                investor_id = self.save_to_sql(enhanced_data)

                # Save to vector database
                self.save_to_vector_db(investor_id, enhanced_data)

                processed_count += 1

                # Progress update every 10 rows
                if position % 10 == 0:
                    logger.info(
                        f"Progress: {processed_count} processed, {error_count} errors"
                    )

            except Exception as e:
                error_count += 1
                logger.error(
                    f"Error processing row {position} ({row.get('Name', 'Unknown')}): {e}"
                )
                continue

        logger.info(
            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
        )
        return processed_count, error_count

    def search_investors(self, query: str, limit: int = 10):
        """Search investors using vector similarity.

        Returns:
            The raw result of the ChromaDB collection query, or ``None`` when
            the query raises (the error is logged).
        """
        try:
            return self.collection.query(query_texts=[query], n_results=limit)
        except Exception as e:
            logger.error(f"Search failed: {e}")
            return None


def main():
    """Main function with command line interface.

    ``--search`` runs a vector search and prints the matches; ``--file``
    processes a CSV file and then prints a few example searches; with neither
    option the help text is printed.
    """
    parser = argparse.ArgumentParser(description="LLM-powered Investor Parser")
    parser.add_argument("--file", type=str, help="Path to CSV file to process")
    parser.add_argument("--limit", type=int, help="Limit number of rows to process")
    parser.add_argument(
        "--use-llm",
        action="store_true",
        help="Enable LLM enhancement (requires OpenAI API key)",
    )
    parser.add_argument("--search", type=str, help="Search query for vector database")
    parser.add_argument(
        "--search-limit",
        type=int,
        default=10,
        help="Number of search results to return",
    )

    args = parser.parse_args()

    # Initialize parser
    investor_parser = InvestorParser(use_llm=args.use_llm)

    if args.search:
        # Perform search
        logger.info(f"Searching for: {args.search}")
        results = investor_parser.search_investors(args.search, args.search_limit)

        if results and results["documents"][0]:
            print(f"\nFound {len(results['documents'][0])} similar investors:")
            for i, (doc, metadata) in enumerate(
                zip(results["documents"][0], results["metadatas"][0])
            ):
                print(f"{i + 1}. {metadata['name']}")
                print(f"   Website: {metadata.get('website', 'N/A')}")
                print(f"   HQ: {metadata.get('headquarters', 'N/A')}")
                print(f"   Focus areas: {metadata.get('focus_areas_count', 0)}")
                # NOTE: the value printed here is taken from the query
                # result's "distances" field.
                print(f"   Similarity score: {results['distances'][0][i]:.3f}")
                print()
        else:
            print("No results found.")

    elif args.file:
        # Process CSV file
        if not os.path.exists(args.file):
            logger.error(f"File not found: {args.file}")
            return

        processed, errors = investor_parser.process_csv_file(args.file, args.limit)

        print("\nProcessing complete!")
        print(f"Successfully processed: {processed} investors")
        print(f"Errors encountered: {errors}")

        # Show some search examples
        print("\nTrying some example searches...")
        for query in ["bioeconomy", "venture capital", "sustainability"]:
            results = investor_parser.search_investors(query, 3)
            if results and results["documents"][0]:
                print(f"\nTop matches for '{query}':")
                for i, metadata in enumerate(results["metadatas"][0][:3]):
                    print(f"  {i + 1}. {metadata['name']}")

    else:
        parser.print_help()


if __name__ == "__main__":
    main()