Implement LLM-powered Investor Parser with CSV processing, SQL and vector database integration

- Added FastAPI application with a simple root endpoint. - Developed LLMInvestorParser class for processing investor data from CSV files. - Integrated OpenAI API for LLM enhancements and JSON cleaning. - Implemented structured data extraction and saving to SQL database. - Added functionality to save investor descriptions to ChromaDB for vector similarity search. - Created command-line interface for processing files and searching investors. - Added schema definitions for Investor and related data models using SQLAlchemy and Pydantic. - Implemented logging for better traceability and error handling. - Included requirements.txt for dependency management.
2025-08-28 22:51:58 +01:00
commit bbf6af58f0
13 changed files with 5227 additions and 0 deletions
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""
+Quick demonstration of the LLM Investor Parser functionality.
+This script shows how to use the system programmatically.
+"""
+
+from sqlalchemy import func, select
+
+from db import get_session
+from investor_parser import InvestorParser
+from schema import Investor
+
+
+def main():
+    print("🚀 LLM Investor Parser Demo")
+    print("=" * 50)
+
+    # Initialize parser (without LLM for demo)
+    parser = InvestorParser(use_llm=False)
+
+    # Show current database stats
+    with get_session() as session:
+        count = session.scalar(select(func.count(Investor.id)))
+        print(f"📊 Current database: {count} investors")
+
+    # Demonstrate search functionality
+    print("\n🔍 Search Examples:")
+
+    search_queries = [
+        "circular bioeconomy sustainable",
+        "venture capital early stage",
+        "fintech financial technology",
+        "healthcare biotechnology",
+        "climate sustainability",
+    ]
+
+    for query in search_queries:
+        print(f"\n🔎 Searching for: '{query}'")
+        results = parser.search_investors(query, limit=3)
+
+        if results and results["documents"][0]:
+            for i, metadata in enumerate(results["metadatas"][0]):
+                score = results["distances"][0][i]
+                print(f"  {i + 1}. {metadata['name']} (score: {score:.3f})")
+        else:
+            print("  No results found")
+
+    # Show detailed investor information
+    print("\n📋 Detailed Investor Sample:")
+
+    with get_session() as session:
+        investor = session.execute(
+            select(Investor).where(Investor.investor_description.isnot(None)).limit(1)
+        ).scalar_one_or_none()
+
+        if investor:
+            print(f"\n🏢 {investor.name}")
+            print(f"🌐 Website: {investor.website}")
+            print(f"📍 HQ: {investor.headquarters or 'Not specified'}")
+            print(f"📝 Description: {investor.investor_description[:200]}...")
+
+            if investor.investment_thesis_focus:
+                print(
+                    f"\n🎯 Investment Focus ({len(investor.investment_thesis_focus)} areas):"
+                )
+                for i, focus in enumerate(investor.investment_thesis_focus[:3], 1):
+                    print(f"  {i}. {focus}")
+                if len(investor.investment_thesis_focus) > 3:
+                    print(f"  ... and {len(investor.investment_thesis_focus) - 3} more")
+
+            if investor.aum_amount:
+                print(f"\n💰 AUM: {investor.aum_amount}")
+
+    print("\n✅ Demo complete!")
+    print("\nTo run the full parser:")
+    print("  python investor_parser.py --file 'your_file.csv' --limit 50")
+    print("\nTo search investors:")
+    print("  python investor_parser.py --search 'your search query'")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,368 @@
+import json
+import logging
+import os
+from typing import Any, Dict, Optional
+
+import chromadb
+import pandas as pd
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from db import get_session, init_database
+from schema import CSVRow, Investor
+
+# Load environment variables
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class LLMInvestorParser:
+    def __init__(self):
+        # Initialize OpenAI client
+        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+
+        # Initialize ChromaDB
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.chroma_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )
+
+        # Initialize database
+        init_database()
+
+    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string with LLM assistance if needed"""
+        if not json_str or json_str.strip() == "":
+            return {}
+
+        try:
+            # Try direct JSON parsing first
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            # If direct parsing fails, use LLM to clean and parse
+            logger.info("Direct JSON parsing failed, using LLM to clean JSON")
+            return self._llm_clean_json(json_str)
+
+    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
+        """Use LLM to clean and parse malformed JSON"""
+        try:
+            prompt = f"""
+            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
+            If it's not possible to create valid JSON, return an empty object {{}}.
+            
+            Original text:
+            {malformed_json[:2000]}  # Limit length for API
+            
+            Return only the cleaned JSON, no explanations:
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0,
+            )
+
+            cleaned_json = response.choices[0].message.content.strip()
+            return json.loads(cleaned_json)
+
+        except Exception as e:
+            logger.error(f"LLM JSON cleaning failed: {e}")
+            return {}
+
+    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
+        """Extract and structure data from CSV row using LLM"""
+        # Parse the investment firm profile
+        profile_data = {}
+        if csv_row.investment_firm_profile:
+            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
+
+        # Create structured output
+        structured_data = {
+            "name": csv_row.name,
+            "website": csv_row.website or profile_data.get("websiteURL"),
+            "investor_description": profile_data.get("investorDescription", ""),
+            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
+            "headquarters": profile_data.get("headquarters", ""),
+            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
+            "funds_info": profile_data.get("funds", []),
+            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
+            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
+            "linkedin_profile": csv_row.linkedin_investment_profile or "",
+            "source_truth_profile": csv_row.source_of_truth_profile or "",
+        }
+
+        return structured_data
+
+    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Use LLM to enhance and standardize investor data"""
+        try:
+            # Combine all available text for context
+            context_text = " ".join(
+                [
+                    investor_data.get("investor_description", ""),
+                    investor_data.get("crunchbase_extract", ""),
+                    investor_data.get("linkedin_profile", ""),
+                    investor_data.get("source_truth_profile", ""),
+                ]
+            )
+
+            if not context_text.strip():
+                return investor_data
+
+            prompt = f"""
+            Based on the following information about an investor, please extract and standardize:
+            1. A concise investor description (2-3 sentences)
+            2. Investment thesis focus areas (list of specific focus areas)
+            3. Headquarters location (city, country format)
+            
+            Investor: {investor_data["name"]}
+            Context: {context_text[:3000]}  # Limit for API
+            
+            Return in JSON format:
+            {{
+                "enhanced_description": "concise description here",
+                "standardized_focus": ["focus area 1", "focus area 2", ...],
+                "standardized_headquarters": "City, Country"
+            }}
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3,
+            )
+
+            enhanced_data = json.loads(response.choices[0].message.content)
+
+            # Update investor data with enhanced information
+            if enhanced_data.get("enhanced_description"):
+                investor_data["enhanced_description"] = enhanced_data[
+                    "enhanced_description"
+                ]
+
+            if enhanced_data.get("standardized_focus"):
+                investor_data["standardized_focus"] = enhanced_data[
+                    "standardized_focus"
+                ]
+
+            if enhanced_data.get("standardized_headquarters"):
+                investor_data["standardized_headquarters"] = enhanced_data[
+                    "standardized_headquarters"
+                ]
+
+            return investor_data
+
+        except Exception as e:
+            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
+            return investor_data
+
+    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
+        """Save investor data to SQL database"""
+        try:
+            with get_session() as session:
+                # Check if investor already exists
+                existing = (
+                    session.query(Investor)
+                    .filter_by(name=investor_data["name"])
+                    .first()
+                )
+
+                if existing:
+                    logger.info(f"Updating existing investor: {investor_data['name']}")
+                    investor = existing
+                else:
+                    logger.info(f"Creating new investor: {investor_data['name']}")
+                    investor = Investor()
+
+                # Map data to investor object
+                investor.name = investor_data["name"]
+                investor.website = investor_data.get("website")
+                investor.investor_description = investor_data.get(
+                    "enhanced_description"
+                ) or investor_data.get("investor_description")
+                investor.investment_thesis_focus = investor_data.get(
+                    "standardized_focus"
+                ) or investor_data.get("investment_thesis_focus")
+                investor.headquarters = investor_data.get(
+                    "standardized_headquarters"
+                ) or investor_data.get("headquarters")
+
+                # AUM information
+                aum_info = investor_data.get("aum_info", {})
+                investor.aum_amount = aum_info.get("aumAmount")
+                investor.aum_as_of_date = aum_info.get("asOfDate")
+                investor.aum_source_url = aum_info.get("sourceUrl")
+
+                # Fund information
+                investor.funds_info = investor_data.get("funds_info", [])
+
+                # Raw data
+                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
+                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
+                investor.linkedin_profile = investor_data.get("linkedin_profile")
+                investor.source_truth_profile = investor_data.get(
+                    "source_truth_profile"
+                )
+
+                if not existing:
+                    session.add(investor)
+
+                session.flush()  # Get the ID
+                return investor.id
+
+        except Exception as e:
+            logger.error(f"Failed to save to SQL: {e}")
+            raise
+
+    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
+        """Save investor description and focus to ChromaDB"""
+        try:
+            # Prepare text for embedding
+            description_text = investor_data.get(
+                "enhanced_description"
+            ) or investor_data.get("investor_description", "")
+            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
+                "investment_thesis_focus", []
+            )
+
+            if isinstance(focus_areas, list):
+                focus_text = " ".join(focus_areas)
+            else:
+                focus_text = str(focus_areas)
+
+            # Combine description and focus for embedding
+            combined_text = f"{description_text} {focus_text}".strip()
+
+            if not combined_text:
+                logger.warning(f"No text to embed for investor {investor_data['name']}")
+                return
+
+            # Create metadata
+            metadata = {
+                "investor_id": investor_id,
+                "name": investor_data["name"],
+                "website": investor_data.get("website", ""),
+                "headquarters": investor_data.get("standardized_headquarters")
+                or investor_data.get("headquarters", ""),
+                "focus_areas_count": len(focus_areas)
+                if isinstance(focus_areas, list)
+                else 0,
+            }
+
+            # Add to ChromaDB
+            self.collection.add(
+                documents=[combined_text],
+                metadatas=[metadata],
+                ids=[f"investor_{investor_id}"],
+            )
+
+            logger.info(f"Added investor {investor_data['name']} to vector database")
+
+        except Exception as e:
+            logger.error(f"Failed to save to vector DB: {e}")
+
+    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
+        """Process the entire CSV file"""
+        logger.info(f"Starting to process CSV file: {csv_file_path}")
+
+        # Read CSV
+        df = pd.read_csv(csv_file_path)
+        logger.info(f"Loaded {len(df)} rows from CSV")
+
+        if limit:
+            df = df.head(limit)
+            logger.info(f"Processing limited to {limit} rows")
+
+        processed_count = 0
+        error_count = 0
+
+        for index, row in df.iterrows():
+            try:
+                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
+
+                # Create CSVRow object
+                csv_row = CSVRow(
+                    name=row["Name"],
+                    website=row.get("Website"),
+                    investment_firm_profile=row.get("Investment Firm Profile"),
+                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
+                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
+                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
+                    source_of_truth_profile=row.get("Source of Truth Profile"),
+                )
+
+                # Extract structured data
+                structured_data = self.extract_structured_data(csv_row)
+
+                # Enhance with LLM
+                enhanced_data = self.enhance_with_llm(structured_data)
+
+                # Save to SQL database
+                investor_id = self.save_to_sql(enhanced_data)
+
+                # Save to vector database
+                self.save_to_vector_db(investor_id, enhanced_data)
+
+                processed_count += 1
+
+                # Progress update every 10 rows
+                if (index + 1) % 10 == 0:
+                    logger.info(
+                        f"Processed {processed_count} rows successfully, {error_count} errors"
+                    )
+
+            except Exception as e:
+                error_count += 1
+                logger.error(
+                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
+                )
+                continue
+
+        logger.info(
+            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
+        )
+        return processed_count, error_count
+
+    def search_investors(self, query: str, limit: int = 5):
+        """Search investors using vector similarity"""
+        try:
+            results = self.collection.query(query_texts=[query], n_results=limit)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            return None
+
+
+def main():
+    """Main function to run the parser"""
+    parser = LLMInvestorParser()
+
+    # Process the CSV file
+    csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
+
+    # Start with a small sample for testing
+    processed, errors = parser.process_csv_file(csv_file, limit=5)
+
+    print("\nProcessing complete!")
+    print(f"Successfully processed: {processed} investors")
+    print(f"Errors encountered: {errors}")
+
+    # Test search functionality
+    print("\nTesting search functionality...")
+    results = parser.search_investors("bioeconomy circular economy")
+    if results:
+        print(f"Found {len(results['documents'][0])} similar investors")
+        for i, doc in enumerate(results["documents"][0]):
+            print(f"  {i + 1}. {results['metadatas'][0][i]['name']}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,449 @@
+#!/usr/bin/env python3
+"""
+LLM-powered Investor Parser
+
+A comprehensive parser that processes investor CSV data and saves it to both SQL and vector databases.
+Supports both simple parsing and LLM-enhanced parsing for better data quality.
+
+Usage:
+    python investor_parser.py --help
+    python investor_parser.py --file="path/to/csv" --limit=10
+    python investor_parser.py --file="path/to/csv" --use-llm --limit=50
+    python investor_parser.py --search="bioeconomy circular"
+"""
+
+import argparse
+import json
+import logging
+import os
+from typing import Any, Dict, Optional
+
+import chromadb
+import pandas as pd
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from db import get_session, init_database
+from schema import CSVRow, Investor
+
+# Load environment variables
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class InvestorParser:
+    """Complete investor parser with optional LLM enhancement"""
+
+    def __init__(self, use_llm: bool = False):
+        self.use_llm = use_llm
+
+        # Initialize OpenAI client if using LLM
+        if self.use_llm:
+            api_key = os.getenv("OPENAI_API_KEY")
+            if not api_key:
+                logger.warning(
+                    "OpenAI API key not found. LLM features will be disabled."
+                )
+                self.use_llm = False
+            else:
+                self.openai_client = OpenAI(api_key=api_key)
+                logger.info("LLM enhancement enabled")
+
+        # Initialize ChromaDB
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.chroma_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )
+
+        # Initialize database
+        init_database()
+
+    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string with optional LLM assistance"""
+        if not json_str or json_str.strip() == "":
+            return {}
+
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            logger.warning(f"JSON parsing failed: {e}")
+
+            # Use LLM to clean JSON if available
+            if self.use_llm:
+                return self._llm_clean_json(json_str)
+            else:
+                return {}
+
+    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
+        """Use LLM to clean and parse malformed JSON"""
+        try:
+            prompt = f"""
+            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
+            If it's not possible to create valid JSON, return an empty object {{}}.
+            
+            Original text:
+            {malformed_json[:2000]}  # Limit length for API
+            
+            Return only the cleaned JSON, no explanations:
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0,
+            )
+
+            cleaned_json = response.choices[0].message.content.strip()
+            return json.loads(cleaned_json)
+
+        except Exception as e:
+            logger.error(f"LLM JSON cleaning failed: {e}")
+            return {}
+
+    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
+        """Extract and structure data from CSV row"""
+        # Parse the investment firm profile
+        profile_data = {}
+        if csv_row.investment_firm_profile:
+            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
+
+        # Create structured output
+        structured_data = {
+            "name": csv_row.name,
+            "website": csv_row.website or profile_data.get("websiteURL"),
+            "investor_description": profile_data.get("investorDescription", ""),
+            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
+            "headquarters": profile_data.get("headquarters", ""),
+            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
+            "funds_info": profile_data.get("funds", []),
+            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
+            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
+            "linkedin_profile": csv_row.linkedin_investment_profile or "",
+            "source_truth_profile": csv_row.source_of_truth_profile or "",
+        }
+
+        return structured_data
+
+    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Use LLM to enhance and standardize investor data"""
+        if not self.use_llm:
+            return investor_data
+
+        try:
+            # Combine all available text for context
+            context_text = " ".join(
+                [
+                    investor_data.get("investor_description", ""),
+                    investor_data.get("crunchbase_extract", ""),
+                    investor_data.get("linkedin_profile", ""),
+                    investor_data.get("source_truth_profile", ""),
+                ]
+            )
+
+            if not context_text.strip():
+                return investor_data
+
+            prompt = f"""
+            Based on the following information about an investor, please extract and standardize:
+            1. A concise investor description (2-3 sentences)
+            2. Investment thesis focus areas (list of specific focus areas)
+            3. Headquarters location (city, country format)
+            
+            Investor: {investor_data["name"]}
+            Context: {context_text[:3000]}  # Limit for API
+            
+            Return in JSON format:
+            {{
+                "enhanced_description": "concise description here",
+                "standardized_focus": ["focus area 1", "focus area 2", ...],
+                "standardized_headquarters": "City, Country"
+            }}
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3,
+            )
+
+            enhanced_data = json.loads(response.choices[0].message.content)
+
+            # Update investor data with enhanced information
+            if enhanced_data.get("enhanced_description"):
+                investor_data["enhanced_description"] = enhanced_data[
+                    "enhanced_description"
+                ]
+
+            if enhanced_data.get("standardized_focus"):
+                investor_data["standardized_focus"] = enhanced_data[
+                    "standardized_focus"
+                ]
+
+            if enhanced_data.get("standardized_headquarters"):
+                investor_data["standardized_headquarters"] = enhanced_data[
+                    "standardized_headquarters"
+                ]
+
+            return investor_data
+
+        except Exception as e:
+            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
+            return investor_data
+
+    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
+        """Save investor data to SQL database"""
+        try:
+            with get_session() as session:
+                # Check if investor already exists
+                existing = (
+                    session.query(Investor)
+                    .filter_by(name=investor_data["name"])
+                    .first()
+                )
+
+                if existing:
+                    logger.info(f"Updating existing investor: {investor_data['name']}")
+                    investor = existing
+                else:
+                    logger.info(f"Creating new investor: {investor_data['name']}")
+                    investor = Investor()
+
+                # Map data to investor object
+                investor.name = investor_data["name"]
+                investor.website = investor_data.get("website")
+                investor.investor_description = investor_data.get(
+                    "enhanced_description"
+                ) or investor_data.get("investor_description")
+                investor.investment_thesis_focus = investor_data.get(
+                    "standardized_focus"
+                ) or investor_data.get("investment_thesis_focus")
+                investor.headquarters = investor_data.get(
+                    "standardized_headquarters"
+                ) or investor_data.get("headquarters")
+
+                # AUM information
+                aum_info = investor_data.get("aum_info") or {}
+                investor.aum_amount = aum_info.get("aumAmount")
+                investor.aum_as_of_date = aum_info.get("asOfDate")
+                investor.aum_source_url = aum_info.get("sourceUrl")
+
+                # Fund information
+                investor.funds_info = investor_data.get("funds_info", [])
+
+                # Raw data
+                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
+                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
+                investor.linkedin_profile = investor_data.get("linkedin_profile")
+                investor.source_truth_profile = investor_data.get(
+                    "source_truth_profile"
+                )
+
+                if not existing:
+                    session.add(investor)
+
+                session.flush()  # Get the ID
+                return investor.id
+
+        except Exception as e:
+            logger.error(f"Failed to save to SQL: {e}")
+            raise
+
+    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
+        """Save investor description and focus to ChromaDB"""
+        try:
+            # Prepare text for embedding
+            description_text = investor_data.get(
+                "enhanced_description"
+            ) or investor_data.get("investor_description", "")
+            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
+                "investment_thesis_focus", []
+            )
+
+            if isinstance(focus_areas, list):
+                focus_text = " ".join(focus_areas)
+            else:
+                focus_text = str(focus_areas)
+
+            # Combine description and focus for embedding
+            combined_text = f"{description_text} {focus_text}".strip()
+
+            if not combined_text:
+                logger.warning(f"No text to embed for investor {investor_data['name']}")
+                return
+
+            # Create metadata
+            metadata = {
+                "investor_id": investor_id,
+                "name": investor_data["name"],
+                "website": investor_data.get("website") or "",
+                "headquarters": investor_data.get("standardized_headquarters")
+                or investor_data.get("headquarters")
+                or "",
+                "focus_areas_count": len(focus_areas)
+                if isinstance(focus_areas, list)
+                else 0,
+            }
+
+            # Add to ChromaDB
+            self.collection.add(
+                documents=[combined_text],
+                metadatas=[metadata],
+                ids=[f"investor_{investor_id}"],
+            )
+
+            logger.info(f"Added investor {investor_data['name']} to vector database")
+
+        except Exception as e:
+            logger.error(f"Failed to save to vector DB: {e}")
+
+    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
+        """Process the entire CSV file"""
+        logger.info(f"Starting to process CSV file: {csv_file_path}")
+
+        # Read CSV
+        df = pd.read_csv(csv_file_path)
+        logger.info(f"Loaded {len(df)} rows from CSV")
+
+        if limit:
+            df = df.head(limit)
+            logger.info(f"Processing limited to {limit} rows")
+
+        processed_count = 0
+        error_count = 0
+
+        for index, row in df.iterrows():
+            try:
+                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
+
+                # Create CSVRow object
+                csv_row = CSVRow(
+                    name=row["Name"],
+                    website=row.get("Website"),
+                    investment_firm_profile=row.get("Investment Firm Profile"),
+                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
+                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
+                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
+                    source_of_truth_profile=row.get("Source of Truth Profile"),
+                )
+
+                # Extract structured data
+                structured_data = self.extract_structured_data(csv_row)
+
+                # Enhance with LLM if enabled
+                enhanced_data = self.enhance_with_llm(structured_data)
+
+                # Save to SQL database
+                investor_id = self.save_to_sql(enhanced_data)
+
+                # Save to vector database
+                self.save_to_vector_db(investor_id, enhanced_data)
+
+                processed_count += 1
+
+                # Progress update every 10 rows
+                if (index + 1) % 10 == 0:
+                    logger.info(
+                        f"Progress: {processed_count} processed, {error_count} errors"
+                    )
+
+            except Exception as e:
+                error_count += 1
+                logger.error(
+                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
+                )
+                continue
+
+        logger.info(
+            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
+        )
+        return processed_count, error_count
+
+    def search_investors(self, query: str, limit: int = 10):
+        """Search investors using vector similarity"""
+        try:
+            results = self.collection.query(query_texts=[query], n_results=limit)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            return None
+
+
+def main():
+    """Main function with command line interface"""
+    parser = argparse.ArgumentParser(description="LLM-powered Investor Parser")
+    parser.add_argument("--file", type=str, help="Path to CSV file to process")
+    parser.add_argument("--limit", type=int, help="Limit number of rows to process")
+    parser.add_argument(
+        "--use-llm",
+        action="store_true",
+        help="Enable LLM enhancement (requires OpenAI API key)",
+    )
+    parser.add_argument("--search", type=str, help="Search query for vector database")
+    parser.add_argument(
+        "--search-limit",
+        type=int,
+        default=10,
+        help="Number of search results to return",
+    )
+
+    args = parser.parse_args()
+
+    # Initialize parser
+    investor_parser = InvestorParser(use_llm=args.use_llm)
+
+    if args.search:
+        # Perform search
+        logger.info(f"Searching for: {args.search}")
+        results = investor_parser.search_investors(args.search, args.search_limit)
+
+        if results and results["documents"][0]:
+            print(f"\nFound {len(results['documents'][0])} similar investors:")
+            for i, (doc, metadata) in enumerate(
+                zip(results["documents"][0], results["metadatas"][0])
+            ):
+                print(f"{i + 1}. {metadata['name']}")
+                print(f"   Website: {metadata.get('website', 'N/A')}")
+                print(f"   HQ: {metadata.get('headquarters', 'N/A')}")
+                print(f"   Focus areas: {metadata.get('focus_areas_count', 0)}")
+                print(f"   Similarity score: {results['distances'][0][i]:.3f}")
+                print()
+        else:
+            print("No results found.")
+
+    elif args.file:
+        # Process CSV file
+        if not os.path.exists(args.file):
+            logger.error(f"File not found: {args.file}")
+            return
+
+        processed, errors = investor_parser.process_csv_file(args.file, args.limit)
+
+        print("\nProcessing complete!")
+        print(f"Successfully processed: {processed} investors")
+        print(f"Errors encountered: {errors}")
+
+        # Show some search examples
+        print("\nTrying some example searches...")
+        for query in ["bioeconomy", "venture capital", "sustainability"]:
+            results = investor_parser.search_investors(query, 3)
+            if results and results["documents"][0]:
+                print(f"\nTop matches for '{query}':")
+                for i, metadata in enumerate(results["metadatas"][0][:3]):
+                    print(f"  {i + 1}. {metadata['name']}")
+
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,109 @@
+from sqlalchemy import Column, Integer, String, Text, DateTime, JSON, Float
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.sql import func
+from pydantic import BaseModel
+from typing import List, Optional, Dict, Any
+import json
+
+Base = declarative_base()
+
+class Investor(Base):
+    __tablename__ = 'investors'
+    
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    name = Column(String(500), nullable=False)
+    website = Column(String(1000))
+    
+    # Core investment information
+    investor_description = Column(Text)
+    investment_thesis_focus = Column(JSON)  # List of focus areas
+    headquarters = Column(String(1000))
+    
+    # AUM information
+    aum_amount = Column(String(200))
+    aum_as_of_date = Column(String(100))
+    aum_source_url = Column(String(1000))
+    
+    # Fund information
+    funds_info = Column(JSON)  # Complex fund data
+    
+    # Raw data columns for reference
+    crunchbase_urls = Column(Text)
+    crunchbase_extract = Column(Text)
+    linkedin_profile = Column(Text)
+    source_truth_profile = Column(Text)
+    
+    # Metadata
+    created_at = Column(DateTime(timezone=True), server_default=func.now())
+    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
+    
+    def __repr__(self):
+        return f"<Investor(name='{self.name}', website='{self.website}')>"
+
+# Pydantic models for data validation and parsing
+class AUMInfo(BaseModel):
+    aumAmount: Optional[str] = None
+    asOfDate: Optional[str] = None
+    sourceUrl: Optional[str] = None
+
+class FundInfo(BaseModel):
+    fundName: Optional[str] = None
+    fundSize: Optional[str] = None
+    vintage: Optional[str] = None
+    status: Optional[str] = None
+    description: Optional[str] = None
+
+class InvestorProfile(BaseModel):
+    websiteURL: Optional[str] = None
+    investorDescription: Optional[str] = None
+    investmentThesisFocus: Optional[List[str]] = None
+    headquarters: Optional[str] = None
+    overallAssetsUnderManagement: Optional[AUMInfo] = None
+    funds: Optional[List[FundInfo]] = None
+
+class CSVRow(BaseModel):
+    name: str
+    website: Optional[str] = None
+    investment_firm_profile: Optional[str] = None
+    crunchbase_linkedin_urls: Optional[str] = None
+    crunchbase_firm_extract: Optional[str] = None
+    linkedin_investment_profile: Optional[str] = None
+    source_of_truth_profile: Optional[str] = None
+    
+    def get_combined_description(self) -> str:
+        """Combine all description fields for vector embedding"""
+        descriptions = []
+        
+        if self.investment_firm_profile:
+            try:
+                profile_data = json.loads(self.investment_firm_profile)
+                if isinstance(profile_data, dict):
+                    desc = profile_data.get('investorDescription', '')
+                    if desc:
+                        descriptions.append(desc)
+            except (json.JSONDecodeError, TypeError):
+                pass
+        
+        if self.crunchbase_firm_extract:
+            descriptions.append(self.crunchbase_firm_extract)
+        
+        if self.linkedin_investment_profile:
+            descriptions.append(self.linkedin_investment_profile)
+            
+        if self.source_of_truth_profile:
+            descriptions.append(self.source_of_truth_profile)
+        
+        return " ".join(descriptions)
+    
+    def get_investment_focus(self) -> List[str]:
+        """Extract investment thesis focus"""
+        if self.investment_firm_profile:
+            try:
+                profile_data = json.loads(self.investment_firm_profile)
+                if isinstance(profile_data, dict):
+                    focus = profile_data.get('investmentThesisFocus', [])
+                    if isinstance(focus, list):
+                        return focus
+            except (json.JSONDecodeError, TypeError):
+                pass
+        return []
@@ -0,0 +1,260 @@
+import json
+import logging
+from typing import Any, Dict, Optional
+
+import chromadb
+import pandas as pd
+
+from db import get_session, init_database
+from schema import CSVRow, Investor
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class SimpleInvestorParser:
+    """Simplified parser that works without OpenAI API for testing"""
+
+    def __init__(self):
+        # Initialize ChromaDB
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.chroma_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )
+
+        # Initialize database
+        init_database()
+
+    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string"""
+        if not json_str or json_str.strip() == "":
+            return {}
+
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError as e:
+            logger.warning(f"JSON parsing failed: {e}")
+            return {}
+
+    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
+        """Extract and structure data from CSV row"""
+        # Parse the investment firm profile
+        profile_data = {}
+        if csv_row.investment_firm_profile:
+            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
+
+        # Create structured output
+        structured_data = {
+            "name": csv_row.name,
+            "website": csv_row.website or profile_data.get("websiteURL"),
+            "investor_description": profile_data.get("investorDescription", ""),
+            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
+            "headquarters": profile_data.get("headquarters", ""),
+            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
+            "funds_info": profile_data.get("funds", []),
+            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
+            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
+            "linkedin_profile": csv_row.linkedin_investment_profile or "",
+            "source_truth_profile": csv_row.source_of_truth_profile or "",
+        }
+
+        return structured_data
+
+    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
+        """Save investor data to SQL database"""
+        try:
+            with get_session() as session:
+                # Check if investor already exists
+                existing = (
+                    session.query(Investor)
+                    .filter_by(name=investor_data["name"])
+                    .first()
+                )
+
+                if existing:
+                    logger.info(f"Updating existing investor: {investor_data['name']}")
+                    investor = existing
+                else:
+                    logger.info(f"Creating new investor: {investor_data['name']}")
+                    investor = Investor()
+
+                # Map data to investor object
+                investor.name = investor_data["name"]
+                investor.website = investor_data.get("website")
+                investor.investor_description = investor_data.get(
+                    "investor_description"
+                )
+                investor.investment_thesis_focus = investor_data.get(
+                    "investment_thesis_focus"
+                )
+                investor.headquarters = investor_data.get("headquarters")
+
+                # AUM information
+                aum_info = investor_data.get("aum_info") or {}
+                investor.aum_amount = aum_info.get("aumAmount")
+                investor.aum_as_of_date = aum_info.get("asOfDate")
+                investor.aum_source_url = aum_info.get("sourceUrl")
+
+                # Fund information
+                investor.funds_info = investor_data.get("funds_info", [])
+
+                # Raw data
+                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
+                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
+                investor.linkedin_profile = investor_data.get("linkedin_profile")
+                investor.source_truth_profile = investor_data.get(
+                    "source_truth_profile"
+                )
+
+                if not existing:
+                    session.add(investor)
+
+                session.flush()  # Get the ID
+                return investor.id
+
+        except Exception as e:
+            logger.error(f"Failed to save to SQL: {e}")
+            raise
+
+    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
+        """Save investor description and focus to ChromaDB"""
+        try:
+            # Prepare text for embedding
+            description_text = investor_data.get("investor_description", "")
+            focus_areas = investor_data.get("investment_thesis_focus", [])
+
+            if isinstance(focus_areas, list):
+                focus_text = " ".join(focus_areas)
+            else:
+                focus_text = str(focus_areas)
+
+            # Combine description and focus for embedding
+            combined_text = f"{description_text} {focus_text}".strip()
+
+            if not combined_text:
+                logger.warning(f"No text to embed for investor {investor_data['name']}")
+                return
+
+            # Create metadata
+            metadata = {
+                "investor_id": investor_id,
+                "name": investor_data["name"],
+                "website": investor_data.get("website") or "",
+                "headquarters": investor_data.get("headquarters") or "",
+                "focus_areas_count": len(focus_areas)
+                if isinstance(focus_areas, list)
+                else 0,
+            }
+
+            # Add to ChromaDB
+            self.collection.add(
+                documents=[combined_text],
+                metadatas=[metadata],
+                ids=[f"investor_{investor_id}"],
+            )
+
+            logger.info(f"Added investor {investor_data['name']} to vector database")
+
+        except Exception as e:
+            logger.error(f"Failed to save to vector DB: {e}")
+
+    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
+        """Process the entire CSV file"""
+        logger.info(f"Starting to process CSV file: {csv_file_path}")
+
+        # Read CSV
+        df = pd.read_csv(csv_file_path)
+        logger.info(f"Loaded {len(df)} rows from CSV")
+
+        if limit:
+            df = df.head(limit)
+            logger.info(f"Processing limited to {limit} rows")
+
+        processed_count = 0
+        error_count = 0
+
+        for index, row in df.iterrows():
+            try:
+                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
+
+                # Create CSVRow object
+                csv_row = CSVRow(
+                    name=row["Name"],
+                    website=row.get("Website"),
+                    investment_firm_profile=row.get("Investment Firm Profile"),
+                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
+                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
+                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
+                    source_of_truth_profile=row.get("Source of Truth Profile"),
+                )
+
+                # Extract structured data
+                structured_data = self.extract_structured_data(csv_row)
+
+                # Save to SQL database
+                investor_id = self.save_to_sql(structured_data)
+
+                # Save to vector database
+                self.save_to_vector_db(investor_id, structured_data)
+
+                processed_count += 1
+
+                # Progress update every 10 rows
+                if (index + 1) % 10 == 0:
+                    logger.info(
+                        f"Processed {processed_count} rows successfully, {error_count} errors"
+                    )
+
+            except Exception as e:
+                error_count += 1
+                logger.error(
+                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
+                )
+                continue
+
+        logger.info(
+            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
+        )
+        return processed_count, error_count
+
+    def search_investors(self, query: str, limit: int = 5):
+        """Search investors using vector similarity"""
+        try:
+            results = self.collection.query(query_texts=[query], n_results=limit)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            return None
+
+
+def main():
+    """Main function to run the parser"""
+    parser = SimpleInvestorParser()
+
+    # Process the CSV file
+    csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
+
+    # Start with a small sample for testing
+    processed, errors = parser.process_csv_file(csv_file, limit=5)
+
+    print("Processing complete!")
+    print(f"Successfully processed: {processed} investors")
+    print(f"Errors encountered: {errors}")
+
+    # Test search functionality
+    print("\nTesting search functionality...")
+    results = parser.search_investors("bioeconomy circular economy")
+    if results:
+        print(f"Found {len(results['documents'][0])} similar investors")
+        for i, doc in enumerate(results["documents"][0]):
+            print(f"  {i + 1}. {results['metadatas'][0][i]['name']}")
+
+
+if __name__ == "__main__":
+    main()