Implement investor processing and querying functionality

- Added InvestorProcessor class for processing CSV data in batches and saving to SQL and vector databases. - Introduced QueryProcessor class for querying investor information from SQL and vector databases. - Integrated OpenAI's ChatGPT for structured output generation. - Implemented data cleaning and control character removal in CSV processing. - Added asynchronous processing capabilities for batch handling. - Established connection to ChromaDB for vector storage of investor descriptions. - Defined structured output schemas using Pydantic for investor data validation. - Enhanced settings management for API key and database configurations.
2025-08-29 18:42:55 +01:00
parent 4c99638d94
commit ba0ed169ce
22 changed files with 719 additions and 492 deletions
@@ -1,11 +1,12 @@
 import os
-from contextlib import contextmanager
-from typing import Generator
+from typing import Annotated

+from fastapi import Depends
 from sqlalchemy import create_engine
+from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import Session, sessionmaker

-from schema import Base
+Base = declarative_base()

 # Database configuration
 DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///investors.db")
@@ -17,26 +18,23 @@ engine = create_engine(DATABASE_URL, echo=False)
 SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)


+def get_db():
+    db = SessionLocal()
+    try:
+        yield db
+    finally:
+        db.close()
+
+
+db_dependency = Annotated[Session, Depends(get_db)]
+
+
 def init_database():
    """Initialize the database by creating all tables"""
    Base.metadata.create_all(bind=engine)
    print("Database initialized successfully!")


-@contextmanager
-def get_session() -> Generator[Session, None, None]:
-    """Get a database session with automatic cleanup"""
-    session = SessionLocal()
-    try:
-        yield session
-        session.commit()
-    except Exception as e:
-        session.rollback()
-        raise e
-    finally:
-        session.close()
-
-
 def get_session_sync() -> Session:
    """Get a database session for synchronous operations"""
    return SessionLocal()
@@ -0,0 +1,23 @@
+import datetime
+
+from sqlalchemy import Column, DateTime, Integer, String
+
+from db.db import Base
+
+
+class InvestorTable(Base):
+    __tablename__ = "investors"
+
+    id = Column(Integer, primary_key=True, index=True)
+    name = Column(String, nullable=False)
+    aum = Column(Integer, nullable=False)
+    check_size = Column(String, nullable=False)
+    sector_focus = Column(String, nullable=False)
+    stage_focus = Column(String, nullable=False)
+    region = Column(String, nullable=False)
+    created_at = Column(DateTime, default=datetime.datetime.now(datetime.UTC))
+    updated_at = Column(
+        DateTime,
+        default=datetime.datetime.now(datetime.UTC),
+        onupdate=datetime.datetime.now(datetime.UTC),
+    )
@@ -1,7 +1,44 @@
-from fastapi import FastAPI
+import io
+
+import pandas as pd
+from db.db import db_dependency, init_database
+from fastapi import FastAPI, File, UploadFile
+from services.openrouter import InvestorProcessor
+
+from app.services.querying import QueryProcessor

 app = FastAPI()

+init_database()
+
+
@app.get("/")
 def read_root():
-    return {"Hello": "World"}
+    return {"Hello": "World"}
+
+
+@app.post("/parse-csv")
+async def parse_csv(db: db_dependency, file: UploadFile = File(...)):
+    # Read uploaded CSV with pandas
+    content = await file.read()
+    df = pd.read_csv(io.StringIO(content.decode("utf-8")))
+
+    # Process the dataframe
+    processor = InvestorProcessor(sql_session=db)
+    results = await processor.process_csv(df)
+
+    # Convert Pydantic objects to dictionaries
+    return {"results": [r.dict() for r in results]}
+
+
+@app.post("/query")
+async def query_investors(db: db_dependency, question: str):
+    processor = QueryProcessor(sql_session=db)
+    results = processor.process_query(question)
+    return {"results": [r.dict() for r in results]}
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app="main:app", host="localhost", port=8000, reload=True)
@@ -0,0 +1,38 @@
+from typing import List
+
+from pydantic import BaseModel
+
+
+class Investor(BaseModel):
+    name: str
+    aum: int
+    check_size: str
+    sector_focus: str
+    stage_focus: str
+    region: str
+    investment_thesis: str
+    investor_description: str
+
+
+class InvestorList(BaseModel):
+    investor_list: List[Investor]
+
+
+class QueryResponse(BaseModel):
+    name: str
+    aum: int
+    check_size: str
+    sector_focus: str
+    stage_focus: str
+    region: str
+    investment_thesis: str
+    investor_description: str
+    reason: str
+
+
+class QueryRequest(BaseModel):
+    question: str
+
+
+class QueryResponseList(BaseModel):
+    responses: List[QueryResponse]
@@ -1,449 +0,0 @@
-#!/usr/bin/env python3
-"""
-LLM-powered Investor Parser
-
-A comprehensive parser that processes investor CSV data and saves it to both SQL and vector databases.
-Supports both simple parsing and LLM-enhanced parsing for better data quality.
-
-Usage:
-    python investor_parser.py --help
-    python investor_parser.py --file="path/to/csv" --limit=10
-    python investor_parser.py --file="path/to/csv" --use-llm --limit=50
-    python investor_parser.py --search="bioeconomy circular"
-"""
-
-import argparse
-import json
-import logging
-import os
-from typing import Any, Dict, Optional
-
-import chromadb
-import pandas as pd
-from dotenv import load_dotenv
-from openai import OpenAI
-
-from db import get_session, init_database
-from schema import CSVRow, Investor
-
-# Load environment variables
-load_dotenv()
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
-)
-logger = logging.getLogger(__name__)
-
-
-class InvestorParser:
-    """Complete investor parser with optional LLM enhancement"""
-
-    def __init__(self, use_llm: bool = False):
-        self.use_llm = use_llm
-
-        # Initialize OpenAI client if using LLM
-        if self.use_llm:
-            api_key = os.getenv("OPENAI_API_KEY")
-            if not api_key:
-                logger.warning(
-                    "OpenAI API key not found. LLM features will be disabled."
-                )
-                self.use_llm = False
-            else:
-                self.openai_client = OpenAI(api_key=api_key)
-                logger.info("LLM enhancement enabled")
-
-        # Initialize ChromaDB
-        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
-        self.collection = self.chroma_client.get_or_create_collection(
-            name="investor_descriptions",
-            metadata={
-                "description": "Investor descriptions and investment thesis focus"
-            },
-        )
-
-        # Initialize database
-        init_database()
-
-    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
-        """Safely parse JSON string with optional LLM assistance"""
-        if not json_str or json_str.strip() == "":
-            return {}
-
-        try:
-            return json.loads(json_str)
-        except json.JSONDecodeError as e:
-            logger.warning(f"JSON parsing failed: {e}")
-
-            # Use LLM to clean JSON if available
-            if self.use_llm:
-                return self._llm_clean_json(json_str)
-            else:
-                return {}
-
-    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
-        """Use LLM to clean and parse malformed JSON"""
-        try:
-            prompt = f"""
-            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
-            If it's not possible to create valid JSON, return an empty object {{}}.
-            
-            Original text:
-            {malformed_json[:2000]}  # Limit length for API
-            
-            Return only the cleaned JSON, no explanations:
-            """
-
-            response = self.openai_client.chat.completions.create(
-                model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0,
-            )
-
-            cleaned_json = response.choices[0].message.content.strip()
-            return json.loads(cleaned_json)
-
-        except Exception as e:
-            logger.error(f"LLM JSON cleaning failed: {e}")
-            return {}
-
-    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
-        """Extract and structure data from CSV row"""
-        # Parse the investment firm profile
-        profile_data = {}
-        if csv_row.investment_firm_profile:
-            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
-
-        # Create structured output
-        structured_data = {
-            "name": csv_row.name,
-            "website": csv_row.website or profile_data.get("websiteURL"),
-            "investor_description": profile_data.get("investorDescription", ""),
-            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
-            "headquarters": profile_data.get("headquarters", ""),
-            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
-            "funds_info": profile_data.get("funds", []),
-            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
-            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
-            "linkedin_profile": csv_row.linkedin_investment_profile or "",
-            "source_truth_profile": csv_row.source_of_truth_profile or "",
-        }
-
-        return structured_data
-
-    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
-        """Use LLM to enhance and standardize investor data"""
-        if not self.use_llm:
-            return investor_data
-
-        try:
-            # Combine all available text for context
-            context_text = " ".join(
-                [
-                    investor_data.get("investor_description", ""),
-                    investor_data.get("crunchbase_extract", ""),
-                    investor_data.get("linkedin_profile", ""),
-                    investor_data.get("source_truth_profile", ""),
-                ]
-            )
-
-            if not context_text.strip():
-                return investor_data
-
-            prompt = f"""
-            Based on the following information about an investor, please extract and standardize:
-            1. A concise investor description (2-3 sentences)
-            2. Investment thesis focus areas (list of specific focus areas)
-            3. Headquarters location (city, country format)
-            
-            Investor: {investor_data["name"]}
-            Context: {context_text[:3000]}  # Limit for API
-            
-            Return in JSON format:
-            {{
-                "enhanced_description": "concise description here",
-                "standardized_focus": ["focus area 1", "focus area 2", ...],
-                "standardized_headquarters": "City, Country"
-            }}
-            """
-
-            response = self.openai_client.chat.completions.create(
-                model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.3,
-            )
-
-            enhanced_data = json.loads(response.choices[0].message.content)
-
-            # Update investor data with enhanced information
-            if enhanced_data.get("enhanced_description"):
-                investor_data["enhanced_description"] = enhanced_data[
-                    "enhanced_description"
-                ]
-
-            if enhanced_data.get("standardized_focus"):
-                investor_data["standardized_focus"] = enhanced_data[
-                    "standardized_focus"
-                ]
-
-            if enhanced_data.get("standardized_headquarters"):
-                investor_data["standardized_headquarters"] = enhanced_data[
-                    "standardized_headquarters"
-                ]
-
-            return investor_data
-
-        except Exception as e:
-            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
-            return investor_data
-
-    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
-        """Save investor data to SQL database"""
-        try:
-            with get_session() as session:
-                # Check if investor already exists
-                existing = (
-                    session.query(Investor)
-                    .filter_by(name=investor_data["name"])
-                    .first()
-                )
-
-                if existing:
-                    logger.info(f"Updating existing investor: {investor_data['name']}")
-                    investor = existing
-                else:
-                    logger.info(f"Creating new investor: {investor_data['name']}")
-                    investor = Investor()
-
-                # Map data to investor object
-                investor.name = investor_data["name"]
-                investor.website = investor_data.get("website")
-                investor.investor_description = investor_data.get(
-                    "enhanced_description"
-                ) or investor_data.get("investor_description")
-                investor.investment_thesis_focus = investor_data.get(
-                    "standardized_focus"
-                ) or investor_data.get("investment_thesis_focus")
-                investor.headquarters = investor_data.get(
-                    "standardized_headquarters"
-                ) or investor_data.get("headquarters")
-
-                # AUM information
-                aum_info = investor_data.get("aum_info") or {}
-                investor.aum_amount = aum_info.get("aumAmount")
-                investor.aum_as_of_date = aum_info.get("asOfDate")
-                investor.aum_source_url = aum_info.get("sourceUrl")
-
-                # Fund information
-                investor.funds_info = investor_data.get("funds_info", [])
-
-                # Raw data
-                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
-                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
-                investor.linkedin_profile = investor_data.get("linkedin_profile")
-                investor.source_truth_profile = investor_data.get(
-                    "source_truth_profile"
-                )
-
-                if not existing:
-                    session.add(investor)
-
-                session.flush()  # Get the ID
-                return investor.id
-
-        except Exception as e:
-            logger.error(f"Failed to save to SQL: {e}")
-            raise
-
-    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
-        """Save investor description and focus to ChromaDB"""
-        try:
-            # Prepare text for embedding
-            description_text = investor_data.get(
-                "enhanced_description"
-            ) or investor_data.get("investor_description", "")
-            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
-                "investment_thesis_focus", []
-            )
-
-            if isinstance(focus_areas, list):
-                focus_text = " ".join(focus_areas)
-            else:
-                focus_text = str(focus_areas)
-
-            # Combine description and focus for embedding
-            combined_text = f"{description_text} {focus_text}".strip()
-
-            if not combined_text:
-                logger.warning(f"No text to embed for investor {investor_data['name']}")
-                return
-
-            # Create metadata
-            metadata = {
-                "investor_id": investor_id,
-                "name": investor_data["name"],
-                "website": investor_data.get("website") or "",
-                "headquarters": investor_data.get("standardized_headquarters")
-                or investor_data.get("headquarters")
-                or "",
-                "focus_areas_count": len(focus_areas)
-                if isinstance(focus_areas, list)
-                else 0,
-            }
-
-            # Add to ChromaDB
-            self.collection.add(
-                documents=[combined_text],
-                metadatas=[metadata],
-                ids=[f"investor_{investor_id}"],
-            )
-
-            logger.info(f"Added investor {investor_data['name']} to vector database")
-
-        except Exception as e:
-            logger.error(f"Failed to save to vector DB: {e}")
-
-    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
-        """Process the entire CSV file"""
-        logger.info(f"Starting to process CSV file: {csv_file_path}")
-
-        # Read CSV
-        df = pd.read_csv(csv_file_path)
-        logger.info(f"Loaded {len(df)} rows from CSV")
-
-        if limit:
-            df = df.head(limit)
-            logger.info(f"Processing limited to {limit} rows")
-
-        processed_count = 0
-        error_count = 0
-
-        for index, row in df.iterrows():
-            try:
-                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
-
-                # Create CSVRow object
-                csv_row = CSVRow(
-                    name=row["Name"],
-                    website=row.get("Website"),
-                    investment_firm_profile=row.get("Investment Firm Profile"),
-                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
-                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
-                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
-                    source_of_truth_profile=row.get("Source of Truth Profile"),
-                )
-
-                # Extract structured data
-                structured_data = self.extract_structured_data(csv_row)
-
-                # Enhance with LLM if enabled
-                enhanced_data = self.enhance_with_llm(structured_data)
-
-                # Save to SQL database
-                investor_id = self.save_to_sql(enhanced_data)
-
-                # Save to vector database
-                self.save_to_vector_db(investor_id, enhanced_data)
-
-                processed_count += 1
-
-                # Progress update every 10 rows
-                if (index + 1) % 10 == 0:
-                    logger.info(
-                        f"Progress: {processed_count} processed, {error_count} errors"
-                    )
-
-            except Exception as e:
-                error_count += 1
-                logger.error(
-                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
-                )
-                continue
-
-        logger.info(
-            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
-        )
-        return processed_count, error_count
-
-    def search_investors(self, query: str, limit: int = 10):
-        """Search investors using vector similarity"""
-        try:
-            results = self.collection.query(query_texts=[query], n_results=limit)
-
-            return results
-
-        except Exception as e:
-            logger.error(f"Search failed: {e}")
-            return None
-
-
-def main():
-    """Main function with command line interface"""
-    parser = argparse.ArgumentParser(description="LLM-powered Investor Parser")
-    parser.add_argument("--file", type=str, help="Path to CSV file to process")
-    parser.add_argument("--limit", type=int, help="Limit number of rows to process")
-    parser.add_argument(
-        "--use-llm",
-        action="store_true",
-        help="Enable LLM enhancement (requires OpenAI API key)",
-    )
-    parser.add_argument("--search", type=str, help="Search query for vector database")
-    parser.add_argument(
-        "--search-limit",
-        type=int,
-        default=10,
-        help="Number of search results to return",
-    )
-
-    args = parser.parse_args()
-
-    # Initialize parser
-    investor_parser = InvestorParser(use_llm=args.use_llm)
-
-    if args.search:
-        # Perform search
-        logger.info(f"Searching for: {args.search}")
-        results = investor_parser.search_investors(args.search, args.search_limit)
-
-        if results and results["documents"][0]:
-            print(f"\nFound {len(results['documents'][0])} similar investors:")
-            for i, (doc, metadata) in enumerate(
-                zip(results["documents"][0], results["metadatas"][0])
-            ):
-                print(f"{i + 1}. {metadata['name']}")
-                print(f"   Website: {metadata.get('website', 'N/A')}")
-                print(f"   HQ: {metadata.get('headquarters', 'N/A')}")
-                print(f"   Focus areas: {metadata.get('focus_areas_count', 0)}")
-                print(f"   Similarity score: {results['distances'][0][i]:.3f}")
-                print()
-        else:
-            print("No results found.")
-
-    elif args.file:
-        # Process CSV file
-        if not os.path.exists(args.file):
-            logger.error(f"File not found: {args.file}")
-            return
-
-        processed, errors = investor_parser.process_csv_file(args.file, args.limit)
-
-        print("\nProcessing complete!")
-        print(f"Successfully processed: {processed} investors")
-        print(f"Errors encountered: {errors}")
-
-        # Show some search examples
-        print("\nTrying some example searches...")
-        for query in ["bioeconomy", "venture capital", "sustainability"]:
-            results = investor_parser.search_investors(query, 3)
-            if results and results["documents"][0]:
-                print(f"\nTop matches for '{query}':")
-                for i, metadata in enumerate(results["metadatas"][0][:3]):
-                    print(f"  {i + 1}. {metadata['name']}")
-
-    else:
-        parser.print_help()
-
-
-if __name__ == "__main__":
-    main()
@@ -1,28 +1,368 @@
-import asyncio
-import csv
+import json
+import logging
+import os
+from typing import Any, Dict, Optional

-from openai import AsyncOpenAI
-from pydantic import BaseModel
+import chromadb
+import pandas as pd
+from dotenv import load_dotenv
+from openai import OpenAI
+
+from db import get_session, init_database
+from schema import CSVRow, Investor
+
+# Load environment variables
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)


-class RowSchema(BaseModel):
-    section: str
-    explanation: str
+class LLMInvestorParser:
+    def __init__(self):
+        # Initialize OpenAI client
+        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

-client = AsyncOpenAI()
+        # Initialize ChromaDB
+        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.chroma_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )

-async def process_row(row):
-    resp = await client.chat.completions.create(
-        model="gpt-4o-mini",
-        messages=[{"role": "user", "content": f"Extract relevant section:\n{row}"}],
-        response_format={"type": "json_object"}  # ensures JSON output
-    )
-    return RowSchema.model_validate_json(resp.choices[0].message.content)
+        # Initialize database
+        init_database()

-async def main():
-    with open("data.csv") as f:
-        reader = csv.DictReader(f)
-        tasks = [process_row(row) for row in reader]
-        return await asyncio.gather(*tasks)
+    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string with LLM assistance if needed"""
+        if not json_str or json_str.strip() == "":
+            return {}

-results = asyncio.run(main())
+        try:
+            # Try direct JSON parsing first
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            # If direct parsing fails, use LLM to clean and parse
+            logger.info("Direct JSON parsing failed, using LLM to clean JSON")
+            return self._llm_clean_json(json_str)
+
+    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
+        """Use LLM to clean and parse malformed JSON"""
+        try:
+            prompt = f"""
+            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
+            If it's not possible to create valid JSON, return an empty object {{}}.
+            
+            Original text:
+            {malformed_json[:2000]}  # Limit length for API
+            
+            Return only the cleaned JSON, no explanations:
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0,
+            )
+
+            cleaned_json = response.choices[0].message.content.strip()
+            return json.loads(cleaned_json)
+
+        except Exception as e:
+            logger.error(f"LLM JSON cleaning failed: {e}")
+            return {}
+
+    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
+        """Extract and structure data from CSV row using LLM"""
+        # Parse the investment firm profile
+        profile_data = {}
+        if csv_row.investment_firm_profile:
+            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
+
+        # Create structured output
+        structured_data = {
+            "name": csv_row.name,
+            "website": csv_row.website or profile_data.get("websiteURL"),
+            "investor_description": profile_data.get("investorDescription", ""),
+            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
+            "headquarters": profile_data.get("headquarters", ""),
+            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
+            "funds_info": profile_data.get("funds", []),
+            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
+            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
+            "linkedin_profile": csv_row.linkedin_investment_profile or "",
+            "source_truth_profile": csv_row.source_of_truth_profile or "",
+        }
+
+        return structured_data
+
+    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Use LLM to enhance and standardize investor data"""
+        try:
+            # Combine all available text for context
+            context_text = " ".join(
+                [
+                    investor_data.get("investor_description", ""),
+                    investor_data.get("crunchbase_extract", ""),
+                    investor_data.get("linkedin_profile", ""),
+                    investor_data.get("source_truth_profile", ""),
+                ]
+            )
+
+            if not context_text.strip():
+                return investor_data
+
+            prompt = f"""
+            Based on the following information about an investor, please extract and standardize:
+            1. A concise investor description (2-3 sentences)
+            2. Investment thesis focus areas (list of specific focus areas)
+            3. Headquarters location (city, country format)
+            
+            Investor: {investor_data["name"]}
+            Context: {context_text[:3000]}  # Limit for API
+            
+            Return in JSON format:
+            {{
+                "enhanced_description": "concise description here",
+                "standardized_focus": ["focus area 1", "focus area 2", ...],
+                "standardized_headquarters": "City, Country"
+            }}
+            """
+
+            response = self.openai_client.chat.completions.create(
+                model="gpt-3.5-turbo",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3,
+            )
+
+            enhanced_data = json.loads(response.choices[0].message.content)
+
+            # Update investor data with enhanced information
+            if enhanced_data.get("enhanced_description"):
+                investor_data["enhanced_description"] = enhanced_data[
+                    "enhanced_description"
+                ]
+
+            if enhanced_data.get("standardized_focus"):
+                investor_data["standardized_focus"] = enhanced_data[
+                    "standardized_focus"
+                ]
+
+            if enhanced_data.get("standardized_headquarters"):
+                investor_data["standardized_headquarters"] = enhanced_data[
+                    "standardized_headquarters"
+                ]
+
+            return investor_data
+
+        except Exception as e:
+            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
+            return investor_data
+
+    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
+        """Save investor data to SQL database"""
+        try:
+            with get_session() as session:
+                # Check if investor already exists
+                existing = (
+                    session.query(Investor)
+                    .filter_by(name=investor_data["name"])
+                    .first()
+                )
+
+                if existing:
+                    logger.info(f"Updating existing investor: {investor_data['name']}")
+                    investor = existing
+                else:
+                    logger.info(f"Creating new investor: {investor_data['name']}")
+                    investor = Investor()
+
+                # Map data to investor object
+                investor.name = investor_data["name"]
+                investor.website = investor_data.get("website")
+                investor.investor_description = investor_data.get(
+                    "enhanced_description"
+                ) or investor_data.get("investor_description")
+                investor.investment_thesis_focus = investor_data.get(
+                    "standardized_focus"
+                ) or investor_data.get("investment_thesis_focus")
+                investor.headquarters = investor_data.get(
+                    "standardized_headquarters"
+                ) or investor_data.get("headquarters")
+
+                # AUM information
+                aum_info = investor_data.get("aum_info", {})
+                investor.aum_amount = aum_info.get("aumAmount")
+                investor.aum_as_of_date = aum_info.get("asOfDate")
+                investor.aum_source_url = aum_info.get("sourceUrl")
+
+                # Fund information
+                investor.funds_info = investor_data.get("funds_info", [])
+
+                # Raw data
+                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
+                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
+                investor.linkedin_profile = investor_data.get("linkedin_profile")
+                investor.source_truth_profile = investor_data.get(
+                    "source_truth_profile"
+                )
+
+                if not existing:
+                    session.add(investor)
+
+                session.flush()  # Get the ID
+                return investor.id
+
+        except Exception as e:
+            logger.error(f"Failed to save to SQL: {e}")
+            raise
+
+    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
+        """Save investor description and focus to ChromaDB"""
+        try:
+            # Prepare text for embedding
+            description_text = investor_data.get(
+                "enhanced_description"
+            ) or investor_data.get("investor_description", "")
+            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
+                "investment_thesis_focus", []
+            )
+
+            if isinstance(focus_areas, list):
+                focus_text = " ".join(focus_areas)
+            else:
+                focus_text = str(focus_areas)
+
+            # Combine description and focus for embedding
+            combined_text = f"{description_text} {focus_text}".strip()
+
+            if not combined_text:
+                logger.warning(f"No text to embed for investor {investor_data['name']}")
+                return
+
+            # Create metadata
+            metadata = {
+                "investor_id": investor_id,
+                "name": investor_data["name"],
+                "website": investor_data.get("website", ""),
+                "headquarters": investor_data.get("standardized_headquarters")
+                or investor_data.get("headquarters", ""),
+                "focus_areas_count": len(focus_areas)
+                if isinstance(focus_areas, list)
+                else 0,
+            }
+
+            # Add to ChromaDB
+            self.collection.add(
+                documents=[combined_text],
+                metadatas=[metadata],
+                ids=[f"investor_{investor_id}"],
+            )
+
+            logger.info(f"Added investor {investor_data['name']} to vector database")
+
+        except Exception as e:
+            logger.error(f"Failed to save to vector DB: {e}")
+
+    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
+        """Process the entire CSV file"""
+        logger.info(f"Starting to process CSV file: {csv_file_path}")
+
+        # Read CSV
+        df = pd.read_csv(csv_file_path)
+        logger.info(f"Loaded {len(df)} rows from CSV")
+
+        if limit:
+            df = df.head(limit)
+            logger.info(f"Processing limited to {limit} rows")
+
+        processed_count = 0
+        error_count = 0
+
+        for index, row in df.iterrows():
+            try:
+                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
+
+                # Create CSVRow object
+                csv_row = CSVRow(
+                    name=row["Name"],
+                    website=row.get("Website"),
+                    investment_firm_profile=row.get("Investment Firm Profile"),
+                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
+                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
+                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
+                    source_of_truth_profile=row.get("Source of Truth Profile"),
+                )
+
+                # Extract structured data
+                structured_data = self.extract_structured_data(csv_row)
+
+                # Enhance with LLM
+                enhanced_data = self.enhance_with_llm(structured_data)
+
+                # Save to SQL database
+                investor_id = self.save_to_sql(enhanced_data)
+
+                # Save to vector database
+                self.save_to_vector_db(investor_id, enhanced_data)
+
+                processed_count += 1
+
+                # Progress update every 10 rows
+                if (index + 1) % 10 == 0:
+                    logger.info(
+                        f"Processed {processed_count} rows successfully, {error_count} errors"
+                    )
+
+            except Exception as e:
+                error_count += 1
+                logger.error(
+                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
+                )
+                continue
+
+        logger.info(
+            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
+        )
+        return processed_count, error_count
+
+    def search_investors(self, query: str, limit: int = 5):
+        """Search investors using vector similarity"""
+        try:
+            results = self.collection.query(query_texts=[query], n_results=limit)
+
+            return results
+
+        except Exception as e:
+            logger.error(f"Search failed: {e}")
+            return None
+
+
+def main():
+    """Main function to run the parser"""
+    parser = LLMInvestorParser()
+
+    # Process the CSV file
+    csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
+
+    # Start with a small sample for testing
+    processed, errors = parser.process_csv_file(csv_file, limit=5)
+
+    print("\nProcessing complete!")
+    print(f"Successfully processed: {processed} investors")
+    print(f"Errors encountered: {errors}")
+
+    # Test search functionality
+    print("\nTesting search functionality...")
+    results = parser.search_investors("bioeconomy circular economy")
+    if results:
+        print(f"Found {len(results['documents'][0])} similar investors")
+        for i, doc in enumerate(results["documents"][0]):
+            print(f"  {i + 1}. {results['metadatas'][0][i]['name']}")
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,178 @@
+import asyncio
+from typing import List, Optional
+
+import chromadb
+import pandas as pd
+from db.tables import InvestorTable
+from langchain_core.prompts import PromptTemplate
+from langchain_openai import ChatOpenAI
+from pydantic_schemas import Investor, InvestorList
+from settings import settings
+
+# Add these imports for your databases
+# from sqlalchemy.ext.asyncio import AsyncSession
+# from your_vector_db import VectorDBClient
+
+
+class InvestorProcessor:
+    def __init__(
+        self,
+        sql_session: Optional[object] = None,
+        vector_db_client: Optional[object] = None,
+    ):
+        self.template = """You are an expert data extraction assistant. Extract investor information from the provided CSV data and return it as a list of structured records.
+
+Given the following CSV data rows:
+{question}
+
+For each row, extract and structure the following fields:
+- name: The investor's full name
+- aum: Assets under management (as integer, use 0 if not available)
+- check_size: Investment check size (as string)
+- sector_focus: Sector focus (as string)
+- stage_focus: Investment stage focus (as string)
+- region: Geographic region (as string)
+- investment_thesis: Investment thesis (as string)
+- investor_description: Description of the investor (as string)
+
+Important: 
+- If a field is not available in the data, use appropriate default values (empty string for text fields, 0 for numbers)
+- Ensure all text fields are properly escaped and contain no control characters
+- Return clean, valid JSON only
+
+Return the data as a structured list of investors."""
+
+        self.prompt = PromptTemplate(
+            template=self.template, input_variables=["question"]
+        )
+
+        self.llm = ChatOpenAI(
+            api_key=settings.OPENROUTER_API_KEY,
+            base_url="https://openrouter.ai/api/v1",
+            model="openai/gpt-oss-120b:fre",
+            temperature=0,
+        )
+
+        self.structured_llm = self.llm.with_structured_output(InvestorList)
+        self.sql_session = sql_session
+        self.vector_db_client = vector_db_client
+
+        self.vector_db_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.vector_db_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )
+
+    async def _process_batch(self, batch: pd.DataFrame, batch_idx: int) -> List:
+        """Process a single batch of data"""
+        # Convert batch to string representation - clean the data
+        batch_str = ""
+        for idx, row in batch.iterrows():
+            # Clean values to remove control characters
+            cleaned_row = {}
+            for key, value in row.items():
+                if pd.notna(value):
+                    # Convert to string and clean control characters
+                    clean_value = (
+                        str(value)
+                        .replace("\n", " ")
+                        .replace("\r", " ")
+                        .replace("\t", " ")
+                    )
+                    # Remove other control characters
+                    clean_value = "".join(
+                        char
+                        for char in clean_value
+                        if ord(char) >= 32 or char in ["\n", "\r", "\t"]
+                    )
+                    cleaned_row[key] = clean_value
+
+            row_str = ", ".join(
+                [f"{key}: {value}" for key, value in cleaned_row.items()]
+            )
+            batch_str += f"Row {idx + 1}: {row_str}\n"
+
+        try:
+            print(f"Processing batch {batch_idx + 1}...")
+            batch_results = await self.structured_llm.ainvoke(batch_str)
+            return batch_results.investor_list
+        except Exception as e:
+            print(f"Error processing batch {batch_idx + 1}: {e}")
+            return []
+
+    async def _save_to_sql(self, investors: List[Investor]) -> None:
+        """Save investors to SQL database"""
+        if not self.sql_session:
+            return
+
+        # Implement SQL saving logic here
+        for investor in investors:
+            db_investor = InvestorTable(
+                name=investor.name,
+                aum=investor.aum,
+                check_size=investor.check_size,
+                sector_focus=investor.sector_focus,
+                stage_focus=investor.stage_focus,
+                region=investor.region,
+            )
+            self.sql_session.add(db_investor)
+        self.sql_session.commit()
+
+    async def _save_to_vector_db(self, investors: List[Investor]) -> None:
+        """Save investors to vector database"""
+        if not self.vector_db_client:
+            return
+
+        documents = []
+        metadatas = []
+        ids = []
+
+        for i, investor in enumerate(investors):
+            doc_text = f"{investor.investor_description}\nInvestment Thesis: {investor.investment_thesis}"
+            documents.append(doc_text)
+            metadatas.append({"name": investor.name})
+            ids.append(f"investor_{i}_{investor.name.replace(' ', '_')}")
+
+        if documents:
+            # Use add method with proper parameters
+            self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
+
+    async def process_csv(
+        self, df: pd.DataFrame, batch_size: int = 10, max_concurrent: int = 10
+    ) -> List:
+        """Process CSV data in parallel batches and save to databases"""
+        results = []
+
+        # Create batches
+        batches = []
+        for i in range(0, len(df), batch_size):
+            batch = df.iloc[i : i + batch_size]
+            batches.append((batch, i // batch_size))
+
+        # Process batches with concurrency control
+        semaphore = asyncio.Semaphore(max_concurrent)
+
+        async def process_with_semaphore(batch_data):
+            batch, batch_idx = batch_data
+            async with semaphore:
+                return await self._process_batch(batch, batch_idx)
+
+        # Execute all batches concurrently
+        batch_results = await asyncio.gather(
+            *[process_with_semaphore(batch_data) for batch_data in batches],
+            return_exceptions=True,
+        )
+
+        # Collect results, filtering out exceptions
+        for batch_result in batch_results:
+            if not isinstance(batch_result, Exception):
+                results.extend(batch_result)
+
+        # Save to databases
+        if results:
+            await self._save_to_sql(results)
+            await self._save_to_vector_db(results)
+
+        return results
@@ -0,0 +1,61 @@
+from typing import Optional
+
+import chromadb
+from langchain_openai import ChatOpenAI
+from pydantic_schemas import Investor, InvestorList
+from settings import settings
+
+# Add these imports for your databases
+# from sqlalchemy.ext.asyncio import AsyncSession
+# from your_vector_db import VectorDBClient
+
+
+class QueryProcessor:
+    def __init__(
+        self,
+        sql_session: Optional[object] = None,
+        vector_db_client: Optional[object] = None,
+    ):
+        self.llm = ChatOpenAI(
+            api_key=settings.OPENROUTER_API_KEY,
+            base_url="https://openrouter.ai/api/v1",
+            model="openai/gpt-oss-120b:free",
+            temperature=0,
+        )
+
+        self.structured_llm = self.llm.with_structured_output(InvestorList)
+        self.sql_session = sql_session
+        self.vector_db_client = vector_db_client
+
+        self.vector_db_client = chromadb.PersistentClient(path="./chroma_db")
+        self.collection = self.vector_db_client.get_or_create_collection(
+            name="investor_descriptions",
+            metadata={
+                "description": "Investor descriptions and investment thesis focus"
+            },
+        )
+
+    def query_sql_database(self, query: str) -> Optional[InvestorList]:
+        """Query the SQL database for investor information."""
+        if not self.sql_session:
+            return None
+
+        # Implement SQL querying logic here
+        result = self.sql_session.execute(query)
+        investors = result.scalars().all()
+        return InvestorList(investors=investors)
+
+    def query_vector_database(self, query: str) -> Optional[InvestorList]:
+        """Query the vector database for investor information."""
+        if not self.vector_db_client:
+            return None
+
+        # Implement vector database querying logic here
+        results = self.vector_db_client.query(collection=self.collection, query=query)
+        investors = [Investor(**doc.metadata) for doc in results.documents]
+        return InvestorList(investors=investors)
+
+    def process_query(self, question: str) -> InvestorList:
+        """Process a query using the LLM and return structured investor data."""
+        response = self.structured_llm.predict(question=question)
+        return response
@@ -1,10 +1,11 @@
 from pydantic_settings import BaseSettings

+
 class Settings(BaseSettings):
-    api_key: str
-    db_url: str
+    OPENROUTER_API_KEY: str

    class Config:
        env_file = ".env"

-settings = Settings() 
+
+settings = Settings()