made version 2

2025-09-25 17:00:38 +01:00
parent b1b1c5ea1e
commit 0f7beca5e1
42 changed files with 660 additions and 2036 deletions
@@ -1,368 +1,337 @@
-import json
-import logging
+import asyncio
 import os
-from typing import Any, Dict, Optional
+from typing import Optional

-import chromadb
 import pandas as pd
-from dotenv import load_dotenv
-from openai import OpenAI
-
-from db import get_session, init_database
-from py_schemas import CSVRow, Investor
-
-# Load environment variables
-load_dotenv()
-
-# Configure logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from db.db import get_db_session
+from db.models import (
+    CompanyMember,
+    CompanyTable,
+    InvestorMember,
+    InvestorTable,
+    SectorTable,
+)
+from langchain_openai import ChatOpenAI
+from schemas.py_schemas import CompanyData, InvestorData
+from sqlalchemy.orm import Session


-class LLMInvestorParser:
+class InvestorProcessor:
    def __init__(self):
-        # Initialize OpenAI client
-        self.openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
-        # Initialize ChromaDB
-        self.chroma_client = chromadb.PersistentClient(path="./chroma_db")
-        self.collection = self.chroma_client.get_or_create_collection(
-            name="investor_descriptions",
-            metadata={
-                "description": "Investor descriptions and investment thesis focus"
-            },
+        self.llm = ChatOpenAI(
+            api_key=os.getenv("OPENROUTER_API_KEY"),
+            base_url="https://openrouter.ai/api/v1",
+            model="openai/gpt-5-nano",
+            temperature=0,
        )

-        # Initialize database
-        init_database()
+        self.investor_structured_llm = self.llm.with_structured_output(InvestorData)
+        self.company_structured_llm = self.llm.with_structured_output(CompanyData)

-    def parse_json_field(self, json_str: str) -> Dict[str, Any]:
-        """Safely parse JSON string with LLM assistance if needed"""
-        if not json_str or json_str.strip() == "":
-            return {}
+    def _get_or_create_sector(self, db: Session, sector_name: str) -> SectorTable:
+        """Get existing sector or create new one"""
+        sector = db.query(SectorTable).filter(SectorTable.name == sector_name).first()
+        if not sector:
+            sector = SectorTable(name=sector_name)
+            db.add(sector)
+            db.flush()  # Get the ID without committing
+        return sector

-        try:
-            # Try direct JSON parsing first
-            return json.loads(json_str)
-        except json.JSONDecodeError:
-            # If direct parsing fails, use LLM to clean and parse
-            logger.info("Direct JSON parsing failed, using LLM to clean JSON")
-            return self._llm_clean_json(json_str)
+    def _save_investor_to_db(
+        self, db: Session, investor_data: InvestorData
+    ) -> InvestorTable:
+        """Save investor data to database"""
+        # Create investor record
+        investor = InvestorTable(
+            name=investor_data.investor.name,
+            description=investor_data.investor.description,
+            aum=investor_data.investor.aum,
+            check_size_lower=investor_data.investor.check_size_lower,
+            check_size_upper=investor_data.investor.check_size_upper,
+            geographic_focus=investor_data.investor.geographic_focus,
+            stage_focus=investor_data.investor.stage_focus,
+            number_of_investments=investor_data.investor.number_of_investments,
+        )
+        db.add(investor)
+        db.flush()  # Get the ID

-    def _llm_clean_json(self, malformed_json: str) -> Dict[str, Any]:
-        """Use LLM to clean and parse malformed JSON"""
-        try:
-            prompt = f"""
-            The following text appears to be malformed JSON. Please clean it up and return valid JSON.
-            If it's not possible to create valid JSON, return an empty object {{}}.
-            
-            Original text:
-            {malformed_json[:2000]}  # Limit length for API
-            
-            Return only the cleaned JSON, no explanations:
-            """
-
-            response = self.openai_client.chat.completions.create(
-                model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0,
+        # Add team members
+        for member_data in investor_data.team_members:
+            member = InvestorMember(
+                name=member_data.name,
+                role=member_data.role,
+                email=member_data.email,
+                investor_id=investor.id,
            )
+            db.add(member)

-            cleaned_json = response.choices[0].message.content.strip()
-            return json.loads(cleaned_json)
+        # Add sectors
+        for sector_data in investor_data.sectors:
+            sector = self._get_or_create_sector(db, sector_data.name)
+            investor.sectors.append(sector)

-        except Exception as e:
-            logger.error(f"LLM JSON cleaning failed: {e}")
-            return {}
-
-    def extract_structured_data(self, csv_row: CSVRow) -> Dict[str, Any]:
-        """Extract and structure data from CSV row using LLM"""
-        # Parse the investment firm profile
-        profile_data = {}
-        if csv_row.investment_firm_profile:
-            profile_data = self.parse_json_field(csv_row.investment_firm_profile)
-
-        # Create structured output
-        structured_data = {
-            "name": csv_row.name,
-            "website": csv_row.website or profile_data.get("websiteURL"),
-            "investor_description": profile_data.get("investorDescription", ""),
-            "investment_thesis_focus": profile_data.get("investmentThesisFocus", []),
-            "headquarters": profile_data.get("headquarters", ""),
-            "aum_info": profile_data.get("overallAssetsUnderManagement", {}),
-            "funds_info": profile_data.get("funds", []),
-            "crunchbase_urls": csv_row.crunchbase_linkedin_urls or "",
-            "crunchbase_extract": csv_row.crunchbase_firm_extract or "",
-            "linkedin_profile": csv_row.linkedin_investment_profile or "",
-            "source_truth_profile": csv_row.source_of_truth_profile or "",
-        }
-
-        return structured_data
-
-    def enhance_with_llm(self, investor_data: Dict[str, Any]) -> Dict[str, Any]:
-        """Use LLM to enhance and standardize investor data"""
-        try:
-            # Combine all available text for context
-            context_text = " ".join(
-                [
-                    investor_data.get("investor_description", ""),
-                    investor_data.get("crunchbase_extract", ""),
-                    investor_data.get("linkedin_profile", ""),
-                    investor_data.get("source_truth_profile", ""),
-                ]
+        # Add portfolio companies
+        for company_schema in investor_data.portfolio_companies:
+            # Convert CompanySchema to CompanyData format
+            company_data = CompanyData(
+                company=company_schema,
+                sectors=[],  # Will be empty for portfolio companies
+                members=[],  # Will be empty for portfolio companies
+                investors=[],  # Will be empty for portfolio companies
            )
+            company = self._save_company_to_db(db, company_data, skip_investors=True)
+            investor.portfolio_companies.append(company)

-            if not context_text.strip():
-                return investor_data
+        return investor

-            prompt = f"""
-            Based on the following information about an investor, please extract and standardize:
-            1. A concise investor description (2-3 sentences)
-            2. Investment thesis focus areas (list of specific focus areas)
-            3. Headquarters location (city, country format)
-            
-            Investor: {investor_data["name"]}
-            Context: {context_text[:3000]}  # Limit for API
-            
-            Return in JSON format:
-            {{
-                "enhanced_description": "concise description here",
-                "standardized_focus": ["focus area 1", "focus area 2", ...],
-                "standardized_headquarters": "City, Country"
-            }}
-            """
+    def _save_company_to_db(
+        self, db: Session, company_data: CompanyData, skip_investors: bool = False
+    ) -> CompanyTable:
+        """Save company data to database"""
+        # Check if company already exists
+        existing_company = (
+            db.query(CompanyTable)
+            .filter(CompanyTable.name == company_data.company.name)
+            .first()
+        )
+        if existing_company:
+            return existing_company

-            response = self.openai_client.chat.completions.create(
-                model="gpt-3.5-turbo",
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.3,
-            )
+        # Create company record
+        company = CompanyTable(
+            name=company_data.company.name,
+            industry=company_data.company.industry,
+            location=company_data.company.location,
+            description=company_data.company.description,
+            founded_year=company_data.company.founded_year,
+            website=company_data.company.website,
+        )
+        db.add(company)
+        db.flush()  # Get the ID

-            enhanced_data = json.loads(response.choices[0].message.content)
+        # Add company members
+        for member_data in company_data.members:
+            if member_data.name:  # Only add members with names
+                member = CompanyMember(
+                    name=member_data.name,
+                    linkedin=member_data.linkedin,
+                    role=member_data.role,
+                    company_id=company.id,
+                )
+                db.add(member)

-            # Update investor data with enhanced information
-            if enhanced_data.get("enhanced_description"):
-                investor_data["enhanced_description"] = enhanced_data[
-                    "enhanced_description"
-                ]
+        # Add sectors
+        for sector_data in company_data.sectors:
+            sector = self._get_or_create_sector(db, sector_data.name)
+            company.sectors.append(sector)

-            if enhanced_data.get("standardized_focus"):
-                investor_data["standardized_focus"] = enhanced_data[
-                    "standardized_focus"
-                ]
-
-            if enhanced_data.get("standardized_headquarters"):
-                investor_data["standardized_headquarters"] = enhanced_data[
-                    "standardized_headquarters"
-                ]
-
-            return investor_data
-
-        except Exception as e:
-            logger.error(f"LLM enhancement failed for {investor_data['name']}: {e}")
-            return investor_data
-
-    def save_to_sql(self, investor_data: Dict[str, Any]) -> int:
-        """Save investor data to SQL database"""
-        try:
-            with get_session() as session:
-                # Check if investor already exists
-                existing = (
-                    session.query(Investor)
-                    .filter_by(name=investor_data["name"])
+        # Add investors (if not skipping to avoid circular references)
+        if not skip_investors:
+            for investor_data in company_data.investors:
+                # Look for existing investor by name
+                existing_investor = (
+                    db.query(InvestorTable)
+                    .filter(InvestorTable.name == investor_data.name)
                    .first()
                )
+                if existing_investor:
+                    company.investors.append(existing_investor)

-                if existing:
-                    logger.info(f"Updating existing investor: {investor_data['name']}")
-                    investor = existing
-                else:
-                    logger.info(f"Creating new investor: {investor_data['name']}")
-                    investor = Investor()
+        return company

-                # Map data to investor object
-                investor.name = investor_data["name"]
-                investor.website = investor_data.get("website")
-                investor.investor_description = investor_data.get(
-                    "enhanced_description"
-                ) or investor_data.get("investor_description")
-                investor.investment_thesis_focus = investor_data.get(
-                    "standardized_focus"
-                ) or investor_data.get("investment_thesis_focus")
-                investor.headquarters = investor_data.get(
-                    "standardized_headquarters"
-                ) or investor_data.get("headquarters")
-
-                # AUM information
-                aum_info = investor_data.get("aum_info", {})
-                investor.aum_amount = aum_info.get("aumAmount")
-                investor.aum_as_of_date = aum_info.get("asOfDate")
-                investor.aum_source_url = aum_info.get("sourceUrl")
-
-                # Fund information
-                investor.funds_info = investor_data.get("funds_info", [])
-
-                # Raw data
-                investor.crunchbase_urls = investor_data.get("crunchbase_urls")
-                investor.crunchbase_extract = investor_data.get("crunchbase_extract")
-                investor.linkedin_profile = investor_data.get("linkedin_profile")
-                investor.source_truth_profile = investor_data.get(
-                    "source_truth_profile"
+    async def _process_row(
+        self, row: pd.Series, row_idx: int, is_investor: bool = True
+    ) -> Optional[InvestorData | CompanyData]:
+        """Process a single row of data"""
+        # Clean values to remove control characters
+        cleaned_row = {}
+        for key, value in row.items():
+            if pd.notna(value):
+                # Convert to string and clean control characters
+                clean_value = (
+                    str(value).replace("\n", " ").replace("\r", " ").replace("\t", " ")
                )
+                # Remove other control characters
+                clean_value = "".join(
+                    char
+                    for char in clean_value
+                    if ord(char) >= 32 or char in ["\n", "\r", "\t"]
+                )
+                cleaned_row[key] = clean_value

-                if not existing:
-                    session.add(investor)
-
-                session.flush()  # Get the ID
-                return investor.id
-
-        except Exception as e:
-            logger.error(f"Failed to save to SQL: {e}")
-            raise
-
-    def save_to_vector_db(self, investor_id: int, investor_data: Dict[str, Any]):
-        """Save investor description and focus to ChromaDB"""
+        row_str = ", ".join([f"{key}: {value}" for key, value in cleaned_row.items()])
        try:
-            # Prepare text for embedding
-            description_text = investor_data.get(
-                "enhanced_description"
-            ) or investor_data.get("investor_description", "")
-            focus_areas = investor_data.get("standardized_focus") or investor_data.get(
-                "investment_thesis_focus", []
-            )
-
-            if isinstance(focus_areas, list):
-                focus_text = " ".join(focus_areas)
+            print(f"Processing row {row_idx + 1}...")
+            if is_investor:
+                result = await self.investor_structured_llm.ainvoke(row_str)
            else:
-                focus_text = str(focus_areas)
-
-            # Combine description and focus for embedding
-            combined_text = f"{description_text} {focus_text}".strip()
-
-            if not combined_text:
-                logger.warning(f"No text to embed for investor {investor_data['name']}")
-                return
-
-            # Create metadata
-            metadata = {
-                "investor_id": investor_id,
-                "name": investor_data["name"],
-                "website": investor_data.get("website", ""),
-                "headquarters": investor_data.get("standardized_headquarters")
-                or investor_data.get("headquarters", ""),
-                "focus_areas_count": len(focus_areas)
-                if isinstance(focus_areas, list)
-                else 0,
-            }
-
-            # Add to ChromaDB
-            self.collection.add(
-                documents=[combined_text],
-                metadatas=[metadata],
-                ids=[f"investor_{investor_id}"],
-            )
-
-            logger.info(f"Added investor {investor_data['name']} to vector database")
-
+                result = await self.company_structured_llm.ainvoke(row_str)
+            if result:
+                return result.model_dump()
+            return None
        except Exception as e:
-            logger.error(f"Failed to save to vector DB: {e}")
-
-    def process_csv_file(self, csv_file_path: str, limit: Optional[int] = None):
-        """Process the entire CSV file"""
-        logger.info(f"Starting to process CSV file: {csv_file_path}")
-
-        # Read CSV
-        df = pd.read_csv(csv_file_path)
-        logger.info(f"Loaded {len(df)} rows from CSV")
-
-        if limit:
-            df = df.head(limit)
-            logger.info(f"Processing limited to {limit} rows")
-
-        processed_count = 0
-        error_count = 0
-
-        for index, row in df.iterrows():
-            try:
-                logger.info(f"Processing row {index + 1}/{len(df)}: {row['Name']}")
-
-                # Create CSVRow object
-                csv_row = CSVRow(
-                    name=row["Name"],
-                    website=row.get("Website"),
-                    investment_firm_profile=row.get("Investment Firm Profile"),
-                    crunchbase_linkedin_urls=row.get("Crunchbase & LinkedIn URLs"),
-                    crunchbase_firm_extract=row.get("Crunchbase Firm Extract"),
-                    linkedin_investment_profile=row.get("LinkedIn Investment Profile"),
-                    source_of_truth_profile=row.get("Source of Truth Profile"),
-                )
-
-                # Extract structured data
-                structured_data = self.extract_structured_data(csv_row)
-
-                # Enhance with LLM
-                enhanced_data = self.enhance_with_llm(structured_data)
-
-                # Save to SQL database
-                investor_id = self.save_to_sql(enhanced_data)
-
-                # Save to vector database
-                self.save_to_vector_db(investor_id, enhanced_data)
-
-                processed_count += 1
-
-                # Progress update every 10 rows
-                if (index + 1) % 10 == 0:
-                    logger.info(
-                        f"Processed {processed_count} rows successfully, {error_count} errors"
-                    )
-
-            except Exception as e:
-                error_count += 1
-                logger.error(
-                    f"Error processing row {index + 1} ({row.get('Name', 'Unknown')}): {e}"
-                )
-                continue
-
-        logger.info(
-            f"Processing complete! Processed: {processed_count}, Errors: {error_count}"
-        )
-        return processed_count, error_count
-
-    def search_investors(self, query: str, limit: int = 5):
-        """Search investors using vector similarity"""
-        try:
-            results = self.collection.query(query_texts=[query], n_results=limit)
-
-            return results
-
-        except Exception as e:
-            logger.error(f"Search failed: {e}")
+            print(f"Error processing row {row_idx + 1}: {e}")
            return None

+    async def parse_investors(self, df, save_to_db: bool = True):
+        """Parse investors from DataFrame and optionally save to database"""
+        investors = []

-def main():
-    """Main function to run the parser"""
-    parser = LLMInvestorParser()
+        db = None
+        if save_to_db:
+            db = get_db_session()

-    # Process the CSV file
-    csv_file = "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/New Excerpt 5 investors - Sheet1 parse.csv"
+        try:
+            # Process rows in batches asynchronously
+            batch_size = 15  # Adjust batch size as needed
+            rows = [(idx, row) for idx, row in df.iterrows()]

-    # Start with a small sample for testing
-    processed, errors = parser.process_csv_file(csv_file, limit=5)
+            for i in range(0, len(rows), batch_size):
+                batch = rows[i : i + batch_size]

-    print("\nProcessing complete!")
-    print(f"Successfully processed: {processed} investors")
-    print(f"Errors encountered: {errors}")
+                # Process batch asynchronously
+                tasks = [
+                    self._process_row(row, idx, is_investor=True) for idx, row in batch
+                ]

-    # Test search functionality
-    print("\nTesting search functionality...")
-    results = parser.search_investors("bioeconomy circular economy")
-    if results:
-        print(f"Found {len(results['documents'][0])} similar investors")
-        for i, doc in enumerate(results["documents"][0]):
-            print(f"  {i + 1}. {results['metadatas'][0][i]['name']}")
+                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Handle results from batch
+                for (idx, row), result in zip(batch, batch_results):
+                    if isinstance(result, Exception):
+                        print(f"Error processing row {idx}: {result}")
+                        if db:
+                            db.rollback()
+                        continue
+
+                    if result:
+                        # Convert dict to InvestorData if needed
+                        if isinstance(result, dict):
+                            investor_data = InvestorData(**result)
+                        else:
+                            investor_data = result
+
+                        investors.append(investor_data)
+
+                        # Save to database if requested
+                        if save_to_db and db:
+                            try:
+                                saved_investor = self._save_investor_to_db(
+                                    db, investor_data
+                                )
+                                db.commit()
+                                print(
+                                    f"✅ Saved investor '{saved_investor.name}' to database"
+                                )
+                            except Exception as e:
+                                db.rollback()
+                                print(f"❌ Failed to save investor to database: {e}")
+
+                print(
+                    f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
+                )
+
+        except Exception as e:
+            print(f"Error in batch processing: {e}")
+            if db:
+                db.rollback()
+        finally:
+            if db:
+                db.close()
+
+        return investors
+
+    async def parse_companies(self, df, save_to_db: bool = True):
+        """Parse companies from DataFrame and optionally save to database"""
+        companies = []
+
+        db = None
+        if save_to_db:
+            db = get_db_session()
+
+        try:
+            # Process rows in batches asynchronously
+            batch_size = 15  # Adjust batch size as needed
+            rows = [(idx, row) for idx, row in df.iterrows()]
+
+            for i in range(0, len(rows), batch_size):
+                batch = rows[i : i + batch_size]
+
+                # Process batch asynchronously
+                tasks = [
+                    self._process_row(row, idx, is_investor=False) for idx, row in batch
+                ]
+
+                batch_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Handle results from batch
+                for (idx, row), result in zip(batch, batch_results):
+                    if isinstance(result, Exception):
+                        print(f"Error processing row {idx}: {result}")
+                        if db:
+                            db.rollback()
+                        continue
+
+                    if result:
+                        # Convert dict to CompanyData if needed
+                        if isinstance(result, dict):
+                            company_data = CompanyData(**result)
+                        else:
+                            company_data = result
+
+                        companies.append(company_data)
+
+                        # Save to database if requested
+                        if save_to_db and db:
+                            try:
+                                saved_company = self._save_company_to_db(
+                                    db, company_data
+                                )
+                                db.commit()
+                                print(
+                                    f"✅ Saved company '{saved_company.name}' to database"
+                                )
+                            except Exception as e:
+                                db.rollback()
+                                print(f"❌ Failed to save company to database: {e}")
+
+                    print(
+                        f"Completed batch {i // batch_size + 1} of {(len(rows) + batch_size - 1) // batch_size}"
+                    )
+
+        except Exception as e:
+            print(f"Error processing row {idx}: {e}")
+            if db:
+                db.rollback()
+        finally:
+            if db:
+                db.close()
+
+        return companies


-if __name__ == "__main__":
-    main()
+# async def main():
+#     """Main execution function"""
+#     # Initialize database tables
+#     print("🔧 Initializing database...")
+#     init_database()
+
+#     # Create processor
+#     processor = InvestorProcessor()
+
+#     print("📊 Processing companies...")
+#     companies = await processor.parse_companies(
+#         "data/19 Companies data.csv", save_to_db=True
+#     )
+#     print(f"Processed {len(companies)} companies")
+
+#     print("\n💰 Processing investors...")
+#     investors = await processor.parse_investors(
+#         "data/19 Investors data.csv", save_to_db=True
+#     )
+#     print(f"Processed {len(investors)} investors")
+#     print("\n✨ Processing complete!")
+
+
+# if __name__ == "__main__":
+#     asyncio.run(main())
@@ -1,293 +0,0 @@
-import asyncio
-from typing import List, Optional
-
-import chromadb
-import pandas as pd
-from db.models import CompanyTable, InvestorTable, InvestorTeamMember, SectorTable
-from langchain_core.prompts import PromptTemplate
-from langchain_openai import ChatOpenAI
-from py_schemas import InvestorData
-from pydantic import BaseModel
-from settings import settings
-
-
-class InvestorList(BaseModel):
-    """Schema for LLM structured output"""
-
-    investor_list: List[InvestorData]
-
-
-class InvestorProcessor:
-    def __init__(
-        self,
-        sql_session: Optional[object] = None,
-        vector_db_client: Optional[object] = None,
-    ):
-        self.template = """You are an expert data extraction assistant. Extract investor information from the provided CSV data and return it as a list of structured records.
-
-Given the following CSV data rows:
-{question}
-
-For each row, extract and structure the following fields for the investor:
- name: The investor's full name
- description: Description of the investor
- aum: Assets under management (as integer, use 0 if not available)
- check_size_lower: Lower bound of investment check size (as integer)
- check_size_upper: Upper bound of investment check size (as integer)
- geographic_focus: Geographic region focus
- stage_focus: Investment stage focus (must be one of: seed, series_a, series_b, series_c, growth, late_stage)
- number_of_investments: Number of investments made (default 0)
-
-Also extract related data:
- portfolio_companies: List of companies they've invested in
- team_members: List of team members with name, role, email
- sectors: List of sectors they focus on
-
-Important: 
- If a field is not available, use appropriate defaults
- stage_focus must be one of the valid enum values
- Return clean, valid JSON only
-
-Return the data as a structured list of comprehensive investor data."""
-
-        self.prompt = PromptTemplate(
-            template=self.template, input_variables=["question"]
-        )
-
-        self.llm = ChatOpenAI(
-            api_key=settings.OPENROUTER_API_KEY,
-            base_url="https://openrouter.ai/api/v1",
-            model="google/gemini-2.5-flash-lite",
-            temperature=0,
-        )
-
-        self.structured_llm = self.llm.with_structured_output(InvestorList)
-        self.sql_session = sql_session
-        self.vector_db_client = vector_db_client
-
-        self.vector_db_client = chromadb.PersistentClient(path="./chroma_db")
-        self.collection = self.vector_db_client.get_or_create_collection(
-            name="investor_descriptions",
-            metadata={
-                "description": "Investor descriptions and investment thesis focus"
-            },
-        )
-
-    async def _process_batch(
-        self, batch: pd.DataFrame, batch_idx: int
-    ) -> List[InvestorData]:
-        """Process a single batch of data"""
-        # Convert batch to string representation - clean the data
-        batch_str = ""
-        for idx, row in batch.iterrows():
-            # Clean values to remove control characters
-            cleaned_row = {}
-            for key, value in row.items():
-                if pd.notna(value):
-                    # Convert to string and clean control characters
-                    clean_value = (
-                        str(value)
-                        .replace("\n", " ")
-                        .replace("\r", " ")
-                        .replace("\t", " ")
-                    )
-                    # Remove other control characters
-                    clean_value = "".join(
-                        char
-                        for char in clean_value
-                        if ord(char) >= 32 or char in ["\n", "\r", "\t"]
-                    )
-                    cleaned_row[key] = clean_value
-
-            row_str = ", ".join(
-                [f"{key}: {value}" for key, value in cleaned_row.items()]
-            )
-            batch_str += f"Row {idx + 1}: {row_str}\n"
-
-        try:
-            print(f"Processing batch {batch_idx + 1}...")
-            batch_results = await self.structured_llm.ainvoke(batch_str)
-            return batch_results.investor_list
-        except Exception as e:
-            print(f"Error processing batch {batch_idx + 1}: {e}")
-            return []
-
-    async def _save_to_sql(self, investor_data_list: List[InvestorData]) -> None:
-        """Save investors and related data to SQL database"""
-        if not self.sql_session:
-            return
-
-        try:
-            for investor_data in investor_data_list:
-                # Save investor
-                db_investor = InvestorTable(
-                    name=investor_data.investor.name,
-                    description=investor_data.investor.description,
-                    aum=investor_data.investor.aum,
-                    check_size_lower=investor_data.investor.check_size_lower,
-                    check_size_upper=investor_data.investor.check_size_upper,
-                    geographic_focus=investor_data.investor.geographic_focus,
-                    stage_focus=investor_data.investor.stage_focus,
-                    number_of_investments=investor_data.investor.number_of_investments,
-                )
-                self.sql_session.add(db_investor)
-                self.sql_session.flush()  # Get the ID
-
-                # Save sectors and create associations
-                for sector_data in investor_data.sectors:
-                    # Check if sector exists, create if not
-                    existing_sector = (
-                        self.sql_session.query(SectorTable)
-                        .filter(SectorTable.name == sector_data.name)
-                        .first()
-                    )
-
-                    if not existing_sector:
-                        db_sector = SectorTable(name=sector_data.name)
-                        self.sql_session.add(db_sector)
-                        self.sql_session.flush()
-                        # Add sector to investor's sectors
-                        db_investor.sectors.append(db_sector)
-                    else:
-                        # Add existing sector to investor if not already there
-                        if existing_sector not in db_investor.sectors:
-                            db_investor.sectors.append(existing_sector)
-
-                # Save companies and create portfolio associations
-                for company_data in investor_data.portfolio_companies:
-                    # Check if company exists, create if not
-                    existing_company = (
-                        self.sql_session.query(CompanyTable)
-                        .filter(CompanyTable.name == company_data.name)
-                        .first()
-                    )
-
-                    if not existing_company:
-                        db_company = CompanyTable(
-                            name=company_data.name,
-                            industry=company_data.industry,
-                            location=company_data.location,
-                            founded_year=company_data.founded_year,
-                            website=company_data.website,
-                        )
-                        self.sql_session.add(db_company)
-                        self.sql_session.flush()
-
-                        # Add to investor's portfolio
-                        db_investor.portfolio_companies.append(db_company)
-                    else:
-                        # Add existing company to portfolio if not already there
-                        if existing_company not in db_investor.portfolio_companies:
-                            db_investor.portfolio_companies.append(existing_company)
-
-                # Save team members
-                for team_member_data in investor_data.team_members:
-                    # Check if team member exists
-                    existing_member = (
-                        self.sql_session.query(InvestorTeamMember)
-                        .filter(InvestorTeamMember.email == team_member_data.email)
-                        .first()
-                    )
-
-                    if not existing_member:
-                        db_team_member = InvestorTeamMember(
-                            name=team_member_data.name,
-                            role=team_member_data.role,
-                            email=team_member_data.email,
-                            investor_id=db_investor.id,
-                        )
-                        self.sql_session.add(db_team_member)
-
-            self.sql_session.commit()
-            print(f"Successfully saved {len(investor_data_list)} investors to database")
-
-        except Exception as e:
-            self.sql_session.rollback()
-            print(f"Error saving to SQL database: {e}")
-            raise
-
-    async def _save_to_vector_db(self, investor_data_list: List[InvestorData]) -> None:
-        """Save investors to vector database"""
-        if not self.vector_db_client:
-            return
-
-        documents = []
-        metadatas = []
-        ids = []
-
-        for i, investor_data in enumerate(investor_data_list):
-            investor = investor_data.investor
-            sectors = ", ".join([s.name for s in investor_data.sectors])
-            companies = ", ".join([c.name for c in investor_data.portfolio_companies])
-
-            doc_text = f"""
-            Investor: {investor.name}
-            Description: {investor.description or "N/A"}
-            AUM: ${investor.aum:,}
-            Check Size: ${investor.check_size_lower:,} - ${investor.check_size_upper:,}
-            Geographic Focus: {investor.geographic_focus}
-            Stage Focus: {investor.stage_focus.value}
-            Sectors: {sectors}
-            Portfolio Companies: {companies}
-            """.strip()
-
-            documents.append(doc_text)
-            metadatas.append(
-                {
-                    "name": investor.name,
-                    "stage_focus": investor.stage_focus.value,
-                    "geographic_focus": investor.geographic_focus,
-                    "aum": investor.aum,
-                }
-            )
-            ids.append(
-                f"investor_{i}_{investor.name.replace(' ', '_').replace('/', '_')}"
-            )
-
-        if documents:
-            try:
-                self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
-                print(
-                    f"Successfully saved {len(documents)} investors to vector database"
-                )
-            except Exception as e:
-                print(f"Error saving to vector database: {e}")
-
-    async def process_csv(
-        self, df: pd.DataFrame, batch_size: int = 10, max_concurrent: int = 10
-    ) -> List[InvestorData]:
-        """Process CSV data in parallel batches and save to databases"""
-        results = []
-
-        # Create batches
-        batches = []
-        for i in range(0, len(df), batch_size):
-            batch = df.iloc[i : i + batch_size]
-            batches.append((batch, i // batch_size))
-
-        # Process batches with concurrency control
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def process_with_semaphore(batch_data):
-            batch, batch_idx = batch_data
-            async with semaphore:
-                return await self._process_batch(batch, batch_idx)
-
-        # Execute all batches concurrently
-        batch_results = await asyncio.gather(
-            *[process_with_semaphore(batch_data) for batch_data in batches],
-            return_exceptions=True,
-        )
-
-        # Collect results, filtering out exceptions
-        for batch_result in batch_results:
-            if not isinstance(batch_result, Exception):
-                results.extend(batch_result)
-
-        # Save to databases
-        if results:
-            print(f"Successfully processed {len(results)} investors")
-            await self._save_to_sql(results)
-            await self._save_to_vector_db(results)
-
-        return results
@@ -1,290 +0,0 @@
-import asyncio
-from typing import List, Optional
-
-import chromadb
-import pandas as pd
-from db.models import CompanyTable, InvestorTable, InvestorTeamMember, SectorTable
-from langchain_core.prompts import PromptTemplate
-from langchain_openai import ChatOpenAI
-from py_schemas import InvestorData
-from pydantic import BaseModel
-from settings import settings
-
-
-class InvestorOutput(BaseModel):
-    """Schema for LLM structured output"""
-
-    investor_data: InvestorData
-
-
-class InvestorProcessor:
-    def __init__(
-        self,
-        sql_session: Optional[object] = None,
-        vector_db_client: Optional[object] = None,
-    ):
-        self.template = """You are an expert data extraction assistant. Extract investor information from the provided CSV data and return it as a structured record.
-
-Given the following CSV data row:
-{question}
-
-Extract and structure the following fields for the investor:
- name: The investor's full name
- description: Description of the investor
- aum: Assets under management (as integer, use 0 if not available)
- check_size_lower: Lower bound of investment check size (as integer)
- check_size_upper: Upper bound of investment check size (as integer)
- geographic_focus: Geographic region focus
- stage_focus: Investment stage focus (must be one of: seed, series_a, series_b, series_c, growth, late_stage)
- number_of_investments: Number of investments made (default 0)
-
-Also extract related data:
- portfolio_companies: List of companies they've invested in
- team_members: List of team members with name, role, email
- sectors: List of sectors they focus on
-
-Important: 
- If a field is not available, use appropriate defaults
- stage_focus must be one of the valid enum values
- Return clean, valid JSON only
-
-Return the data as a single comprehensive investor data record."""
-
-        self.prompt = PromptTemplate(
-            template=self.template, input_variables=["question"]
-        )
-
-        self.llm = ChatOpenAI(
-            api_key=settings.OPENROUTER_API_KEY,
-            base_url="https://openrouter.ai/api/v1",
-            model="google/gemini-2.5-flash-lite",
-            temperature=0,
-        )
-
-        self.structured_llm = self.llm.with_structured_output(InvestorOutput)
-        self.sql_session = sql_session
-        self.vector_db_client = vector_db_client
-
-        self.vector_db_client = chromadb.PersistentClient(path="./chroma_db")
-        self.collection = self.vector_db_client.get_or_create_collection(
-            name="investor_descriptions",
-            metadata={
-                "description": "Investor descriptions and investment thesis focus"
-            },
-        )
-
-    async def _process_row(
-        self, row: pd.Series, row_idx: int
-    ) -> Optional[InvestorData]:
-        """Process a single row of data"""
-        # Clean values to remove control characters
-        cleaned_row = {}
-        for key, value in row.items():
-            if pd.notna(value):
-                # Convert to string and clean control characters
-                clean_value = (
-                    str(value)
-                    .replace("\n", " ")
-                    .replace("\r", " ")
-                    .replace("\t", " ")
-                )
-                # Remove other control characters
-                clean_value = "".join(
-                    char
-                    for char in clean_value
-                    if ord(char) >= 32 or char in ["\n", "\r", "\t"]
-                )
-                cleaned_row[key] = clean_value
-
-        row_str = ", ".join(
-            [f"{key}: {value}" for key, value in cleaned_row.items()]
-        )
-
-        try:
-            print(f"Processing row {row_idx + 1}...")
-            result = await self.structured_llm.ainvoke(row_str)
-            if result.investor_data:
-                return result.investor_data
-            return None
-        except Exception as e:
-            print(f"Error processing row {row_idx + 1}: {e}")
-            return None
-
-    async def _save_to_sql(self, investor_data_list: List[InvestorData]) -> None:
-        """Save investors and related data to SQL database"""
-        if not self.sql_session:
-            return
-
-        try:
-            for investor_data in investor_data_list:
-                # Save investor
-                db_investor = InvestorTable(
-                    name=investor_data.investor.name,
-                    description=investor_data.investor.description,
-                    aum=investor_data.investor.aum,
-                    check_size_lower=investor_data.investor.check_size_lower,
-                    check_size_upper=investor_data.investor.check_size_upper,
-                    geographic_focus=investor_data.investor.geographic_focus,
-                    stage_focus=investor_data.investor.stage_focus,
-                    number_of_investments=investor_data.investor.number_of_investments,
-                )
-                self.sql_session.add(db_investor)
-                self.sql_session.flush()  # Get the ID
-
-                # Save sectors and create associations
-                for sector_data in investor_data.sectors:
-                    # Check if sector exists, create if not
-                    existing_sector = (
-                        self.sql_session.query(SectorTable)
-                        .filter(SectorTable.name == sector_data.name)
-                        .first()
-                    )
-
-                    if not existing_sector:
-                        db_sector = SectorTable(name=sector_data.name)
-                        self.sql_session.add(db_sector)
-                        self.sql_session.flush()
-                        # Add sector to investor's sectors
-                        db_investor.sectors.append(db_sector)
-                    else:
-                        # Add existing sector to investor if not already there
-                        if existing_sector not in db_investor.sectors:
-                            db_investor.sectors.append(existing_sector)
-
-                # Save companies and create portfolio associations
-                for company_data in investor_data.portfolio_companies:
-                    # Check if company exists, create if not
-                    existing_company = (
-                        self.sql_session.query(CompanyTable)
-                        .filter(CompanyTable.name == company_data.name)
-                        .first()
-                    )
-
-                    if not existing_company:
-                        db_company = CompanyTable(
-                            name=company_data.name,
-                            industry=company_data.industry,
-                            location=company_data.location,
-                            founded_year=company_data.founded_year,
-                            website=company_data.website,
-                        )
-                        self.sql_session.add(db_company)
-                        self.sql_session.flush()
-
-                        # Add to investor's portfolio
-                        db_investor.portfolio_companies.append(db_company)
-                    else:
-                        # Add existing company to portfolio if not already there
-                        if existing_company not in db_investor.portfolio_companies:
-                            db_investor.portfolio_companies.append(existing_company)
-
-                # Save team members
-                for team_member_data in investor_data.team_members:
-                    # Check if team member exists
-                    existing_member = (
-                        self.sql_session.query(InvestorTeamMember)
-                        .filter(InvestorTeamMember.email == team_member_data.email)
-                        .first()
-                    )
-
-                    if not existing_member:
-                        db_team_member = InvestorTeamMember(
-                            name=team_member_data.name,
-                            role=team_member_data.role,
-                            email=team_member_data.email,
-                            investor_id=db_investor.id,
-                        )
-                        self.sql_session.add(db_team_member)
-
-            self.sql_session.commit()
-            print(f"Successfully saved {len(investor_data_list)} investors to database")
-
-        except Exception as e:
-            self.sql_session.rollback()
-            print(f"Error saving to SQL database: {e}")
-            raise
-
-    async def _save_to_vector_db(self, investor_data_list: List[InvestorData]) -> None:
-        """Save investors to vector database"""
-        if not self.vector_db_client:
-            return
-
-        documents = []
-        metadatas = []
-        ids = []
-
-        for i, investor_data in enumerate(investor_data_list):
-            investor = investor_data.investor
-            sectors = ", ".join([s.name for s in investor_data.sectors])
-            companies = ", ".join([c.name for c in investor_data.portfolio_companies])
-
-            doc_text = f"""
-            Investor: {investor.name}
-            Description: {investor.description or "N/A"}
-            AUM: ${investor.aum:,}
-            Check Size: ${investor.check_size_lower:,} - ${investor.check_size_upper:,}
-            Geographic Focus: {investor.geographic_focus}
-            Stage Focus: {investor.stage_focus.value}
-            Sectors: {sectors}
-            Portfolio Companies: {companies}
-            """.strip()
-
-            documents.append(doc_text)
-            metadatas.append(
-                {
-                    "name": investor.name,
-                    "stage_focus": investor.stage_focus.value,
-                    "geographic_focus": investor.geographic_focus,
-                    "aum": investor.aum,
-                }
-            )
-            ids.append(
-                f"investor_{i}_{investor.name.replace(' ', '_').replace('/', '_')}"
-            )
-
-        if documents:
-            try:
-                self.collection.add(documents=documents, metadatas=metadatas, ids=ids)
-                print(
-                    f"Successfully saved {len(documents)} investors to vector database"
-                )
-            except Exception as e:
-                print(f"Error saving to vector database: {e}")
-
-    async def process_csv(
-        self, df: pd.DataFrame, max_concurrent: int = 10
-    ) -> List[InvestorData]:
-        """Process CSV data one row at a time and save to databases"""
-        results = []
-
-        # Create semaphore for concurrency control
-        semaphore = asyncio.Semaphore(max_concurrent)
-
-        async def process_row_with_semaphore(row_data):
-            row, row_idx = row_data
-            async with semaphore:
-                return await self._process_row(row, row_idx)
-
-        # Create row tasks
-        row_tasks = []
-        for idx, row in df.iterrows():
-            row_tasks.append((row, idx))
-
-        # Execute all rows concurrently
-        row_results = await asyncio.gather(
-            *[process_row_with_semaphore(row_data) for row_data in row_tasks],
-            return_exceptions=True,
-        )
-
-        # Collect results, filtering out exceptions and None values
-        for row_result in row_results:
-            if not isinstance(row_result, Exception) and row_result is not None:
-                results.append(row_result)
-
-        # Save to databases
-        if results:
-            print(f"Successfully processed {len(results)} investors")
-            await self._save_to_sql(results)
-            await self._save_to_vector_db(results)
-
-        return results
@@ -1,88 +1,47 @@
-from typing import List, Optional
+import os
+from typing import List

-import chromadb
+from db.db import DATABASE_URL, get_db
 from db.models import InvestorTable
 from langchain import hub
 from langchain_community.agent_toolkits import SQLDatabaseToolkit
 from langchain_community.utilities import SQLDatabase
 from langchain_openai import ChatOpenAI
 from langgraph.prebuilt import create_react_agent
-from py_schemas import InvestorData, InvestorList
-from settings import settings
+from schemas.py_schemas import InvestorData, InvestorList
 from sqlalchemy.orm import selectinload

 # Connect to SQLite
-
 prompt_template = hub.pull("langchain-ai/sql-agent-system-prompt")
-db = SQLDatabase.from_uri("sqlite:///investors.db")
-system_message = (
-    prompt_template.format(dialect="SQLite", top_k=5)
-    + "\n Get answers from the Sql database and the vector database"
-)
+db = SQLDatabase.from_uri(DATABASE_URL)


 class QueryProcessor:
-    def __init__(
-        self,
-        sql_session: Optional[object] = None,
-        vector_db_client: Optional[object] = None,
-    ):
-        self.sql_session = sql_session
+    def __init__(self):
        self.llm = ChatOpenAI(
-            api_key=settings.OPENROUTER_API_KEY,
+            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
-            model="google/gemini-2.5-flash-lite",
+            model="openai/gpt-5-nano",
            temperature=0.3,
        )
        self.toolkit = SQLDatabaseToolkit(db=db, llm=self.llm)
+        # Update system message to specifically request only investor IDs
+        system_message_updated = (
+            prompt_template.format(dialect="SQLite", top_k=5)
+            + "\n\nIMPORTANT: You must ONLY return the investor IDs (id field) that match the user's criteria. "
+            + "Do NOT return any other information, explanations, or data. "
+            + "Your response should be ONLY a comma-separated list of numbers representing the investor IDs. "
+            + "Example format: 1, 5, 12, 23"
+        )
        self.agent = create_react_agent(
            model=self.llm,
-            tools=self.toolkit.get_tools() + [self.query_vector_database],
-            prompt=system_message,
+            tools=self.toolkit.get_tools(),
+            prompt=system_message_updated,
        )
-        self.vector_db_client = vector_db_client
-
-        self.vector_db_client = chromadb.PersistentClient(path="./chroma_db")
-        self.collection = self.vector_db_client.get_or_create_collection(
-            name="investor_descriptions",
-            metadata={
-                "description": "Investor descriptions and investment thesis focus"
-            },
-        )
-
-    def query_sql_database(self, query: str) -> Optional[InvestorList]:
-        """Query the SQL database for investor information."""
-        if not self.sql_session:
-            return None
-
-        # Implement SQL querying logic here
-        result = self.sql_session.execute(query)
-        investors = result.scalars().all()
-        return InvestorList(investors=investors)
-
-    def query_vector_database(self, query: str) -> Optional[InvestorList]:
-        """Query the vector database for investor information."""
-        if not self.vector_db_client:
-            return None
-        print("VECTOR STORE WAS CALLED")
-
-        # Query the collection directly, not passing collection as parameter
-        results = self.collection.query(
-            query_texts=[query],  # ChromaDB expects a list of query texts
-            n_results=3,  # Specify how many results you want
-        )
-        print(results)
-
-        # ChromaDB returns results in a different structure
-        # results will have 'documents', 'metadatas', 'ids', 'distances'
-        return results

    def process_query(self, question: str) -> InvestorList:
-        """Process a query using the LLM and return structured investor data."""
-        # Extract filters from the query first
-        filters = self._extract_filters_from_query(question)
-
-        # Get AI response for additional context
+        """Process a query using the LLM and return investor data."""
+        # Let the LLM handle all database interactions and filtering to get IDs
        response = self.agent.invoke(
            {"messages": [("user", question)]},
        )
@@ -92,189 +51,68 @@ class QueryProcessor:
            response["messages"][-1].content if response.get("messages") else ""
        )

-        # Try to extract investor IDs or names from the AI response
-        investor_ids = self._extract_investor_info_from_response(ai_response)
+        # Extract investor IDs from the AI response
+        investor_ids = self._extract_investor_ids_from_response(ai_response)

-        # Fetch filtered investor data with relationships from database
-        return self._fetch_investors_with_relationships(investor_ids, filters)
+        # Fetch full investor data using the IDs
+        return self._fetch_investors_by_ids(investor_ids)

-    def _extract_investor_info_from_response(self, ai_response: str) -> List[int]:
-        """Extract investor IDs from AI response. This is a simple implementation."""
-        # This is a basic implementation - you might want to make it more sophisticated
-        # based on how your AI formats responses
-        investor_ids = []
-
-        # If the AI can't provide structured data, fall back to getting all investors
-        # that match basic criteria
-        try:
-            # Try to extract numbers that might be IDs
-            import re
-
-            ids = re.findall(r"\bid:\s*(\d+)", ai_response.lower())
-            investor_ids = [int(id_str) for id_str in ids]
-        except Exception:
-            pass
-
-        return investor_ids if investor_ids else []
-
-    def _extract_filters_from_query(self, question: str) -> dict:
-        """Extract filter criteria from natural language query."""
-        question_lower = question.lower()
-        filters = {}
-
-        # Extract stage filters
-        if any(
-            stage in question_lower
-            for stage in [
-                "seed",
-                "series a",
-                "series b",
-                "series c",
-                "growth",
-                "late stage",
-            ]
-        ):
-            if "seed" in question_lower:
-                filters["stage"] = "SEED"
-            elif "series a" in question_lower:
-                filters["stage"] = "SERIES_A"
-            elif "series b" in question_lower:
-                filters["stage"] = "SERIES_B"
-            elif "series c" in question_lower:
-                filters["stage"] = "SERIES_C"
-            elif "growth" in question_lower:
-                filters["stage"] = "GROWTH"
-            elif "late stage" in question_lower:
-                filters["stage"] = "LATE_STAGE"
-
-        # Extract geographic filters
-        if any(
-            geo in question_lower
-            for geo in [
-                "us",
-                "usa",
-                "united states",
-                "europe",
-                "asia",
-                "silicon valley",
-                "bay area",
-            ]
-        ):
-            if (
-                "us" in question_lower
-                or "usa" in question_lower
-                or "united states" in question_lower
-            ):
-                filters["geography"] = "US"
-            elif "europe" in question_lower:
-                filters["geography"] = "Europe"
-            elif "asia" in question_lower:
-                filters["geography"] = "Asia"
-            elif "silicon valley" in question_lower or "bay area" in question_lower:
-                filters["geography"] = "Silicon Valley"
-
-        # Extract sector filters
-        sectors = [
-            "fintech",
-            "healthcare",
-            "saas",
-            "ai",
-            "biotech",
-            "consumer",
-            "enterprise",
-            "crypto",
-            "blockchain",
-        ]
-        for sector in sectors:
-            if sector in question_lower:
-                filters["sector"] = sector
-                break
-
-        # Extract check size filters (simple patterns)
+    def _extract_investor_ids_from_response(self, ai_response: str) -> List[int]:
+        """Extract investor IDs from AI response."""
        import re

-        amounts = re.findall(
-            r"\$?(\d+(?:,\d{3})*(?:\.\d+)?)\s*(?:million|m|k|thousand)", question_lower
-        )
-        if amounts:
-            amount = amounts[0].replace(",", "")
-            if "million" in question_lower or "m" in question_lower:
-                filters["min_check_size"] = int(float(amount) * 1000000)
-            elif "thousand" in question_lower or "k" in question_lower:
-                filters["min_check_size"] = int(float(amount) * 1000)
+        investor_ids = []
+        try:
+            # Try multiple patterns to extract IDs from the response
+            # Pattern 1: Simple numbers (assuming they are IDs)
+            numbers = re.findall(r"\b\d+\b", ai_response)
+            investor_ids = [int(num) for num in numbers]

-        return filters
+            # Pattern 2: If response contains explicit ID references
+            id_matches = re.findall(r"\bid[:\s]*(\d+)", ai_response.lower())
+            if id_matches:
+                investor_ids = [int(id_str) for id_str in id_matches]

-    def _fetch_investors_with_relationships(
-        self, investor_ids: List[int] = None, filters: dict = None
-    ) -> InvestorList:
-        """Fetch investors with all their relationships from the database."""
-        if not self.sql_session:
+        except Exception as e:
+            print(f"Error extracting IDs from response: {e}")
+            return []
+
+        return investor_ids
+
+    def _fetch_investors_by_ids(self, investor_ids: List[int]) -> InvestorList:
+        """Fetch investors with all their relationships from the database using IDs."""
+        if not investor_ids:
            return InvestorList(investors=[])

-        # Import here to avoid circular imports
-        from db.models import SectorTable
+        # Get database session
+        db_session = next(get_db())

-        # Build query with all relationships loaded
-        query = self.sql_session.query(InvestorTable).options(
-            selectinload(InvestorTable.portfolio_companies),
-            selectinload(InvestorTable.team_members),
-            selectinload(InvestorTable.sectors),
-        )
-
-        # Apply filters if provided
-        if filters:
-            if "stage" in filters:
-                from db.models import InvestmentStage
-
-                stage_enum = getattr(InvestmentStage, filters["stage"])
-                query = query.filter(InvestorTable.stage_focus == stage_enum)
-
-            if "geography" in filters:
-                query = query.filter(
-                    InvestorTable.geographic_focus.ilike(f"%{filters['geography']}%")
+        try:
+            # Build query with all relationships loaded
+            query = (
+                db_session.query(InvestorTable)
+                .options(
+                    selectinload(InvestorTable.portfolio_companies),
+                    selectinload(InvestorTable.team_members),
+                    selectinload(InvestorTable.sectors),
                )
-
-            if "min_check_size" in filters:
-                query = query.filter(
-                    InvestorTable.check_size_lower >= filters["min_check_size"]
-                )
-
-            if "max_check_size" in filters:
-                query = query.filter(
-                    InvestorTable.check_size_upper <= filters["max_check_size"]
-                )
-
-            if "min_aum" in filters:
-                query = query.filter(InvestorTable.aum >= filters["min_aum"])
-
-            if "max_aum" in filters:
-                query = query.filter(InvestorTable.aum <= filters["max_aum"])
-
-            if "sector" in filters:
-                query = query.join(InvestorTable.sectors).filter(
-                    SectorTable.name.ilike(f"%{filters['sector']}%")
-                )
-
-        # Filter by IDs if provided
-        if investor_ids:
-            query = query.filter(InvestorTable.id.in_(investor_ids))
-        else:
-            # If no specific IDs and no filters, limit to prevent overwhelming response
-            if not filters:
-                query = query.limit(10)
-
-        investors = query.all()
-
-        # Transform to InvestorData format
-        investor_data_list = []
-        for investor in investors:
-            investor_data = InvestorData(
-                investor=investor,
-                portfolio_companies=investor.portfolio_companies,
-                team_members=investor.team_members,
-                sectors=investor.sectors,
+                .filter(InvestorTable.id.in_(investor_ids))
            )
-            investor_data_list.append(investor_data)

-        return InvestorList(investors=investor_data_list)
+            investors = query.all()
+
+            # Transform to InvestorData format
+            investor_data_list = []
+            for investor in investors:
+                investor_data = InvestorData(
+                    investor=investor,
+                    portfolio_companies=investor.portfolio_companies,
+                    team_members=investor.team_members,
+                    sectors=investor.sectors,
+                )
+                investor_data_list.append(investor_data)
+
+            return InvestorList(investors=investor_data_list)
+
+        finally:
+            db_session.close()