diff --git a/COMPANY_PARSER_DOCS.md b/COMPANY_PARSER_DOCS.md deleted file mode 100644 index 3d874c4..0000000 --- a/COMPANY_PARSER_DOCS.md +++ /dev/null @@ -1,452 +0,0 @@ -# Company Parser Documentation - -## Overview - -The company CSV parser has been updated to use **100% manual JSON parsing** with **zero LLM calls**. This makes it extremely fast, cost-effective, and reliable. - -## Key Features - -### πŸš€ No LLM Required - -- **Manual JSON parsing** extracts all data directly from CSV -- **No AI calls** needed for structure parsing -- **Instant processing** - no API delays -- **Zero cost** - no LLM API fees - -### πŸ“Š Data Extracted - -**Basic Information:** - -- Company name -- Website -- Location/geographic focus -- Industry/sector description -- Founded year (auto-extracted from description) - -**People:** - -- Key executives/senior leadership -- Titles and roles -- Source URLs - -**Relationships:** - -- Investor names (from CSV column) -- Automatic linking to investors in database - -**Additional Data:** - -- Client categories -- Product descriptions -- Linked documents -- Researcher notes -- Missing fields tracking -- Data sources - -## CSV Format - -### Required Columns - -| Column Name | Description | Required | -| ------------------------ | ------------------------------ | -------- | -| `Name` | Company name | Yes | -| `Website` | Company website URL | No | -| `Investor` | Comma-separated investor names | No | -| `Final Investor Profile` | JSON string with company data | Yes | - -### JSON Profile Structure - -The `Final Investor Profile` column should contain a JSON object with: - -```json -{ - "companyDescription": "Company description text...", - "geographicFocus": "Location/HQ and sales focus", - "sectorDescription": "Industry/sector description", - "keyExecutives": [ - { - "name": "John Doe", - "title": "CEO", - "sourceUrl": "https://company.com/team" - } - ], - "clientCategories": ["Category 1", "Category 2"], - "productDescription": 
"Product/service description", - "linkedDocuments": ["https://doc1.com", "https://doc2.com"], - "researcherNotes": "Research notes...", - "missingImportantFields": ["field1", "field2"], - "sources": { - "companyDescription": "https://source1.com", - "keyExecutives": "https://source2.com" - } -} -``` - -## Usage - -### Via API - -```bash -curl -X POST "http://localhost:8585/parse-csv" \ - -F "file=@data/300 Companies data.csv" \ - -F "is_investor=0" -``` - -### Programmatically - -```python -import pandas as pd -from services.llm_parser import InvestorProcessor - -# Load CSV -df = pd.read_csv('companies.csv') - -# Create processor -processor = InvestorProcessor() - -# Parse and save to database (no LLM needed!) -results = await processor.parse_companies(df, save_to_db=True) -``` - -### Testing (Dry Run) - -```bash -python3 test_company_parser.py -``` - -## Processing Output - -### Console Example - -``` -πŸš€ Starting to process 100 companies... - -πŸ“Š Processing 1/100: Mammaly - βœ“ Parsed successfully - - Location: Berlin, Germany - - Industry: Pet health and nutrition - - Founded: 2020 - - Executives: 3 - - Investors: 3 - βœ… Saved to database (ID: 1234) - -πŸ“Š Processing 2/100: Ljusgarda - βœ“ Parsed successfully - - Location: Sweden - - Industry: Indoor agriculture - - Founded: 2018 - - Executives: 1 - - Investors: 4 - βœ… Saved to database (ID: 1235) - -πŸ’Ύ Committed batch at row 10 - -... - -πŸŽ‰ Completed! 
Processed 100/100 companies -``` - -## Database Schema - -### CompanyTable - -```python -class CompanyTable: - id: int - name: str - website: str | None - location: str | None - description: str | None - industry: str | None - founded_year: int | None - created_at: datetime - updated_at: datetime | None - - # Relationships - members: List[CompanyMember] # Key executives - investors: List[InvestorTable] # Linked investors - sectors: List[SectorTable] -``` - -### CompanyMember - -```python -class CompanyMember: - id: int - name: str - role: str | None # Job title - linkedin: str | None # Source URL - company_id: int -``` - -### Investor Linking - -Companies are automatically linked to investors: - -```python -# If investor exists in database -investor = db.query(InvestorTable).filter_by(name="Five Seasons Ventures").first() -if investor: - investor.portfolio_companies.append(company) -``` - -## Features - -### 1. Automatic Founding Year Extraction - -The parser automatically extracts founding years from company descriptions: - -**Patterns Recognized:** - -- "founded in 2020" -- "founded 2020" -- "Gegründet 2020" (German) -- "established in 2020" -- "since 2020" -- "(2020)" - year in parentheses - -**Example:** - -``` -Description: "mammaly is a leading European pet health startup founded in 2020..." -→ Founded Year: 2020 -``` - -### 2. Executive Name Extraction - -Extracts from multiple possible field names: - -- `keyExecutives` -- `seniorLeadership` - -### 3. Investor Relationship Management - -- Parses comma-separated investor names -- Links to existing investors in database -- Adds company to investor's portfolio -- Skips non-existent investors (logs warning) - -### 4. 
Upsert Logic - -- Updates existing companies with same name -- Preserves existing data if new data is null -- Replaces team members on update -- Maintains investor relationships - -## Performance - -### Speed - -| Metric | Value | -| ---------------------- | ------------ | -| Processing per company | ~1-2 seconds | -| 100 companies | ~2-3 minutes | -| 300 companies | ~6-9 minutes | - -### Comparison with Old LLM Parser - -| Metric | Old LLM Parser | New Manual Parser | Improvement | -| --------- | -------------- | ----------------- | ----------------- | -| Speed | 30-60s/company | 1-2s/company | **95%+ faster** | -| Cost | $0.02/company | $0.00/company | **100% savings** | -| API calls | 10-20/company | 0/company | **No LLM needed** | -| Accuracy | Variable | Consistent | **More reliable** | - -## Error Handling - -### Graceful Failures - -```python -# Missing required fields -if not name or not profile_json: - print("⚠️ Skipping - missing name or profile") - continue - -# JSON parsing errors -try: - profile = json.loads(profile_json) -except json.JSONDecodeError: - print("❌ Invalid JSON") - continue - -# Database errors -try: - db.commit() -except Exception as e: - db.rollback() - print(f"❌ Database error: {e}") -``` - -### Batch Commits - -Commits every 10 companies to avoid memory issues and ensure data persistence even if later errors occur. 
- -## Query Examples - -### Get Companies by Industry - -```python -companies = db.query(CompanyTable).filter( - CompanyTable.industry.like('%agriculture%') -).all() -``` - -### Get Companies Founded After 2018 - -```python -companies = db.query(CompanyTable).filter( - CompanyTable.founded_year >= 2018 -).all() -``` - -### Get Companies with Specific Investor - -```python -investor = db.query(InvestorTable).filter_by(name="Five Seasons Ventures").first() -companies = investor.portfolio_companies -``` - -### Get Companies by Location - -```python -companies = db.query(CompanyTable).filter( - CompanyTable.location.like('%Germany%') -).all() -``` - -## Benefits - -### 1. Speed ⚑ - -- **95%+ faster** than LLM-based parsing -- No API call delays -- Instant JSON parsing - -### 2. Cost πŸ’° - -- **$0 per company** (vs $0.02 with LLM) -- No LLM API fees -- 100% savings on large datasets - -### 3. Reliability 🎯 - -- **Consistent parsing** every time -- No LLM hallucinations -- Predictable results - -### 4. Simplicity 🧩 - -- **Zero configuration** needed -- No API keys required for companies -- Straightforward JSON parsing - -### 5. Completeness πŸ“‹ - -- Extracts **all available fields** -- No data loss -- Preserves source references - -## Integration with Investors - -Companies can reference investors, and investors can have companies in their portfolio: - -```python -# Query investors of a company -company = db.query(CompanyTable).filter_by(name="Mammaly").first() -investors = company.investors - -# Query companies of an investor -investor = db.query(InvestorTable).filter_by(name="Five Seasons Ventures").first() -companies = investor.portfolio_companies -``` - -## Troubleshooting - -### Issue: Company not saved - -**Check:** - -1. Valid JSON in `Final Investor Profile` column -2. Company `name` is not empty -3. No database constraint violations - -### Issue: Investors not linked - -**Possible causes:** - -1. Investor doesn't exist in database yet -2. 
Investor name spelling doesn't match exactly -3. Parse investors CSV first, then companies - -**Solution:** - -```python -# Always parse investors first -await processor.parse_investors(investors_df, save_to_db=True) -# Then parse companies -await processor.parse_companies(companies_df, save_to_db=True) -``` - -### Issue: Founded year not extracted - -**Reason:** Description doesn't contain recognizable year pattern - -**Solution:** Year patterns are best-effort. Add more patterns if needed or set manually: - -```python -company.founded_year = 2020 -db.commit() -``` - -## Extending the Parser - -### Add New Fields - -```python -# In process_company_profile method -company_data = { - # ... existing fields ... - "new_field": profile.get("newFieldName"), -} -``` - -### Add New Year Patterns - -```python -year_patterns = [ - # ... existing patterns ... - r'started in (\d{4})', - r'launched (\d{4})', -] -``` - -### Custom Post-Processing - -```python -async def parse_companies(self, df, save_to_db=True): - # ... existing code ... - - for company_data in results: - # Custom processing here - if company_data['industry'] == 'agriculture': - company_data['category'] = 'agtech' -``` - -## Best Practices - -1. **Parse investors first** - ensures investor relationships work -2. **Test on small sample** - use `save_to_db=False` first -3. **Check data quality** - review first few results -4. **Commit in batches** - default 10 companies per commit -5. **Monitor console** - watch for errors and warnings - -## Summary - -βœ… **100% manual parsing** - No LLM needed -βœ… **Instant processing** - 1-2s per company -βœ… **Zero cost** - No API fees -βœ… **Reliable** - Consistent results -βœ… **Complete** - All fields extracted -βœ… **Integrated** - Auto-links to investors - -The company parser is now as efficient as the investor parser, with the added benefit of requiring **zero LLM calls**! 
diff --git a/SCHEMA_FIX.md b/SCHEMA_FIX.md deleted file mode 100644 index 91abd5d..0000000 --- a/SCHEMA_FIX.md +++ /dev/null @@ -1,237 +0,0 @@ -# Schema Mismatch Fix - Summary - -## Problem - -When trying to parse the investor CSV, the following error occurred: - -``` -sqlite3.OperationalError: no such column: investors.stage_focus -``` - -## Root Cause - -The application models still referenced `stage_focus` column which was removed from the preprocessor database schema. The `stage_focus` was deprecated in favor of fund-level stage tracking (each fund has its own `investment_stage_focus`). - -## Files Fixed - -### 1. βœ… `app/db/models.py` - -**Removed:** `stage_focus` column from `InvestorTable` - -```python -# BEFORE: -stage_focus = Column(Enum(InvestmentStage), nullable=True) - -# AFTER: -# Removed completely -``` - -### 2. βœ… `app/schemas/py_schemas.py` - -**Removed:** `stage_focus` field from `InvestorSchema` - -```python -# BEFORE: -stage_focus: InvestmentStage = Field( - default=InvestmentStage.SEED, - description="Investment stage focus..." -) - -# AFTER: -# Removed completely -``` - -### 3. βœ… `app/services/llm_parser.py` - -**Removed:** `stage_focus` parameter from `_save_investor_to_db()` method - -```python -# BEFORE: -investor = InvestorTable( - ... - stage_focus=investor_data.investor.stage_focus, - ... -) - -# AFTER: -investor = InvestorTable( - ... - # stage_focus removed - ... -) -``` - -### 4. 
βœ… `app/db/db.py` - -**Fixed:** Database path to use absolute path to preprocessor database - -```python -# BEFORE: -DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///./investors.db") - -# AFTER: -APP_DIR = Path(__file__).parent.parent -PREPROCESSOR_DB = APP_DIR.parent / "preprocessor" / "version_two.db" -DATABASE_URL = os.getenv("DATABASE_URL", f"sqlite:///{PREPROCESSOR_DB}") -``` - -## Verification - -Created `verify_schema.py` to check database schema: - -```bash -python3 verify_schema.py -``` - -**Results:** - -``` -βœ… 'stage_focus' column not in database (as expected) -βœ… All required enriched columns present -βœ… aum column is INTEGER type (correct) -``` - -## Architecture Decision - -**Stage Focus Tracking:** - -- ❌ **Old:** Single `stage_focus` at investor level -- βœ… **New:** Multiple stages tracked per fund via `investment_stage_focus` JSON array - -This allows investors with multiple funds targeting different stages. - -**Example:** - -```python -# Investor: Alumni Ventures -funds = [ - { - "fund_name": "Seed Fund", - "investment_stage_focus": ["Seed", "Early Stage"] - }, - { - "fund_name": "Growth Fund", - "investment_stage_focus": ["Series B", "Series C", "Growth"] - } -] -``` - -## Database Schema Status - -### InvestorTable (Current) - -``` -βœ… aum: INTEGER (for numerical filtering) -βœ… investment_thesis: JSON (array) -βœ… portfolio_highlights: JSON (array) -βœ… linked_documents: JSON (array) -βœ… researcher_notes: TEXT -βœ… missing_important_fields: JSON (array) -βœ… sources: JSON (object) -❌ stage_focus: REMOVED (moved to fund level) -``` - -### FundTable (Current) - -``` -βœ… fund_name: VARCHAR -βœ… fund_size: VARCHAR (USD integer as string) -βœ… estimated_investment_size: VARCHAR (USD integer as string) -βœ… geographic_focus: JSON (array) -βœ… investment_stage_focus: JSON (array) ⭐ REPLACES investor.stage_focus -βœ… sector_focus: JSON (array) -``` - -## Testing - -### Before Fix - -``` -❌ Error: no such column: investors.stage_focus -❌ 
Failed to save to database -``` - -### After Fix - -```bash -# Test with API -curl -X POST "http://localhost:8585/parse-csv" \ - -F "file=@data/300 Investors data.csv" \ - -F "is_investor=1" - -# Expected: Successfully parses and saves investors -``` - -## Migration Notes - -**For existing code that queries stage_focus:** - -```python -# OLD CODE (will break): -investors = db.query(InvestorTable).filter( - InvestorTable.stage_focus == InvestmentStage.SEED -).all() - -# NEW CODE (correct): -from sqlalchemy import func - -investors = db.query(InvestorTable).join(FundTable).filter( - func.json_extract(FundTable.investment_stage_focus, '$').contains('Seed') -).all() - -# Or, more simply, use a substring match on the stored JSON text (note: '%Seed%' also matches values such as "Pre-Seed"): -investors = db.query(InvestorTable).join(FundTable).filter( - FundTable.investment_stage_focus.like('%Seed%') -).all() -``` - -## Benefits of This Change - -1. **Accurate Representation:** Investors can have multiple funds with different stage focuses -2. **No Data Loss:** Stage information preserved at fund level -3. **Better Queries:** Can filter by specific fund characteristics -4. **Scalability:** Supports complex investor portfolios - -## Next Steps - -1. ✅ Schema fixed -2. ✅ Database path corrected -3. ✅ Verification script created -4. 🔄 Ready to parse investor CSV -5. 
πŸ“ Update any existing queries that used `stage_focus` - -## Quick Reference - -**Correct Database Path:** - -``` -/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/preprocessor/version_two.db -``` - -**Access Fund Stage Info:** - -```python -for investor in investors: - for fund in investor.funds: - print(f"{fund.fund_name}: {fund.investment_stage_focus}") -``` - -**Query by Stage:** - -```python -# Get all seed-stage funds -seed_funds = db.query(FundTable).filter( - FundTable.investment_stage_focus.contains('Seed') -).all() - -# Get investors with seed funds -seed_investors = db.query(InvestorTable).join(FundTable).filter( - FundTable.investment_stage_focus.contains('Seed') -).distinct().all() -``` - -## Status - -βœ… **FIXED:** All schema mismatches resolved -βœ… **VERIFIED:** Database schema validated -βœ… **READY:** Can now parse investor CSV without errors diff --git a/app/db/models.py b/app/db/models.py index 3ae0253..86acced 100644 --- a/app/db/models.py +++ b/app/db/models.py @@ -160,11 +160,15 @@ class FundTable(Base, TimestampMixin): # Fund details fund_name = Column(String, nullable=True) - fund_size = Column(String, nullable=True) # Store as string to preserve currency + fund_size = Column( + Integer, nullable=True + ) # Store as integer for numerical filtering fund_size_source_url = Column(String, nullable=True) - estimated_investment_size = Column( - String, nullable=True - ) # e.g., "EUR 1,000 to 2,000" + + # Check size range (parsed from estimated_investment_size by LLM) + check_size_lower = Column(Integer, nullable=True) + check_size_upper = Column(Integer, nullable=True) + source_url = Column(String, nullable=True) source_provider = Column(String, nullable=True) # e.g., "Perplexity" diff --git a/app/routers/investors.py b/app/routers/investors.py index 3b1b20b..b26ebcb 100644 --- a/app/routers/investors.py +++ b/app/routers/investors.py @@ -4,7 +4,11 @@ from db.db import get_db from db.models import InvestorTable, SectorTable from fastapi 
import APIRouter, Depends, HTTPException, Query from pydantic import BaseModel -from schemas.router_schemas import InvestmentStage, InvestorData +from schemas.router_schemas import ( + InvestmentStage, + InvestorData, + InvestorFundData, +) from sqlalchemy.orm import Session, selectinload router = APIRouter(tags=["Investor Routes"]) @@ -33,34 +37,95 @@ class InvestorUpdate(BaseModel): number_of_investments: Optional[int] = None -@router.get("/investors", response_model=List[InvestorData]) +@router.get("/investors", response_model=List[InvestorFundData]) def read_investors(db: Session = Depends(get_db)): - """Get all investors with their related data""" + """Get all investors with their funds as separate entries + + Each investor-fund combination is returned as a separate row. + An investor with 3 funds will appear as 3 entries. + """ investors = ( db.query(InvestorTable) .options( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .all() ) - # Transform InvestorTable objects to InvestorData format - investor_data_list = [] + # Transform to InvestorFundData format (one row per investor-fund combination) + investor_fund_list = [] for investor in investors: - investor_data = InvestorData( - investor=investor, # This maps to InvestorSchema - portfolio_companies=investor.portfolio_companies, - team_members=investor.team_members, - sectors=investor.sectors, - ) - investor_data_list.append(investor_data) + # If investor has funds, create one entry per fund + if investor.funds: + for fund in investor.funds: + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + 
investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields + fund_id=fund.id, + fund_name=fund.fund_name, + fund_size=fund.fund_size, + fund_size_source_url=fund.fund_size_source_url, + check_size_lower=fund.check_size_lower, + check_size_upper=fund.check_size_upper, + geographic_focus=fund.geographic_focus, + investment_stage_focus=fund.investment_stage_focus, + sector_focus=fund.sector_focus, + # Related data (same for all funds of this investor) + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) + else: + # If no funds, create one entry with null fund fields + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields (null) + fund_id=None, + fund_name=None, + fund_size=None, + fund_size_source_url=None, + check_size_lower=None, + check_size_upper=None, + geographic_focus=None, + investment_stage_focus=None, + sector_focus=None, + # Related data + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) - return investor_data_list + return investor_fund_list -@router.get("/investors/filter", response_model=List[InvestorData]) +@router.get("/investors/filter", response_model=List[InvestorFundData]) def filter_investors( stage: Optional[InvestmentStage] = Query( None, 
description="Filter by investment stage" @@ -75,13 +140,18 @@ def filter_investors( max_aum: Optional[int] = Query(None, description="Maximum AUM"), db: Session = Depends(get_db), ): - """Filter investors based on various criteria""" + """Filter investors based on various criteria + + Returns investor-fund combinations as separate rows. + An investor with 3 funds will appear as 3 entries. + """ # Start with base query query = db.query(InvestorTable).options( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) # Apply filters @@ -111,29 +181,86 @@ def filter_investors( investors = query.all() - # Transform to InvestorData format - investor_data_list = [] + # Transform to InvestorFundData format (one row per investor-fund combination) + investor_fund_list = [] for investor in investors: - investor_data = InvestorData( - investor=investor, - portfolio_companies=investor.portfolio_companies, - team_members=investor.team_members, - sectors=investor.sectors, - ) - investor_data_list.append(investor_data) + # If investor has funds, create one entry per fund + if investor.funds: + for fund in investor.funds: + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields + fund_id=fund.id, + fund_name=fund.fund_name, + fund_size=fund.fund_size, + fund_size_source_url=fund.fund_size_source_url, + check_size_lower=fund.check_size_lower, + check_size_upper=fund.check_size_upper, + 
geographic_focus=fund.geographic_focus, + investment_stage_focus=fund.investment_stage_focus, + sector_focus=fund.sector_focus, + # Related data + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) + else: + # If no funds, create one entry with null fund fields + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields (null) + fund_id=None, + fund_name=None, + fund_size=None, + fund_size_source_url=None, + check_size_lower=None, + check_size_upper=None, + geographic_focus=None, + investment_stage_focus=None, + sector_focus=None, + # Related data + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) - return investor_data_list + return investor_fund_list @router.get("/investors/{investor_id}", response_model=InvestorData) def read_investor(investor_id: int, db: Session = Depends(get_db)): - """Get a specific investor by ID""" + """Get a specific investor by ID with all their funds""" investor = ( db.query(InvestorTable) .options( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id == investor_id) .first() @@ -142,12 +269,13 @@ def read_investor(investor_id: int, db: Session = Depends(get_db)): if not investor: raise 
HTTPException(status_code=404, detail="Investor not found") - # Transform to InvestorData format + # Transform to InvestorData format (includes funds array) return InvestorData( investor=investor, portfolio_companies=investor.portfolio_companies, team_members=investor.team_members, sectors=investor.sectors, + funds=investor.funds, ) @@ -166,6 +294,7 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)): selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id == db_investor.id) .first() @@ -177,6 +306,7 @@ def create_investor(investor: InvestorCreate, db: Session = Depends(get_db)): portfolio_companies=investor_with_relations.portfolio_companies, team_members=investor_with_relations.team_members, sectors=investor_with_relations.sectors, + funds=investor_with_relations.funds, ) @@ -205,6 +335,7 @@ def update_investor( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id == investor_id) .first() @@ -216,6 +347,7 @@ def update_investor( portfolio_companies=investor_with_relations.portfolio_companies, team_members=investor_with_relations.team_members, sectors=investor_with_relations.sectors, + funds=investor_with_relations.funds, ) @@ -233,13 +365,16 @@ def delete_investor(investor_id: int, db: Session = Depends(get_db)): return {"message": "Investor deleted successfully"} -@router.get("/investors/{investor_id}/similar", response_model=List[InvestorData]) +@router.get("/investors/{investor_id}/similar", response_model=List[InvestorFundData]) def find_similar_investors( investor_id: int, limit: int = Query(10, description="Maximum number of similar investors to return"), db: Session = Depends(get_db), ): - """Find investors similar to a given investor based on 
characteristics""" + """Find investors similar to a given investor based on characteristics + + Returns investor-fund combinations as separate rows. + """ # Get the target investor target_investor = ( @@ -248,6 +383,7 @@ def find_similar_investors( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id == investor_id) .first() @@ -266,6 +402,7 @@ def find_similar_investors( selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id != investor_id) .all() @@ -338,13 +475,71 @@ def find_similar_investors( scored_investors.sort(key=lambda x: x[0], reverse=True) similar_investors = [inv for score, inv in scored_investors[:limit]] - # Transform to InvestorData format - return [ - InvestorData( - investor=inv, - portfolio_companies=inv.portfolio_companies, - team_members=inv.team_members, - sectors=inv.sectors, - ) - for inv in similar_investors - ] + # Transform to InvestorFundData format (one row per investor-fund combination) + investor_fund_list = [] + for investor in similar_investors: + # If investor has funds, create one entry per fund + if investor.funds: + for fund in investor.funds: + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields + fund_id=fund.id, + fund_name=fund.fund_name, + fund_size=fund.fund_size, + 
fund_size_source_url=fund.fund_size_source_url, + check_size_lower=fund.check_size_lower, + check_size_upper=fund.check_size_upper, + geographic_focus=fund.geographic_focus, + investment_stage_focus=fund.investment_stage_focus, + sector_focus=fund.sector_focus, + # Related data + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) + else: + # If no funds, create one entry with null fund fields + investor_fund_data = InvestorFundData( + # Investor fields + investor_id=investor.id, + investor_name=investor.name, + investor_description=investor.description, + investor_website=investor.website, + investor_headquarters=investor.headquarters, + aum=investor.aum, + aum_as_of_date=investor.aum_as_of_date, + aum_source_url=investor.aum_source_url, + investment_thesis=investor.investment_thesis, + portfolio_highlights=investor.portfolio_highlights, + number_of_investments=investor.number_of_investments, + # Fund fields (null) + fund_id=None, + fund_name=None, + fund_size=None, + fund_size_source_url=None, + check_size_lower=None, + check_size_upper=None, + geographic_focus=None, + investment_stage_focus=None, + sector_focus=None, + # Related data + portfolio_companies=investor.portfolio_companies, + team_members=investor.team_members, + sectors=investor.sectors, + ) + investor_fund_list.append(investor_fund_data) + + return investor_fund_list diff --git a/app/schemas/router_schemas.py b/app/schemas/router_schemas.py index 1d1a685..942f2b1 100644 --- a/app/schemas/router_schemas.py +++ b/app/schemas/router_schemas.py @@ -32,6 +32,25 @@ class InvestorMemberSchema(BaseModel): from_attributes = True +class FundSchema(BaseModel): + id: int + fund_name: str | None + fund_size: int | None # Changed to int for numerical filtering + fund_size_source_url: str | None + check_size_lower: int | None # NEW: Lower bound of check size range + check_size_upper: int | None # 
NEW: Upper bound of check size range + source_url: str | None + source_provider: str | None + geographic_focus: List[str] | None + investment_stage_focus: List[str] | None + sector_focus: List[str] | None + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + + class Config: + from_attributes = True + + class CompanyMemberSchema(BaseModel): id: int name: Optional[str] @@ -76,12 +95,53 @@ class InvestorSchema(BaseModel): class InvestorData(BaseModel): - """Comprehensive investor data schema for LLM processing""" + """Comprehensive investor data schema - used for individual investor requests""" investor: InvestorSchema portfolio_companies: List[CompanySchema] team_members: List[InvestorMemberSchema] sectors: List[SectorSchema] + funds: List[FundSchema] + + class Config: + from_attributes = True + + +class InvestorFundData(BaseModel): + """Investor-Fund combined data - used for list/filter requests + + Each row represents one investor-fund combination. + An investor with 3 funds will appear as 3 separate entries. 
+ """ + + # Investor fields + investor_id: int + investor_name: str + investor_description: Optional[str] + investor_website: Optional[str] + investor_headquarters: Optional[str] + aum: int | None + aum_as_of_date: str | None + aum_source_url: str | None + investment_thesis: List[str] | None + portfolio_highlights: List[str] | None + number_of_investments: int | None + + # Fund fields + fund_id: int | None + fund_name: str | None + fund_size: int | None # Changed to int for numerical filtering + fund_size_source_url: str | None + check_size_lower: int | None # NEW: Lower bound of check size range + check_size_upper: int | None # NEW: Upper bound of check size range + geographic_focus: List[str] | None + investment_stage_focus: List[str] | None + sector_focus: List[str] | None + + # Related data + portfolio_companies: List[CompanySchema] + team_members: List[InvestorMemberSchema] + sectors: List[SectorSchema] class Config: from_attributes = True @@ -99,3 +159,9 @@ class CompanyData(BaseModel): # Renamed from CompaniesData for consistency class InvestorList(BaseModel): investors: List[InvestorData] + + +class InvestorFundList(BaseModel): + """List of investor-fund combinations""" + + investor_funds: List[InvestorFundData] diff --git a/app/services/llm_parser.py b/app/services/llm_parser.py index 4111434..7fbd46d 100644 --- a/app/services/llm_parser.py +++ b/app/services/llm_parser.py @@ -27,6 +27,15 @@ class CurrencyConversion(BaseModel): notes: str = "" +class CheckSizeRange(BaseModel): + """Schema for LLM check size range parsing from estimated investment size""" + + lower_bound_usd: int = 0 + upper_bound_usd: int = 0 + confidence: str = "high" # high, medium, low + notes: str = "" + + class InvestorProcessor: def __init__(self): self.llm = ChatOpenAI( @@ -36,10 +45,12 @@ class InvestorProcessor: temperature=0, ) - # Only use structured LLM for currency conversion + # Structured LLMs for specific parsing tasks self.currency_converter_llm = 
self.llm.with_structured_output( CurrencyConversion ) + self.check_size_parser_llm = self.llm.with_structured_output(CheckSizeRange) + # Keep legacy structured LLMs for backward compatibility self.investor_structured_llm = self.llm.with_structured_output(InvestorData) self.company_structured_llm = self.llm.with_structured_output(CompanyData) @@ -77,6 +88,57 @@ Return only the USD integer amount with current exchange rates.""" print(f"Error converting currency '{amount_str}': {e}") return None + async def parse_check_size_range( + self, estimated_investment_str: str + ) -> tuple[Optional[int], Optional[int]]: + """ + Use LLM to parse check size range from estimated investment size string. + Returns tuple of (lower_bound_usd, upper_bound_usd). + + Handles formats like: + - "EUR 1,000 to 2,000" + - "$100K-$500K" + - "Between $1M and $5M" + - "Up to EUR 10 million" + - "$2M typical" + """ + if ( + not estimated_investment_str + or estimated_investment_str == "Not Available" + or estimated_investment_str == "0" + ): + return None, None + + try: + prompt = f"""Parse this check size/investment range into lower and upper bounds in USD as integers. 
+ +Input: {estimated_investment_str} + +Instructions: +- If it's a range (e.g., "EUR 1M to 5M"), extract both bounds +- If it's a single amount (e.g., "$2M typical"), use it as both lower and upper +- If it says "up to X", use 0 as lower and X as upper +- Convert all currencies to USD using current exchange rates +- Return integers (whole numbers, no decimals) + +Examples: +- "EUR 1,000 to 2,000" -> lower: 1100, upper: 2200 +- "$100K-$500K" -> lower: 100000, upper: 500000 +- "Between $1M and $5M" -> lower: 1000000, upper: 5000000 +- "Up to EUR 10 million" -> lower: 0, upper: 11000000 +- "$2M typical" -> lower: 2000000, upper: 2000000 +- "GBP 500K-2M" -> lower: 600000, upper: 2400000 + +Return the lower and upper bounds in USD.""" + + result = await self.check_size_parser_llm.ainvoke(prompt) + lower = result.lower_bound_usd if result.lower_bound_usd > 0 else None + upper = result.upper_bound_usd if result.upper_bound_usd > 0 else None + return lower, upper + except Exception as e: + print(f"Error parsing check size range '{estimated_investment_str}': {e}") + return None, None + def parse_json_profile(self, json_str: str) -> Optional[dict]: """ Manually parse the JSON profile from the CSV. 
@@ -157,7 +219,8 @@ Return only the USD integer amount with current exchange rates.""" "fund_name": fund.get("fundName"), "fund_size": None, "fund_size_source_url": fund.get("fundSizeSourceUrl"), - "estimated_investment_size": None, + "check_size_lower": None, + "check_size_upper": None, "source_url": fund.get("sourceUrl"), "source_provider": fund.get("sourceProvider"), "geographic_focus": fund.get("geographicFocus", []), @@ -165,19 +228,23 @@ Return only the USD integer amount with current exchange rates.""" "sector_focus": fund.get("sectorFocus", []), } - # Convert fund size to USD + # Convert fund size to USD integer fund_size_str = fund.get("fundSize") if fund_size_str and fund_size_str != "Not Available": fund_size_usd = await self.convert_to_usd(fund_size_str) if fund_size_usd: - fund_data["fund_size"] = str(fund_size_usd) + fund_data["fund_size"] = fund_size_usd # Store as integer - # Convert estimated investment size + # Parse check size range from estimated investment size est_size_str = fund.get("estimatedInvestmentSize") if est_size_str and est_size_str != "Not Available": - est_size_usd = await self.convert_to_usd(est_size_str) - if est_size_usd: - fund_data["estimated_investment_size"] = str(est_size_usd) + check_lower, check_upper = await self.parse_check_size_range( + est_size_str + ) + if check_lower is not None: + fund_data["check_size_lower"] = check_lower + if check_upper is not None: + fund_data["check_size_upper"] = check_upper investor_data["funds"].append(fund_data) @@ -430,11 +497,10 @@ Return only the USD integer amount with current exchange rates.""" fund = FundTable( investor_id=investor.id, fund_name=fund_data.get("fund_name"), - fund_size=fund_data.get("fund_size"), + fund_size=fund_data.get("fund_size"), # Now an integer fund_size_source_url=fund_data.get("fund_size_source_url"), - estimated_investment_size=fund_data.get( - "estimated_investment_size" - ), + check_size_lower=fund_data.get("check_size_lower"), # NEW + 
check_size_upper=fund_data.get("check_size_upper"), # NEW source_url=fund_data.get("source_url"), source_provider=fund_data.get("source_provider"), geographic_focus=fund_data.get("geographic_focus"), diff --git a/app/services/querying.py b/app/services/querying.py index 3078e18..27df87a 100644 --- a/app/services/querying.py +++ b/app/services/querying.py @@ -95,6 +95,7 @@ class QueryProcessor: selectinload(InvestorTable.portfolio_companies), selectinload(InvestorTable.team_members), selectinload(InvestorTable.sectors), + selectinload(InvestorTable.funds), ) .filter(InvestorTable.id.in_(investor_ids)) ) @@ -109,6 +110,7 @@ class QueryProcessor: portfolio_companies=investor.portfolio_companies, team_members=investor.team_members, sectors=investor.sectors, + funds=investor.funds, ) investor_data_list.append(investor_data) diff --git a/preprocessor/migrate_fund_schema.py b/preprocessor/migrate_fund_schema.py new file mode 100644 index 0000000..dae12bf --- /dev/null +++ b/preprocessor/migrate_fund_schema.py @@ -0,0 +1,159 @@ +""" +Migration script to update FundTable schema: +- Change fund_size from VARCHAR to INTEGER +- Remove estimated_investment_size column +- Add check_size_lower INTEGER column +- Add check_size_upper INTEGER column +""" + +import sys +from pathlib import Path + +# Add preprocessor to path +sys.path.insert(0, str(Path(__file__).parent)) + +from models import engine +from sqlalchemy import text + + +def migrate_fund_table(): + """ + Migrate the funds table to add check_size fields and update fund_size type. + + SQLite doesn't support ALTER COLUMN directly, so we need to: + 1. Create new table with correct schema + 2. Copy data from old table + 3. Drop old table + 4. 
Rename new table + """ + + print("πŸ”„ Starting fund table migration...") + + with engine.connect() as conn: + # Start transaction + trans = conn.begin() + + try: + # Check if migration is needed + result = conn.execute(text("PRAGMA table_info(funds)")) + columns = {row[1]: row[2] for row in result} + + if "check_size_lower" in columns and "check_size_upper" in columns: + print("βœ… Migration already applied - check_size columns exist") + return + + print("πŸ“Š Current columns:", list(columns.keys())) + + # Create new table with updated schema + print("\n1️⃣ Creating new funds table with updated schema...") + conn.execute( + text(""" + CREATE TABLE IF NOT EXISTS funds_new ( + id INTEGER PRIMARY KEY, + investor_id INTEGER NOT NULL, + fund_name VARCHAR, + fund_size INTEGER, + fund_size_source_url VARCHAR, + check_size_lower INTEGER, + check_size_upper INTEGER, + source_url VARCHAR, + source_provider VARCHAR, + geographic_focus JSON, + investment_stage_focus JSON, + sector_focus JSON, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP NOT NULL, + updated_at DATETIME, + FOREIGN KEY (investor_id) REFERENCES investors(id) + ) + """) + ) + + # Copy data from old table to new table + print("2️⃣ Copying data from old table...") + + # Check if old estimated_investment_size column exists + if "estimated_investment_size" in columns: + # We have estimated_investment_size but it's a string + # We'll set check_size fields to NULL for now - they'll be repopulated when re-parsing + conn.execute( + text(""" + INSERT INTO funds_new ( + id, investor_id, fund_name, fund_size, fund_size_source_url, + check_size_lower, check_size_upper, + source_url, source_provider, + geographic_focus, investment_stage_focus, sector_focus, + created_at, updated_at + ) + SELECT + id, investor_id, fund_name, + CAST(fund_size AS INTEGER) as fund_size, + fund_size_source_url, + NULL as check_size_lower, + NULL as check_size_upper, + source_url, source_provider, + geographic_focus, investment_stage_focus, 
sector_focus, + created_at, updated_at + FROM funds + """) + ) + else: + # No estimated_investment_size column (fresh install or already migrated partially) + conn.execute( + text(""" + INSERT INTO funds_new ( + id, investor_id, fund_name, fund_size, fund_size_source_url, + check_size_lower, check_size_upper, + source_url, source_provider, + geographic_focus, investment_stage_focus, sector_focus, + created_at, updated_at + ) + SELECT + id, investor_id, fund_name, + CAST(fund_size AS INTEGER) as fund_size, + fund_size_source_url, + NULL as check_size_lower, + NULL as check_size_upper, + source_url, source_provider, + geographic_focus, investment_stage_focus, sector_focus, + created_at, updated_at + FROM funds + """) + ) + + rows_copied = conn.execute( + text("SELECT COUNT(*) FROM funds_new") + ).fetchone()[0] + print(f" βœ… Copied {rows_copied} rows") + + # Drop old table + print("3️⃣ Dropping old funds table...") + conn.execute(text("DROP TABLE funds")) + + # Rename new table + print("4️⃣ Renaming funds_new to funds...") + conn.execute(text("ALTER TABLE funds_new RENAME TO funds")) + + # Commit transaction + trans.commit() + + print("\nβœ… Migration completed successfully!") + print("\nπŸ“ Summary:") + print(" - fund_size: VARCHAR β†’ INTEGER") + print(" - estimated_investment_size: REMOVED") + print(" - check_size_lower: ADDED (INTEGER)") + print(" - check_size_upper: ADDED (INTEGER)") + print(f" - {rows_copied} fund records migrated") + + print( + "\n⚠️ Note: check_size_lower and check_size_upper are NULL for existing records." 
+ ) + print(" Run the investor CSV parser again to populate these fields.") + + except Exception as e: + trans.rollback() + print(f"\n❌ Migration failed: {e}") + raise + + +if __name__ == "__main__": + migrate_fund_table() diff --git a/preprocessor/models.py b/preprocessor/models.py index bf0073b..d768803 100644 --- a/preprocessor/models.py +++ b/preprocessor/models.py @@ -223,11 +223,15 @@ class FundTable(Base, TimestampMixin): # Fund details fund_name = Column(String, nullable=True) - fund_size = Column(String, nullable=True) # Store as string to preserve currency + fund_size = Column( + Integer, nullable=True + ) # Store as integer for numerical filtering fund_size_source_url = Column(String, nullable=True) - estimated_investment_size = Column( - String, nullable=True - ) # e.g., "EUR 1,000 to 2,000" + + # Check size range (parsed from estimated_investment_size by LLM) + check_size_lower = Column(Integer, nullable=True) + check_size_upper = Column(Integer, nullable=True) + source_url = Column(String, nullable=True) source_provider = Column(String, nullable=True) # e.g., "Perplexity" diff --git a/preprocessor/version_two.db b/preprocessor/version_two.db index 815c5db..174cc40 100644 Binary files a/preprocessor/version_two.db and b/preprocessor/version_two.db differ diff --git a/test_company_parser.py b/test_company_parser.py deleted file mode 100644 index 515c41a..0000000 --- a/test_company_parser.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for the company parser with manual JSON parsing. 
-""" - -import asyncio -import os -import sys - -sys.path.insert(0, "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/app") - -import pandas as pd -from dotenv import load_dotenv -from services.llm_parser import InvestorProcessor - -# Load environment variables from root directory -load_dotenv("/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/.env") - -# Also check if API key is set (not needed for companies now but for consistency) -if not os.getenv("OPENROUTER_API_KEY"): - print("⚠️ WARNING: OPENROUTER_API_KEY not found in environment") - print("This is OK for companies (no LLM needed), but will fail for investors") - - -async def test_parser(): - """Test the new company parser with a small sample""" - print("πŸ§ͺ Testing Manual Company JSON Parser (No LLM)\n") - - # Load the company data - df = pd.read_csv( - "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/data/300 Companies data.csv" - ) - - # Process just the first 3 rows for testing - test_df = df.head(3) - - processor = InvestorProcessor() - - print(f"Processing {len(test_df)} test companies...\n") - results = await processor.parse_companies(test_df, save_to_db=False) - - print("\n" + "=" * 80) - print("πŸ“Š TEST RESULTS") - print("=" * 80) - - for idx, result in enumerate(results, 1): - print(f"\n{idx}. 
{result.get('name')}") - print(f" Website: {result.get('website')}") - print(f" Location: {result.get('location')}") - print(f" Industry: {result.get('industry')}") - print( - f" Founded: {result.get('founded_year')}" - if result.get("founded_year") - else " Founded: Unknown" - ) - print(f" Executives: {len(result.get('key_executives', []))}") - if result.get("key_executives"): - for exec_member in result.get("key_executives", [])[:3]: # Show first 3 - print(f" - {exec_member.get('name')} ({exec_member.get('title')})") - print(f" Investors: {len(result.get('investor_names', []))}") - if result.get("investor_names"): - print( - f" - {', '.join(result.get('investor_names', [])[:5])}" - ) # Show first 5 - print(f" Client Categories: {len(result.get('client_categories', []))}") - if result.get("client_categories"): - print( - f" - {', '.join(result.get('client_categories', [])[:3])}" - ) # Show first 3 - - print("\n" + "=" * 80) - print(f"βœ… Successfully processed {len(results)}/{len(test_df)} companies") - print("πŸŽ‰ No LLM calls needed - 100% manual parsing!") - print("=" * 80) - - -if __name__ == "__main__": - asyncio.run(test_parser()) diff --git a/verify_schema.py b/verify_schema.py deleted file mode 100644 index cbe20e4..0000000 --- a/verify_schema.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 -""" -Quick test to verify the database schema matches between app and preprocessor. 
-""" - -import sys - -sys.path.insert(0, "/home/oluwasanmi/Documents/Work/MKD/anton_wireframe/app") - -from db.db import engine -from sqlalchemy import inspect - -# Get table info -inspector = inspect(engine) - -print("πŸ” Checking database schema...") -print(f"Database: {engine.url}\n") - -# Check investors table -if "investors" in inspector.get_table_names(): - print("βœ… 'investors' table exists") - columns = inspector.get_columns("investors") - - print("\nColumns in 'investors' table:") - for col in columns: - print(f" - {col['name']}: {col['type']}") - - # Check for stage_focus - column_names = [col["name"] for col in columns] - if "stage_focus" in column_names: - print("\n⚠️ WARNING: 'stage_focus' column still exists in database!") - print(" This should be removed as it's deprecated.") - else: - print("\nβœ… Good: 'stage_focus' column not in database (as expected)") - - # Check for required columns - required_columns = [ - "aum", - "investment_thesis", - "portfolio_highlights", - "linked_documents", - "researcher_notes", - "sources", - ] - missing = [col for col in required_columns if col not in column_names] - - if missing: - print(f"\n❌ Missing columns: {', '.join(missing)}") - else: - print("\nβœ… All required enriched columns present") - -else: - print("❌ 'investors' table not found!") - -print("\n" + "=" * 60) -print("Schema verification complete!") -print("=" * 60)