diff --git a/preprocessor/INGESTION_COMPLETE.md b/preprocessor/INGESTION_COMPLETE.md new file mode 100644 index 0000000..e6a6813 --- /dev/null +++ b/preprocessor/INGESTION_COMPLETE.md @@ -0,0 +1,202 @@ +# ✅ Base Database Ingestion Complete! + +**Date:** October 5, 2025 +**Database:** `version_two.db` + +## 📊 Summary Statistics + +| Entity | Count | +| ---------------------------------- | ------ | +| **Investors** | 9,315 | +| **Companies** | 6,877 | +| **Sectors** | 639 | +| **Investor-Company Relationships** | 22,548 | +| **Investor-Sector Relationships** | 75,307 | + +## 🎯 Top Investors by Portfolio Size + +1. **Bpifrance** - 211 companies +2. **European Innovation Council** - 183 companies +3. **Business Growth Fund** - 84 companies +4. **HTGF (High-Tech Gruenderfonds)** - 74 companies +5. **EIT InnoEnergy** - 72 companies + +## 📁 Source Files + +- **Companies CSV**: 13,027 rows +- **Investors CSV**: 11,045 rows +- **Investors Ingested**: 9,315 (some duplicates/invalid entries filtered out) + +## 🗃️ Database Structure + +### Tables Created: + +- ✅ `investors` - Core investor data +- ✅ `companies` - Portfolio companies +- ✅ `sectors` - Industry sectors +- ✅ `funds` - (Empty, will be populated during enrichment) +- ✅ `investor_members` - (Empty, will be populated during enrichment) +- ✅ `company_members` - Company team members +- ✅ `investment_stages` - Investment stage definitions +- ✅ Association tables for relationships + +### Current Data: + +- ✅ Investor names and basic info (website, investment count) +- ✅ Company details (name, location, industry, description) +- ✅ Sectors extracted from company industries +- ✅ Investor → Company relationships (who invested in what) +- ✅ Investor → Sector relationships (derived from portfolio) + +### Missing (To Be Added via Enrichment): + +- ⏳ Investor headquarters +- ⏳ AUM (Assets Under Management) details +- ⏳ Investment thesis +- ⏳ Portfolio highlights +- ⏳ Fund 
details (multiple funds per investor) +- ⏳ Senior leadership/team members +- ⏳ Research notes and sources + +## 🔄 Next Steps + +### 1. Prepare Enriched Data CSV + +Your enriched CSV should have this structure: + +```csv +investor_name,enriched_data +"212","{\"websiteURL\": \"...\", \"funds\": [...], ...}" +"301","{...}" +``` + +### 2. Run Enrichment Script + +```bash +cd preprocessor +python enrich_investors.py enriched_investors.csv investor_name enriched_data +``` + +This will: + +- ✅ Add fund details (multiple funds per investor) +- ✅ Update AUM information +- ✅ Add investment thesis +- ✅ Add portfolio highlights +- ✅ Add senior leadership +- ✅ Add research notes and sources + +### 3. Verify Enriched Data + +```bash +python3 << 'EOF' +from models import InvestorTable, FundTable, get_db_session +session = get_db_session() + +# Check enriched data +investor = session.query(InvestorTable).filter_by(name="Anaxago").first() +if investor: + print(f"Investor: {investor.name}") + print(f"HQ: {investor.headquarters}") + print(f"AUM: {investor.aum}") + print(f"Funds: {len(investor.funds)}") + for fund in investor.funds: + print(f" - {fund.fund_name}") + +session.close() +EOF +``` + +## 📝 Sample Queries + +### Get Investor with Portfolio + +```python +from models import InvestorTable, get_db_session + +session = get_db_session() +investor = session.query(InvestorTable).filter_by(name="Bpifrance").first() + +print(f"Investor: {investor.name}") +print(f"Website: {investor.website}") +print(f"Investments: {investor.number_of_investments}") +print(f"Portfolio Companies: {len(investor.portfolio_companies)}") +print(f"Sectors: {[s.name for s in investor.sectors[:5]]}") + +session.close() +``` + +### Get Companies by Sector + +```python +from models import CompanyTable, SectorTable, get_db_session + +session = get_db_session() +sector = session.query(SectorTable).filter_by(name="AgTech").first() + +print(f"Sector: {sector.name}") +print(f"Companies: 
{len(sector.companies)}") +for company in sector.companies[:5]: + print(f" - {company.name}") + +session.close() +``` + +### Get Investor's Sector Distribution + +```python +from models import InvestorTable, get_db_session + +session = get_db_session() +investor = session.query(InvestorTable).filter_by(name="Bpifrance").first() + +sectors = {} +for company in investor.portfolio_companies: + for sector in company.sectors: + sectors[sector.name] = sectors.get(sector.name, 0) + 1 + +# Top sectors +for sector, count in sorted(sectors.items(), key=lambda x: x[1], reverse=True)[:5]: + print(f"{sector}: {count} companies") + +session.close() +``` + +## ⚠️ Known Issues + +### Investors Not Found in DB + +Some companies reference investors that weren't in the investors CSV: + +- The Venture Collective +- Sarah Leary +- Transpose +- ND Capital +- InvestSud +- Third Swedish National Pension Fund +- Union Tech Ventures +- Vasuki Tech Fund +- MSA Novo +- And others... + +These are likely individual angel investors or smaller funds not in the main investor list. They are recorded but not linked. + +## 🔒 Backup + +A backup of the database was created before ingestion: + +- `version_two.db.backup_YYYYMMDD_HHMMSS` + +## 📧 Support + +For issues or questions: + +1. Check the logs for error messages +2. Verify CSV file formats +3. Ensure all required columns are present +4. 
Check for duplicate entries + +--- + +**Status:** āœ… Base database created successfully +**Ready for:** Enrichment phase with detailed investor data diff --git a/preprocessor/main.py b/preprocessor/main.py index e6cbe14..9ba3447 100644 --- a/preprocessor/main.py +++ b/preprocessor/main.py @@ -13,7 +13,8 @@ logger = logging.getLogger(__name__) # Import the schema init_database() -#===================== Ingesting Original Data =====================# + +# ===================== Ingesting Original Data =====================# def parse_investor_names(investor_names_str): """Parse comma-separated investor names and return a list""" if pd.isna(investor_names_str) or investor_names_str == "": @@ -21,7 +22,9 @@ def parse_investor_names(investor_names_str): # Split by comma and clean whitespace # investors = [name.strip() for name in str(investor_names_str).split(",")] - investors = [clean_name(name.strip()) for name in str(investor_names_str).split(",")] + investors = [ + clean_name(name.strip()) for name in str(investor_names_str).split(",") + ] return [investor for investor in investors if investor] @@ -165,8 +168,8 @@ def ingest_data(): if not existing_investor: investor = InvestorTable( name=investor_name, - # description=clean_string(row.get("Business model", "")), - # geographic_focus=clean_string(row.get("HQ", "")), + description=clean_string(row.get("Business model", "")), + headquarters=clean_string(row.get("HQ", "")), website=parse_website(str(row.get("Website", "")).strip()), number_of_investments=clean_integer( row.get("Number of investments") @@ -305,8 +308,6 @@ def ingest_data(): session.close() - - if __name__ == "__main__": ingest_data() # print(clean_name("A... 
Energi")) diff --git a/preprocessor/models.py b/preprocessor/models.py index 650f961..bf0073b 100644 --- a/preprocessor/models.py +++ b/preprocessor/models.py @@ -139,9 +139,7 @@ class InvestorTable(Base, TimestampMixin): headquarters = Column(String, nullable=True) # AUM fields - aum = Column( - String, nullable=True - ) # Store as string to preserve currency (e.g., "EUR 850,000,000") + aum = Column(Integer, nullable=True) # Store as integer for numerical filtering aum_as_of_date = Column(String, nullable=True) aum_source_url = Column(String, nullable=True) @@ -317,7 +315,7 @@ class SectorTable(Base, TimestampMixin): ) projects = relationship( - "ProjectTable", secondary=project_sector_association, back_populates="projects" + "ProjectTable", secondary=project_sector_association, back_populates="sector" ) diff --git a/preprocessor/verify_database.py b/preprocessor/verify_database.py new file mode 100644 index 0000000..1ed2f7f --- /dev/null +++ b/preprocessor/verify_database.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +""" +Quick verification script for the database +""" + +from models import CompanyTable, FundTable, InvestorTable, SectorTable, get_db_session + + +def verify_database(): + session = get_db_session() + + print("=" * 60) + print("šŸ” DATABASE VERIFICATION") + print("=" * 60) + + # Count records + investor_count = session.query(InvestorTable).count() + company_count = session.query(CompanyTable).count() + sector_count = session.query(SectorTable).count() + fund_count = session.query(FundTable).count() + + print("\nšŸ“Š Record Counts:") + print(f" Investors: {investor_count:,}") + print(f" Companies: {company_count:,}") + print(f" Sectors: {sector_count:,}") + print(f" Funds: {fund_count:,}") + + # Check relationships + investors_with_companies = ( + session.query(InvestorTable) + .filter(InvestorTable.portfolio_companies.any()) + .count() + ) + + investors_with_sectors = ( + session.query(InvestorTable).filter(InvestorTable.sectors.any()).count() + ) 
+ + print("\nšŸ”— Relationships:") + print(f" Investors with portfolio companies: {investors_with_companies:,}") + print(f" Investors with sectors: {investors_with_sectors:,}") + + # Sample data quality checks + investors_with_website = ( + session.query(InvestorTable).filter(InvestorTable.website.isnot(None)).count() + ) + + investors_with_investments = ( + session.query(InvestorTable) + .filter( + InvestorTable.number_of_investments.isnot(None), + InvestorTable.number_of_investments > 0, + ) + .count() + ) + + print("\nāœ… Data Quality:") + print( + f" Investors with website: {investors_with_website:,} ({investors_with_website / investor_count * 100:.1f}%)" + ) + print( + f" Investors with investment count: {investors_with_investments:,} ({investors_with_investments / investor_count * 100:.1f}%)" + ) + + # Check for enrichment readiness + investors_with_aum = ( + session.query(InvestorTable).filter(InvestorTable.aum.isnot(None)).count() + ) + + investors_with_headquarters = ( + session.query(InvestorTable) + .filter(InvestorTable.headquarters.isnot(None)) + .count() + ) + + investors_with_thesis = ( + session.query(InvestorTable) + .filter(InvestorTable.investment_thesis.isnot(None)) + .count() + ) + + print("\nšŸŽÆ Enrichment Status:") + print(f" Investors with AUM: {investors_with_aum:,}") + print(f" Investors with HQ: {investors_with_headquarters:,}") + print(f" Investors with thesis: {investors_with_thesis:,}") + print(f" Investors with funds: {fund_count:,}") + + if fund_count == 0: + print("\nāš ļø No funds found - enrichment needed!") + + # Show a random sample + import random + + sample_investors = session.query(InvestorTable).limit(1000).all() + sample = random.sample(sample_investors, min(3, len(sample_investors))) + + print("\nšŸ“‹ Random Sample:") + for inv in sample: + print(f"\n {inv.name}") + print(f" Website: {inv.website or 'N/A'}") + print(f" Investments: {inv.number_of_investments or 'N/A'}") + print(f" Portfolio: 
{len(inv.portfolio_companies)} companies") + print(f" Sectors: {len(inv.sectors)} sectors") + if inv.funds: + print(f" Funds: {len(inv.funds)}") + + session.close() + + print("\n" + "=" * 60) + + if fund_count == 0: + print("šŸ“ Next step: Run enrichment script") + print(" python enrich_investors.py enriched_investors.csv") + else: + print("āœ… Database is enriched and ready!") + + print("=" * 60) + + +if __name__ == "__main__": + verify_database() diff --git a/preprocessor/version_two.db b/preprocessor/version_two.db index e3d9d2b..2d3c8c9 100644 Binary files a/preprocessor/version_two.db and b/preprocessor/version_two.db differ diff --git a/preprocessor/version_two.db.backup_20251005_191749 b/preprocessor/version_two.db.backup_20251005_191749 new file mode 100644 index 0000000..e3d9d2b Binary files /dev/null and b/preprocessor/version_two.db.backup_20251005_191749 differ