Refactor code structure for improved readability and maintainability
This commit is contained in:
@@ -0,0 +1,202 @@
|
|||||||
|
# ✅ Base Database Ingestion Complete!
|
||||||
|
|
||||||
|
**Date:** October 5, 2025
|
||||||
|
**Database:** `version_two.db`
|
||||||
|
|
||||||
|
## 📊 Summary Statistics
|
||||||
|
|
||||||
|
| Entity | Count |
|
||||||
|
| ---------------------------------- | ------ |
|
||||||
|
| **Investors** | 9,315 |
|
||||||
|
| **Companies** | 6,877 |
|
||||||
|
| **Sectors** | 639 |
|
||||||
|
| **Investor-Company Relationships** | 22,548 |
|
||||||
|
| **Investor-Sector Relationships** | 75,307 |
|
||||||
|
|
||||||
|
## 🎯 Top Investors by Portfolio Size
|
||||||
|
|
||||||
|
1. **Bpifrance** - 211 companies
|
||||||
|
2. **European Innovation Council** - 183 companies
|
||||||
|
3. **Business Growth Fund** - 84 companies
|
||||||
|
4. **HTGF (High-Tech Gruenderfonds)** - 74 companies
|
||||||
|
5. **EIT InnoEnergy** - 72 companies
|
||||||
|
|
||||||
|
## 📁 Source Files
|
||||||
|
|
||||||
|
- **Companies CSV**: 13,027 rows
|
||||||
|
- **Investors CSV**: 11,045 rows
|
||||||
|
- **Investors Ingested**: 9,315 (1,730 of the 11,045 source rows were filtered out as duplicates or invalid entries)
|
||||||
|
|
||||||
|
## 🗃️ Database Structure
|
||||||
|
|
||||||
|
### Tables Created:
|
||||||
|
|
||||||
|
- ✅ `investors` - Core investor data
|
||||||
|
- ✅ `companies` - Portfolio companies
|
||||||
|
- ✅ `sectors` - Industry sectors
|
||||||
|
- ✅ `funds` - (Empty, will be populated during enrichment)
|
||||||
|
- ✅ `investor_members` - (Empty, will be populated during enrichment)
|
||||||
|
- ✅ `company_members` - Company team members
|
||||||
|
- ✅ `investment_stages` - Investment stage definitions
|
||||||
|
- ✅ Association tables for relationships
|
||||||
|
|
||||||
|
### Current Data:
|
||||||
|
|
||||||
|
- ✅ Investor names and basic info (website, investment count)
|
||||||
|
- ✅ Company details (name, location, industry, description)
|
||||||
|
- ✅ Sectors extracted from company industries
|
||||||
|
- ✅ Investor → Company relationships (who invested in what)
|
||||||
|
- ✅ Investor → Sector relationships (derived from portfolio)
|
||||||
|
|
||||||
|
### Missing (To Be Added via Enrichment):
|
||||||
|
|
||||||
|
- ⏳ Investor headquarters
|
||||||
|
- ⏳ AUM (Assets Under Management) details
|
||||||
|
- ⏳ Investment thesis
|
||||||
|
- ⏳ Portfolio highlights
|
||||||
|
- ⏳ Fund details (multiple funds per investor)
|
||||||
|
- ⏳ Senior leadership/team members
|
||||||
|
- ⏳ Research notes and sources
|
||||||
|
|
||||||
|
## 🔄 Next Steps
|
||||||
|
|
||||||
|
### 1. Prepare Enriched Data CSV
|
||||||
|
|
||||||
|
Your enriched CSV should have this structure:
|
||||||
|
|
||||||
|
```csv
|
||||||
|
investor_name,enriched_data
|
||||||
|
"212","{""websiteURL"": ""..."", ""funds"": [...], ...}"
|
||||||
|
"301","{...}"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Run Enrichment Script
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd preprocessor
|
||||||
|
python enrich_investors.py enriched_investors.csv investor_name enriched_data
|
||||||
|
```
|
||||||
|
|
||||||
|
This will:
|
||||||
|
|
||||||
|
- ✅ Add fund details (multiple funds per investor)
|
||||||
|
- ✅ Update AUM information
|
||||||
|
- ✅ Add investment thesis
|
||||||
|
- ✅ Add portfolio highlights
|
||||||
|
- ✅ Add senior leadership
|
||||||
|
- ✅ Add research notes and sources
|
||||||
|
|
||||||
|
### 3. Verify Enriched Data
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 << 'EOF'
|
||||||
|
from models import InvestorTable, FundTable, get_db_session
|
||||||
|
session = get_db_session()
|
||||||
|
|
||||||
|
# Check enriched data
|
||||||
|
investor = session.query(InvestorTable).filter_by(name="Anaxago").first()
|
||||||
|
if investor:
|
||||||
|
print(f"Investor: {investor.name}")
|
||||||
|
print(f"HQ: {investor.headquarters}")
|
||||||
|
print(f"AUM: {investor.aum}")
|
||||||
|
print(f"Funds: {len(investor.funds)}")
|
||||||
|
for fund in investor.funds:
|
||||||
|
print(f" - {fund.fund_name}")
|
||||||
|
|
||||||
|
session.close()
|
||||||
|
EOF
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📝 Sample Queries
|
||||||
|
|
||||||
|
### Get Investor with Portfolio
|
||||||
|
|
||||||
|
```python
|
||||||
|
from models import InvestorTable, get_db_session
|
||||||
|
|
||||||
|
session = get_db_session()
|
||||||
|
investor = session.query(InvestorTable).filter_by(name="Bpifrance").first()
|
||||||
|
|
||||||
|
print(f"Investor: {investor.name}")
|
||||||
|
print(f"Website: {investor.website}")
|
||||||
|
print(f"Investments: {investor.number_of_investments}")
|
||||||
|
print(f"Portfolio Companies: {len(investor.portfolio_companies)}")
|
||||||
|
print(f"Sectors: {[s.name for s in investor.sectors[:5]]}")
|
||||||
|
|
||||||
|
session.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Companies by Sector
|
||||||
|
|
||||||
|
```python
|
||||||
|
from models import CompanyTable, SectorTable, get_db_session
|
||||||
|
|
||||||
|
session = get_db_session()
|
||||||
|
sector = session.query(SectorTable).filter_by(name="AgTech").first()
|
||||||
|
|
||||||
|
print(f"Sector: {sector.name}")
|
||||||
|
print(f"Companies: {len(sector.companies)}")
|
||||||
|
for company in sector.companies[:5]:
|
||||||
|
print(f" - {company.name}")
|
||||||
|
|
||||||
|
session.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Investor's Sector Distribution
|
||||||
|
|
||||||
|
```python
|
||||||
|
from models import InvestorTable, get_db_session
|
||||||
|
|
||||||
|
session = get_db_session()
|
||||||
|
investor = session.query(InvestorTable).filter_by(name="Bpifrance").first()
|
||||||
|
|
||||||
|
sectors = {}
|
||||||
|
for company in investor.portfolio_companies:
|
||||||
|
for sector in company.sectors:
|
||||||
|
sectors[sector.name] = sectors.get(sector.name, 0) + 1
|
||||||
|
|
||||||
|
# Top sectors
|
||||||
|
for sector, count in sorted(sectors.items(), key=lambda x: x[1], reverse=True)[:5]:
|
||||||
|
print(f"{sector}: {count} companies")
|
||||||
|
|
||||||
|
session.close()
|
||||||
|
```
|
||||||
|
|
||||||
|
## ⚠️ Known Issues
|
||||||
|
|
||||||
|
### Investors Not Found in DB
|
||||||
|
|
||||||
|
Some companies reference investors that weren't in the investors CSV:
|
||||||
|
|
||||||
|
- The Venture Collective
|
||||||
|
- Sarah Leary
|
||||||
|
- Transpose
|
||||||
|
- ND Capital
|
||||||
|
- InvestSud
|
||||||
|
- Third Swedish National Pension Fund
|
||||||
|
- Union Tech Ventures
|
||||||
|
- Vasuki Tech Fund
|
||||||
|
- MSA Novo
|
||||||
|
- And others...
|
||||||
|
|
||||||
|
These are likely individual angel investors or smaller funds not in the main investor list. They are recorded but not linked.
|
||||||
|
|
||||||
|
## 🔒 Backup
|
||||||
|
|
||||||
|
A backup of the database was created before ingestion:
|
||||||
|
|
||||||
|
- `version_two.db.backup_YYYYMMDD_HHMMSS` (the `YYYYMMDD_HHMMSS` suffix is a placeholder for the timestamp at which the backup was taken)
|
||||||
|
|
||||||
|
## 📧 Support
|
||||||
|
|
||||||
|
For issues or questions:
|
||||||
|
|
||||||
|
1. Check the logs for error messages
|
||||||
|
2. Verify CSV file formats
|
||||||
|
3. Ensure all required columns are present
|
||||||
|
4. Check for duplicate entries
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Status:** ✅ Base database created successfully
|
||||||
|
**Ready for:** Enrichment phase with detailed investor data
|
||||||
@@ -13,7 +13,8 @@ logger = logging.getLogger(__name__)
|
|||||||
# Import the schema
|
# Import the schema
|
||||||
init_database()
|
init_database()
|
||||||
|
|
||||||
#===================== Ingesting Original Data =====================#
|
|
||||||
|
# ===================== Ingesting Original Data =====================#
|
||||||
def parse_investor_names(investor_names_str):
|
def parse_investor_names(investor_names_str):
|
||||||
"""Parse comma-separated investor names and return a list"""
|
"""Parse comma-separated investor names and return a list"""
|
||||||
if pd.isna(investor_names_str) or investor_names_str == "":
|
if pd.isna(investor_names_str) or investor_names_str == "":
|
||||||
@@ -21,7 +22,9 @@ def parse_investor_names(investor_names_str):
|
|||||||
|
|
||||||
# Split by comma and clean whitespace
|
# Split by comma and clean whitespace
|
||||||
# investors = [name.strip() for name in str(investor_names_str).split(",")]
|
# investors = [name.strip() for name in str(investor_names_str).split(",")]
|
||||||
investors = [clean_name(name.strip()) for name in str(investor_names_str).split(",")]
|
investors = [
|
||||||
|
clean_name(name.strip()) for name in str(investor_names_str).split(",")
|
||||||
|
]
|
||||||
return [investor for investor in investors if investor]
|
return [investor for investor in investors if investor]
|
||||||
|
|
||||||
|
|
||||||
@@ -165,8 +168,8 @@ def ingest_data():
|
|||||||
if not existing_investor:
|
if not existing_investor:
|
||||||
investor = InvestorTable(
|
investor = InvestorTable(
|
||||||
name=investor_name,
|
name=investor_name,
|
||||||
# description=clean_string(row.get("Business model", "")),
|
description=clean_string(row.get("Business model", "")),
|
||||||
# geographic_focus=clean_string(row.get("HQ", "")),
|
headquarters=clean_string(row.get("HQ", "")),
|
||||||
website=parse_website(str(row.get("Website", "")).strip()),
|
website=parse_website(str(row.get("Website", "")).strip()),
|
||||||
number_of_investments=clean_integer(
|
number_of_investments=clean_integer(
|
||||||
row.get("Number of investments")
|
row.get("Number of investments")
|
||||||
@@ -305,8 +308,6 @@ def ingest_data():
|
|||||||
session.close()
|
session.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
ingest_data()
|
ingest_data()
|
||||||
# print(clean_name("A... Energi"))
|
# print(clean_name("A... Energi"))
|
||||||
|
|||||||
@@ -139,9 +139,7 @@ class InvestorTable(Base, TimestampMixin):
|
|||||||
headquarters = Column(String, nullable=True)
|
headquarters = Column(String, nullable=True)
|
||||||
|
|
||||||
# AUM fields
|
# AUM fields
|
||||||
aum = Column(
|
aum = Column(Integer, nullable=True) # Store as integer for numerical filtering
|
||||||
String, nullable=True
|
|
||||||
) # Store as string to preserve currency (e.g., "EUR 850,000,000")
|
|
||||||
aum_as_of_date = Column(String, nullable=True)
|
aum_as_of_date = Column(String, nullable=True)
|
||||||
aum_source_url = Column(String, nullable=True)
|
aum_source_url = Column(String, nullable=True)
|
||||||
|
|
||||||
@@ -317,7 +315,7 @@ class SectorTable(Base, TimestampMixin):
|
|||||||
)
|
)
|
||||||
|
|
||||||
projects = relationship(
|
projects = relationship(
|
||||||
"ProjectTable", secondary=project_sector_association, back_populates="projects"
|
"ProjectTable", secondary=project_sector_association, back_populates="sector"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Quick verification script for the database
|
||||||
|
"""
|
||||||
|
|
||||||
|
from models import CompanyTable, FundTable, InvestorTable, SectorTable, get_db_session
|
||||||
|
|
||||||
|
|
||||||
|
def verify_database():
    """Print a verification report for the ingested investor database.

    Opens a session with ``get_db_session()`` and prints, in order: record
    counts, relationship coverage, data-quality ratios, enrichment status and
    a small random sample of investors, followed by the suggested next step.
    The session is closed even if one of the queries raises.

    Returns:
        None. Everything is written to stdout.
    """
    # Kept function-local, as in the original script; only the sampling
    # section below needs it.
    import random

    def percent(part, total):
        """Return ``part`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
        # Guard against an empty investors table, which would otherwise
        # raise ZeroDivisionError in the data-quality section.
        return part / total * 100 if total else 0.0

    session = get_db_session()
    try:
        print("=" * 60)
        print("🔍 DATABASE VERIFICATION")
        print("=" * 60)

        # Count records
        investor_count = session.query(InvestorTable).count()
        company_count = session.query(CompanyTable).count()
        sector_count = session.query(SectorTable).count()
        fund_count = session.query(FundTable).count()

        print("\n📊 Record Counts:")
        print(f" Investors: {investor_count:,}")
        print(f" Companies: {company_count:,}")
        print(f" Sectors: {sector_count:,}")
        print(f" Funds: {fund_count:,}")

        # Check relationships: investors that have at least one related row.
        investors_with_companies = (
            session.query(InvestorTable)
            .filter(InvestorTable.portfolio_companies.any())
            .count()
        )
        investors_with_sectors = (
            session.query(InvestorTable).filter(InvestorTable.sectors.any()).count()
        )

        print("\n🔗 Relationships:")
        print(f" Investors with portfolio companies: {investors_with_companies:,}")
        print(f" Investors with sectors: {investors_with_sectors:,}")

        # Sample data quality checks
        investors_with_website = (
            session.query(InvestorTable)
            .filter(InvestorTable.website.isnot(None))
            .count()
        )
        investors_with_investments = (
            session.query(InvestorTable)
            .filter(
                InvestorTable.number_of_investments.isnot(None),
                InvestorTable.number_of_investments > 0,
            )
            .count()
        )

        print("\n✅ Data Quality:")
        print(
            f" Investors with website: {investors_with_website:,} "
            f"({percent(investors_with_website, investor_count):.1f}%)"
        )
        print(
            f" Investors with investment count: {investors_with_investments:,} "
            f"({percent(investors_with_investments, investor_count):.1f}%)"
        )

        # Check for enrichment readiness
        investors_with_aum = (
            session.query(InvestorTable).filter(InvestorTable.aum.isnot(None)).count()
        )
        investors_with_headquarters = (
            session.query(InvestorTable)
            .filter(InvestorTable.headquarters.isnot(None))
            .count()
        )
        investors_with_thesis = (
            session.query(InvestorTable)
            .filter(InvestorTable.investment_thesis.isnot(None))
            .count()
        )
        # Count investors that own at least one fund. The label below is about
        # investors, so the raw number of fund rows would be misleading.
        investors_with_funds = (
            session.query(InvestorTable).filter(InvestorTable.funds.any()).count()
        )

        print("\n🎯 Enrichment Status:")
        print(f" Investors with AUM: {investors_with_aum:,}")
        print(f" Investors with HQ: {investors_with_headquarters:,}")
        print(f" Investors with thesis: {investors_with_thesis:,}")
        print(f" Investors with funds: {investors_with_funds:,}")

        if fund_count == 0:
            print("\n⚠️ No funds found - enrichment needed!")

        # Show a random sample. Only the first 1000 rows returned by the query
        # are candidates, so this is not a uniform sample of the whole table.
        sample_investors = session.query(InvestorTable).limit(1000).all()
        sample = random.sample(sample_investors, min(3, len(sample_investors)))

        print("\n📋 Random Sample:")
        for inv in sample:
            print(f"\n {inv.name}")
            print(f" Website: {inv.website or 'N/A'}")
            # A stored value of 0 is falsy and therefore also shown as 'N/A'.
            print(f" Investments: {inv.number_of_investments or 'N/A'}")
            print(f" Portfolio: {len(inv.portfolio_companies)} companies")
            print(f" Sectors: {len(inv.sectors)} sectors")
            if inv.funds:
                print(f" Funds: {len(inv.funds)}")
    finally:
        # Release the session even when one of the queries above raises.
        session.close()

    print("\n" + "=" * 60)

    if fund_count == 0:
        print("📝 Next step: Run enrichment script")
        print(" python enrich_investors.py enriched_investors.csv")
    else:
        print("✅ Database is enriched and ready!")

    print("=" * 60)


if __name__ == "__main__":
    verify_database()
|
||||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user