2025-10-05 19:16:03 +01:00
|
|
|
import logging
|
|
|
|
|
import re
|
|
|
|
|
import unicodedata
|
|
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
|
|
|
|
|
from sqlalchemy.orm import sessionmaker
|
|
|
|
|
|
|
|
|
|
# Set up logging
|
|
|
|
|
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
# Initialise the database via models.init_database() (runs at import time)
|
|
|
|
|
init_database()
|
|
|
|
|
|
2025-10-06 12:57:08 +01:00
|
|
|
|
|
|
|
|
# ===================== Ingesting Original Data =====================#
|
2025-10-05 19:16:03 +01:00
|
|
|
def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into a list of cleaned names.

    Missing values (NaN/None) and the empty string give an empty list. Each
    comma-separated piece is stripped and passed through ``clean_name``;
    pieces that ``clean_name`` rejects (returns a falsy value for) are dropped.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []

    names = []
    for raw_piece in str(investor_names_str).split(","):
        cleaned_piece = clean_name(raw_piece.strip())
        if cleaned_piece:
            names.append(cleaned_piece)
    return names
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_industries(industries_str):
    """Split a comma-separated industries string into a list of names.

    Missing values (NaN/None) and the empty string give an empty list.
    Surrounding whitespace is removed from every piece and empty pieces
    (e.g. produced by consecutive commas) are discarded.
    """
    nothing_to_parse = pd.isna(industries_str) or industries_str == ""
    if nothing_to_parse:
        return []

    trimmed_pieces = (piece.strip() for piece in str(industries_str).split(","))
    return [piece for piece in trimmed_pieces if piece]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_special_characters(text):
    """Reduce *text* to plain ASCII letters, digits, spaces, hyphens and periods.

    Falsy input (None, "", 0, ...) is returned unchanged. Otherwise the value
    is converted to ``str`` and processed in this order:

    1. literal "..." and then ".." sequences are removed;
    2. the text is NFKD-normalised and encoded to ASCII with errors ignored,
       so accented letters keep their base letter and anything without an
       ASCII form is dropped;
    3. every character other than ASCII letters, digits, whitespace, "-" and
       "." is removed;
    4. whitespace runs collapse to a single space and the ends are trimmed.
    """
    if not text:
        return text

    # Ellipsis-like dot runs go first, longest pattern before the shorter one.
    without_ellipsis = str(text).replace("...", "")
    without_ellipsis = without_ellipsis.replace("..", "")

    # Decompose accented characters, then discard whatever is not ASCII.
    decomposed = unicodedata.normalize("NFKD", without_ellipsis)
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    # Keep only the allowed character set.
    allowed_only = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_only)

    # Normalise spacing.
    return re.sub(r"\s+", " ", allowed_only).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_string(value):
    """Return a cleaned string for *value*, or None when it carries no data.

    A value counts as "no data" when it is NaN/None, the empty string, or one
    of the placeholder spellings "nan", "null", "none", "0", "0.0" (compared
    case-insensitively). Anything else is stripped and passed through
    ``clean_special_characters``.

    The placeholder check is repeated on the cleaned text, case-insensitively,
    so inputs that only become a placeholder after cleaning (for example
    " NULL " or "NaN!") are also mapped to None instead of being returned as
    text. An empty cleaning result yields None as well.
    """
    null_markers = ("nan", "null", "none", "0", "0.0")

    if pd.isna(value) or value == "" or str(value).lower() in null_markers:
        return None

    cleaned = clean_special_characters(str(value).strip())

    # Re-check after cleaning: whitespace/punctuation may have hidden a marker.
    if not cleaned or cleaned.lower() in null_markers:
        return None

    return cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_name(value):
    """Return a cleaned company/investor name, or None when there is no name.

    A value counts as "no name" when it is NaN/None, the empty string, or one
    of the placeholder spellings "nan", "null", "none", "0", "0.0" (compared
    case-insensitively, before and after cleaning).

    Cleaning steps, in order:

    1. strip, NFKD-normalise and encode to ASCII with errors ignored, so
       accented letters keep their base letter;
    2. drop every character other than ASCII letters, digits, whitespace,
       "-", ".", "(", ")" and "&";
    3. remove runs of two or more periods (ellipses such as "A... Energi"),
       leaving single periods such as "U.S." intact;
    4. collapse whitespace runs to one space;
    5. trim leading/trailing periods and spaces together, so that removing a
       trailing period cannot leave a dangling space behind.
    """
    null_markers = ("nan", "null", "none", "0", "0.0")

    if pd.isna(value) or value == "" or str(value).lower() in null_markers:
        return None

    text = str(value).strip()

    # Normalise unicode and keep only what survives an ASCII encoding.
    normalized = unicodedata.normalize("NFKD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # More permissive than clean_special_characters: parentheses and
    # ampersands are common in business names.
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)

    # Remove ellipses in one pass. This runs after the character filter so
    # that periods made adjacent by the filter are caught too, and before the
    # whitespace collapse so no double space is left where the dots were.
    cleaned = re.sub(r"\.{2,}", "", cleaned)

    cleaned = re.sub(r"\s+", " ", cleaned)
    cleaned = cleaned.strip(". ")

    if not cleaned or cleaned.lower() in null_markers:
        return None

    return cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_integer(value):
    """Return *value* as a positive int, or None when it is missing or invalid.

    NaN/None and the placeholder spellings "nan", "null", "none", "", "0",
    "0.0" (case-insensitive) give None. Other values are converted with
    ``int(float(value))``, i.e. fractional parts are truncated. The result is
    returned only when it is greater than zero; zero and negative results
    give None.

    Values that cannot be converted also give None. This includes infinities
    ("inf", ``float("inf")``), for which ``int()`` raises OverflowError.
    """
    if pd.isna(value) or str(value).lower() in ("nan", "null", "none", "", "0", "0.0"):
        return None

    try:
        number = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() cannot represent +/- infinity.
        return None

    return number if number > 0 else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def parse_website(website_str: str):
    """Normalise a "<scheme>:<rest>" website value to an "https:" URL.

    The text before the first colon is discarded and replaced with "https",
    so "http://example.com" becomes "https://example.com". Only the first
    colon is treated as the separator, which keeps URLs that contain further
    colons (for example a port, "https://example.com:8080") intact.

    Returns None when the input is not a string, contains no colon, or when
    everything after the first colon is exactly "0".
    """
    if not isinstance(website_str, str):
        return None

    _scheme, separator, remainder = website_str.partition(":")
    if not separator:
        # No colon at all: not in the expected "<scheme>:<rest>" form.
        return None

    if remainder == "0":
        return None

    return "https:" + remainder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ingest_investors(session, investors_df):
    """Add an InvestorTable row for every new investor name; return how many were added."""
    added = 0

    for index, row in investors_df.iterrows():
        try:
            investor_name = clean_name(row.get("Filtered investor names", ""))
            if not investor_name:
                continue

            # Skip names that are already stored (or pending in this session).
            already_present = (
                session.query(InvestorTable).filter_by(name=investor_name).first()
            )
            if already_present:
                continue

            session.add(
                InvestorTable(
                    name=investor_name,
                    description=clean_string(row.get("Business model", "")),
                    headquarters=clean_string(row.get("HQ", "")),
                    website=parse_website(str(row.get("Website", "")).strip()),
                    number_of_investments=clean_integer(
                        row.get("Number of investments")
                    ),
                )
            )
            added += 1

            # Commit in batches of 1000 new investors.
            if added % 1000 == 0:
                session.commit()
                print(f" Committed {added} investors")

        except Exception as e:
            logger.error(f"Error processing investor {index}: {e}")
            # Reset the session so later rows can still be processed, as the
            # company loop does. This discards work not yet committed in the
            # current batch.
            session.rollback()
            continue

    session.commit()
    return added


def _attach_investors(session, company, investor_names_str):
    """Link *company* to every already-stored investor named in *investor_names_str*."""
    for investor_name in parse_investor_names(investor_names_str):
        investor = session.query(InvestorTable).filter_by(name=investor_name).first()

        if investor:
            if company not in investor.portfolio_companies:
                investor.portfolio_companies.append(company)
        else:
            print("This company has an investor not in DB:", investor_name)


def _attach_sectors(session, company, industries_str):
    """Link *company* to a SectorTable row per listed industry, creating missing sectors."""
    for industry_name in parse_industries(industries_str):
        sector = session.query(SectorTable).filter_by(name=industry_name).first()
        if not sector:
            sector = SectorTable(name=industry_name)
            session.add(sector)
            session.flush()

        if sector not in company.sectors:
            company.sectors.append(sector)


def _ingest_companies(session, companies_df):
    """Create companies and their investor/sector links; return how many companies were created."""
    created = 0

    for index, row in companies_df.iterrows():
        try:
            company_name = clean_name(row.get("Organization Name", ""))
            if not company_name:
                continue

            # Reuse an existing company so that repeated rows only add links.
            company = session.query(CompanyTable).filter_by(name=company_name).first()
            is_new = company is None
            if is_new:
                company = CompanyTable(
                    name=company_name,
                    description=clean_string(row.get("Organization Description", "")),
                    location=clean_string(row.get("Organization Location", "")),
                    industry=clean_string(row.get("Organization Industries", "")),
                    # NOTE(review): clean_string removes ":" and "/", so a URL
                    # such as "https://x.com" is stored as "httpsx.com". The
                    # investor path uses parse_website instead - confirm which
                    # behaviour is intended for companies.
                    website=clean_string(row.get("Organization Website", "")),
                )
                session.add(company)
                session.flush()  # Get the company ID
                created += 1

            investor_names_str = row.get("Investor Names", "")
            if pd.notna(investor_names_str) and investor_names_str:
                _attach_investors(session, company, investor_names_str)

            industries_str = row.get("Organization Industries", "")
            if pd.notna(industries_str) and industries_str:
                _attach_sectors(session, company, industries_str)

            # Commit every 100 newly created companies. Checking is_new stops
            # the commit from re-firing on each duplicate row while the counter
            # sits on a multiple of 100.
            if is_new and created % 100 == 0:
                session.commit()
                print(f" Processed {created} companies...")

        except Exception as e:
            logger.error(f"Error processing company {index}: {e}")
            # Discards everything not yet committed in the current batch.
            session.rollback()
            continue

    session.commit()
    return created


def _link_investors_to_sectors(session):
    """Give each investor the sectors of its portfolio companies; return how many investors had any."""
    linked = 0

    for investor in session.query(InvestorTable).all():
        portfolio_sectors = {
            sector
            for company in investor.portfolio_companies
            for sector in company.sectors
        }

        for sector in portfolio_sectors:
            if sector not in investor.sectors:
                investor.sectors.append(sector)

        if portfolio_sectors:
            linked += 1

    session.commit()
    return linked


def ingest_data():
    """Load companies.csv and investors.csv and populate the database.

    Reads both CSV files from the current working directory and runs three
    steps on a single session bound to ``engine``:

    1. insert investors that are not stored yet;
    2. insert companies, link them to known investors and to sectors
       (creating sectors on demand);
    3. link every investor to the sectors of its portfolio companies.

    Row-level errors are logged and the session is rolled back so processing
    can continue with the next row. Progress and final table counts are
    printed. The session is always closed, even if a step raises.
    """
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Load CSV files
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")

        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        # Step 1: Ingest investors
        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = _ingest_investors(session, investors_df)
        print(f"✅ Investors completed: {investors_processed} processed")

        # Step 2: Ingest companies and sectors
        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        _ingest_companies(session, companies_df)

        # Step 3: Link investors to sectors based on portfolio companies
        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = _link_investors_to_sectors(session)
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        # Final counts
        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()

        print("\n🎉 Ingestion Complete!")
        print(f" Investors: {final_investors}")
        print(f" Companies: {final_companies}")
        print(f" Sectors: {final_sectors}")
    finally:
        session.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the full CSV-to-database ingestion.
    ingest_data()
|