Added funds table
This commit is contained in:
@@ -0,0 +1,314 @@
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
|
||||
import pandas as pd
|
||||
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
# Configure root logging once, then obtain a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialise the database through models.init_database(); this runs at import
# time, before any ingestion function can be called.
init_database()
|
||||
|
||||
#===================== Ingesting Original Data =====================#
|
||||
def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into a list of cleaned names.

    Missing values (NaN/None) and the empty string produce an empty list.
    Every comma-separated piece is whitespace-trimmed and passed through
    ``clean_name``; pieces that clean down to nothing are dropped.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []

    parsed = []
    for piece in str(investor_names_str).split(","):
        name = clean_name(piece.strip())
        if name:
            parsed.append(name)
    return parsed
|
||||
|
||||
|
||||
def parse_industries(industries_str):
    """Split a comma-separated industries string into trimmed, non-empty parts.

    Missing values (NaN/None) and the empty string produce an empty list.
    Non-string input is converted with ``str`` before splitting.
    """
    if pd.isna(industries_str) or industries_str == "":
        return []

    trimmed = (piece.strip() for piece in str(industries_str).split(","))
    return [piece for piece in trimmed if piece]
|
||||
|
||||
|
||||
def clean_special_characters(text):
    """Reduce *text* to plain ASCII letters, digits, spaces, hyphens and periods.

    Falsy input (``None``, ``""``, ``0``) is returned unchanged. Otherwise the
    value is converted with ``str`` and:

    1. NFKD-normalised and encoded to ASCII, which drops accents and expands
       compatibility characters (the single-character ellipsis U+2026 becomes
       three periods).
    2. Stripped of every character outside ``[a-zA-Z0-9\\s\\-.]``.
    3. Stripped of any run of two or more periods. This happens *after*
       normalisation so that ellipses introduced by step 1 are removed too,
       and uses a run match so that "...." does not leave a stray period.
    4. Whitespace-collapsed and trimmed.

    Returns:
        The cleaned string (possibly empty).
    """
    if not text:
        return text

    decomposed = unicodedata.normalize("NFKD", str(text))
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    # Keep only alphanumerics, whitespace, hyphens and periods.
    kept = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_only)

    # Drop ellipses of any length; single periods (e.g. "v1.2") are preserved.
    kept = re.sub(r"\.{2,}", "", kept)

    return re.sub(r"\s+", " ", kept).strip()
|
||||
|
||||
|
||||
def clean_string(value):
    """Normalise a free-text cell, mapping empty/placeholder values to ``None``.

    A value counts as a placeholder when it is NaN/None, the empty string, or
    (ignoring case and surrounding whitespace) one of "nan", "null", "none",
    "0", "0.0". The same placeholder test is applied again after
    ``clean_special_characters`` has run, case-insensitively, so that input
    such as "NULL!" does not come back as the string "NULL".

    Returns:
        The cleaned string, or ``None`` when nothing meaningful remains.
    """
    null_tokens = {"nan", "null", "none", "0", "0.0"}

    if pd.isna(value) or value == "":
        return None

    text = str(value).strip()
    if text.lower() in null_tokens:
        return None

    cleaned = clean_special_characters(text)

    # Cleaning can strip a value down to nothing or to a placeholder token.
    if not cleaned or cleaned.lower() in null_tokens:
        return None
    return cleaned
|
||||
|
||||
|
||||
def clean_name(value):
    """Normalise a company or investor name, returning ``None`` for placeholders.

    A value counts as a placeholder when it is NaN/None, the empty string, or
    (ignoring case and surrounding whitespace) one of "nan", "null", "none",
    "0", "0.0"; the check is repeated on the cleaned result.

    Cleaning steps, in order:

    1. NFKD-normalise and encode to ASCII (drops accents; expands the
       single-character ellipsis U+2026 to three periods).
    2. Keep only alphanumerics, whitespace, hyphens, periods, parentheses and
       ampersands.
    3. Remove every run of two or more periods. A run match is used because
       replacing ".." before "..." leaves one period behind for "...".
    4. Collapse whitespace, then trim spaces and periods together from both
       ends so that "Foo ." does not keep a trailing space.

    Returns:
        The cleaned name, or ``None`` when nothing meaningful remains.
    """
    null_tokens = {"nan", "null", "none", "0", "0.0"}

    if pd.isna(value) or value == "" or str(value).strip().lower() in null_tokens:
        return None

    decomposed = unicodedata.normalize("NFKD", str(value).strip())
    ascii_only = decomposed.encode("ascii", "ignore").decode("ascii")

    # More permissive than free text: business names keep "(", ")" and "&".
    kept = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_only)

    # Drop ellipses of any length; single periods ("Co. Ltd") are preserved.
    kept = re.sub(r"\.{2,}", "", kept)

    cleaned = re.sub(r"\s+", " ", kept).strip(". ")

    if not cleaned or cleaned.lower() in null_tokens:
        return None
    return cleaned
|
||||
|
||||
|
||||
def clean_integer(value):
    """Convert *value* to a positive ``int``, or ``None`` when that is not possible.

    NaN/None, the empty string and the placeholders "nan", "null", "none",
    "0", "0.0" (case-insensitive, surrounding whitespace ignored) give
    ``None``. Other values are parsed with ``float`` and truncated toward
    zero; results that are zero or negative also give ``None``.

    Unparseable input and infinite values give ``None`` instead of raising
    (``int(float("inf"))`` raises ``OverflowError``).
    """
    placeholders = {"nan", "null", "none", "", "0", "0.0"}
    if pd.isna(value) or str(value).strip().lower() in placeholders:
        return None

    try:
        number = int(float(value))
    except (ValueError, TypeError, OverflowError):
        return None

    return number if number > 0 else None
|
||||
|
||||
|
||||
def parse_website(website_str: str):
    """Rewrite a "<scheme>:<rest>" website value as "https:<rest>".

    Everything before the first colon is discarded and replaced with
    ``https``. Only the *first* colon is treated as the separator, so URLs
    that contain further colons (a port, a query string) are kept rather than
    rejected.

    Returns:
        The rewritten URL, or ``None`` when the input is not a string, has no
        colon, or has nothing (or the placeholder "0") after the colon.
    """
    if not isinstance(website_str, str):
        return None

    _scheme, separator, remainder = website_str.partition(":")
    if not separator or remainder in ("", "0"):
        return None

    return "https:" + remainder
|
||||
|
||||
|
||||
def ingest_data():
    """Load ``companies.csv`` and ``investors.csv`` into the database.

    Runs three steps on a single session bound to ``engine``:

    1. Insert an ``InvestorTable`` row for every investor name not stored yet.
    2. Insert ``CompanyTable`` rows, link each company to the investors named
       in its row, and create/link ``SectorTable`` rows for its industries.
    3. Link every investor to the sectors of its portfolio companies.

    Progress and final row counts are printed. The session is always closed,
    including when a step raises.
    """
    session = sessionmaker(bind=engine)()
    try:
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")

        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = _ingest_investors(session, investors_df)
        print(f"✅ Investors completed: {investors_processed} processed")

        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        _ingest_companies(session, companies_df)

        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = _link_investors_to_sectors(session)
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()

        print("\n🎉 Ingestion Complete!")
        print(f" Investors: {final_investors}")
        print(f" Companies: {final_companies}")
        print(f" Sectors: {final_sectors}")
    finally:
        session.close()


def _clean_url(value):
    """Return a trimmed URL string, or None for missing/placeholder values."""
    # URLs must not go through clean_string: its character filter removes
    # ":" and "/" and would turn "https://x.com" into "httpsx.com".
    if pd.isna(value):
        return None
    text = str(value).strip()
    if text.lower() in {"", "nan", "null", "none", "0", "0.0"}:
        return None
    return text


def _ingest_investors(session, investors_df):
    """Add an InvestorTable row per not-yet-stored investor name; return the number added."""
    from sqlalchemy.exc import SQLAlchemyError

    added = 0
    for index, row in investors_df.iterrows():
        try:
            investor_name = clean_name(row.get("Filtered investor names", ""))
            if not investor_name:
                continue

            already_stored = (
                session.query(InvestorTable).filter_by(name=investor_name).first()
            )
            if already_stored:
                continue

            session.add(
                InvestorTable(
                    name=investor_name,
                    website=parse_website(str(row.get("Website", "")).strip()),
                    number_of_investments=clean_integer(
                        row.get("Number of investments")
                    ),
                )
            )
            added += 1

            # Commit in batches of 1000 new investors.
            if added % 1000 == 0:
                session.commit()
                print(f" Committed {added} investors")

        except SQLAlchemyError as e:
            logger.error(f"Error processing investor {index}: {e}")
            # After a failed flush the session rejects further work until it
            # is rolled back; without this every following row would fail too.
            session.rollback()
        except Exception as e:
            # Parsing errors leave no partial state behind (the row is only
            # added as its last step), so pending rows are kept.
            logger.error(f"Error processing investor {index}: {e}")

    session.commit()
    return added


def _ingest_companies(session, companies_df):
    """Add companies and link them to investors and sectors; return the number of new companies."""
    created = 0
    for index, row in companies_df.iterrows():
        try:
            company_name = clean_name(row.get("Organization Name", ""))
            if not company_name:
                continue

            company = session.query(CompanyTable).filter_by(name=company_name).first()
            is_new = company is None
            if is_new:
                company = CompanyTable(
                    name=company_name,
                    description=clean_string(row.get("Organization Description", "")),
                    location=clean_string(row.get("Organization Location", "")),
                    industry=clean_string(row.get("Organization Industries", "")),
                    website=_clean_url(row.get("Organization Website", "")),
                )
                session.add(company)
                session.flush()  # Get the company ID
                created += 1

            # Link the company to every listed investor that is already stored.
            investor_names_str = row.get("Investor Names", "")
            if pd.notna(investor_names_str) and investor_names_str:
                for investor_name in parse_investor_names(investor_names_str):
                    investor = (
                        session.query(InvestorTable)
                        .filter_by(name=investor_name.strip())
                        .first()
                    )
                    if not investor:
                        print("This company has an investor not in DB:", investor_name)
                    elif company not in investor.portfolio_companies:
                        investor.portfolio_companies.append(company)

            # Create missing sectors and link the company to each of them.
            industries_str = row.get("Organization Industries", "")
            if pd.notna(industries_str) and industries_str:
                for industry_name in parse_industries(industries_str):
                    industry_name = industry_name.strip()
                    if not industry_name:
                        continue
                    sector = (
                        session.query(SectorTable)
                        .filter_by(name=industry_name)
                        .first()
                    )
                    if not sector:
                        sector = SectorTable(name=industry_name)
                        session.add(sector)
                        session.flush()
                    if sector not in company.sectors:
                        company.sectors.append(sector)

            # Commit once per 100 *new* companies. Gating on is_new stops the
            # commit/progress line from repeating on every already-stored row
            # while the counter sits on a multiple of 100.
            if is_new and created % 100 == 0:
                session.commit()
                print(f" Processed {created} companies...")

        except Exception as e:
            logger.error(f"Error processing company {index}: {e}")
            # NOTE(review): this discards everything since the last commit,
            # not just the failing row; per-row savepoints would avoid that.
            session.rollback()
            continue

    # Persist the trailing partial batch before the next step starts.
    session.commit()
    return created


def _link_investors_to_sectors(session):
    """Attach to each investor the sectors of its portfolio companies; return how many investors have any."""
    linked = 0
    for investor in session.query(InvestorTable).all():
        sectors = {
            sector
            for company in investor.portfolio_companies
            for sector in company.sectors
        }
        for sector in sectors:
            if sector not in investor.sectors:
                investor.sectors.append(sector)
        if sectors:
            linked += 1

    session.commit()
    return linked
|
||||
|
||||
|
||||
|
||||
|
||||
# Run the full ingestion only when this file is executed as a script.
if __name__ == "__main__":
    ingest_data()
|
||||
Reference in New Issue
Block a user