"""Ingest investors, companies, and sectors from CSV files into the database.

Reads ``investors.csv`` and ``companies.csv`` from the current working
directory, normalizes the text fields, and stores the rows (plus the
investor/company/sector relationships) through the SQLAlchemy models
imported from ``models``.
"""

import logging
import re
import unicodedata

import pandas as pd
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
from sqlalchemy.orm import sessionmaker

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runs at import time, before any ingestion.
# NOTE(review): presumably creates the schema -- confirm in ``models``.
init_database()

# Textual placeholders that the CSV exports use for "no value".
_NULL_TOKENS = frozenset({"", "nan", "null", "none", "0", "0.0"})

# Pre-compiled patterns shared by the cleaning helpers.
_DOT_RUN_RE = re.compile(r"\.{2,}")
_WHITESPACE_RE = re.compile(r"\s+")
_STRING_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9\s\-\.]")
_NAME_DISALLOWED_RE = re.compile(r"[^a-zA-Z0-9\s\-\.\(\)&]")


# ===================== Ingesting Original Data =====================#


def _is_null_like(value):
    """Return True when *value* is missing or one of the null placeholders.

    The comparison ignores surrounding whitespace and letter case, so
    ``" NaN "`` and ``"NULL"`` are treated the same as ``"nan"``/``"null"``.
    """
    if value is None:
        return True
    # Guard with is_scalar so array-likes never hit an ambiguous truth value.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return True
    return str(value).strip().lower() in _NULL_TOKENS


def _to_ascii(text):
    """Return *text* NFKD-normalized with every non-ASCII character dropped."""
    normalized = unicodedata.normalize("NFKD", text)
    return normalized.encode("ascii", "ignore").decode("ascii")


def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into a list of cleaned names.

    Each name is passed through ``clean_name``; names that clean down to
    nothing are dropped. Missing or empty input yields an empty list.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []

    cleaned_names = (
        clean_name(part) for part in str(investor_names_str).split(",")
    )
    return [name for name in cleaned_names if name]


def parse_industries(industries_str):
    """Split a comma-separated industries string into a list of trimmed names.

    Empty entries are dropped. Missing or empty input yields an empty list.
    """
    if pd.isna(industries_str) or industries_str == "":
        return []

    trimmed = (part.strip() for part in str(industries_str).split(","))
    return [industry for industry in trimmed if industry]


def clean_special_characters(text):
    """Reduce *text* to ASCII letters, digits, spaces, hyphens, and periods.

    Falsy input is returned unchanged. Otherwise the text is converted to
    its closest ASCII form, runs of two or more periods (ellipses) are
    removed, every other disallowed character is dropped, and whitespace is
    collapsed to single spaces.
    """
    if not text:
        return text

    ascii_text = _to_ascii(str(text))
    # Remove ellipses *after* normalization: NFKD expands the single
    # character U+2026 into "...", which would otherwise slip through.
    without_ellipses = _DOT_RUN_RE.sub("", ascii_text)
    cleaned = _STRING_DISALLOWED_RE.sub("", without_ellipses)
    return _WHITESPACE_RE.sub(" ", cleaned).strip()


def clean_string(value):
    """Clean a free-text value; return ``None`` for empty/null/nan/0 inputs.

    The value is stripped and passed through ``clean_special_characters``.
    ``None`` is also returned when the cleaned result is empty or is itself
    one of the null placeholders.
    """
    if _is_null_like(value):
        return None

    cleaned = clean_special_characters(str(value).strip())
    return None if _is_null_like(cleaned) else cleaned


def clean_name(value):
    """Clean a company or investor name; return ``None`` for null-like input.

    More permissive than ``clean_string``: parentheses and ampersands are
    kept in addition to letters, digits, spaces, hyphens, and periods.
    Ellipses are removed and leading/trailing periods are stripped.

    Examples:
        >>> clean_name("A... Energi")
        'A Energi'
        >>> clean_name("B.. Tech")
        'B Tech'
    """
    if _is_null_like(value):
        return None

    ascii_text = _to_ascii(str(value).strip())
    # A single pattern removes any run of 2+ periods; chaining
    # replace("..") before replace("...") leaves a stray "." behind.
    without_ellipses = _DOT_RUN_RE.sub("", ascii_text)
    cleaned = _NAME_DISALLOWED_RE.sub("", without_ellipses)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip(" .")
    return None if _is_null_like(cleaned) else cleaned


def clean_integer(value):
    """Convert *value* to a positive ``int``; return ``None`` otherwise.

    Null-like inputs, values that cannot be parsed as a number, infinities,
    and results that are zero or negative all yield ``None``.
    """
    if _is_null_like(value):
        return None

    try:
        number = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) -- treat as unparseable.
        return None
    return number if number > 0 else None


def parse_website(website_str: str):
    """Return the website as an ``https:`` URL, or ``None`` if unusable.

    Everything after the first ``":"`` is kept and prefixed with
    ``"https:"``, so URLs that contain further colons (for example a port)
    are preserved. ``None`` is returned for non-string input, input without
    a ``":"``, and input whose remainder is empty or ``"0"``.
    """
    if not isinstance(website_str, str):
        return None

    _, separator, remainder = website_str.strip().partition(":")
    if not separator or remainder in ("", "0"):
        return None
    return "https:" + remainder


def _ingest_investors(session, investors_df):
    """Add every not-yet-stored investor row and return how many were added."""
    processed = 0
    for index, row in investors_df.iterrows():
        try:
            investor_name = clean_name(row.get("Filtered investor names", ""))
            if not investor_name:
                continue

            already_stored = (
                session.query(InvestorTable).filter_by(name=investor_name).first()
            )
            if already_stored:
                continue

            session.add(
                InvestorTable(
                    name=investor_name,
                    description=clean_string(row.get("Business model", "")),
                    headquarters=clean_string(row.get("HQ", "")),
                    website=parse_website(str(row.get("Website", "")).strip()),
                    number_of_investments=clean_integer(
                        row.get("Number of investments")
                    ),
                )
            )
            processed += 1

            if processed % 1000 == 0:
                session.commit()
                print(f" Committed {processed} investors")
        except Exception as e:
            logger.error("Error processing investor %s: %s", index, e)
            # After a failed flush the session refuses further work until it
            # is rolled back. This also discards rows added since the last
            # commit, so the returned count can overstate what was stored.
            session.rollback()

    session.commit()
    return processed


def _link_company_investors(session, company, investor_names_str):
    """Attach *company* to each investor named in *investor_names_str*."""
    for investor_name in parse_investor_names(investor_names_str):
        investor = session.query(InvestorTable).filter_by(name=investor_name).first()
        if investor is None:
            print("This company has an investor not in DB:", investor_name)
            continue
        if company not in investor.portfolio_companies:
            investor.portfolio_companies.append(company)


def _link_company_sectors(session, company, industries_str):
    """Attach *company* to each listed sector, creating sectors as needed."""
    for industry_name in parse_industries(industries_str):
        sector = session.query(SectorTable).filter_by(name=industry_name).first()
        if sector is None:
            sector = SectorTable(name=industry_name)
            session.add(sector)
            session.flush()
        if sector not in company.sectors:
            company.sectors.append(sector)


def _ingest_companies(session, companies_df):
    """Store companies with their investor and sector links; return new count."""
    processed = 0
    for index, row in companies_df.iterrows():
        try:
            company_name = clean_name(row.get("Organization Name", ""))
            if not company_name:
                continue

            company = session.query(CompanyTable).filter_by(name=company_name).first()
            created = company is None
            if created:
                company = CompanyTable(
                    name=company_name,
                    description=clean_string(row.get("Organization Description", "")),
                    location=clean_string(row.get("Organization Location", "")),
                    industry=clean_string(row.get("Organization Industries", "")),
                    # clean_string would strip ":" and "/" and mangle the URL,
                    # so websites go through parse_website, as for investors.
                    website=parse_website(
                        str(row.get("Organization Website", "")).strip()
                    ),
                )
                session.add(company)
                session.flush()  # Get the company ID
                processed += 1

            _link_company_investors(session, company, row.get("Investor Names", ""))
            _link_company_sectors(
                session, company, row.get("Organization Industries", "")
            )

            # Only check right after a new company was counted; otherwise
            # rows for existing companies would re-trigger the commit while
            # the counter sits on a multiple of 100.
            if created and processed % 100 == 0:
                session.commit()
                print(f" Processed {processed} companies...")
        except Exception as e:
            logger.error("Error processing company %s: %s", index, e)
            session.rollback()

    session.commit()
    return processed


def _link_investors_to_sectors(session):
    """Give each investor the sectors of its portfolio companies.

    Returns the number of investors that have at least one such sector.
    """
    linked = 0
    for investor in session.query(InvestorTable).all():
        portfolio_sectors = {
            sector
            for company in investor.portfolio_companies
            for sector in company.sectors
        }
        for sector in portfolio_sectors:
            if sector not in investor.sectors:
                investor.sectors.append(sector)
        if portfolio_sectors:
            linked += 1

    session.commit()
    return linked


def ingest_data():
    """Load both CSV files and ingest them into the database in three steps.

    1. Investors from ``investors.csv``.
    2. Companies from ``companies.csv``, linked to investors and sectors.
    3. Investors linked to the sectors of their portfolio companies.

    Progress and final row counts are printed. The session is closed even
    when a step raises.
    """
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")
        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = _ingest_investors(session, investors_df)
        print(f"✅ Investors completed: {investors_processed} processed")

        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        _ingest_companies(session, companies_df)

        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = _link_investors_to_sectors(session)
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()

        print("\n🎉 Ingestion Complete!")
        print(f" Investors: {final_investors}")
        print(f" Companies: {final_companies}")
        print(f" Sectors: {final_sectors}")
    finally:
        session.close()


if __name__ == "__main__":
    ingest_data()