# base_db_generator/main.py

import logging
import re
import unicodedata

import pandas as pd
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
from sqlalchemy.orm import sessionmaker

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Runs at import time, before any ingestion code executes.
# NOTE(review): presumably creates the tables declared in models.py if they
# do not exist yet - confirm against models.init_database.
init_database()


# ===================== Ingesting Original Data =====================#
def parse_investor_names(investor_names_str):
    """Split a comma-separated investor field into cleaned investor names.

    Each comma-delimited piece is trimmed and passed through ``clean_name``;
    pieces that clean down to nothing are left out.  Missing (NaN/None) or
    empty input yields an empty list.
    """
    if pd.isna(investor_names_str) or investor_names_str == "":
        return []

    cleaned_names = []
    for raw_name in str(investor_names_str).split(","):
        candidate = clean_name(raw_name.strip())
        if candidate:
            cleaned_names.append(candidate)
    return cleaned_names


def parse_industries(industries_str):
    """Split a comma-separated industries field into trimmed labels.

    Blank pieces (for example from ``"A,,B"``) are dropped.  Missing
    (NaN/None) or empty input yields an empty list.
    """
    if pd.isna(industries_str) or industries_str == "":
        return []

    labels = []
    for piece in str(industries_str).split(","):
        label = piece.strip()
        if label:
            labels.append(label)
    return labels


def clean_special_characters(text):
    """Reduce text to plain ASCII letters, digits, spaces, hyphens and periods.

    Steps, in order:
      1. NFKD-normalize and drop everything that has no ASCII equivalent
         (accents are removed, "é" becomes "e").
      2. Remove every character other than alphanumerics, whitespace,
         "-" and ".".
      3. Remove ellipsis-style dot runs ("..." and "..").
      4. Collapse whitespace runs to a single space and trim the ends.

    Args:
        text: Value to clean; it is converted with ``str()``.

    Returns:
        The cleaned string.  Falsy input (``None``, ``""``, ``0``) is returned
        unchanged.
    """
    if not text:
        return text

    # Normalize unicode characters to their closest ASCII equivalents
    normalized = unicodedata.normalize("NFKD", str(text))

    # Remove accents and convert to ASCII
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # Remove any remaining non-alphanumeric characters except spaces, hyphens, and periods
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_text)

    # Dot runs are removed only now: NFKD expands the single-character
    # ellipsis (U+2026) into "...", so removing them before normalization
    # would let that ellipsis through.
    cleaned = cleaned.replace("...", "").replace("..", "")

    # Clean up multiple spaces (including gaps left by the removals above)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    return cleaned


def clean_string(value):
    """Return a cleaned free-text value, or None when it carries no content.

    Missing values, empty strings and the placeholders "nan", "null",
    "none", "0" and "0.0" (matched case-insensitively on the raw value) map
    to None.  Anything else is trimmed and passed through
    ``clean_special_characters``; if the cleaned text is empty or is exactly
    one of the placeholders, None is returned as well.
    """
    placeholders = ("nan", "null", "none", "0", "0.0")

    if pd.isna(value):
        return None
    if value == "" or str(value).lower() in placeholders:
        return None

    result = clean_special_characters(str(value).strip())

    # The post-clean comparison is exact (case-sensitive), unlike the one above.
    if not result or result in placeholders:
        return None
    return result


def clean_name(value):
    """Normalize a company or investor name to a compact ASCII form.

    Missing values, empty strings and the placeholders "nan", "null",
    "none", "0" and "0.0" (case-insensitive on the raw value) return None.
    Otherwise the name is:

      1. NFKD-normalized and stripped of anything without an ASCII
         equivalent,
      2. restricted to alphanumerics, whitespace, "-", ".", "(", ")" and "&",
      3. stripped of ellipsis-style dot runs ("..." and ".."),
      4. whitespace-collapsed, then trimmed of leading/trailing spaces and
         periods.

    Args:
        value: Raw name; it is converted with ``str()``.

    Returns:
        The cleaned name, or None if nothing meaningful is left.
    """
    if (
        pd.isna(value)
        or value == ""
        or str(value).lower() in ["nan", "null", "none", "0", "0.0"]
    ):
        return None

    text = str(value).strip()

    # Normalize unicode characters
    normalized = unicodedata.normalize("NFKD", text)

    # Convert to ASCII but keep more characters for business names
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)

    # Remove dot runs, longest pattern first: replacing ".." before "..."
    # would turn "..." into a stray "." that the second replace never matches.
    cleaned = cleaned.replace("...", "").replace("..", "")

    # Collapse whitespace after the removal above so "A ... B" becomes "A B",
    # then trim spaces and periods together so "Foo ." does not keep a
    # trailing space.
    cleaned = re.sub(r"\s+", " ", cleaned).strip(" .")

    # Check if result is just a placeholder after cleaning
    if cleaned in ["0", "0.0", "null", "nan", "none"]:
        return None

    return cleaned if cleaned else None


def clean_integer(value):
    """Convert a value to a strictly positive int, or None.

    Missing values, the placeholders "nan", "null", "none", "", "0" and
    "0.0" (case-insensitive), anything that cannot be parsed as a number,
    infinite values, and results that are zero or negative all return None.
    Fractional input is truncated toward zero ("3.7" gives 3).

    Args:
        value: Raw cell value (number or string).

    Returns:
        A positive ``int``, or None.
    """
    if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:
        return None
    try:
        cleaned_val = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int(float("inf")) - infinities are not valid counts.
        return None
    return cleaned_val if cleaned_val > 0 else None


def parse_website(website_str: str):
    """Rebuild a website value as an ``https:`` URL.

    The value is split at its FIRST colon; whatever precedes it (normally the
    scheme) is discarded and the remainder is prefixed with ``"https:"``.
    Splitting only once keeps URLs that contain further colons, such as a
    port ("https://host:8080") or an embedded URL in the query string.

    Args:
        website_str: Raw website cell, already converted to ``str``.

    Returns:
        The rebuilt URL, or None when the input is not a string, contains no
        colon, has nothing after the colon, or has the placeholder "0" after
        the colon.
    """
    if not isinstance(website_str, str):
        return None

    _, separator, end = website_str.partition(":")

    if not separator or not end or end == "0":
        return None
    return "https:" + end


def _clean_url(value):
    """Return a trimmed URL string, or None for missing/placeholder values.

    Unlike clean_string, URL punctuation (":", "/", "?", "_", ...) is kept so
    the stored address stays usable.
    """
    if pd.isna(value):
        return None
    url = str(value).strip()
    if url.lower() in ("", "nan", "null", "none", "0", "0.0"):
        return None
    return url


def ingest_data():
    """Load investors.csv and companies.csv and populate the database.

    Both CSV files are read from the current working directory.  The work is
    done in three steps:

      1. Insert one InvestorTable row per cleaned investor name that is not
         already stored (committed in batches of 1000).
      2. Insert one CompanyTable row per cleaned company name that is not
         already stored, link each company to the investors named in its
         "Investor Names" column that exist in the database, and create/link
         a SectorTable row for each entry of "Organization Industries"
         (committed in batches of 100 new companies).
      3. Give every investor the union of the sectors of its portfolio
         companies.

    Progress is printed to stdout.  A row that raises is logged and skipped.
    The session is always closed, even if an unexpected error escapes.
    """
    # Local import: sqlalchemy is already a dependency of this module.
    from sqlalchemy.exc import SQLAlchemyError

    # Create database engine and session
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Load CSV files
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")

        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        # Step 1: Ingest Investors
        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = 0

        for index, row in investors_df.iterrows():
            try:
                investor_name = clean_name(row.get("Filtered investor names", ""))
                if not investor_name:
                    continue

                # Check if investor already exists
                existing_investor = (
                    session.query(InvestorTable).filter_by(name=investor_name).first()
                )
                if existing_investor:
                    continue

                investor = InvestorTable(
                    name=investor_name,
                    description=clean_string(row.get("Business model", "")),
                    headquarters=clean_string(row.get("HQ", "")),
                    website=parse_website(str(row.get("Website", "")).strip()),
                    number_of_investments=clean_integer(
                        row.get("Number of investments")
                    ),
                )
                session.add(investor)
                investors_processed += 1

                if investors_processed % 1000 == 0:
                    session.commit()
                    print(f"  Committed {investors_processed} investors")

            except SQLAlchemyError as e:
                # After a failed flush/commit the session refuses further work
                # until it is rolled back; without this every later row would
                # fail as well.
                session.rollback()
                logger.error(f"Error processing investor {index}: {e}")
            except Exception as e:
                # Cleaning/parsing errors happen before anything is added to
                # the session, so pending rows can be kept.
                logger.error(f"Error processing investor {index}: {e}")

        session.commit()
        print(f"✅ Investors completed: {investors_processed} processed")

        # Step 2: Ingest Companies and Sectors
        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        companies_processed = 0
        last_committed_count = 0
        sectors_created = set()

        for index, row in companies_df.iterrows():
            try:
                # Process company
                company_name = clean_name(row.get("Organization Name", ""))
                if not company_name:
                    continue

                # Check if company already exists
                existing_company = (
                    session.query(CompanyTable).filter_by(name=company_name).first()
                )
                if existing_company:
                    company = existing_company
                else:
                    # Create company
                    company = CompanyTable(
                        name=company_name,
                        description=clean_string(
                            row.get("Organization Description", "")
                        ),
                        location=clean_string(row.get("Organization Location", "")),
                        industry=clean_string(row.get("Organization Industries", "")),
                        # clean_string would strip ":" and "/" and corrupt the URL.
                        website=_clean_url(row.get("Organization Website", "")),
                    )
                    session.add(company)
                    session.flush()  # Get the company ID
                    companies_processed += 1

                # Process investor relationships
                investor_names_str = row.get("Investor Names", "")
                if pd.notna(investor_names_str) and investor_names_str:
                    investor_names = parse_investor_names(investor_names_str)

                    for investor_name in investor_names:
                        # Find investor in database
                        investor = (
                            session.query(InvestorTable)
                            .filter_by(name=investor_name.strip())
                            .first()
                        )

                        if investor:
                            # Add investor-company relationship
                            if company not in investor.portfolio_companies:
                                investor.portfolio_companies.append(company)
                        else:
                            print(
                                "This company has an investor not in DB:",
                                investor_name,
                            )

                # Process sectors/industries
                industries_str = row.get("Organization Industries", "")
                if pd.notna(industries_str) and industries_str:
                    industries = parse_industries(industries_str)

                    for industry_name in industries:
                        industry_name = industry_name.strip()
                        if industry_name:
                            # Check if sector exists
                            sector = (
                                session.query(SectorTable)
                                .filter_by(name=industry_name)
                                .first()
                            )
                            if not sector:
                                sector = SectorTable(name=industry_name)
                                session.add(sector)
                                session.flush()
                                sectors_created.add(industry_name)

                            # Add company-sector relationship
                            if sector not in company.sectors:
                                company.sectors.append(sector)

                # Commit every 100 new companies.  The counter only moves when
                # a company is created, so remember the last committed value
                # to avoid re-committing (and re-printing) on every following
                # row that hits an already-stored company.
                if (
                    companies_processed % 100 == 0
                    and companies_processed > last_committed_count
                ):
                    session.commit()
                    last_committed_count = companies_processed
                    print(f"  Processed {companies_processed} companies...")

            except Exception as e:
                logger.error(f"Error processing company {index}: {e}")
                # NOTE(review): rollback() discards everything since the last
                # commit, i.e. possibly other rows of the current batch too,
                # and companies_processed is not corrected for that.
                session.rollback()
                continue

        # Step 3: Link investors to sectors based on portfolio companies
        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = 0
        all_investors = session.query(InvestorTable).all()
        for investor in all_investors:
            sectors = set()
            for company in investor.portfolio_companies:
                for sector in company.sectors:
                    sectors.add(sector)
            # Add sectors to investor if not already present
            for sector in sectors:
                if sector not in investor.sectors:
                    investor.sectors.append(sector)
            if sectors:
                investors_linked_to_sectors += 1

        # This commit also persists any companies left over from the last
        # (partial) batch of step 2.
        session.commit()
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        # Final counts
        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()

        print("\n🎉 Ingestion Complete!")
        print(f"   Investors: {final_investors}")
        print(f"   Companies: {final_companies}")
        print(f"   Sectors: {final_sectors}")
    finally:
        session.close()


if __name__ == "__main__":
    # Run the full CSV -> database ingestion when executed as a script.
    ingest_data()
    # Ad-hoc manual checks of clean_name's handling of dot runs (disabled):
    # print(clean_name("A... Energi"))
    # print(clean_name("B.. Tech"))
Added funds table 2025-10-05 19:16:03 +01:00			`import logging`
			`import re`
			`import unicodedata`

			`import pandas as pd`
			`from models import CompanyTable, InvestorTable, SectorTable, engine, init_database`
			`from sqlalchemy.orm import sessionmaker`

			`# Set up logging`
			`logging.basicConfig(level=logging.INFO)`
			`logger = logging.getLogger(__name__)`

			`# Import the schema`
			`init_database()`

Refactor code structure for improved readability and maintainability 2025-10-06 12:57:08 +01:00
			`# ===================== Ingesting Original Data =====================#`
Added funds table 2025-10-05 19:16:03 +01:00			`def parse_investor_names(investor_names_str):`
			`"""Parse comma-separated investor names and return a list"""`
			`if pd.isna(investor_names_str) or investor_names_str == "":`
			`return []`

			`# Split by comma and clean whitespace`
			`# investors = [name.strip() for name in str(investor_names_str).split(",")]`
Refactor code structure for improved readability and maintainability 2025-10-06 12:57:08 +01:00			`investors = [`
			`clean_name(name.strip()) for name in str(investor_names_str).split(",")`
			`]`
Added funds table 2025-10-05 19:16:03 +01:00			`return [investor for investor in investors if investor]`


			`def parse_industries(industries_str):`
			`"""Parse comma-separated industries and return a list"""`
			`if pd.isna(industries_str) or industries_str == "":`
			`return []`

			`# Split by comma and clean whitespace`
			`industries = [industry.strip() for industry in str(industries_str).split(",")]`
			`return [industry for industry in industries if industry]`


			`def clean_special_characters(text):`
			`"""Clean special characters from text, converting to ASCII equivalents"""`
			`if not text:`
			`return text`

			`# First remove ellipses and other problematic patterns`
			`text = str(text).replace("...", "").replace("..", "")`

			`# Normalize unicode characters to their closest ASCII equivalents`
			`normalized = unicodedata.normalize("NFKD", text)`

			`# Remove accents and convert to ASCII`
			`ascii_text = normalized.encode("ascii", "ignore").decode("ascii")`

			`# Remove any remaining non-alphanumeric characters except spaces, hyphens, and periods`
			`cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_text)`

			`# Clean up multiple spaces`
			`cleaned = re.sub(r"\s+", " ", cleaned).strip()`

			`return cleaned`


			`def clean_string(value):`
			`"""Clean string values, converting empty/null/nan/0 to None and removing special characters"""`
			`if (`
			`pd.isna(value)`
			`or value == ""`
			`or str(value).lower() in ["nan", "null", "none", "0", "0.0"]`
			`):`
			`return None`

			`# First clean special characters`
			`cleaned = clean_special_characters(str(value).strip())`

			`# Check if result is just "0" after cleaning`
			`if cleaned in ["0", "0.0", "null", "nan", "none"]:`
			`return None`

			`return cleaned if cleaned else None`


			`def clean_name(value):`
			`"""Clean names (companies, investors) with special character handling"""`
			`if (`
			`pd.isna(value)`
			`or value == ""`
			`or str(value).lower() in ["nan", "null", "none", "0", "0.0"]`
			`):`
			`return None`

			`# Clean special characters but be more permissive for names`
			`text = str(value).strip()`
			`# First remove ellipses and other problematic patterns`
			`# text = text.replace("...", "").replace("..", "")`

			`# Normalize unicode characters`
			`normalized = unicodedata.normalize("NFKD", text)`

			`# Convert to ASCII but keep more characters for business names`
			`ascii_text = normalized.encode("ascii", "ignore").decode("ascii")`

			`# Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands`
			`cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)`

			`# Clean up multiple spaces`
			`cleaned = re.sub(r"\s+", " ", cleaned).strip()`

			`# Remove any trailing or leading periods`
			`cleaned = cleaned.strip(".")`

			`cleaned = cleaned.replace("..", "").replace("...", "")`
			`# Check if result is just "0" after cleaning`
			`if cleaned in ["0", "0.0", "null", "nan", "none"]:`
			`return None`

			`return cleaned if cleaned else None`


			`def clean_integer(value):`
			`"""Clean integer values, converting empty/null/nan/0 to None"""`
			`if pd.isna(value) or str(value).lower() in ["nan", "null", "none", "", "0", "0.0"]:`
			`return None`
			`try:`
			`cleaned_val = int(float(value))`
			`return cleaned_val if cleaned_val > 0 else None`
			`except (ValueError, TypeError):`
			`return None`


			`def parse_website(website_str: str):`
			`try:`
			`_, end = website_str.split(":")`

			`if end == "0":`
			`return None`
			`return "https:" + end`
			`except Exception:`
			`return None`


			`def ingest_data():`
			`# Create database engine and session`
			`Session = sessionmaker(bind=engine)`
			`session = Session()`

			`# Load CSV files`
			`print("Loading CSV files...")`
			`companies_df = pd.read_csv("companies.csv")`
			`investors_df = pd.read_csv("investors.csv")`

			`print(f"📊 Companies CSV: {len(companies_df)} rows")`
			`print(f"📊 Investors CSV: {len(investors_df)} rows")`

			`# Step 1: Ingest Investors`
			`print("\n🔄 Step 1: Ingesting Investors...")`
			`investors_processed = 0`

			`for index, row in investors_df.iterrows():`
			`try:`
			`investor_name = clean_name(row.get("Filtered investor names", ""))`

			`if investor_name:`
			`# Check if investor already exists`
			`existing_investor = (`
			`session.query(InvestorTable).filter_by(name=investor_name).first()`
			`)`
			`if not existing_investor:`
			`investor = InvestorTable(`
			`name=investor_name,`
Refactor code structure for improved readability and maintainability 2025-10-06 12:57:08 +01:00			`description=clean_string(row.get("Business model", "")),`
			`headquarters=clean_string(row.get("HQ", "")),`
Added funds table 2025-10-05 19:16:03 +01:00			`website=parse_website(str(row.get("Website", "")).strip()),`
			`number_of_investments=clean_integer(`
			`row.get("Number of investments")`
			`),`
			`)`
			`session.add(investor)`
			`investors_processed += 1`

			`if investors_processed % 1000 == 0:`
			`session.commit()`
			`print(f" Committed {investors_processed} investors")`

			`except Exception as e:`
			`logger.error(f"Error processing investor {index}: {e}")`
			`continue`

			`session.commit()`
			`print(f"✅ Investors completed: {investors_processed} processed")`

			`# Step 2: Ingest Companies and Rounds`
			`print("\n🔄 Step 2: Ingesting Companies and Sectors...")`
			`companies_processed = 0`
			`sectors_created = set()`

			`for index, row in companies_df.iterrows():`
			`try:`
			`# Process company`
			`company_name = clean_name(row.get("Organization Name", ""))`
			`if not company_name:`
			`continue`

			`# Check if company already exists`
			`existing_company = (`
			`session.query(CompanyTable).filter_by(name=company_name).first()`
			`)`
			`if existing_company:`
			`company = existing_company`
			`else:`
			`# Create company`
			`company = CompanyTable(`
			`name=company_name,`
			`description=clean_string(row.get("Organization Description", "")),`
			`location=clean_string(row.get("Organization Location", "")),`
			`industry=clean_string(row.get("Organization Industries", "")),`
			`website=clean_string(row.get("Organization Website", "")),`
			`)`
			`session.add(company)`
			`session.flush() # Get the company ID`
			`companies_processed += 1`

			`# Process investor relationships`
			`investor_names_str = row.get("Investor Names", "")`
			`if pd.notna(investor_names_str) and investor_names_str:`
			`investor_names = parse_investor_names(investor_names_str)`

			`for investor_name in investor_names:`
			`# Find investor in database`
			`investor = (`
			`session.query(InvestorTable)`
			`.filter_by(name=investor_name.strip())`
			`.first()`
			`)`

			`if investor:`
			`# Add investor-company relationship`
			`if company not in investor.portfolio_companies:`
			`investor.portfolio_companies.append(company)`
			`else:`
			`print("This company has an investor not in DB:", investor_name)`

			`# Process sectors/industries`
			`industries_str = row.get("Organization Industries", "")`
			`if pd.notna(industries_str) and industries_str:`
			`industries = parse_industries(industries_str)`

			`for industry_name in industries:`
			`industry_name = industry_name.strip()`
			`if industry_name:`
			`# Check if sector exists`
			`sector = (`
			`session.query(SectorTable)`
			`.filter_by(name=industry_name)`
			`.first()`
			`)`
			`if not sector:`
			`sector = SectorTable(name=industry_name)`
			`session.add(sector)`
			`session.flush()`
			`sectors_created.add(industry_name)`

			`# Add company-sector relationship`
			`if sector not in company.sectors:`
			`company.sectors.append(sector)`

			`# Commit every 100 companies`
			`if companies_processed % 100 == 0 and companies_processed > 0:`
			`session.commit()`
			`print(f" Processed {companies_processed} companies...")`

			`except Exception as e:`
			`logger.error(f"Error processing company {index}: {e}")`
			`session.rollback()`
			`continue`

			`# Step 3: Link investors to sectors based on portfolio companies`
			`print("\n🔄 Step 3: Linking Investors to Sectors...")`
			`investors_linked_to_sectors = 0`
			`all_investors = session.query(InvestorTable).all()`
			`for investor in all_investors:`
			`sectors = set()`
			`for company in investor.portfolio_companies:`
			`for sector in company.sectors:`
			`sectors.add(sector)`
			`# Add sectors to investor if not already present`
			`for sector in sectors:`
			`if sector not in investor.sectors:`
			`investor.sectors.append(sector)`
			`if sectors:`
			`investors_linked_to_sectors += 1`
			`session.commit()`
			`print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")`

			`# Final commit`
			`session.commit()`

			`# Final counts`
			`final_investors = session.query(InvestorTable).count()`
			`final_companies = session.query(CompanyTable).count()`
			`final_sectors = session.query(SectorTable).count()`

			`print("\n🎉 Ingestion Complete!")`
			`print(f" Investors: {final_investors}")`
			`print(f" Companies: {final_companies}")`
			`print(f" Sectors: {final_sectors}")`

			`session.close()`


			`if __name__ == "__main__":`
			`ingest_data()`
			`# print(clean_name("A... Energi"))`
			`# print(clean_name("B.. Tech"))`
			`# print(clean_name("A... Energi"))`