# Anton_wireframe/base_db_generator/main.py

import logging
import re
import unicodedata

import pandas as pd
from models import CompanyTable, InvestorTable, SectorTable, engine, init_database
from sqlalchemy.orm import sessionmaker

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Run the database initialisation from `models` as soon as this module loads.
# NOTE(review): presumably this creates the schema/tables on `engine` —
# confirm against models.init_database.
init_database()


# ===================== Ingesting Original Data =====================#
def parse_investor_names(investor_names_str):
    """Split a comma-separated investor string into cleaned investor names.

    Each piece is passed through ``clean_name``; pieces that clean down to
    nothing are dropped. Missing (NaN/None) or empty input yields ``[]``.
    """
    is_blank = pd.isna(investor_names_str) or investor_names_str == ""
    if is_blank:
        return []

    parsed_names = []
    for raw_name in str(investor_names_str).split(","):
        cleaned_name = clean_name(raw_name.strip())
        # clean_name returns None for placeholders / empty results
        if cleaned_name:
            parsed_names.append(cleaned_name)
    return parsed_names


def parse_industries(industries_str):
    """Split a comma-separated industries string into a list of names.

    Surrounding whitespace is trimmed from every entry and empty entries are
    discarded. Missing (NaN/None) or empty input yields ``[]``.
    """
    is_blank = pd.isna(industries_str) or industries_str == ""
    if is_blank:
        return []

    trimmed_parts = map(str.strip, str(industries_str).split(","))
    # filter(None, ...) keeps only the non-empty strings
    return list(filter(None, trimmed_parts))


def clean_special_characters(text):
    """Reduce text to plain ASCII letters, digits, spaces, hyphens and periods.

    Steps, in order:
      1. NFKD-normalise and drop everything that has no ASCII equivalent
         (accents are removed, the base letter is kept).
      2. Remove every character outside ``[a-zA-Z0-9\\s\\-.]``.
      3. Remove ellipses, i.e. any run of two or more periods.
      4. Collapse whitespace runs to a single space and trim the ends.

    Falsy input (``None``, ``""``, ``0``) is returned unchanged.
    """
    if not text:
        return text

    # Normalise first: the Unicode ellipsis (U+2026) decomposes to "..." under
    # NFKD, so removing ellipses before this step would let it slip through.
    normalized = unicodedata.normalize("NFKD", str(text))
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # Keep only alphanumerics, whitespace, hyphens and periods.
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.]", "", ascii_text)

    # Drop ellipses of any length (after filtering, so periods that only became
    # adjacent once other characters were removed are caught as well).
    cleaned = re.sub(r"\.{2,}", "", cleaned)

    # Collapse whitespace left behind by the removals above.
    return re.sub(r"\s+", " ", cleaned).strip()


def clean_string(value):
    """Return a cleaned string, or ``None`` for missing/placeholder values.

    A value counts as a placeholder when, ignoring case and surrounding
    whitespace, it is empty or one of ``nan``, ``null``, ``none``, ``0``,
    ``0.0``. The check is applied both before and after special-character
    cleaning, so inputs such as ``" NULL "`` or ``"(none)"`` also map to
    ``None``.

    Args:
        value: Any scalar; it is converted with ``str()`` before cleaning.

    Returns:
        The text as processed by ``clean_special_characters``, or ``None``.
    """
    placeholders = ("", "nan", "null", "none", "0", "0.0")

    if pd.isna(value):
        return None

    # Strip before comparing so padded placeholders are recognised too.
    text = str(value).strip()
    if text.lower() in placeholders:
        return None

    cleaned = clean_special_characters(text)

    # Cleaning can reduce the value to nothing or to a bare placeholder.
    if not cleaned or cleaned.lower() in placeholders:
        return None

    return cleaned


def clean_name(value):
    """Clean a company/investor name, returning ``None`` for placeholders.

    More permissive than plain string cleaning: parentheses and ampersands are
    kept in addition to alphanumerics, spaces, hyphens and periods.

    Steps, in order:
      1. Missing values and placeholders (empty, ``nan``, ``null``, ``none``,
         ``0``, ``0.0``; case-insensitive, surrounding whitespace ignored)
         return ``None``.
      2. NFKD-normalise and drop characters with no ASCII equivalent.
      3. Remove characters outside the allowed set.
      4. Remove ellipses (any run of two or more periods).
      5. Collapse whitespace, then trim spaces and periods from both ends.

    Args:
        value: Any scalar; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned name, or ``None`` if nothing meaningful remains.
    """
    placeholders = ("", "nan", "null", "none", "0", "0.0")

    if pd.isna(value):
        return None

    text = str(value).strip()
    if text.lower() in placeholders:
        return None

    # Normalise unicode and keep only what survives an ASCII round trip.
    normalized = unicodedata.normalize("NFKD", text)
    ascii_text = normalized.encode("ascii", "ignore").decode("ascii")

    # Allow alphanumeric, spaces, hyphens, periods, parentheses, and ampersands.
    cleaned = re.sub(r"[^a-zA-Z0-9\s\-\.\(\)&]", "", ascii_text)

    # Remove ellipses of any length in one pass. This must come after
    # normalisation (U+2026 decomposes to "...") and must not be done as
    # replace("..") followed by replace("..."), which leaves a stray "."
    # behind for a three-dot ellipsis.
    cleaned = re.sub(r"\.{2,}", "", cleaned)

    # Collapse whitespace, then trim both spaces and periods from the ends so
    # removing an edge period cannot leave dangling whitespace.
    cleaned = re.sub(r"\s+", " ", cleaned).strip(" .")

    # Cleaning can reduce the value to nothing or to a bare placeholder.
    if not cleaned or cleaned.lower() in placeholders:
        return None

    return cleaned


def clean_integer(value):
    """Convert a value to a positive ``int``, or ``None`` if that is impossible.

    Missing values, placeholder strings (``nan``, ``null``, ``none``, empty,
    ``0``, ``0.0``), non-numeric input, infinities, and results that are not
    strictly positive all yield ``None``. Fractional values are truncated
    toward zero (``"12.7"`` -> ``12``).
    """
    placeholders = ("nan", "null", "none", "", "0", "0.0")
    if pd.isna(value) or str(value).strip().lower() in placeholders:
        return None

    try:
        cleaned_val = int(float(value))
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() cannot convert +/-inf (e.g. "inf" or "1e999").
        return None

    return cleaned_val if cleaned_val > 0 else None


def parse_website(website_str: str):
    """Normalise a ``scheme:rest`` website value to an ``https:`` URL.

    Everything after the FIRST colon is kept and prefixed with ``https:``, so
    URLs that contain further colons (ports, paths, query strings) are
    preserved rather than discarded.

    Args:
        website_str: Raw website value, e.g. ``"http://example.com"``.

    Returns:
        The ``https:``-prefixed URL, or ``None`` when the input is not a
        string, contains no colon, has nothing after the colon, or the part
        after the colon is the placeholder ``"0"``.
    """
    if not isinstance(website_str, str):
        return None

    # partition splits on the first colon only; `separator` is "" if none found.
    _, separator, remainder = website_str.partition(":")
    if not separator or not remainder or remainder == "0":
        return None

    return "https:" + remainder


def _ingest_investors(session, investors_df):
    """Add one InvestorTable row per new cleaned investor name; return the count added."""
    # Local import: keeps this change self-contained within the function.
    from sqlalchemy.exc import SQLAlchemyError

    investors_processed = 0

    for index, row in investors_df.iterrows():
        try:
            investor_name = clean_name(row.get("Filtered investor names", ""))
            if not investor_name:
                continue

            # Skip investors whose cleaned name is already stored.
            existing_investor = (
                session.query(InvestorTable).filter_by(name=investor_name).first()
            )
            if existing_investor:
                continue

            investor = InvestorTable(
                name=investor_name,
                description=clean_string(row.get("Business model", "")),
                headquarters=clean_string(row.get("HQ", "")),
                website=parse_website(str(row.get("Website", "")).strip()),
                number_of_investments=clean_integer(row.get("Number of investments")),
            )
            session.add(investor)
            investors_processed += 1

            # Commit in batches of 1000 new investors.
            if investors_processed % 1000 == 0:
                session.commit()
                print(f"  Committed {investors_processed} investors")

        except SQLAlchemyError as e:
            # After a failed flush/commit the session refuses further work
            # until it is rolled back, so every later row would fail as well.
            # NOTE: this discards investors added since the last commit.
            logger.error("Error processing investor %s: %s", index, e)
            session.rollback()
        except Exception as e:
            # Non-database failures happen before session.add(), so the
            # pending batch is still valid and is kept.
            logger.error("Error processing investor %s: %s", index, e)

    session.commit()
    return investors_processed


def _ingest_companies(session, companies_df):
    """Add companies and link each to its known investors and sectors; return the count added."""
    companies_processed = 0
    last_committed_count = 0

    for index, row in companies_df.iterrows():
        try:
            company_name = clean_name(row.get("Organization Name", ""))
            if not company_name:
                continue

            # Reuse the stored company if this name was seen before.
            company = session.query(CompanyTable).filter_by(name=company_name).first()
            if not company:
                company = CompanyTable(
                    name=company_name,
                    description=clean_string(row.get("Organization Description", "")),
                    location=clean_string(row.get("Organization Location", "")),
                    industry=clean_string(row.get("Organization Industries", "")),
                    website=clean_string(row.get("Organization Website", "")),
                )
                session.add(company)
                session.flush()  # Get the company ID
                companies_processed += 1

            # Link the company to every listed investor that exists in the DB.
            investor_names_str = row.get("Investor Names", "")
            if pd.notna(investor_names_str) and investor_names_str:
                for investor_name in parse_investor_names(investor_names_str):
                    investor = (
                        session.query(InvestorTable)
                        .filter_by(name=investor_name.strip())
                        .first()
                    )
                    if investor:
                        if company not in investor.portfolio_companies:
                            investor.portfolio_companies.append(company)
                    else:
                        print("This company has an investor not in DB:", investor_name)

            # Link the company to its sectors, creating sectors on first sight.
            industries_str = row.get("Organization Industries", "")
            if pd.notna(industries_str) and industries_str:
                for industry_name in parse_industries(industries_str):
                    industry_name = industry_name.strip()
                    if not industry_name:
                        continue

                    sector = (
                        session.query(SectorTable).filter_by(name=industry_name).first()
                    )
                    if not sector:
                        sector = SectorTable(name=industry_name)
                        session.add(sector)
                        session.flush()

                    if sector not in company.sectors:
                        company.sectors.append(sector)

            # Commit once per 100 NEW companies. Tracking the last committed
            # count stops rows for already-known companies from re-triggering
            # the commit and progress line while the counter sits on a
            # multiple of 100.
            if (
                companies_processed > last_committed_count
                and companies_processed % 100 == 0
            ):
                session.commit()
                last_committed_count = companies_processed
                print(f"  Processed {companies_processed} companies...")

        except Exception as e:
            # Rolling back also undoes the partially linked company.
            # NOTE: it discards all other work since the last commit too.
            logger.error("Error processing company %s: %s", index, e)
            session.rollback()

    # Persist the final partial batch before the next step touches the session.
    session.commit()
    return companies_processed


def _link_investors_to_sectors(session):
    """Give each investor the sectors of its portfolio companies; return how many had any."""
    investors_linked_to_sectors = 0

    for investor in session.query(InvestorTable).all():
        sectors = {
            sector
            for company in investor.portfolio_companies
            for sector in company.sectors
        }
        # Add sectors to investor if not already present
        for sector in sectors:
            if sector not in investor.sectors:
                investor.sectors.append(sector)
        if sectors:
            investors_linked_to_sectors += 1

    session.commit()
    return investors_linked_to_sectors


def ingest_data():
    """Load the investor and company CSVs into the database.

    Reads ``investors.csv`` and ``companies.csv`` (paths relative to the
    current working directory) and runs three steps on one session:

      1. Insert investors that are not yet stored.
      2. Insert companies and link them to known investors and to sectors
         (sectors are created on demand).
      3. Link every investor to the sectors of its portfolio companies.

    Progress and final row counts are printed; per-row failures are logged
    and skipped. The session is always closed, even if a step raises.
    """
    # Create database engine and session
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Load CSV files
        print("Loading CSV files...")
        companies_df = pd.read_csv("companies.csv")
        investors_df = pd.read_csv("investors.csv")

        print(f"📊 Companies CSV: {len(companies_df)} rows")
        print(f"📊 Investors CSV: {len(investors_df)} rows")

        # Step 1: Ingest Investors
        print("\n🔄 Step 1: Ingesting Investors...")
        investors_processed = _ingest_investors(session, investors_df)
        print(f"✅ Investors completed: {investors_processed} processed")

        # Step 2: Ingest Companies and Sectors
        print("\n🔄 Step 2: Ingesting Companies and Sectors...")
        _ingest_companies(session, companies_df)

        # Step 3: Link investors to sectors based on portfolio companies
        print("\n🔄 Step 3: Linking Investors to Sectors...")
        investors_linked_to_sectors = _link_investors_to_sectors(session)
        print(f"✅ Linked {investors_linked_to_sectors} investors to sectors")

        # Final counts
        final_investors = session.query(InvestorTable).count()
        final_companies = session.query(CompanyTable).count()
        final_sectors = session.query(SectorTable).count()

        print("\n🎉 Ingestion Complete!")
        print(f"   Investors: {final_investors}")
        print(f"   Companies: {final_companies}")
        print(f"   Sectors: {final_sectors}")
    finally:
        # Release the connection on both the success and the failure path.
        session.close()


if __name__ == "__main__":
    # Run the full CSV-to-database ingestion when executed as a script.
    ingest_data()
    # Manual spot checks for clean_name's ellipsis handling (left disabled):
    # print(clean_name("A... Energi"))
    # print(clean_name("B.. Tech"))