# preprocessor/web_crawler.py

import asyncio
import logging
import os
from typing import Optional

from crawl4ai import AsyncWebCrawler
from web_crawler_schemas import InvestorDataScrape
from ddgs import DDGS
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from models import (
    CompanyTable,
    InvestmentStageTable,
    InvestorMember,
    InvestorTable,
    SectorTable,
    engine,
)
from sqlalchemy.orm import sessionmaker

# Session factory bound to the `engine` imported from `models`, plus one shared
# session instance created at import time; main() passes this instance to
# QueryProcessor and update_investor().
Session = sessionmaker(bind=engine)
session = Session()

# ------------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------------
# Configure logging at INFO with a "timestamp [LEVEL] logger-name: message"
# line format, and create this module's logger under the fixed name
# "web_search_agent" (not __name__).
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("web_search_agent")

# ------------------------------------------------------------------
# Environment
# ------------------------------------------------------------------
# load_dotenv() must run before os.getenv() so values from a local .env file
# are visible when the key is read.
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# A missing key is only warned about here; the (possibly None) value is still
# handed to ChatOpenAI in QueryProcessor.__init__.
if not OPENROUTER_API_KEY:
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")


class QueryProcessor:
    """Build enrichment prompts and expose the crawl/search tools used by the agent.

    On construction this wires a chat model (served through the OpenRouter
    endpoint) into a ReAct agent that can call :meth:`crawl` and
    :meth:`web_search` and that returns an ``InvestorDataScrape`` as its
    structured response.
    """

    def __init__(self, sql_session: Optional[object] = None):
        """Create the LLM client, the search client and the agent.

        Args:
            sql_session: Optional database session; it is only stored on the
                instance here and used by callers such as ``main``.
        """
        self.sql_session = sql_session

        # Created before the agent so the `web_search` tool's dependency
        # exists by the time the bound method is registered as a tool.
        self.ddg_search = DDGS()

        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )

    @staticmethod
    def _value_or(value, fallback):
        """Return *value* unless it is ``None`` or a blank string, else *fallback*.

        Falsy-but-meaningful values such as ``0`` are kept as they are.
        """
        if value is None:
            return fallback
        if isinstance(value, str) and not value.strip():
            return fallback
        return value

    async def fill_investor(self, investor: InvestorTable):
        """Render the agent prompt for one investor row.

        Args:
            investor: Mapped row; every column listed in
                ``investor.__table__.columns`` is read via ``getattr``.

        Returns:
            The prompt string describing the known data and the crawl steps.
        """
        inv_dict = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }

        # Every column key is present in `inv_dict`, so `dict.get(key, default)`
        # would never use its default: NULL columns come back as None and
        # would be printed as the literal "None". Apply the fallback to the
        # value instead so the prompt (and the "No Website" sentinel that
        # `crawl` checks for) is what the agent actually sees.
        website = self._value_or(inv_dict.get("website"), "No Website")
        name = self._value_or(inv_dict.get("name"), "Unknown")
        description = self._value_or(inv_dict.get("description"), "No description")
        aum = self._value_or(inv_dict.get("aum"), "Unknown")
        check_size_lower = self._value_or(inv_dict.get("check_size_lower"), "Unknown")
        check_size_upper = self._value_or(inv_dict.get("check_size_upper"), "Unknown")
        geographic_focus = self._value_or(inv_dict.get("geographic_focus"), "Unknown")
        number_of_investments = self._value_or(
            inv_dict.get("number_of_investments"), "Unknown"
        )

        print(website)

        prompt = f"""
        You are a crawler agent. You will be provided with information about a venture capital investor and their website.
        Your task is to navigate the website to find and enrich the existing information.
        If the website is not available, use the `web_search` tool to google the name of the investor company.
        Use the `crawl` tool to visit web pages and extract information.

        Current investor information:
        - Name: {name}
        - Website: {website}
        - Description: {description}
        - Assets Under Management: {aum}
        - Check Size Lower: {check_size_lower}
        - Check Size Upper: {check_size_upper}
        - Geographic Focus: {geographic_focus}
        - Number of Investments: {number_of_investments}

        IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
        - "Seed to Series A" = [SEED, SERIES_A]
        - "Early stage" = [SEED, SERIES_A]  
        - "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
        - "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
        - "Late stage" = [GROWTH, LATE_STAGE]
        - "Series A and B" = [SERIES_A, SERIES_B]

        IMPORTANT: Additional guidance for AUM and Check Size
        - "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
        - "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
        - If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
        - Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
        - Example: "fund size €200M", "typical tickets $1–5M", "raised £1 billion".

        Follow these steps:
        1. Use the `crawl` tool with the main website URL to get the initial content.
        2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
        3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
        4. If AUM or check size are still missing, immediately perform 1–2 `web_search` queries such as:
        - "{name} fund size site:techcrunch.com"
        - "{name} ticket size site:eu-startups.com"
        - "{name} raises fund site:prnewswire.com"
        5. Continue this process, exploring relevant pages, until you have gathered all the required information.
        6. Extract and update the following information:
        - investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
        - team_members: List of key members with name, role, and email/LinkedIn
        - sectors: List of investment sectors they focus on
        - investment_stages: List of ALL investment stages they focus on (can be multiple!)
        7. If any information is not available or cannot be improved, leave it as null or use existing data.

        Stop crawling/searching once you have found the missing information or confirmed it is not available online.

        Website: {website}
        """

        return prompt

    # NOTE(review): the docstrings of `crawl` and `web_search` are presumably
    # surfaced to the LLM as tool descriptions when the methods are registered
    # with the agent, so their wording is deliberately left untouched.
    async def crawl(self, url: str):
        """Tool to search the web using a web crawler. given the url"""
        print(f"🕷️ Crawling: {url}")
        try:
            # Missing, blank or sentinel URLs are answered with a hint instead
            # of being sent to the crawler.
            if not url or not url.strip() or url.strip() == "No Website":
                return "No website provided for this investor. Please use web_search to find information."

            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(url)
                return results.markdown[:5000]  # Limit content to avoid token limits
        except Exception as e:
            # Best effort: the failure text is returned to the agent rather
            # than raised, so it can fall back to `web_search`.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Tool to search the web using google"""
        print(f"🔍 Searching: {query}")
        try:
            result = self.ddg_search.text(query, max_results=10, backend="google")
            # Keep only title/url/snippet so the payload handed to the LLM is compact.
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "snippet": r.get("body", ""),
                }
                for r in result
            ]
        except Exception as e:
            # Best effort: report the failure as text instead of raising.
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"


def needs_enrichment(investor: InvestorTable) -> bool:
    """Report whether any tracked investor field is still empty.

    A field counts as missing when its value is falsy; the check-size entry
    is missing when either bound is falsy. When something is missing, a
    one-line summary naming the gaps is printed and ``True`` is returned;
    otherwise nothing is printed and ``False`` is returned.
    """
    # (label, is_present) pairs, evaluated in the order the labels are reported.
    presence = (
        ("description", bool(investor.description)),
        ("aum", bool(investor.aum)),
        (
            "check_size",
            bool(investor.check_size_lower and investor.check_size_upper),
        ),
        ("geographic_focus", bool(investor.geographic_focus)),
        ("investment_stages", bool(investor.investment_stages)),
        ("team_members", bool(investor.team_members)),
    )
    gaps = [label for label, is_present in presence if not is_present]

    if not gaps:
        return False

    print(f"Investor {investor.name} missing: {', '.join(gaps)}")
    return True


def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Merge scraped data into an investor row and commit the result.

    Scalar fields are overwritten only when the scraped value is truthy, so
    existing values are never blanked out. Investment stages, team members
    and sectors are looked up (or created) and linked to the investor when
    they are not linked already; existing team members are matched by
    case-insensitive, whitespace-trimmed name and get their role/email
    refreshed.

    Args:
        session: Database session used for lookups, flushes and the commit.
        investor: The row to update in place.
        data: Scraped payload with ``investor``, ``investment_stages``,
            ``team_members`` and ``sectors`` attributes.

    Returns:
        The same ``investor`` object that was passed in.

    Raises:
        Exception: Whatever the lookups, flushes or commit raised. The
            session is rolled back first so it stays usable for the caller's
            next investor.
    """
    try:
        # --- Core investor info ---
        scraped = data.investor
        for field in (
            "description",
            "aum",
            "check_size_lower",
            "check_size_upper",
            "geographic_focus",
            "number_of_investments",
        ):
            value = getattr(scraped, field)
            if value:
                setattr(investor, field, value)

        # --- Investment Stages ---
        if data.investment_stages:
            known_stages = {stage.stage for stage in investor.investment_stages}

            for stage_data in data.investment_stages:
                if stage_data.stage in known_stages:
                    continue

                # Reuse the stage row if one already exists in the database.
                existing_stage = (
                    session.query(InvestmentStageTable)
                    .filter_by(stage=stage_data.stage)
                    .first()
                )
                if not existing_stage:
                    existing_stage = InvestmentStageTable(stage=stage_data.stage)
                    session.add(existing_stage)
                    session.flush()  # Get the ID

                investor.investment_stages.append(existing_stage)
                # Remember it so a stage repeated within the same scrape is
                # not linked twice.
                known_stages.add(stage_data.stage)

        # --- Team Members ---
        if data.team_members:
            # Index current members by normalized name. Stored names are
            # stripped the same way scraped names are, otherwise a padded
            # stored name would never match and a duplicate would be created.
            current_members = {
                m.name.strip().lower(): m for m in investor.team_members if m.name
            }

            for m in data.team_members:
                if not m.name:
                    continue
                clean_name = m.name.strip()
                if not clean_name:
                    # Whitespace-only names would create an empty member.
                    continue
                normalized = clean_name.lower()

                member_obj = current_members.get(normalized)
                if member_obj is not None:
                    # Update existing member, keeping old values when the
                    # scrape has nothing better.
                    if m.role:
                        member_obj.role = m.role
                    if m.email:
                        member_obj.email = m.email
                else:
                    member_obj = InvestorMember(
                        name=clean_name,
                        role=m.role,
                        email=m.email,
                        investor=investor,
                    )
                    session.add(member_obj)
                    # Track it so the same person listed twice in one scrape
                    # is updated rather than inserted again.
                    current_members[normalized] = member_obj

        # --- Sectors ---
        if data.sectors:
            for sector_data in data.sectors:
                if not sector_data.name:
                    continue

                # Reuse the sector row if one already exists in the database.
                existing_sector = (
                    session.query(SectorTable).filter_by(name=sector_data.name).first()
                )
                if not existing_sector:
                    existing_sector = SectorTable(name=sector_data.name)
                    session.add(existing_sector)
                    session.flush()  # Get the ID

                if existing_sector not in investor.sectors:
                    investor.sectors.append(existing_sector)

        # --- Portfolio Companies (currently disabled) ---
        # if data.portfolio_companies:
        #     for company_data in data.portfolio_companies:
        #         if not company_data.name:
        #             continue

        #         # Check if company already exists
        #         existing_company = (
        #             session.query(CompanyTable).filter_by(name=company_data.name).first()
        #         )
        #         if not existing_company:
        #             existing_company = CompanyTable(
        #                 name=company_data.name,
        #                 industry=company_data.industry,
        #                 location=company_data.location,
        #                 description=company_data.description,
        #                 founded_year=company_data.founded_year,
        #                 website=company_data.website,
        #             )
        #             session.add(existing_company)
        #             session.flush()  # Get the ID

        #         # Add relationship if not already exists
        #         if existing_company not in investor.portfolio_companies:
        #             investor.portfolio_companies.append(existing_company)

        session.add(investor)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session in a state that rejects
        # further work until it is rolled back; undo the partial changes and
        # let the caller decide how to report the error.
        session.rollback()
        raise
    return investor


# ------------------------------------------------------------------
# Main
# ------------------------------------------------------------------
async def main(limit: int = 10):
    """Enrich investors that still have missing fields.

    Loads every investor, keeps those flagged by ``needs_enrichment`` and,
    for the first ``limit`` of them, runs the agent, appends the structured
    result to ``enriched_investors.json`` and writes it to the database.
    A failure for one investor is logged and the loop moves on to the next.

    Args:
        limit: Maximum number of investors processed in one run
            (defaults to the previous hard-coded value of 10).
    """
    qp = QueryProcessor(sql_session=session)
    all_investors = qp.sql_session.query(InvestorTable).all() if qp.sql_session else []

    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    logger.info(
        "Found %d investors that need enrichment out of %d total",
        len(investors_to_enrich),
        len(all_investors),
    )

    for inv in investors_to_enrich[:limit]:
        # Captured up front so the failure log does not have to touch the
        # row again after the session has been rolled back.
        inv_id = None
        try:
            inv_id = inv.id
            print(f"\n🔄 Processing investor: {inv.name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", f"{prompt}")]})
            extracted = ai_response["structured_response"]

            # Save JSON backup. Explicit UTF-8 because investor names and
            # scraped text may contain non-ASCII characters that the
            # platform default encoding cannot represent.
            with open("enriched_investors.json", "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv.name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")

            # Update database
            update_investor(session, inv, extracted)

            print(f"✅ Updated investor {inv.name} (id={inv.id})")

        except Exception:
            # Top-level boundary: keep the traceback, then reset the shared
            # session so a database error here does not poison the
            # remaining investors.
            logger.exception("Failed to enrich investor %s", inv_id)
            session.rollback()


# Run the enrichment loop only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())