import asyncio
import logging
import os
from typing import Optional

from crawl4ai import AsyncWebCrawler
from web_crawler_schemas import InvestorDataScrape
from ddgs import DDGS
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from models import (
    CompanyTable,
    InvestmentStageTable,
    InvestorMember,
    InvestorTable,
    SectorTable,
    engine,
)
from sqlalchemy.orm import sessionmaker

Session = sessionmaker(bind=engine)
session = Session()

# ------------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------------
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("web_search_agent")

# ------------------------------------------------------------------
# Environment
# ------------------------------------------------------------------
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")

# Sentinel placed in the prompt when an investor has no website; `crawl`
# recognises it and tells the agent to fall back to `web_search`.
NO_WEBSITE = "No Website"

# Upper bound on crawled markdown returned to the agent (keeps token usage down).
MAX_CRAWL_CHARS = 5000


def _value_or(value, default):
    """Return *value*, or *default* when it is None or a blank string."""
    if value is None:
        return default
    if isinstance(value, str) and not value.strip():
        return default
    return value


class QueryProcessor:
    """Wraps the LLM agent and its two tools (`crawl` and `web_search`)."""

    def __init__(self, sql_session: Optional[object] = None):
        """Create the LLM client, the search client and the ReAct agent.

        Args:
            sql_session: Optional database session; stored as-is on the
                instance for callers to query with.
        """
        self.sql_session = sql_session
        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        # Created before the agent so the `web_search` tool's dependency
        # exists by the time the tool is registered.
        self.ddg_search = DDGS()
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )

    async def fill_investor(self, investor: InvestorTable):
        """Build the enrichment prompt for a single investor row.

        Every mapped column of *investor* is read; NULL or blank values are
        replaced by a readable placeholder so that the prompt never contains
        the literal text "None" and a missing website maps to the
        ``NO_WEBSITE`` sentinel understood by `crawl`.

        Returns:
            The prompt string to send to the agent.
        """
        inv_dict = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }

        # NOTE: `inv_dict` holds a key for every column, so
        # `dict.get(key, default)` would never fall back for NULL values;
        # the fallback has to be applied to the value itself.
        website = _value_or(inv_dict.get("website"), NO_WEBSITE)
        name = _value_or(inv_dict.get("name"), "Unknown")
        description = _value_or(inv_dict.get("description"), "No description")
        aum = _value_or(inv_dict.get("aum"), "Unknown")
        check_size_lower = _value_or(inv_dict.get("check_size_lower"), "Unknown")
        check_size_upper = _value_or(inv_dict.get("check_size_upper"), "Unknown")
        geographic_focus = _value_or(inv_dict.get("geographic_focus"), "Unknown")
        number_of_investments = _value_or(
            inv_dict.get("number_of_investments"), "Unknown"
        )
        print(website)

        prompt = f"""
        You are a crawler agent. You will be provided with information about a venture capital investor and their website.
        Your task is to navigate the website to find and enrich the existing information.
        If the website is not available, use the `web_search` tool to google the name of the investor company.
        Use the `crawl` tool to visit web pages and extract information.

        Current investor information:
        - Name: {name}
        - Website: {website}
        - Description: {description}
        - Assets Under Management: {aum}
        - Check Size Lower: {check_size_lower}
        - Check Size Upper: {check_size_upper}
        - Geographic Focus: {geographic_focus}
        - Number of Investments: {number_of_investments}

        IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
        - "Seed to Series A" = [SEED, SERIES_A]
        - "Early stage" = [SEED, SERIES_A]
        - "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
        - "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
        - "Late stage" = [GROWTH, LATE_STAGE]
        - "Series A and B" = [SERIES_A, SERIES_B]

        IMPORTANT: Additional guidance for AUM and Check Size
        - "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
        - "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
        - If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
        - Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
        - Example: "fund size €200M", "typical tickets $1–5M", "raised £1 billion".

        Follow these steps:
        1. Use the `crawl` tool with the main website URL to get the initial content.
        2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
        3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
        4. If AUM or check size are still missing, immediately perform 1–2 `web_search` queries such as:
           - "{name} fund size site:techcrunch.com"
           - "{name} ticket size site:eu-startups.com"
           - "{name} raises fund site:prnewswire.com"
        5. Continue this process, exploring relevant pages, until you have gathered all the required information.
        6. Extract and update the following information:
           - investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
           - team_members: List of key members with name, role, and email/LinkedIn
           - sectors: List of investment sectors they focus on
           - investment_stages: List of ALL investment stages they focus on (can be multiple!)
        7. If any information is not available or cannot be improved, leave it as null or use existing data.

        Stop crawling/searching once you have found the missing information or confirmed it is not available online.

        Website: {website}
        """
        return prompt

    # NOTE: the docstrings of `crawl` and `web_search` are surfaced to the LLM
    # as the tool descriptions, so they are runtime text; edit them with care.
    async def crawl(self, url: str):
        """Tool to search the web using a web crawler. given the url"""
        print(f"🕷️ Crawling: {url}")
        try:
            if not url or not url.strip() or url == NO_WEBSITE:
                return "No website provided for this investor. Please use web_search to find information."

            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(url)
                # A page may yield no markdown at all; treat that as empty
                # content instead of failing on slicing None.
                markdown = results.markdown or ""
                return markdown[:MAX_CRAWL_CHARS]  # Limit content to avoid token limits
        except Exception as e:
            # Best-effort tool: report the failure to the agent as text so it
            # can switch to `web_search` instead of aborting the run.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Tool to search the web using google"""
        print(f"🔍 Searching: {query}")
        try:
            result = self.ddg_search.text(query, max_results=10, backend="google")

            # Format results for better LLM consumption
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "snippet": r.get("body", ""),
                }
                for r in result
            ]
        except Exception as e:
            # Best-effort tool: hand the error text back to the agent.
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"


def needs_enrichment(investor: InvestorTable) -> bool:
    """Check if an investor needs enrichment based on missing fields.

    A field counts as missing when it is falsy (None, empty string, 0 or an
    empty collection). The names of the missing fields are printed.

    Returns:
        True when at least one tracked field is missing, otherwise False.
    """
    checks = (
        ("description", not investor.description),
        ("aum", not investor.aum),
        (
            "check_size",
            not investor.check_size_lower or not investor.check_size_upper,
        ),
        ("geographic_focus", not investor.geographic_focus),
        ("investment_stages", not investor.investment_stages),
        ("team_members", not investor.team_members),
    )
    missing_fields = [label for label, is_missing in checks if is_missing]

    if not missing_fields:
        return False

    print(f"Investor {investor.name} missing: {', '.join(missing_fields)}")
    return True


# Scalar columns copied from the scraped payload onto the investor row.
_CORE_INVESTOR_FIELDS = (
    "description",
    "aum",
    "check_size_lower",
    "check_size_upper",
    "geographic_focus",
    "number_of_investments",
)


def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Update an InvestorTable row with extracted data, safely handling members and relationships.

    Only truthy scraped values overwrite existing ones. Stages, team members
    and sectors are merged into the existing collections without creating
    duplicates, including duplicates that appear inside *data* itself.

    Args:
        session: Database session used for lookups, flushes and the commit.
        investor: The row to update.
        data: The structured result produced by the agent.

    Returns:
        The updated *investor*.

    Raises:
        Exception: Any error raised while applying or committing the changes
            is re-raised after the session has been rolled back, so the same
            session stays usable for the next investor.
    """
    try:
        # --- Core investor info ---
        for field in _CORE_INVESTOR_FIELDS:
            value = getattr(data.investor, field)
            if value:
                setattr(investor, field, value)

        # --- Investment Stages ---
        if data.investment_stages:
            # Stages already linked to the investor; extended as we go so a
            # stage repeated in the scraped data is linked only once.
            current_stage_enums = {stage.stage for stage in investor.investment_stages}

            for stage_data in data.investment_stages:
                if stage_data.stage in current_stage_enums:
                    continue

                # Check if stage already exists in database
                existing_stage = (
                    session.query(InvestmentStageTable)
                    .filter_by(stage=stage_data.stage)
                    .first()
                )
                if not existing_stage:
                    # Create new stage record
                    existing_stage = InvestmentStageTable(stage=stage_data.stage)
                    session.add(existing_stage)
                    session.flush()  # Get the ID

                # Add to investor's stages
                investor.investment_stages.append(existing_stage)
                current_stage_enums.add(stage_data.stage)

        # --- Team Members ---
        if data.team_members:
            # Index current members by normalised name; the same
            # normalisation (strip + lower) is applied to incoming names.
            current_members = {
                m.name.strip().lower(): m for m in investor.team_members if m.name
            }

            for m in data.team_members:
                if not m.name:
                    continue

                normalized = m.name.strip().lower()
                if not normalized:
                    # Whitespace-only name: nothing meaningful to store.
                    continue

                member_obj = current_members.get(normalized)
                if member_obj is not None:
                    # Update existing member
                    if m.role:
                        member_obj.role = m.role
                    if m.email:
                        member_obj.email = m.email
                else:
                    # Create new member
                    member_obj = InvestorMember(
                        name=m.name.strip(),
                        role=m.role,
                        email=m.email,
                        investor=investor,
                    )
                    session.add(member_obj)
                    # Remember it so a repeated name in this batch updates
                    # this object instead of inserting a second row.
                    current_members[normalized] = member_obj

        # --- Sectors ---
        if data.sectors:
            for sector_data in data.sectors:
                if not sector_data.name:
                    continue

                # Check if sector already exists
                existing_sector = (
                    session.query(SectorTable).filter_by(name=sector_data.name).first()
                )
                if not existing_sector:
                    existing_sector = SectorTable(name=sector_data.name)
                    session.add(existing_sector)
                    session.flush()  # Get the ID

                # Add relationship if not already exists
                if existing_sector not in investor.sectors:
                    investor.sectors.append(existing_sector)

        # --- Portfolio Companies ---
        # TODO: disabled in the original; kept for reference.
        # if data.portfolio_companies:
        #     for company_data in data.portfolio_companies:
        #         if not company_data.name:
        #             continue
        #         # Check if company already exists
        #         existing_company = (
        #             session.query(CompanyTable).filter_by(name=company_data.name).first()
        #         )
        #         if not existing_company:
        #             existing_company = CompanyTable(
        #                 name=company_data.name,
        #                 industry=company_data.industry,
        #                 location=company_data.location,
        #                 description=company_data.description,
        #                 founded_year=company_data.founded_year,
        #                 website=company_data.website,
        #             )
        #             session.add(existing_company)
        #             session.flush()  # Get the ID
        #         # Add relationship if not already exists
        #         if existing_company not in investor.portfolio_companies:
        #             investor.portfolio_companies.append(existing_company)

        session.add(investor)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until it is
        # rolled back; do that here so the caller can carry on.
        session.rollback()
        raise

    return investor


# ------------------------------------------------------------------
# Main
# ------------------------------------------------------------------
async def main(limit: int = 10, backup_path: str = "enriched_investors.json"):
    """Enrich the first *limit* investors that have missing fields.

    For each selected investor the agent is invoked with the generated
    prompt, the structured result is appended to *backup_path* and the
    database row is updated. A failure for one investor is logged and the
    loop continues with the next one.

    Args:
        limit: Maximum number of investors processed in this run.
        backup_path: File that receives a JSON dump of every result.
    """
    qp = QueryProcessor(sql_session=session)

    all_investors = qp.sql_session.query(InvestorTable).all() if qp.sql_session else []

    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    # print(
    #     f"Found {len(investors_to_enrich)} investors that need enrichment out of {len(all_investors)} total"
    # )

    for inv in investors_to_enrich[:limit]:
        inv_id = None
        try:
            # Captured up front so the error log does not have to touch the
            # ORM object after a failure.
            inv_id = inv.id
            print(f"\n🔄 Processing investor: {inv.name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", f"{prompt}")]})
            extracted = ai_response["structured_response"]

            # Save JSON backup (explicit encoding: names and amounts may
            # contain non-ASCII characters such as currency symbols).
            with open(backup_path, "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv.name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")

            # Update database
            update_investor(session, inv, extracted)
            print(f"✅ Updated investor {inv.name} (id={inv.id})")

        except Exception:
            # Top-level boundary: log with traceback and move on.
            logger.exception("Failed to enrich investor %s", inv_id)
            continue


if __name__ == "__main__":
    asyncio.run(main())