Files
Anton_wireframe/preprocessor/web_crawler.py
T
2025-10-05 19:16:03 +01:00

350 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import asyncio
import logging
import os
from typing import Optional
from crawl4ai import AsyncWebCrawler
from web_crawler_schemas import InvestorDataScrape
from ddgs import DDGS
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from models import (
CompanyTable,
InvestmentStageTable,
InvestorMember,
InvestorTable,
SectorTable,
engine,
)
from sqlalchemy.orm import sessionmaker
# Session factory bound to the `engine` imported from `models`.
Session = sessionmaker(bind=engine)
# Single module-level session; main() hands it to QueryProcessor and update_investor().
session = Session()
# ------------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------------
# Timestamp, level and logger name are prefixed to every record.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("web_search_agent")
# ------------------------------------------------------------------
# Environment
# ------------------------------------------------------------------
# load_dotenv() runs before the lookup so a key supplied through a .env file is visible.
load_dotenv()
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    # Only warn here: the key is first needed when the LLM is actually invoked.
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")
class QueryProcessor:
    """Agent wrapper that enriches investor records via crawling and web search.

    Builds a ``ChatOpenAI`` client pointed at the OpenRouter endpoint and a
    ReAct agent (``create_react_agent``) whose tools are :meth:`crawl` and
    :meth:`web_search` and whose structured response is ``InvestorDataScrape``.

    NOTE(review): the tool methods' docstrings presumably double as the tool
    descriptions shown to the LLM, so they are written as short instructions.
    """

    def __init__(self, sql_session: Optional[object] = None):
        """Create the LLM client, the search client and the agent.

        Args:
            sql_session: Optional database session stored on the instance for
                callers; no method of this class uses it directly.
        """
        self.sql_session = sql_session
        # Set up before the agent so the search tool is usable as soon as the agent exists.
        self.ddg_search = DDGS()
        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )

    @staticmethod
    def _value_or(value, fallback):
        """Return *fallback* when *value* is ``None`` or a blank string, else *value*."""
        if value is None:
            return fallback
        if isinstance(value, str) and not value.strip():
            return fallback
        return value

    async def fill_investor(self, investor: InvestorTable):
        """Build the enrichment prompt for one investor row.

        Despite its name this method does not modify ``investor``; it only
        reads the row's column values and renders them into the agent prompt.
        It stays ``async`` because callers ``await`` it.

        Args:
            investor: Row object exposing ``__table__.columns`` and one
                attribute per column.

        Returns:
            The prompt string to send to the agent.
        """
        known = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }
        # Every column is a key of `known`, so dict.get's default would never
        # replace a NULL column; substitute the placeholders explicitly.
        # Numeric zero is a real value and is kept as-is.
        website = self._value_or(known.get("website"), "No Website")
        name = self._value_or(known.get("name"), "Unknown")
        description = self._value_or(known.get("description"), "No description")
        aum = self._value_or(known.get("aum"), "Unknown")
        check_size_lower = self._value_or(known.get("check_size_lower"), "Unknown")
        check_size_upper = self._value_or(known.get("check_size_upper"), "Unknown")
        geographic_focus = self._value_or(known.get("geographic_focus"), "Unknown")
        number_of_investments = self._value_or(
            known.get("number_of_investments"), "Unknown"
        )
        print(website)
        # NOTE(review): step 4 reads "perform 12 `web_search` queries"; this may
        # have been meant as "1-2" -- confirm before changing the prompt text.
        prompt = f"""
You are a crawler agent. You will be provided with information about a venture capital investor and their website.
Your task is to navigate the website to find and enrich the existing information.
If the website is not available, use the `web_search` tool to google the name of the investor company.
Use the `crawl` tool to visit web pages and extract information.
Current investor information:
- Name: {name}
- Website: {website}
- Description: {description}
- Assets Under Management: {aum}
- Check Size Lower: {check_size_lower}
- Check Size Upper: {check_size_upper}
- Geographic Focus: {geographic_focus}
- Number of Investments: {number_of_investments}
IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
- "Seed to Series A" = [SEED, SERIES_A]
- "Early stage" = [SEED, SERIES_A]
- "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
- "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
- "Late stage" = [GROWTH, LATE_STAGE]
- "Series A and B" = [SERIES_A, SERIES_B]
IMPORTANT: Additional guidance for AUM and Check Size
- "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
- "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
- If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
- Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
- Example: "fund size €200M", "typical tickets $15M", "raised £1 billion".
Follow these steps:
1. Use the `crawl` tool with the main website URL to get the initial content.
2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
4. If AUM or check size are still missing, immediately perform 12 `web_search` queries such as:
- "{name} fund size site:techcrunch.com"
- "{name} ticket size site:eu-startups.com"
- "{name} raises fund site:prnewswire.com"
5. Continue this process, exploring relevant pages, until you have gathered all the required information.
6. Extract and update the following information:
- investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
- team_members: List of key members with name, role, and email/LinkedIn
- sectors: List of investment sectors they focus on
- investment_stages: List of ALL investment stages they focus on (can be multiple!)
7. If any information is not available or cannot be improved, leave it as null or use existing data.
Stop crawling/searching once you have found the missing information or confirmed it is not available online.
Website: {website}
"""
        return prompt

    async def crawl(self, url: str):
        """Fetch the web page at `url` and return up to 5000 characters of its content as markdown.

        Returns an explanatory message instead of raising when no usable URL is
        given or the page cannot be crawled.
        """
        print(f"🕷️ Crawling: {url}")
        target = url.strip() if isinstance(url, str) else ""
        if not target or target == "No Website":
            return "No website provided for this investor. Please use web_search to find information."
        try:
            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(target)
            return results.markdown[:5000]  # Limit content to avoid token limits
        except Exception as e:
            # Best effort: report the failure to the agent so it can fall back to search.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Search the web for `query` and return up to 10 results as dicts with title, url and snippet.

        Returns an error message string instead of raising when the search fails.
        """
        print(f"🔍 Searching: {query}")
        try:
            hits = self.ddg_search.text(query, max_results=10, backend="google")
            # Flatten each hit to the three fields the LLM needs.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", ""),
                }
                for hit in hits
            ]
        except Exception as e:
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"
def needs_enrichment(investor: InvestorTable) -> bool:
    """Tell whether an investor row still has gaps worth enriching.

    A field counts as missing when its value is falsy. When at least one
    field is missing, the list of gaps is printed and ``True`` is returned;
    otherwise nothing is printed and ``False`` is returned.
    """
    checks = (
        ("description", not investor.description),
        ("aum", not investor.aum),
        # Both bounds are required for the check size to count as present.
        ("check_size", not (investor.check_size_lower and investor.check_size_upper)),
        ("geographic_focus", not investor.geographic_focus),
        ("investment_stages", not investor.investment_stages),
        ("team_members", not investor.team_members),
    )
    gaps = [label for label, is_missing in checks if is_missing]
    if not gaps:
        return False
    print(f"Investor {investor.name} missing: {', '.join(gaps)}")
    return True
# Scalar investor columns that are overwritten only when the scrape found a truthy value.
_CORE_FIELDS = (
    "description",
    "aum",
    "check_size_lower",
    "check_size_upper",
    "geographic_focus",
    "number_of_investments",
)


def _merge_investment_stages(session, investor, stages):
    """Link each scraped stage to the investor once, creating stage rows on demand."""
    linked = {stage.stage for stage in investor.investment_stages}
    for stage_data in stages:
        if stage_data.stage in linked:
            continue
        stage_row = (
            session.query(InvestmentStageTable)
            .filter_by(stage=stage_data.stage)
            .first()
        )
        if stage_row is None:
            stage_row = InvestmentStageTable(stage=stage_data.stage)
            session.add(stage_row)
            session.flush()  # Get the ID
        investor.investment_stages.append(stage_row)
        # Remember the link so a stage repeated in the scrape is not appended twice.
        linked.add(stage_data.stage)


def _merge_team_members(session, investor, members):
    """Update existing members matched by normalized name and add the new ones."""
    # Index and lookup must normalize identically (strip + lower), otherwise a
    # stored name with stray whitespace is never matched and gets duplicated.
    by_name = {m.name.strip().lower(): m for m in investor.team_members if m.name}
    for scraped in members:
        if not scraped.name:
            continue
        display_name = scraped.name.strip()
        if not display_name:
            continue
        key = display_name.lower()
        existing = by_name.get(key)
        if existing is not None:
            if scraped.role:
                existing.role = scraped.role
            if scraped.email:
                existing.email = scraped.email
            continue
        created = InvestorMember(
            name=display_name,
            role=scraped.role,
            email=scraped.email,
            investor=investor,
        )
        session.add(created)
        # Register the new member so a repeated scraped entry updates it instead
        # of inserting a second row.
        by_name[key] = created


def _merge_sectors(session, investor, sectors):
    """Link each named sector to the investor, creating sector rows on demand."""
    for sector_data in sectors:
        if not sector_data.name:
            continue
        sector_row = session.query(SectorTable).filter_by(name=sector_data.name).first()
        if sector_row is None:
            sector_row = SectorTable(name=sector_data.name)
            session.add(sector_row)
            session.flush()  # Get the ID
        if sector_row not in investor.sectors:
            investor.sectors.append(sector_row)


def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Merge scraped data into an investor row and commit it.

    Scalar fields are overwritten only when the scrape produced a truthy
    value; stages, team members and sectors are merged without duplicating
    existing links.

    Args:
        session: Database session used for lookups, inserts and the commit.
        investor: The row to update.
        data: Scraped result exposing ``investor``, ``investment_stages``,
            ``team_members`` and ``sectors``.

    Returns:
        The same ``investor`` object after a successful commit.

    Raises:
        Exception: Any error raised while merging or committing is re-raised
            after the session has been rolled back, so the session stays
            usable for the next investor.
    """
    try:
        # --- Core investor info ---
        for field in _CORE_FIELDS:
            value = getattr(data.investor, field)
            if value:
                setattr(investor, field, value)

        # --- Investment Stages ---
        if data.investment_stages:
            _merge_investment_stages(session, investor, data.investment_stages)

        # --- Team Members ---
        if data.team_members:
            _merge_team_members(session, investor, data.team_members)

        # --- Sectors ---
        if data.sectors:
            _merge_sectors(session, investor, data.sectors)

        # --- Portfolio Companies ---
        # if data.portfolio_companies:
        #     for company_data in data.portfolio_companies:
        #         if not company_data.name:
        #             continue
        #         # Check if company already exists
        #         existing_company = (
        #             session.query(CompanyTable).filter_by(name=company_data.name).first()
        #         )
        #         if not existing_company:
        #             existing_company = CompanyTable(
        #                 name=company_data.name,
        #                 industry=company_data.industry,
        #                 location=company_data.location,
        #                 description=company_data.description,
        #                 founded_year=company_data.founded_year,
        #                 website=company_data.website,
        #             )
        #             session.add(existing_company)
        #             session.flush()  # Get the ID
        #         # Add relationship if not already exists
        #         if existing_company not in investor.portfolio_companies:
        #             investor.portfolio_companies.append(existing_company)

        session.add(investor)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until it is rolled back.
        session.rollback()
        raise
    return investor
# ------------------------------------------------------------------
# Main
# ------------------------------------------------------------------
async def main(limit: Optional[int] = 10):
    """Enrich investors that have missing fields and persist the results.

    Loads every investor, keeps those flagged by ``needs_enrichment`` and, for
    up to ``limit`` of them, runs the agent, appends the structured result to
    ``enriched_investors.json`` and writes it to the database. A failure on
    one investor is logged and does not stop the run.

    Args:
        limit: Maximum number of investors to process in this run; ``None``
            processes all of them. Defaults to 10.
    """
    qp = QueryProcessor(sql_session=session)
    all_investors = qp.sql_session.query(InvestorTable).all() if qp.sql_session else []
    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    # print(
    #     f"Found {len(investors_to_enrich)} investors that need enrichment out of {len(all_investors)} total"
    # )
    for inv in investors_to_enrich[:limit]:
        try:
            print(f"\n🔄 Processing investor: {inv.name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", f"{prompt}")]})
            extracted = ai_response["structured_response"]
            # Save JSON backup; explicit UTF-8 so non-ASCII names do not depend
            # on the platform's default encoding.
            with open("enriched_investors.json", "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv.name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")
            # Update database
            update_investor(session, inv, extracted)
            print(f"✅ Updated investor {inv.name} (id={inv.id})")
        except Exception as e:
            # Roll back first: after a failed flush/commit the shared session
            # rejects further work, which would break every later investor.
            session.rollback()
            logger.exception(
                "Failed to enrich investor %s: %s", getattr(inv, "id", None), e
            )
            continue


if __name__ == "__main__":
    asyncio.run(main())