350 lines
14 KiB
Python
350 lines
14 KiB
Python
import asyncio
|
||
import logging
|
||
import os
|
||
from typing import Optional
|
||
|
||
from crawl4ai import AsyncWebCrawler
|
||
from web_crawler_schemas import InvestorDataScrape
|
||
from ddgs import DDGS
|
||
from dotenv import load_dotenv
|
||
from langchain_openai import ChatOpenAI
|
||
from langgraph.prebuilt import create_react_agent
|
||
from models import (
|
||
CompanyTable,
|
||
InvestmentStageTable,
|
||
InvestorMember,
|
||
InvestorTable,
|
||
SectorTable,
|
||
engine,
|
||
)
|
||
from sqlalchemy.orm import sessionmaker
|
||
|
||
# SQLAlchemy session factory bound to the project's engine, plus the single
# session object that the rest of this module passes around.
Session = sessionmaker(bind=engine)
session = Session()

# ------------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------------
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("web_search_agent")

# ------------------------------------------------------------------
# Environment
# ------------------------------------------------------------------
# Load variables from a .env file (python-dotenv) before reading the key.
load_dotenv()
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Only warn here: the key is not needed until an LLM call is actually made.
if OPENROUTER_API_KEY is None or OPENROUTER_API_KEY == "":
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")
class QueryProcessor:
    """Owns the LLM, the ReAct agent and the two tools used to enrich investors.

    The agent is built with ``create_react_agent`` and is given ``crawl`` and
    ``web_search`` as tools; its structured answer uses the
    ``InvestorDataScrape`` schema.
    """

    # Sentinel shown to the model (and recognised by ``crawl``) when an
    # investor has no stored website.
    _NO_WEBSITE = "No Website"
    # Upper bound on crawled page text handed back to the model.
    _MAX_CRAWL_CHARS = 5000

    def __init__(self, sql_session: Optional[object] = None):
        """Create the LLM client, the agent and the search client.

        Args:
            sql_session: Database session kept on the instance for callers
                that want to query through the processor; may be ``None``.
        """
        self.sql_session = sql_session

        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )

        self.ddg_search = DDGS()

    @staticmethod
    def _value_or_default(value, default):
        """Return *value*, or *default* when it is ``None`` or a blank string.

        Falsy-but-meaningful values such as ``0`` are kept as they are.
        """
        if value is None:
            return default
        if isinstance(value, str) and not value.strip():
            return default
        return value

    async def fill_investor(self, investor: InvestorTable):
        """Build the enrichment prompt for a single investor row.

        Args:
            investor: Mapped row whose ``__table__.columns`` are read to
                collect the currently stored values.

        Returns:
            The prompt text to send to the agent as the user message.
        """
        inv_dict = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }

        # Every column is present in ``inv_dict`` (with ``None`` for NULLs), so
        # ``dict.get(key, default)`` alone would never fall back; substitute
        # the placeholder explicitly for missing/blank values.
        def stored(key, default):
            return self._value_or_default(inv_dict.get(key), default)

        website = stored("website", self._NO_WEBSITE)
        name = stored("name", "Unknown")
        description = stored("description", "No description")
        aum = stored("aum", "Unknown")
        check_size_lower = stored("check_size_lower", "Unknown")
        check_size_upper = stored("check_size_upper", "Unknown")
        geographic_focus = stored("geographic_focus", "Unknown")
        number_of_investments = stored("number_of_investments", "Unknown")

        prompt = f"""
        You are a crawler agent. You will be provided with information about a venture capital investor and their website.
        Your task is to navigate the website to find and enrich the existing information.
        If the website is not available, use the `web_search` tool to google the name of the investor company.
        Use the `crawl` tool to visit web pages and extract information.

        Current investor information:
        - Name: {name}
        - Website: {website}
        - Description: {description}
        - Assets Under Management: {aum}
        - Check Size Lower: {check_size_lower}
        - Check Size Upper: {check_size_upper}
        - Geographic Focus: {geographic_focus}
        - Number of Investments: {number_of_investments}

        IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
        - "Seed to Series A" = [SEED, SERIES_A]
        - "Early stage" = [SEED, SERIES_A]
        - "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
        - "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
        - "Late stage" = [GROWTH, LATE_STAGE]
        - "Series A and B" = [SERIES_A, SERIES_B]

        IMPORTANT: Additional guidance for AUM and Check Size
        - "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
        - "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
        - If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
        - Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
        - Example: "fund size €200M", "typical tickets $1–5M", "raised £1 billion".

        Follow these steps:
        1. Use the `crawl` tool with the main website URL to get the initial content.
        2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
        3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
        4. If AUM or check size are still missing, immediately perform 1–2 `web_search` queries such as:
           - "{name} fund size site:techcrunch.com"
           - "{name} ticket size site:eu-startups.com"
           - "{name} raises fund site:prnewswire.com"
        5. Continue this process, exploring relevant pages, until you have gathered all the required information.
        6. Extract and update the following information:
           - investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
           - team_members: List of key members with name, role, and email/LinkedIn
           - sectors: List of investment sectors they focus on
           - investment_stages: List of ALL investment stages they focus on (can be multiple!)
        7. If any information is not available or cannot be improved, leave it as null or use existing data.

        Stop crawling/searching once you have found the missing information or confirmed it is not available online.

        Website: {website}
        """

        return prompt

    # NOTE(review): the tool docstrings below are kept verbatim because the
    # agent framework presumably forwards them to the model as the tool
    # descriptions -- confirm before rewording them.
    async def crawl(self, url: str):
        """Tool to search the web using a web crawler. given the url"""
        print(f"🕷️ Crawling: {url}")
        try:
            # Missing, blank or placeholder URL: steer the agent to search.
            if not url or not url.strip() or url == self._NO_WEBSITE:
                return "No website provided for this investor. Please use web_search to find information."

            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(url)

            content = results.markdown
            if not content:
                # Avoid slicing ``None``/empty output; tell the agent plainly.
                return "Failed to crawl website: no readable content returned. Please try web_search instead."
            return content[: self._MAX_CRAWL_CHARS]  # Limit content to avoid token limits
        except Exception as e:
            # Best effort: report the failure to the agent instead of raising.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Tool to search the web using google"""
        print(f"🔍 Searching: {query}")
        try:
            hits = self.ddg_search.text(query, max_results=10, backend="google")
            # Keep only the fields the model needs, under stable key names.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", ""),
                }
                for hit in hits
            ]
        except Exception as e:
            # Best effort: report the failure to the agent instead of raising.
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"
def needs_enrichment(investor: InvestorTable) -> bool:
    """Report whether any enrichable field of *investor* is still empty.

    A field counts as missing when its value is falsy. When at least one is
    missing, the list of missing field labels is printed and ``True`` is
    returned; otherwise ``False``.
    """
    # (label, is_missing) pairs, evaluated in the order they are reported.
    checks = (
        ("description", not investor.description),
        ("aum", not investor.aum),
        (
            "check_size",
            not investor.check_size_lower or not investor.check_size_upper,
        ),
        ("geographic_focus", not investor.geographic_focus),
        ("investment_stages", not investor.investment_stages),
        ("team_members", not investor.team_members),
    )
    gaps = [label for label, is_missing in checks if is_missing]

    if not gaps:
        return False

    print(f"Investor {investor.name} missing: {', '.join(gaps)}")
    return True
def _copy_core_fields(investor, scraped) -> None:
    """Copy each truthy core field from *scraped* onto *investor*."""
    for field in (
        "description",
        "aum",
        "check_size_lower",
        "check_size_upper",
        "geographic_focus",
        "number_of_investments",
    ):
        value = getattr(scraped, field)
        if value:
            setattr(investor, field, value)


def _merge_stages(session, investor, stages) -> None:
    """Link *investor* to every scraped stage it is not linked to yet."""
    known = {stage.stage for stage in investor.investment_stages}
    for stage_data in stages:
        if stage_data.stage in known:
            continue

        # Reuse the stage row when one already exists in the database.
        stage_row = (
            session.query(InvestmentStageTable)
            .filter_by(stage=stage_data.stage)
            .first()
        )
        if stage_row is None:
            stage_row = InvestmentStageTable(stage=stage_data.stage)
            session.add(stage_row)
            session.flush()  # Get the ID

        investor.investment_stages.append(stage_row)
        # Remember it so a stage repeated in the scrape is not linked twice.
        known.add(stage_data.stage)


def _merge_team_members(session, investor, members) -> None:
    """Update existing team members by name and create the missing ones."""
    # Index by stripped, lower-cased name -- the same normalisation is used
    # for the lookup below, so padded stored names still match.
    by_name = {m.name.strip().lower(): m for m in investor.team_members if m.name}

    for scraped in members:
        clean_name = scraped.name.strip() if scraped.name else ""
        if not clean_name:
            continue
        key = clean_name.lower()

        existing = by_name.get(key)
        if existing is not None:
            # Only overwrite with values the scrape actually provided.
            if scraped.role:
                existing.role = scraped.role
            if scraped.email:
                existing.email = scraped.email
            continue

        created = InvestorMember(
            name=clean_name,
            role=scraped.role,
            email=scraped.email,
            investor=investor,
        )
        session.add(created)
        # Track it so the same person listed twice yields a single row.
        by_name[key] = created


def _merge_sectors(session, investor, sectors) -> None:
    """Link *investor* to every scraped sector, creating rows as needed."""
    for sector_data in sectors:
        sector_name = sector_data.name.strip() if sector_data.name else ""
        if not sector_name:
            continue

        sector_row = session.query(SectorTable).filter_by(name=sector_name).first()
        if sector_row is None:
            sector_row = SectorTable(name=sector_name)
            session.add(sector_row)
            session.flush()  # Get the ID

        if sector_row not in investor.sectors:
            investor.sectors.append(sector_row)


def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Persist scraped data onto an investor row and commit it.

    Core fields are overwritten only by truthy scraped values; investment
    stages, team members and sectors are merged into the existing
    relationships without creating duplicates.

    Args:
        session: Database session used for lookups, flushes and the commit.
        investor: The row to update.
        data: Scraped payload with ``investor``, ``investment_stages``,
            ``team_members`` and ``sectors`` attributes.

    Returns:
        The same ``investor`` object, after a successful commit.

    Raises:
        Exception: Any error raised while merging or committing is re-raised
            after the session has been rolled back.
    """
    try:
        _copy_core_fields(investor, data.investor)

        if data.investment_stages:
            _merge_stages(session, investor, data.investment_stages)
        if data.team_members:
            _merge_team_members(session, investor, data.team_members)
        if data.sectors:
            _merge_sectors(session, investor, data.sectors)

        # NOTE: portfolio companies (``data.portfolio_companies``) are not
        # persisted; the earlier implementation of that step was disabled.

        session.add(investor)
        session.commit()
    except Exception:
        # Leave the session usable for the caller's next investor.
        session.rollback()
        raise
    return investor
# ------------------------------------------------------------------
# Main
# ------------------------------------------------------------------
async def main(limit: int = 10):
    """Enrich up to *limit* investors that still have missing fields.

    For each selected investor the agent is invoked with the generated
    prompt, its structured response is appended to
    ``enriched_investors.json`` as a backup, and the row is updated in the
    database. A failure on one investor is logged and the loop continues.

    Args:
        limit: Maximum number of investors to process in this run.
    """
    qp = QueryProcessor(sql_session=session)
    all_investors = qp.sql_session.query(InvestorTable).all() if qp.sql_session else []

    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    logger.info(
        "Found %d investors that need enrichment out of %d total",
        len(investors_to_enrich),
        len(all_investors),
    )

    for inv in investors_to_enrich[:limit]:
        # Captured up front so the error handler never has to touch the row
        # (and therefore the database) again.
        inv_id = None
        try:
            inv_id, inv_name = inv.id, inv.name
            print(f"\n🔄 Processing investor: {inv_name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", prompt)]})
            extracted = ai_response["structured_response"]

            # Save JSON backup. UTF-8 is set explicitly because the dumped
            # JSON may contain non-ASCII text and the platform default
            # encoding may not be able to represent it.
            with open("enriched_investors.json", "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv_name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")

            # Update database
            update_investor(session, inv, extracted)

            print(f"✅ Updated investor {inv_name} (id={inv_id})")

        except Exception as e:
            # Reset the shared session so the next investor starts clean,
            # then log with the traceback and move on.
            session.rollback()
            logger.exception("Failed to enrich investor %s: %s", inv_id, e)
            continue


if __name__ == "__main__":
    asyncio.run(main())