350 lines
14 KiB
Python
350 lines
14 KiB
Python
|
|
import asyncio
|
|||
|
|
import logging
|
|||
|
|
import os
|
|||
|
|
from typing import Optional
|
|||
|
|
|
|||
|
|
from crawl4ai import AsyncWebCrawler
|
|||
|
|
from web_crawler_schemas import InvestorDataScrape
|
|||
|
|
from ddgs import DDGS
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
from langchain_openai import ChatOpenAI
|
|||
|
|
from langgraph.prebuilt import create_react_agent
|
|||
|
|
from models import (
|
|||
|
|
CompanyTable,
|
|||
|
|
InvestmentStageTable,
|
|||
|
|
InvestorMember,
|
|||
|
|
InvestorTable,
|
|||
|
|
SectorTable,
|
|||
|
|
engine,
|
|||
|
|
)
|
|||
|
|
from sqlalchemy.orm import sessionmaker
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
# Database session
# ------------------------------------------------------------------
# One module-wide session bound to the project's engine; created at import time.
Session = sessionmaker(bind=engine)
session = Session()

# ------------------------------------------------------------------
# Logging setup
# ------------------------------------------------------------------
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("web_search_agent")

# ------------------------------------------------------------------
# Environment
# ------------------------------------------------------------------
# Load variables from a .env file (if any) before reading the API key.
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Only warn here: the module can still be imported without the key.
if not OPENROUTER_API_KEY:
    logger.warning("OPENROUTER_API_KEY not set. LLM calls will fail if invoked.")
|
|||
|
|
|
|||
|
|
|
|||
|
|
class QueryProcessor:
    """Build enrichment prompts and expose crawl/search tools to a ReAct agent.

    ``__init__`` sets the following attributes:

    - ``sql_session``: stored exactly as passed in (may be ``None``).
    - ``ddg_search``: the ``DDGS`` client used by :meth:`web_search`.
    - ``llm``: a ``ChatOpenAI`` client pointed at the OpenRouter endpoint.
    - ``agent``: the agent built by ``create_react_agent`` with the ``crawl``
      and ``web_search`` tools and ``InvestorDataScrape`` as response format.
    """

    # Upper bound on crawled markdown returned to the agent (token budget).
    MAX_CRAWL_CHARS = 5000

    # URL values (compared lower-cased) that mean "this investor has no website".
    _MISSING_URL_MARKERS = frozenset({"no website", "none", "null"})

    def __init__(self, sql_session: Optional[object] = None):
        self.sql_session = sql_session

        # Search client used by the `web_search` tool.
        self.ddg_search = DDGS()

        self.llm = ChatOpenAI(
            api_key=OPENROUTER_API_KEY,
            base_url="https://openrouter.ai/api/v1",
            model="openai/gpt-5-nano",
            temperature=0,
        )
        self.agent = create_react_agent(
            model=self.llm,
            tools=[self.crawl, self.web_search],
            response_format=InvestorDataScrape,
        )

    @staticmethod
    def _or_default(value, default):
        """Return ``default`` when ``value`` is ``None`` or a blank string."""
        if value is None:
            return default
        if isinstance(value, str) and not value.strip():
            return default
        return value

    async def fill_investor(self, investor: InvestorTable) -> str:
        """Build the agent prompt describing one investor row.

        The method is ``async`` only so existing callers can keep awaiting it;
        it performs no I/O itself.

        Args:
            investor: Mapped row whose column values are embedded in the prompt.

        Returns:
            The prompt text to send to the agent as the user message.
        """
        inv_dict = {
            col.name: getattr(investor, col.name) for col in investor.__table__.columns
        }

        # Every column is a key of inv_dict, so dict.get's default would never
        # apply; NULL columns must be mapped to their placeholder explicitly,
        # otherwise the prompt would contain the literal text "None".
        website = self._or_default(inv_dict.get("website"), "No Website")
        name = self._or_default(inv_dict.get("name"), "Unknown")
        description = self._or_default(inv_dict.get("description"), "No description")
        aum = self._or_default(inv_dict.get("aum"), "Unknown")
        check_size_lower = self._or_default(inv_dict.get("check_size_lower"), "Unknown")
        check_size_upper = self._or_default(inv_dict.get("check_size_upper"), "Unknown")
        geographic_focus = self._or_default(inv_dict.get("geographic_focus"), "Unknown")
        number_of_investments = self._or_default(
            inv_dict.get("number_of_investments"), "Unknown"
        )

        logger.debug("Building prompt for %s (website=%s)", name, website)

        prompt = f"""
        You are a crawler agent. You will be provided with information about a venture capital investor and their website.
        Your task is to navigate the website to find and enrich the existing information.
        If the website is not available, use the `web_search` tool to google the name of the investor company.
        Use the `crawl` tool to visit web pages and extract information.

        Current investor information:
        - Name: {name}
        - Website: {website}
        - Description: {description}
        - Assets Under Management: {aum}
        - Check Size Lower: {check_size_lower}
        - Check Size Upper: {check_size_upper}
        - Geographic Focus: {geographic_focus}
        - Number of Investments: {number_of_investments}

        IMPORTANT: Investment Stages - Investors often focus on MULTIPLE stages. Look for:
        - "Seed to Series A" = [SEED, SERIES_A]
        - "Early stage" = [SEED, SERIES_A]
        - "Growth stage" = [SERIES_B, SERIES_C, GROWTH]
        - "Multi-stage" = [SEED, SERIES_A, SERIES_B, SERIES_C]
        - "Late stage" = [GROWTH, LATE_STAGE]
        - "Series A and B" = [SERIES_A, SERIES_B]

        IMPORTANT: Additional guidance for AUM and Check Size
        - "Check size" may also be written as "ticket size", "investment size", "typical investment range", or "investment amount".
        - "Assets under management (AUM)" may also be called "fund size", "capital under management", or "fund raised".
        - If not on the official website, search news and databases like Crunchbase, PitchBook, Dealroom, TechCrunch, PRNewswire, or EU-Startups.
        - Look for numbers with currency symbols (€,$,£) followed by "M", "B", "million", or "billion".
        - Example: "fund size €200M", "typical tickets $1–5M", "raised £1 billion".

        Follow these steps:
        1. Use the `crawl` tool with the main website URL to get the initial content.
        2. Analyze the returned content. Look for links or sections related to the information you need (About, Team, Portfolio, Investments, Funds).
        3. If you find a relevant URL, call the `crawl` tool again with that new URL to get more detailed information.
        4. If AUM or check size are still missing, immediately perform 1–2 `web_search` queries such as:
        - "{name} fund size site:techcrunch.com"
        - "{name} ticket size site:eu-startups.com"
        - "{name} raises fund site:prnewswire.com"
        5. Continue this process, exploring relevant pages, until you have gathered all the required information.
        6. Extract and update the following information:
        - investor: Core investor data (name, description, aum, check_size_lower, check_size_upper, geographic_focus, number_of_investments)
        - team_members: List of key members with name, role, and email/LinkedIn
        - sectors: List of investment sectors they focus on
        - investment_stages: List of ALL investment stages they focus on (can be multiple!)
        7. If any information is not available or cannot be improved, leave it as null or use existing data.

        Stop crawling/searching once you have found the missing information or confirmed it is not available online.

        Website: {website}
        """

        return prompt

    # NOTE(review): the docstrings of `crawl` and `web_search` are presumably
    # surfaced to the LLM as tool descriptions by the agent framework, so they
    # are kept short and task-oriented — confirm before expanding them.

    async def crawl(self, url: str) -> str:
        """Tool to crawl a web page, given its URL, and return the page content as markdown."""
        print(f"🕷️ Crawling: {url}")
        try:
            target = (url or "").strip()
            if not target or target.lower() in self._MISSING_URL_MARKERS:
                return "No website provided for this investor. Please use web_search to find information."

            async with AsyncWebCrawler() as crawler:
                results = await crawler.arun(target)
                content = results.markdown
                if not content:
                    # Slicing None would raise; give the agent a usable hint instead.
                    return f"No content could be extracted from {target}. Please try web_search instead."
                # Limit content to avoid token limits
                return content[: self.MAX_CRAWL_CHARS]
        except Exception as e:
            # Tool boundary: report the failure to the agent as text instead of raising.
            print(f"❌ Failed to crawl {url}: {e}")
            return f"Failed to crawl website: {e}. Please try web_search instead."

    def web_search(self, query: str):
        """Tool to search the web using google. Returns a list of results with title, url and snippet."""
        print(f"🔍 Searching: {query}")
        try:
            hits = self.ddg_search.text(query, max_results=10, backend="google")
            # Keep only the fields the LLM needs, under stable key names.
            return [
                {
                    "title": hit.get("title", ""),
                    "url": hit.get("href", ""),
                    "snippet": hit.get("body", ""),
                }
                for hit in hits
            ]
        except Exception as e:
            # Tool boundary: report the failure to the agent as text instead of raising.
            print(f"❌ Search failed: {e}")
            return f"Search failed: {e}"
|
|||
|
|
|
|||
|
|
|
|||
|
|
def needs_enrichment(investor: InvestorTable) -> bool:
    """Report whether any enrichable field of ``investor`` is still empty.

    A field counts as missing when its value is falsy. When at least one is
    missing, the list of missing field labels is printed and ``True`` is
    returned; otherwise ``False`` is returned and nothing is printed.
    """
    # Label -> current value, in the order the fields are reported.
    # "check_size" is present only when both bounds are set.
    field_values = {
        "description": investor.description,
        "aum": investor.aum,
        "check_size": investor.check_size_lower and investor.check_size_upper,
        "geographic_focus": investor.geographic_focus,
        "investment_stages": investor.investment_stages,
        "team_members": investor.team_members,
    }
    absent = [label for label, value in field_values.items() if not value]

    if not absent:
        return False

    print(f"Investor {investor.name} missing: {', '.join(absent)}")
    return True
|
|||
|
|
|
|||
|
|
|
|||
|
|
def update_investor(session, investor: InvestorTable, data: InvestorDataScrape):
    """Merge scraped data into an investor row and commit it.

    Core scalar fields are overwritten only when the scraped value is truthy.
    Investment stages, team members and sectors are merged into the existing
    relationships without creating duplicates, including duplicates that occur
    within ``data`` itself.

    Args:
        session: Session used for lookups, flushes and the final commit.
        investor: Row to update in place.
        data: Structured scrape result with ``investor``, ``investment_stages``,
            ``team_members`` and ``sectors`` attributes.

    Returns:
        The same ``investor`` instance, after a successful commit.

    Raises:
        Exception: Whatever the merge or commit raised. The session is rolled
            back first so callers can keep using it.
    """
    try:
        _apply_core_fields(investor, data.investor)
        _merge_investment_stages(session, investor, data.investment_stages)
        _merge_team_members(session, investor, data.team_members)
        _merge_sectors(session, investor, data.sectors)

        # --- Portfolio Companies ---
        # if data.portfolio_companies:
        #     for company_data in data.portfolio_companies:
        #         if not company_data.name:
        #             continue
        #
        #         # Check if company already exists
        #         existing_company = (
        #             session.query(CompanyTable).filter_by(name=company_data.name).first()
        #         )
        #         if not existing_company:
        #             existing_company = CompanyTable(
        #                 name=company_data.name,
        #                 industry=company_data.industry,
        #                 location=company_data.location,
        #                 description=company_data.description,
        #                 founded_year=company_data.founded_year,
        #                 website=company_data.website,
        #             )
        #             session.add(existing_company)
        #             session.flush()  # Get the ID
        #
        #         # Add relationship if not already exists
        #         if existing_company not in investor.portfolio_companies:
        #             investor.portfolio_companies.append(existing_company)

        session.add(investor)
        session.commit()
    except Exception:
        # A failed flush/commit leaves the session unusable until rolled back.
        session.rollback()
        raise
    return investor


# Scalar columns copied from the scrape when a truthy value was found.
_CORE_INVESTOR_FIELDS = (
    "description",
    "aum",
    "check_size_lower",
    "check_size_upper",
    "geographic_focus",
    "number_of_investments",
)


def _apply_core_fields(investor, scraped):
    """Copy each truthy core field from ``scraped`` onto ``investor``."""
    for field in _CORE_INVESTOR_FIELDS:
        value = getattr(scraped, field)
        if value:
            setattr(investor, field, value)


def _merge_investment_stages(session, investor, stages):
    """Link ``investor`` to every scraped stage it is not linked to yet."""
    if not stages:
        return

    linked = {stage.stage for stage in investor.investment_stages}
    for stage_data in stages:
        if stage_data.stage in linked:
            continue

        # Reuse the stage row if it already exists in the database.
        stage_row = (
            session.query(InvestmentStageTable)
            .filter_by(stage=stage_data.stage)
            .first()
        )
        if not stage_row:
            stage_row = InvestmentStageTable(stage=stage_data.stage)
            session.add(stage_row)
            session.flush()  # Get the ID

        investor.investment_stages.append(stage_row)
        # Remember it so a repeated stage in the same scrape is not linked twice.
        linked.add(stage_data.stage)


def _merge_team_members(session, investor, members):
    """Update existing team members by name and add the new ones."""
    if not members:
        return

    # Index with the same normalisation used for lookups (strip + lower).
    by_name = {m.name.strip().lower(): m for m in investor.team_members if m.name}

    for scraped in members:
        if not scraped.name:
            continue
        key = scraped.name.strip().lower()
        if not key:
            # Whitespace-only name: nothing meaningful to store.
            continue

        member_obj = by_name.get(key)
        if member_obj is not None:
            # Update existing member, keeping old values where the scrape has none.
            if scraped.role:
                member_obj.role = scraped.role
            if scraped.email:
                member_obj.email = scraped.email
            continue

        member_obj = InvestorMember(
            name=scraped.name.strip(),
            role=scraped.role,
            email=scraped.email,
            investor=investor,
        )
        session.add(member_obj)
        # Register it so a repeated name in the same scrape updates this member.
        by_name[key] = member_obj


def _merge_sectors(session, investor, sectors):
    """Link ``investor`` to each named sector, creating sector rows as needed."""
    if not sectors:
        return

    for sector_data in sectors:
        if not sector_data.name:
            continue

        sector_row = session.query(SectorTable).filter_by(name=sector_data.name).first()
        if not sector_row:
            sector_row = SectorTable(name=sector_data.name)
            session.add(sector_row)
            session.flush()  # Get the ID

        # Add relationship if not already exists
        if sector_row not in investor.sectors:
            investor.sectors.append(sector_row)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
# Main
|
|||
|
|
# ------------------------------------------------------------------
|
|||
|
|
async def main(limit: Optional[int] = 10):
    """Enrich investors that have missing fields, one at a time.

    For each selected investor the agent is invoked with the generated prompt,
    the structured response is appended to ``enriched_investors.json`` as a
    backup, and the database row is updated. A failure on one investor is
    logged and does not stop the remaining ones.

    Args:
        limit: Maximum number of investors to process in this run. ``None``
            processes every investor that needs enrichment. Defaults to 10.
    """
    qp = QueryProcessor(sql_session=session)
    db_session = qp.sql_session
    all_investors = db_session.query(InvestorTable).all() if db_session else []

    # Filter investors that need enrichment
    investors_to_enrich = [inv for inv in all_investors if needs_enrichment(inv)]
    logger.info(
        "Found %d investors that need enrichment out of %d total",
        len(investors_to_enrich),
        len(all_investors),
    )

    for inv in investors_to_enrich[:limit]:
        inv_id = None
        try:
            # Read identifiers up front so logging never has to touch the row
            # while the session is in a failed state.
            inv_id, inv_name = inv.id, inv.name

            print(f"\n🔄 Processing investor: {inv_name}")
            prompt = await qp.fill_investor(inv)
            ai_response = await qp.agent.ainvoke({"messages": [("user", prompt)]})
            extracted = ai_response["structured_response"]

            # Save JSON backup (explicit UTF-8: scraped text is not ASCII-only)
            with open("enriched_investors.json", "a", encoding="utf-8") as f:
                f.write(f"# Investor: {inv_name}\n")
                f.write(extracted.model_dump_json(indent=2) + "\n\n")

            # Update database
            update_investor(db_session, inv, extracted)

            print(f"✅ Updated investor {inv_name} (id={inv_id})")

        except Exception:
            logger.exception("Failed to enrich investor %s", inv_id)
            # Discard the failed transaction so the shared session stays
            # usable for the next investor.
            db_session.rollback()
            continue
|
|||
|
|
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: run the enrichment coroutine to completion.
    asyncio.run(main())
|