# File: app/linkedin_scraper.py
"""
LinkedIn Profile Scraper for Investor Members

This module uses crawl4ai to scrape team pages and find LinkedIn profiles.
Strategies:
1. Crawl the source_url (team pages) to extract LinkedIn profile links
2. Use LLM-powered web search to find LinkedIn profiles by name

Key advantages of crawl4ai:
- Handles JavaScript-rendered pages
- Better at extracting content from modern websites
- More reliable than simple requests
"""

import asyncio
import logging
import os
import re
from typing import Dict, List, Optional, Tuple

from crawl4ai import AsyncWebCrawler
from ddgs import DDGS
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("linkedin_scraper")

load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# LinkedIn profile URL; group 1 is the profile slug. Accepts an optional
# "www." and/or a 2-3 letter country sub-domain (e.g. "uk.linkedin.com").
_LINKEDIN_PROFILE_RE = re.compile(
    r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?",
    re.IGNORECASE,
)

# Scheme + host prefix of any LinkedIn URL (bare, "www." or country-specific).
_LINKEDIN_HOST_RE = re.compile(
    r"^https?://(?:[a-z]{2,3}\.)*linkedin\.com", re.IGNORECASE
)

_HTML_TAG_RE = re.compile(r"<[^>]+>")


class LinkedInProfileScraper:
    """
    LinkedIn profile finder using crawl4ai and web search.

    Strategies:
    1. Crawl source URLs (team pages) to extract LinkedIn links
    2. Use web search (DuckDuckGo) to find profiles by name
    """

    # A candidate must reach this score to be accepted as a match.
    MIN_MATCH_SCORE = 30
    # Added to the name score when the member's role appears near the link.
    ROLE_BONUS = 10
    # Search hits are never reported with more confidence than this.
    SEARCH_CONFIDENCE_CAP = 90
    # Characters of surrounding text kept on each side of a link.
    CONTEXT_CHARS = 200

    def __init__(
        self,
        rate_limit_delay: float = 0.5,
        use_cache: bool = True,
        use_llm_search: bool = True,
    ):
        """
        Initialize the scraper

        Args:
            rate_limit_delay: Delay between requests in seconds
            use_cache: Whether to cache crawled pages
            use_llm_search: Whether to use web search as fallback (only
                honoured when OPENROUTER_API_KEY is set)
        """
        self.rate_limit_delay = rate_limit_delay
        self.use_cache = use_cache
        # bool() so the attribute is a real flag and never holds the API key.
        self.use_llm_search = bool(use_llm_search and OPENROUTER_API_KEY)
        self.page_cache: Dict[str, str] = {}  # Cache crawled pages by URL
        self.html_cache: Dict[str, str] = {}  # Cache HTML separately
        self.profile_cache: Dict[str, Dict] = {}  # Cache results by member

        # Always define these so later attribute access cannot fail.
        self.llm = None
        self.agent = None
        self.ddg_search = None

        if self.use_llm_search:
            self._init_llm_agent()
        else:
            logger.info("LLM search disabled (no OPENROUTER_API_KEY)")

    def _init_llm_agent(self):
        """Initialize the LLM client and the DuckDuckGo search client."""
        try:
            self.llm = ChatOpenAI(
                api_key=OPENROUTER_API_KEY,
                base_url="https://openrouter.ai/api/v1",
                model="x-ai/grok-4.1-fast:free",
                temperature=0,
            )
            self.ddg_search = DDGS()
            logger.info("LLM search agent initialized")
        except Exception as e:
            logger.error("Failed to initialize LLM agent: %s", e)
            self.llm = None
            self.ddg_search = None
            # Without a search client the fallback cannot run; reflect that in
            # the flag so callers and log output do not claim it is enabled.
            self.use_llm_search = False

    def web_search(self, query: str) -> List[Dict]:
        """Search the web using DuckDuckGo; returns [] on any failure."""
        if not self.ddg_search:
            return []
        try:
            return list(self.ddg_search.text(query, max_results=10))
        except Exception as e:
            logger.error("Web search error: %s", e)
            return []

    async def crawl_page(self, url: str) -> Optional[str]:
        """
        Crawl a webpage and return its content.

        Args:
            url: URL to crawl

        Returns:
            Page content as markdown/text, or None if failed
        """
        if not url:
            return None

        # Check cache first
        if self.use_cache and url in self.page_cache:
            logger.debug("Using cached page for %s", url)
            return self.page_cache[url]

        try:
            logger.info("Crawling: %s", url)
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url)

                if result and result.markdown:
                    content = result.markdown
                    # Also keep the HTML for better link extraction. Coerce a
                    # missing/None attribute to "" so the cache only ever holds
                    # strings (a cached None would crash the regex scan later).
                    html_content = getattr(result, "html", "") or ""

                    if self.use_cache:
                        self.page_cache[url] = content
                        self.html_cache[url] = html_content

                    return content

        except Exception as e:
            logger.error("Error crawling %s: %s", url, e)

        return None

    def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
        """
        Extract all LinkedIn profile URLs from content (HTML or markdown).

        Returns:
            List of dicts with 'url', 'context', and 'username', de-duplicated
            by normalized URL (first occurrence wins, order preserved).
        """
        if not content:
            return []

        unique_links: List[Dict[str, str]] = []
        seen_urls = set()

        for match in _LINKEDIN_PROFILE_RE.finditer(content):
            url = self._normalize_linkedin_url(match.group(0))
            if url in seen_urls:
                continue
            seen_urls.add(url)

            # Surrounding text is what name/role matching is scored against.
            start = max(0, match.start() - self.CONTEXT_CHARS)
            end = min(len(content), match.end() + self.CONTEXT_CHARS)
            # Drop HTML tags and collapse whitespace for readability.
            context = " ".join(_HTML_TAG_RE.sub(" ", content[start:end]).split())

            unique_links.append(
                {"url": url, "context": context, "username": match.group(1)}
            )

        return unique_links

    def _normalize_linkedin_url(self, url: str) -> str:
        """Normalize LinkedIn URL to standard format (see format_linkedin_url)."""
        return format_linkedin_url(url)

    def _name_matches_context(self, name: str, context: str) -> float:
        """
        Check if a person's name appears in the context around a LinkedIn URL.

        Matching is by case-insensitive substring, so short name parts can
        match inside unrelated words.

        Returns:
            Confidence score 0-100
        """
        if not name or not context:
            return 0

        context_lower = context.lower()
        name_lower = name.lower()

        # Check for full name match
        if name_lower in context_lower:
            return 95

        # Split name into parts (handle multiple spaces, titles like "Dr.", etc.)
        name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 1]
        if not name_parts:
            return 0

        # Only parts longer than two characters count as evidence.
        matches = sum(
            1 for part in name_parts if len(part) > 2 and part in context_lower
        )

        if matches == len(name_parts):
            return 90  # All name parts found
        if matches >= 2:
            return 75  # At least 2 parts found (first + last typically)
        if matches == 1:
            # One part is more convincing when the name itself is short.
            return 50 if len(name_parts) <= 2 else 35
        return 0

    def _name_matches_username(self, name: str, username: str) -> float:
        """
        Check if LinkedIn username contains parts of the name.

        Returns:
            Confidence score 0-100
        """
        if not name or not username:
            return 0

        username_lower = username.lower().replace("-", " ").replace("_", " ")
        name_parts = [
            p for p in name.lower().replace(".", " ").split() if len(p) > 2
        ]
        if not name_parts:
            return 0

        matches = sum(1 for part in name_parts if part in username_lower)

        if matches == len(name_parts) and len(name_parts) >= 2:
            return 85  # Full name in username
        if matches >= 2:
            return 70  # Multiple parts match
        if matches == 1:
            return 35  # Only one part matches
        return 0

    @staticmethod
    def _empty_result(method: str, notes: str) -> Dict:
        """Build a 'nothing found' result dict for the given method."""
        return {
            "linkedin_url": None,
            "confidence": 0,
            "method": method,
            "notes": notes,
        }

    def _select_best_link(
        self, name: str, role: Optional[str], links: List[Dict[str, str]]
    ) -> Tuple[Optional[Dict[str, str]], float]:
        """
        Score every extracted link against a person and return the best one.

        The score is the better of the context and username scores, plus
        ROLE_BONUS when the role text appears in the link's context. Ties keep
        the earliest link.

        Returns:
            (best_link, best_score); best_link is None when nothing scored > 0
        """
        best_link = None
        best_score = 0
        role_lower = role.lower() if role else None

        for link in links:
            context_score = self._name_matches_context(name, link["context"])
            username_score = self._name_matches_username(name, link["username"])
            role_bonus = (
                self.ROLE_BONUS
                if role_lower and role_lower in link["context"].lower()
                else 0
            )
            total_score = max(context_score, username_score) + role_bonus

            logger.debug(
                " %s -> %s: context=%s, username=%s, role=%s, total=%s",
                name,
                link["url"],
                context_score,
                username_score,
                role_bonus,
                total_score,
            )

            if total_score > best_score:
                best_score = total_score
                best_link = link

        return best_link, best_score

    async def find_linkedin_from_source(
        self, name: str, source_url: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile by crawling the source URL (team page).

        Args:
            name: Person's name
            source_url: URL of the team/about page
            role: Person's role (for additional context matching)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        if not source_url:
            return self._empty_result("source_crawl", "No source URL provided")

        content = await self.crawl_page(source_url)
        if not content:
            return self._empty_result("source_crawl", f"Failed to crawl {source_url}")

        # Prefer the raw HTML (links survive intact); fall back to markdown.
        html = self.html_cache.get(source_url) or content
        linkedin_links = self.extract_linkedin_urls_from_content(html)
        if not linkedin_links:
            linkedin_links = self.extract_linkedin_urls_from_content(content)

        if not linkedin_links:
            return self._empty_result(
                "source_crawl", f"No LinkedIn URLs found on {source_url}"
            )

        best_match, best_score = self._select_best_link(name, role, linkedin_links)

        if best_match and best_score >= self.MIN_MATCH_SCORE:
            return {
                "linkedin_url": best_match["url"],
                "confidence": min(best_score, 100),
                "method": "source_crawl",
                "notes": f"Found on {source_url}",
            }

        return self._empty_result(
            "source_crawl",
            f'No matching LinkedIn profile found for "{name}" on {source_url}',
        )

    async def find_linkedin_via_search(
        self, name: str, company: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile using web search.

        All search hits are scored and the highest-scoring profile is returned
        (ties keep the earlier, i.e. higher-ranked, hit).

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role (optional)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        if not self.ddg_search:
            return self._empty_result("web_search", "Web search not available")

        try:
            # Build search query - search for LinkedIn profile
            query = f"{name} {company} site:linkedin.com/in"
            if role:
                query = f"{name} {role} {company} site:linkedin.com/in"

            logger.debug("Searching: %s", query)

            best_url = None
            best_score = 0
            for hit in self.web_search(query):
                url = hit.get("href") or hit.get("link") or ""
                match = _LINKEDIN_PROFILE_RE.search(url)
                if not match:
                    continue

                # "or ''" guards against keys that are present but None.
                title = (hit.get("title") or "").lower()
                body = (hit.get("body") or "").lower()

                # Score based on name matching in title/body and username
                score = max(
                    self._name_matches_context(name, f"{title} {body}"),
                    self._name_matches_username(name, match.group(1)),
                )
                if score > best_score:
                    best_score = score
                    best_url = self._normalize_linkedin_url(match.group(0))

            if best_url and best_score >= self.MIN_MATCH_SCORE:
                return {
                    "linkedin_url": best_url,
                    # Cap at 90 for search results
                    "confidence": min(best_score, self.SEARCH_CONFIDENCE_CAP),
                    "method": "web_search",
                    "notes": "Found via web search",
                }

            return self._empty_result(
                "web_search", "No matching profile found in search results"
            )

        except Exception as e:
            logger.error("Web search error for %s: %s", name, e)
            return self._empty_result("web_search", f"Search error: {str(e)}")

    async def find_linkedin_profile(
        self,
        name: str,
        company: str,
        role: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict:
        """
        Find LinkedIn profile for a person.

        Primary strategy: Crawl source URL to find LinkedIn links.
        Fallback strategy: Web search, when enabled.

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role/title (optional)
            source_url: URL where person info was found (optional)

        Returns:
            Dict with:
            - linkedin_url: Found LinkedIn URL or None
            - confidence: Confidence score (0-100)
            - method: Method used to find the profile
            - notes: Additional information
        """
        cache_key = f"{name}|{company}"

        if self.use_cache and cache_key in self.profile_cache:
            logger.debug("Using cached result for %s", name)
            return self.profile_cache[cache_key]

        result = self._empty_result("none", "")

        # Primary strategy: Crawl source URL
        if source_url:
            result = await self.find_linkedin_from_source(name, source_url, role)

        # Fallback strategy: Web search (if enabled and no result from source crawl)
        if not result["linkedin_url"] and self.use_llm_search:
            search_result = await self.find_linkedin_via_search(name, company, role)
            if search_result["linkedin_url"]:
                result = search_result

        if not result["linkedin_url"]:
            # Nothing found: report method "none" and keep the crawl's notes.
            result = self._empty_result(
                "none",
                "No source URL available"
                if not source_url
                else result.get("notes", "Not found"),
            )

        if self.use_cache:
            self.profile_cache[cache_key] = result

        return result

    async def _search_for_member(self, member: Dict, db_callback=None) -> Optional[Dict]:
        """
        Run the web-search fallback for one member.

        Invokes db_callback(member_id, linkedin_url) when a profile is found
        and the member has an id.

        Returns:
            Member-tagged result dict, or None when the search found nothing
        """
        search_result = await self.find_linkedin_via_search(
            member["name"], member["company"], member.get("role")
        )
        if not search_result["linkedin_url"]:
            return None

        # Save to database immediately
        if db_callback and member.get("id"):
            db_callback(member["id"], search_result["linkedin_url"])

        return {
            "linkedin_url": search_result["linkedin_url"],
            "confidence": search_result["confidence"],
            "method": "web_search",
            "notes": search_result.get("notes", "Found via web search"),
            "member_id": member.get("id"),
            "member_name": member["name"],
        }

    async def batch_find_profiles(
        self, members: List[Dict], progress_callback=None, db_callback=None
    ) -> List[Dict]:
        """
        Find LinkedIn profiles for multiple members efficiently.

        Groups members by source_url to minimize crawling the same page multiple times.
        Results are returned in processing order (members grouped by URL first,
        then members without a source URL), not in input order.

        Args:
            members: List of dicts with 'name', 'company', 'role', 'source_url', 'id'
            progress_callback: Optional callback function(current, total, result)
            db_callback: Optional callback to save to database immediately when profile found
                Signature: db_callback(member_id, linkedin_url) -> bool

        Returns:
            List of results for each member
        """
        results: List[Dict] = []
        total = len(members)

        # Group members by source_url for efficient crawling
        url_groups: Dict[str, List[Dict]] = {}
        no_url_members: List[Dict] = []
        for member in members:
            url = member.get("source_url")
            if url:
                url_groups.setdefault(url, []).append(member)
            else:
                no_url_members.append(member)

        logger.info(
            "Processing %s unique source URLs for %s members", len(url_groups), total
        )
        logger.info("Members with source URLs: %s", total - len(no_url_members))
        logger.info("Members without source URLs: %s", len(no_url_members))
        if self.use_llm_search:
            logger.info("Web search fallback: ENABLED")
        else:
            logger.info("Web search fallback: DISABLED")

        processed = 0

        # Process members grouped by URL (efficient - one crawl per page)
        for url, group_members in url_groups.items():
            content = await self.crawl_page(url)

            # Extract all LinkedIn URLs from this page (HTML first, then markdown)
            linkedin_links: List[Dict[str, str]] = []
            if content:
                html = self.html_cache.get(url) or content
                linkedin_links = self.extract_linkedin_urls_from_content(html)
                if not linkedin_links:
                    linkedin_links = self.extract_linkedin_urls_from_content(content)

            for member in group_members:
                processed += 1
                result = None

                if linkedin_links:
                    best_match, best_score = self._select_best_link(
                        member["name"], member.get("role"), linkedin_links
                    )
                    if best_match and best_score >= self.MIN_MATCH_SCORE:
                        result = {
                            "linkedin_url": best_match["url"],
                            "confidence": min(best_score, 100),
                            "method": "source_crawl",
                            "notes": f"Found on {url}",
                            "member_id": member.get("id"),
                            "member_name": member["name"],
                        }
                        # Save to database immediately if callback provided
                        if db_callback and member.get("id"):
                            db_callback(member["id"], best_match["url"])

                # If no result from source crawl, try web search IMMEDIATELY
                if result is None and self.use_llm_search:
                    result = await self._search_for_member(member, db_callback)
                    # Searches inside a group would otherwise run back-to-back;
                    # honour the configured delay between requests.
                    await asyncio.sleep(self.rate_limit_delay)

                # If still no result, record as not found
                if result is None:
                    if linkedin_links:
                        notes = f"No match on {url}"
                    elif content:
                        notes = f"No LinkedIn URLs on {url}"
                    else:
                        notes = f"Failed to crawl {url}"
                    result = {
                        "linkedin_url": None,
                        "confidence": 0,
                        "method": "source_crawl" if content else "none",
                        "notes": notes,
                        "member_id": member.get("id"),
                        "member_name": member["name"],
                    }

                results.append(result)
                if progress_callback:
                    progress_callback(processed, total, result)

            # Small delay between different URLs
            await asyncio.sleep(self.rate_limit_delay)

        # Process members without source URLs - do web search immediately for each
        for member in no_url_members:
            processed += 1
            result = None

            if self.use_llm_search:
                result = await self._search_for_member(member, db_callback)

            if result is None:
                result = {
                    "linkedin_url": None,
                    "confidence": 0,
                    "method": "web_search" if self.use_llm_search else "none",
                    "notes": "No LinkedIn profile found"
                    if self.use_llm_search
                    else "No source URL available",
                    "member_id": member.get("id"),
                    "member_name": member["name"],
                }

            results.append(result)
            if progress_callback:
                progress_callback(processed, total, result)

            # Rate limit between searches
            await asyncio.sleep(self.rate_limit_delay)

        return results


def format_linkedin_url(url: str) -> str:
    """
    Normalize LinkedIn URL format.

    Strips surrounding whitespace and trailing slashes, rewrites any LinkedIn
    host (bare, "www." or country-specific, any letter case) to
    "https://www.linkedin.com", and upgrades a leading "http://" to "https://".
    Falsy input is returned unchanged.
    """
    if not url:
        return url

    url = url.strip().rstrip("/")

    # Ensure https and normalize to www
    url = _LINKEDIN_HOST_RE.sub("https://www.linkedin.com", url, count=1)
    if url.startswith("http://"):
        url = "https://" + url[len("http://"):]

    return url


# Async wrapper for sync contexts
def run_batch_scraper(
    members: List[Dict], rate_limit: float = 0.5, progress_callback=None
) -> List[Dict]:
    """
    Synchronous wrapper for batch_find_profiles.

    Uses asyncio.run, so it must not be called from a running event loop.

    Args:
        members: List of member dicts
        rate_limit: Delay between URL crawls
        progress_callback: Optional progress callback

    Returns:
        List of results
    """
    scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
    return asyncio.run(scraper.batch_find_profiles(members, progress_callback))
#!/usr/bin/env python3
# File: update_linkedin_profiles.py
"""
Update Investor Members LinkedIn Profiles Script

This script finds and updates LinkedIn profile URLs for investor members in the database.
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.

Usage:
    python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing]
                                       [--start-from N] [--rate-limit SECONDS]

Options:
    --test              Test mode: process only 10 records and don't update database
    --limit N           Process only N records (default: all)
    --skip-existing     Skip members that already have LinkedIn URLs
    --start-from N      Start from record N (for resuming)
    --rate-limit S      Delay between page crawls in seconds (default: 0.5)
"""

import argparse
import asyncio
import contextlib
import json
import os
import sys
from collections import Counter
from datetime import datetime

# Add app to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))

from db.db import get_db_session
from db.models import InvestorMember, InvestorTable
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url


def _percent(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


def progress_callback(current, total, result):
    """Print a one-line progress update (plus the URL when one was found)."""
    percent = _percent(current, total)
    status = "✓" if result["linkedin_url"] else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if result["linkedin_url"]:
        print(
            f" → {result['linkedin_url']} (confidence: {result['confidence']}%, method: {result['method']})"
        )


def create_db_callback(test_mode=False):
    """
    Create a callback function that saves LinkedIn profiles to the database immediately.
    This allows stopping and resuming without losing progress.

    Args:
        test_mode: When True the callback only prints what it would save.

    Returns:
        (db_callback, saved_count) where saved_count is a dict whose "count"
        entry is incremented on every successful (or simulated) save.
    """
    saved_count = {"count": 0}  # Use dict to allow modification in closure

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save LinkedIn URL to database immediately; True when saved."""
        if test_mode:
            print(f" [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Bound before the try so the handlers can tell whether a session exists.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
            saved_count["count"] += 1
            return True
        except Exception as e:
            # Best effort: a failed save must not abort the scraping run.
            print(f" ⚠️ DB Error for member {member_id}: {e}")
            if db is not None:
                with contextlib.suppress(Exception):
                    db.rollback()
            return False
        finally:
            if db is not None:
                with contextlib.suppress(Exception):
                    db.close()

    return db_callback, saved_count


def update_database(members_data, test_mode=False):
    """
    Update database with found LinkedIn profiles in a single transaction.

    Args:
        members_data: Result dicts with 'linkedin_url' and 'member_id'
        test_mode: When True nothing is written; intended updates are printed.

    Returns:
        Number of records updated (or that would have been updated).

    Raises:
        Exception: Re-raises any database error after rolling back.
    """
    db = get_db_session()

    try:
        updated_count = 0
        for data in members_data:
            if not (data["linkedin_url"] and data["member_id"]):
                continue

            if test_mode:
                print(
                    f" [TEST MODE] Would update member {data['member_id']}: {data['linkedin_url']}"
                )
                updated_count += 1
                continue

            member = db.query(InvestorMember).filter_by(id=data["member_id"]).first()
            if member:
                member.linkedin = format_linkedin_url(data["linkedin_url"])
                updated_count += 1

        if not test_mode:
            db.commit()
            print(f"\n✓ Successfully updated {updated_count} records in database")
        else:
            print(f"\n[TEST MODE] Would have updated {updated_count} records")

        return updated_count

    except Exception as e:
        db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        db.close()


def save_results(results, filename="linkedin_scraping_results.json"):
    """Save results to JSON file for backup/analysis"""
    output = {
        "timestamp": datetime.now().isoformat(),
        "total_processed": len(results),
        "found_count": sum(1 for r in results if r["linkedin_url"]),
        "results": results,
    }

    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n✓ Results saved to {filename}")


def print_summary(results):
    """Print summary statistics; safe to call with an empty result list."""
    total = len(results)
    found_results = [r for r in results if r["linkedin_url"]]
    found = len(found_results)
    not_found = total - found

    # Count by method
    methods = Counter(r["method"] for r in found_results)

    # Average confidence for found profiles
    avg_confidence = (
        sum(r["confidence"] for r in found_results) / found if found > 0 else 0
    )

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    print(f"LinkedIn found: {found} ({_percent(found, total):.1f}%)")
    print(f"Not found: {not_found} ({_percent(not_found, total):.1f}%)")
    print(f"\nAverage confidence: {avg_confidence:.1f}%")
    print("\nMethods used:")
    # most_common() orders by count, highest first.
    for method, count in methods.most_common():
        print(f" {method:20s} {count:5d} ({_percent(count, found):.1f}%)")
    print("=" * 60)


def main():
    """Parse CLI options, load members, run the scraper and report results."""
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )

    args = parser.parse_args()

    # Test mode defaults to 10 records unless an explicit limit was given
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)

    if args.test:
        print("\n⚠️ TEST MODE - No database changes will be made")

    # Load everything needed up front so the session can be closed before the
    # (long-running) scrape starts.
    db = get_db_session()

    try:
        # Build query
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        # Get total count
        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT are only repeatable over a fixed row order; without
        # this, --start-from could skip or repeat members between runs.
        query = query.order_by(InvestorMember.id)

        # Apply offset and limit
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")

        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Fetch members as plain dicts (detached from the session)
        members_data = [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]

        if not members_data:
            print("\n⚠️ No members to process")
            return

        # Count unique source URLs
        source_urls = [m["source_url"] for m in members_data if m["source_url"]]
        unique_urls = len(set(source_urls))
        with_urls = len(source_urls)

        print(f"\n✓ Loaded {len(members_data)} members")
        print(
            f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
        )
        print(f"✓ {len(members_data) - with_urls} members without source URLs")
        print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
        print("\nStarting LinkedIn profile search using crawl4ai...\n")

    finally:
        db.close()

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)

    print("ℹ️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously with real-time DB saving
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    # Print summary
    print_summary(results)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )

    print("\n✓ Done! You can resume anytime with --skip-existing")


if __name__ == "__main__":
    main()