"""
LinkedIn Profile Scraper for Investor Members

This module uses crawl4ai to scrape team pages and find LinkedIn profiles.

Strategies:
1. Crawl the source_url (team pages) to extract LinkedIn profile links
2. Use a DuckDuckGo web search to find LinkedIn profiles by name

Key advantages of crawl4ai:
- Handles JavaScript-rendered pages
- Better at extracting content from modern websites
- More reliable than simple requests
"""

import asyncio
import logging
import os
import re
from typing import Dict, List, Optional, Tuple

from crawl4ai import AsyncWebCrawler
from ddgs import DDGS
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Setup logging
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("linkedin_scraper")

load_dotenv()

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# LinkedIn personal-profile URL. The optional 2-3 letter label covers
# country-specific hosts such as "uk.linkedin.com". Group 1 is the username.
_LINKEDIN_PROFILE_RE = re.compile(
    r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?",
    re.IGNORECASE,
)

# Scheme + host part of a LinkedIn URL, with or without a "www."/country label.
_LINKEDIN_HOST_RE = re.compile(
    r"https?://(?:[a-z]{2,3}\.)?linkedin\.com", re.IGNORECASE
)

# A candidate link must reach this score to be accepted as a match.
_MIN_MATCH_SCORE = 30

# Added to a link's score when the member's role appears near the link.
_ROLE_BONUS = 10

# Web-search hits are never reported with more confidence than this.
_SEARCH_CONFIDENCE_CAP = 90


class LinkedInProfileScraper:
    """
    LinkedIn profile finder using crawl4ai and web search.

    Strategies:
    1. Crawl source URLs (team pages) to extract LinkedIn links
    2. Use a DuckDuckGo web search to find profiles by name
    """

    def __init__(
        self,
        rate_limit_delay: float = 0.5,
        use_cache: bool = True,
        use_llm_search: bool = True,
    ):
        """
        Initialize the scraper

        Args:
            rate_limit_delay: Delay between requests in seconds
            use_cache: Whether to cache crawled pages
            use_llm_search: Whether to use web search as fallback. It is only
                enabled when OPENROUTER_API_KEY is also set.
        """
        self.rate_limit_delay = rate_limit_delay
        self.use_cache = use_cache
        # Coerce to bool so the flag never holds the API key string itself.
        self.use_llm_search = bool(use_llm_search and OPENROUTER_API_KEY)

        self.page_cache: Dict[str, str] = {}  # Cache crawled pages by URL
        self.html_cache: Dict[str, str] = {}  # Cache HTML separately
        self.profile_cache: Dict[str, Dict] = {}  # Cache results by member

        # Always define these so attribute access is safe on every path.
        self.llm = None
        self.agent = None
        self.ddg_search = None

        # Initialize LLM agent if API key available
        if self.use_llm_search:
            self._init_llm_agent()
        elif use_llm_search:
            logger.info("LLM search disabled (no OPENROUTER_API_KEY)")
        else:
            logger.info("LLM search disabled (use_llm_search=False)")

    def _init_llm_agent(self):
        """Initialize the LLM client and the DuckDuckGo search client.

        On any failure the error is logged and both clients are reset to
        None, which makes the search methods report "not available".
        """
        try:
            self.llm = ChatOpenAI(
                api_key=OPENROUTER_API_KEY,
                base_url="https://openrouter.ai/api/v1",
                model="x-ai/grok-4.1-fast:free",
                temperature=0,
            )
            self.ddg_search = DDGS()
            logger.info("LLM search agent initialized")
        except Exception as e:
            logger.error(f"Failed to initialize LLM agent: {e}")
            self.llm = None
            self.ddg_search = None

    def web_search(self, query: str) -> List[Dict]:
        """Search the web using DuckDuckGo.

        Args:
            query: Search query string

        Returns:
            Up to 10 result dicts, or an empty list when search is
            unavailable or the search call raises.
        """
        if not self.ddg_search:
            return []

        try:
            return list(self.ddg_search.text(query, max_results=10))
        except Exception as e:
            logger.error(f"Web search error: {e}")
            return []

    async def crawl_page(self, url: str) -> Optional[str]:
        """
        Crawl a webpage and return its content.

        When caching is enabled, the markdown is stored in ``page_cache`` and
        the raw HTML in ``html_cache`` under the same URL.

        Args:
            url: URL to crawl

        Returns:
            Page content as markdown/text, or None if failed
        """
        if not url:
            return None

        # Check cache first
        if self.use_cache and url in self.page_cache:
            logger.debug(f"Using cached page for {url}")
            return self.page_cache[url]

        try:
            logger.info(f"Crawling: {url}")
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url)

            if result and result.markdown:
                content = result.markdown
                # Also keep the HTML for better link extraction. Coerce a
                # missing/None attribute to "" so the cache only holds strings.
                html_content = getattr(result, "html", "") or ""

                # Cache the results
                if self.use_cache:
                    self.page_cache[url] = content
                    self.html_cache[url] = html_content

                return content
        except Exception as e:
            logger.error(f"Error crawling {url}: {e}")

        return None

    def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
        """
        Extract all LinkedIn profile URLs from content (HTML or markdown).

        Args:
            content: Page text to scan; empty or None yields an empty list

        Returns:
            List of dicts with 'url', 'context', and 'username', de-duplicated
            by normalized URL with first-occurrence order preserved
        """
        if not content:
            return []

        unique_links: List[Dict[str, str]] = []
        seen_urls = set()

        for match in _LINKEDIN_PROFILE_RE.finditer(content):
            url = self._normalize_linkedin_url(match.group(0))
            if url in seen_urls:
                continue
            seen_urls.add(url)

            # Get surrounding context (200 chars before and after)
            start = max(0, match.start() - 200)
            end = min(len(content), match.end() + 200)
            context = content[start:end]

            # Clean up context (remove HTML tags for readability)
            context = re.sub(r"<[^>]+>", " ", context)
            context = " ".join(context.split())  # Normalize whitespace

            unique_links.append(
                {"url": url, "context": context, "username": match.group(1)}
            )

        return unique_links

    def _normalize_linkedin_url(self, url: str) -> str:
        """Normalize LinkedIn URL to standard format (see format_linkedin_url)."""
        return format_linkedin_url(url)

    def _name_matches_context(self, name: str, context: str) -> float:
        """
        Check if a person's name appears in the context around a LinkedIn URL.

        Matching is by case-insensitive substring. Name parts of two
        characters are kept in the part list but never counted as a match,
        so a name containing one cannot reach the "all parts found" score
        except through the full-name check.

        Returns:
            Confidence score 0-100
        """
        if not name or not context:
            return 0

        context_lower = context.lower()
        name_lower = name.lower()

        # Check for full name match
        if name_lower in context_lower:
            return 95

        # Split name into parts (handle multiple spaces, titles like "Dr.", etc.)
        name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 1]
        if not name_parts:
            return 0

        matches = sum(
            1 for part in name_parts if len(part) > 2 and part in context_lower
        )

        if matches == len(name_parts):
            return 90  # All name parts found
        if matches >= 2:
            return 75  # At least 2 parts found (first + last typically)
        if matches == 1:
            # One part found: worth more when the name is short
            return 50 if len(name_parts) <= 2 else 35
        return 0

    def _name_matches_username(self, name: str, username: str) -> float:
        """
        Check if LinkedIn username contains parts of the name.

        Hyphens and underscores in the username are treated as spaces; name
        parts of two characters or fewer are ignored.

        Returns:
            Confidence score 0-100
        """
        if not name or not username:
            return 0

        username_lower = username.lower().replace("-", " ").replace("_", " ")
        name_parts = [p for p in name.lower().replace(".", " ").split() if len(p) > 2]
        if not name_parts:
            return 0

        matches = sum(1 for part in name_parts if part in username_lower)

        if matches == len(name_parts) and len(name_parts) >= 2:
            return 85  # Full name in username
        if matches >= 2:
            return 70  # Multiple parts match
        if matches == 1:
            return 35  # Only one part matches
        return 0

    def _pick_best_link(
        self, name: str, role: Optional[str], links: List[Dict[str, str]]
    ) -> Tuple[Optional[Dict[str, str]], float]:
        """Return the highest-scoring link for ``name`` and its score.

        A link's score is the better of its context and username scores, plus
        a role bonus when ``role`` appears in the link's context. Ties keep
        the earliest link. Returns ``(None, 0)`` when nothing scores above 0.
        """
        role_lower = role.lower() if role else None
        best_match = None
        best_score = 0

        for link in links:
            context_score = self._name_matches_context(name, link["context"])
            username_score = self._name_matches_username(name, link["username"])
            role_bonus = (
                _ROLE_BONUS
                if role_lower and role_lower in link["context"].lower()
                else 0
            )
            total_score = max(context_score, username_score) + role_bonus

            logger.debug(
                f" {name} -> {link['url']}: context={context_score}, username={username_score}, role={role_bonus}, total={total_score}"
            )

            if total_score > best_score:
                best_score = total_score
                best_match = link

        return best_match, best_score

    def _links_for_page(self, url: str, content: str) -> List[Dict[str, str]]:
        """Extract LinkedIn links for a crawled page.

        Prefers the cached HTML and falls back to the markdown ``content``
        when no HTML is cached or the HTML yields no links.
        """
        html = self.html_cache.get(url) or content
        links = self.extract_linkedin_urls_from_content(html)
        if not links:
            links = self.extract_linkedin_urls_from_content(content)
        return links

    async def find_linkedin_from_source(
        self, name: str, source_url: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile by crawling the source URL (team page).

        Args:
            name: Person's name
            source_url: URL of the team/about page
            role: Person's role (for additional context matching)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """

        def not_found(notes: str) -> Dict:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "source_crawl",
                "notes": notes,
            }

        if not source_url:
            return not_found("No source URL provided")

        # Crawl the page
        content = await self.crawl_page(source_url)
        if not content:
            return not_found(f"Failed to crawl {source_url}")

        linkedin_links = self._links_for_page(source_url, content)
        if not linkedin_links:
            return not_found(f"No LinkedIn URLs found on {source_url}")

        # Score each LinkedIn URL based on name matching
        best_match, best_score = self._pick_best_link(name, role, linkedin_links)

        if best_match and best_score >= _MIN_MATCH_SCORE:
            return {
                "linkedin_url": best_match["url"],
                "confidence": min(best_score, 100),
                "method": "source_crawl",
                "notes": f"Found on {source_url}",
            }

        return not_found(
            f'No matching LinkedIn profile found for "{name}" on {source_url}'
        )

    async def find_linkedin_via_search(
        self, name: str, company: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile using web search.

        All search results are scored and the best-scoring profile is
        returned (ties keep the earliest result), provided it reaches the
        minimum match threshold.

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role (optional)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """

        def not_found(notes: str) -> Dict:
            return {
                "linkedin_url": None,
                "confidence": 0,
                "method": "web_search",
                "notes": notes,
            }

        if not self.ddg_search:
            return not_found("Web search not available")

        try:
            # Build search query - search for LinkedIn profile
            if role:
                query = f"{name} {role} {company} site:linkedin.com/in"
            else:
                query = f"{name} {company} site:linkedin.com/in"

            logger.debug(f"Searching: {query}")
            results = self.web_search(query)

            best_url = None
            best_score = 0
            for item in results:
                url = item.get("href") or item.get("link") or ""
                match = _LINKEDIN_PROFILE_RE.search(url)
                if not match:
                    continue

                # Score based on name matching in title/body and username
                title = (item.get("title") or "").lower()
                body = (item.get("body") or "").lower()
                context_score = self._name_matches_context(name, f"{title} {body}")
                username_score = self._name_matches_username(name, match.group(1))
                total_score = max(context_score, username_score)

                if total_score > best_score:
                    best_score = total_score
                    best_url = self._normalize_linkedin_url(match.group(0))

            if best_url and best_score >= _MIN_MATCH_SCORE:
                return {
                    "linkedin_url": best_url,
                    # Cap at 90 for search results
                    "confidence": min(best_score, _SEARCH_CONFIDENCE_CAP),
                    "method": "web_search",
                    "notes": "Found via web search",
                }

            return not_found("No matching profile found in search results")

        except Exception as e:
            logger.error(f"Web search error for {name}: {e}")
            return not_found(f"Search error: {str(e)}")

    async def find_linkedin_profile(
        self,
        name: str,
        company: str,
        role: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict:
        """
        Find LinkedIn profile for a person.

        Primary strategy: Crawl source URL to find LinkedIn links.
        Fallback strategy: Web search, when enabled.

        Results (including "not found") are cached per ``name|company`` when
        caching is enabled.

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role/title (optional)
            source_url: URL where person info was found (optional)

        Returns:
            Dict with:
            - linkedin_url: Found LinkedIn URL or None
            - confidence: Confidence score (0-100)
            - method: Method used to find the profile
            - notes: Additional information
        """
        cache_key = f"{name}|{company}"

        # Check cache
        if self.use_cache and cache_key in self.profile_cache:
            logger.debug(f"Using cached result for {name}")
            return self.profile_cache[cache_key]

        result = await self._resolve_profile(name, company, role, source_url)

        if self.use_cache:
            self.profile_cache[cache_key] = result
        return result

    async def _resolve_profile(
        self,
        name: str,
        company: str,
        role: Optional[str],
        source_url: Optional[str],
    ) -> Dict:
        """Run source crawl, then web search; return the first hit or a miss."""
        notes = "No source URL available"

        # Primary strategy: Crawl source URL
        if source_url:
            source_result = await self.find_linkedin_from_source(
                name, source_url, role
            )
            if source_result["linkedin_url"]:
                return source_result
            notes = source_result.get("notes", "Not found")

        # Fallback strategy: Web search (if enabled and no result from source crawl)
        if self.use_llm_search:
            search_result = await self.find_linkedin_via_search(name, company, role)
            if search_result["linkedin_url"]:
                return search_result

        return {
            "linkedin_url": None,
            "confidence": 0,
            "method": "none",
            "notes": notes,
        }

    @staticmethod
    def _member_result(
        member: Dict,
        linkedin_url: Optional[str],
        confidence: float,
        method: str,
        notes: str,
    ) -> Dict:
        """Build a batch result dict tagged with the member's id and name."""
        return {
            "linkedin_url": linkedin_url,
            "confidence": confidence,
            "method": method,
            "notes": notes,
            "member_id": member.get("id"),
            "member_name": member["name"],
        }

    @staticmethod
    def _save_to_db(db_callback, member: Dict, linkedin_url: str) -> None:
        """Invoke ``db_callback`` for members that carry an id (0 is valid)."""
        if db_callback and member.get("id") is not None:
            db_callback(member["id"], linkedin_url)

    async def _search_member(self, member: Dict, db_callback=None) -> Optional[Dict]:
        """Web-search one member; return a batch result dict, or None on a miss.

        Returns None without searching when web search is disabled.
        """
        if not self.use_llm_search:
            return None

        found = await self.find_linkedin_via_search(
            member["name"], member.get("company") or "", member.get("role")
        )
        if not found["linkedin_url"]:
            return None

        # Save to database immediately
        self._save_to_db(db_callback, member, found["linkedin_url"])
        return self._member_result(
            member,
            found["linkedin_url"],
            found["confidence"],
            "web_search",
            found.get("notes", "Found via web search"),
        )

    async def batch_find_profiles(
        self, members: List[Dict], progress_callback=None, db_callback=None
    ) -> List[Dict]:
        """
        Find LinkedIn profiles for multiple members efficiently.

        Groups members by source_url to minimize crawling the same page
        multiple times. Members sharing a source URL are processed first (in
        order of first appearance of each URL), followed by members without a
        source URL, so the returned list is not in input order.

        Args:
            members: List of dicts with 'name', 'company', 'role', 'source_url', 'id'
            progress_callback: Optional callback function(current, total, result)
            db_callback: Optional callback to save to database immediately when profile found
                Signature: db_callback(member_id, linkedin_url) -> bool

        Returns:
            List of results for each member
        """
        results: List[Dict] = []
        total = len(members)

        # Group members by source_url for efficient crawling
        url_groups: Dict[str, List[Dict]] = {}
        no_url_members: List[Dict] = []
        for member in members:
            url = member.get("source_url")
            if url:
                url_groups.setdefault(url, []).append(member)
            else:
                no_url_members.append(member)

        logger.info(
            f"Processing {len(url_groups)} unique source URLs for {total} members"
        )
        logger.info(f"Members with source URLs: {total - len(no_url_members)}")
        logger.info(f"Members without source URLs: {len(no_url_members)}")
        if self.use_llm_search:
            logger.info("Web search fallback: ENABLED")
        else:
            logger.info("Web search fallback: DISABLED")

        processed = 0

        # Process members grouped by URL (efficient - one crawl per page)
        for url, group_members in url_groups.items():
            # Crawl the page once
            content = await self.crawl_page(url)

            # Extract all LinkedIn URLs from this page
            linkedin_links = self._links_for_page(url, content) if content else []

            # Message used when neither the page nor the search finds a match
            if linkedin_links:
                miss_notes = f"No match on {url}"
            elif content:
                miss_notes = f"No LinkedIn URLs on {url}"
            else:
                miss_notes = f"Failed to crawl {url}"

            # Match each member in this group
            for member in group_members:
                processed += 1
                result = None

                best_match, best_score = self._pick_best_link(
                    member["name"], member.get("role"), linkedin_links
                )
                if best_match and best_score >= _MIN_MATCH_SCORE:
                    result = self._member_result(
                        member,
                        best_match["url"],
                        min(best_score, 100),
                        "source_crawl",
                        f"Found on {url}",
                    )
                    # Save to database immediately if callback provided
                    self._save_to_db(db_callback, member, best_match["url"])

                # If no result from source crawl, try web search IMMEDIATELY
                if result is None:
                    result = await self._search_member(member, db_callback)

                # If still no result, record as not found
                if result is None:
                    result = self._member_result(
                        member,
                        None,
                        0,
                        "source_crawl" if content else "none",
                        miss_notes,
                    )

                results.append(result)
                if progress_callback:
                    progress_callback(processed, total, result)

            # Small delay between different URLs
            await asyncio.sleep(self.rate_limit_delay)

        # Process members without source URLs - do web search immediately for each
        for member in no_url_members:
            processed += 1

            result = await self._search_member(member, db_callback)

            # If no result from search
            if result is None:
                result = self._member_result(
                    member,
                    None,
                    0,
                    "web_search" if self.use_llm_search else "none",
                    "No LinkedIn profile found"
                    if self.use_llm_search
                    else "No source URL available",
                )

            results.append(result)
            if progress_callback:
                progress_callback(processed, total, result)

            # Rate limit between searches
            await asyncio.sleep(self.rate_limit_delay)

        return results


def format_linkedin_url(url: str) -> str:
    """Normalize LinkedIn URL format.

    Strips trailing slashes, rewrites a bare or country-specific LinkedIn
    host (matched case-insensitively) to ``https://www.linkedin.com``, and
    upgrades an ``http://`` scheme to ``https://``.

    Args:
        url: URL to normalize; a falsy value is returned unchanged

    Returns:
        The normalized URL
    """
    if not url:
        return url

    # Remove trailing slashes
    url = url.rstrip("/")

    # Ensure https and normalize to www
    url = _LINKEDIN_HOST_RE.sub("https://www.linkedin.com", url)

    # Upgrade only the scheme, not any "http://" embedded later in the URL
    if url.startswith("http://"):
        url = "https://" + url[len("http://"):]

    return url


# Async wrapper for sync contexts
def run_batch_scraper(
    members: List[Dict],
    rate_limit: float = 0.5,
    progress_callback=None,
    db_callback=None,
) -> List[Dict]:
    """
    Synchronous wrapper for batch_find_profiles.

    Starts its own event loop with ``asyncio.run``, so it must not be called
    from code that is already running inside an event loop.

    Args:
        members: List of member dicts
        rate_limit: Delay between URL crawls
        progress_callback: Optional progress callback
        db_callback: Optional callback(member_id, linkedin_url) invoked when a
            profile is found

    Returns:
        List of results
    """
    scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
    return asyncio.run(
        scraper.batch_find_profiles(members, progress_callback, db_callback)
    )