added linkedin profiles

2025-11-27 16:44:22 +01:00
parent 100e0b2b0c
commit 495f8a0ff6
4 changed files with 1040 additions and 0 deletions
@@ -0,0 +1,730 @@
+"""
+LinkedIn Profile Scraper for Investor Members
+
+This module uses crawl4ai to scrape team pages and find LinkedIn profiles.
+Strategies:
+1. Crawl the source_url (team pages) to extract LinkedIn profile links
+2. Fall back to a DuckDuckGo web search (enabled only when OPENROUTER_API_KEY is set) to find LinkedIn profiles by name
+
+Key advantages of crawl4ai:
+- Handles JavaScript-rendered pages
+- Better at extracting content from modern websites
+- More reliable than simple requests
+"""
+
+import asyncio
+import logging
+import os
+import re
+from typing import Dict, List, Optional
+
+from crawl4ai import AsyncWebCrawler
+from ddgs import DDGS
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+# Setup logging
+# NOTE(review): basicConfig runs at import time, so importing this module
+# configures the root logger of the importing process — confirm that is intended.
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+)
+logger = logging.getLogger("linkedin_scraper")
+
+# Load environment variables (presumably from a local .env file) before the
+# API key is read below.
+load_dotenv()
+# None when the variable is unset; LinkedInProfileScraper then disables its
+# web-search fallback.
+OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
+
+
+class LinkedInProfileScraper:
+    """
+    LinkedIn profile finder using crawl4ai and a DuckDuckGo web-search fallback.
+
+    Strategies:
+    1. Crawl source URLs (team pages) to extract LinkedIn links
+    2. Fall back to a DuckDuckGo web search to find profiles by name
+    """
+
+    def __init__(
+        self,
+        rate_limit_delay: float = 0.5,
+        use_cache: bool = True,
+        use_llm_search: bool = True,
+    ):
+        """
+        Initialize the scraper.
+
+        Args:
+            rate_limit_delay: Delay between requests in seconds
+            use_cache: Whether to cache crawled pages and per-member results
+            use_llm_search: Whether to use the web-search fallback. The fallback
+                is only enabled when OPENROUTER_API_KEY is also set.
+        """
+        self.rate_limit_delay = rate_limit_delay
+        self.use_cache = use_cache
+        # Coerce to a real bool so the flag never holds the API key itself.
+        self.use_llm_search = bool(use_llm_search and OPENROUTER_API_KEY)
+        self.page_cache: Dict[str, str] = {}  # Crawled page markdown by URL
+        self.html_cache: Dict[str, str] = {}  # Crawled page HTML by URL
+        self.profile_cache: Dict[str, Dict] = {}  # Results by "name|company"
+
+        # Define every search-related attribute up front so they exist on all
+        # code paths, including when _init_llm_agent() fails part-way through.
+        self.llm = None
+        self.agent = None
+        self.ddg_search = None
+
+        if self.use_llm_search:
+            self._init_llm_agent()
+        elif not use_llm_search:
+            # The caller opted out; do not blame a missing API key.
+            logger.info("LLM search disabled (use_llm_search=False)")
+        else:
+            logger.info("LLM search disabled (no OPENROUTER_API_KEY)")
+
+    def _init_llm_agent(self):
+        """
+        Initialize the OpenRouter chat model and the DuckDuckGo search client.
+
+        Any failure is logged and leaves web search disabled instead of raising,
+        so the scraper can still be used for source-page crawling.
+        """
+        # Nothing in this method builds an agent; the attribute is still defined
+        # so it exists whether or not search is enabled.
+        self.agent = None
+        try:
+            self.llm = ChatOpenAI(
+                api_key=OPENROUTER_API_KEY,
+                base_url="https://openrouter.ai/api/v1",
+                model="x-ai/grok-4.1-fast:free",
+                temperature=0,
+            )
+            self.ddg_search = DDGS()
+            logger.info("LLM search agent initialized")
+        except Exception as e:
+            logger.error(f"Failed to initialize LLM agent: {e}")
+            self.llm = None
+            self.ddg_search = None
+            # Keep the flag in sync with the missing client so callers do not
+            # report (or attempt) a fallback that cannot run.
+            self.use_llm_search = False
+
+    def web_search(self, query: str) -> List[Dict]:
+        """
+        Run a DuckDuckGo text search for the given query.
+
+        Returns:
+            Up to 10 result dicts, or an empty list when no search client is
+            configured or the search raises.
+        """
+        client = self.ddg_search
+        if not client:
+            return []
+
+        try:
+            hits = client.text(query, max_results=10)
+            return list(hits)
+        except Exception as e:
+            logger.error(f"Web search error: {e}")
+
+        return []
+
+    async def crawl_page(self, url: str) -> Optional[str]:
+        """
+        Fetch a page with crawl4ai and return its markdown rendering.
+
+        When caching is enabled, the markdown is stored in page_cache and the
+        raw HTML in html_cache, both keyed by URL, and later calls for the same
+        URL are served from page_cache.
+
+        Args:
+            url: URL to crawl
+
+        Returns:
+            Page content as markdown, or None if the URL is empty, the crawl
+            produced no markdown, or an error occurred
+        """
+        if not url:
+            return None
+
+        caching = self.use_cache
+        if caching and url in self.page_cache:
+            logger.debug(f"Using cached page for {url}")
+            return self.page_cache[url]
+
+        try:
+            logger.info(f"Crawling: {url}")
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(url)
+
+                if not result or not result.markdown:
+                    return None
+
+                markdown = result.markdown
+                # HTML is kept alongside the markdown for link extraction.
+                html_content = getattr(result, "html", "")
+
+                if caching:
+                    self.page_cache[url] = markdown
+                    self.html_cache[url] = html_content
+
+                return markdown
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {e}")
+
+        return None
+
+    def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
+        """
+        Collect the distinct LinkedIn profile URLs found in HTML or markdown.
+
+        Each URL is normalized; when the same normalized URL occurs more than
+        once, only its first occurrence (and that occurrence's context) is kept.
+
+        Returns:
+            List of dicts with 'url', 'context' (up to 200 characters either
+            side of the match, tags stripped, whitespace collapsed) and
+            'username', in order of first appearance
+        """
+        # Profile URLs, including country-specific subdomains
+        profile_re = re.compile(
+            r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?",
+            re.IGNORECASE,
+        )
+        window = 200
+
+        # Insertion-ordered, so first occurrences keep their position.
+        found: Dict[str, Dict[str, str]] = {}
+
+        for hit in profile_re.finditer(content):
+            profile_url = self._normalize_linkedin_url(hit.group(0).rstrip("/"))
+            if profile_url in found:
+                continue
+
+            lo = max(0, hit.start() - window)
+            hi = min(len(content), hit.end() + window)
+
+            # Drop HTML tags, then collapse runs of whitespace.
+            snippet = re.sub(r"<[^>]+>", " ", content[lo:hi])
+            snippet = " ".join(snippet.split())
+
+            found[profile_url] = {
+                "url": profile_url,
+                "context": snippet,
+                "username": hit.group(1),
+            }
+
+        return list(found.values())
+
+    def _normalize_linkedin_url(self, url: str) -> str:
+        """
+        Normalize a LinkedIn URL to the https://www.linkedin.com/... form.
+
+        Trailing slashes are removed, and any LinkedIn host (bare domain, www,
+        or a 2-3 letter country subdomain, in any letter case) is rewritten to
+        https://www.linkedin.com. Other URLs only have an http:// scheme
+        upgraded to https://.
+        """
+        # Remove trailing slashes
+        url = url.rstrip("/")
+
+        # The subdomain is optional so that bare "linkedin.com" links end up
+        # identical to their "www." form; otherwise de-duplication by URL
+        # treats them as different profiles.
+        url = re.sub(
+            r"https?://(?:[a-z]{2,3}\.)?linkedin\.com",
+            "https://www.linkedin.com",
+            url,
+            flags=re.IGNORECASE,
+        )
+
+        # Upgrade only the scheme, not "http://" text embedded later in the URL.
+        if url.lower().startswith("http://"):
+            url = "https://" + url[len("http://"):]
+
+        return url
+
+    def _name_matches_context(self, name: str, context: str) -> float:
+        """
+        Score how strongly the text around a LinkedIn URL mentions a person.
+
+        Matching is case-insensitive substring matching. The name is split on
+        whitespace and dots; parts of one character are dropped, and only parts
+        longer than two characters can count as a match.
+
+        Returns:
+            Confidence score 0-100: 95 for the full name verbatim, 90 when
+            every part matches, 75 for two or more parts, 50 or 35 for a
+            single part (50 when the name has at most two parts), else 0
+        """
+        if not (name and context):
+            return 0
+
+        haystack = context.lower()
+        needle = name.lower()
+
+        # Verbatim full-name hit is the strongest signal.
+        if needle in haystack:
+            return 95
+
+        tokens = [tok for tok in needle.replace(".", " ").split() if len(tok) > 1]
+        if not tokens:
+            return 0
+
+        hits = len([tok for tok in tokens if len(tok) > 2 and tok in haystack])
+
+        if hits == len(tokens):
+            return 90  # Every part of the name was found
+        if hits >= 2:
+            return 75  # Typically first + last name
+        if hits == 1:
+            # A single hit is more convincing for a short name.
+            return 50 if len(tokens) <= 2 else 35
+
+        return 0
+
+    def _name_matches_username(self, name: str, username: str) -> float:
+        """
+        Score how well a LinkedIn username (URL slug) reflects a person's name.
+
+        Hyphens and underscores in the username are treated as spaces. Name
+        parts of up to two characters are ignored; the remaining parts are
+        matched as case-insensitive substrings.
+
+        Returns:
+            Confidence score 0-100: 85 when all of at least two parts match,
+            70 when two or more match, 35 for exactly one match, else 0
+        """
+        if not (name and username):
+            return 0
+
+        slug = username.lower().replace("-", " ").replace("_", " ")
+        tokens = [tok for tok in name.lower().replace(".", " ").split() if len(tok) > 2]
+        if not tokens:
+            return 0
+
+        hits = len([tok for tok in tokens if tok in slug])
+
+        if hits == len(tokens) and len(tokens) >= 2:
+            return 85  # Full name in username
+        if hits >= 2:
+            return 70  # Multiple parts match
+        if hits == 1:
+            return 35  # Only one part matches
+
+        return 0
+
+    async def find_linkedin_from_source(
+        self, name: str, source_url: str, role: Optional[str] = None
+    ) -> Dict:
+        """
+        Look for a person's LinkedIn profile among the links on a team page.
+
+        The page is crawled, every LinkedIn profile link on it is scored
+        against the name (by surrounding text and by username, whichever is
+        higher, plus 10 when the role appears in the surrounding text), and the
+        first highest-scoring link is returned if it scores at least 30.
+
+        Args:
+            name: Person's name
+            source_url: URL of the team/about page
+            role: Person's role (for additional context matching)
+
+        Returns:
+            Dict with linkedin_url, confidence, method, notes
+        """
+
+        def _miss(notes: str) -> Dict:
+            # Shared shape for every "nothing found" outcome of this strategy.
+            return {
+                "linkedin_url": None,
+                "confidence": 0,
+                "method": "source_crawl",
+                "notes": notes,
+            }
+
+        if not source_url:
+            return _miss("No source URL provided")
+
+        content = await self.crawl_page(source_url)
+        if not content:
+            return _miss(f"Failed to crawl {source_url}")
+
+        # Prefer the cached HTML; fall back to the markdown when it has no links.
+        html = self.html_cache.get(source_url, content)
+        linkedin_links = self.extract_linkedin_urls_from_content(html)
+        if not linkedin_links:
+            linkedin_links = self.extract_linkedin_urls_from_content(content)
+        if not linkedin_links:
+            return _miss(f"No LinkedIn URLs found on {source_url}")
+
+        role_lower = role.lower() if role else None
+        winner = None
+        top_score = 0
+
+        for link in linkedin_links:
+            context_score = self._name_matches_context(name, link["context"])
+            username_score = self._name_matches_username(name, link["username"])
+            role_bonus = (
+                10 if role_lower and role_lower in link["context"].lower() else 0
+            )
+            total_score = max(context_score, username_score) + role_bonus
+
+            logger.debug(
+                f"  {name} -> {link['url']}: context={context_score}, username={username_score}, role={role_bonus}, total={total_score}"
+            )
+
+            # Strict comparison keeps the earliest link on ties.
+            if total_score > top_score:
+                top_score, winner = total_score, link
+
+        if winner and top_score >= 30:  # Minimum threshold
+            return {
+                "linkedin_url": winner["url"],
+                "confidence": min(top_score, 100),
+                "method": "source_crawl",
+                "notes": f"Found on {source_url}",
+            }
+
+        return _miss(f'No matching LinkedIn profile found for "{name}" on {source_url}')
+
+    async def find_linkedin_via_search(
+        self, name: str, company: str, role: Optional[str] = None
+    ) -> Dict:
+        """
+        Find LinkedIn profile using web search.
+
+        Every search result that points at a LinkedIn profile is scored against
+        the name (by result title/body and by username, whichever is higher).
+        The best-scoring profile is returned if it scores at least 30; ties go
+        to the earlier search result.
+
+        Args:
+            name: Person's name
+            company: Company/investor name
+            role: Person's role (optional)
+
+        Returns:
+            Dict with linkedin_url, confidence (capped at 90), method, notes
+        """
+        if not self.ddg_search:
+            return {
+                "linkedin_url": None,
+                "confidence": 0,
+                "method": "web_search",
+                "notes": "Web search not available",
+            }
+
+        try:
+            # Build search query - search for LinkedIn profile
+            query = f"{name} {company} site:linkedin.com/in"
+            if role:
+                query = f"{name} {role} {company} site:linkedin.com/in"
+
+            logger.debug(f"Searching: {query}")
+            results = self.web_search(query)
+
+            linkedin_pattern = r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)"
+
+            best_url = None
+            best_score = 0
+
+            for result in results or []:
+                url = result.get("href") or result.get("link") or ""
+                match = re.search(linkedin_pattern, url, re.IGNORECASE)
+                if not match:
+                    continue
+
+                # Search backends may return None for missing fields; treat
+                # that as empty text rather than failing the whole search.
+                title = result.get("title") or ""
+                body = result.get("body") or ""
+
+                context_score = self._name_matches_context(name, f"{title} {body}")
+                username_score = self._name_matches_username(name, match.group(1))
+                total_score = max(context_score, username_score)
+
+                # Keep scanning: a later result may match the name far better
+                # than the first one that merely clears the threshold.
+                if total_score > best_score:
+                    best_score = total_score
+                    best_url = self._normalize_linkedin_url(match.group(0))
+
+            if best_url and best_score >= 30:
+                return {
+                    "linkedin_url": best_url,
+                    "confidence": min(best_score, 90),  # Cap at 90 for search results
+                    "method": "web_search",
+                    "notes": "Found via web search",
+                }
+
+            return {
+                "linkedin_url": None,
+                "confidence": 0,
+                "method": "web_search",
+                "notes": "No matching profile found in search results",
+            }
+
+        except Exception as e:
+            logger.error(f"Web search error for {name}: {e}")
+            return {
+                "linkedin_url": None,
+                "confidence": 0,
+                "method": "web_search",
+                "notes": f"Search error: {str(e)}",
+            }
+
+    async def find_linkedin_profile(
+        self,
+        name: str,
+        company: str,
+        role: Optional[str] = None,
+        source_url: Optional[str] = None,
+    ) -> Dict:
+        """
+        Find LinkedIn profile for a person.
+
+        The source URL is crawled first (when given). If that yields nothing
+        and web search is enabled, a web search is tried. With caching enabled,
+        the outcome, found or not, is stored under "name|company" and reused
+        on later calls.
+
+        Args:
+            name: Person's name
+            company: Company/investor name
+            role: Person's role/title (optional)
+            source_url: URL where person info was found (optional)
+
+        Returns:
+            Dict with:
+                - linkedin_url: Found LinkedIn URL or None
+                - confidence: Confidence score (0-100)
+                - method: Method used to find the profile ("none" on a miss)
+                - notes: Additional information
+        """
+        cache_key = f"{name}|{company}"
+        caching = self.use_cache
+
+        if caching and cache_key in self.profile_cache:
+            logger.debug(f"Using cached result for {name}")
+            return self.profile_cache[cache_key]
+
+        def _remember(outcome: Dict) -> Dict:
+            # Store the outcome when caching is on, then hand it back.
+            if caching:
+                self.profile_cache[cache_key] = outcome
+            return outcome
+
+        # Primary strategy: Crawl source URL
+        crawl_notes = ""
+        if source_url:
+            crawled = await self.find_linkedin_from_source(name, source_url, role)
+            if crawled["linkedin_url"]:
+                return _remember(crawled)
+            crawl_notes = crawled.get("notes", "Not found")
+
+        # Fallback strategy: Web search
+        if self.use_llm_search:
+            searched = await self.find_linkedin_via_search(name, company, role)
+            if searched["linkedin_url"]:
+                return _remember(searched)
+
+        # Nothing found: report why the crawl missed, or that it could not run.
+        if source_url:
+            miss_notes = crawl_notes
+        else:
+            miss_notes = "No source URL available"
+
+        return _remember(
+            {
+                "linkedin_url": None,
+                "confidence": 0,
+                "method": "none",
+                "notes": miss_notes,
+            }
+        )
+
+    async def batch_find_profiles(
+        self, members: List[Dict], progress_callback=None, db_callback=None
+    ) -> List[Dict]:
+        """
+        Find LinkedIn profiles for multiple members efficiently.
+
+        Members are grouped by source_url so each page is crawled only once.
+        Members with no match on their page, and members without a source_url,
+        fall back to web search when it is enabled.
+
+        Args:
+            members: List of dicts with 'name' (required) and optional
+                     'company', 'role', 'source_url', 'id'
+            progress_callback: Optional callback function(current, total, result)
+            db_callback: Optional callback to save to database immediately when profile found
+                         Signature: db_callback(member_id, linkedin_url) -> bool
+
+        Returns:
+            One result dict per member with linkedin_url, confidence, method,
+            notes, member_id and member_name. Results are ordered by source-URL
+            group (first-seen order), followed by members without a source URL,
+            which is not necessarily the input order.
+        """
+        results: List[Dict] = []
+        total = len(members)
+
+        # Group members by source_url for efficient crawling
+        url_groups: Dict[str, List[Dict]] = {}
+        no_url_members: List[Dict] = []
+
+        for member in members:
+            url = member.get("source_url")
+            if url:
+                url_groups.setdefault(url, []).append(member)
+            else:
+                no_url_members.append(member)
+
+        logger.info(
+            f"Processing {len(url_groups)} unique source URLs for {total} members"
+        )
+        logger.info(f"Members with source URLs: {total - len(no_url_members)}")
+        logger.info(f"Members without source URLs: {len(no_url_members)}")
+        if self.use_llm_search:
+            logger.info("Web search fallback: ENABLED")
+        else:
+            logger.info("Web search fallback: DISABLED")
+
+        processed = 0
+
+        # Process members grouped by URL (efficient - one crawl per page)
+        for url, group_members in url_groups.items():
+            content = await self.crawl_page(url)
+
+            linkedin_links: List[Dict[str, str]] = []
+            if content:
+                # Prefer cached HTML; an empty or missing entry falls back to
+                # the markdown so extraction never receives None.
+                html = self.html_cache.get(url) or content
+                linkedin_links = self.extract_linkedin_urls_from_content(html)
+                if not linkedin_links:
+                    linkedin_links = self.extract_linkedin_urls_from_content(content)
+
+            # The reason for a miss is the same for every member of the group.
+            if linkedin_links:
+                miss_notes = f"No match on {url}"
+            elif content:
+                miss_notes = f"No LinkedIn URLs on {url}"
+            else:
+                miss_notes = f"Failed to crawl {url}"
+            miss_method = "source_crawl" if content else "none"
+
+            for member in group_members:
+                processed += 1
+                result = None
+
+                best_match, best_score = self._best_link_for_member(
+                    member, linkedin_links
+                )
+                if best_match and best_score >= 30:
+                    result = self._member_result(
+                        member,
+                        best_match["url"],
+                        min(best_score, 100),
+                        "source_crawl",
+                        f"Found on {url}",
+                    )
+                    # Save to database immediately if callback provided
+                    self._save_profile(db_callback, member, best_match["url"])
+                elif self.use_llm_search:
+                    result = await self._search_member(member, db_callback)
+                    # Pause after each search so back-to-back fallbacks inside
+                    # one group respect the configured rate limit.
+                    await asyncio.sleep(self.rate_limit_delay)
+
+                if result is None:
+                    result = self._member_result(
+                        member, None, 0, miss_method, miss_notes
+                    )
+
+                results.append(result)
+
+                if progress_callback:
+                    progress_callback(processed, total, result)
+
+            # Small delay between different URLs
+            await asyncio.sleep(self.rate_limit_delay)
+
+        # Process members without source URLs - web search is the only option
+        for member in no_url_members:
+            processed += 1
+            result = None
+
+            if self.use_llm_search:
+                result = await self._search_member(member, db_callback)
+
+            if result is None:
+                result = self._member_result(
+                    member,
+                    None,
+                    0,
+                    "web_search" if self.use_llm_search else "none",
+                    "No LinkedIn profile found"
+                    if self.use_llm_search
+                    else "No source URL available",
+                )
+
+            results.append(result)
+
+            if progress_callback:
+                progress_callback(processed, total, result)
+
+            # Rate limit between searches; nothing to throttle when disabled
+            if self.use_llm_search:
+                await asyncio.sleep(self.rate_limit_delay)
+
+        return results
+
+    def _best_link_for_member(self, member: Dict, linkedin_links: List[Dict[str, str]]):
+        """Return (link, score) for the first highest-scoring link, or (None, 0)."""
+        role = member.get("role")
+        role_lower = role.lower() if role else None
+        best_match = None
+        best_score = 0
+
+        for link in linkedin_links:
+            context_score = self._name_matches_context(member["name"], link["context"])
+            username_score = self._name_matches_username(
+                member["name"], link["username"]
+            )
+            role_bonus = (
+                10 if role_lower and role_lower in link["context"].lower() else 0
+            )
+            total_score = max(context_score, username_score) + role_bonus
+
+            if total_score > best_score:
+                best_score = total_score
+                best_match = link
+
+        return best_match, best_score
+
+    async def _search_member(self, member: Dict, db_callback=None) -> Optional[Dict]:
+        """Web-search one member; return a result dict on a hit, else None."""
+        search_result = await self.find_linkedin_via_search(
+            member["name"],
+            # 'company' is optional like the other member fields; an empty
+            # string keeps the search query free of a literal "None".
+            member.get("company") or "",
+            member.get("role"),
+        )
+        linkedin_url = search_result["linkedin_url"]
+        if not linkedin_url:
+            return None
+
+        # Save to database immediately
+        self._save_profile(db_callback, member, linkedin_url)
+        return self._member_result(
+            member,
+            linkedin_url,
+            search_result["confidence"],
+            "web_search",
+            search_result.get("notes", "Found via web search"),
+        )
+
+    @staticmethod
+    def _save_profile(db_callback, member: Dict, linkedin_url: str) -> None:
+        """Call db_callback(member_id, linkedin_url) for members that have an id."""
+        if db_callback and member.get("id"):
+            db_callback(member["id"], linkedin_url)
+
+    @staticmethod
+    def _member_result(
+        member: Dict, linkedin_url: Optional[str], confidence, method: str, notes: str
+    ) -> Dict:
+        """Build the per-member result dict returned by batch_find_profiles."""
+        return {
+            "linkedin_url": linkedin_url,
+            "confidence": confidence,
+            "method": method,
+            "notes": notes,
+            "member_id": member.get("id"),
+            "member_name": member["name"],
+        }
+
+
+def format_linkedin_url(url: str) -> str:
+    """
+    Normalize a LinkedIn URL to the https://www.linkedin.com/... form.
+
+    Falsy input (None or "") is returned unchanged. Trailing slashes are
+    removed, and any LinkedIn host (bare domain, www, or a 2-3 letter country
+    subdomain, in any letter case) is rewritten to https://www.linkedin.com.
+    Other URLs only have an http:// scheme upgraded to https://.
+    """
+    if not url:
+        return url
+
+    # Remove trailing slashes
+    url = url.rstrip("/")
+
+    # The subdomain is optional so bare "linkedin.com" links normalize to the
+    # same string as their "www." form.
+    url = re.sub(
+        r"https?://(?:[a-z]{2,3}\.)?linkedin\.com",
+        "https://www.linkedin.com",
+        url,
+        flags=re.IGNORECASE,
+    )
+
+    # Upgrade only the scheme, not "http://" text embedded later in the URL.
+    if url.lower().startswith("http://"):
+        url = "https://" + url[len("http://"):]
+
+    return url
+
+
+# Async wrapper for sync contexts
+def run_batch_scraper(
+    members: List[Dict],
+    rate_limit: float = 0.5,
+    progress_callback=None,
+    db_callback=None,
+) -> List[Dict]:
+    """
+    Synchronous wrapper for batch_find_profiles.
+
+    Builds a fresh LinkedInProfileScraper and drives the batch with
+    asyncio.run(), so it must be called from code that is not already running
+    inside an event loop.
+
+    Args:
+        members: List of member dicts
+        rate_limit: Delay between URL crawls
+        progress_callback: Optional progress callback
+        db_callback: Optional callback(member_id, linkedin_url) invoked as soon
+            as a profile is found
+
+    Returns:
+        List of results
+    """
+    scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
+    return asyncio.run(
+        scraper.batch_find_profiles(
+            members, progress_callback=progress_callback, db_callback=db_callback
+        )
+    )