added linkedin profiles
This commit is contained in:
@@ -0,0 +1,730 @@
|
||||
"""
|
||||
LinkedIn Profile Scraper for Investor Members
|
||||
|
||||
This module uses crawl4ai to scrape team pages and find LinkedIn profiles.
|
||||
Strategies:
|
||||
1. Crawl the source_url (team pages) to extract LinkedIn profile links
|
||||
2. Use LLM-powered web search to find LinkedIn profiles by name
|
||||
|
||||
Key advantages of crawl4ai:
|
||||
- Handles JavaScript-rendered pages
|
||||
- Better at extracting content from modern websites
|
||||
- More reliable than simple requests
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from ddgs import DDGS
|
||||
from dotenv import load_dotenv
|
||||
from langchain_openai import ChatOpenAI
|
||||
|
||||
# Setup logging. NOTE: basicConfig runs at import time and configures the
# root logger for the whole process.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("linkedin_scraper")

# Load a local .env file (if any) before reading the key, so that
# OPENROUTER_API_KEY can be supplied either there or via the environment.
load_dotenv()
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
||||
|
||||
|
||||
class LinkedInProfileScraper:
    """
    LinkedIn profile finder using crawl4ai and a web-search fallback.

    Strategies:
    1. Crawl source URLs (team pages) to extract LinkedIn links
    2. Fall back to a DuckDuckGo web search to find profiles by name

    Every lookup result is a dict with the keys ``linkedin_url``,
    ``confidence`` (0-100), ``method`` and ``notes``.
    """

    # Minimum combined score a candidate link needs before it is accepted.
    MIN_MATCH_SCORE = 30
    # Search hits are never reported with more confidence than this.
    MAX_SEARCH_CONFIDENCE = 90
    # Characters of surrounding text kept on each side of a matched URL.
    CONTEXT_CHARS = 200

    # Profile URL, optionally on a country sub-domain; group 1 is the username.
    _PROFILE_URL_RE = re.compile(
        r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com/in/([a-zA-Z0-9_-]+)/?",
        re.IGNORECASE,
    )
    # Scheme + host part of any LinkedIn URL (bare, www. or country-specific).
    _LINKEDIN_HOST_RE = re.compile(
        r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com", re.IGNORECASE
    )
    _HTML_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(
        self,
        rate_limit_delay: float = 0.5,
        use_cache: bool = True,
        use_llm_search: bool = True,
    ):
        """
        Initialize the scraper

        Args:
            rate_limit_delay: Delay between requests in seconds
            use_cache: Whether to cache crawled pages
            use_llm_search: Whether to use the web-search fallback. It is only
                enabled when OPENROUTER_API_KEY is also set.
        """
        self.rate_limit_delay = rate_limit_delay
        self.use_cache = use_cache
        # Coerce to bool so the flag never ends up holding the API key itself.
        self.use_llm_search = bool(use_llm_search and OPENROUTER_API_KEY)
        self.page_cache: Dict[str, str] = {}  # Cache crawled pages by URL
        self.html_cache: Dict[str, str] = {}  # Cache HTML separately
        self.profile_cache: Dict[str, Dict] = {}  # Cache results by member

        # Always define these so they exist whichever branch runs below.
        self.llm = None
        self.agent = None
        self.ddg_search = None

        # Initialize LLM agent if API key available
        if self.use_llm_search:
            self._init_llm_agent()
        elif use_llm_search:
            logger.info("LLM search disabled (no OPENROUTER_API_KEY)")
        else:
            logger.info("LLM search disabled by caller")

    def _init_llm_agent(self):
        """Initialize the LLM client and the DuckDuckGo search client.

        On failure the clients are reset to None and the search fallback is
        switched off, so later code does not report it as enabled.
        """
        self.agent = None
        try:
            self.llm = ChatOpenAI(
                api_key=OPENROUTER_API_KEY,
                base_url="https://openrouter.ai/api/v1",
                model="x-ai/grok-4.1-fast:free",
                temperature=0,
            )
            self.ddg_search = DDGS()
            logger.info("LLM search agent initialized")
        except Exception as e:
            logger.error("Failed to initialize LLM agent: %s", e)
            self.llm = None
            self.ddg_search = None
            self.use_llm_search = False

    @staticmethod
    def _result(linkedin_url, confidence, method: str, notes: str) -> Dict:
        """Build a lookup result dict in the standard key order."""
        return {
            "linkedin_url": linkedin_url,
            "confidence": confidence,
            "method": method,
            "notes": notes,
        }

    def web_search(self, query: str) -> List[Dict]:
        """Search the web using DuckDuckGo.

        Returns:
            Up to 10 result dicts, or an empty list when search is
            unavailable or the request fails.
        """
        if not self.ddg_search:
            return []
        try:
            return list(self.ddg_search.text(query, max_results=10))
        except Exception as e:
            logger.error("Web search error: %s", e)
            return []

    async def crawl_page(self, url: str) -> Optional[str]:
        """
        Crawl a webpage and return its content.

        Args:
            url: URL to crawl

        Returns:
            Page content as markdown/text, or None if failed
        """
        if not url:
            return None

        # Check cache first
        if self.use_cache and url in self.page_cache:
            logger.debug("Using cached page for %s", url)
            return self.page_cache[url]

        try:
            logger.info("Crawling: %s", url)
            async with AsyncWebCrawler() as crawler:
                result = await crawler.arun(url)

            if result and result.markdown:
                content = result.markdown
                # Also keep the HTML for better link extraction. A missing or
                # None attribute becomes "" so consumers never see None.
                html_content = getattr(result, "html", None) or ""

                if self.use_cache:
                    self.page_cache[url] = content
                    self.html_cache[url] = html_content

                return content

            logger.warning("No content returned for %s", url)
        except Exception as e:
            logger.error("Error crawling %s: %s", url, e)

        return None

    def _links_for_page(self, url: str, content: str) -> List[Dict[str, str]]:
        """Extract LinkedIn links for a crawled page, preferring cached HTML."""
        # `or content` also covers an empty cached HTML entry.
        html = self.html_cache.get(url) or content
        links = self.extract_linkedin_urls_from_content(html)
        if not links and html is not content:
            links = self.extract_linkedin_urls_from_content(content)
        return links

    def extract_linkedin_urls_from_content(self, content: str) -> List[Dict[str, str]]:
        """
        Extract all LinkedIn profile URLs from content (HTML or markdown).

        Duplicate URLs (after normalization) are dropped; the first
        occurrence and its context are kept.

        Returns:
            List of dicts with 'url', 'context', and 'username'
        """
        if not content:
            return []

        unique_links: List[Dict[str, str]] = []
        seen_urls = set()

        for match in self._PROFILE_URL_RE.finditer(content):
            url = self._normalize_linkedin_url(match.group(0))
            if url in seen_urls:
                continue
            seen_urls.add(url)

            # Surrounding text is what later gets matched against names.
            start = max(0, match.start() - self.CONTEXT_CHARS)
            end = min(len(content), match.end() + self.CONTEXT_CHARS)
            context = self._HTML_TAG_RE.sub(" ", content[start:end])
            context = " ".join(context.split())  # Normalize whitespace

            unique_links.append(
                {"url": url, "context": context, "username": match.group(1)}
            )

        return unique_links

    def _normalize_linkedin_url(self, url: str) -> str:
        """Normalize LinkedIn URL to standard format.

        Strips trailing slashes, rewrites any LinkedIn host (bare, ``www.``
        or country-specific, any letter case) to ``https://www.linkedin.com``
        and upgrades a leading ``http://`` scheme to ``https://``.
        """
        url = url.rstrip("/")
        url = self._LINKEDIN_HOST_RE.sub("https://www.linkedin.com", url)

        # Only the scheme is upgraded; embedded URLs are left alone.
        if url.startswith("http://"):
            url = "https://" + url[len("http://"):]

        return url

    def _name_matches_context(self, name: str, context: str) -> float:
        """
        Check if a person's name appears in the context around a LinkedIn URL.

        Matching is case-insensitive and substring-based.

        Returns:
            Confidence score 0-100
        """
        if not name or not context:
            return 0

        context_lower = context.lower()
        name_lower = name.lower()

        # Check for full name match
        if name_lower in context_lower:
            return 95

        # Split name into parts (handle multiple spaces, titles like "Dr.", etc.)
        name_parts = [p for p in name_lower.replace(".", " ").split() if len(p) > 1]
        if not name_parts:
            return 0

        # Parts of two characters or fewer are too ambiguous to count as hits.
        matches = sum(
            1 for part in name_parts if len(part) > 2 and part in context_lower
        )

        if matches == len(name_parts):
            return 90  # All name parts found
        if matches >= 2:
            return 75  # At least 2 parts found (first + last typically)
        if matches == 1:
            # One hit is stronger evidence when the name has few parts.
            return 50 if len(name_parts) <= 2 else 35
        return 0

    def _name_matches_username(self, name: str, username: str) -> float:
        """
        Check if LinkedIn username contains parts of the name.

        Returns:
            Confidence score 0-100
        """
        if not name or not username:
            return 0

        username_lower = username.lower().replace("-", " ").replace("_", " ")
        name_parts = [p for p in name.lower().replace(".", " ").split() if len(p) > 2]
        if not name_parts:
            return 0

        matches = sum(1 for part in name_parts if part in username_lower)

        if matches == len(name_parts) and len(name_parts) >= 2:
            return 85  # Full name in username
        if matches >= 2:
            return 70  # Multiple parts match
        if matches == 1:
            return 35  # Only one part matches
        return 0

    def _best_link_match(
        self, name: str, role: Optional[str], links: List[Dict[str, str]]
    ) -> Optional[Dict]:
        """Pick the extracted link that best matches a person.

        Each link scores the better of its context and username match, plus
        10 if the role appears in the context. The first link with the
        highest score wins.

        Returns:
            Dict with 'url' and 'confidence' (capped at 100), or None when no
            link reaches MIN_MATCH_SCORE.
        """
        role_lower = role.lower() if role else None
        best_link = None
        best_score = 0

        for link in links:
            context_score = self._name_matches_context(name, link["context"])
            username_score = self._name_matches_username(name, link["username"])
            role_bonus = (
                10 if role_lower and role_lower in link["context"].lower() else 0
            )
            total_score = max(context_score, username_score) + role_bonus

            logger.debug(
                "  %s -> %s: context=%s, username=%s, role=%s, total=%s",
                name,
                link["url"],
                context_score,
                username_score,
                role_bonus,
                total_score,
            )

            if total_score > best_score:
                best_score = total_score
                best_link = link

        if best_link is None or best_score < self.MIN_MATCH_SCORE:
            return None
        return {"url": best_link["url"], "confidence": min(best_score, 100)}

    async def find_linkedin_from_source(
        self, name: str, source_url: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile by crawling the source URL (team page).

        Args:
            name: Person's name
            source_url: URL of the team/about page
            role: Person's role (for additional context matching)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        method = "source_crawl"

        if not source_url:
            return self._result(None, 0, method, "No source URL provided")

        content = await self.crawl_page(source_url)
        if not content:
            return self._result(None, 0, method, f"Failed to crawl {source_url}")

        linkedin_links = self._links_for_page(source_url, content)
        if not linkedin_links:
            return self._result(
                None, 0, method, f"No LinkedIn URLs found on {source_url}"
            )

        best = self._best_link_match(name, role, linkedin_links)
        if best:
            return self._result(
                best["url"], best["confidence"], method, f"Found on {source_url}"
            )

        return self._result(
            None,
            0,
            method,
            f'No matching LinkedIn profile found for "{name}" on {source_url}',
        )

    async def find_linkedin_via_search(
        self, name: str, company: str, role: Optional[str] = None
    ) -> Dict:
        """
        Find LinkedIn profile using web search.

        The first search hit whose URL is a LinkedIn profile and whose
        title/body or username matches the name well enough is returned.

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role (optional)

        Returns:
            Dict with linkedin_url, confidence, method, notes
        """
        method = "web_search"

        if not self.ddg_search:
            return self._result(None, 0, method, "Web search not available")

        try:
            # Build search query - search for LinkedIn profile
            if role:
                query = f"{name} {role} {company} site:linkedin.com/in"
            else:
                query = f"{name} {company} site:linkedin.com/in"

            logger.debug("Searching: %s", query)

            for hit in self.web_search(query):
                url = hit.get("href") or hit.get("link") or ""
                match = self._PROFILE_URL_RE.search(url)
                if not match:
                    continue

                # `or ""` guards against fields that are present but None.
                title = (hit.get("title") or "").lower()
                body = (hit.get("body") or "").lower()

                # Score based on name matching in title/body and username
                total_score = max(
                    self._name_matches_context(name, f"{title} {body}"),
                    self._name_matches_username(name, match.group(1)),
                )

                if total_score >= self.MIN_MATCH_SCORE:
                    return self._result(
                        self._normalize_linkedin_url(match.group(0)),
                        min(total_score, self.MAX_SEARCH_CONFIDENCE),
                        method,
                        "Found via web search",
                    )

            return self._result(
                None, 0, method, "No matching profile found in search results"
            )

        except Exception as e:
            logger.error("Web search error for %s: %s", name, e)
            return self._result(None, 0, method, f"Search error: {str(e)}")

    async def find_linkedin_profile(
        self,
        name: str,
        company: str,
        role: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict:
        """
        Find LinkedIn profile for a person.

        Primary strategy: Crawl source URL to find LinkedIn links.
        Fallback strategy: Web search, when enabled.

        Args:
            name: Person's name
            company: Company/investor name
            role: Person's role/title (optional)
            source_url: URL where person info was found (optional)

        Returns:
            Dict with:
            - linkedin_url: Found LinkedIn URL or None
            - confidence: Confidence score (0-100)
            - method: Method used to find the profile
            - notes: Additional information
        """
        cache_key = f"{name}|{company}"

        if self.use_cache and cache_key in self.profile_cache:
            logger.debug("Using cached result for %s", name)
            return self.profile_cache[cache_key]

        result = self._result(None, 0, "none", "")

        # Primary strategy: Crawl source URL
        if source_url:
            result = await self.find_linkedin_from_source(name, source_url, role)

        # Fallback strategy: Web search (if enabled and no result from source crawl)
        if not result["linkedin_url"] and self.use_llm_search:
            search_result = await self.find_linkedin_via_search(name, company, role)
            if search_result["linkedin_url"]:
                result = search_result

        if not result["linkedin_url"]:
            # Nothing found: report method "none", keeping the crawl's
            # explanation when a crawl was attempted.
            notes = (
                result.get("notes", "Not found")
                if source_url
                else "No source URL available"
            )
            result = self._result(None, 0, "none", notes)

        if self.use_cache:
            self.profile_cache[cache_key] = result

        return result

    @staticmethod
    def _for_member(member: Dict, outcome: Dict, db_callback=None) -> Dict:
        """Tag a result with member identity and persist a found URL.

        `db_callback(member_id, linkedin_url)` is called only when a URL was
        found and the member has an id (an id of 0 counts as valid).
        """
        member_id = member.get("id")
        tagged = {**outcome, "member_id": member_id, "member_name": member["name"]}
        if tagged["linkedin_url"] and db_callback and member_id is not None:
            db_callback(member_id, tagged["linkedin_url"])
        return tagged

    async def batch_find_profiles(
        self, members: List[Dict], progress_callback=None, db_callback=None
    ) -> List[Dict]:
        """
        Find LinkedIn profiles for multiple members efficiently.

        Groups members by source_url to minimize crawling the same page
        multiple times. Results follow that grouping (members with a source
        URL first, grouped by URL, then members without one), not the input
        order.

        Args:
            members: List of dicts with 'name', 'company', 'role', 'source_url', 'id'
            progress_callback: Optional callback function(current, total, result)
            db_callback: Optional callback to save to database immediately when profile found
                Signature: db_callback(member_id, linkedin_url) -> bool

        Returns:
            List of results for each member
        """
        results: List[Dict] = []
        total = len(members)

        # Group members by source_url for efficient crawling
        url_groups: Dict[str, List[Dict]] = {}
        no_url_members: List[Dict] = []
        for member in members:
            url = member.get("source_url")
            if url:
                url_groups.setdefault(url, []).append(member)
            else:
                no_url_members.append(member)

        logger.info(
            "Processing %s unique source URLs for %s members", len(url_groups), total
        )
        logger.info("Members with source URLs: %s", total - len(no_url_members))
        logger.info("Members without source URLs: %s", len(no_url_members))
        if self.use_llm_search:
            logger.info("Web search fallback: ENABLED")
        else:
            logger.info("Web search fallback: DISABLED")

        processed = 0

        # Process members grouped by URL (efficient - one crawl per page)
        for url, group_members in url_groups.items():
            content = await self.crawl_page(url)
            linkedin_links = self._links_for_page(url, content) if content else []

            for member in group_members:
                processed += 1
                name = member["name"]
                role = member.get("role")
                outcome = None

                best = self._best_link_match(name, role, linkedin_links)
                if best:
                    outcome = self._result(
                        best["url"], best["confidence"], "source_crawl", f"Found on {url}"
                    )
                elif self.use_llm_search:
                    # No match on the page: try web search immediately.
                    search_result = await self.find_linkedin_via_search(
                        name, member.get("company") or "", role
                    )
                    if search_result["linkedin_url"]:
                        outcome = search_result
                    # Throttle searches here as in the no-URL path below.
                    await asyncio.sleep(self.rate_limit_delay)

                if outcome is None:
                    if linkedin_links:
                        notes = f"No match on {url}"
                    elif content:
                        notes = f"No LinkedIn URLs on {url}"
                    else:
                        notes = f"Failed to crawl {url}"
                    outcome = self._result(
                        None, 0, "source_crawl" if content else "none", notes
                    )

                result = self._for_member(member, outcome, db_callback)
                results.append(result)
                if progress_callback:
                    progress_callback(processed, total, result)

            # Small delay between different URLs
            await asyncio.sleep(self.rate_limit_delay)

        # Process members without source URLs - do web search immediately for each
        for member in no_url_members:
            processed += 1
            outcome = None

            if self.use_llm_search:
                search_result = await self.find_linkedin_via_search(
                    member["name"], member.get("company") or "", member.get("role")
                )
                if search_result["linkedin_url"]:
                    outcome = search_result

            if outcome is None:
                if self.use_llm_search:
                    outcome = self._result(
                        None, 0, "web_search", "No LinkedIn profile found"
                    )
                else:
                    outcome = self._result(None, 0, "none", "No source URL available")

            result = self._for_member(member, outcome, db_callback)
            results.append(result)
            if progress_callback:
                progress_callback(processed, total, result)

            # Rate limit between searches
            await asyncio.sleep(self.rate_limit_delay)

        return results
|
||||
|
||||
|
||||
def format_linkedin_url(url: str) -> str:
    """Normalize LinkedIn URL format.

    Strips trailing slashes, rewrites any LinkedIn host (bare
    ``linkedin.com``, ``www.`` or a country sub-domain, in any letter case)
    to ``https://www.linkedin.com`` and upgrades a leading ``http://``
    scheme to ``https://``.

    Args:
        url: URL to normalize; falsy values are returned unchanged.

    Returns:
        The normalized URL.
    """
    if not url:
        return url

    # Remove trailing slashes
    url = url.rstrip("/")

    # Ensure https and normalize to www. The sub-domain is optional so a
    # bare "linkedin.com" host is normalized as well.
    url = re.sub(
        r"https?://(?:www\.)?(?:[a-z]{2,3}\.)?linkedin\.com",
        "https://www.linkedin.com",
        url,
        flags=re.IGNORECASE,
    )

    # Upgrade only the scheme; an "http://" embedded later in the URL
    # (e.g. inside a query parameter) is left untouched.
    if url.startswith("http://"):
        url = "https://" + url[len("http://"):]

    return url
|
||||
|
||||
|
||||
# Async wrapper for sync contexts
|
||||
def run_batch_scraper(
    members: List[Dict],
    rate_limit: float = 0.5,
    progress_callback=None,
    db_callback=None,
) -> List[Dict]:
    """
    Synchronous wrapper for batch_find_profiles.

    Starts its own event loop via asyncio.run, so it must not be called from
    code that is already running inside an event loop.

    Args:
        members: List of member dicts
        rate_limit: Delay between URL crawls
        progress_callback: Optional progress callback
        db_callback: Optional callback forwarded to batch_find_profiles,
            called as db_callback(member_id, linkedin_url) when a profile
            is found

    Returns:
        List of results
    """
    scraper = LinkedInProfileScraper(rate_limit_delay=rate_limit)
    return asyncio.run(
        scraper.batch_find_profiles(members, progress_callback, db_callback)
    )
|
||||
Reference in New Issue
Block a user