311 lines
9.7 KiB
Python
311 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Update Investor Members LinkedIn Profiles Script
|
||
|
||
This script finds and updates LinkedIn profile URLs for investor members in the database.
|
||
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.
|
||
|
||
Usage:
|
||
python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing] [--start-from N] [--rate-limit SECONDS]
|
||
|
||
Options:
|
||
--test Test mode: process only 10 records and don't update database
|
||
--limit N Process only N records (default: all)
|
||
--skip-existing Skip members that already have LinkedIn URLs
|
||
--start-from N Start from record N (for resuming)
--rate-limit SECONDS Delay between page crawls in seconds (default: 0.5)
|
||
"""
|
||
|
||
import argparse
|
||
import asyncio
|
||
import json
|
||
import os
|
||
import sys
|
||
from datetime import datetime
|
||
|
||
# Add app to path
|
||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))
|
||
|
||
from db.db import get_db_session
|
||
from db.models import InvestorMember, InvestorTable
|
||
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url
|
||
|
||
|
||
def progress_callback(current, total, result):
    """Print a progress line for one processed member.

    Args:
        current: Number of members processed so far.
        total: Total number of members in the batch.
        result: Mapping that provides ``member_name`` and ``linkedin_url``;
            ``confidence`` and ``method`` are also read when a URL was found.
    """
    # An empty batch must not crash the progress display with a
    # ZeroDivisionError, so report 0.0% when there is nothing to process.
    percent = (current / total) * 100 if total else 0.0
    linkedin_url = result["linkedin_url"]
    status = "✓" if linkedin_url else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if linkedin_url:
        print(
            f" → {linkedin_url} (confidence: {result['confidence']}%, method: {result['method']})"
        )
|
||
|
||
|
||
def create_db_callback(test_mode=False):
    """Build a callback that stores each LinkedIn URL as soon as it is found.

    Saving per record (rather than in one final batch) allows a run to be
    stopped and resumed without losing progress.

    Args:
        test_mode: When true, the callback only prints what it would save and
            never opens a database session.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict whose
        ``"count"`` entry is incremented for every successful (or, in test
        mode, simulated) save.
    """
    saved_count = {"count": 0}  # Use dict to allow modification in closure

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save one member's LinkedIn URL; return True only if it was stored."""
        if test_mode:
            print(f" [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Bind the name before the try block so the cleanup paths never touch
        # an unbound variable when opening the session itself fails.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                # Unknown member id: nothing to update.
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
            saved_count["count"] += 1
            return True
        except Exception as e:
            print(f" ⚠️ DB Error for member {member_id}: {e}")
            if db is not None:
                # Rollback stays best-effort, but a failure is reported
                # instead of being silently discarded.
                try:
                    db.rollback()
                except Exception as rollback_error:
                    print(
                        f" ⚠️ Rollback failed for member {member_id}: {rollback_error}"
                    )
            return False
        finally:
            if db is not None:
                # Closing is best-effort as well: a failure here must not hide
                # the result of the save attempt.
                try:
                    db.close()
                except Exception as close_error:
                    print(
                        f" ⚠️ Could not close DB session for member {member_id}: {close_error}"
                    )

    return db_callback, saved_count
|
||
|
||
|
||
def update_database(members_data, test_mode=False):
    """Apply a batch of found LinkedIn URLs to the database in one commit.

    Args:
        members_data: Iterable of mappings read for ``linkedin_url`` and
            ``member_id``; entries where either value is falsy are skipped.
        test_mode: When true, only print what would be updated and do not
            commit anything.

    Returns:
        The number of records updated (or that would have been updated in
        test mode).

    Raises:
        Exception: Any error raised while updating is re-raised after the
            session has been rolled back.
    """
    session = get_db_session()

    try:
        applied = 0
        for entry in members_data:
            # Skip entries lacking either a URL or a member id.
            if not (entry["linkedin_url"] and entry["member_id"]):
                continue

            if test_mode:
                print(
                    f" [TEST MODE] Would update member {entry['member_id']}: {entry['linkedin_url']}"
                )
                applied += 1
                continue

            record = (
                session.query(InvestorMember).filter_by(id=entry["member_id"]).first()
            )
            if record:
                record.linkedin = format_linkedin_url(entry["linkedin_url"])
                applied += 1

        if test_mode:
            print(f"\n[TEST MODE] Would have updated {applied} records")
        else:
            session.commit()
            print(f"\n✓ Successfully updated {applied} records in database")

        return applied

    except Exception as exc:
        session.rollback()
        print(f"\n✗ Error updating database: {exc}")
        raise
    finally:
        session.close()
|
||
|
||
|
||
def save_results(results, filename="linkedin_scraping_results.json"):
    """Dump the scraping results to a JSON file for backup and analysis.

    The file holds the current timestamp, the number of processed entries,
    the number of entries with a truthy ``linkedin_url``, and the raw results.

    Args:
        results: List of JSON-serializable result mappings.
        filename: Path of the JSON file to write (overwritten if it exists).
    """
    payload = {
        "timestamp": datetime.now().isoformat(),
        "total_processed": len(results),
        "found_count": len([row for row in results if row["linkedin_url"]]),
        "results": results,
    }

    with open(filename, "w") as handle:
        json.dump(payload, handle, indent=2)

    print(f"\n✓ Results saved to {filename}")
|
||
|
||
|
||
def print_summary(results):
    """Print summary statistics for a finished scraping run.

    Reports how many members were processed, how many LinkedIn URLs were
    found, the average confidence of the found profiles, and a breakdown by
    discovery method. An empty ``results`` list prints zeros instead of
    raising ZeroDivisionError.

    Args:
        results: Sequence of result mappings. Every entry is read for
            ``linkedin_url``; entries with a truthy URL are also read for
            ``method`` and ``confidence``.
    """

    def percentage(part, whole):
        """Return ``part`` as a percentage of ``whole`` (0.0 when whole is 0)."""
        return part / whole * 100 if whole else 0.0

    hits = [r for r in results if r["linkedin_url"]]
    total = len(results)
    found = len(hits)
    not_found = total - found

    # Count by method (only for entries where a profile was found)
    methods = {}
    for r in hits:
        method = r["method"]
        methods[method] = methods.get(method, 0) + 1

    # Average confidence for found profiles
    avg_confidence = sum(r["confidence"] for r in hits) / found if found > 0 else 0

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    print(f"LinkedIn found: {found} ({percentage(found, total):.1f}%)")
    print(f"Not found: {not_found} ({percentage(not_found, total):.1f}%)")
    print(f"\nAverage confidence: {avg_confidence:.1f}%")
    print("\nMethods used:")
    # Most frequently used method first; ties keep first-seen order.
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f" {method:20s} {count:5d} ({percentage(count, found):.1f}%)")
    print("=" * 60)
|
||
|
||
|
||
def main():
    """Command-line entry point: load members, find profiles, report results.

    Steps:
        1. Parse the command-line options.
        2. Load the selected investor members (joined with their investor)
           from the database and close the session.
        3. Run the scraper over those members, saving each found profile
           through the per-record database callback.
        4. Print a summary and write the results to a timestamped JSON file.
    """
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )

    args = parser.parse_args()

    # Test mode falls back to a 10-record sample unless a limit was given
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)

    if args.test:
        print("\n⚠️ TEST MODE - No database changes will be made")

    # Load the members to process; the session is only needed for this step
    db = get_db_session()

    try:
        # Build query
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        # Get total count
        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT without ORDER BY returns rows in an unspecified order,
        # so --start-from could skip or repeat members between runs. Ordering
        # by primary key makes the window stable. It must be applied before
        # offset()/limit().
        query = query.order_by(InvestorMember.id)

        # Apply offset and limit
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")

        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Fetch members as plain dicts so they remain usable after the
        # session is closed
        members_data = [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]

        if not members_data:
            print("\n⚠️ No members to process")
            return

        # Count unique source URLs
        unique_urls = len({m["source_url"] for m in members_data if m["source_url"]})
        with_urls = sum(1 for m in members_data if m["source_url"])

        print(f"\n✓ Loaded {len(members_data)} members")
        print(
            f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
        )
        print(f"✓ {len(members_data) - with_urls} members without source URLs")
        print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
        print("\nStarting LinkedIn profile search using crawl4ai...\n")

    finally:
        db.close()

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)

    print("ℹ️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously with real-time DB saving
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    # Print summary
    print_summary(results)

    # Save results to a timestamped file so earlier runs are not overwritten
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )

    print("\n✓ Done! You can resume anytime with --skip-existing")
|
||
|
||
|
||
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|