311 lines
9.7 KiB
Python
311 lines
9.7 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
Update Investor Members LinkedIn Profiles Script
|
|||
|
|
|
|||
|
|
This script finds and updates LinkedIn profile URLs for investor members in the database.
|
|||
|
|
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.
|
|||
|
|
|
|||
|
|
Usage:
|
|||
|
|
python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing] [--start-from N] [--rate-limit SECONDS]
|
|||
|
|
|
|||
|
|
Options:
|
|||
|
|
--test Test mode: process only 10 records and don't update database
|
|||
|
|
--limit N Process only N records (default: all)
|
|||
|
|
--skip-existing Skip members that already have LinkedIn URLs
|
|||
|
|
--start-from N Start from record N (for resuming)
--rate-limit SECONDS Delay between URL crawls in seconds (default: 0.5)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import asyncio
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
# Add app to path
|
|||
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))
|
|||
|
|
|
|||
|
|
from db.db import get_db_session
|
|||
|
|
from db.models import InvestorMember, InvestorTable
|
|||
|
|
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url
|
|||
|
|
|
|||
|
|
|
|||
|
|
def progress_callback(current, total, result):
    """Report one processed member on stdout.

    Prints a progress line with the position, percentage, a found/not-found
    marker and the member name. When a LinkedIn URL is present in ``result``,
    a second line shows the URL, its confidence and the method that found it.

    Args:
        current: Number of members processed so far.
        total: Total number of members in this run.
        result: Mapping with ``linkedin_url`` and ``member_name`` keys, plus
            ``confidence`` and ``method`` when a URL was found.
    """
    pct = (current / total) * 100
    url = result["linkedin_url"]
    marker = "✓" if url else "✗"
    print(f"[{current}/{total} - {pct:.1f}%] {marker} {result['member_name']}")

    # Nothing more to show for members without a profile.
    if not url:
        return

    print(
        f" → {url} (confidence: {result['confidence']}%, method: {result['method']})"
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
def create_db_callback(test_mode=False):
    """Build a callback that persists each LinkedIn URL as soon as it is found.

    Saving per member (instead of in one batch at the end) allows stopping
    and resuming a run without losing progress.

    Args:
        test_mode: When true, the callback only prints what it would save and
            never opens a database session.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict with
        a single ``"count"`` key that the callback increments on every
        successful (or, in test mode, simulated) save.
    """
    # A dict (rather than an int) so the nested function can mutate it.
    saved_count = {"count": 0}

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save one member's LinkedIn URL; return True if it was stored."""
        if test_mode:
            print(f" [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Track the session explicitly so cleanup never touches an unbound
        # name when get_db_session() itself fails.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                # Unknown member id: nothing to update.
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
        except Exception as e:
            # Boundary handler: one failing member must not abort the run.
            print(f" ⚠️ DB Error for member {member_id}: {e}")
            if db is not None:
                try:
                    db.rollback()
                except Exception as rollback_error:
                    print(
                        f" ⚠️ Rollback failed for member {member_id}: {rollback_error}"
                    )
            return False
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception:
                    # Closing is best-effort; the save result is already decided.
                    pass

        saved_count["count"] += 1
        return True

    return db_callback, saved_count
|
|||
|
|
|
|||
|
|
|
|||
|
|
def update_database(members_data, test_mode=False):
    """Write found LinkedIn URLs to the database in a single transaction.

    Entries without a truthy ``linkedin_url`` and ``member_id`` are skipped.
    In test mode nothing is written; the intended updates are printed.

    Args:
        members_data: Iterable of mappings with ``linkedin_url`` and
            ``member_id`` keys.
        test_mode: When true, only report what would be updated.

    Returns:
        Number of records updated (or that would have been updated).

    Raises:
        Exception: Any error raised while updating is re-raised after the
            session has been rolled back.
    """
    db = get_db_session()

    try:
        touched = 0
        for entry in members_data:
            # Guard: need both a URL and a member id to do anything.
            if not (entry["linkedin_url"] and entry["member_id"]):
                continue

            if test_mode:
                print(
                    f" [TEST MODE] Would update member {entry['member_id']}: {entry['linkedin_url']}"
                )
                touched += 1
                continue

            row = db.query(InvestorMember).filter_by(id=entry["member_id"]).first()
            if row:
                row.linkedin = format_linkedin_url(entry["linkedin_url"])
                touched += 1

        if test_mode:
            print(f"\n[TEST MODE] Would have updated {touched} records")
        else:
            db.commit()
            print(f"\n✓ Successfully updated {touched} records in database")

        return touched

    except Exception as e:
        db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        db.close()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_results(results, filename="linkedin_scraping_results.json"):
    """Write the scraping results and summary counts to a JSON file.

    The file holds the current timestamp, the number of processed entries,
    the number of entries with a truthy ``linkedin_url``, and the raw results.

    Args:
        results: List of JSON-serialisable result mappings, each with a
            ``linkedin_url`` key.
        filename: Path of the JSON file to write.
    """
    stamp = datetime.now().isoformat()
    processed = len(results)
    hits = sum(bool(row["linkedin_url"]) for row in results)

    payload = {
        "timestamp": stamp,
        "total_processed": processed,
        "found_count": hits,
        "results": results,
    }

    with open(filename, "w") as handle:
        json.dump(payload, handle, indent=2)

    print(f"\n✓ Results saved to {filename}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def print_summary(results):
    """Print summary statistics for a scraping run.

    Reports how many members were processed, how many LinkedIn URLs were
    found or not found (with percentages), the average confidence of the
    found profiles, and a per-method breakdown sorted by descending count.

    An empty ``results`` list is reported as 0.0% instead of raising
    ``ZeroDivisionError``.

    Args:
        results: List of result mappings with a ``linkedin_url`` key, plus
            ``method`` and ``confidence`` keys on entries where a URL was found.
    """

    def _percent(part, whole):
        # Share of `part` in `whole`, defined as 0 for an empty whole.
        return part / whole * 100 if whole else 0.0

    total = len(results)
    found_rows = [r for r in results if r["linkedin_url"]]
    found = len(found_rows)
    not_found = total - found

    # Count found profiles per discovery method.
    methods = {}
    for r in found_rows:
        method = r["method"]
        methods[method] = methods.get(method, 0) + 1

    # Average confidence over found profiles only.
    avg_confidence = sum(r["confidence"] for r in found_rows) / found if found else 0

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    print(f"LinkedIn found: {found} ({_percent(found, total):.1f}%)")
    print(f"Not found: {not_found} ({_percent(not_found, total):.1f}%)")
    print(f"\nAverage confidence: {avg_confidence:.1f}%")
    print("\nMethods used:")
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f" {method:20s} {count:5d} ({_percent(count, found):.1f}%)")
    print("=" * 60)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Run the LinkedIn profile update from the command line.

    Steps:
        1. Parse the CLI options (``--test``, ``--limit``, ``--skip-existing``,
           ``--start-from``, ``--rate-limit``).
        2. Load the selected investor members, joined with their investor,
           into plain dicts and close the database session.
        3. Run the scraper over those members; each found URL is saved through
           the callback from ``create_db_callback`` as soon as it is found
           (or only reported in test mode).
        4. Print a summary and write the results to a timestamped JSON file.

    Members are ordered by ``InvestorMember.id`` before ``--start-from`` and
    ``--limit`` are applied, so the selected window is the same on every run.
    """
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )

    args = parser.parse_args()

    # Test mode defaults to 10 records unless an explicit limit was given.
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)

    if args.test:
        print("\n⚠️ TEST MODE - No database changes will be made")

    # Load everything needed from the database, then release the session
    # before the (long-running) crawl starts.
    db = get_db_session()

    try:
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT without ORDER BY returns rows in an unspecified order,
        # which would make --start-from skip or repeat members between runs.
        query = query.order_by(InvestorMember.id)

        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")

        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Copy the fields into plain dicts so they remain usable after the
        # session is closed.
        members_data = [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]

        if not members_data:
            print("\n⚠️ No members to process")
            return

        # Count members with a source URL and the distinct pages to crawl.
        source_urls = [m["source_url"] for m in members_data if m["source_url"]]
        with_urls = len(source_urls)
        unique_urls = len(set(source_urls))

        print(f"\n✓ Loaded {len(members_data)} members")
        print(
            f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
        )
        print(f"✓ {len(members_data) - with_urls} members without source URLs")
        print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
        print("\nStarting LinkedIn profile search using crawl4ai...\n")

    finally:
        db.close()

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)

    print("ℹ️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously with real-time DB saving
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    print_summary(results)

    # Keep a timestamped JSON copy of the results for backup/analysis.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )

    print("\n✓ Done! You can resume anytime with --skip-existing")
|
|||
|
|
|
|||
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|