Files
Anton_wireframe/update_linkedin_profiles.py
T
2025-11-27 16:44:22 +01:00

311 lines
9.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Update Investor Members LinkedIn Profiles Script
This script finds and updates LinkedIn profile URLs for investor members in the database.
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.
Usage:
python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing] [--start-from N] [--rate-limit SECONDS]
Options:
--test Test mode: process only 10 records (unless --limit is given) and don't update database
--limit N Process only N records (default: all)
--skip-existing Skip members that already have LinkedIn URLs
--start-from N Start from record N (for resuming)
--rate-limit SECONDS Delay between page crawls in seconds (default: 0.5)
"""
import argparse
import asyncio
import json
import os
import sys
from datetime import datetime
# Add app to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))
from db.db import get_db_session
from db.models import InvestorMember, InvestorTable
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url
def progress_callback(current, total, result):
    """Print a progress line for one processed member.

    Args:
        current: Position of this member in the batch (numerator of the
            percentage shown).
        total: Size of the batch (denominator of the percentage shown).
        result: Mapping with the keys ``member_name`` and ``linkedin_url``;
            when ``linkedin_url`` is truthy, ``confidence`` and ``method``
            are read as well.
    """
    # An empty batch must not crash the progress display with a division by zero.
    percent = (current / total) * 100 if total else 0.0
    found = bool(result["linkedin_url"])
    # Distinct markers so hits and misses can be told apart in the log.
    status = "✓" if found else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if found:
        print(
            f"{result['linkedin_url']} (confidence: {result['confidence']}%, "
            f"method: {result['method']})"
        )
def create_db_callback(test_mode=False):
    """Build a callback that persists each LinkedIn URL as soon as it is found.

    Every call of the returned callback uses its own session and commit, so an
    interrupted run keeps everything that was saved up to that point.

    Args:
        test_mode: When true, the callback only prints what it would save and
            never opens a database session.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict whose
        ``"count"`` entry is incremented on every successful (or, in test
        mode, simulated) save.
    """
    # A dict (rather than an int) so the caller shares the same mutable counter.
    saved_count = {"count": 0}

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save ``linkedin_url`` for ``member_id``.

        Returns:
            True when the URL was saved (or would have been, in test mode);
            False when the member does not exist or a database error
            occurred. Errors are printed, never raised.
        """
        if test_mode:
            print(f" [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Bound before the try so the cleanup paths can tell whether a session
        # was ever opened instead of tripping over an unbound name.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
            saved_count["count"] += 1
            return True
        except Exception as e:
            print(f" ⚠️ DB Error for member {member_id}: {e}")
            if db is not None:
                try:
                    db.rollback()
                except Exception as rollback_error:
                    # Still best-effort, but no longer silent.
                    print(
                        f" ⚠️ Rollback failed for member {member_id}: {rollback_error}"
                    )
            return False
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception:
                    # Best-effort cleanup: a failed close must not mask the
                    # result already decided above.
                    pass

    return db_callback, saved_count
def update_database(members_data, test_mode=False):
    """Write the found LinkedIn URLs to the database in a single commit.

    Args:
        members_data: Iterable of mappings with the keys ``member_id`` and
            ``linkedin_url``. Entries where either value is falsy are skipped.
        test_mode: When true, nothing is written and no database session is
            opened; the function only prints what it would update.

    Returns:
        The number of records updated (or that would have been updated in
        test mode). Outside test mode, ids that match no member are not
        counted.

    Raises:
        Exception: Any error raised while updating is printed, the session is
            rolled back, and the error is re-raised.
    """
    # Test mode must work without a reachable database, so the session is
    # only opened when changes will actually be written.
    db = None if test_mode else get_db_session()
    try:
        updated_count = 0
        for data in members_data:
            if not (data["linkedin_url"] and data["member_id"]):
                continue
            if test_mode:
                print(
                    f" [TEST MODE] Would update member {data['member_id']}: {data['linkedin_url']}"
                )
                updated_count += 1
                continue
            member = db.query(InvestorMember).filter_by(id=data["member_id"]).first()
            if member:
                member.linkedin = format_linkedin_url(data["linkedin_url"])
                updated_count += 1

        if test_mode:
            print(f"\n[TEST MODE] Would have updated {updated_count} records")
        else:
            db.commit()
            print(f"\n✓ Successfully updated {updated_count} records in database")
        return updated_count
    except Exception as e:
        if db is not None:
            db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        if db is not None:
            db.close()
def save_results(results, filename="linkedin_scraping_results.json"):
    """Write the scraping results to a JSON file for backup and analysis.

    The file holds a timestamp, the number of processed results, the number
    of results with a truthy ``linkedin_url``, and the results themselves.

    Args:
        results: List of JSON-serialisable result mappings, each with a
            ``linkedin_url`` key.
        filename: Path of the JSON file to (over)write.
    """
    output = {
        "timestamp": datetime.now().isoformat(),
        "total_processed": len(results),
        "found_count": sum(1 for r in results if r["linkedin_url"]),
        "results": results,
    }
    # Explicit UTF-8 keeps the file identical across platforms, and
    # ensure_ascii=False keeps non-ASCII names readable instead of \uXXXX.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\n✓ Results saved to {filename}")
def print_summary(results):
    """Print summary statistics for a batch of scraping results.

    Reports the totals, the found / not-found split, the average confidence
    of found profiles, and how many profiles each method produced.

    Args:
        results: List of result mappings with a ``linkedin_url`` key; entries
            with a truthy ``linkedin_url`` must also carry ``method`` and
            ``confidence``. An empty list prints an all-zero summary.
    """

    def percent_of(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
        return part / whole * 100 if whole else 0.0

    total = len(results)
    found_results = [r for r in results if r["linkedin_url"]]
    found = len(found_results)
    not_found = total - found

    # Count found profiles per discovery method.
    methods = {}
    for r in found_results:
        methods[r["method"]] = methods.get(r["method"], 0) + 1

    # Average confidence over found profiles only.
    avg_confidence = (
        sum(r["confidence"] for r in found_results) / found if found > 0 else 0
    )

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    print(f"LinkedIn found: {found} ({percent_of(found, total):.1f}%)")
    print(f"Not found: {not_found} ({percent_of(not_found, total):.1f}%)")
    print(f"\nAverage confidence: {avg_confidence:.1f}%")
    print("\nMethods used:")
    # Most productive method first.
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f" {method:20s} {count:5d} ({percent_of(count, found):.1f}%)")
    print("=" * 60)
def _parse_args():
    """Parse and return the command-line options of this script."""
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )
    return parser.parse_args()


def _load_members(args):
    """Query the members selected by ``args`` and return them as plain dicts.

    Each dict has the keys ``id``, ``name``, ``company``, ``role`` and
    ``source_url``. The database session is closed before returning.
    """
    db = get_db_session()
    try:
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT without ORDER BY returns rows in an unspecified order,
        # which would make --start-from resume at an arbitrary position.
        query = query.order_by(InvestorMember.id)
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")
        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        return [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]
    finally:
        db.close()


def main():
    """Run the LinkedIn profile update from the command line.

    Loads the selected members, runs the scraper over them with a callback
    that saves each found profile immediately, then prints a summary and
    writes the results to a timestamped JSON file.
    """
    args = _parse_args()

    # Test mode defaults to 10 records; an explicit --limit still wins.
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)
    if args.test:
        print("\n⚠️ TEST MODE - No database changes will be made")

    members_data = _load_members(args)
    if not members_data:
        print("\n⚠️ No members to process")
        return

    # Several members can share one source URL, so count unique URLs as well.
    unique_urls = len({m["source_url"] for m in members_data if m["source_url"]})
    with_urls = sum(1 for m in members_data if m["source_url"])
    print(f"\n✓ Loaded {len(members_data)} members")
    print(
        f"{with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
    )
    print(f"{len(members_data) - with_urls} members without source URLs")
    print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
    print("\nStarting LinkedIn profile search using crawl4ai...\n")

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)
    print("️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Found profiles are persisted through this callback as the scraper runs.
    db_callback, saved_count = create_db_callback(test_mode=args.test)
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    print_summary(results)

    # Keep a timestamped JSON copy of this run's results.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )
    print("\n✓ Done! You can resume anytime with --skip-existing")
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()