#!/usr/bin/env python3
"""
Update Investor Members LinkedIn Profiles Script

This script finds and updates LinkedIn profile URLs for investor members in
the database. Uses crawl4ai to efficiently scrape team pages and extract
LinkedIn URLs.

Usage:
    python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing]
                                       [--start-from N] [--rate-limit SECONDS]

Options:
    --test                Test mode: don't update the database; processes only
                          10 records unless --limit is given
    --limit N             Process only N records (default: all)
    --skip-existing       Skip members that already have LinkedIn URLs
    --start-from N        Start from record N (for resuming)
    --rate-limit SECONDS  Delay between page crawls in seconds (default: 0.5)
"""

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime

# Add app to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))

from db.db import get_db_session
from db.models import InvestorMember, InvestorTable
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url


def _percent(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is zero."""
    return (part / whole) * 100 if whole else 0.0


def progress_callback(current, total, result):
    """Print a one-line progress update for a processed member.

    Args:
        current: Number of members processed so far.
        total: Total number of members in this run.
        result: Dict with at least ``member_name`` and ``linkedin_url``; when
            ``linkedin_url`` is truthy, ``confidence`` and ``method`` are
            printed as well.
    """
    # Guard against total == 0 so a degenerate run cannot crash the reporter.
    percent = _percent(current, total)
    found_url = result["linkedin_url"]
    status = "✓" if found_url else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if found_url:
        print(
            f" → {found_url} (confidence: {result['confidence']}%, method: {result['method']})"
        )


def create_db_callback(test_mode=False):
    """Create a callback that saves one LinkedIn URL to the database per call.

    Each call opens its own session and commits immediately, so profiles
    already saved are not lost if the run is interrupted.

    Args:
        test_mode: When true, the callback only prints what it would save and
            never touches the database.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict whose
        ``"count"`` entry is incremented on every successful (or simulated)
        save.
    """
    saved_count = {"count": 0}  # Use dict to allow modification in closure

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save ``linkedin_url`` for ``member_id``; return True on success."""
        if test_mode:
            print(f" [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Bind before the try so the handlers below can tell whether a
        # session was ever opened.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if member:
                member.linkedin = format_linkedin_url(linkedin_url)
                db.commit()
                saved_count["count"] += 1
                return True
        except Exception as e:
            print(f" ⚠️ DB Error for member {member_id}: {e}")
            if db is not None:
                try:
                    db.rollback()
                except Exception as rollback_error:
                    # Best effort: report but never let cleanup mask the
                    # original failure or abort the batch.
                    print(f" ⚠️ DB rollback failed: {rollback_error}")
            return False
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception as close_error:
                    print(f" ⚠️ DB close failed: {close_error}")

        # Member id not present in the database.
        return False

    return db_callback, saved_count


def update_database(members_data, test_mode=False):
    """Update the database with found LinkedIn profiles in one transaction.

    Args:
        members_data: Iterable of dicts with ``member_id`` and
            ``linkedin_url`` keys; entries where either is falsy are skipped.
        test_mode: When true, print the intended updates without writing.

    Returns:
        Number of records updated (or that would have been updated in test
        mode).

    Raises:
        Exception: Any database error is re-raised after a rollback.
    """
    db = get_db_session()
    try:
        updated_count = 0
        for data in members_data:
            if not (data["linkedin_url"] and data["member_id"]):
                continue

            if test_mode:
                print(
                    f" [TEST MODE] Would update member {data['member_id']}: {data['linkedin_url']}"
                )
                updated_count += 1
                continue

            member = db.query(InvestorMember).filter_by(id=data["member_id"]).first()
            if member:
                member.linkedin = format_linkedin_url(data["linkedin_url"])
                updated_count += 1

        if not test_mode:
            db.commit()
            print(f"\n✓ Successfully updated {updated_count} records in database")
        else:
            print(f"\n[TEST MODE] Would have updated {updated_count} records")

        return updated_count
    except Exception as e:
        db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        db.close()


def save_results(results, filename="linkedin_scraping_results.json"):
    """Save results to a JSON file for backup/analysis.

    Args:
        results: List of result dicts, each with a ``linkedin_url`` key.
        filename: Path of the JSON file to write.
    """
    output = {
        "timestamp": datetime.now().isoformat(),
        "total_processed": len(results),
        "found_count": sum(1 for r in results if r["linkedin_url"]),
        "results": results,
    }

    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    print(f"\n✓ Results saved to {filename}")


def print_summary(results):
    """Print summary statistics for a list of scraping results.

    Args:
        results: List of result dicts with ``linkedin_url`` and, for found
            profiles, ``method`` and ``confidence``. May be empty.
    """
    total = len(results)
    found_results = [r for r in results if r["linkedin_url"]]
    found = len(found_results)
    not_found = total - found

    # Count by method (insertion order is kept so ties sort stably below)
    methods = {}
    for r in found_results:
        methods[r["method"]] = methods.get(r["method"], 0) + 1

    # Average confidence for found profiles
    avg_confidence = (
        sum(r["confidence"] for r in found_results) / found if found > 0 else 0
    )

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed: {total}")
    # _percent() avoids ZeroDivisionError when nothing was processed.
    print(f"LinkedIn found: {found} ({_percent(found, total):.1f}%)")
    print(f"Not found: {not_found} ({_percent(not_found, total):.1f}%)")
    print(f"\nAverage confidence: {avg_confidence:.1f}%")
    print("\nMethods used:")
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f" {method:20s} {count:5d} ({_percent(count, found):.1f}%)")
    print("=" * 60)


def _build_parser():
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )
    return parser


def _load_members(args):
    """Query the members selected by ``args`` and return them as plain dicts."""
    db = get_db_session()
    try:
        # Build query
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        # Get total count
        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT without ORDER BY returns rows in an unspecified order,
        # which would make --start-from / --limit slices differ between runs.
        # The ordering must be applied before offset()/limit().
        query = query.order_by(InvestorMember.id)

        # Apply offset and limit
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")

        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Copy into plain dicts so nothing depends on the session after close.
        return [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]
    finally:
        db.close()


def main(argv=None):
    """Run the LinkedIn profile update from the command line.

    Args:
        argv: Optional list of argument strings; defaults to ``sys.argv[1:]``.
    """
    args = _build_parser().parse_args(argv)

    # Test mode overrides limit
    if args.test and not args.limit:
        args.limit = 10

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)

    if args.test:
        print("\n⚠️ TEST MODE - No database changes will be made")

    members_data = _load_members(args)
    if not members_data:
        print("\n⚠️ No members to process")
        return

    # Count unique source URLs
    source_urls = [m["source_url"] for m in members_data if m["source_url"]]
    unique_urls = len(set(source_urls))
    with_urls = len(source_urls)

    print(f"\n✓ Loaded {len(members_data)} members")
    print(
        f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
    )
    print(f"✓ {len(members_data) - with_urls} members without source URLs")
    print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
    print("\nStarting LinkedIn profile search using crawl4ai...\n")

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)

    print("ℹ️ Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️ Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously; the scraper is handed db_callback and is
    # presumably responsible for invoking it per found profile (defined in
    # linkedin_scraper, not visible here).
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    # Print summary
    print_summary(results)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )

    print("\n✓ Done! You can resume anytime with --skip-existing")


if __name__ == "__main__":
    main()