Anton_wireframe/update_linkedin_profiles.py

#!/usr/bin/env python3
"""
Update Investor Members LinkedIn Profiles Script

This script finds and updates LinkedIn profile URLs for investor members in the database.
Uses crawl4ai to efficiently scrape team pages and extract LinkedIn URLs.

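Usage:
    python update_linkedin_profiles.py [--test] [--limit N] [--skip-existing]
                                       [--start-from N] [--rate-limit SECONDS]

Options:
    --test               Test mode: process only 10 records (unless --limit is given)
                         and don't update the database
    --limit N            Process only N records (default: all)
    --skip-existing      Skip members that already have LinkedIn URLs
    --start-from N       Start from record N (for resuming interrupted runs)
    --rate-limit SECONDS Delay between URL crawls in seconds (default: 0.5)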
"""

import argparse
import asyncio
import json
import os
import sys
from datetime import datetime

# Add app to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "app"))

from db.db import get_db_session
from db.models import InvestorMember, InvestorTable
from linkedin_scraper import LinkedInProfileScraper, format_linkedin_url


def progress_callback(current, total, result):
    """Print a progress line for one processed member.

    Args:
        current: Position of the current item, shown as ``current/total``.
        total: Total number of items, used to compute the percentage.
        result: Mapping with ``member_name`` and ``linkedin_url`` keys;
            ``confidence`` and ``method`` are read only when a URL is present.
    """
    # A zero total would otherwise raise ZeroDivisionError; report 0.0% instead.
    percent = (current / total) * 100 if total else 0.0
    linkedin_url = result["linkedin_url"]
    status = "✓" if linkedin_url else "✗"
    print(f"[{current}/{total} - {percent:.1f}%] {status} {result['member_name']}")
    if linkedin_url:
        print(
            f"  → {linkedin_url} (confidence: {result['confidence']}%, method: {result['method']})"
        )


def create_db_callback(test_mode=False):
    """
    Create a callback function that saves LinkedIn profiles to the database immediately.
    This allows stopping and resuming without losing progress.

    Args:
        test_mode: When true, the callback only prints what it would save and
            never opens a database session.

    Returns:
        A ``(db_callback, saved_count)`` tuple. ``saved_count`` is a dict whose
        ``"count"`` entry is incremented on every successful (or simulated) save.
    """
    saved_count = {"count": 0}  # Use dict to allow modification in closure

    def db_callback(member_id: int, linkedin_url: str) -> bool:
        """Save LinkedIn URL to database immediately; return True on success."""
        # Nothing to store: never overwrite a member's value with an empty URL.
        if not linkedin_url:
            return False

        if test_mode:
            print(f"  [TEST] Would save to DB: member {member_id}")
            saved_count["count"] += 1
            return True

        # Track the session explicitly so cleanup does not depend on swallowing
        # an UnboundLocalError when get_db_session() itself fails.
        db = None
        try:
            db = get_db_session()
            member = db.query(InvestorMember).filter_by(id=member_id).first()
            if not member:
                return False
            member.linkedin = format_linkedin_url(linkedin_url)
            db.commit()
            saved_count["count"] += 1
            return True
        except Exception as e:
            print(f"  ⚠️  DB Error for member {member_id}: {e}")
            if db is not None:
                try:
                    db.rollback()
                except Exception:
                    # Best effort: the original error was already reported.
                    pass
            return False
        finally:
            if db is not None:
                try:
                    db.close()
                except Exception:
                    # Best effort: a failed close must not mask the result.
                    pass

    return db_callback, saved_count


def update_database(members_data, test_mode=False):
    """Update database with found LinkedIn profiles.

    Args:
        members_data: Iterable of dicts with ``member_id`` and ``linkedin_url``
            keys; entries where either value is falsy are skipped.
        test_mode: When true, only print what would be updated. No database
            session is opened in this mode.

    Returns:
        The number of records updated (or that would have been updated).

    Raises:
        Exception: Any error raised while updating is re-raised after the
            transaction has been rolled back.
    """
    if test_mode:
        # Dry run: report the candidates without touching the database at all.
        updated_count = 0
        for data in members_data:
            if data["linkedin_url"] and data["member_id"]:
                print(
                    f"  [TEST MODE] Would update member {data['member_id']}: {data['linkedin_url']}"
                )
                updated_count += 1
        print(f"\n[TEST MODE] Would have updated {updated_count} records")
        return updated_count

    db = get_db_session()
    try:
        updated_count = 0
        for data in members_data:
            if not (data["linkedin_url"] and data["member_id"]):
                continue
            member = db.query(InvestorMember).filter_by(id=data["member_id"]).first()
            if member:
                member.linkedin = format_linkedin_url(data["linkedin_url"])
                updated_count += 1

        # Single commit so the batch is applied atomically.
        db.commit()
        print(f"\n✓ Successfully updated {updated_count} records in database")
        return updated_count

    except Exception as e:
        db.rollback()
        print(f"\n✗ Error updating database: {e}")
        raise
    finally:
        db.close()


def save_results(results, filename="linkedin_scraping_results.json"):
    """Dump the scraping results to a JSON file for backup/analysis.

    The file holds a header (current local timestamp, number of results and
    how many of them carry a LinkedIn URL) followed by the results themselves.
    """
    hits = [entry for entry in results if entry["linkedin_url"]]

    payload = {}
    payload["timestamp"] = datetime.now().isoformat()
    payload["total_processed"] = len(results)
    payload["found_count"] = len(hits)
    payload["results"] = results

    with open(filename, "w") as handle:
        json.dump(payload, handle, indent=2)

    print(f"\n✓ Results saved to {filename}")


def print_summary(results):
    """Print summary statistics for a batch of scraping results.

    Args:
        results: Sequence of dicts with a ``linkedin_url`` key; entries with a
            truthy URL must also provide ``method`` and ``confidence``.

    An empty ``results`` sequence is reported with 0.0% rates instead of
    raising ZeroDivisionError.
    """
    found_results = [r for r in results if r["linkedin_url"]]
    total = len(results)
    found = len(found_results)
    not_found = total - found

    def percent_of(part, whole):
        # Avoid dividing by zero when nothing was processed / found.
        return part / whole * 100 if whole else 0.0

    # Count by method (only profiles that were actually found)
    methods = {}
    for r in found_results:
        methods[r["method"]] = methods.get(r["method"], 0) + 1

    # Average confidence for found profiles
    avg_confidence = (
        sum(r["confidence"] for r in found_results) / found if found > 0 else 0
    )

    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Total processed:        {total}")
    print(f"LinkedIn found:         {found} ({percent_of(found, total):.1f}%)")
    print(f"Not found:              {not_found} ({percent_of(not_found, total):.1f}%)")
    print(f"\nAverage confidence:     {avg_confidence:.1f}%")
    print("\nMethods used:")
    # Most frequently used method first.
    for method, count in sorted(methods.items(), key=lambda x: x[1], reverse=True):
        print(f"  {method:20s} {count:5d} ({percent_of(count, found):.1f}%)")
    print("=" * 60)


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description="Update LinkedIn profiles for investor members"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Test mode: process only 10 records without updating database",
    )
    parser.add_argument("--limit", type=int, help="Limit number of records to process")
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Skip members that already have LinkedIn URLs",
    )
    parser.add_argument(
        "--start-from",
        type=int,
        default=0,
        help="Start from record N (for resuming interrupted runs)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Delay between URL crawls in seconds (default: 0.5)",
    )

    args = parser.parse_args()

    # Test mode defaults to 10 records unless an explicit limit was given
    if args.test and not args.limit:
        args.limit = 10

    return args


def _load_members(args):
    """Query the members selected by ``args`` and return them as plain dicts."""
    db = get_db_session()
    try:
        # Build query
        query = db.query(InvestorMember, InvestorTable).join(
            InvestorTable, InvestorMember.investor_id == InvestorTable.id
        )

        # Filter existing if requested
        if args.skip_existing:
            query = query.filter(
                (InvestorMember.linkedin.is_(None)) | (InvestorMember.linkedin == "")
            )
            print("\n✓ Filtering to members without LinkedIn profiles")

        # Get total count
        total_available = query.count()
        print(f"\n✓ Found {total_available} members to process")

        # OFFSET/LIMIT without ORDER BY has no guaranteed row order, which would
        # make --start-from resume at an arbitrary position; order by primary key.
        query = query.order_by(InvestorMember.id)

        # Apply offset and limit
        if args.start_from > 0:
            query = query.offset(args.start_from)
            print(f"✓ Starting from record {args.start_from}")

        if args.limit:
            query = query.limit(args.limit)
            print(f"✓ Processing {args.limit} records")

        # Copy the needed columns into plain dicts so they stay usable after
        # the session is closed.
        return [
            {
                "id": member.id,
                "name": member.name,
                "company": investor.name,
                "role": member.role,
                "source_url": member.source_url,
            }
            for member, investor in query.all()
        ]
    finally:
        db.close()


def main():
    """Command-line entry point: load members, scrape LinkedIn URLs, report results.

    Found profiles are handed to a database callback as they are discovered
    (or only printed in ``--test`` mode); afterwards a summary is printed and
    the full results are written to a timestamped JSON file.
    """
    args = _parse_args()

    print("=" * 60)
    print("LinkedIn Profile Scraper for Investor Members (crawl4ai)")
    print("=" * 60)

    if args.test:
        print("\n⚠️  TEST MODE - No database changes will be made")

    members_data = _load_members(args)

    if not members_data:
        print("\n⚠️  No members to process")
        return

    # Count unique source URLs
    unique_urls = len({m["source_url"] for m in members_data if m["source_url"]})
    with_urls = sum(1 for m in members_data if m["source_url"])

    print(f"\n✓ Loaded {len(members_data)} members")
    print(
        f"✓ {with_urls} members have source URLs ({unique_urls} unique pages to crawl)"
    )
    print(f"✓ {len(members_data) - with_urls} members without source URLs")
    print(f"✓ Rate limit: {args.rate_limit}s between page crawls")
    print("\nStarting LinkedIn profile search using crawl4ai...\n")

    # Initialize scraper
    scraper = LinkedInProfileScraper(rate_limit_delay=args.rate_limit, use_cache=True)

    print("ℹ️  Using crawl4ai to scrape team pages and extract LinkedIn URLs")
    print(
        "ℹ️  Profiles are saved to database IMMEDIATELY when found - safe to stop anytime!\n"
    )

    # Create database callback for real-time saving
    db_callback, saved_count = create_db_callback(test_mode=args.test)

    # Process members asynchronously with real-time DB saving
    results = asyncio.run(
        scraper.batch_find_profiles(
            members_data, progress_callback=progress_callback, db_callback=db_callback
        )
    )

    # Print summary
    print_summary(results)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"linkedin_results_{timestamp}.json"
    save_results(results, results_file)

    # Show database update summary
    if not args.test:
        print(
            f"\n✓ Database updated in real-time: {saved_count['count']} profiles saved"
        )
    else:
        print(
            f"\n[TEST MODE] Would have saved {saved_count['count']} profiles to database"
        )

    print("\n✓ Done! You can resume anytime with --skip-existing")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()