# microcap_scrapping/scrape_sec_filings.py

"""
Scrape SEC EDGAR filings and extract ownership data
Gets 10-K, 10-Q, 8-K, DEF 14A, and insider ownership (Forms 3, 4, 5, 13D, 13G)
"""

import asyncio
import json
import os
import re
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import requests
import time
from typing import Dict, List, Any, Optional

from config import SEC_BASE_URL, SEC_API_URL, SEC_USER_AGENT, FILING_TYPES_SEC


class SECFilingScraper:
    def __init__(self, output_dir="data/sec_filings"):
        """Prepare the output directory and the headers used for SEC requests.

        Args:
            output_dir: Directory that receives the per-ticker JSON files.
                It is created, including missing parents, if absent.
        """
        self.output_dir = output_dir
        # exist_ok=True makes repeated construction against the same
        # directory a no-op instead of an error.
        os.makedirs(self.output_dir, exist_ok=True)

        # Sent with every `requests` call issued by this class.
        user_agent = SEC_USER_AGENT
        self.headers = {'User-Agent': user_agent}

    def get_cik_from_ticker(self, ticker: str) -> Optional[str]:
        """Resolve a ticker symbol to a zero-padded, 10-character CIK string.

        Three lookups are tried in order and the first hit is returned:

        1. The ``company_tickers.json`` listing under ``SEC_API_URL``.
        2. A small built-in table of well-known tickers.
        3. EDGAR's ``browse-edgar`` company search (Atom output), from which
           the first ``CIK=<digits>`` occurrence is taken.

        A failure of lookup 1 is reported and then ignored so the fallbacks
        still run; any other failure is reported and yields ``None``.

        Args:
            ticker: Ticker symbol. Matching is case-insensitive and ignores
                surrounding whitespace.

        Returns:
            The CIK padded with leading zeros to 10 characters, or ``None``
            when the ticker is empty, unknown, or the lookup fails.
        """
        # Normalise once so every lookup compares against the same symbol
        # (and the listing scan does not re-uppercase it per row).
        symbol = ticker.strip().upper() if isinstance(ticker, str) else ''
        if not symbol:
            return None

        try:
            # Method 1: Try the company_tickers.json endpoint
            try:
                url = f"{SEC_API_URL}/files/company_tickers.json"
                response = requests.get(url, headers=self.headers, timeout=10)
                response.raise_for_status()

                for company_data in response.json().values():
                    if company_data['ticker'].upper() == symbol:
                        return str(company_data['cik_str']).zfill(10)
            except Exception as e:
                # Best effort: report the problem and fall through to the
                # fallbacks. `Exception` (not a bare except) lets
                # KeyboardInterrupt/SystemExit propagate.
                print(f"Ticker listing lookup failed for {ticker}: {e}")

            # Method 2: Known CIKs for major companies (as fallback)
            known_ciks = {
                'AAPL': '0000320193',
                'MSFT': '0000789019',
                'GOOGL': '0001652044',
                'GOOG': '0001652044',
                'AMZN': '0001018724',
                'TSLA': '0001318605',
                'META': '0001326801',
                'NVDA': '0001045810',
                'JPM': '0000019617',
                'V': '0001403161',
                'WMT': '0000104169',
                'DIS': '0001744489',
                'NFLX': '0001065280',
                'CRM': '0001108524',
                'PYPL': '0001633917',
            }
            if symbol in known_ciks:
                return known_ciks[symbol]

            # Method 3: Try searching SEC's website. Passing the query via
            # `params` lets requests URL-encode the ticker.
            response = requests.get(
                "https://www.sec.gov/cgi-bin/browse-edgar",
                params={
                    'action': 'getcompany',
                    'ticker': symbol,
                    'count': 1,
                    'output': 'atom',
                },
                headers=self.headers,
                timeout=10,
            )
            if response.status_code == 200:
                # Parse CIK from response
                match = re.search(r'CIK=(\d+)', response.text)
                if match:
                    return match.group(1).zfill(10)

            return None
        except Exception as e:
            print(f"Error getting CIK for {ticker}: {e}")
            return None

    def get_company_filings(self, cik: str, limit: int = 100) -> List[Dict]:
        """Fetch a company's recent filings from the submissions endpoint.

        Reads ``filings.recent`` from ``{SEC_API_URL}/submissions/CIK{cik}.json``
        and turns its parallel column arrays into one dict per filing, in the
        order the endpoint returns them.

        Args:
            cik: CIK string, interpolated as-is into the request URL and into
                each document URL.
            limit: Maximum number of filings to return; values <= 0 yield an
                empty list.

        Returns:
            A list of dicts with the keys ``form_type``, ``filing_date``,
            ``accession_number``, ``primary_document``, ``description`` and
            ``url``. An empty list is returned when the request or parsing
            fails (the error is printed, not raised).
        """
        try:
            url = f"{SEC_API_URL}/submissions/CIK{cik}.json"
            # A timeout keeps a stalled connection from hanging the scraper.
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()

            recent = response.json().get('filings', {}).get('recent', {})

            forms = recent.get('form', [])
            dates = recent.get('filingDate', [])
            accessions = recent.get('accessionNumber', [])
            # Optional columns: a missing or short column falls back to ''
            # per row instead of failing the whole response.
            documents = recent.get('primaryDocument', [])
            descriptions = recent.get('primaryDocDescription', [])

            # Clamp so a negative limit cannot slice from the end of the list.
            cap = max(limit, 0)

            filings = []
            for i, (form_type, filing_date, accession) in enumerate(
                    zip(forms[:cap], dates, accessions)):
                primary_document = documents[i] if i < len(documents) else ''
                description = descriptions[i] if i < len(descriptions) else ''

                # Build document URL: the archive path uses the accession
                # number with its dashes removed.
                acc_no_clean = accession.replace('-', '')
                filings.append({
                    'form_type': form_type,
                    'filing_date': filing_date,
                    'accession_number': accession,
                    'primary_document': primary_document,
                    'description': description,
                    'url': f"{SEC_BASE_URL}/Archives/edgar/data/{cik}/{acc_no_clean}/{primary_document}",
                })

            return filings
        except Exception as e:
            print(f"Error getting filings for CIK {cik}: {e}")
            return []

    def get_insider_ownership(self, cik: str) -> Dict[str, Any]:
        """Get insider ownership data from Forms 3, 4, 5"""
        try:
            filings = self.get_company_filings(cik, limit=200)

            # Filter for ownership forms
            ownership_forms = ['3', '4', '5', 'SC 13D', 'SC 13G']
            insider_filings = [f for f in filings if f['form_type'] in ownership_forms]

            # Parse the most recent ownership data
            ownership_data = {
                'insiders': [],
                'major_shareholders': [],
                'total_insider_shares': 0,
                'last_updated': datetime.now().isoformat()
            }

            # Group by filer
            filers = {}
            for filing in insider_filings[:50]:  # Check last 50 ownership filings
                # Would need to parse the actual XML/HTML document to get share counts
                # This is a placeholder structure
                ownership_data['insiders'].append({
                    'filing_type': filing['form_type'],
                    'filing_date': filing['filing_date'],
                    'document_url': filing['url']
                })

            return ownership_data
        except Exception as e:
            print(f"Error getting insider ownership for CIK {cik}: {e}")
            return {}

    async def scrape_filing_document(self, url: str) -> Dict[str, Any]:
        """Load a filing in headless Chromium and extract text heuristics.

        The page is opened with the scraper's configured User-Agent, awaited
        until network idle (30 s timeout) plus a further 2 s, and then its
        HTML and body text are captured.

        Args:
            url: Address of the filing document to load.

        Returns:
            On success, a dict with ``url``, ``scraped_at`` (local-time ISO
            timestamp), ``full_text`` and ``content_html`` (each cut to the
            first 50,000 characters), plus ``agm_date`` and
            ``shares_mentioned`` when the regex heuristics find a match.
            On failure, ``{'url': url, 'error': <message>}``.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)

            try:
                # Identify the browser with the same User-Agent the
                # `requests` calls send, so both clients declare themselves
                # consistently to the SEC. Creating the page inside the try
                # block lets a failure here hit the same error path (and
                # browser cleanup) as a failed navigation.
                page = await browser.new_page(
                    user_agent=self.headers.get('User-Agent'))

                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                # Extract text content
                content = await page.content()
                text = await page.inner_text('body')

                filing_data = {
                    'url': url,
                    'scraped_at': datetime.now().isoformat(),
                    'full_text': text[:50000],  # Limit size
                    'content_html': content[:50000]
                }

                # The patterns are lower-case; lower the text once rather
                # than once per pattern.
                lowered = text.lower()

                agm_date = self._extract_agm_date(lowered)
                if agm_date is not None:
                    filing_data['agm_date'] = agm_date

                shares_owned = self._extract_share_counts(lowered)
                if shares_owned:
                    filing_data['shares_mentioned'] = shares_owned

                return filing_data

            except Exception as e:
                print(f"Error scraping {url}: {e}")
                return {'url': url, 'error': str(e)}
            finally:
                await browser.close()

    @staticmethod
    def _extract_agm_date(lowered_text: str) -> Optional[str]:
        """Return the first numeric date following a meeting mention, if any.

        Patterns are tried in order; ``.`` does not cross newlines, so the
        date must be on the same line as the mention.
        """
        agm_patterns = [
            r'annual general meeting.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})',
            r'agm.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})',
            r'shareholder meeting.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})'
        ]
        for pattern in agm_patterns:
            match = re.search(pattern, lowered_text)
            if match:
                return match.group(1)
        return None

    @staticmethod
    def _extract_share_counts(lowered_text: str) -> List[int]:
        """Return every comma-grouped number matched by the ownership patterns.

        Results are grouped by pattern, in pattern order; the same number can
        appear more than once if several patterns match it.
        """
        ownership_patterns = [
            r'beneficially own.*?(\d{1,3}(?:,\d{3})*)\s*shares',
            r'total shares.*?(\d{1,3}(?:,\d{3})*)',
            r'common stock.*?(\d{1,3}(?:,\d{3})*)'
        ]
        shares_owned = []
        for pattern in ownership_patterns:
            for match in re.finditer(pattern, lowered_text):
                shares_owned.append(int(match.group(1).replace(',', '')))
        return shares_owned

    async def get_complete_company_data(self, ticker: str) -> Dict[str, Any]:
        """Get complete SEC data for a company"""
        print(f"\n🔍 Scraping SEC filings for {ticker}...")

        # Get CIK
        cik = self.get_cik_from_ticker(ticker)
        if not cik:
            print(f"⚠️  CIK not found for {ticker}")
            return {'ticker': ticker, 'error': 'CIK not found'}

        print(f"   Found CIK: {cik}")

        data = {
            'ticker': ticker,
            'cik': cik,
            'scraped_at': datetime.now().isoformat(),
            'filings': [],
            'ownership': {},
            'agm_info': {},
            'key_documents': {}
        }

        # Get all filings
        all_filings = self.get_company_filings(cik, limit=100)
        data['filings'] = all_filings

        print(f"   Found {len(all_filings)} recent filings")

        # Get most recent important filings
        important_forms = ['10-K', '10-Q', 'DEF 14A', '8-K']
        recent_important = {}

        for filing in all_filings:
            form_type = filing['form_type']
            if form_type in important_forms and form_type not in recent_important:
                recent_important[form_type] = filing

        # Scrape key documents
        for form_type, filing in recent_important.items():
            print(f"   Scraping {form_type} from {filing['filing_date']}...")
            doc_data = await self.scrape_filing_document(filing['url'])
            data['key_documents'][form_type] = doc_data
            await asyncio.sleep(2)  # Rate limiting

        # Get ownership data
        print(f"   Getting ownership data...")
        ownership = self.get_insider_ownership(cik)
        data['ownership'] = ownership

        # Save to file
        output_file = f"{self.output_dir}/{ticker}_sec_filings.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"✅ Saved SEC data to {output_file}")

        return data

    async def scrape_multiple_companies(self, tickers: List[str]):
        """Scrape SEC data for multiple companies"""
        print("=" * 70)
        print("SEC EDGAR FILING SCRAPER")
        print("=" * 70)

        all_data = []

        for ticker in tickers:
            data = await self.get_complete_company_data(ticker)
            all_data.append(data)
            await asyncio.sleep(3)  # Respect SEC rate limits

        print(f"\n✅ Completed scraping {len(all_data)} companies")
        return all_data


async def main():
    """Smoke-test the SEC scraper against a single sample ticker."""
    sec_scraper = SECFilingScraper()

    # Sample of well-known tickers; only the first one is actually scraped
    # so the test run stays short.
    sample_tickers = ['AAPL', 'MSFT', 'TSLA']

    print("Testing SEC scraper with sample tickers...")
    first_only = sample_tickers[:1]
    await sec_scraper.scrape_multiple_companies(first_only)


# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())