""" Scrape SEC EDGAR filings and extract ownership data Gets 10-K, 10-Q, 8-K, DEF 14A, and insider ownership (Forms 3, 4, 5, 13D, 13G) """ import asyncio import json import os import re from datetime import datetime, timedelta from playwright.async_api import async_playwright import requests import time from typing import Dict, List, Any, Optional from config import SEC_BASE_URL, SEC_API_URL, SEC_USER_AGENT, FILING_TYPES_SEC class SECFilingScraper: def __init__(self, output_dir="data/sec_filings"): self.output_dir = output_dir os.makedirs(output_dir, exist_ok=True) self.headers = {'User-Agent': SEC_USER_AGENT} def get_cik_from_ticker(self, ticker: str) -> Optional[str]: """Get CIK number from ticker symbol using multiple methods""" try: # Method 1: Try the company_tickers.json endpoint try: url = f"{SEC_API_URL}/files/company_tickers.json" response = requests.get(url, headers=self.headers, timeout=10) response.raise_for_status() companies = response.json() for company_data in companies.values(): if company_data['ticker'].upper() == ticker.upper(): cik = str(company_data['cik_str']).zfill(10) return cik except: pass # Try alternative method # Method 2: Use SEC's search page (fallback) # Known CIKs for major companies (as fallback) known_ciks = { 'AAPL': '0000320193', 'MSFT': '0000789019', 'GOOGL': '0001652044', 'GOOG': '0001652044', 'AMZN': '0001018724', 'TSLA': '0001318605', 'META': '0001326801', 'NVDA': '0001045810', 'JPM': '0000019617', 'V': '0001403161', 'WMT': '0000104169', 'DIS': '0001744489', 'NFLX': '0001065280', 'CRM': '0001108524', 'PYPL': '0001633917' } if ticker.upper() in known_ciks: return known_ciks[ticker.upper()] # Method 3: Try searching SEC's website search_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&ticker={ticker}&count=1&output=atom" response = requests.get(search_url, headers=self.headers, timeout=10) if response.status_code == 200: # Parse CIK from response match = re.search(r'CIK=(\d+)', response.text) if match: return match.group(1).zfill(10) return None except Exception as e: print(f"Error getting CIK for {ticker}: {e}") return None def get_company_filings(self, cik: str, limit: int = 100) -> List[Dict]: """Get recent filings for a company""" try: url = f"{SEC_API_URL}/submissions/CIK{cik}.json" response = requests.get(url, headers=self.headers) response.raise_for_status() data = response.json() filings = [] recent_filings = data.get('filings', {}).get('recent', {}) for i in range(min(limit, len(recent_filings.get('form', [])))): filing = { 'form_type': recent_filings['form'][i], 'filing_date': recent_filings['filingDate'][i], 'accession_number': recent_filings['accessionNumber'][i], 'primary_document': recent_filings.get('primaryDocument', [''])[i], 'description': recent_filings.get('primaryDocDescription', [''])[i] } # Build document URL acc_no_clean = filing['accession_number'].replace('-', '') filing['url'] = f"{SEC_BASE_URL}/Archives/edgar/data/{cik}/{acc_no_clean}/{filing['primary_document']}" filings.append(filing) return filings except Exception as e: print(f"Error getting filings for CIK {cik}: {e}") return [] def get_insider_ownership(self, cik: str) -> Dict[str, Any]: """Get insider ownership data from Forms 3, 4, 5""" try: filings = self.get_company_filings(cik, limit=200) # Filter for ownership forms ownership_forms = ['3', '4', '5', 'SC 13D', 'SC 13G'] insider_filings = [f for f in filings if f['form_type'] in ownership_forms] # Parse the most recent ownership data ownership_data = { 'insiders': [], 'major_shareholders': [], 'total_insider_shares': 0, 'last_updated': datetime.now().isoformat() } # Group by filer filers = {} for filing in insider_filings[:50]: # Check last 50 ownership filings # Would need to parse the actual XML/HTML document to get share counts # This is a placeholder structure ownership_data['insiders'].append({ 'filing_type': filing['form_type'], 'filing_date': filing['filing_date'], 'document_url': filing['url'] }) return ownership_data except Exception as e: print(f"Error getting insider ownership for CIK {cik}: {e}") return {} async def scrape_filing_document(self, url: str) -> Dict[str, Any]: """Scrape the actual filing document for detailed information""" async with async_playwright() as p: browser = await p.chromium.launch(headless=True) page = await browser.new_page() try: await page.goto(url, wait_until='networkidle', timeout=30000) await asyncio.sleep(2) # Extract text content content = await page.content() text = await page.inner_text('body') # Extract key information filing_data = { 'url': url, 'scraped_at': datetime.now().isoformat(), 'full_text': text[:50000], # Limit size 'content_html': content[:50000] } # Try to extract specific sections # AGM information agm_patterns = [ r'annual general meeting.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})', r'agm.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})', r'shareholder meeting.*?(\d{1,2}[/-]\d{1,2}[/-]\d{4})' ] for pattern in agm_patterns: match = re.search(pattern, text.lower()) if match: filing_data['agm_date'] = match.group(1) break # Ownership information ownership_patterns = [ r'beneficially own.*?(\d{1,3}(?:,\d{3})*)\s*shares', r'total shares.*?(\d{1,3}(?:,\d{3})*)', r'common stock.*?(\d{1,3}(?:,\d{3})*)' ] shares_owned = [] for pattern in ownership_patterns: matches = re.finditer(pattern, text.lower()) for match in matches: shares = match.group(1).replace(',', '') shares_owned.append(int(shares)) if shares_owned: filing_data['shares_mentioned'] = shares_owned return filing_data except Exception as e: print(f"Error scraping {url}: {e}") return {'url': url, 'error': str(e)} finally: await browser.close() async def get_complete_company_data(self, ticker: str) -> Dict[str, Any]: """Get complete SEC data for a company""" print(f"\nšŸ” Scraping SEC filings for {ticker}...") # Get CIK cik = self.get_cik_from_ticker(ticker) if not cik: print(f"āš ļø CIK not found for {ticker}") return {'ticker': ticker, 'error': 'CIK not found'} print(f" Found CIK: {cik}") data = { 'ticker': ticker, 'cik': cik, 'scraped_at': datetime.now().isoformat(), 'filings': [], 'ownership': {}, 'agm_info': {}, 'key_documents': {} } # Get all filings all_filings = self.get_company_filings(cik, limit=100) data['filings'] = all_filings print(f" Found {len(all_filings)} recent filings") # Get most recent important filings important_forms = ['10-K', '10-Q', 'DEF 14A', '8-K'] recent_important = {} for filing in all_filings: form_type = filing['form_type'] if form_type in important_forms and form_type not in recent_important: recent_important[form_type] = filing # Scrape key documents for form_type, filing in recent_important.items(): print(f" Scraping {form_type} from {filing['filing_date']}...") doc_data = await self.scrape_filing_document(filing['url']) data['key_documents'][form_type] = doc_data await asyncio.sleep(2) # Rate limiting # Get ownership data print(f" Getting ownership data...") ownership = self.get_insider_ownership(cik) data['ownership'] = ownership # Save to file output_file = f"{self.output_dir}/{ticker}_sec_filings.json" with open(output_file, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2) print(f"āœ… Saved SEC data to {output_file}") return data async def scrape_multiple_companies(self, tickers: List[str]): """Scrape SEC data for multiple companies""" print("=" * 70) print("SEC EDGAR FILING SCRAPER") print("=" * 70) all_data = [] for ticker in tickers: data = await self.get_complete_company_data(ticker) all_data.append(data) await asyncio.sleep(3) # Respect SEC rate limits print(f"\nāœ… Completed scraping {len(all_data)} companies") return all_data async def main(): """Test the SEC scraper""" scraper = SECFilingScraper() # Test with a few well-known tickers test_tickers = ['AAPL', 'MSFT', 'TSLA'] print("Testing SEC scraper with sample tickers...") await scraper.scrape_multiple_companies(test_tickers[:1]) # Just test one if __name__ == "__main__": asyncio.run(main())