""" PRODUCTION-READY Stock Intelligence System Includes: SEC filings, SEDAR+, ownership data, tax info, AGM details, calculated metrics Can run daily on any stock or full universe """ import asyncio import os import json import sys from datetime import datetime from typing import List, Dict, Any # Import all modules from extract_listings import StockListingExtractor from database import StockDatabase from scrape_yahoo_finance import YahooFinanceScraper from scrape_news_pr import NewsPressScraper from scrape_serpapi import SerpAPINewsScraper from scrape_sec_filings import SECFilingScraper from scrape_sedar import SEDARPlusScraper from financial_calculator import FinancialMetricsCalculator from export_csv import CSVExporter from config import * class RobustStockIntelligence: def __init__(self): self.db = StockDatabase() self.stats = { 'start_time': datetime.now(), 'stocks_processed': 0, 'financials_scraped': 0, 'news_scraped': 0, 'filings_scraped': 0, 'metrics_calculated': 0, 'errors': [] } async def step1_extract_listings(self, force_refresh=False): """Extract stock listings from exchanges""" print("\n" + "=" * 70) print("STEP 1: EXTRACTING STOCK LISTINGS") print("=" * 70) listings_file = "data/listings/all_listings_combined.json" if os.path.exists(listings_file) and not force_refresh: print(f"šŸ“‚ Loading existing listings from {listings_file}") with open(listings_file, 'r') as f: listings = json.load(f) print(f"āœ… Loaded {len(listings)} stocks from file") else: print("šŸ”„ Extracting fresh listings from exchanges...") extractor = StockListingExtractor() listings = await extractor.extract_all() self.stats['stocks_processed'] = len(listings) return listings def step2_import_to_database(self): """Import listings to database""" print("\n" + "=" * 70) print("STEP 2: IMPORTING TO DATABASE") print("=" * 70) listings_file = "data/listings/all_listings_combined.json" if os.path.exists(listings_file): imported = self.db.import_listings_from_json(listings_file) return imported else: print(f"āŒ No listings file found") return 0 async def step3_scrape_financials(self, stocks: List[Dict], use_serpapi_fallback=True): """Scrape financial data with Yahoo Finance""" print("\n" + "=" * 70) print("STEP 3: SCRAPING FINANCIAL DATA") print("=" * 70) scraper = YahooFinanceScraper() results = await scraper.scrape_multiple_stocks(stocks) self.stats['financials_scraped'] = len([r for r in results if not r.get('error')]) # Update database for result in results: if not result.get('error'): self.db.update_coverage( result['ticker'], has_financials=True, has_ttm=True ) # Insert quote data if available quote_data = result.get('quote', {}) if quote_data and any(quote_data.values()): self.db.insert_stock_quote(result['ticker'], quote_data) return results async def step4_calculate_metrics(self, financial_data: List[Dict]): """Calculate all financial metrics from base numbers""" print("\n" + "=" * 70) print("STEP 4: CALCULATING FINANCIAL METRICS") print("=" * 70) calculator = FinancialMetricsCalculator() metrics_calculated = 0 for data in financial_data: if data.get('error'): continue ticker = data['ticker'] print(f" Calculating metrics for {ticker}...") try: # Convert Yahoo Finance data to calculator format base_data = calculator.convert_yahoo_data(data) # Calculate all metrics metrics = calculator.calculate_all_metrics(base_data) # Save metrics to file metrics_file = f"data/metrics/{ticker}_calculated_metrics.json" os.makedirs(os.path.dirname(metrics_file), exist_ok=True) with open(metrics_file, 'w') as f: json.dump(metrics, f, indent=2) # Insert metrics into database current_year = datetime.now().year self.db.insert_financial_metrics(ticker, current_year, metrics, is_ttm=True) metrics_calculated += 1 except Exception as e: print(f" Error calculating metrics: {e}") self.stats['errors'].append(f"{ticker} metrics: {e}") self.stats['metrics_calculated'] = metrics_calculated print(f"āœ… Calculated metrics for {metrics_calculated} stocks") return metrics_calculated async def step5_scrape_news_pr(self, stocks: List[Dict], use_serpapi=True): """Scrape news and press releases""" print("\n" + "=" * 70) print("STEP 5: SCRAPING NEWS & PRESS RELEASES") print("=" * 70) if use_serpapi: print("šŸ“” Using SerpAPI for robust news collection...") scraper = SerpAPINewsScraper() results = scraper.scrape_multiple_stocks(stocks) else: print("🌐 Using direct web scraping...") scraper = NewsPressScraper() results = await scraper.scrape_multiple_stocks(stocks) self.stats['news_scraped'] = len(results) # Insert articles into database and update coverage for result in results: ticker = result['ticker'] news_articles = result.get('news_articles', []) press_releases = result.get('press_releases', []) # Insert news articles for article in news_articles: self.db.insert_news_article( ticker=ticker, title=article.get('title', ''), source=article.get('source', ''), published_date=article.get('date', ''), url=article.get('link') or article.get('url', ''), snippet=article.get('snippet', '') ) # Insert press releases as news articles (same table) for pr in press_releases: self.db.insert_news_article( ticker=ticker, title=pr.get('title', ''), source=pr.get('source', 'Press Release'), published_date=pr.get('date', ''), url=pr.get('link') or pr.get('url', ''), snippet=pr.get('snippet', '') ) # Update coverage flags has_news = len(news_articles) > 0 has_pr = len(press_releases) > 0 self.db.update_coverage( ticker, has_news=has_news, has_press_releases=has_pr ) return results async def step6_scrape_sec_filings(self, stocks: List[Dict]): """Scrape SEC EDGAR filings (for US-listed stocks)""" print("\n" + "=" * 70) print("STEP 6: SCRAPING SEC EDGAR FILINGS") print("=" * 70) # Filter for US-listed or cross-listed stocks us_stocks = [s for s in stocks if s.get('exchange') in ['CBOE', 'NYSE', 'NASDAQ']] if not us_stocks: print("āš ļø No US-listed stocks to process") return [] scraper = SECFilingScraper() results = [] for stock in us_stocks: ticker = stock['symbol'] data = await scraper.get_complete_company_data(ticker) results.append(data) if not data.get('error'): # Insert filings into database filings = data.get('filings', []) for filing in filings: self.db.insert_filing( ticker=ticker, filing_date=filing.get('filing_date', ''), filing_type=filing.get('form_type', ''), title=filing.get('description', ''), document_url=filing.get('url', ''), source='SEC EDGAR' ) # Insert ownership forms ownership = data.get('insider_ownership', []) for form in ownership: self.db.insert_filing( ticker=ticker, filing_date=form.get('filing_date', ''), filing_type=form.get('form_type', ''), title=f"Insider Transaction - {form.get('owner', '')}", document_url=form.get('url', ''), source='SEC EDGAR - Ownership' ) self.db.update_coverage( ticker, has_filings=True ) self.stats['filings_scraped'] += len([r for r in results if not r.get('error')]) return results async def step7_scrape_sedar_filings(self, stocks: List[Dict]): """Scrape SEDAR+ filings (for Canadian stocks)""" print("\n" + "=" * 70) print("STEP 7: SCRAPING SEDAR+ FILINGS") print("=" * 70) # Filter for Canadian stocks canadian_stocks = [s for s in stocks if s.get('exchange') in ['TSX', 'TSXV', 'CSE']] if not canadian_stocks: print("āš ļø No Canadian stocks to process") return [] scraper = SEDARPlusScraper() results = await scraper.scrape_multiple_companies(canadian_stocks) # Insert filings and update database for result in results: if not result.get('error'): ticker = result['ticker'] # Insert filings filings = result.get('filings', []) for filing in filings: self.db.insert_filing( ticker=ticker, filing_date=filing.get('date', ''), filing_type=filing.get('type', ''), title=filing.get('title', ''), document_url=filing.get('url', ''), source='SEDAR+' ) has_agm = bool(result.get('agm_info')) has_tax = bool(result.get('tax_disclosures')) self.db.update_coverage( ticker, has_filings=True, has_agm_info=has_agm, has_tax_disclosures=has_tax ) self.stats['filings_scraped'] += len([r for r in results if not r.get('error')]) return results def step8_generate_reports(self): """Generate comprehensive reports""" print("\n" + "=" * 70) print("STEP 8: GENERATING REPORTS") print("=" * 70) reports_dir = "data/reports" os.makedirs(reports_dir, exist_ok=True) stocks = self.db.get_all_stocks() reports_generated = 0 for stock in stocks: ticker = stock[1] company_name = stock[2] exchange = stock[3] try: report = self._generate_comprehensive_report(ticker, company_name, exchange) report_file = f"{reports_dir}/{ticker}_comprehensive_report.txt" with open(report_file, 'w', encoding='utf-8') as f: f.write(report) reports_generated += 1 except Exception as e: print(f"āŒ Error generating report for {ticker}: {e}") self.stats['errors'].append(f"{ticker} report: {e}") print(f"āœ… Generated {reports_generated} comprehensive reports") return reports_generated def step9_export_csv(self): """Export all data to CSV files""" print("\n" + "=" * 70) print("STEP 9: EXPORTING TO CSV") print("=" * 70) exporter = CSVExporter() files = exporter.export_all() exporter.close() return files def _extract_base_financials(self, yahoo_data: Dict) -> Dict: """Extract base financial numbers from Yahoo Finance data""" base = {} stats = yahoo_data.get('statistics', {}) profile = yahoo_data.get('profile', {}) # Try to extract numeric values from Yahoo Finance statistics # This is a simplified version - actual implementation would need more parsing base['price'] = profile.get('current_price', 0) # Parse statistics (values come as strings with formatting) # Example: "1.2B" -> 1200000000 for key, value in stats.items(): if isinstance(value, str): # Try to convert formatted numbers try: if 'B' in value: base[key] = float(value.replace('B', '').replace(',', '')) * 1_000_000_000 elif 'M' in value: base[key] = float(value.replace('M', '').replace(',', '')) * 1_000_000 elif 'K' in value: base[key] = float(value.replace('K', '').replace(',', '')) * 1_000 else: base[key] = float(value.replace(',', '').replace('%', '')) except: pass return base def _generate_comprehensive_report(self, ticker: str, company_name: str, exchange: str) -> str: """Generate comprehensive report with all data""" report = [] report.append("=" * 80) report.append(f"COMPREHENSIVE STOCK INTELLIGENCE REPORT") report.append(f"Ticker: {ticker} | Company: {company_name} | Exchange: {exchange}") report.append("=" * 80) report.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") report.append("") # Load all available data files data_sources = { 'yahoo': f"data/financials/{ticker}_yahoo.json", 'metrics': f"data/metrics/{ticker}_calculated_metrics.json", 'news': f"data/news/{ticker}_news_pr.json", 'serpapi': f"data/serpapi_news/{ticker}_serpapi.json", 'sec': f"data/sec_filings/{ticker}_sec_filings.json", 'sedar': f"data/sedar_filings/{ticker}_sedar_data.json" } # Financial Data Section if os.path.exists(data_sources['metrics']): report.append("[CALCULATED FINANCIAL METRICS]") report.append("-" * 80) with open(data_sources['metrics'], 'r') as f: metrics = json.load(f) calculator = FinancialMetricsCalculator() report.append(calculator.format_metrics_for_display(metrics)) report.append("") # News Section news_files = [data_sources['news'], data_sources['serpapi']] all_news = [] for nf in news_files: if os.path.exists(nf): with open(nf, 'r') as f: data = json.load(f) all_news.extend(data.get('news_articles', [])) if all_news: report.append("[NEWS ARTICLES - Last 12 Months]") report.append("-" * 80) for article in all_news[:15]: report.append(f"\nTitle: {article.get('title', 'N/A')}") report.append(f"Source: {article.get('source', 'N/A')}") report.append(f"Date: {article.get('date', 'N/A')}") report.append(f"URL: {article.get('link') or article.get('url', 'N/A')}") report.append("") # SEC Filings Section if os.path.exists(data_sources['sec']): report.append("[SEC EDGAR FILINGS]") report.append("-" * 80) with open(data_sources['sec'], 'r') as f: sec_data = json.load(f) filings = sec_data.get('filings', [])[:10] for filing in filings: report.append(f"\n{filing['form_type']} - {filing['filing_date']}") report.append(f" {filing.get('description', 'N/A')}") report.append(f" URL: {filing['url']}") report.append("") # SEDAR+ Filings Section if os.path.exists(data_sources['sedar']): report.append("[SEDAR+ FILINGS & AGM INFORMATION]") report.append("-" * 80) with open(data_sources['sedar'], 'r') as f: sedar_data = json.load(f) # AGM Info agm = sedar_data.get('agm_info', {}) if agm: report.append("\nAnnual General Meeting:") report.append(f" Date: {agm.get('date', 'N/A')}") report.append(f" Location: {agm.get('location', 'N/A')}") # Recent filings filings = sedar_data.get('filings', [])[:10] if filings: report.append("\nRecent Filings:") for filing in filings: report.append(f" - {filing.get('title', 'N/A')[:70]}") report.append("") report.append("=" * 80) report.append("END OF REPORT") report.append("=" * 80) return "\n".join(report) async def run_for_single_stock(self, ticker: str): """Run complete analysis for a single stock (daily update mode)""" print("\n" + "=" * 70) print(f"DAILY UPDATE FOR STOCK: {ticker}") print("=" * 70) # Get stock info from database self.db.cursor.execute("SELECT * FROM stocks_master WHERE symbol = ?", (ticker,)) stock_data = self.db.cursor.fetchone() if not stock_data: print(f"āŒ Stock {ticker} not found in database") return stock = { 'symbol': stock_data[1], 'name': stock_data[2], 'exchange': stock_data[3] } # Run all steps for this one stock await self.step3_scrape_financials([stock]) await self.step5_scrape_news_pr([stock], use_serpapi=True) if stock['exchange'] in ['CBOE', 'NYSE', 'NASDAQ']: await self.step6_scrape_sec_filings([stock]) elif stock['exchange'] in ['TSX', 'TSXV', 'CSE']: await self.step7_scrape_sedar_filings([stock]) self.step8_generate_reports() self.step9_export_csv() print(f"\nāœ… Daily update completed for {ticker}") async def run_full_pipeline(self, test_mode=False, stocks_limit=None): """Run complete pipeline""" print("\n" + "=" * 70) print("PRODUCTION-READY STOCK INTELLIGENCE SYSTEM") print("=" * 70) print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") if test_mode: print("āš ļø RUNNING IN TEST MODE") print("=" * 70) try: # Step 1: Get listings listings = await self.step1_extract_listings() if not listings: print("\nāŒ No listings found") return # Step 2: Import to database self.step2_import_to_database() # Limit stocks if requested if stocks_limit: listings = listings[:stocks_limit] print(f"\nāš ļø Limited to {stocks_limit} stocks for testing") # Step 3: Scrape financials financial_data = await self.step3_scrape_financials(listings) # Step 4: Calculate metrics await self.step4_calculate_metrics(financial_data) # Step 5: Scrape news (using SerpAPI for robustness) await self.step5_scrape_news_pr(listings, use_serpapi=True) # Step 6 & 7: Scrape filings await self.step6_scrape_sec_filings(listings) await self.step7_scrape_sedar_filings(listings) # Step 8: Generate reports self.step8_generate_reports() # Step 9: Export to CSV self.step9_export_csv() # Print final stats self._print_final_stats() print("\nāœ… PIPELINE COMPLETED SUCCESSFULLY!") except Exception as e: print(f"\nāŒ Pipeline failed: {e}") import traceback traceback.print_exc() finally: self.db.close() def _print_final_stats(self): """Print final statistics""" end_time = datetime.now() duration = end_time - self.stats['start_time'] print("\n" + "=" * 70) print("FINAL STATISTICS") print("=" * 70) print(f"Duration: {duration}") print(f"Stocks processed: {self.stats['stocks_processed']}") print(f"Financials scraped: {self.stats['financials_scraped']}") print(f"Metrics calculated: {self.stats['metrics_calculated']}") print(f"News articles collected: {self.stats['news_scraped']}") print(f"Filings scraped: {self.stats['filings_scraped']}") print(f"Errors: {len(self.stats['errors'])}") print("=" * 70) async def main(): """Main entry point""" orchestrator = RobustStockIntelligence() # Check command line arguments if len(sys.argv) > 1: command = sys.argv[1] if command == "--ticker" and len(sys.argv) > 2: # Daily update for single stock ticker = sys.argv[2].upper() await orchestrator.run_for_single_stock(ticker) elif command == "--full": # Full pipeline, all stocks await orchestrator.run_full_pipeline(test_mode=False) elif command == "--test": # Test mode with limited stocks limit = int(sys.argv[2]) if len(sys.argv) > 2 else 5 await orchestrator.run_full_pipeline(test_mode=True, stocks_limit=limit) else: print("Usage:") print(" python main_robust.py --test [num] # Test mode with N stocks") print(" python main_robust.py --full # Full pipeline, all stocks") print(" python main_robust.py --ticker SYMBOL # Daily update for one stock") else: # Default: test mode with 5 stocks print("\nāš ļø No arguments provided. Running in test mode (5 stocks)") print(" Use --help to see options") await orchestrator.run_full_pipeline(test_mode=True, stocks_limit=5) if __name__ == "__main__": asyncio.run(main())