"""
Scrape financial data from Yahoo Finance (no API key needed).

Gets financials, ratios, and key metrics for each stock.
"""

import asyncio
import json
import os
import re
import time
from datetime import datetime

from playwright.async_api import async_playwright

# Exchanges whose tickers get a ".TO" suffix on the first attempt.
_TSX_EXCHANGES = ('TSX', 'TSXV', 'TSX/TSXV')

# Page fragments that the scraper treats as "this symbol does not exist".
_NOT_FOUND_MARKERS = ("Symbol Lookup", "Symbols similar to")

# Matches e.g. "At close: November 5 at 4:00:01 PM EST" up to the time zone.
# The character class must exclude a real newline; the previous pattern
# ``[^\\n]`` excluded a backslash and the letter "n" instead, so any month
# containing an "n" (January, June) could never match.
_AT_CLOSE_RE = re.compile(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))')


class YahooFinanceScraper:
    """Scrape quote, financial, statistics and analysis pages per ticker.

    Each scraped stock is written to ``<output_dir>/<ticker>_yahoo.json``.
    """

    def __init__(self, output_dir="data/financials"):
        """Remember the output directory and create it if it is missing."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _resolve_yahoo_ticker(ticker, exchange):
        """Return the Yahoo Finance symbol for ``ticker`` on ``exchange``."""
        if exchange in _TSX_EXCHANGES:
            # Already-suffixed tickers are used verbatim; otherwise try TSX
            # first (the TSXV ".V" suffix is attempted later as a fallback).
            if ticker.endswith(('.TO', '.V')):
                return ticker
            return f"{ticker}.TO"

        if exchange == 'CSE':
            # Strip only the leading "T2" prefix (e.g. T2AAA -> AAA); a plain
            # str.replace would also delete any later "T2" inside the symbol.
            clean_ticker = ticker[2:] if ticker.startswith('T2') else ticker
            # Drop any suffix after a dot (e.g. T2AAAWH.U -> AAAWH).
            clean_ticker = clean_ticker.split('.')[0]
            return f"{clean_ticker}.CN"

        return ticker

    @staticmethod
    def _is_not_found(page_content):
        """Return True when the page HTML contains a not-found marker."""
        return any(marker in page_content for marker in _NOT_FOUND_MARKERS)

    @staticmethod
    async def _collect_label_value_rows(rows, target):
        """Store every two-cell ``<td>`` row of ``rows`` as label -> value."""
        for row in rows:
            try:
                cells = await row.query_selector_all('td')
                if len(cells) == 2:
                    label = await cells[0].inner_text()
                    value = await cells[1].inner_text()
                    key = (
                        label.strip().lower()
                        .replace(' ', '_')
                        .replace('/', '_')
                    )
                    target[key] = value.strip()
            except Exception:
                # Best effort: skip a row that cannot be read. Not a bare
                # ``except`` so task cancellation still propagates.
                continue

    async def _extract_quote(self, page, stock_data):
        """Fill ``stock_data['quote']`` and the current price from the page."""
        try:
            # Start from empty fields so every key is always present.
            stock_data['quote'] = {
                'date': '',
                'open': '',
                'high': '',
                'low': '',
                'close': '',
                'volume': '',
            }
            quote = stock_data['quote']

            # Close (current price)
            price_elem = await page.query_selector(
                '[data-field="regularMarketPrice"]'
            )
            if price_elem:
                price_text = ' '.join((await price_elem.inner_text()).split())
                print(f" Raw price text: '{price_text}'")
                try:
                    current_price = float(price_text.replace(',', ''))
                except ValueError:
                    print(f" Warning: Could not parse price: {price_text}")
                else:
                    stock_data['profile']['current_price'] = current_price
                    quote['close'] = price_text
                    print(f" Parsed price: {current_price}")

            # Open price
            open_elem = await page.query_selector(
                '[data-field="regularMarketOpen"]'
            )
            if open_elem:
                quote['open'] = ' '.join((await open_elem.inner_text()).split())

            # Day range (low - high)
            range_elem = await page.query_selector(
                '[data-field="regularMarketDayRange"]'
            )
            if range_elem:
                range_text = ' '.join((await range_elem.inner_text()).split())
                if ' - ' in range_text:
                    # partition never raises, unlike unpacking a split that
                    # yields more than two parts.
                    low, _, high = range_text.partition(' - ')
                    quote['low'] = low.strip()
                    quote['high'] = high.strip()

            # Volume
            volume_elem = await page.query_selector(
                '[data-field="regularMarketVolume"]'
            )
            if volume_elem:
                quote['volume'] = ' '.join(
                    (await volume_elem.inner_text()).split()
                )

            # Date/time - extracted from the visible page text
            page_text = await page.inner_text('body')
            time_match = _AT_CLOSE_RE.search(page_text)
            if time_match:
                quote['date'] = time_match.group(1).strip()
        except Exception as e:
            print(f" Error extracting summary: {e}")

    async def _extract_summary_stats(self, page, stock_data):
        """Read two-column table rows of the summary page into statistics."""
        try:
            stat_rows = await page.query_selector_all('table tr')
            await self._collect_label_value_rows(
                stat_rows, stock_data['statistics']
            )
        except Exception as e:
            print(f" Error extracting summary: {e}")

    async def _extract_financials(self, page, yahoo_ticker, stock_data):
        """Load the financials page and store each row as label -> values."""
        try:
            financials_url = (
                f"https://finance.yahoo.com/quote/{yahoo_ticker}/financials"
            )
            await page.goto(
                financials_url, wait_until='domcontentloaded', timeout=60000
            )
            await asyncio.sleep(5)

            financial_tables = await page.query_selector_all(
                'div[class*="financials"] table'
            )
            for table in financial_tables:
                for row in await table.query_selector_all('tr'):
                    try:
                        cells = await row.query_selector_all('td, th')
                        if len(cells) >= 2:
                            label = await cells[0].inner_text()
                            values = [
                                (await cell.inner_text()).strip()
                                for cell in cells[1:]
                            ]
                            label_key = label.strip().lower().replace(' ', '_')
                            stock_data['financials'][label_key] = values
                    except Exception:
                        # Best effort: skip unreadable rows.
                        continue
        except Exception as e:
            print(f" Error extracting financials: {e}")

    async def _extract_key_statistics(self, page, yahoo_ticker, stock_data):
        """Load the key-statistics page and merge its rows into statistics."""
        try:
            stats_url = (
                f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"
            )
            await page.goto(
                stats_url, wait_until='domcontentloaded', timeout=60000
            )
            await asyncio.sleep(5)

            for table in await page.query_selector_all('table'):
                rows = await table.query_selector_all('tr')
                await self._collect_label_value_rows(
                    rows, stock_data['statistics']
                )
        except Exception as e:
            print(f" Error extracting statistics: {e}")

    async def _extract_analysis(self, page, yahoo_ticker, stock_data):
        """Load the analysis page and store every table as a list of rows."""
        try:
            analysis_url = (
                f"https://finance.yahoo.com/quote/{yahoo_ticker}/analysis"
            )
            await page.goto(
                analysis_url, wait_until='networkidle', timeout=30000
            )
            await asyncio.sleep(2)

            analysis_tables = await page.query_selector_all('table')
            for idx, table in enumerate(analysis_tables):
                table_data = []
                for row in await table.query_selector_all('tr'):
                    cells = await row.query_selector_all('td, th')
                    row_data = [
                        (await cell.inner_text()).strip() for cell in cells
                    ]
                    if row_data:
                        table_data.append(row_data)
                stock_data['analysis'][f'table_{idx}'] = table_data
        except Exception as e:
            print(f" Error extracting analysis: {e}")

    async def scrape_stock_data(self, ticker, exchange=""):
        """Scrape comprehensive data for a single stock.

        Args:
            ticker: Symbol as stored in the listings.
            exchange: Exchange name; 'TSX', 'TSXV', 'TSX/TSXV' and 'CSE'
                select an exchange-specific Yahoo suffix.

        Returns:
            A dict with the keys 'ticker', 'exchange', 'yahoo_ticker',
            'scraped_at', 'profile', 'quote', 'financials', 'statistics',
            'analysis' and 'error' ('error' is None when nothing failed at
            the top level). The dict is also written to
            ``<output_dir>/<ticker>_yahoo.json``, except when a ".TO" symbol
            and its ".V" fallback are both not found, in which case the dict
            is returned without being saved.
        """
        print(f"\n🔍 Scraping {ticker}...")

        yahoo_ticker = self._resolve_yahoo_ticker(ticker, exchange)
        if exchange == 'CSE':
            print(f" CSE stock: {ticker} -> {yahoo_ticker}")

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),
            'profile': {},
            'quote': {},  # Real-time quote data
            'financials': {},
            'statistics': {},
            'analysis': {},
            'error': None,
        }

        async with async_playwright() as p:
            # The launch flag turns off Blink's "AutomationControlled"
            # feature for the headless browser.
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled'],
            )
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
            )
            page = await context.new_page()

            try:
                # 1. Summary page
                url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                print(f" Loading {url}...")
                await page.goto(
                    url, wait_until='domcontentloaded', timeout=60000
                )
                await asyncio.sleep(5)  # Wait for dynamic content to load

                # Check if ticker exists
                if self._is_not_found(await page.content()):
                    print(f"⚠️ {yahoo_ticker} not found on Yahoo Finance")
                    stock_data['error'] = 'Ticker not found'

                    # Try the TSXV suffix when the TSX one was not found.
                    if yahoo_ticker.endswith('.TO'):
                        # Swap the suffix on the resolved symbol so a ticker
                        # that already carried ".TO" does not become
                        # "X.TO.V".
                        yahoo_ticker = f"{yahoo_ticker[:-len('.TO')]}.V"
                        print(f" Trying {yahoo_ticker}...")
                        url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                        await page.goto(
                            url, wait_until='domcontentloaded', timeout=60000
                        )
                        await asyncio.sleep(5)

                        if self._is_not_found(await page.content()):
                            # The ``finally`` clause closes the browser.
                            return stock_data
                        stock_data['yahoo_ticker'] = yahoo_ticker
                        stock_data['error'] = None

                # Quote header and stats table are extracted independently so
                # a failure in one does not prevent the other.
                await self._extract_quote(page, stock_data)
                await self._extract_summary_stats(page, stock_data)

                # 2. Financials page
                await self._extract_financials(page, yahoo_ticker, stock_data)

                # 3. Key Statistics page
                await self._extract_key_statistics(
                    page, yahoo_ticker, stock_data
                )

                # 4. Analysis page (analyst ratings, growth estimates)
                await self._extract_analysis(page, yahoo_ticker, stock_data)

                print(f"✅ {ticker} data scraped successfully")
            except Exception as e:
                print(f"❌ Error scraping {ticker}: {e}")
                stock_data['error'] = str(e)
            finally:
                await browser.close()

        # Save individual stock data
        output_file = f"{self.output_dir}/{ticker}_yahoo.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(stock_data, f, indent=2)

        return stock_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape data for multiple stocks sequentially.

        Args:
            stock_list: Sequence of dicts read via ``.get('symbol')`` and
                ``.get('exchange')``.
            max_stocks: When truthy, only the first ``max_stocks`` entries
                are processed.

        Returns:
            A list with one result dict per processed entry, in input order.
        """
        print("=" * 60)
        print("YAHOO FINANCE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        successful = 0
        failed = 0

        for stock in stock_list:
            ticker = stock.get('symbol')
            exchange = stock.get('exchange')

            if not ticker:
                # Without a symbol there is nothing to look up; record the
                # entry as failed instead of scraping ".../quote/None".
                print(f" Skipping listing without a symbol: {stock!r}")
                all_data.append({
                    'ticker': ticker,
                    'exchange': exchange,
                    'error': 'Missing symbol',
                })
                failed += 1
                continue

            data = await self.scrape_stock_data(ticker, exchange)
            all_data.append(data)

            if data.get('error'):
                failed += 1
            else:
                successful += 1

            # Rate limiting
            await asyncio.sleep(2)

        print("\n" + "=" * 60)
        print(f"✅ Successfully scraped: {successful}")
        print(f"❌ Failed: {failed}")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data


async def main():
    """Test the scraper with a few stocks."""
    # Load listings
    listings_file = "data/listings/all_listings_combined.json"
    if not os.path.exists(listings_file):
        print(f"❌ No listings file found at {listings_file}")
        print(" Run extract_listings.py first")
        return

    with open(listings_file, 'r', encoding='utf-8') as f:
        listings = json.load(f)

    print(f"📊 Found {len(listings)} stocks in listings")

    # Test with first 5 stocks
    scraper = YahooFinanceScraper()
    await scraper.scrape_multiple_stocks(listings, max_stocks=5)


if __name__ == "__main__":
    asyncio.run(main())