""" Complete Yahoo Finance scraper - gets quote data AND full statistics. """ import asyncio import json import os from datetime import datetime from playwright.async_api import async_playwright from database import StockDatabase from generate_company_report import gather_contents, save_markdown, render_pdf_from_text import re async def scrape_complete_stock_data(ticker, exchange): """Scrape complete data including quote and all statistics""" # Format ticker yahoo_ticker = ticker if exchange in ['TSX', 'TSXV']: if not ticker.endswith('.TO') and not ticker.endswith('.V'): yahoo_ticker = f"{ticker}.TO" print(f"\n{'='*70}") print(f"Scraping: {ticker} ({exchange}) -> {yahoo_ticker}") print('='*70) async with async_playwright() as p: browser = await p.chromium.launch(headless=True) context = await browser.new_context(viewport={'width': 1920, 'height': 1080}) page = await context.new_page() stock_data = { 'ticker': ticker, 'exchange': exchange, 'yahoo_ticker': yahoo_ticker, 'scraped_at': datetime.now().isoformat(), 'profile': {}, 'quote': {}, 'financials': {}, 'statistics': {}, 'error': None } try: # 1. Summary page - get quote data url = f"https://finance.yahoo.com/quote/{yahoo_ticker}" print(f"[1/2] Loading summary page...") await page.goto(url, wait_until='domcontentloaded', timeout=60000) await asyncio.sleep(5) # Check valid content = await page.content() if "Symbol Lookup" in content: print(f"❌ Ticker not found") stock_data['error'] = 'Ticker not found' await browser.close() return stock_data # Get quote data with ticker filtering # Don't wait for selector since there are multiple elements # Close price - find the one matching our ticker all_prices = await page.query_selector_all('[data-field="regularMarketPrice"]') for elem in all_prices: symbol_attr = await elem.get_attribute('data-symbol') if symbol_attr and symbol_attr.upper() == yahoo_ticker.upper(): price_text = await elem.text_content() price_clean = ' '.join(price_text.split()) stock_data['profile']['current_price'] = float(price_clean.replace(',', '')) stock_data['quote']['close'] = price_clean break # Other quote fields (no data-symbol, safe to use first) open_elem = await page.query_selector('[data-field="regularMarketOpen"]') if open_elem: stock_data['quote']['open'] = ' '.join((await open_elem.text_content()).split()) range_elem = await page.query_selector('[data-field="regularMarketDayRange"]') if range_elem: range_text = ' '.join((await range_elem.text_content()).split()) if ' - ' in range_text: low, high = range_text.split(' - ') stock_data['quote']['low'] = low.strip() stock_data['quote']['high'] = high.strip() volume_elem = await page.query_selector('[data-field="regularMarketVolume"]') if volume_elem: stock_data['quote']['volume'] = ' '.join((await volume_elem.text_content()).split()) page_text = await page.inner_text('body') time_match = re.search(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))', page_text) if time_match: stock_data['quote']['date'] = time_match.group(1).strip() print(f"✅ Quote data extracted") print(f" Close: {stock_data['quote'].get('close', 'N/A')}") print(f" Open: {stock_data['quote'].get('open', 'N/A')}") print(f" High/Low: {stock_data['quote'].get('high', 'N/A')} / {stock_data['quote'].get('low', 'N/A')}") print(f" Volume: {stock_data['quote'].get('volume', 'N/A')}") # 2. Key Statistics page - get full statistics stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics" print(f"[2/2] Loading key statistics page...") await page.goto(stats_url, wait_until='domcontentloaded', timeout=60000) await asyncio.sleep(5) stat_tables = await page.query_selector_all('table') stats_count = 0 for table in stat_tables: rows = await table.query_selector_all('tr') for row in rows: try: cells = await row.query_selector_all('td') if len(cells) == 2: label = await cells[0].text_content() value = await cells[1].text_content() label_key = label.strip().lower().replace(' ', '_').replace('/', '_') stock_data['statistics'][label_key] = value.strip() stats_count += 1 except: continue print(f"✅ Extracted {stats_count} statistics") print(f"✅ {ticker} complete!\n") except Exception as e: print(f"❌ Error: {e}") stock_data['error'] = str(e) finally: await browser.close() return stock_data async def main(): """Scrape all stocks, save data, insert to DB, generate reports""" stocks = [ ('AAPL', 'NASDAQ'), ('MSFT', 'NASDAQ'), ('SHOP.TO', 'TSX'), ] db = StockDatabase() print("\n" + "="*70) print("COMPLETE STOCK DATA SCRAPER & REPORT GENERATOR") print("="*70) print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") for ticker, exchange in stocks: # Scrape result = await scrape_complete_stock_data(ticker, exchange) if result.get('error'): print(f"⚠️ Skipping {ticker} due to error\n") continue # Save to file os.makedirs('data/financials', exist_ok=True) filepath = f'data/financials/{ticker}_yahoo.json' with open(filepath, 'w') as f: json.dump(result, f, indent=2) print(f"💾 Saved to {filepath}") # Insert quote to database quote = result.get('quote', {}) if quote and any(quote.values()): db.insert_stock_quote(ticker, quote) print(f"💾 Quote saved to database") # Generate report print(f"📄 Generating report...") content = gather_contents(ticker) md_path = save_markdown(ticker, content) print(f"✅ Markdown: {md_path}") try: pdf_path = f'data/reports/{ticker}_full_report.pdf' render_pdf_from_text(ticker, content, pdf_path) print(f"✅ PDF: {pdf_path}") except Exception as e: print(f"⚠️ PDF skipped: {e}") print("") db.close() print("="*70) print("✅ ALL COMPLETE!") print("="*70) if __name__ == "__main__": asyncio.run(main())