# complete_scraper_with_reports.py

"""
Complete Yahoo Finance scraper - gets quote data AND full statistics.
"""

import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
from database import StockDatabase
from generate_company_report import gather_contents, save_markdown, render_pdf_from_text
import re


def _to_yahoo_ticker(ticker, exchange):
    """Return the Yahoo Finance symbol for ``ticker`` listed on ``exchange``."""
    # Yahoo suffixes Toronto Stock Exchange listings with ".TO" and
    # TSX Venture listings with ".V"; other exchanges are used unchanged.
    suffix = {'TSX': '.TO', 'TSXV': '.V'}.get(exchange)
    if suffix is None or ticker.endswith(('.TO', '.V')):
        return ticker
    return f"{ticker}{suffix}"


def _squash_whitespace(text):
    """Collapse every whitespace run in ``text`` to one space (``None`` -> ``''``)."""
    return ' '.join((text or '').split())


async def scrape_complete_stock_data(ticker, exchange):
    """Scrape quote data and key statistics for one stock from Yahoo Finance.

    Two pages are loaded in a headless Chromium browser: the quote summary
    page (close/open/high/low/volume and the "At close" timestamp) and the
    key-statistics page (every two-cell table row becomes one statistic).

    Args:
        ticker: Symbol as the caller knows it, e.g. ``'AAPL'`` or ``'SHOP.TO'``.
        exchange: Exchange code. ``'TSX'`` and ``'TSXV'`` add the ``.TO`` /
            ``.V`` suffix when the ticker does not already carry one.

    Returns:
        A dict with the keys ``ticker``, ``exchange``, ``yahoo_ticker``,
        ``scraped_at`` (ISO timestamp), ``profile``, ``quote``,
        ``financials`` (left empty by this function), ``statistics`` and
        ``error``. ``error`` is ``None`` on success, otherwise a message;
        failures while scraping are reported there instead of being raised.
    """
    yahoo_ticker = _to_yahoo_ticker(ticker, exchange)

    print(f"\n{'='*70}")
    print(f"Scraping: {ticker} ({exchange}) -> {yahoo_ticker}")
    print('='*70)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),
            'profile': {},
            'quote': {},
            'financials': {},
            'statistics': {},
            'error': None
        }
        quote = stock_data['quote']

        try:
            # Created inside the try so the browser is closed by ``finally``
            # even when context/page creation fails.
            context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
            page = await context.new_page()

            # 1. Summary page - get quote data
            url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
            print("[1/2] Loading summary page...")
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            # Fixed pause to let client-side rendering fill in the quote fields.
            await asyncio.sleep(5)

            # Unknown symbols land on a page containing "Symbol Lookup".
            content = await page.content()
            if "Symbol Lookup" in content:
                print("❌ Ticker not found")
                stock_data['error'] = 'Ticker not found'
                # The ``finally`` clause below closes the browser.
                return stock_data

            # Close price: several elements carry this data-field, so pick
            # the one whose data-symbol matches our ticker.
            all_prices = await page.query_selector_all('[data-field="regularMarketPrice"]')
            for elem in all_prices:
                symbol_attr = await elem.get_attribute('data-symbol')
                if symbol_attr and symbol_attr.upper() == yahoo_ticker.upper():
                    price_clean = _squash_whitespace(await elem.text_content())
                    # A non-numeric price raises ValueError, which the outer
                    # handler records in stock_data['error'].
                    stock_data['profile']['current_price'] = float(price_clean.replace(',', ''))
                    quote['close'] = price_clean
                    break

            # Other quote fields (no data-symbol, safe to use first)
            open_elem = await page.query_selector('[data-field="regularMarketOpen"]')
            if open_elem:
                quote['open'] = _squash_whitespace(await open_elem.text_content())

            range_elem = await page.query_selector('[data-field="regularMarketDayRange"]')
            if range_elem:
                range_text = _squash_whitespace(await range_elem.text_content())
                if ' - ' in range_text:
                    # maxsplit=1 keeps the unpacking safe if the separator
                    # ever appears more than once.
                    low, high = range_text.split(' - ', 1)
                    quote['low'] = low.strip()
                    quote['high'] = high.strip()

            volume_elem = await page.query_selector('[data-field="regularMarketVolume"]')
            if volume_elem:
                quote['volume'] = _squash_whitespace(await volume_elem.text_content())

            page_text = await page.inner_text('body')
            time_match = re.search(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))', page_text)
            if time_match:
                quote['date'] = time_match.group(1).strip()

            print("✅ Quote data extracted")
            print(f"   Close: {quote.get('close', 'N/A')}")
            print(f"   Open: {quote.get('open', 'N/A')}")
            print(f"   High/Low: {quote.get('high', 'N/A')} / {quote.get('low', 'N/A')}")
            print(f"   Volume: {quote.get('volume', 'N/A')}")

            # 2. Key Statistics page - get full statistics
            stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"
            print("[2/2] Loading key statistics page...")
            await page.goto(stats_url, wait_until='domcontentloaded', timeout=60000)
            await asyncio.sleep(5)

            stat_tables = await page.query_selector_all('table')
            stats_count = 0
            for table in stat_tables:
                rows = await table.query_selector_all('tr')
                for row in rows:
                    try:
                        cells = await row.query_selector_all('td')
                        # Only label/value rows are statistics.
                        if len(cells) == 2:
                            label = await cells[0].text_content()
                            value = await cells[1].text_content()
                            label_key = label.strip().lower().replace(' ', '_').replace('/', '_')
                            stock_data['statistics'][label_key] = value.strip()
                            stats_count += 1
                    except Exception:
                        # Best effort: one unreadable row must not abort the
                        # rest. ``Exception`` (not a bare except) lets task
                        # cancellation and Ctrl-C propagate.
                        continue

            print(f"✅ Extracted {stats_count} statistics")
            print(f"✅ {ticker} complete!\n")

        except Exception as e:
            print(f"❌ Error: {e}")
            stock_data['error'] = str(e)

        finally:
            await browser.close()

        return stock_data


async def main():
    """Scrape all stocks, save data, insert to DB, generate reports.

    For each hard-coded (ticker, exchange) pair this scrapes the stock,
    skips it when the scraper reports an error, writes the result to
    ``data/financials/<ticker>_yahoo.json``, stores the quote in the
    database when it holds at least one non-empty value, and then builds the
    Markdown and PDF reports. A PDF failure is reported and skipped; the
    database handle is closed even if a step raises.
    """
    stocks = [
        ('AAPL', 'NASDAQ'),
        ('MSFT', 'NASDAQ'),
        ('SHOP.TO', 'TSX'),
    ]

    db = StockDatabase()
    try:
        print("\n" + "="*70)
        print("COMPLETE STOCK DATA SCRAPER & REPORT GENERATOR")
        print("="*70)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        for ticker, exchange in stocks:
            # Scrape
            result = await scrape_complete_stock_data(ticker, exchange)

            if result.get('error'):
                print(f"⚠️  Skipping {ticker} due to error\n")
                continue

            # Save to file
            os.makedirs('data/financials', exist_ok=True)
            filepath = f'data/financials/{ticker}_yahoo.json'
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)
            print(f"💾 Saved to {filepath}")

            # Insert quote to database (only when something was scraped)
            quote = result.get('quote', {})
            if quote and any(quote.values()):
                db.insert_stock_quote(ticker, quote)
                print("💾 Quote saved to database")

            # Generate report
            print("📄 Generating report...")
            content = gather_contents(ticker)
            md_path = save_markdown(ticker, content)
            print(f"✅ Markdown: {md_path}")

            try:
                # NOTE(review): whether render_pdf_from_text creates the
                # target directory is not visible here, so make sure it exists.
                os.makedirs('data/reports', exist_ok=True)
                pdf_path = f'data/reports/{ticker}_full_report.pdf'
                render_pdf_from_text(ticker, content, pdf_path)
                print(f"✅ PDF: {pdf_path}")
            except Exception as e:
                # The PDF is optional; the Markdown report already exists.
                print(f"⚠️  PDF skipped: {e}")

            print("")
    finally:
        # Release the database even when scraping or report generation raised.
        db.close()

    print("="*70)
    print("✅ ALL COMPLETE!")
    print("="*70)

# Run the scrape-and-report pipeline only when this file is executed as a
# script; importing the module does not start it.
if __name__ == "__main__":
    asyncio.run(main())
Initial commit: Stock Intelligence Automation System 2025-11-06 12:22:19 +01:00			`"""`
			`Complete Yahoo Finance scraper - gets quote data AND full statistics.`
			`"""`

			`import asyncio`
			`import json`
			`import os`
			`from datetime import datetime`
			`from playwright.async_api import async_playwright`
			`from database import StockDatabase`
			`from generate_company_report import gather_contents, save_markdown, render_pdf_from_text`
			`import re`


			`async def scrape_complete_stock_data(ticker, exchange):`
			`"""Scrape complete data including quote and all statistics"""`

			`# Format ticker`
			`yahoo_ticker = ticker`
			`if exchange in ['TSX', 'TSXV']:`
			`if not ticker.endswith('.TO') and not ticker.endswith('.V'):`
			`yahoo_ticker = f"{ticker}.TO"`

			`print(f"\n{'='*70}")`
			`print(f"Scraping: {ticker} ({exchange}) -> {yahoo_ticker}")`
			`print('='*70)`

			`async with async_playwright() as p:`
			`browser = await p.chromium.launch(headless=True)`
			`context = await browser.new_context(viewport={'width': 1920, 'height': 1080})`
			`page = await context.new_page()`

			`stock_data = {`
			`'ticker': ticker,`
			`'exchange': exchange,`
			`'yahoo_ticker': yahoo_ticker,`
			`'scraped_at': datetime.now().isoformat(),`
			`'profile': {},`
			`'quote': {},`
			`'financials': {},`
			`'statistics': {},`
			`'error': None`
			`}`

			`try:`
			`# 1. Summary page - get quote data`
			`url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"`
			`print(f"[1/2] Loading summary page...")`
			`await page.goto(url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5)`

			`# Check valid`
			`content = await page.content()`
			`if "Symbol Lookup" in content:`
			`print(f"❌ Ticker not found")`
			`stock_data['error'] = 'Ticker not found'`
			`await browser.close()`
			`return stock_data`

			`# Get quote data with ticker filtering`
			`# Don't wait for selector since there are multiple elements`

			`# Close price - find the one matching our ticker`
			`all_prices = await page.query_selector_all('[data-field="regularMarketPrice"]')`
			`for elem in all_prices:`
			`symbol_attr = await elem.get_attribute('data-symbol')`
			`if symbol_attr and symbol_attr.upper() == yahoo_ticker.upper():`
			`price_text = await elem.text_content()`
			`price_clean = ' '.join(price_text.split())`
			`stock_data['profile']['current_price'] = float(price_clean.replace(',', ''))`
			`stock_data['quote']['close'] = price_clean`
			`break`

			`# Other quote fields (no data-symbol, safe to use first)`
			`open_elem = await page.query_selector('[data-field="regularMarketOpen"]')`
			`if open_elem:`
			`stock_data['quote']['open'] = ' '.join((await open_elem.text_content()).split())`

			`range_elem = await page.query_selector('[data-field="regularMarketDayRange"]')`
			`if range_elem:`
			`range_text = ' '.join((await range_elem.text_content()).split())`
			`if ' - ' in range_text:`
			`low, high = range_text.split(' - ')`
			`stock_data['quote']['low'] = low.strip()`
			`stock_data['quote']['high'] = high.strip()`

			`volume_elem = await page.query_selector('[data-field="regularMarketVolume"]')`
			`if volume_elem:`
			`stock_data['quote']['volume'] = ' '.join((await volume_elem.text_content()).split())`

			`page_text = await page.inner_text('body')`
			`time_match = re.search(r'At close:\s*([^\n]+(?:EST\|EDT\|PST\|PDT))', page_text)`
			`if time_match:`
			`stock_data['quote']['date'] = time_match.group(1).strip()`

			`print(f"✅ Quote data extracted")`
			`print(f" Close: {stock_data['quote'].get('close', 'N/A')}")`
			`print(f" Open: {stock_data['quote'].get('open', 'N/A')}")`
			`print(f" High/Low: {stock_data['quote'].get('high', 'N/A')} / {stock_data['quote'].get('low', 'N/A')}")`
			`print(f" Volume: {stock_data['quote'].get('volume', 'N/A')}")`

			`# 2. Key Statistics page - get full statistics`
			`stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"`
			`print(f"[2/2] Loading key statistics page...")`
			`await page.goto(stats_url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5)`

			`stat_tables = await page.query_selector_all('table')`
			`stats_count = 0`
			`for table in stat_tables:`
			`rows = await table.query_selector_all('tr')`
			`for row in rows:`
			`try:`
			`cells = await row.query_selector_all('td')`
			`if len(cells) == 2:`
			`label = await cells[0].text_content()`
			`value = await cells[1].text_content()`
			`label_key = label.strip().lower().replace(' ', '_').replace('/', '_')`
			`stock_data['statistics'][label_key] = value.strip()`
			`stats_count += 1`
			`except:`
			`continue`

			`print(f"✅ Extracted {stats_count} statistics")`
			`print(f"✅ {ticker} complete!\n")`

			`except Exception as e:`
			`print(f"❌ Error: {e}")`
			`stock_data['error'] = str(e)`

			`finally:`
			`await browser.close()`

			`return stock_data`


			`async def main():`
			`"""Scrape all stocks, save data, insert to DB, generate reports"""`
			`stocks = [`
			`('AAPL', 'NASDAQ'),`
			`('MSFT', 'NASDAQ'),`
			`('SHOP.TO', 'TSX'),`
			`]`

			`db = StockDatabase()`

			`print("\n" + "="*70)`
			`print("COMPLETE STOCK DATA SCRAPER & REPORT GENERATOR")`
			`print("="*70)`
			`print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")`

			`for ticker, exchange in stocks:`
			`# Scrape`
			`result = await scrape_complete_stock_data(ticker, exchange)`

			`if result.get('error'):`
			`print(f"⚠️ Skipping {ticker} due to error\n")`
			`continue`

			`# Save to file`
			`os.makedirs('data/financials', exist_ok=True)`
			`filepath = f'data/financials/{ticker}_yahoo.json'`
			`with open(filepath, 'w') as f:`
			`json.dump(result, f, indent=2)`
			`print(f"💾 Saved to {filepath}")`

			`# Insert quote to database`
			`quote = result.get('quote', {})`
			`if quote and any(quote.values()):`
			`db.insert_stock_quote(ticker, quote)`
			`print(f"💾 Quote saved to database")`

			`# Generate report`
			`print(f"📄 Generating report...")`
			`content = gather_contents(ticker)`
			`md_path = save_markdown(ticker, content)`
			`print(f"✅ Markdown: {md_path}")`

			`try:`
			`pdf_path = f'data/reports/{ticker}_full_report.pdf'`
			`render_pdf_from_text(ticker, content, pdf_path)`
			`print(f"✅ PDF: {pdf_path}")`
			`except Exception as e:`
			`print(f"⚠️ PDF skipped: {e}")`

			`print("")`

			`db.close()`

			`print("="*70)`
			`print("✅ ALL COMPLETE!")`
			`print("="*70)`


			`if __name__ == "__main__":`
			`asyncio.run(main())`