# microcap_scrapping/complete_scraper_with_reports.py

"""
Complete Yahoo Finance scraper - gets quote data AND full statistics.
"""

import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
from database import StockDatabase
from generate_company_report import gather_contents, save_markdown, render_pdf_from_text
import re


def _format_yahoo_ticker(ticker, exchange):
    """Return the Yahoo Finance symbol for *ticker* listed on *exchange*."""
    # Already suffixed symbols are passed through untouched.
    if ticker.endswith(('.TO', '.V')):
        return ticker
    if exchange == 'TSX':
        return f"{ticker}.TO"
    if exchange == 'TSXV':
        # Yahoo lists TSX Venture symbols under ".V", not ".TO".
        return f"{ticker}.V"
    return ticker


async def _element_text(elem):
    """Return the element's text with whitespace runs collapsed ('' if none)."""
    raw = await elem.text_content()
    return ' '.join((raw or '').split())


async def scrape_complete_stock_data(ticker, exchange):
    """Scrape quote data and key statistics for one ticker from Yahoo Finance.

    Loads the summary page (close/open/high/low/volume and the "At close"
    timestamp) and then the key-statistics page (every two-cell table row).

    Args:
        ticker: Symbol as used by the caller (e.g. ``'AAPL'``, ``'SHOP.TO'``).
        exchange: Exchange code; ``'TSX'`` and ``'TSXV'`` get the ``.TO`` /
            ``.V`` Yahoo suffix appended when the ticker has none.

    Returns:
        A dict with keys ``ticker``, ``exchange``, ``yahoo_ticker``,
        ``scraped_at``, ``profile``, ``quote``, ``financials``,
        ``statistics`` and ``error``. Failures while loading or parsing the
        pages are reported through ``error`` (a string) rather than raised;
        ``error`` is ``None`` on success.
    """
    yahoo_ticker = _format_yahoo_ticker(ticker, exchange)

    print(f"\n{'='*70}")
    print(f"Scraping: {ticker} ({exchange}) -> {yahoo_ticker}")
    print('='*70)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
        page = await context.new_page()

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),
            'profile': {},
            'quote': {},
            'financials': {},
            'statistics': {},
            'error': None
        }
        quote = stock_data['quote']

        try:
            # 1. Summary page - get quote data
            url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
            print("[1/2] Loading summary page...")
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            # Give the client-side rendering time to populate the quote fields.
            await asyncio.sleep(5)

            # Unknown symbols land on a page containing "Symbol Lookup".
            content = await page.content()
            if "Symbol Lookup" in content:
                print("❌ Ticker not found")
                stock_data['error'] = 'Ticker not found'
                # The ``finally`` clause below closes the browser.
                return stock_data

            # Close price: the page carries several regularMarketPrice
            # elements, so pick the one whose data-symbol matches our ticker.
            wanted_symbol = yahoo_ticker.upper()
            price_elems = await page.query_selector_all('[data-field="regularMarketPrice"]')
            for elem in price_elems:
                symbol_attr = await elem.get_attribute('data-symbol')
                if not symbol_attr or symbol_attr.upper() != wanted_symbol:
                    continue
                price_clean = await _element_text(elem)
                quote['close'] = price_clean
                try:
                    stock_data['profile']['current_price'] = float(price_clean.replace(',', ''))
                except ValueError:
                    # Keep the raw text in the quote; a non-numeric price
                    # should not abort the rest of the scrape.
                    print(f"⚠️  Could not parse price: {price_clean!r}")
                break

            # Other quote fields (no data-symbol, safe to use first)
            open_elem = await page.query_selector('[data-field="regularMarketOpen"]')
            if open_elem:
                quote['open'] = await _element_text(open_elem)

            range_elem = await page.query_selector('[data-field="regularMarketDayRange"]')
            if range_elem:
                range_text = await _element_text(range_elem)
                # partition never raises, unlike two-name unpacking of split().
                low, sep, high = range_text.partition(' - ')
                if sep:
                    quote['low'] = low.strip()
                    quote['high'] = high.strip()

            volume_elem = await page.query_selector('[data-field="regularMarketVolume"]')
            if volume_elem:
                quote['volume'] = await _element_text(volume_elem)

            page_text = await page.inner_text('body')
            time_match = re.search(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))', page_text)
            if time_match:
                quote['date'] = time_match.group(1).strip()

            print("✅ Quote data extracted")
            print(f"   Close: {quote.get('close', 'N/A')}")
            print(f"   Open: {quote.get('open', 'N/A')}")
            print(f"   High/Low: {quote.get('high', 'N/A')} / {quote.get('low', 'N/A')}")
            print(f"   Volume: {quote.get('volume', 'N/A')}")

            # 2. Key Statistics page - get full statistics
            stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"
            print("[2/2] Loading key statistics page...")
            await page.goto(stats_url, wait_until='domcontentloaded', timeout=60000)
            await asyncio.sleep(5)

            stat_tables = await page.query_selector_all('table')
            stats_count = 0
            for table in stat_tables:
                rows = await table.query_selector_all('tr')
                for row in rows:
                    try:
                        cells = await row.query_selector_all('td')
                        # Only label/value rows are statistics.
                        if len(cells) != 2:
                            continue
                        label = await cells[0].text_content()
                        value = await cells[1].text_content()
                        label_key = label.strip().lower().replace(' ', '_').replace('/', '_')
                        stock_data['statistics'][label_key] = value.strip()
                        stats_count += 1
                    except Exception:
                        # Best effort: a malformed or detached row is skipped,
                        # but cancellation/interrupts still propagate.
                        continue

            print(f"✅ Extracted {stats_count} statistics")
            print(f"✅ {ticker} complete!\n")

        except Exception as e:
            print(f"❌ Error: {e}")
            stock_data['error'] = str(e)

        finally:
            await browser.close()

        return stock_data


async def main():
    """Scrape all stocks, save data, insert to DB, generate reports.

    For each configured (ticker, exchange) pair: scrape Yahoo Finance, skip
    the ticker if the scrape reported an error, dump the result to
    ``data/financials/<ticker>_yahoo.json``, store the quote in the database
    when it has any non-empty value, then write the Markdown report and
    (best effort) the PDF report. The database handle is closed even if a
    step raises.
    """
    stocks = [
        ('AAPL', 'NASDAQ'),
        ('MSFT', 'NASDAQ'),
        ('SHOP.TO', 'TSX'),
    ]

    db = StockDatabase()
    try:
        print("\n" + "="*70)
        print("COMPLETE STOCK DATA SCRAPER & REPORT GENERATOR")
        print("="*70)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        for ticker, exchange in stocks:
            # Scrape
            result = await scrape_complete_stock_data(ticker, exchange)

            if result.get('error'):
                print(f"⚠️  Skipping {ticker} due to error\n")
                continue

            # Save to file
            os.makedirs('data/financials', exist_ok=True)
            filepath = f'data/financials/{ticker}_yahoo.json'
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)
            print(f"💾 Saved to {filepath}")

            # Insert quote to database (only if at least one field has a value)
            quote = result.get('quote', {})
            if quote and any(quote.values()):
                db.insert_stock_quote(ticker, quote)
                print("💾 Quote saved to database")

            # Generate report
            print("📄 Generating report...")
            content = gather_contents(ticker)
            md_path = save_markdown(ticker, content)
            print(f"✅ Markdown: {md_path}")

            # PDF rendering is optional; a failure must not stop the run.
            try:
                pdf_path = f'data/reports/{ticker}_full_report.pdf'
                render_pdf_from_text(ticker, content, pdf_path)
                print(f"✅ PDF: {pdf_path}")
            except Exception as e:
                print(f"⚠️  PDF skipped: {e}")

            print("")
    finally:
        # Release the database handle even when a step above raised.
        db.close()

    print("="*70)
    print("✅ ALL COMPLETE!")
    print("="*70)


# Script entry point: run the async pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())