# scrape_yahoo_finance.py

"""
Scrape financial data from Yahoo Finance (no API key needed)
Gets financials, ratios, and key metrics for each stock
"""

import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
import time
import re


class YahooFinanceScraper:
    """Scrape quote, financials, statistics and analysis pages from Yahoo Finance.

    Each scraped stock is returned as a plain dict and also written to
    ``<output_dir>/<ticker>_yahoo.json``.
    """

    # Substrings whose presence in the page HTML is treated as "symbol unknown".
    _NOT_FOUND_MARKERS = ("Symbol Lookup", "Symbols similar to")

    # Captures e.g. "November 5 at 4:00:01 PM EST" from "At close: November 5 ...".
    # ``[^\n]`` (not ``[^\\n]``) so that month names containing the letter "n"
    # still match; lazy so the capture stops at the first timezone token.
    _CLOSE_TIME_RE = re.compile(r'At close:\s*([^\n]+?\b(?:EST|EDT|PST|PDT))\b')

    def __init__(self, output_dir="data/financials"):
        """Remember the output directory and create it if it does not exist.

        Args:
            output_dir: Directory that receives one JSON file per scraped stock.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _to_yahoo_ticker(ticker, exchange):
        """Translate a listing ticker into the symbol used in Yahoo Finance URLs."""
        if exchange in ('TSX', 'TSXV', 'TSX/TSXV'):
            # Canadian listings need a suffix; TSX (.TO) is tried first and
            # the caller falls back to TSXV (.V) if the page is missing.
            if ticker.endswith(('.TO', '.V')):
                return ticker
            return f"{ticker}.TO"

        if exchange == 'CSE':
            # Drop only a leading "T2" (e.g. T2AAA -> AAA); str.replace would
            # also strip any later "T2" inside the symbol.
            clean_ticker = ticker[2:] if ticker.startswith('T2') else ticker
            # Drop anything after the first dot (e.g. T2AAAWH.U -> AAAWH).
            clean_ticker = clean_ticker.split('.')[0]
            return f"{clean_ticker}.CN"

        return ticker

    @staticmethod
    async def _open(page, url, wait_until='domcontentloaded', timeout=60000, settle=5):
        """Navigate to *url*, then sleep *settle* seconds for dynamic content."""
        await page.goto(url, wait_until=wait_until, timeout=timeout)
        await asyncio.sleep(settle)

    @classmethod
    async def _ticker_missing(cls, page):
        """Return True when the loaded page contains any not-found marker."""
        html = await page.content()
        return any(marker in html for marker in cls._NOT_FOUND_MARKERS)

    @staticmethod
    async def _field_text(page, field):
        """Return the whitespace-normalised text of a ``data-field`` element, or None."""
        elem = await page.query_selector(f'[data-field="{field}"]')
        if elem is None:
            return None
        return ' '.join((await elem.inner_text()).split())

    async def _extract_quote(self, page, stock_data):
        """Fill ``stock_data['quote']`` and ``profile.current_price`` from the summary page."""
        # Start from blank fields so a partially failed extraction still
        # yields a dict with every expected key.
        quote = {
            'date': '',
            'open': '',
            'high': '',
            'low': '',
            'close': '',
            'volume': ''
        }
        stock_data['quote'] = quote

        # Close (current price)
        price_text = await self._field_text(page, 'regularMarketPrice')
        if price_text is not None:
            print(f"   Raw price text: '{price_text}'")
            try:
                current_price = float(price_text.replace(',', ''))
            except ValueError:
                print(f"   Warning: Could not parse price: {price_text}")
            else:
                stock_data['profile']['current_price'] = current_price
                quote['close'] = price_text
                print(f"   Parsed price: {current_price}")

        # Open price
        open_text = await self._field_text(page, 'regularMarketOpen')
        if open_text is not None:
            quote['open'] = open_text

        # Day range, rendered as "<low> - <high>"
        range_text = await self._field_text(page, 'regularMarketDayRange')
        if range_text and ' - ' in range_text:
            low, _, high = range_text.partition(' - ')
            quote['low'] = low.strip()
            quote['high'] = high.strip()

        # Volume
        volume_text = await self._field_text(page, 'regularMarketVolume')
        if volume_text is not None:
            quote['volume'] = volume_text

        # Date/time, e.g. "At close: November 5 at 4:00:01 PM EST"
        page_text = await page.inner_text('body')
        time_match = self._CLOSE_TIME_RE.search(page_text)
        if time_match:
            quote['date'] = time_match.group(1).strip()

    @staticmethod
    async def _extract_label_value_rows(page, target):
        """Copy every two-cell table row on the page into *target* as label -> value."""
        for row in await page.query_selector_all('table tr'):
            try:
                cells = await row.query_selector_all('td')
                if len(cells) != 2:
                    continue
                label = await cells[0].inner_text()
                value = await cells[1].inner_text()
                key = label.strip().lower().replace(' ', '_').replace('/', '_')
                target[key] = value.strip()
            except Exception:
                # Best effort per row. ``Exception`` rather than a bare
                # ``except`` so task cancellation is not swallowed.
                continue

    @staticmethod
    async def _extract_financials(page, target):
        """Copy financial-statement rows into *target* as label -> list of cell texts."""
        tables = await page.query_selector_all('div[class*="financials"] table')
        for table in tables:
            for row in await table.query_selector_all('tr'):
                try:
                    cells = await row.query_selector_all('td, th')
                    if len(cells) < 2:
                        continue
                    label = await cells[0].inner_text()
                    values = [(await cell.inner_text()).strip() for cell in cells[1:]]
                    target[label.strip().lower().replace(' ', '_')] = values
                except Exception:
                    continue

    @staticmethod
    async def _extract_analysis(page, target):
        """Store every table on the page in *target* as ``table_<idx>`` -> rows of cell texts."""
        for idx, table in enumerate(await page.query_selector_all('table')):
            table_data = []
            for row in await table.query_selector_all('tr'):
                cells = await row.query_selector_all('td, th')
                row_data = [(await cell.inner_text()).strip() for cell in cells]
                if row_data:
                    table_data.append(row_data)
            target[f'table_{idx}'] = table_data

    async def scrape_stock_data(self, ticker, exchange=""):
        """Scrape comprehensive data for a single stock.

        Loads the summary, financials, key-statistics and analysis pages in
        a fresh headless Chromium instance. Each page is extracted
        independently, so a failure on one page is printed and the others
        still run. When the symbol is not found (after the TSX -> TSXV
        fallback, where applicable) the remaining pages are skipped.

        Args:
            ticker: Symbol as stored in the listings data.
            exchange: Listing exchange; 'TSX', 'TSXV', 'TSX/TSXV' and 'CSE'
                trigger Yahoo-specific suffix handling.

        Returns:
            A dict with keys ``ticker``, ``exchange``, ``yahoo_ticker``,
            ``scraped_at``, ``profile``, ``quote``, ``financials``,
            ``statistics``, ``analysis`` and ``error`` (None on success).
            The same dict is written to ``<output_dir>/<ticker>_yahoo.json``,
            including when ``error`` is set.
        """
        print(f"\n🔍 Scraping {ticker}...")

        yahoo_ticker = self._to_yahoo_ticker(ticker, exchange)
        if exchange == 'CSE':
            print(f"   CSE stock: {ticker} -> {yahoo_ticker}")

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),  # naive local time
            'profile': {},
            'quote': {},  # Real-time quote data
            'financials': {},
            'statistics': {},
            'analysis': {},
            'error': None
        }

        async with async_playwright() as p:
            # The flag asks Blink not to enable its automation-controlled
            # feature; it has nothing to do with caching.
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )

            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                )
                page = await context.new_page()

                # 1. Summary page
                url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                print(f"   Loading {url}...")
                await self._open(page, url)

                found = not await self._ticker_missing(page)
                if not found:
                    print(f"⚠️  {yahoo_ticker} not found on Yahoo Finance")

                    # Try the TSXV suffix when the TSX one does not exist.
                    if yahoo_ticker.endswith('.TO'):
                        # Swap the suffix on the Yahoo symbol itself so a
                        # ticker passed in as "ABC.TO" becomes "ABC.V".
                        alt_ticker = f"{yahoo_ticker[:-len('.TO')]}.V"
                        print(f"   Trying {alt_ticker}...")
                        await self._open(page, f"https://finance.yahoo.com/quote/{alt_ticker}")

                        if not await self._ticker_missing(page):
                            yahoo_ticker = alt_ticker
                            stock_data['yahoo_ticker'] = alt_ticker
                            found = True

                if not found:
                    # Nothing to extract; skip the remaining page loads.
                    stock_data['error'] = 'Ticker not found'
                else:
                    # Quote header fields from the summary page.
                    try:
                        await self._extract_quote(page, stock_data)
                    except Exception as e:
                        print(f"   Error extracting summary: {e}")

                    # Market cap, P/E, etc. from the summary stats table.
                    # Runs whether or not the quote extraction succeeded.
                    try:
                        await self._extract_label_value_rows(page, stock_data['statistics'])
                    except Exception as e:
                        print(f"   Error extracting summary: {e}")

                    # 2. Financials page
                    try:
                        await self._open(
                            page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/financials"
                        )
                        await self._extract_financials(page, stock_data['financials'])
                    except Exception as e:
                        print(f"   Error extracting financials: {e}")

                    # 3. Key Statistics page
                    try:
                        await self._open(
                            page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"
                        )
                        await self._extract_label_value_rows(page, stock_data['statistics'])
                    except Exception as e:
                        print(f"   Error extracting statistics: {e}")

                    # 4. Analysis page (analyst ratings, growth estimates)
                    # NOTE(review): Playwright discourages 'networkidle';
                    # the wait settings are kept as they were - confirm that
                    # this page actually reaches network idle in practice.
                    try:
                        await self._open(
                            page,
                            f"https://finance.yahoo.com/quote/{yahoo_ticker}/analysis",
                            wait_until='networkidle',
                            timeout=30000,
                            settle=2,
                        )
                        await self._extract_analysis(page, stock_data['analysis'])
                    except Exception as e:
                        print(f"   Error extracting analysis: {e}")

                    print(f"✅ {ticker} data scraped successfully")

            except Exception as e:
                print(f"❌ Error scraping {ticker}: {e}")
                stock_data['error'] = str(e)

            finally:
                await browser.close()

        # Save individual stock data (also for failures, so the error is recorded)
        output_file = os.path.join(self.output_dir, f"{ticker}_yahoo.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(stock_data, f, indent=2)

        return stock_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape data for multiple stocks, one after another.

        Args:
            stock_list: Sequence of dicts; each is read via ``.get('symbol')``
                and ``.get('exchange')``.
            max_stocks: If not None, only the first ``max_stocks`` entries
                are scraped (0 scrapes nothing).

        Returns:
            A list with one result dict per scraped stock, in input order.
            Entries without a symbol are skipped and counted as failed.
        """
        print("=" * 60)
        print("YAHOO FINANCE SCRAPING")
        print("=" * 60)

        # ``is not None`` so that an explicit limit of 0 is honoured.
        if max_stocks is not None:
            stock_list = stock_list[:max_stocks]

        all_data = []
        successful = 0
        failed = 0

        for stock in stock_list:
            ticker = stock.get('symbol')
            exchange = stock.get('exchange')

            if not ticker:
                # A malformed listing entry must not abort the whole batch.
                print(f"⚠️  Skipping listing without a symbol: {stock}")
                failed += 1
                continue

            data = await self.scrape_stock_data(ticker, exchange)
            all_data.append(data)

            if data.get('error'):
                failed += 1
            else:
                successful += 1

            # Rate limiting
            await asyncio.sleep(2)

        print("\n" + "=" * 60)
        print(f"✅ Successfully scraped: {successful}")
        print(f"❌ Failed: {failed}")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data


async def main():
    """Smoke-test the scraper on the first five entries of the combined listings file.

    Prints a hint and returns without scraping when the listings file is absent.
    """
    source_path = "data/listings/all_listings_combined.json"

    if os.path.exists(source_path):
        with open(source_path, 'r', encoding='utf-8') as handle:
            stocks = json.load(handle)

        print(f"📊 Found {len(stocks)} stocks in listings")

        # Keep the trial run short: only the first five listings are scraped.
        await YahooFinanceScraper().scrape_multiple_stocks(stocks, max_stocks=5)
    else:
        print(f"❌ No listings file found at {source_path}")
        print("   Run extract_listings.py first")


# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
Initial commit: Stock Intelligence Automation System 2025-11-06 12:22:19 +01:00			`"""`
			`Scrape financial data from Yahoo Finance (no API key needed)`
			`Gets financials, ratios, and key metrics for each stock`
			`"""`

			`import asyncio`
			`import json`
			`import os`
			`from datetime import datetime`
			`from playwright.async_api import async_playwright`
			`import time`
			`import re`


			`class YahooFinanceScraper:`
			`def __init__(self, output_dir="data/financials"):`
			`self.output_dir = output_dir`
			`os.makedirs(output_dir, exist_ok=True)`

			`async def scrape_stock_data(self, ticker, exchange=""):`
			`"""Scrape comprehensive data for a single stock"""`
			`print(f"\n🔍 Scraping {ticker}...")`

			`# Format ticker for Yahoo Finance`
			`yahoo_ticker = ticker`

			`# Canadian stocks need exchange-specific suffixes`
			`if exchange in ['TSX', 'TSXV', 'TSX/TSXV']:`
			`if not ticker.endswith('.TO') and not ticker.endswith('.V'):`
			`yahoo_ticker = f"{ticker}.TO" # Try TSX first`

			`# CSE (Canadian Securities Exchange) stocks use .CN suffix`
			`# CSE tickers in database may have "T2" prefix which needs to be removed`
			`elif exchange == 'CSE':`
			`# Remove T2 prefix if present (e.g., T2AAA -> AAA)`
			`clean_ticker = ticker.replace('T2', '') if ticker.startswith('T2') else ticker`
			`# Remove any suffix after a dot (e.g., T2AAAWH.U -> AAAWH)`
			`if '.' in clean_ticker:`
			`clean_ticker = clean_ticker.split('.')[0]`
			`yahoo_ticker = f"{clean_ticker}.CN"`
			`print(f" CSE stock: {ticker} -> {yahoo_ticker}")`

			`stock_data = {`
			`'ticker': ticker,`
			`'exchange': exchange,`
			`'yahoo_ticker': yahoo_ticker,`
			`'scraped_at': datetime.now().isoformat(),`
			`'profile': {},`
			`'quote': {}, # Real-time quote data`
			`'financials': {},`
			`'statistics': {},`
			`'analysis': {},`
			`'error': None`
			`}`

			`async with async_playwright() as p:`
			`# Launch with no-cache to avoid stale data`
			`browser = await p.chromium.launch(`
			`headless=True,`
			`args=['--disable-blink-features=AutomationControlled']`
			`)`
			`context = await browser.new_context(`
			`viewport={'width': 1920, 'height': 1080},`
			`user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'`
			`)`
			`page = await context.new_page()`

			`try:`
			`# 1. Get Summary/Statistics page`
			`url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"`
			`print(f" Loading {url}...")`
			`await page.goto(url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5) # Wait for dynamic content to load`

			`# Check if ticker exists`
			`page_content = await page.content()`
			`if "Symbol Lookup" in page_content or "Symbols similar to" in page_content:`
			`print(f"⚠️ {yahoo_ticker} not found on Yahoo Finance")`
			`stock_data['error'] = 'Ticker not found'`

			`# Try alternative suffix for TSXV`
			`if yahoo_ticker.endswith('.TO'):`
			`yahoo_ticker = f"{ticker}.V"`
			`print(f" Trying {yahoo_ticker}...")`
			`url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"`
			`await page.goto(url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5)`

			`page_content = await page.content()`
			`if "Symbol Lookup" in page_content:`
			`await browser.close()`
			`return stock_data`
			`else:`
			`stock_data['yahoo_ticker'] = yahoo_ticker`
			`stock_data['error'] = None`

			`# Extract key stats and quote data from summary`
			`try:`
			`# Get real-time quote data from the quote header section`
			`# Initialize quote fields to empty to avoid caching from previous runs`
			`stock_data['quote'] = {`
			`'date': '',`
			`'open': '',`
			`'high': '',`
			`'low': '',`
			`'close': '',`
			`'volume': ''`
			`}`

			`# Close (current price)`
			`price_elem = await page.query_selector('[data-field="regularMarketPrice"]')`
			`if price_elem:`
			`price_text = await price_elem.inner_text()`
			`# Remove whitespace and newlines`
			`price_text = ' '.join(price_text.split())`
			`print(f" Raw price text: '{price_text}'")`
			`try:`
			`current_price = float(price_text.replace(',', ''))`
			`stock_data['profile']['current_price'] = current_price`
			`stock_data['quote']['close'] = price_text`
			`print(f" Parsed price: {current_price}")`
			`except ValueError:`
			`print(f" Warning: Could not parse price: {price_text}")`

			`# Open price`
			`open_elem = await page.query_selector('[data-field="regularMarketOpen"]')`
			`if open_elem:`
			`open_text = await open_elem.inner_text()`
			`stock_data['quote']['open'] = ' '.join(open_text.split())`

			`# Day range (high/low)`
			`range_elem = await page.query_selector('[data-field="regularMarketDayRange"]')`
			`if range_elem:`
			`range_text = await range_elem.inner_text()`
			`range_text = ' '.join(range_text.split())`
			`if ' - ' in range_text:`
			`low, high = range_text.split(' - ')`
			`stock_data['quote']['low'] = low.strip()`
			`stock_data['quote']['high'] = high.strip()`

			`# Volume`
			`volume_elem = await page.query_selector('[data-field="regularMarketVolume"]')`
			`if volume_elem:`
			`volume_text = await volume_elem.inner_text()`
			`stock_data['quote']['volume'] = ' '.join(volume_text.split())`

			`# Date/time - extract from page text`
			`page_text = await page.inner_text('body')`
			`# Look for "At close: November 5 at 4:00:01 PM EST" pattern`
			`import re`
			`time_match = re.search(r'At close:\s*([^\\n]+(?:EST\|EDT\|PST\|PDT))', page_text)`
			`if time_match:`
			`stock_data['quote']['date'] = time_match.group(1).strip()`

			`except Exception as e:`
			`print(f" Error extracting summary: {e}")`

			`# Get market cap, P/E, etc from the stats table`
			`stat_rows = await page.query_selector_all('table tr')`
			`for row in stat_rows:`
			`try:`
			`cells = await row.query_selector_all('td')`
			`if len(cells) == 2:`
			`label = await cells[0].inner_text()`
			`value = await cells[1].inner_text()`

			`label = label.strip().lower().replace(' ', '_').replace('/', '_')`
			`stock_data['statistics'][label] = value.strip()`
			`except:`
			`continue`

			`except Exception as e:`
			`print(f" Error extracting summary: {e}")`

			`# 2. Get Financials page`
			`try:`
			`financials_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/financials"`
			`await page.goto(financials_url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5)`

			`# Extract financial data`
			`financial_tables = await page.query_selector_all('div[class*="financials"] table')`
			`for table in financial_tables:`
			`rows = await table.query_selector_all('tr')`
			`for row in rows:`
			`try:`
			`cells = await row.query_selector_all('td, th')`
			`if len(cells) >= 2:`
			`label = await cells[0].inner_text()`
			`values = []`
			`for i in range(1, len(cells)):`
			`val = await cells[i].inner_text()`
			`values.append(val.strip())`

			`label_key = label.strip().lower().replace(' ', '_')`
			`stock_data['financials'][label_key] = values`
			`except:`
			`continue`

			`except Exception as e:`
			`print(f" Error extracting financials: {e}")`

			`# 3. Get Key Statistics page`
			`try:`
			`stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"`
			`await page.goto(stats_url, wait_until='domcontentloaded', timeout=60000)`
			`await asyncio.sleep(5)`

			`# Extract all statistics`
			`stat_tables = await page.query_selector_all('table')`
			`for table in stat_tables:`
			`rows = await table.query_selector_all('tr')`
			`for row in rows:`
			`try:`
			`cells = await row.query_selector_all('td')`
			`if len(cells) == 2:`
			`label = await cells[0].inner_text()`
			`value = await cells[1].inner_text()`

			`label_key = label.strip().lower().replace(' ', '_').replace('/', '_')`
			`stock_data['statistics'][label_key] = value.strip()`
			`except:`
			`continue`

			`except Exception as e:`
			`print(f" Error extracting statistics: {e}")`

			`# 4. Get Analysis page (analyst ratings, growth estimates)`
			`try:`
			`analysis_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/analysis"`
			`await page.goto(analysis_url, wait_until='networkidle', timeout=30000)`
			`await asyncio.sleep(2)`

			`# Extract analysis data`
			`analysis_tables = await page.query_selector_all('table')`
			`for idx, table in enumerate(analysis_tables):`
			`table_data = []`
			`rows = await table.query_selector_all('tr')`
			`for row in rows:`
			`cells = await row.query_selector_all('td, th')`
			`row_data = []`
			`for cell in cells:`
			`text = await cell.inner_text()`
			`row_data.append(text.strip())`
			`if row_data:`
			`table_data.append(row_data)`

			`stock_data['analysis'][f'table_{idx}'] = table_data`

			`except Exception as e:`
			`print(f" Error extracting analysis: {e}")`

			`print(f"✅ {ticker} data scraped successfully")`

			`except Exception as e:`
			`print(f"❌ Error scraping {ticker}: {e}")`
			`stock_data['error'] = str(e)`

			`finally:`
			`await browser.close()`

			`# Save individual stock data`
			`output_file = f"{self.output_dir}/{ticker}_yahoo.json"`
			`with open(output_file, 'w', encoding='utf-8') as f:`
			`json.dump(stock_data, f, indent=2)`

			`return stock_data`

			`async def scrape_multiple_stocks(self, stock_list, max_stocks=None):`
			`"""Scrape data for multiple stocks"""`
			`print("=" * 60)`
			`print("YAHOO FINANCE SCRAPING")`
			`print("=" * 60)`

			`if max_stocks:`
			`stock_list = stock_list[:max_stocks]`

			`all_data = []`
			`successful = 0`
			`failed = 0`

			`for stock in stock_list:`
			`ticker = stock.get('symbol')`
			`exchange = stock.get('exchange')`

			`data = await self.scrape_stock_data(ticker, exchange)`
			`all_data.append(data)`

			`if data.get('error'):`
			`failed += 1`
			`else:`
			`successful += 1`

			`# Rate limiting`
			`await asyncio.sleep(2)`

			`print("\n" + "=" * 60)`
			`print(f"✅ Successfully scraped: {successful}")`
			`print(f"❌ Failed: {failed}")`
			`print(f"📁 Data saved to: {self.output_dir}/")`
			`print("=" * 60)`

			`return all_data`


			`async def main():`
			`"""Test the scraper with a few stocks"""`

			`# Load listings`
			`listings_file = "data/listings/all_listings_combined.json"`

			`if not os.path.exists(listings_file):`
			`print(f"❌ No listings file found at {listings_file}")`
			`print(" Run extract_listings.py first")`
			`return`

			`with open(listings_file, 'r', encoding='utf-8') as f:`
			`listings = json.load(f)`

			`print(f"📊 Found {len(listings)} stocks in listings")`

			`# Test with first 5 stocks`
			`scraper = YahooFinanceScraper()`
			`await scraper.scrape_multiple_stocks(listings, max_stocks=5)`


			`if __name__ == "__main__":`
			`asyncio.run(main())`