329 lines
14 KiB
Python
329 lines
14 KiB
Python
|
|
"""
|
||
|
|
Scrape financial data from Yahoo Finance (no API key needed)
|
||
|
|
Gets financials, ratios, and key metrics for each stock
|
||
|
|
"""
|
||
|
|
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
from datetime import datetime
|
||
|
|
from playwright.async_api import async_playwright
|
||
|
|
import time
|
||
|
|
import re
|
||
|
|
|
||
|
|
|
||
|
|
class YahooFinanceScraper:
    """Scrape quote, financial, statistics and analysis pages from Yahoo Finance.

    Every stock is loaded in its own headless Chromium instance (Playwright).
    The collected data is returned as a dict and, for stocks whose page could
    be loaded, written to ``<output_dir>/<ticker>_yahoo.json``.
    """

    # Exchanges whose symbols need a ".TO" / ".V" suffix on Yahoo Finance.
    _TSX_EXCHANGES = ('TSX', 'TSXV', 'TSX/TSXV')

    # Text fragments that the code treats as "Yahoo showed a lookup page
    # instead of a quote page".
    _NOT_FOUND_MARKERS = ("Symbol Lookup", "Symbols similar to")

    # Matches e.g. "At close: November 5 at 4:00:01 PM EST". The character
    # class must exclude a real newline so the match stays on one line.
    _CLOSE_TIME_RE = re.compile(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))')

    def __init__(self, output_dir="data/financials"):
        """Remember the output directory and create it if it is missing.

        Args:
            output_dir: Directory that receives one JSON file per stock.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Pure helpers (no browser needed)
    # ------------------------------------------------------------------
    @classmethod
    def _to_yahoo_ticker(cls, ticker, exchange):
        """Return the Yahoo Finance symbol for a listing symbol and exchange."""
        if exchange in cls._TSX_EXCHANGES:
            # Keep an explicit suffix; otherwise try the TSX suffix first.
            if ticker.endswith(('.TO', '.V')):
                return ticker
            return f"{ticker}.TO"

        if exchange == 'CSE':
            # Drop only a leading "T2" (e.g. T2AAA -> AAA); a "T2" elsewhere
            # in the symbol is part of the symbol and must be kept.
            base = ticker[2:] if ticker.startswith('T2') else ticker
            # Drop anything after the first dot (e.g. T2AAAWH.U -> AAAWH).
            base = base.split('.')[0]
            return f"{base}.CN"

        return ticker

    @classmethod
    def _is_lookup_page(cls, html):
        """Return True when the page HTML contains a not-found marker."""
        return any(marker in html for marker in cls._NOT_FOUND_MARKERS)

    @classmethod
    def _parse_close_time(cls, page_text):
        """Return the "At close: ..." timestamp text, or '' when absent."""
        match = cls._CLOSE_TIME_RE.search(page_text)
        return match.group(1).strip() if match else ''

    # ------------------------------------------------------------------
    # Browser helpers
    # ------------------------------------------------------------------
    @staticmethod
    async def _open(page, url, wait_until='domcontentloaded', timeout=60000, settle=5):
        """Navigate to *url*, then sleep *settle* seconds for dynamic content."""
        await page.goto(url, wait_until=wait_until, timeout=timeout)
        await asyncio.sleep(settle)

    @staticmethod
    async def _field_text(page, field):
        """Return whitespace-normalised text of a ``data-field`` element, or None."""
        elem = await page.query_selector(f'[data-field="{field}"]')
        if not elem:
            return None
        return ' '.join((await elem.inner_text()).split())

    @staticmethod
    async def _collect_label_values(rows, target):
        """Store every two-cell (label, value) table row of *rows* in *target*."""
        for row in rows:
            try:
                cells = await row.query_selector_all('td')
                if len(cells) != 2:
                    continue
                label = await cells[0].inner_text()
                value = await cells[1].inner_text()
                key = label.strip().lower().replace(' ', '_').replace('/', '_')
                target[key] = value.strip()
            except Exception:
                # Best effort: one unreadable row must not abort the table.
                continue

    async def _extract_quote(self, page, stock_data):
        """Fill ``stock_data['quote']`` (and the current price) from the summary page."""
        # Start from empty fields so a missing element yields '' rather than
        # a missing key.
        quote = {
            'date': '',
            'open': '',
            'high': '',
            'low': '',
            'close': '',
            'volume': ''
        }
        stock_data['quote'] = quote

        # Close (current price)
        price_text = await self._field_text(page, 'regularMarketPrice')
        if price_text is not None:
            print(f" Raw price text: '{price_text}'")
            try:
                current_price = float(price_text.replace(',', ''))
            except ValueError:
                print(f" Warning: Could not parse price: {price_text}")
            else:
                stock_data['profile']['current_price'] = current_price
                quote['close'] = price_text
                print(f" Parsed price: {current_price}")

        # Open price
        open_text = await self._field_text(page, 'regularMarketOpen')
        if open_text is not None:
            quote['open'] = open_text

        # Day range, rendered as "<low> - <high>"
        range_text = await self._field_text(page, 'regularMarketDayRange')
        if range_text is not None and ' - ' in range_text:
            low, high = range_text.split(' - ', 1)
            quote['low'] = low.strip()
            quote['high'] = high.strip()

        # Volume
        volume_text = await self._field_text(page, 'regularMarketVolume')
        if volume_text is not None:
            quote['volume'] = volume_text

        # Date/time is only available as free text on the page.
        quote['date'] = self._parse_close_time(await page.inner_text('body'))

    async def _scrape_financials(self, page, yahoo_ticker, stock_data):
        """Load the financials page and store each row as label -> list of values."""
        try:
            await self._open(page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/financials")
            tables = await page.query_selector_all('div[class*="financials"] table')
            for table in tables:
                for row in await table.query_selector_all('tr'):
                    try:
                        cells = await row.query_selector_all('td, th')
                        if len(cells) < 2:
                            continue
                        label = await cells[0].inner_text()
                        values = [(await cell.inner_text()).strip() for cell in cells[1:]]
                        label_key = label.strip().lower().replace(' ', '_')
                        stock_data['financials'][label_key] = values
                    except Exception:
                        # Best effort: skip rows that cannot be read.
                        continue
        except Exception as e:
            print(f" Error extracting financials: {e}")

    async def _scrape_key_statistics(self, page, yahoo_ticker, stock_data):
        """Load the key-statistics page and merge its label/value rows."""
        try:
            await self._open(page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics")
            for table in await page.query_selector_all('table'):
                rows = await table.query_selector_all('tr')
                await self._collect_label_values(rows, stock_data['statistics'])
        except Exception as e:
            print(f" Error extracting statistics: {e}")

    async def _scrape_analysis(self, page, yahoo_ticker, stock_data):
        """Load the analysis page and store every table as a list of text rows."""
        try:
            await self._open(
                page,
                f"https://finance.yahoo.com/quote/{yahoo_ticker}/analysis",
                wait_until='networkidle',
                timeout=30000,
                settle=2,
            )
            tables = await page.query_selector_all('table')
            for idx, table in enumerate(tables):
                table_data = []
                for row in await table.query_selector_all('tr'):
                    cells = await row.query_selector_all('td, th')
                    row_data = [(await cell.inner_text()).strip() for cell in cells]
                    if row_data:
                        table_data.append(row_data)
                stock_data['analysis'][f'table_{idx}'] = table_data
        except Exception as e:
            print(f" Error extracting analysis: {e}")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def scrape_stock_data(self, ticker, exchange=""):
        """Scrape comprehensive data for a single stock.

        Args:
            ticker: Symbol as stored in the listings.
            exchange: Exchange code; 'TSX', 'TSXV', 'TSX/TSXV' and 'CSE'
                trigger Yahoo-specific suffix handling.

        Returns:
            A dict with the keys ``ticker``, ``exchange``, ``yahoo_ticker``,
            ``scraped_at``, ``profile``, ``quote``, ``financials``,
            ``statistics``, ``analysis`` and ``error``. ``error`` is ``None``
            unless the ticker was not found or scraping raised.

        Each section is extracted on a best-effort basis: a failure in one
        section is printed and the remaining sections are still attempted.
        When both the ``.TO`` and the ``.V`` symbol are unknown to Yahoo the
        result is returned without writing a JSON file.
        """
        print(f"\n🔍 Scraping {ticker}...")

        # Format ticker for Yahoo Finance
        yahoo_ticker = self._to_yahoo_ticker(ticker, exchange)
        if exchange == 'CSE':
            print(f" CSE stock: {ticker} -> {yahoo_ticker}")

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),
            'profile': {},
            'quote': {},  # Real-time quote data
            'financials': {},
            'statistics': {},
            'analysis': {},
            'error': None
        }

        async with async_playwright() as p:
            # A new browser is launched per stock; the flag disables Blink's
            # "AutomationControlled" feature.
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )

            try:
                context = await browser.new_context(
                    viewport={'width': 1920, 'height': 1080},
                    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
                )
                page = await context.new_page()

                # 1. Get Summary/Statistics page
                url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                print(f" Loading {url}...")
                await self._open(page, url)

                # Check if ticker exists
                if self._is_lookup_page(await page.content()):
                    print(f"⚠️ {yahoo_ticker} not found on Yahoo Finance")
                    stock_data['error'] = 'Ticker not found'

                    # Try alternative suffix for TSXV
                    if yahoo_ticker.endswith('.TO'):
                        # Swap the suffix on the Yahoo symbol itself so a
                        # listing symbol that already ends in ".TO" does not
                        # turn into "X.TO.V".
                        yahoo_ticker = f"{yahoo_ticker[:-len('.TO')]}.V"
                        print(f" Trying {yahoo_ticker}...")
                        url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                        await self._open(page, url)

                        if self._is_lookup_page(await page.content()):
                            # The ``finally`` below closes the browser.
                            return stock_data

                        stock_data['yahoo_ticker'] = yahoo_ticker
                        stock_data['error'] = None
                    # NOTE(review): without a fallback suffix the scrape
                    # continues with 'error' left set, as before. Confirm
                    # whether this should return early instead.

                # Extract quote data from the summary header
                try:
                    await self._extract_quote(page, stock_data)
                except Exception as e:
                    print(f" Error extracting summary: {e}")

                # Get market cap, P/E, etc from the stats table
                try:
                    stat_rows = await page.query_selector_all('table tr')
                    await self._collect_label_values(stat_rows, stock_data['statistics'])
                except Exception as e:
                    print(f" Error extracting summary: {e}")

                # 2. Get Financials page
                await self._scrape_financials(page, yahoo_ticker, stock_data)

                # 3. Get Key Statistics page
                await self._scrape_key_statistics(page, yahoo_ticker, stock_data)

                # 4. Get Analysis page (analyst ratings, growth estimates)
                await self._scrape_analysis(page, yahoo_ticker, stock_data)

                print(f"✅ {ticker} data scraped successfully")

            except Exception as e:
                print(f"❌ Error scraping {ticker}: {e}")
                stock_data['error'] = str(e)

            finally:
                await browser.close()

        # Save individual stock data
        output_file = f"{self.output_dir}/{ticker}_yahoo.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(stock_data, f, indent=2)

        return stock_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape data for multiple stocks, one after another.

        Args:
            stock_list: Sequence of dicts; each is read via ``.get('symbol')``
                and ``.get('exchange')``.
            max_stocks: When truthy, only the first ``max_stocks`` entries
                are scraped.

        Returns:
            List with one result dict per scraped stock, in input order.
        """
        print("=" * 60)
        print("YAHOO FINANCE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        successful = 0
        failed = 0

        for stock in stock_list:
            ticker = stock.get('symbol')
            exchange = stock.get('exchange')

            data = await self.scrape_stock_data(ticker, exchange)
            all_data.append(data)

            if data.get('error'):
                failed += 1
            else:
                successful += 1

            # Rate limiting
            await asyncio.sleep(2)

        print("\n" + "=" * 60)
        print(f"✅ Successfully scraped: {successful}")
        print(f"❌ Failed: {failed}")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Run the scraper against the first five entries of the saved listings.

    Prints a hint and returns without scraping when the combined listings
    file does not exist.
    """
    source_path = "data/listings/all_listings_combined.json"

    # Guard clause: nothing to do until the listings have been extracted.
    listings_present = os.path.exists(source_path)
    if not listings_present:
        print(f"❌ No listings file found at {source_path}")
        print(" Run extract_listings.py first")
        return

    with open(source_path, 'r', encoding='utf-8') as handle:
        stocks = json.load(handle)

    print(f"📊 Found {len(stocks)} stocks in listings")

    # Limit the run to five stocks; this entry point is only a quick test.
    await YahooFinanceScraper().scrape_multiple_stocks(stocks, max_stocks=5)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: drive the async main() to completion.
    asyncio.run(main())
|