# Repository listing metadata: 329 lines, 14 KiB, Python.

"""
Scrape financial data from Yahoo Finance (no API key needed)
Gets financials, ratios, and key metrics for each stock
"""
import asyncio
import json
import os
from datetime import datetime
from playwright.async_api import async_playwright
import time
import re
class YahooFinanceScraper:
    """Scrape quote, financial, statistics and analysis data from Yahoo Finance.

    Pages are rendered with a headless Chromium browser driven by Playwright
    (a new browser is launched for every stock). The result for each stock is
    written to ``<output_dir>/<ticker>_yahoo.json``.
    """

    # Exchange codes whose tickers are first tried with the ``.TO`` suffix.
    _TSX_EXCHANGES = ('TSX', 'TSXV', 'TSX/TSXV')

    # Captures the timestamp in text such as
    # "At close: November 5 at 4:00:01 PM EST". ``[^\n]`` keeps the match on
    # one line; the lazy quantifier stops at the first time-zone token.
    _CLOSE_TIME_RE = re.compile(r'At close:\s*([^\n]+?(?:EST|EDT|PST|PDT))')

    def __init__(self, output_dir="data/financials"):
        """Remember *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _to_yahoo_ticker(ticker, exchange):
        """Return the Yahoo Finance symbol for *ticker* listed on *exchange*."""
        if exchange in YahooFinanceScraper._TSX_EXCHANGES:
            # Canadian stocks need exchange-specific suffixes; try TSX first.
            if ticker.endswith(('.TO', '.V')):
                return ticker
            return f"{ticker}.TO"
        if exchange == 'CSE':
            # CSE tickers may carry a "T2" prefix (e.g. T2AAA -> AAA). Strip
            # the prefix only, never a "T2" that occurs later in the symbol.
            clean_ticker = ticker[2:] if ticker.startswith('T2') else ticker
            # Drop any suffix after a dot (e.g. T2AAAWH.U -> AAAWH).
            clean_ticker = clean_ticker.split('.')[0]
            return f"{clean_ticker}.CN"
        return ticker

    @staticmethod
    def _parse_close_time(page_text):
        """Return the "At close: ..." timestamp found in *page_text*, or ''."""
        match = YahooFinanceScraper._CLOSE_TIME_RE.search(page_text)
        return match.group(1).strip() if match else ''

    # ------------------------------------------------------------------
    # Page helpers
    # ------------------------------------------------------------------
    @staticmethod
    async def _load(page, url, wait_until='domcontentloaded', timeout=60000, settle=5):
        """Navigate *page* to *url*, then sleep *settle* seconds for dynamic content."""
        await page.goto(url, wait_until=wait_until, timeout=timeout)
        await asyncio.sleep(settle)

    @staticmethod
    async def _field_text(page, field):
        """Return the whitespace-normalised text of ``[data-field=<field>]``, or None."""
        elem = await page.query_selector(f'[data-field="{field}"]')
        if not elem:
            return None
        text = await elem.inner_text()
        # Collapse whitespace and newlines.
        return ' '.join(text.split())

    @staticmethod
    async def _collect_label_value_rows(rows, target):
        """Store every two-cell row of *rows* in *target* as ``label -> value``."""
        for row in rows:
            try:
                cells = await row.query_selector_all('td')
                if len(cells) == 2:
                    label = await cells[0].inner_text()
                    value = await cells[1].inner_text()
                    key = label.strip().lower().replace(' ', '_').replace('/', '_')
                    target[key] = value.strip()
            except Exception:
                # Best effort: skip a row that cannot be read. ``Exception``
                # (not a bare except) lets task cancellation propagate.
                continue

    async def _extract_quote(self, page, stock_data):
        """Fill ``stock_data['quote']`` and the current price from the quote header."""
        # Start from empty fields so a failed lookup leaves '' rather than stale data.
        quote = {
            'date': '',
            'open': '',
            'high': '',
            'low': '',
            'close': '',
            'volume': ''
        }
        stock_data['quote'] = quote

        # Close (current price)
        price_text = await self._field_text(page, 'regularMarketPrice')
        if price_text is not None:
            print(f" Raw price text: '{price_text}'")
            try:
                current_price = float(price_text.replace(',', ''))
            except ValueError:
                print(f" Warning: Could not parse price: {price_text}")
            else:
                stock_data['profile']['current_price'] = current_price
                quote['close'] = price_text
                print(f" Parsed price: {current_price}")

        # Open price
        open_text = await self._field_text(page, 'regularMarketOpen')
        if open_text is not None:
            quote['open'] = open_text

        # Day range (low - high)
        range_text = await self._field_text(page, 'regularMarketDayRange')
        if range_text is not None and ' - ' in range_text:
            # partition() cannot raise when the separator appears more than once.
            low, _, high = range_text.partition(' - ')
            quote['low'] = low.strip()
            quote['high'] = high.strip()

        # Volume
        volume_text = await self._field_text(page, 'regularMarketVolume')
        if volume_text is not None:
            quote['volume'] = volume_text

        # Date/time - extracted from the visible page text.
        page_text = await page.inner_text('body')
        close_time = self._parse_close_time(page_text)
        if close_time:
            quote['date'] = close_time

    async def _scrape_financials(self, page, yahoo_ticker, stock_data):
        """Load the financials page and store each table row as ``label -> [values]``."""
        await self._load(page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/financials")
        tables = await page.query_selector_all('div[class*="financials"] table')
        for table in tables:
            for row in await table.query_selector_all('tr'):
                try:
                    cells = await row.query_selector_all('td, th')
                    if len(cells) >= 2:
                        label = await cells[0].inner_text()
                        values = [(await cell.inner_text()).strip() for cell in cells[1:]]
                        label_key = label.strip().lower().replace(' ', '_')
                        stock_data['financials'][label_key] = values
                except Exception:
                    continue

    async def _scrape_statistics(self, page, yahoo_ticker, stock_data):
        """Load the key-statistics page and merge its two-cell rows into ``statistics``."""
        await self._load(page, f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics")
        for table in await page.query_selector_all('table'):
            rows = await table.query_selector_all('tr')
            await self._collect_label_value_rows(rows, stock_data['statistics'])

    async def _scrape_analysis(self, page, yahoo_ticker, stock_data):
        """Load the analysis page and store every table as a list of text rows."""
        await self._load(
            page,
            f"https://finance.yahoo.com/quote/{yahoo_ticker}/analysis",
            wait_until='networkidle',
            timeout=30000,
            settle=2,
        )
        tables = await page.query_selector_all('table')
        for idx, table in enumerate(tables):
            table_data = []
            for row in await table.query_selector_all('tr'):
                cells = await row.query_selector_all('td, th')
                row_data = [(await cell.inner_text()).strip() for cell in cells]
                if row_data:
                    table_data.append(row_data)
            stock_data['analysis'][f'table_{idx}'] = table_data

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    async def scrape_stock_data(self, ticker, exchange=""):
        """Scrape comprehensive data for a single stock.

        Args:
            ticker: Symbol as stored in the listings (CSE symbols may carry a
                ``T2`` prefix, which is removed for the Yahoo lookup).
            exchange: Exchange code. ``'TSX'``, ``'TSXV'``, ``'TSX/TSXV'`` and
                ``'CSE'`` select a Yahoo suffix; any other value uses *ticker*
                unchanged.

        Returns:
            A dict with the keys ``ticker``, ``exchange``, ``yahoo_ticker``,
            ``scraped_at``, ``profile``, ``quote``, ``financials``,
            ``statistics``, ``analysis`` and ``error`` (``None`` on success).
            The dict is also written to ``<output_dir>/<ticker>_yahoo.json``,
            except when a ``.TO`` symbol and its ``.V`` fallback are both
            unknown: then the dict is returned at once and no file is written.
        """
        print(f"\n🔍 Scraping {ticker}...")

        # Format ticker for Yahoo Finance
        yahoo_ticker = self._to_yahoo_ticker(ticker, exchange)
        if exchange == 'CSE':
            print(f" CSE stock: {ticker} -> {yahoo_ticker}")

        stock_data = {
            'ticker': ticker,
            'exchange': exchange,
            'yahoo_ticker': yahoo_ticker,
            'scraped_at': datetime.now().isoformat(),
            'profile': {},
            'quote': {},  # Real-time quote data
            'financials': {},
            'statistics': {},
            'analysis': {},
            'error': None
        }

        async with async_playwright() as p:
            # A fresh headless browser per stock. NOTE(review): the launch flag
            # presumably reduces automation detection - confirm it is still needed.
            browser = await p.chromium.launch(
                headless=True,
                args=['--disable-blink-features=AutomationControlled']
            )
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
            )
            page = await context.new_page()

            try:
                # 1. Summary page
                url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                print(f" Loading {url}...")
                await self._load(page, url)

                # Check if ticker exists
                page_content = await page.content()
                if "Symbol Lookup" in page_content or "Symbols similar to" in page_content:
                    print(f"⚠️ {yahoo_ticker} not found on Yahoo Finance")
                    stock_data['error'] = 'Ticker not found'

                    # Try the TSXV suffix when the TSX one is unknown.
                    if yahoo_ticker.endswith('.TO'):
                        # Swap the suffix on the symbol that failed, so a ticker
                        # that already ended in ".TO" does not become "X.TO.V".
                        yahoo_ticker = f"{yahoo_ticker[:-len('.TO')]}.V"
                        print(f" Trying {yahoo_ticker}...")
                        url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
                        await self._load(page, url)
                        page_content = await page.content()
                        if "Symbol Lookup" in page_content:
                            # The ``finally`` below closes the browser.
                            return stock_data
                        stock_data['yahoo_ticker'] = yahoo_ticker
                        stock_data['error'] = None

                # Quote header and summary table are independent best-effort
                # steps: a failure in one must not skip the other.
                try:
                    await self._extract_quote(page, stock_data)
                except Exception as e:
                    print(f" Error extracting summary: {e}")

                try:
                    # Market cap, P/E, etc. from the summary stats table
                    stat_rows = await page.query_selector_all('table tr')
                    await self._collect_label_value_rows(stat_rows, stock_data['statistics'])
                except Exception as e:
                    print(f" Error extracting summary: {e}")

                # 2. Financials page
                try:
                    await self._scrape_financials(page, yahoo_ticker, stock_data)
                except Exception as e:
                    print(f" Error extracting financials: {e}")

                # 3. Key Statistics page
                try:
                    await self._scrape_statistics(page, yahoo_ticker, stock_data)
                except Exception as e:
                    print(f" Error extracting statistics: {e}")

                # 4. Analysis page (analyst ratings, growth estimates)
                try:
                    await self._scrape_analysis(page, yahoo_ticker, stock_data)
                except Exception as e:
                    print(f" Error extracting analysis: {e}")

                print(f"{ticker} data scraped successfully")
            except Exception as e:
                print(f"❌ Error scraping {ticker}: {e}")
                stock_data['error'] = str(e)
            finally:
                await browser.close()

        # Save individual stock data
        output_file = f"{self.output_dir}/{ticker}_yahoo.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(stock_data, f, indent=2)

        return stock_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape data for multiple stocks, one after another.

        Args:
            stock_list: Sequence of dicts; each is read with ``.get('symbol')``
                and ``.get('exchange')``.
            max_stocks: When truthy, only the first *max_stocks* entries are
                scraped.

        Returns:
            A list with the result dict of ``scrape_stock_data`` for each stock,
            in input order.
        """
        print("=" * 60)
        print("YAHOO FINANCE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        successful = 0
        failed = 0

        for stock in stock_list:
            ticker = stock.get('symbol')
            exchange = stock.get('exchange')

            data = await self.scrape_stock_data(ticker, exchange)
            all_data.append(data)

            if data.get('error'):
                failed += 1
            else:
                successful += 1

            # Rate limiting
            await asyncio.sleep(2)

        print("\n" + "=" * 60)
        print(f"✅ Successfully scraped: {successful}")
        print(f"❌ Failed: {failed}")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data
async def main():
    """Smoke-test the scraper on the first five entries of the combined listings.

    Prints a hint and returns without scraping when the listings file is absent.
    """
    listings_path = "data/listings/all_listings_combined.json"

    if os.path.exists(listings_path):
        # Load listings
        with open(listings_path, 'r', encoding='utf-8') as handle:
            listings = json.load(handle)
        print(f"📊 Found {len(listings)} stocks in listings")

        # Test with first 5 stocks
        await YahooFinanceScraper().scrape_multiple_stocks(listings, max_stocks=5)
    else:
        print(f"❌ No listings file found at {listings_path}")
        print(" Run extract_listings.py first")
if __name__ == "__main__":
    # Run the async test driver only when executed as a script, not on import.
    asyncio.run(main())