197 lines
7.4 KiB
Python
197 lines
7.4 KiB
Python
|
|
"""
Complete Yahoo Finance scraper - gets quote data AND full statistics.
"""
|
||
|
|
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
from datetime import datetime
|
||
|
|
from playwright.async_api import async_playwright
|
||
|
|
from database import StockDatabase
|
||
|
|
from generate_company_report import gather_contents, save_markdown, render_pdf_from_text
|
||
|
|
import re
|
||
|
|
|
||
|
|
|
||
|
|
# Yahoo Finance symbol suffix for each supported Canadian exchange.
_YAHOO_SUFFIX_BY_EXCHANGE = {'TSX': '.TO', 'TSXV': '.V'}

# Seconds to pause after navigation before reading the DOM.
_RENDER_SETTLE_SECONDS = 5

# Navigation timeout handed to page.goto(), in milliseconds.
_NAVIGATION_TIMEOUT_MS = 60000


def _to_yahoo_ticker(ticker, exchange):
    """Return *ticker* with the Yahoo suffix for *exchange* appended if needed."""
    suffix = _YAHOO_SUFFIX_BY_EXCHANGE.get(exchange)
    already_suffixed = ticker.endswith(tuple(_YAHOO_SUFFIX_BY_EXCHANGE.values()))
    if suffix is None or already_suffixed:
        return ticker
    return f"{ticker}{suffix}"


def _collapse_whitespace(text):
    """Collapse every run of whitespace in *text* to one space and trim the ends."""
    return ' '.join((text or '').split())


async def _field_text(page, field):
    """Return the cleaned text of the first element with data-field=*field*, or None."""
    elem = await page.query_selector(f'[data-field="{field}"]')
    if elem is None:
        return None
    return _collapse_whitespace(await elem.text_content())


async def _scrape_quote(page, yahoo_ticker, stock_data):
    """Fill stock_data['quote'] and the current price from the loaded summary page."""
    quote = stock_data['quote']

    # Several price elements can be present on the page, so only the one
    # whose data-symbol matches the requested ticker is accepted.
    wanted_symbol = yahoo_ticker.upper()
    for elem in await page.query_selector_all('[data-field="regularMarketPrice"]'):
        symbol_attr = await elem.get_attribute('data-symbol')
        if symbol_attr and symbol_attr.upper() == wanted_symbol:
            price_clean = _collapse_whitespace(await elem.text_content())
            # A non-numeric price raises ValueError, which the caller records
            # as the scrape error for this ticker.
            stock_data['profile']['current_price'] = float(price_clean.replace(',', ''))
            quote['close'] = price_clean
            break

    # The remaining fields carry no data-symbol; the first match is used.
    open_text = await _field_text(page, 'regularMarketOpen')
    if open_text is not None:
        quote['open'] = open_text

    day_range = await _field_text(page, 'regularMarketDayRange')
    if day_range is not None and ' - ' in day_range:
        # maxsplit=1 keeps the unpacking safe if the separator repeats.
        low, high = day_range.split(' - ', 1)
        quote['low'] = low.strip()
        quote['high'] = high.strip()

    volume_text = await _field_text(page, 'regularMarketVolume')
    if volume_text is not None:
        quote['volume'] = volume_text

    # The quote timestamp is taken from the "At close: ..." line of the page text.
    page_text = await page.inner_text('body')
    time_match = re.search(r'At close:\s*([^\n]+(?:EST|EDT|PST|PDT))', page_text)
    if time_match:
        quote['date'] = time_match.group(1).strip()


async def _scrape_statistics(page, stock_data):
    """Copy every two-cell table row into stock_data['statistics']; return the row count."""
    statistics = stock_data['statistics']
    stats_count = 0
    for table in await page.query_selector_all('table'):
        for row in await table.query_selector_all('tr'):
            try:
                cells = await row.query_selector_all('td')
                if len(cells) != 2:
                    continue
                label = await cells[0].text_content()
                value = await cells[1].text_content()
                label_key = label.strip().lower().replace(' ', '_').replace('/', '_')
                statistics[label_key] = value.strip()
                stats_count += 1
            except Exception:
                # Best effort: a row that cannot be read is skipped. Catching
                # Exception (not a bare except) lets task cancellation and
                # KeyboardInterrupt propagate.
                continue
    return stats_count


async def scrape_complete_stock_data(ticker, exchange):
    """Scrape the Yahoo Finance quote and key statistics for one stock.

    Loads the summary page and then the key-statistics page in a headless
    Chromium browser and collects what it finds into a single dict.

    Args:
        ticker: Symbol as listed on its exchange; may already carry a
            '.TO' or '.V' suffix.
        exchange: Exchange name. 'TSX' and 'TSXV' get the Yahoo suffix
            '.TO' and '.V' respectively; any other value leaves the ticker
            unchanged.

    Returns:
        A dict with the keys 'ticker', 'exchange', 'yahoo_ticker',
        'scraped_at' (ISO timestamp), 'profile', 'quote', 'financials',
        'statistics' and 'error'. 'financials' is never filled here. 'error'
        is None on success, otherwise a message string; failures while the
        browser is open are reported this way instead of being raised.
    """
    yahoo_ticker = _to_yahoo_ticker(ticker, exchange)

    print(f"\n{'='*70}")
    print(f"Scraping: {ticker} ({exchange}) -> {yahoo_ticker}")
    print('='*70)

    stock_data = {
        'ticker': ticker,
        'exchange': exchange,
        'yahoo_ticker': yahoo_ticker,
        'scraped_at': datetime.now().isoformat(),
        'profile': {},
        'quote': {},
        'financials': {},
        'statistics': {},
        'error': None
    }

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(viewport={'width': 1920, 'height': 1080})
            page = await context.new_page()

            # 1. Summary page - get quote data
            url = f"https://finance.yahoo.com/quote/{yahoo_ticker}"
            print("[1/2] Loading summary page...")
            await page.goto(url, wait_until='domcontentloaded', timeout=_NAVIGATION_TIMEOUT_MS)
            await asyncio.sleep(_RENDER_SETTLE_SECONDS)

            # "Symbol Lookup" in the page source is treated as an unknown ticker.
            content = await page.content()
            if "Symbol Lookup" in content:
                print("❌ Ticker not found")
                stock_data['error'] = 'Ticker not found'
                # The finally clause below closes the browser.
                return stock_data

            await _scrape_quote(page, yahoo_ticker, stock_data)

            quote = stock_data['quote']
            print("✅ Quote data extracted")
            print(f" Close: {quote.get('close', 'N/A')}")
            print(f" Open: {quote.get('open', 'N/A')}")
            print(f" High/Low: {quote.get('high', 'N/A')} / {quote.get('low', 'N/A')}")
            print(f" Volume: {quote.get('volume', 'N/A')}")

            # 2. Key Statistics page - get full statistics
            stats_url = f"https://finance.yahoo.com/quote/{yahoo_ticker}/key-statistics"
            print("[2/2] Loading key statistics page...")
            await page.goto(stats_url, wait_until='domcontentloaded', timeout=_NAVIGATION_TIMEOUT_MS)
            await asyncio.sleep(_RENDER_SETTLE_SECONDS)

            stats_count = await _scrape_statistics(page, stock_data)

            print(f"✅ Extracted {stats_count} statistics")
            print(f"✅ {ticker} complete!\n")

        except Exception as e:
            print(f"❌ Error: {e}")
            stock_data['error'] = str(e)

        finally:
            await browser.close()

    return stock_data
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Scrape every configured stock, then persist and report on each one.

    For each (ticker, exchange) pair the scrape result is written to
    data/financials/<ticker>_yahoo.json, a non-empty quote is inserted into
    the database, and a Markdown report plus a PDF are generated. A ticker
    whose scrape reports an error is skipped. A PDF failure is printed and
    does not stop the run; any other exception propagates after the
    database has been closed.
    """
    stocks = [
        ('AAPL', 'NASDAQ'),
        ('MSFT', 'NASDAQ'),
        ('SHOP.TO', 'TSX'),
    ]

    db = StockDatabase()
    try:
        print("\n" + "="*70)
        print("COMPLETE STOCK DATA SCRAPER & REPORT GENERATOR")
        print("="*70)
        print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        for ticker, exchange in stocks:
            # Scrape
            result = await scrape_complete_stock_data(ticker, exchange)

            if result.get('error'):
                print(f"⚠️ Skipping {ticker} due to error\n")
                continue

            # Save to file
            os.makedirs('data/financials', exist_ok=True)
            filepath = f'data/financials/{ticker}_yahoo.json'
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(result, f, indent=2)
            print(f"💾 Saved to {filepath}")

            # Insert quote to database, but only when at least one field was scraped
            quote = result.get('quote', {})
            if quote and any(quote.values()):
                db.insert_stock_quote(ticker, quote)
                print("💾 Quote saved to database")

            # Generate report
            print("📄 Generating report...")
            content = gather_contents(ticker)
            md_path = save_markdown(ticker, content)
            print(f"✅ Markdown: {md_path}")

            # The PDF is optional: a rendering failure is reported and the run continues.
            try:
                pdf_path = f'data/reports/{ticker}_full_report.pdf'
                render_pdf_from_text(ticker, content, pdf_path)
                print(f"✅ PDF: {pdf_path}")
            except Exception as e:
                print(f"⚠️ PDF skipped: {e}")

            print("")
    finally:
        # Close the database even when scraping or report generation raises.
        db.close()

    print("="*70)
    print("✅ ALL COMPLETE!")
    print("="*70)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: build the top-level coroutine and run it to
    # completion on a fresh event loop. Nothing runs on import.
    pipeline = main()
    asyncio.run(pipeline)
|