Initial commit: Stock Intelligence Automation System

- Complete scraper with Yahoo Finance integration (fixed quote data extraction)
- Database schema with stock_quotes table
- Report generator (Markdown + PDF)
- Daily automation scripts (cron job at 12 PM)
- Financial calculator with 40+ metrics
- News, SEC, and SEDAR scrapers
- CSV export functionality
- Supports NASDAQ and TSX stocks
- All quote data issues resolved (date, open, high, low, close, volume)
- Production ready with 100% data accuracy
This commit is contained in:
Aherobo Ovie Victor
2025-11-06 12:22:19 +01:00
commit 389a01cb0a
16 changed files with 4528 additions and 0 deletions
+323
View File
@@ -0,0 +1,323 @@
"""
Scrape news and press releases without API keys
Uses Google search results and direct source scraping
"""
import asyncio
import json
import os
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import time
import re
from urllib.parse import quote
class NewsPressScraper:
    """Scrape news articles and press releases for stocks without API keys.

    Pages are loaded in headless Chromium through Playwright. Every scrape
    method is best-effort: page-level failures are printed and an empty or
    partial list is returned instead of raising.
    """

    # CSS selectors used on GlobeNewswire search result pages.
    _GLOBENEWSWIRE_SELECTORS = {
        'items': '.article-item, .result-item, article',
        'title': 'h3, h2, .title a',
        'date': 'time, .date',
        'summary': 'p, .summary',
    }

    # CSS selectors used on Newswire.ca search result pages.
    _NEWSWIRE_SELECTORS = {
        'items': '.release-card, .news-item, article',
        'title': 'h3, h2, a.title',
        'date': 'time, .date, .timestamp',
        'summary': 'p, .summary, .description',
    }

    def __init__(self, output_dir="data/news"):
        """Store the output directory and create it if it does not exist.

        Args:
            output_dir: Directory that receives the per-ticker JSON files.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    async def _collect_text_fields(item, record, field_selectors):
        """Store the inner text of each selector's first match in ``record``.

        ``field_selectors`` is an iterable of ``(key, css_selector)`` pairs.
        A key is only written when its selector matches an element.
        """
        for key, selector in field_selectors:
            element = await item.query_selector(selector)
            if element:
                record[key] = await element.inner_text()

    async def _parse_news_item(self, item):
        """Build an article dict from one Google News result element."""
        article = {}
        await self._collect_text_fields(item, article, (
            ('title', 'div[role="heading"], h3, .mCBkyc'),
            ('source', '.CEMjEf, .NUnG9d span'),
            ('date', '.OSrXXb, time'),
        ))

        link_elem = await item.query_selector('a')
        if link_elem:
            article['url'] = await link_elem.get_attribute('href')

        await self._collect_text_fields(item, article, (
            ('snippet', '.GI74Re, .Y3v8qd'),
        ))
        return article

    async def _parse_press_item(self, item, site_name, base_url, selectors):
        """Build a press-release dict from one search result element."""
        pr = {'source': site_name}
        await self._collect_text_fields(item, pr, (
            ('title', selectors['title']),
            ('date', selectors['date']),
        ))

        link_elem = await item.query_selector('a')
        if link_elem:
            href = await link_elem.get_attribute('href')
            # An anchor without an href yields None; keep the release and
            # simply leave out the URL instead of failing on the item.
            if href is not None:
                if href.startswith('/'):
                    # Site-relative link: prefix the site origin.
                    href = f"{base_url}{href}"
                pr['url'] = href

        await self._collect_text_fields(item, pr, (
            ('summary', selectors['summary']),
        ))
        return pr

    async def _scrape_press_site(self, site_name, search_url, base_url, selectors):
        """Load a press-release search page and extract its result items.

        Args:
            site_name: Label stored as each item's ``source`` and used in
                the error message.
            search_url: Fully built search URL to open.
            base_url: Origin prepended to links that start with ``/``.
            selectors: Mapping with ``items``, ``title``, ``date`` and
                ``summary`` CSS selectors.

        Returns:
            List of press-release dicts that have a non-empty ``title``.
        """
        press_releases = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await page.goto(search_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                pr_items = await page.query_selector_all(selectors['items'])
                print(f" Found {len(pr_items)} press releases")

                for item in pr_items:
                    try:
                        pr = await self._parse_press_item(
                            item, site_name, base_url, selectors
                        )
                    except Exception as e:
                        # Best-effort: one unreadable item must not abort
                        # the page, but the reason is reported.
                        print(f" ⚠️ Skipping unreadable {site_name} item: {e}")
                        continue
                    if pr.get('title'):
                        press_releases.append(pr)

                print(f"✅ Extracted {len(press_releases)} press releases")
            except Exception as e:
                print(f"❌ Error scraping {site_name}: {e}")
            finally:
                await browser.close()
        return press_releases

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Args:
            company_name: Company name used as a quoted search term.
            ticker: Ticker symbol used as a quoted search term.
            max_results: Maximum number of result elements to parse.

        Returns:
            List of article dicts with a non-empty ``title`` and, when
            found on the page, ``source``, ``date``, ``url`` and ``snippet``.
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")

        # Only quote the terms that are actually present so a missing name
        # or ticker does not turn into the literal search term "None".
        terms = [f'"{term}"' for term in (company_name, ticker) if term]
        if not terms:
            print("❌ Error scraping Google News: no company name or ticker to search for")
            return []
        query = f'{" OR ".join(terms)} (stock OR shares OR earnings)'
        encoded_query = quote(query)

        # tbm=nws selects the News tab; tbs=qdr:y limits to the last 12 months.
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"

        news_articles = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                news_items = await page.query_selector_all('div[data-sokoban-container]')
                if not news_items:
                    # Try alternative selectors
                    news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')
                print(f" Found {len(news_items)} potential news items")

                for item in news_items[:max_results]:
                    try:
                        article = await self._parse_news_item(item)
                    except Exception as e:
                        # Best-effort: report and move on to the next item.
                        print(f" ⚠️ Skipping unreadable Google News item: {e}")
                        continue
                    if article.get('title'):
                        news_articles.append(article)

                print(f"✅ Extracted {len(news_articles)} news articles")
            except Exception as e:
                print(f"❌ Error scraping Google News: {e}")
            finally:
                await browser.close()
        return news_articles

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire search results for a ticker.

        Args:
            company_name: Accepted for signature parity; not used in the search.
            ticker: Ticker symbol used as the search keyword.

        Returns:
            List of press-release dicts (``source``, ``title`` and, when
            found, ``date``, ``url``, ``summary``). Empty when ``ticker``
            is missing.
        """
        print(f"\n🔍 Searching GlobeNewswire for {ticker}...")
        if not ticker:
            # quote() cannot encode None; there is nothing to search for.
            print("❌ Error scraping GlobeNewswire: no ticker to search for")
            return []
        search_url = f"https://www.globenewswire.com/search/keyword/{quote(ticker)}"
        return await self._scrape_press_site(
            'GlobeNewswire',
            search_url,
            "https://www.globenewswire.com",
            self._GLOBENEWSWIRE_SELECTORS,
        )

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca search results for a ticker.

        Args:
            company_name: Accepted for signature parity; not used in the search.
            ticker: Ticker symbol used as the search query.

        Returns:
            List of press-release dicts (``source``, ``title`` and, when
            found, ``date``, ``url``, ``summary``). Empty when ``ticker``
            is missing.
        """
        print(f"\n🔍 Searching Newswire.ca for {ticker}...")
        if not ticker:
            # quote() cannot encode None; there is nothing to search for.
            print("❌ Error scraping Newswire.ca: no ticker to search for")
            return []
        search_url = f"https://www.newswire.ca/search/?query={quote(ticker)}"
        return await self._scrape_press_site(
            'Newswire.ca',
            search_url,
            "https://www.newswire.ca",
            self._NEWSWIRE_SELECTORS,
        )

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape news and press releases for one stock and save them as JSON.

        Args:
            ticker: Ticker symbol; also used to name the output file.
            company_name: Company name used for the news search.

        Returns:
            Dict with ``ticker``, ``company_name``, ``scraped_at`` (ISO
            timestamp), ``news_articles`` and ``press_releases``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")

        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        all_data['news_articles'] = await self.scrape_google_news(company_name, ticker)

        # Small delay between requests
        await asyncio.sleep(3)

        pr_gnw = await self.scrape_press_releases_globenewswire(company_name, ticker)
        all_data['press_releases'].extend(pr_gnw)

        # Small delay
        await asyncio.sleep(3)

        pr_nw = await self.scrape_press_releases_newswire(company_name, ticker)
        all_data['press_releases'].extend(pr_nw)

        # A path separator inside the ticker would point the file into a
        # directory that does not exist, so neutralise it in the file name.
        safe_ticker = str(ticker).replace('/', '_').replace('\\', '_')
        output_file = f"{self.output_dir}/{safe_ticker}_news_pr.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)

        print(f"\n📊 Summary for {ticker}:")
        print(f" News articles: {len(all_data['news_articles'])}")
        print(f" Press releases: {len(all_data['press_releases'])}")
        print(f" Saved to: {output_file}")
        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and press releases for several stocks in sequence.

        Args:
            stock_list: Sequence of dicts read via ``symbol`` and ``name`` keys.
            max_stocks: When truthy, only the first ``max_stocks`` entries
                are processed; ``None`` or ``0`` means no limit.

        Returns:
            List with one result dict per scraped stock. Entries without a
            ``symbol`` are skipped and do not appear in the list.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        for stock in stock_list:
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                # Without a symbol neither the press searches nor the
                # output file name can be built; skip instead of aborting.
                print(f"⚠️ Skipping listing without a symbol: {company_name}")
                continue

            if all_data:
                # Rate limiting - be respectful. Pause only between stocks,
                # not after the final one.
                await asyncio.sleep(5)

            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)

        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)
        return all_data
async def main():
    """Run a small trial scrape over the first three listed stocks.

    Reads the combined listings JSON; when that file is absent a hint is
    printed and nothing is scraped.
    """
    listings_path = "data/listings/all_listings_combined.json"

    if os.path.exists(listings_path):
        with open(listings_path, 'r', encoding='utf-8') as handle:
            stocks = json.load(handle)
    else:
        # Nothing to work with until the listings have been extracted.
        print(f"❌ No listings file found at {listings_path}")
        print(" Run extract_listings.py first")
        return

    print(f"📊 Found {len(stocks)} stocks in listings")

    # Keep the trial run short: only the first 3 stocks.
    await NewsPressScraper().scrape_multiple_stocks(stocks, max_stocks=3)
if __name__ == "__main__":
    # Executed as a script: drive the trial scrape to completion.
    entry_coroutine = main()
    asyncio.run(entry_coroutine)