324 lines
12 KiB
Python
324 lines
12 KiB
Python
|
|
"""
Scrape news and press releases without API keys.

Uses Google News search results and direct scraping of press-release
sites (GlobeNewswire and Newswire.ca).
"""
|
||
|
|
|
||
|
|
import asyncio
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
from datetime import datetime, timedelta
|
||
|
|
from playwright.async_api import async_playwright
|
||
|
|
import time
|
||
|
|
import re
|
||
|
|
from urllib.parse import quote
|
||
|
|
|
||
|
|
|
||
|
|
class NewsPressScraper:
    """Scrape news articles and press releases for stocks using Playwright.

    Every scraped record is a plain ``dict``.  Per-stock results are written
    as ``<ticker>_news_pr.json`` files inside ``output_dir``.
    """

    # CSS selectors tried in order to locate Google News result containers;
    # the second entry is the fallback used when the first matches nothing.
    _NEWS_ITEM_SELECTORS = (
        'div[data-sokoban-container]',
        'div.SoaBEf, div.Gx5Zad',
    )

    # (record key, selector) pairs, in the order the keys are written.
    _NEWS_FIELDS = (
        ('title', 'div[role="heading"], h3, .mCBkyc'),
        ('source', '.CEMjEf, .NUnG9d span'),
        ('date', '.OSrXXb, time'),
        ('url', 'a'),
        ('snippet', '.GI74Re, .Y3v8qd'),
    )

    # Per-site configuration shared by the press-release scrapers.
    _PRESS_SITES = {
        'GlobeNewswire': {
            'base_url': 'https://www.globenewswire.com',
            'item_selectors': ('.article-item, .result-item, article',),
            'fields': (
                ('title', 'h3, h2, .title a'),
                ('date', 'time, .date'),
                ('url', 'a'),
                ('summary', 'p, .summary'),
            ),
        },
        'Newswire.ca': {
            'base_url': 'https://www.newswire.ca',
            'item_selectors': ('.release-card, .news-item, article',),
            'fields': (
                ('title', 'h3, h2, a.title'),
                ('date', 'time, .date, .timestamp'),
                ('url', 'a'),
                ('summary', 'p, .summary, .description'),
            ),
        },
    }

    def __init__(self, output_dir="data/news"):
        """Remember ``output_dir`` and create it if it does not exist."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _build_news_query(company_name, ticker):
        """Return the Google search query for a stock.

        Missing (falsy) name or ticker values are left out instead of being
        searched for as the literal text "None".
        """
        keywords = "(stock OR shares OR earnings)"
        terms = [f'"{term}"' for term in (company_name, ticker) if term]
        if not terms:
            return keywords
        return f"{' OR '.join(terms)} {keywords}"

    @staticmethod
    def _absolute_url(href, base_url):
        """Resolve a site-relative ``href`` against ``base_url``.

        ``href`` is returned unchanged when it is empty/``None`` (an anchor
        without an ``href`` attribute) or when no ``base_url`` is given.
        """
        if not href or base_url is None:
            return href
        if href.startswith('//'):
            # Protocol-relative link: it already names its own host.
            return f"https:{href}"
        if href.startswith('/'):
            return f"{base_url}{href}"
        return href

    async def _parse_item(self, item, fields, base_url=None, initial=None):
        """Build one record from a result element.

        ``fields`` is a sequence of ``(key, selector)`` pairs.  A key is only
        written when its selector matches; the ``'url'`` key stores the
        matched element's ``href`` (possibly ``None``), every other key stores
        the element's inner text.  Returns ``None`` when the record has no
        non-empty title.
        """
        record = dict(initial or {})
        for key, selector in fields:
            element = await item.query_selector(selector)
            if element is None:
                continue
            if key == 'url':
                href = await element.get_attribute('href')
                record[key] = self._absolute_url(href, base_url)
            else:
                record[key] = await element.inner_text()
        return record if record.get('title') else None

    async def _scrape_page(self, url, item_selectors, fields, *, site_name,
                           found_label, result_label, base_url=None,
                           initial=None, max_results=None):
        """Load ``url`` in headless Chromium and parse its result items.

        ``item_selectors`` are tried in order until one matches something.
        Page-level failures are reported and yield whatever was collected so
        far; a failure on a single item is reported and that item skipped.
        """
        records = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)

            try:
                # Opened inside the try so the browser is closed even if
                # creating the page fails.
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                items = []
                for selector in item_selectors:
                    items = await page.query_selector_all(selector)
                    if items:
                        break

                print(f" Found {len(items)} {found_label}")

                # A ``max_results`` of None slices to the full list.
                for item in items[:max_results]:
                    try:
                        record = await self._parse_item(
                            item, fields, base_url=base_url, initial=initial
                        )
                    except Exception as e:
                        # Best effort: one malformed item must not abort the
                        # page, but the failure should be visible.
                        print(f" ⚠️ Skipped one {site_name} item: {e}")
                        continue
                    if record is not None:
                        records.append(record)

                print(f"✅ Extracted {len(records)} {result_label}")

            except Exception as e:
                print(f"❌ Error scraping {site_name}: {e}")

            finally:
                await browser.close()

        return records

    async def _scrape_press_site(self, site_name, search_url):
        """Scrape one configured press-release site's search results page."""
        site = self._PRESS_SITES[site_name]
        return await self._scrape_page(
            search_url,
            site['item_selectors'],
            site['fields'],
            site_name=site_name,
            found_label="press releases",
            result_label="press releases",
            base_url=site['base_url'],
            initial={'source': site_name},
        )

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Returns a list of dicts with a ``title`` and, when found on the page,
        ``source``, ``date``, ``url`` and ``snippet``.  At most ``max_results``
        result elements are inspected.
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")

        # Build search query
        encoded_query = quote(self._build_news_query(company_name, ticker))

        # Limit to last 12 months
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"

        return await self._scrape_page(
            url,
            self._NEWS_ITEM_SELECTORS,
            self._NEWS_FIELDS,
            site_name="Google News",
            found_label="potential news items",
            result_label="news articles",
            max_results=max_results,
        )

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire search results for ``ticker``.

        ``company_name`` is accepted for signature parity with the other
        scrapers but is not used in the search.
        """
        print(f"\n🔍 Searching GlobeNewswire for {ticker}...")

        search_url = f"https://www.globenewswire.com/search/keyword/{quote(ticker)}"

        return await self._scrape_press_site('GlobeNewswire', search_url)

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca search results for ``ticker``.

        ``company_name`` is accepted for signature parity with the other
        scrapers but is not used in the search.
        """
        print(f"\n🔍 Searching Newswire.ca for {ticker}...")

        search_url = f"https://www.newswire.ca/search/?query={quote(ticker)}"

        return await self._scrape_press_site('Newswire.ca', search_url)

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape news and press releases for one stock and save them as JSON.

        Returns the dict that was written to
        ``<output_dir>/<ticker>_news_pr.json``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")

        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        # Scrape Google News
        all_data['news_articles'] = await self.scrape_google_news(company_name, ticker)

        # Small delay between requests
        await asyncio.sleep(3)

        # Scrape GlobeNewswire
        all_data['press_releases'].extend(
            await self.scrape_press_releases_globenewswire(company_name, ticker)
        )

        # Small delay
        await asyncio.sleep(3)

        # Scrape Newswire.ca
        all_data['press_releases'].extend(
            await self.scrape_press_releases_newswire(company_name, ticker)
        )

        # Save to file.  Path separators in a ticker (e.g. "BRK/B") would
        # otherwise point into a directory that does not exist.
        safe_ticker = str(ticker).replace('/', '_').replace('\\', '_')
        output_file = f"{self.output_dir}/{safe_ticker}_news_pr.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)

        print(f"\n📊 Summary for {ticker}:")
        print(f" News articles: {len(all_data['news_articles'])}")
        print(f" Press releases: {len(all_data['press_releases'])}")
        print(f" Saved to: {output_file}")

        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and press releases for several stocks in sequence.

        Each entry of ``stock_list`` is a mapping read via its ``'symbol'``
        and ``'name'`` keys.  Entries without a symbol are skipped.  A falsy
        ``max_stocks`` (``None`` or ``0``) means no limit.  Returns the list
        of per-stock result dicts.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []

        for stock in stock_list:
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                print(f"⚠️ Skipping listing without a symbol: {stock!r}")
                continue

            # Rate limiting - be respectful.  Pause only between stocks, so
            # nothing is waited for before the first or after the last one.
            if all_data:
                await asyncio.sleep(5)

            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)

        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data
|
||
|
|
|
||
|
|
|
||
|
|
async def main():
    """Smoke-test the scraper against the first three listed stocks.

    Reads the combined listings JSON file; when it is missing, prints a hint
    and returns without scraping anything.
    """
    listings_path = "data/listings/all_listings_combined.json"

    if os.path.exists(listings_path):
        with open(listings_path, 'r', encoding='utf-8') as handle:
            stocks = json.load(handle)

        print(f"📊 Found {len(stocks)} stocks in listings")

        # Only the first 3 stocks are scraped in this test run.
        await NewsPressScraper().scrape_multiple_stocks(stocks, max_stocks=3)
    else:
        print(f"❌ No listings file found at {listings_path}")
        print(" Run extract_listings.py first")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the test scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())
|