# File: microcap_scrapping/scrape_news_pr.py (Python, 324 lines, 12 KiB)

"""
Scrape news and press releases without API keys
Uses Google search results and direct source scraping
"""
import asyncio
import json
import os
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import time
import re
from urllib.parse import quote
class NewsPressScraper:
    """Scrape news articles and press releases for stocks without API keys.

    Each scrape launches a headless Chromium browser through Playwright,
    loads a search page and extracts fields with CSS selectors. Combined
    per-ticker results are written as JSON files into ``output_dir``.
    """

    def __init__(self, output_dir="data/news"):
        """Store the output directory and create it if it does not exist.

        Args:
            output_dir: Directory that receives the ``<ticker>_news_pr.json``
                files.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    async def _set_text(item, selector, record, key):
        """Store the inner text of the first *selector* match under *key*.

        The key is left unset when nothing inside *item* matches.
        """
        element = await item.query_selector(selector)
        if element:
            record[key] = await element.inner_text()

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Args:
            company_name: Company name, quoted verbatim in the search query.
            ticker: Ticker symbol, quoted verbatim in the search query.
            max_results: Maximum number of result containers to inspect.

        Returns:
            A list of dicts. Every dict has a ``title``; ``source``, ``date``,
            ``url`` and ``snippet`` are present only when the matching element
            was found. Page-level failures are printed and yield whatever was
            collected so far (possibly an empty list).
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")
        # Build search query
        query = f'"{company_name}" OR "{ticker}" (stock OR shares OR earnings)'
        encoded_query = quote(query)
        # Limit to last 12 months
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"
        news_articles = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)
                # Extract news results
                news_items = await page.query_selector_all('div[data-sokoban-container]')
                if not news_items:
                    # Try alternative selectors
                    news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')
                print(f" Found {len(news_items)} potential news items")
                for item in news_items[:max_results]:
                    try:
                        article = {}
                        await self._set_text(item, 'div[role="heading"], h3, .mCBkyc', article, 'title')
                        await self._set_text(item, '.CEMjEf, .NUnG9d span', article, 'source')
                        await self._set_text(item, '.OSrXXb, time', article, 'date')
                        link_elem = await item.query_selector('a')
                        if link_elem:
                            article['url'] = await link_elem.get_attribute('href')
                        await self._set_text(item, '.GI74Re, .Y3v8qd', article, 'snippet')
                        # Results without a title are not usable downstream.
                        if article.get('title'):
                            news_articles.append(article)
                    except Exception as e:
                        # Best effort: one bad result must not abort the page,
                        # but report it instead of dropping it silently.
                        print(f" ⚠️ Skipping unreadable news item: {e}")
                        continue
                print(f"✅ Extracted {len(news_articles)} news articles")
            except Exception as e:
                print(f"❌ Error scraping Google News: {e}")
            finally:
                await browser.close()
        return news_articles

    async def _scrape_press_release_site(self, ticker, *, source, search_url, base_url,
                                         item_selector, title_selector,
                                         date_selector, summary_selector):
        """Scrape one press-release search page; shared by the site scrapers.

        Args:
            ticker: Ticker symbol, used only in progress messages.
            source: Site label stored in each record and used in messages.
            search_url: Fully built search URL to load.
            base_url: Scheme and host prepended to hrefs that start with ``/``.
            item_selector: CSS selector matching one result container.
            title_selector: CSS selector for the title inside a container.
            date_selector: CSS selector for the date inside a container.
            summary_selector: CSS selector for the summary inside a container.

        Returns:
            A list of dicts with ``source`` and ``title``; ``date``, ``url``
            and ``summary`` are present only when the element was found.
        """
        print(f"\n🔍 Searching {source} for {ticker}...")
        press_releases = []
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await page.goto(search_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)
                # Find press release items
                pr_items = await page.query_selector_all(item_selector)
                print(f" Found {len(pr_items)} press releases")
                for item in pr_items:
                    try:
                        pr = {'source': source}
                        await self._set_text(item, title_selector, pr, 'title')
                        await self._set_text(item, date_selector, pr, 'date')
                        link_elem = await item.query_selector('a')
                        if link_elem:
                            href = await link_elem.get_attribute('href')
                            # An <a> without href yields None; guard it so the
                            # release is kept instead of being discarded by an
                            # AttributeError.
                            if href and href.startswith('/'):
                                href = f"{base_url}{href}"
                            pr['url'] = href
                        await self._set_text(item, summary_selector, pr, 'summary')
                        if pr.get('title'):
                            press_releases.append(pr)
                    except Exception as e:
                        # Best effort: report and move on to the next item.
                        print(f" ⚠️ Skipping unreadable {source} item: {e}")
                        continue
                print(f"✅ Extracted {len(press_releases)} press releases")
            except Exception as e:
                print(f"❌ Error scraping {source}: {e}")
            finally:
                await browser.close()
        return press_releases

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire keyword-search results for *ticker*.

        Args:
            company_name: Accepted for signature parity; not used in the search.
            ticker: Ticker symbol used as the search keyword.

        Returns:
            A list of press-release dicts with ``source`` set to
            ``'GlobeNewswire'``.
        """
        return await self._scrape_press_release_site(
            ticker,
            source='GlobeNewswire',
            search_url=f"https://www.globenewswire.com/search/keyword/{quote(ticker)}",
            base_url="https://www.globenewswire.com",
            item_selector='.article-item, .result-item, article',
            title_selector='h3, h2, .title a',
            date_selector='time, .date',
            summary_selector='p, .summary',
        )

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca search results for *ticker*.

        Args:
            company_name: Accepted for signature parity; not used in the search.
            ticker: Ticker symbol used as the search query.

        Returns:
            A list of press-release dicts with ``source`` set to
            ``'Newswire.ca'``.
        """
        return await self._scrape_press_release_site(
            ticker,
            source='Newswire.ca',
            search_url=f"https://www.newswire.ca/search/?query={quote(ticker)}",
            base_url="https://www.newswire.ca",
            item_selector='.release-card, .news-item, article',
            title_selector='h3, h2, a.title',
            date_selector='time, .date, .timestamp',
            summary_selector='p, .summary, .description',
        )

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape news and press releases for one stock and save them as JSON.

        Args:
            ticker: Ticker symbol; also used to build the output file name.
            company_name: Company name passed to the individual scrapers.

        Returns:
            The dict written to disk, with keys ``ticker``, ``company_name``,
            ``scraped_at``, ``news_articles`` and ``press_releases``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")
        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }
        # Scrape Google News
        all_data['news_articles'] = await self.scrape_google_news(company_name, ticker)
        # Small delay between requests
        await asyncio.sleep(3)
        # Scrape GlobeNewswire
        all_data['press_releases'].extend(
            await self.scrape_press_releases_globenewswire(company_name, ticker)
        )
        # Small delay
        await asyncio.sleep(3)
        # Scrape Newswire.ca
        all_data['press_releases'].extend(
            await self.scrape_press_releases_newswire(company_name, ticker)
        )
        # Save to file. Path separators inside a ticker would point into a
        # sub-directory that does not exist and lose the scraped data, so they
        # are replaced; ordinary tickers keep the exact same file name.
        safe_ticker = str(ticker).replace('/', '_').replace('\\', '_')
        output_file = f"{self.output_dir}/{safe_ticker}_news_pr.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)
        print(f"\n📊 Summary for {ticker}:")
        print(f" News articles: {len(all_data['news_articles'])}")
        print(f" Press releases: {len(all_data['press_releases'])}")
        print(f" Saved to: {output_file}")
        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and press releases for several stocks in sequence.

        Args:
            stock_list: Sequence of dicts read via ``symbol`` and ``name`` keys.
            max_stocks: Optional cap on how many leading entries to process.
                ``None`` means no cap; ``0`` processes nothing.

        Returns:
            A list with one result dict per scraped stock. Entries without a
            ``symbol`` are skipped with a warning; a missing ``name`` falls
            back to the ticker.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)
        # Compare against None so an explicit 0 is honoured as "no stocks".
        if max_stocks is not None:
            stock_list = stock_list[:max_stocks]
        all_data = []
        for stock in stock_list:
            ticker = stock.get('symbol')
            if not ticker:
                # The scrapers URL-encode the ticker and cannot run without it.
                print(f"⚠️ Skipping listing without a symbol: {stock}")
                continue
            company_name = stock.get('name') or ticker
            if all_data:
                # Rate limiting - be respectful. Pause between stocks only,
                # not after the final one.
                await asyncio.sleep(5)
            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)
        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)
        return all_data
async def main():
    """Smoke-test the scraper on the first three stocks of the combined listings.

    Prints a hint and returns without scraping when the listings file is
    missing.
    """
    listings_path = "data/listings/all_listings_combined.json"
    if os.path.exists(listings_path):
        # Load listings
        with open(listings_path, 'r', encoding='utf-8') as handle:
            stocks = json.load(handle)
        print(f"📊 Found {len(stocks)} stocks in listings")
        # Test with first 3 stocks
        news_scraper = NewsPressScraper()
        await news_scraper.scrape_multiple_stocks(stocks, max_stocks=3)
        return
    print(f"❌ No listings file found at {listings_path}")
    print(" Run extract_listings.py first")
# Script entry point: run the async smoke test only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())