"""
Scrape news and press releases without API keys
Uses Google search results and direct source scraping
"""

import asyncio
import json
import os
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import time
import re
from urllib.parse import quote


class NewsPressScraper:
    """Collect news articles and press releases for stocks via headless Chromium.

    Results for each stock are written to ``<output_dir>/<ticker>_news_pr.json``.
    """

    # Navigation timeout handed to ``page.goto`` (milliseconds).
    PAGE_TIMEOUT_MS = 30000
    # Extra pause after navigation before the DOM is queried (seconds).
    SETTLE_SECONDS = 2

    def __init__(self, output_dir="data/news"):
        """Remember *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    async def _inner_text(item, selector):
        """Return the inner text of the first match for *selector*, or None.

        None means no element matched; an element with empty text yields ''.
        """
        elem = await item.query_selector(selector)
        if elem is None:
            return None
        return await elem.inner_text()

    async def _parse_news_item(self, item):
        """Build an article dict from one Google News result element.

        Only the fields whose element is present are set.
        """
        article = {}

        # Get title
        title = await self._inner_text(item, 'div[role="heading"], h3, .mCBkyc')
        if title is not None:
            article['title'] = title

        # Get source
        source = await self._inner_text(item, '.CEMjEf, .NUnG9d span')
        if source is not None:
            article['source'] = source

        # Get date
        date = await self._inner_text(item, '.OSrXXb, time')
        if date is not None:
            article['date'] = date

        # Get link
        link_elem = await item.query_selector('a')
        if link_elem:
            article['url'] = await link_elem.get_attribute('href')

        # Get snippet
        snippet = await self._inner_text(item, '.GI74Re, .Y3v8qd')
        if snippet is not None:
            article['snippet'] = snippet

        return article

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Args:
            company_name: Company name, searched as an exact phrase.
            ticker: Ticker symbol, searched as an exact phrase.
            max_results: Maximum number of result elements to parse.

        Returns:
            A list of article dicts (``title`` plus, when present,
            ``source``, ``date``, ``url``, ``snippet``). Empty on failure;
            errors are printed, not raised.
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")

        # Build search query
        query = f'"{company_name}" OR "{ticker}" (stock OR shares OR earnings)'
        encoded_query = quote(query)

        # Limit to last 12 months
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"

        news_articles = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(url, wait_until='networkidle', timeout=self.PAGE_TIMEOUT_MS)
                await asyncio.sleep(self.SETTLE_SECONDS)

                # Extract news results
                news_items = await page.query_selector_all('div[data-sokoban-container]')
                if not news_items:
                    # Try alternative selectors
                    news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')

                print(f" Found {len(news_items)} potential news items")

                for item in news_items[:max_results]:
                    try:
                        article = await self._parse_news_item(item)
                    except Exception as e:
                        # Best effort: one bad element must not abort the page,
                        # but report it instead of dropping it silently.
                        print(f" Skipping unparseable news item: {e}")
                        continue
                    if article.get('title'):
                        news_articles.append(article)

                print(f"✅ Extracted {len(news_articles)} news articles")
            except Exception as e:
                print(f"❌ Error scraping Google News: {e}")
            finally:
                await browser.close()

        return news_articles

    async def _parse_press_release(self, item, source_name, base_url,
                                   title_selector, date_selector, summary_selector):
        """Build a press-release dict from one search-result element.

        Site-relative links (starting with '/') are prefixed with *base_url*.
        A missing or empty ``href`` leaves ``url`` unset instead of raising.
        """
        pr = {'source': source_name}

        # Get title
        title = await self._inner_text(item, title_selector)
        if title is not None:
            pr['title'] = title

        # Get date
        date = await self._inner_text(item, date_selector)
        if date is not None:
            pr['date'] = date

        # Get link
        link_elem = await item.query_selector('a')
        if link_elem:
            href = await link_elem.get_attribute('href')
            # get_attribute returns None for an <a> without href.
            if href:
                if href.startswith('/'):
                    href = f"{base_url}{href}"
                pr['url'] = href

        # Get summary
        summary = await self._inner_text(item, summary_selector)
        if summary is not None:
            pr['summary'] = summary

        return pr

    async def _scrape_press_release_site(self, source_name, search_url, base_url,
                                         item_selector, title_selector,
                                         date_selector, summary_selector):
        """Load *search_url* and parse every element matching *item_selector*.

        Returns the list of press-release dicts that have a non-empty title.
        Errors are printed, not raised; the browser is always closed.
        """
        press_releases = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                page = await browser.new_page()
                await page.goto(search_url, wait_until='networkidle', timeout=self.PAGE_TIMEOUT_MS)
                await asyncio.sleep(self.SETTLE_SECONDS)

                # Find press release items
                pr_items = await page.query_selector_all(item_selector)
                print(f" Found {len(pr_items)} press releases")

                for item in pr_items:
                    try:
                        pr = await self._parse_press_release(
                            item, source_name, base_url,
                            title_selector, date_selector, summary_selector,
                        )
                    except Exception as e:
                        # Best effort: report and move on to the next item.
                        print(f" Skipping unparseable press release: {e}")
                        continue
                    if pr.get('title'):
                        press_releases.append(pr)

                print(f"✅ Extracted {len(press_releases)} press releases")
            except Exception as e:
                print(f"❌ Error scraping {source_name}: {e}")
            finally:
                await browser.close()

        return press_releases

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire for press releases.

        Args:
            company_name: Unused; kept for a signature parallel to the
                other scrapers.
            ticker: Ticker symbol used as the search keyword.

        Returns:
            A list of press-release dicts with ``source`` set to
            ``'GlobeNewswire'``. Empty on failure.
        """
        print(f"\n🔍 Searching GlobeNewswire for {ticker}...")
        search_url = f"https://www.globenewswire.com/search/keyword/{quote(ticker)}"
        return await self._scrape_press_release_site(
            source_name='GlobeNewswire',
            search_url=search_url,
            base_url="https://www.globenewswire.com",
            item_selector='.article-item, .result-item, article',
            title_selector='h3, h2, .title a',
            date_selector='time, .date',
            summary_selector='p, .summary',
        )

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca for press releases.

        Args:
            company_name: Unused; kept for a signature parallel to the
                other scrapers.
            ticker: Ticker symbol used as the search query.

        Returns:
            A list of press-release dicts with ``source`` set to
            ``'Newswire.ca'``. Empty on failure.
        """
        print(f"\n🔍 Searching Newswire.ca for {ticker}...")
        search_url = f"https://www.newswire.ca/search/?query={quote(ticker)}"
        return await self._scrape_press_release_site(
            source_name='Newswire.ca',
            search_url=search_url,
            base_url="https://www.newswire.ca",
            item_selector='.release-card, .news-item, article',
            title_selector='h3, h2, a.title',
            date_selector='time, .date, .timestamp',
            summary_selector='p, .summary, .description',
        )

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape both news and press releases for a stock.

        Runs the Google News, GlobeNewswire and Newswire.ca scrapers in
        sequence with a 3 second pause between them, then writes the
        combined result to ``<output_dir>/<ticker>_news_pr.json``.

        Returns:
            Dict with keys ``ticker``, ``company_name``, ``scraped_at``,
            ``news_articles`` and ``press_releases``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")

        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        # Scrape Google News
        news = await self.scrape_google_news(company_name, ticker)
        all_data['news_articles'] = news

        # Small delay between requests
        await asyncio.sleep(3)

        # Scrape GlobeNewswire
        pr_gnw = await self.scrape_press_releases_globenewswire(company_name, ticker)
        all_data['press_releases'].extend(pr_gnw)

        # Small delay
        await asyncio.sleep(3)

        # Scrape Newswire.ca
        pr_nw = await self.scrape_press_releases_newswire(company_name, ticker)
        all_data['press_releases'].extend(pr_nw)

        # Save to file
        output_file = f"{self.output_dir}/{ticker}_news_pr.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)

        print(f"\n📊 Summary for {ticker}:")
        print(f" News articles: {len(all_data['news_articles'])}")
        print(f" Press releases: {len(all_data['press_releases'])}")
        print(f" Saved to: {output_file}")

        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Sequence of dicts with a ``symbol`` key and,
                optionally, a ``name`` key.
            max_stocks: If truthy, only the first ``max_stocks`` entries
                are processed.

        Returns:
            List of the per-stock result dicts. Entries without a symbol
            are skipped, since the symbol drives both the search URLs and
            the output filename.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        last_index = len(stock_list) - 1

        for index, stock in enumerate(stock_list):
            ticker = stock.get('symbol')
            if not ticker:
                print(f" Skipping listing without a symbol: {stock}")
                continue

            # Without a name the query would search for the literal "None".
            company_name = stock.get('name') or ticker

            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)

            # Rate limiting - be respectful (no need to wait after the last one)
            if index < last_index:
                await asyncio.sleep(5)

        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data


async def main():
    """Test the scraper against the first three stocks in the listings file."""
    # Load listings
    listings_file = "data/listings/all_listings_combined.json"

    if not os.path.exists(listings_file):
        print(f"❌ No listings file found at {listings_file}")
        print(" Run extract_listings.py first")
        return

    with open(listings_file, 'r', encoding='utf-8') as f:
        listings = json.load(f)

    print(f"📊 Found {len(listings)} stocks in listings")

    # Test with first 3 stocks
    scraper = NewsPressScraper()
    await scraper.scrape_multiple_stocks(listings, max_stocks=3)


if __name__ == "__main__":
    asyncio.run(main())