# scrape_news_pr.py

"""
Scrape news and press releases without API keys
Uses Google search results and direct source scraping
"""

import asyncio
import json
import os
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import time
import re
from urllib.parse import quote


class NewsPressScraper:
    """Scrape news articles and press releases for stocks with headless Chromium.

    Each public ``scrape_*`` coroutine launches its own Playwright browser,
    extracts a list of plain dicts, and returns it.  Per-ticker results are
    written as JSON files under ``output_dir``.
    """

    def __init__(self, output_dir="data/news"):
        """Store the output directory and create it if it does not exist."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _absolute_url(href, base_url):
        """Prefix a root-relative ``href`` with ``base_url``.

        Anything that does not start with ``/`` (including ``None`` and the
        empty string) is returned unchanged.
        """
        if href and href.startswith('/'):
            return f"{base_url}{href}"
        return href

    def _output_path(self, ticker):
        """Return the JSON output path for ``ticker``.

        Path separators inside the ticker (e.g. ``BRK/B``) are replaced with
        ``_`` so the file always lands directly inside ``output_dir``.
        """
        safe_ticker = str(ticker)
        for separator in (os.sep, os.altsep):
            if separator:
                safe_ticker = safe_ticker.replace(separator, "_")
        return f"{self.output_dir}/{safe_ticker}_news_pr.json"

    @staticmethod
    async def _text_of(item, selector):
        """Return the inner text of the first ``selector`` match under ``item``.

        Returns ``None`` when nothing matches.
        """
        element = await item.query_selector(selector)
        if element:
            return await element.inner_text()
        return None

    async def _parse_news_item(self, item):
        """Build an article dict from one Google News result element.

        Keys are only present when the corresponding element was found.
        """
        article = {}

        text_fields = (
            ('title', 'div[role="heading"], h3, .mCBkyc'),
            ('source', '.CEMjEf, .NUnG9d span'),
            ('date', '.OSrXXb, time'),
        )
        for key, selector in text_fields:
            text = await self._text_of(item, selector)
            if text is not None:
                article[key] = text

        link_elem = await item.query_selector('a')
        if link_elem:
            article['url'] = await link_elem.get_attribute('href')

        snippet = await self._text_of(item, '.GI74Re, .Y3v8qd')
        if snippet is not None:
            article['snippet'] = snippet

        return article

    async def _parse_press_release(self, item, source, base_url, selectors):
        """Build a press-release dict from one search-result element.

        Args:
            item: Element handle for a single result.
            source: Value stored under the ``'source'`` key.
            base_url: Site root used to absolutise root-relative links.
            selectors: Mapping with ``'title'``, ``'date'`` and ``'summary'``
                CSS selectors.
        """
        pr = {'source': source}

        for key in ('title', 'date'):
            text = await self._text_of(item, selectors[key])
            if text is not None:
                pr[key] = text

        link_elem = await item.query_selector('a')
        if link_elem:
            href = await link_elem.get_attribute('href')
            # An <a> without href yields None; keep the release, just
            # without a URL, instead of failing on ``None.startswith``.
            if href is not None:
                pr['url'] = self._absolute_url(href, base_url)

        summary = await self._text_of(item, selectors['summary'])
        if summary is not None:
            pr['summary'] = summary

        return pr

    async def _scrape_press_release_site(self, source, search_url, base_url,
                                         item_selector, selectors):
        """Load ``search_url`` and extract every titled press release on it.

        Page-level failures are reported and produce an empty (or partial)
        list rather than raising; item-level failures skip only that item.
        """
        press_releases = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                await page.goto(search_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                pr_items = await page.query_selector_all(item_selector)
                print(f"   Found {len(pr_items)} press releases")

                for item in pr_items:
                    try:
                        pr = await self._parse_press_release(
                            item, source, base_url, selectors
                        )
                    except Exception as e:
                        # Best-effort: one malformed item must not abort the page.
                        print(f"   ⚠️ Skipped one {source} item: {e}")
                        continue

                    if pr.get('title'):
                        press_releases.append(pr)

                print(f"✅ Extracted {len(press_releases)} press releases")

            except Exception as e:
                print(f"❌ Error scraping {source}: {e}")

            finally:
                await browser.close()

        return press_releases

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Args:
            company_name: Company name, searched as an exact phrase.
            ticker: Ticker symbol, searched as an exact phrase.
            max_results: Maximum number of result elements to inspect.

        Returns:
            List of article dicts (``title`` always present; ``source``,
            ``date``, ``url`` and ``snippet`` when found).  Empty on failure.
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")

        # Build search query
        query = f'"{company_name}" OR "{ticker}" (stock OR shares OR earnings)'
        encoded_query = quote(query)

        # Limit to last 12 months
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"

        news_articles = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                news_items = await page.query_selector_all('div[data-sokoban-container]')
                if not news_items:
                    # Try alternative selectors
                    news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')

                print(f"   Found {len(news_items)} potential news items")

                for item in news_items[:max_results]:
                    try:
                        article = await self._parse_news_item(item)
                    except Exception as e:
                        # Best-effort: one malformed item must not abort the page.
                        print(f"   ⚠️ Skipped one news item: {e}")
                        continue

                    if article.get('title'):
                        news_articles.append(article)

                print(f"✅ Extracted {len(news_articles)} news articles")

            except Exception as e:
                print(f"❌ Error scraping Google News: {e}")

            finally:
                await browser.close()

        return news_articles

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire keyword search results for ``ticker``.

        ``company_name`` is accepted for signature parity with the other
        scrapers but is not used in the search.

        Returns:
            List of press-release dicts tagged ``'source': 'GlobeNewswire'``.
        """
        print(f"\n🔍 Searching GlobeNewswire for {ticker}...")

        return await self._scrape_press_release_site(
            source='GlobeNewswire',
            search_url=f"https://www.globenewswire.com/search/keyword/{quote(ticker)}",
            base_url="https://www.globenewswire.com",
            item_selector='.article-item, .result-item, article',
            selectors={
                'title': 'h3, h2, .title a',
                'date': 'time, .date',
                'summary': 'p, .summary',
            },
        )

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca search results for ``ticker``.

        ``company_name`` is accepted for signature parity with the other
        scrapers but is not used in the search.

        Returns:
            List of press-release dicts tagged ``'source': 'Newswire.ca'``.
        """
        print(f"\n🔍 Searching Newswire.ca for {ticker}...")

        return await self._scrape_press_release_site(
            source='Newswire.ca',
            search_url=f"https://www.newswire.ca/search/?query={quote(ticker)}",
            base_url="https://www.newswire.ca",
            item_selector='.release-card, .news-item, article',
            selectors={
                'title': 'h3, h2, a.title',
                'date': 'time, .date, .timestamp',
                'summary': 'p, .summary, .description',
            },
        )

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape news and press releases for one stock and save them as JSON.

        Returns:
            Dict with ``ticker``, ``company_name``, ``scraped_at``,
            ``news_articles`` and ``press_releases``; the same dict is
            written to ``<output_dir>/<ticker>_news_pr.json``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")

        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        # Scrape Google News
        all_data['news_articles'] = await self.scrape_google_news(company_name, ticker)

        # Small delay between requests
        await asyncio.sleep(3)

        # Scrape GlobeNewswire
        pr_gnw = await self.scrape_press_releases_globenewswire(company_name, ticker)
        all_data['press_releases'].extend(pr_gnw)

        # Small delay
        await asyncio.sleep(3)

        # Scrape Newswire.ca
        pr_nw = await self.scrape_press_releases_newswire(company_name, ticker)
        all_data['press_releases'].extend(pr_nw)

        # Save to file
        output_file = self._output_path(ticker)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)

        print(f"\n📊 Summary for {ticker}:")
        print(f"   News articles: {len(all_data['news_articles'])}")
        print(f"   Press releases: {len(all_data['press_releases'])}")
        print(f"   Saved to: {output_file}")

        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and PR for multiple stocks, one after another.

        Args:
            stock_list: Sequence of dicts; ``'symbol'`` is required and
                ``'name'`` is optional (falls back to the symbol).
            max_stocks: If truthy, only the first ``max_stocks`` entries
                are processed.

        Returns:
            List of the per-stock dicts from ``scrape_stock_news_and_pr``.
            Entries without a symbol are reported and skipped.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []

        for stock in stock_list:
            ticker = stock.get('symbol')
            if not ticker:
                # Without a symbol the search URLs cannot be built.
                print(f"⚠️ Skipping listing without a symbol: {stock}")
                continue
            company_name = stock.get('name') or ticker

            # Rate limiting - be respectful (only needed between stocks)
            if all_data:
                await asyncio.sleep(5)

            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)

        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data


async def main():
    """Smoke-test the scraper against the first three entries of the listings file.

    Prints a hint and returns without scraping when the listings file is absent.
    """
    listings_file = "data/listings/all_listings_combined.json"

    # Guard clause: nothing to do until the listings step has produced its output.
    listings_missing = not os.path.exists(listings_file)
    if listings_missing:
        print(f"❌ No listings file found at {listings_file}")
        print("   Run extract_listings.py first")
        return

    with open(listings_file, 'r', encoding='utf-8') as handle:
        listings = json.load(handle)

    stock_count = len(listings)
    print(f"📊 Found {stock_count} stocks in listings")

    # Keep the trial run small: only the first 3 stocks are scraped.
    await NewsPressScraper().scrape_multiple_stocks(listings, max_stocks=3)


# Script entry point: run the async main() on a fresh event loop when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
Initial commit: Stock Intelligence Automation System 2025-11-06 12:22:19 +01:00			`"""`
			`Scrape news and press releases without API keys`
			`Uses Google search results and direct source scraping`
			`"""`

			`import asyncio`
			`import json`
			`import os`
			`from datetime import datetime, timedelta`
			`from playwright.async_api import async_playwright`
			`import time`
			`import re`
			`from urllib.parse import quote`


			`class NewsPressScraper:`
			`def __init__(self, output_dir="data/news"):`
			`self.output_dir = output_dir`
			`os.makedirs(output_dir, exist_ok=True)`

			`async def scrape_google_news(self, company_name, ticker, max_results=20):`
			`"""Scrape Google News results for a stock"""`
			`print(f"\n🔍 Searching news for {company_name} ({ticker})...")`

			`# Build search query`
			`query = f'"{company_name}" OR "{ticker}" (stock OR shares OR earnings)'`
			`encoded_query = quote(query)`

			`# Limit to last 12 months`
			`url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"`

			`news_articles = []`

			`async with async_playwright() as p:`
			`browser = await p.chromium.launch(headless=True)`
			`page = await browser.new_page()`

			`try:`
			`await page.goto(url, wait_until='networkidle', timeout=30000)`
			`await asyncio.sleep(2)`

			`# Extract news results`
			`news_items = await page.query_selector_all('div[data-sokoban-container]')`

			`if not news_items:`
			`# Try alternative selectors`
			`news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')`

			`print(f" Found {len(news_items)} potential news items")`

			`for item in news_items[:max_results]:`
			`try:`
			`article = {}`

			`# Get title`
			`title_elem = await item.query_selector('div[role="heading"], h3, .mCBkyc')`
			`if title_elem:`
			`article['title'] = await title_elem.inner_text()`

			`# Get source`
			`source_elem = await item.query_selector('.CEMjEf, .NUnG9d span')`
			`if source_elem:`
			`article['source'] = await source_elem.inner_text()`

			`# Get date`
			`date_elem = await item.query_selector('.OSrXXb, time')`
			`if date_elem:`
			`article['date'] = await date_elem.inner_text()`

			`# Get link`
			`link_elem = await item.query_selector('a')`
			`if link_elem:`
			`article['url'] = await link_elem.get_attribute('href')`

			`# Get snippet`
			`snippet_elem = await item.query_selector('.GI74Re, .Y3v8qd')`
			`if snippet_elem:`
			`article['snippet'] = await snippet_elem.inner_text()`

			`if article.get('title'):`
			`news_articles.append(article)`

			`except Exception as e:`
			`continue`

			`print(f"✅ Extracted {len(news_articles)} news articles")`

			`except Exception as e:`
			`print(f"❌ Error scraping Google News: {e}")`

			`finally:`
			`await browser.close()`

			`return news_articles`

			`async def scrape_press_releases_globenewswire(self, company_name, ticker):`
			`"""Scrape GlobeNewswire for press releases"""`
			`print(f"\n🔍 Searching GlobeNewswire for {ticker}...")`

			`search_url = f"https://www.globenewswire.com/search/keyword/{quote(ticker)}"`

			`press_releases = []`

			`async with async_playwright() as p:`
			`browser = await p.chromium.launch(headless=True)`
			`page = await browser.new_page()`

			`try:`
			`await page.goto(search_url, wait_until='networkidle', timeout=30000)`
			`await asyncio.sleep(2)`

			`# Find press release items`
			`pr_items = await page.query_selector_all('.article-item, .result-item, article')`

			`print(f" Found {len(pr_items)} press releases")`

			`for item in pr_items:`
			`try:`
			`pr = {`
			`'source': 'GlobeNewswire'`
			`}`

			`# Get title`
			`title_elem = await item.query_selector('h3, h2, .title a')`
			`if title_elem:`
			`pr['title'] = await title_elem.inner_text()`

			`# Get date`
			`date_elem = await item.query_selector('time, .date')`
			`if date_elem:`
			`pr['date'] = await date_elem.inner_text()`

			`# Get link`
			`link_elem = await item.query_selector('a')`
			`if link_elem:`
			`href = await link_elem.get_attribute('href')`
			`if href.startswith('/'):`
			`href = f"https://www.globenewswire.com{href}"`
			`pr['url'] = href`

			`# Get summary`
			`summary_elem = await item.query_selector('p, .summary')`
			`if summary_elem:`
			`pr['summary'] = await summary_elem.inner_text()`

			`if pr.get('title'):`
			`press_releases.append(pr)`

			`except Exception as e:`
			`continue`

			`print(f"✅ Extracted {len(press_releases)} press releases")`

			`except Exception as e:`
			`print(f"❌ Error scraping GlobeNewswire: {e}")`

			`finally:`
			`await browser.close()`

			`return press_releases`

			`async def scrape_press_releases_newswire(self, company_name, ticker):`
			`"""Scrape Newswire.ca for press releases"""`
			`print(f"\n🔍 Searching Newswire.ca for {ticker}...")`

			`search_url = f"https://www.newswire.ca/search/?query={quote(ticker)}"`

			`press_releases = []`

			`async with async_playwright() as p:`
			`browser = await p.chromium.launch(headless=True)`
			`page = await browser.new_page()`

			`try:`
			`await page.goto(search_url, wait_until='networkidle', timeout=30000)`
			`await asyncio.sleep(2)`

			`# Find press release items`
			`pr_items = await page.query_selector_all('.release-card, .news-item, article')`

			`print(f" Found {len(pr_items)} press releases")`

			`for item in pr_items:`
			`try:`
			`pr = {`
			`'source': 'Newswire.ca'`
			`}`

			`# Get title`
			`title_elem = await item.query_selector('h3, h2, a.title')`
			`if title_elem:`
			`pr['title'] = await title_elem.inner_text()`

			`# Get date`
			`date_elem = await item.query_selector('time, .date, .timestamp')`
			`if date_elem:`
			`pr['date'] = await date_elem.inner_text()`

			`# Get link`
			`link_elem = await item.query_selector('a')`
			`if link_elem:`
			`href = await link_elem.get_attribute('href')`
			`if href.startswith('/'):`
			`href = f"https://www.newswire.ca{href}"`
			`pr['url'] = href`

			`# Get summary`
			`summary_elem = await item.query_selector('p, .summary, .description')`
			`if summary_elem:`
			`pr['summary'] = await summary_elem.inner_text()`

			`if pr.get('title'):`
			`press_releases.append(pr)`

			`except Exception as e:`
			`continue`

			`print(f"✅ Extracted {len(press_releases)} press releases")`

			`except Exception as e:`
			`print(f"❌ Error scraping Newswire.ca: {e}")`

			`finally:`
			`await browser.close()`

			`return press_releases`

			`async def scrape_stock_news_and_pr(self, ticker, company_name):`
			`"""Scrape both news and press releases for a stock"""`
			`print(f"\n{'='*60}")`
			`print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")`
			`print(f"{'='*60}")`

			`all_data = {`
			`'ticker': ticker,`
			`'company_name': company_name,`
			`'scraped_at': datetime.now().isoformat(),`
			`'news_articles': [],`
			`'press_releases': []`
			`}`

			`# Scrape Google News`
			`news = await self.scrape_google_news(company_name, ticker)`
			`all_data['news_articles'] = news`

			`# Small delay between requests`
			`await asyncio.sleep(3)`

			`# Scrape GlobeNewswire`
			`pr_gnw = await self.scrape_press_releases_globenewswire(company_name, ticker)`
			`all_data['press_releases'].extend(pr_gnw)`

			`# Small delay`
			`await asyncio.sleep(3)`

			`# Scrape Newswire.ca`
			`pr_nw = await self.scrape_press_releases_newswire(company_name, ticker)`
			`all_data['press_releases'].extend(pr_nw)`

			`# Save to file`
			`output_file = f"{self.output_dir}/{ticker}_news_pr.json"`
			`with open(output_file, 'w', encoding='utf-8') as f:`
			`json.dump(all_data, f, indent=2)`

			`print(f"\n📊 Summary for {ticker}:")`
			`print(f" News articles: {len(all_data['news_articles'])}")`
			`print(f" Press releases: {len(all_data['press_releases'])}")`
			`print(f" Saved to: {output_file}")`

			`return all_data`

			`async def scrape_multiple_stocks(self, stock_list, max_stocks=None):`
			`"""Scrape news and PR for multiple stocks"""`
			`print("=" * 60)`
			`print("NEWS & PRESS RELEASE SCRAPING")`
			`print("=" * 60)`

			`if max_stocks:`
			`stock_list = stock_list[:max_stocks]`

			`all_data = []`

			`for stock in stock_list:`
			`ticker = stock.get('symbol')`
			`company_name = stock.get('name')`

			`data = await self.scrape_stock_news_and_pr(ticker, company_name)`
			`all_data.append(data)`

			`# Rate limiting - be respectful`
			`await asyncio.sleep(5)`

			`print("\n" + "=" * 60)`
			`print(f"✅ Completed scraping for {len(all_data)} stocks")`
			`print(f"📁 Data saved to: {self.output_dir}/")`
			`print("=" * 60)`

			`return all_data`


			`async def main():`
			`"""Test the scraper"""`

			`# Load listings`
			`listings_file = "data/listings/all_listings_combined.json"`

			`if not os.path.exists(listings_file):`
			`print(f"❌ No listings file found at {listings_file}")`
			`print(" Run extract_listings.py first")`
			`return`

			`with open(listings_file, 'r', encoding='utf-8') as f:`
			`listings = json.load(f)`

			`print(f"📊 Found {len(listings)} stocks in listings")`

			`# Test with first 3 stocks`
			`scraper = NewsPressScraper()`
			`await scraper.scrape_multiple_stocks(listings, max_stocks=3)`


			`if __name__ == "__main__":`
			`asyncio.run(main())`