"""
Scrape news and press releases without API keys
Uses Google search results and direct source scraping
"""

import asyncio
import json
import os
from datetime import datetime, timedelta
from playwright.async_api import async_playwright
import time
import re
from urllib.parse import quote


class NewsPressScraper:
    """Scrape news articles and press releases for stocks with Playwright.

    Three sources are queried per stock: Google search (news results),
    GlobeNewswire and Newswire.ca. The combined result for each ticker is
    written to ``<output_dir>/<ticker>_news_pr.json``.
    """

    def __init__(self, output_dir="data/news"):
        """Remember *output_dir* and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Small pure helpers (no browser needed)
    # ------------------------------------------------------------------

    @staticmethod
    def _build_news_query(company_name, ticker):
        """Return the Google query string for a stock.

        Missing/empty terms are left out so a listing without a name does
        not end up searching for the literal string ``"None"``.
        """
        quoted_terms = [f'"{term}"' for term in (company_name, ticker) if term]
        parts = [" OR ".join(quoted_terms), "(stock OR shares OR earnings)"]
        return " ".join(part for part in parts if part)

    @staticmethod
    def _absolute_url(href, base_url):
        """Return *href* as an absolute URL, or ``None`` when it is missing.

        Site-relative links (``/path``) are prefixed with *base_url*;
        protocol-relative links (``//host/path``) get an ``https:`` scheme;
        anything else is returned unchanged.
        """
        if not href:
            return None
        if href.startswith('//'):
            return f"https:{href}"
        if href.startswith('/'):
            return f"{base_url}{href}"
        return href

    def _output_path(self, ticker):
        """Return the JSON output path for *ticker*.

        Path separators inside the ticker (e.g. ``BRK/B``) are replaced with
        ``_`` so the file always lands directly inside ``output_dir``.
        """
        safe_ticker = str(ticker)
        for separator in (os.sep, os.altsep):
            if separator:
                safe_ticker = safe_ticker.replace(separator, "_")
        return f"{self.output_dir}/{safe_ticker}_news_pr.json"

    @staticmethod
    async def _inner_text(item, selector):
        """Return the inner text of the first match of *selector* in *item*.

        Returns ``None`` when nothing matches.
        """
        element = await item.query_selector(selector)
        if element is None:
            return None
        return await element.inner_text()

    # ------------------------------------------------------------------
    # Google News
    # ------------------------------------------------------------------

    async def _parse_google_news_item(self, item):
        """Build an article dict from one Google result element.

        Returns ``None`` when the element has no (non-empty) title. Optional
        keys (``source``, ``date``, ``url``, ``snippet``) are only present
        when the corresponding element was found.
        """
        title = await self._inner_text(item, 'div[role="heading"], h3, .mCBkyc')
        if not title:
            # Without a title the record is discarded anyway; skip the rest.
            return None

        article = {'title': title}

        source = await self._inner_text(item, '.CEMjEf, .NUnG9d span')
        if source is not None:
            article['source'] = source

        date = await self._inner_text(item, '.OSrXXb, time')
        if date is not None:
            article['date'] = date

        link_elem = await item.query_selector('a')
        if link_elem is not None:
            article['url'] = await link_elem.get_attribute('href')

        snippet = await self._inner_text(item, '.GI74Re, .Y3v8qd')
        if snippet is not None:
            article['snippet'] = snippet

        return article

    async def scrape_google_news(self, company_name, ticker, max_results=20):
        """Scrape Google News results for a stock.

        Args:
            company_name: Company name used as a quoted search term.
            ticker: Ticker symbol used as a quoted search term.
            max_results: Maximum number of result elements to inspect.

        Returns:
            A list of article dicts (always containing ``title``). Errors
            while loading or parsing the page are printed and result in a
            shorter (possibly empty) list rather than an exception.
        """
        print(f"\n🔍 Searching news for {company_name} ({ticker})...")

        encoded_query = quote(self._build_news_query(company_name, ticker))

        # Limit to last 12 months
        url = f"https://www.google.com/search?q={encoded_query}&tbm=nws&tbs=qdr:y"

        news_articles = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                await page.goto(url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                news_items = await page.query_selector_all('div[data-sokoban-container]')
                if not news_items:
                    # Fall back to alternative result-container selectors.
                    news_items = await page.query_selector_all('div.SoaBEf, div.Gx5Zad')

                print(f"   Found {len(news_items)} potential news items")

                for item in news_items[:max_results]:
                    try:
                        article = await self._parse_google_news_item(item)
                    except Exception as e:
                        # Best effort: one broken item must not lose the rest,
                        # but say so instead of failing silently.
                        print(f"   ⚠️ Skipping news item: {e}")
                        continue
                    if article is not None:
                        news_articles.append(article)

                print(f"✅ Extracted {len(news_articles)} news articles")

            except Exception as e:
                print(f"❌ Error scraping Google News: {e}")

            finally:
                await browser.close()

        return news_articles

    # ------------------------------------------------------------------
    # Press-release sites
    # ------------------------------------------------------------------

    async def _parse_press_release_item(self, item, site_name, base_url, selectors):
        """Build a press-release dict from one result element.

        Returns ``None`` when the element has no (non-empty) title. The
        ``url`` key is only set when the first anchor actually carries an
        ``href``; a missing ``href`` no longer discards the whole record.
        """
        title = await self._inner_text(item, selectors['title'])
        if not title:
            return None

        pr = {'source': site_name, 'title': title}

        date = await self._inner_text(item, selectors['date'])
        if date is not None:
            pr['date'] = date

        link_elem = await item.query_selector('a')
        if link_elem is not None:
            # get_attribute() yields None when the anchor has no href.
            href = self._absolute_url(await link_elem.get_attribute('href'), base_url)
            if href is not None:
                pr['url'] = href

        summary = await self._inner_text(item, selectors['summary'])
        if summary is not None:
            pr['summary'] = summary

        return pr

    async def _scrape_press_release_site(self, site_name, search_url, base_url,
                                         ticker, selectors):
        """Load *search_url* and extract press releases from the result page.

        Args:
            site_name: Human-readable site name; used in log output and as
                the ``source`` value of every record.
            search_url: Fully built search URL for the ticker.
            base_url: Scheme + host used to absolutize relative links.
            ticker: Ticker symbol (for log output only).
            selectors: Dict with CSS selectors under the keys ``items``,
                ``title``, ``date`` and ``summary``.

        Returns:
            A list of press-release dicts. Page-level errors are printed and
            yield a shorter (possibly empty) list.
        """
        print(f"\n🔍 Searching {site_name} for {ticker}...")

        press_releases = []

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                await page.goto(search_url, wait_until='networkidle', timeout=30000)
                await asyncio.sleep(2)

                pr_items = await page.query_selector_all(selectors['items'])

                print(f"   Found {len(pr_items)} press releases")

                for item in pr_items:
                    try:
                        pr = await self._parse_press_release_item(
                            item, site_name, base_url, selectors
                        )
                    except Exception as e:
                        # Best effort per item, but report what was skipped.
                        print(f"   ⚠️ Skipping press release item: {e}")
                        continue
                    if pr is not None:
                        press_releases.append(pr)

                print(f"✅ Extracted {len(press_releases)} press releases")

            except Exception as e:
                print(f"❌ Error scraping {site_name}: {e}")

            finally:
                await browser.close()

        return press_releases

    async def scrape_press_releases_globenewswire(self, company_name, ticker):
        """Scrape GlobeNewswire for press releases matching *ticker*.

        *company_name* is accepted for signature parity with the other
        scrapers but is not used in the search.
        """
        return await self._scrape_press_release_site(
            site_name='GlobeNewswire',
            search_url=f"https://www.globenewswire.com/search/keyword/{quote(ticker)}",
            base_url="https://www.globenewswire.com",
            ticker=ticker,
            selectors={
                'items': '.article-item, .result-item, article',
                'title': 'h3, h2, .title a',
                'date': 'time, .date',
                'summary': 'p, .summary',
            },
        )

    async def scrape_press_releases_newswire(self, company_name, ticker):
        """Scrape Newswire.ca for press releases matching *ticker*.

        *company_name* is accepted for signature parity with the other
        scrapers but is not used in the search.
        """
        return await self._scrape_press_release_site(
            site_name='Newswire.ca',
            search_url=f"https://www.newswire.ca/search/?query={quote(ticker)}",
            base_url="https://www.newswire.ca",
            ticker=ticker,
            selectors={
                'items': '.release-card, .news-item, article',
                'title': 'h3, h2, a.title',
                'date': 'time, .date, .timestamp',
                'summary': 'p, .summary, .description',
            },
        )

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    async def scrape_stock_news_and_pr(self, ticker, company_name):
        """Scrape both news and press releases for a stock and save them.

        The three sources are queried one after another with a 3 second
        pause in between. The combined result is written as JSON into
        ``output_dir`` and also returned.

        Returns:
            Dict with keys ``ticker``, ``company_name``, ``scraped_at``
            (local ISO timestamp), ``news_articles`` and ``press_releases``.
        """
        print(f"\n{'='*60}")
        print(f"SCRAPING NEWS & PR FOR: {ticker} - {company_name}")
        print(f"{'='*60}")

        all_data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        all_data['news_articles'] = await self.scrape_google_news(company_name, ticker)

        # Small delay between requests
        await asyncio.sleep(3)

        all_data['press_releases'].extend(
            await self.scrape_press_releases_globenewswire(company_name, ticker)
        )

        await asyncio.sleep(3)

        all_data['press_releases'].extend(
            await self.scrape_press_releases_newswire(company_name, ticker)
        )

        output_file = self._output_path(ticker)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2)

        print(f"\n📊 Summary for {ticker}:")
        print(f"   News articles: {len(all_data['news_articles'])}")
        print(f"   Press releases: {len(all_data['press_releases'])}")
        print(f"   Saved to: {output_file}")

        return all_data

    async def scrape_multiple_stocks(self, stock_list, max_stocks=None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Sequence of dicts; ``symbol`` is read as the ticker
                and ``name`` as the company name.
            max_stocks: When truthy, only the first *max_stocks* entries are
                processed. ``None`` (and 0) mean "no limit".

        Returns:
            List with one result dict per scraped stock. Entries without a
            ``symbol`` are skipped with a warning instead of aborting the
            whole run.
        """
        print("=" * 60)
        print("NEWS & PRESS RELEASE SCRAPING")
        print("=" * 60)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []

        for stock in stock_list:
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                print(f"⚠️ Skipping listing without a symbol: {company_name}")
                continue

            # Rate limiting - be respectful. Pause between stocks only, so
            # there is no pointless wait before the first or after the last.
            if all_data:
                await asyncio.sleep(5)

            data = await self.scrape_stock_news_and_pr(ticker, company_name)
            all_data.append(data)

        print("\n" + "=" * 60)
        print(f"✅ Completed scraping for {len(all_data)} stocks")
        print(f"📁 Data saved to: {self.output_dir}/")
        print("=" * 60)

        return all_data


async def main():
    """Run a small smoke test of the scraper.

    Reads the combined listings JSON and scrapes news and press releases
    for the first three entries. If the listings file is missing, a hint is
    printed and nothing is scraped.
    """
    source_path = "data/listings/all_listings_combined.json"

    if os.path.exists(source_path):
        with open(source_path, 'r', encoding='utf-8') as handle:
            stocks = json.load(handle)

        print(f"📊 Found {len(stocks)} stocks in listings")

        # Only the first 3 stocks are scraped in this test run.
        await NewsPressScraper().scrape_multiple_stocks(stocks, max_stocks=3)
    else:
        print(f"❌ No listings file found at {source_path}")
        print("   Run extract_listings.py first")


# Script entry point: run the async test driver only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())