# microcap_scrapping/scrape_serpapi.py

"""
Use SerpAPI for robust news and press release scraping
Fallback option when direct scraping fails
"""

import requests
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any
import time

from config import SERPAPI_KEY


class SerpAPINewsScraper:
    """Fetch news articles and press releases for stocks through SerpAPI.

    Results for each ticker are written as JSON into ``output_dir``.
    The search methods are best-effort: network or decoding failures are
    reported on stdout and produce an empty result list instead of raising.
    """

    # Account endpoint; per SerpAPI's Account API docs it does not consume
    # a search credit (unlike a real search request).
    ACCOUNT_URL = "https://serpapi.com/account.json"

    def __init__(self, output_dir="data/serpapi_news", timeout=30):
        """Create the scraper and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the ``<ticker>_serpapi.json`` files.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.api_key = SERPAPI_KEY
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.base_url = "https://serpapi.com/search.json"
        self.timeout = timeout

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _recency_filter(days_back) -> str:
        """Map a look-back window in days onto Google's coarse ``tbs`` buckets.

        Windows longer than a month (including the default of 365 days, or
        ``None``) map to ``qdr:y``, which is what was always sent previously.
        """
        if days_back is None:
            return 'qdr:y'
        if days_back <= 1:
            return 'qdr:d'
        if days_back <= 7:
            return 'qdr:w'
        if days_back <= 31:
            return 'qdr:m'
        return 'qdr:y'

    @staticmethod
    def _build_site_query(query: str, sites) -> str:
        """Append a ``(site:a OR site:b)`` restriction to ``query``.

        With no usable sites the query is returned unchanged rather than
        gaining an empty ``()`` group.
        """
        site_filter = " OR ".join(f"site:{site}" for site in (sites or []) if site)
        if not site_filter:
            return query
        return f"{query} ({site_filter})"

    @staticmethod
    def _quoted_terms(*terms) -> str:
        """Join the non-empty terms as ``"a" OR "b"``, skipping missing ones."""
        return " OR ".join(f'"{term}"' for term in terms if term)

    @staticmethod
    def _safe_filename(ticker) -> str:
        """Return ``ticker`` with path separators replaced so it stays one file name."""
        name = str(ticker)
        for separator in ('/', '\\'):
            name = name.replace(separator, '_')
        return name

    @staticmethod
    def _parse_news_results(news_results) -> List[Dict]:
        """Convert raw ``news_results`` entries into the stored article dicts."""
        scraped_at = datetime.now().isoformat()
        articles = []
        for result in news_results or []:
            if not isinstance(result, dict):
                continue
            # 'source' is normally a dict with a 'name'; tolerate a plain
            # string (or a missing value) instead of failing the whole batch.
            source = result.get('source')
            source_name = source.get('name') if isinstance(source, dict) else source
            articles.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'source': source_name,
                'date': result.get('date'),
                'snippet': result.get('snippet'),
                'thumbnail': result.get('thumbnail'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at
            })
        return articles

    @staticmethod
    def _parse_organic_results(organic_results) -> List[Dict]:
        """Convert raw ``organic_results`` entries into the stored press-release dicts."""
        scraped_at = datetime.now().isoformat()
        return [
            {
                'title': result.get('title'),
                'link': result.get('link'),
                'snippet': result.get('snippet'),
                'displayed_link': result.get('displayed_link'),
                'date': result.get('date'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at
            }
            for result in organic_results or []
            if isinstance(result, dict)
        ]

    def _redact(self, message) -> str:
        """Return ``message`` as text with the API key masked.

        ``requests`` exception messages contain the full request URL, which
        carries ``api_key`` in its query string; printing them verbatim would
        leak the secret into console output and logs.
        """
        text = str(message)
        key = getattr(self, 'api_key', None)
        if key:
            text = text.replace(str(key), '***')
        return text

    def _fetch(self, params: Dict[str, Any], url: str = None) -> Dict[str, Any]:
        """GET ``url`` (default: the search endpoint) and return the decoded JSON dict.

        Raises:
            requests.RequestException: on connection problems, timeouts or
                non-2xx responses.
            ValueError: if the body is not valid JSON.
        """
        response = requests.get(
            url or self.base_url,
            params=params,
            timeout=getattr(self, 'timeout', 30),
        )
        response.raise_for_status()
        payload = response.json()
        return payload if isinstance(payload, dict) else {}

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def search_google_news(self, query: str, days_back: int = 365) -> List[Dict]:
        """Search Google News using SerpAPI.

        Args:
            query: Search string sent as ``q``.
            days_back: Desired look-back window in days; translated to the
                nearest coarse bucket (day / week / month / year).

        Returns:
            A list of article dicts, or ``[]`` if the request failed.
        """
        print(f"   Searching Google News via SerpAPI: {query}...")

        params = {
            'api_key': self.api_key,
            'engine': 'google_news',
            'q': query,
            'gl': 'us',  # Country
            'hl': 'en',  # Language
            # NOTE(review): confirm the google_news engine honors 'tbs'.
            'tbs': self._recency_filter(days_back)
        }

        try:
            data = self._fetch(params)
        except (requests.RequestException, ValueError) as e:
            print(f"      Error searching Google News: {self._redact(e)}")
            return []

        # SerpAPI can report a problem inside a successful response body.
        if data.get('error'):
            print(f"      SerpAPI reported: {self._redact(data['error'])}")

        articles = self._parse_news_results(data.get('news_results', []))
        print(f"      Found {len(articles)} articles")
        return articles

    def search_google_with_site_filter(self, query: str, sites: List[str]) -> List[Dict]:
        """Search specific sites for press releases.

        Args:
            query: Search string; the site restriction is appended to it.
            sites: Domains to restrict the search to (``site:`` filters).

        Returns:
            A list of press-release dicts, or ``[]`` if the request failed.
        """
        print(f"   Searching press release sites via SerpAPI...")

        params = {
            'api_key': self.api_key,
            'engine': 'google',
            'q': self._build_site_query(query, sites),
            'tbs': 'qdr:y',  # Last year
            'num': 50  # Number of results
        }

        try:
            data = self._fetch(params)
        except (requests.RequestException, ValueError) as e:
            print(f"      Error searching press releases: {self._redact(e)}")
            return []

        if data.get('error'):
            print(f"      SerpAPI reported: {self._redact(data['error'])}")

        press_releases = self._parse_organic_results(data.get('organic_results', []))
        print(f"      Found {len(press_releases)} press releases")
        return press_releases

    def get_company_news_and_pr(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Get comprehensive news and PR for a company and save it as JSON.

        Args:
            ticker: Stock symbol; also used to name the output file.
            company_name: Company name used in the search queries.

        Returns:
            The dict that was written to ``<output_dir>/<ticker>_serpapi.json``.
        """
        print(f"\n🔍 Fetching news & PR via SerpAPI for {ticker} - {company_name}")

        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        # A missing name or ticker is dropped from the query instead of
        # being searched for as the literal text "None".
        company_terms = self._quoted_terms(company_name, ticker)

        # Search Google News
        news_query = f'{company_terms} stock earnings financial'
        news_articles = self.search_google_news(news_query)
        data['news_articles'] = news_articles

        time.sleep(2)  # Rate limiting

        # Search press release sites
        pr_sites = [
            'globenewswire.com',
            'prnewswire.com',
            'newswire.ca',
            'businesswire.com',
            'stockhouse.com'
        ]

        press_releases = self.search_google_with_site_filter(company_terms, pr_sites)
        data['press_releases'] = press_releases

        # Save to file
        output_file = os.path.join(
            self.output_dir, f"{self._safe_filename(ticker)}_serpapi.json"
        )
        with open(output_file, 'w', encoding='utf-8') as f:
            # The file is UTF-8, so keep non-ASCII headlines readable.
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"✅ Saved SerpAPI data: {len(news_articles)} news, {len(press_releases)} PR")

        return data

    def scrape_multiple_stocks(self, stock_list: List[Dict], max_stocks: int = None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Dicts with a ``'symbol'`` and a ``'name'`` key.
            max_stocks: If truthy, only the first ``max_stocks`` entries are
                processed; ``None`` or ``0`` processes the whole list.

        Returns:
            A list with one result dict per processed stock.
        """
        print("=" * 70)
        print("SERPAPI NEWS & PRESS RELEASE SCRAPER")
        print("=" * 70)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []
        last_index = len(stock_list) - 1

        for index, stock in enumerate(stock_list):
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            # Without a symbol there is nothing to name the output file after.
            if not ticker:
                print(f"   Skipping entry without a symbol: {stock}")
                continue

            data = self.get_company_news_and_pr(ticker, company_name)
            all_data.append(data)

            # Rate limiting for API; no need to wait after the final stock.
            if index < last_index:
                time.sleep(3)

        print(f"\n✅ Completed scraping {len(all_data)} stocks via SerpAPI")
        return all_data

    def check_api_credits(self):
        """Check remaining SerpAPI credits via the account endpoint.

        Queries the account endpoint rather than running a throw-away
        search, so the check reports the remaining quota.

        Returns:
            ``True`` if the account information was retrieved, else ``False``.
        """
        try:
            account = self._fetch({'api_key': self.api_key}, url=self.ACCOUNT_URL)
        except (requests.RequestException, ValueError) as e:
            print(f"Error checking API status: {self._redact(e)}")
            return False

        # Field names follow SerpAPI's Account API; absent ones print as None.
        print("\nSerpAPI Status:")
        print(f"  Plan: {account.get('plan_name')}")
        print(f"  Searches left: {account.get('total_searches_left')}")
        print(f"  Used this month: {account.get('this_month_usage')}")

        return True


def main():
    """Smoke-test the scraper: report API status, then scrape one sample ticker."""
    # Single well-known ticker used as the sample input.
    sample_stocks = [{'symbol': 'AAPL', 'name': 'Apple Inc.'}]

    news_scraper = SerpAPINewsScraper()

    # Report API status first, then run the scrape for the sample.
    news_scraper.check_api_credits()
    news_scraper.scrape_multiple_stocks(sample_stocks, max_stocks=1)


# Run the smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()