# scrape_serpapi.py

"""
Use SerpAPI for robust news and press release scraping
Fallback option when direct scraping fails
"""

import requests
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any
import time

from config import SERPAPI_KEY


class SerpAPINewsScraper:
    """Fetch company news and press releases through SerpAPI and save them as JSON.

    Each company is covered by two searches: one with the ``google_news``
    engine and one with the regular ``google`` engine restricted to a list of
    press-release sites. The combined result is written to
    ``<output_dir>/<ticker>_serpapi.json``.

    All network failures are reported on stdout and turned into empty result
    lists, so a single failing request never aborts a multi-stock run.
    """

    # Endpoint of SerpAPI's Account API, used to read the remaining quota.
    ACCOUNT_URL = "https://serpapi.com/account"

    # Seconds to wait for SerpAPI before giving up. Without a timeout,
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, output_dir="data/serpapi_news"):
        """Store the API key and make sure the output directory exists.

        Args:
            output_dir: Directory that receives one JSON file per ticker.
                It is created if it does not exist yet.
        """
        self.api_key = SERPAPI_KEY
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.base_url = "https://serpapi.com/search.json"

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _get_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET ``url`` and return the decoded JSON object.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx HTTP status.
            ValueError: If the body is not JSON or is not a JSON object.
        """
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError(
                f"unexpected response type from SerpAPI: {type(payload).__name__}"
            )
        return payload

    @staticmethod
    def _tbs_for_days(days_back: int) -> str:
        """Translate a look-back window in days into a Google ``tbs`` filter.

        Only the coarse ``qdr`` buckets (day, week, month, year) are used, so
        the smallest bucket that still covers ``days_back`` is returned.
        ``None`` and anything longer than a month map to the past year, which
        was the previously hard-coded value.
        """
        if days_back is None:
            return 'qdr:y'
        if days_back <= 1:
            return 'qdr:d'
        if days_back <= 7:
            return 'qdr:w'
        if days_back <= 31:
            return 'qdr:m'
        return 'qdr:y'

    @staticmethod
    def _result_list(data: Dict[str, Any], key: str) -> List[Dict]:
        """Return ``data[key]`` as a list of dicts, dropping malformed entries."""
        results = data.get(key) or []
        if not isinstance(results, list):
            return []
        return [result for result in results if isinstance(result, dict)]

    @staticmethod
    def _source_name(source: Any) -> Any:
        """Extract a publisher name from a ``source`` value.

        The value is normally a dict with a ``name`` key, but a plain string
        is accepted as well. Anything else yields ``None``.
        """
        if isinstance(source, dict):
            return source.get('name')
        if isinstance(source, str):
            return source
        return None

    @staticmethod
    def _news_article(result: Dict[str, Any], scraped_at: str) -> Dict[str, Any]:
        """Map one ``news_results`` entry to the stored article record."""
        return {
            'title': result.get('title'),
            'link': result.get('link'),
            'source': SerpAPINewsScraper._source_name(result.get('source')),
            'date': result.get('date'),
            'snippet': result.get('snippet'),
            'thumbnail': result.get('thumbnail'),
            'scraped_via': 'SerpAPI',
            'scraped_at': scraped_at,
        }

    @staticmethod
    def _press_release(result: Dict[str, Any], scraped_at: str) -> Dict[str, Any]:
        """Map one ``organic_results`` entry to the stored press-release record."""
        return {
            'title': result.get('title'),
            'link': result.get('link'),
            'snippet': result.get('snippet'),
            'displayed_link': result.get('displayed_link'),
            'date': result.get('date'),
            'scraped_via': 'SerpAPI',
            'scraped_at': scraped_at,
        }

    @staticmethod
    def _quoted_terms(company_name: Any, ticker: Any) -> str:
        """Build the ``"name" OR "ticker"`` part of a query.

        Missing values are left out, so the literal string ``"None"`` is
        never searched for.
        """
        return ' OR '.join(f'"{term}"' for term in (company_name, ticker) if term)

    def _output_path(self, ticker: Any) -> str:
        """Return the JSON file path for ``ticker`` inside ``output_dir``.

        Path separators in the ticker (e.g. ``BRK/B``) are replaced so the
        file cannot end up in a non-existent subdirectory.
        """
        safe_ticker = str(ticker).replace('/', '_').replace('\\', '_')
        return os.path.join(self.output_dir, f"{safe_ticker}_serpapi.json")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def search_google_news(self, query: str, days_back: int = 365) -> List[Dict]:
        """Search Google News using SerpAPI.

        Args:
            query: Search expression passed as ``q``.
            days_back: Look-back window in days. It is converted to the
                nearest coarse ``tbs`` bucket (day/week/month/year).
                NOTE(review): confirm in the SerpAPI docs that the
                ``google_news`` engine honours ``tbs``.

        Returns:
            One dict per article with the keys ``title``, ``link``,
            ``source``, ``date``, ``snippet``, ``thumbnail``, ``scraped_via``
            and ``scraped_at``. An empty list if the request fails.
        """
        print(f"   Searching Google News via SerpAPI: {query}...")

        params = {
            'api_key': self.api_key,
            'engine': 'google_news',
            'q': query,
            'gl': 'us',  # Country
            'hl': 'en',  # Language
            'tbs': self._tbs_for_days(days_back),
        }

        try:
            data = self._get_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f"      Error searching Google News: {e}")
            return []

        # One timestamp for the whole batch keeps the records consistent.
        scraped_at = datetime.now().isoformat()
        articles = [
            self._news_article(result, scraped_at)
            for result in self._result_list(data, 'news_results')
        ]

        print(f"      Found {len(articles)} articles")
        return articles

    def search_google_with_site_filter(self, query: str, sites: List[str]) -> List[Dict]:
        """Search specific sites for press releases.

        Args:
            query: Search expression; it is combined with the site filter.
            sites: Host names that are OR-ed together as ``site:`` operators.

        Returns:
            One dict per organic result with the keys ``title``, ``link``,
            ``snippet``, ``displayed_link``, ``date``, ``scraped_via`` and
            ``scraped_at``. An empty list if the request fails.
        """
        print(f"   Searching press release sites via SerpAPI...")

        # Build site filter query
        site_filter = " OR ".join(f"site:{site}" for site in sites)
        full_query = f"{query} ({site_filter})"

        params = {
            'api_key': self.api_key,
            'engine': 'google',
            'q': full_query,
            'tbs': 'qdr:y',  # Last year
            'num': 50,  # Number of results
        }

        try:
            data = self._get_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f"      Error searching press releases: {e}")
            return []

        scraped_at = datetime.now().isoformat()
        press_releases = [
            self._press_release(result, scraped_at)
            for result in self._result_list(data, 'organic_results')
        ]

        print(f"      Found {len(press_releases)} press releases")
        return press_releases

    def get_company_news_and_pr(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Get comprehensive news and PR for a company and save it to disk.

        Args:
            ticker: Stock symbol; also used for the output file name.
            company_name: Company name used in the search queries.

        Returns:
            A dict with ``ticker``, ``company_name``, ``scraped_at``,
            ``news_articles`` and ``press_releases``. The same dict is written
            to ``<output_dir>/<ticker>_serpapi.json``.
        """
        print(f"\n🔍 Fetching news & PR via SerpAPI for {ticker} - {company_name}")

        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        search_terms = self._quoted_terms(company_name, ticker)

        # Search Google News
        news_query = f'{search_terms} stock earnings financial'
        news_articles = self.search_google_news(news_query)
        data['news_articles'] = news_articles

        time.sleep(2)  # Rate limiting

        # Search press release sites
        pr_sites = [
            'globenewswire.com',
            'prnewswire.com',
            'newswire.ca',
            'businesswire.com',
            'stockhouse.com'
        ]

        press_releases = self.search_google_with_site_filter(search_terms, pr_sites)
        data['press_releases'] = press_releases

        # Save to file. The file is UTF-8, so keep non-ASCII text readable
        # instead of escaping it.
        output_file = self._output_path(ticker)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        print(f"✅ Saved SerpAPI data: {len(news_articles)} news, {len(press_releases)} PR")

        return data

    def scrape_multiple_stocks(self, stock_list: List[Dict], max_stocks: int = None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Dicts with a ``symbol`` and a ``name`` key.
            max_stocks: If truthy, only the first ``max_stocks`` entries are
                processed. ``None`` and ``0`` both mean "no limit".

        Returns:
            The list of per-company dicts produced by
            ``get_company_news_and_pr``, in input order. Entries without a
            symbol are skipped, because a search for them would only waste
            API credits.
        """
        print("=" * 70)
        print("SERPAPI NEWS & PRESS RELEASE SCRAPER")
        print("=" * 70)

        if max_stocks:
            stock_list = stock_list[:max_stocks]

        all_data = []

        for stock in stock_list:
            ticker = stock.get('symbol')
            company_name = stock.get('name')

            if not ticker:
                print(f"   Skipping entry without a symbol: {stock!r}")
                continue

            # Pause only between companies, not after the last one.
            if all_data:
                time.sleep(3)  # Rate limiting for API

            all_data.append(self.get_company_news_and_pr(ticker, company_name))

        print(f"\n✅ Completed scraping {len(all_data)} stocks via SerpAPI")
        return all_data

    def check_api_credits(self):
        """Check remaining SerpAPI credits.

        Queries SerpAPI's Account API rather than running a test search, so
        the check itself does not use up a search credit and can report the
        remaining quota.

        Returns:
            ``True`` if the account information could be fetched, ``False``
            otherwise (the error is printed).
        """
        try:
            account = self._get_json(self.ACCOUNT_URL, {'api_key': self.api_key})
        except (requests.RequestException, ValueError) as e:
            print(f"Error checking API status: {e}")
            return False

        # ``.get`` keeps the report working if a field is absent.
        print("\nSerpAPI Status:")
        print(f"  Plan: {account.get('plan_name')}")
        print(f"  Searches left: {account.get('total_searches_left')}")
        print(f"  Used this month: {account.get('this_month_usage')}")

        return True


def main():
    """Smoke-test the scraper: report the API status, then scrape one sample ticker."""
    news_scraper = SerpAPINewsScraper()

    # Report the API status first; the result is informational only
    news_scraper.check_api_credits()

    # A single well-known ticker is enough for a smoke test
    sample_stocks = [dict(symbol='AAPL', name='Apple Inc.')]
    news_scraper.scrape_multiple_stocks(sample_stocks, max_stocks=1)


# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Initial commit: Stock Intelligence Automation System 2025-11-06 12:22:19 +01:00			`"""`
			`Use SerpAPI for robust news and press release scraping`
			`Fallback option when direct scraping fails`
			`"""`

			`import requests`
			`import json`
			`import os`
			`from datetime import datetime, timedelta`
			`from typing import Dict, List, Any`
			`import time`

			`from config import SERPAPI_KEY`


			`class SerpAPINewsScraper:`
			`def __init__(self, output_dir="data/serpapi_news"):`
			`self.api_key = SERPAPI_KEY`
			`self.output_dir = output_dir`
			`os.makedirs(output_dir, exist_ok=True)`
			`self.base_url = "https://serpapi.com/search.json"`

			`def search_google_news(self, query: str, days_back: int = 365) -> List[Dict]:`
			`"""Search Google News using SerpAPI"""`
			`print(f" Searching Google News via SerpAPI: {query}...")`

			`params = {`
			`'api_key': self.api_key,`
			`'engine': 'google_news',`
			`'q': query,`
			`'gl': 'us', # Country`
			`'hl': 'en', # Language`
			`'tbs': f'qdr:y' # Last year`
			`}`

			`try:`
			`response = requests.get(self.base_url, params=params)`
			`response.raise_for_status()`

			`data = response.json()`

			`news_results = data.get('news_results', [])`

			`articles = []`
			`for result in news_results:`
			`articles.append({`
			`'title': result.get('title'),`
			`'link': result.get('link'),`
			`'source': result.get('source', {}).get('name'),`
			`'date': result.get('date'),`
			`'snippet': result.get('snippet'),`
			`'thumbnail': result.get('thumbnail'),`
			`'scraped_via': 'SerpAPI',`
			`'scraped_at': datetime.now().isoformat()`
			`})`

			`print(f" Found {len(articles)} articles")`
			`return articles`

			`except Exception as e:`
			`print(f" Error searching Google News: {e}")`
			`return []`

			`def search_google_with_site_filter(self, query: str, sites: List[str]) -> List[Dict]:`
			`"""Search specific sites for press releases"""`
			`print(f" Searching press release sites via SerpAPI...")`

			`# Build site filter query`
			`site_filter = " OR ".join([f"site:{site}" for site in sites])`
			`full_query = f"{query} ({site_filter})"`

			`params = {`
			`'api_key': self.api_key,`
			`'engine': 'google',`
			`'q': full_query,`
			`'tbs': 'qdr:y', # Last year`
			`'num': 50 # Number of results`
			`}`

			`try:`
			`response = requests.get(self.base_url, params=params)`
			`response.raise_for_status()`

			`data = response.json()`

			`organic_results = data.get('organic_results', [])`

			`press_releases = []`
			`for result in organic_results:`
			`press_releases.append({`
			`'title': result.get('title'),`
			`'link': result.get('link'),`
			`'snippet': result.get('snippet'),`
			`'displayed_link': result.get('displayed_link'),`
			`'date': result.get('date'),`
			`'scraped_via': 'SerpAPI',`
			`'scraped_at': datetime.now().isoformat()`
			`})`

			`print(f" Found {len(press_releases)} press releases")`
			`return press_releases`

			`except Exception as e:`
			`print(f" Error searching press releases: {e}")`
			`return []`

			`def get_company_news_and_pr(self, ticker: str, company_name: str) -> Dict[str, Any]:`
			`"""Get comprehensive news and PR for a company"""`
			`print(f"\n🔍 Fetching news & PR via SerpAPI for {ticker} - {company_name}")`

			`data = {`
			`'ticker': ticker,`
			`'company_name': company_name,`
			`'scraped_at': datetime.now().isoformat(),`
			`'news_articles': [],`
			`'press_releases': []`
			`}`

			`# Search Google News`
			`news_query = f'"{company_name}" OR "{ticker}" stock earnings financial'`
			`news_articles = self.search_google_news(news_query)`
			`data['news_articles'] = news_articles`

			`time.sleep(2) # Rate limiting`

			`# Search press release sites`
			`pr_query = f'"{company_name}" OR "{ticker}"'`
			`pr_sites = [`
			`'globenewswire.com',`
			`'prnewswire.com',`
			`'newswire.ca',`
			`'businesswire.com',`
			`'stockhouse.com'`
			`]`

			`press_releases = self.search_google_with_site_filter(pr_query, pr_sites)`
			`data['press_releases'] = press_releases`

			`# Save to file`
			`output_file = f"{self.output_dir}/{ticker}_serpapi.json"`
			`with open(output_file, 'w', encoding='utf-8') as f:`
			`json.dump(data, f, indent=2)`

			`print(f"✅ Saved SerpAPI data: {len(news_articles)} news, {len(press_releases)} PR")`

			`return data`

			`def scrape_multiple_stocks(self, stock_list: List[Dict], max_stocks: int = None):`
			`"""Scrape news and PR for multiple stocks"""`
			`print("=" * 70)`
			`print("SERPAPI NEWS & PRESS RELEASE SCRAPER")`
			`print("=" * 70)`

			`if max_stocks:`
			`stock_list = stock_list[:max_stocks]`

			`all_data = []`

			`for stock in stock_list:`
			`ticker = stock.get('symbol')`
			`company_name = stock.get('name')`

			`data = self.get_company_news_and_pr(ticker, company_name)`
			`all_data.append(data)`

			`time.sleep(3) # Rate limiting for API`

			`print(f"\n✅ Completed scraping {len(all_data)} stocks via SerpAPI")`
			`return all_data`

			`def check_api_credits(self):`
			`"""Check remaining SerpAPI credits"""`
			`params = {`
			`'api_key': self.api_key,`
			`'engine': 'google',`
			`'q': 'test'`
			`}`

			`try:`
			`response = requests.get(self.base_url, params=params)`
			`response.raise_for_status()`

			`data = response.json()`
			`search_metadata = data.get('search_metadata', {})`

			`print("\nSerpAPI Status:")`
			`print(f" Status: {search_metadata.get('status')}")`
			`print(f" Total time: {search_metadata.get('total_time')}s")`

			`# Note: Credit info might not be directly available in response`
			`# Check SerpAPI dashboard for actual credit count`

			`return True`
			`except Exception as e:`
			`print(f"Error checking API status: {e}")`
			`return False`


			`def main():`
			`"""Test SerpAPI scraper"""`
			`scraper = SerpAPINewsScraper()`

			`# Check API status`
			`scraper.check_api_credits()`

			`# Test with a sample stock`
			`test_stocks = [`
			`{'symbol': 'AAPL', 'name': 'Apple Inc.'},`
			`]`

			`scraper.scrape_multiple_stocks(test_stocks, max_stocks=1)`


			`if __name__ == "__main__":`
			`main()`