"""
Use SerpAPI for robust news and press release scraping.

Fallback option when direct scraping fails.
"""

import requests
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import time

from config import SERPAPI_KEY


class SerpAPINewsScraper:
    """Fetch company news and press releases through the SerpAPI HTTP API.

    Results for each company are written as JSON files into ``output_dir``,
    which is created on construction if it does not exist.
    """

    # Seconds to wait on any single SerpAPI request before giving up, so a
    # stalled connection cannot hang a whole scraping run.
    REQUEST_TIMEOUT = 30

    # Account endpoint used to read remaining credits.
    ACCOUNT_URL = "https://serpapi.com/account.json"

    def __init__(self, output_dir="data/serpapi_news"):
        """Store the API key and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the ``<ticker>_serpapi.json``
                files.
        """
        self.api_key = SERPAPI_KEY
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.base_url = "https://serpapi.com/search.json"

    def _get_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET ``url`` with ``params`` and return the decoded JSON object.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx HTTP status.
            ValueError: If the body is not valid JSON or is not a JSON object.
        """
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError(
                f"unexpected response type: {type(payload).__name__}"
            )
        return payload

    @staticmethod
    def _recency_filter(days_back: int) -> str:
        """Translate a look-back window in days into a Google ``tbs`` value.

        Only the coarse day/week/month/year windows are used, so the request
        is rounded up to the smallest window that covers ``days_back``.
        Anything longer than a month maps to the past year, which is what
        this scraper always requested before.
        """
        if days_back <= 1:
            return 'qdr:d'
        if days_back <= 7:
            return 'qdr:w'
        if days_back <= 31:
            return 'qdr:m'
        return 'qdr:y'

    def search_google_news(self, query: str, days_back: int = 365) -> List[Dict]:
        """Search Google News using SerpAPI.

        Args:
            query: Search query string.
            days_back: Desired look-back window in days; rounded up to the
                past day, week, month or year.

        Returns:
            One dict per article (title, link, source, date, snippet,
            thumbnail, plus scrape metadata). An empty list is returned when
            the request fails.
        """
        print(f" Searching Google News via SerpAPI: {query}...")

        params = {
            'api_key': self.api_key,
            'engine': 'google_news',
            'q': query,
            'gl': 'us',  # Country
            'hl': 'en',  # Language
            # NOTE(review): confirm the google_news engine honours `tbs`.
            'tbs': self._recency_filter(days_back),
        }

        try:
            payload = self._get_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f" Error searching Google News: {e}")
            return []

        scraped_at = datetime.now().isoformat()
        articles = []
        for result in payload.get('news_results') or []:
            if not isinstance(result, dict):
                continue
            # `source` is normally an object with a `name`, but tolerate a
            # missing value or a plain string instead of losing every result.
            source = result.get('source')
            source_name = source.get('name') if isinstance(source, dict) else source
            articles.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'source': source_name,
                'date': result.get('date'),
                'snippet': result.get('snippet'),
                'thumbnail': result.get('thumbnail'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at,
            })

        print(f" Found {len(articles)} articles")
        return articles

    def search_google_with_site_filter(self, query: str, sites: List[str]) -> List[Dict]:
        """Search specific sites for press releases.

        Args:
            query: Search query string.
            sites: Domains to restrict the search to. When empty, the query is
                sent without any site restriction.

        Returns:
            One dict per organic result (title, link, snippet, displayed
            link, date, plus scrape metadata). An empty list is returned when
            the request fails.
        """
        print(f" Searching press release sites via SerpAPI...")

        # Build the site filter; skip the parenthesised clause entirely when
        # there are no sites, otherwise the query would end in an empty "()".
        site_filter = " OR ".join(f"site:{site}" for site in (sites or []))
        full_query = f"{query} ({site_filter})" if site_filter else query

        params = {
            'api_key': self.api_key,
            'engine': 'google',
            'q': full_query,
            'tbs': 'qdr:y',  # Last year
            'num': 50,  # Number of results
        }

        try:
            payload = self._get_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f" Error searching press releases: {e}")
            return []

        scraped_at = datetime.now().isoformat()
        press_releases = [
            {
                'title': result.get('title'),
                'link': result.get('link'),
                'snippet': result.get('snippet'),
                'displayed_link': result.get('displayed_link'),
                'date': result.get('date'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at,
            }
            for result in payload.get('organic_results') or []
            if isinstance(result, dict)
        ]

        print(f" Found {len(press_releases)} press releases")
        return press_releases

    def get_company_news_and_pr(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Get comprehensive news and PR for a company and save it to disk.

        Runs one Google News search and one site-restricted Google search,
        then writes the combined result to ``<output_dir>/<ticker>_serpapi.json``.

        Args:
            ticker: Stock symbol; also used to build the output file name.
            company_name: Company name used in the search queries.

        Returns:
            Dict with ``ticker``, ``company_name``, ``scraped_at``,
            ``news_articles`` and ``press_releases``.
        """
        print(f"\nšŸ” Fetching news & PR via SerpAPI for {ticker} - {company_name}")

        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }

        # Search Google News
        news_query = f'"{company_name}" OR "{ticker}" stock earnings financial'
        news_articles = self.search_google_news(news_query)
        data['news_articles'] = news_articles

        time.sleep(2)  # Rate limiting

        # Search press release sites
        pr_query = f'"{company_name}" OR "{ticker}"'
        pr_sites = [
            'globenewswire.com',
            'prnewswire.com',
            'newswire.ca',
            'businesswire.com',
            'stockhouse.com'
        ]
        press_releases = self.search_google_with_site_filter(pr_query, pr_sites)
        data['press_releases'] = press_releases

        # Save to file. Path separators inside a ticker (e.g. "BRK/B") would
        # otherwise point into a sub-directory that does not exist.
        safe_ticker = str(ticker).replace('/', '_').replace(os.sep, '_')
        output_file = os.path.join(self.output_dir, f"{safe_ticker}_serpapi.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        print(f"āœ… Saved SerpAPI data: {len(news_articles)} news, {len(press_releases)} PR")

        return data

    def scrape_multiple_stocks(self, stock_list: List[Dict], max_stocks: Optional[int] = None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Dicts with a ``symbol`` key and, optionally, a
                ``name`` key.
            max_stocks: Process at most this many entries from the front of
                ``stock_list``. ``None`` means no limit; ``0`` processes
                nothing.

        Returns:
            List of the per-company dicts produced by
            ``get_company_news_and_pr``, in input order.
        """
        print("=" * 70)
        print("SERPAPI NEWS & PRESS RELEASE SCRAPER")
        print("=" * 70)

        # Compare against None so that an explicit 0 is respected as a limit.
        if max_stocks is not None:
            stock_list = stock_list[:max_stocks]

        all_data = []
        for index, stock in enumerate(stock_list):
            ticker = stock.get('symbol')
            if not ticker:
                # Without a symbol there is no usable query or file name.
                print(f" Skipping entry without a symbol: {stock}")
                continue
            # Fall back to the ticker so the query never contains "None".
            company_name = stock.get('name') or ticker

            all_data.append(self.get_company_news_and_pr(ticker, company_name))

            # Rate limiting for API; no need to wait after the final entry.
            if index < len(stock_list) - 1:
                time.sleep(3)

        print(f"\nāœ… Completed scraping {len(all_data)} stocks via SerpAPI")
        return all_data

    def check_api_credits(self):
        """Check remaining SerpAPI credits.

        Queries the SerpApi account endpoint rather than running a real
        search, so no search request is issued just to learn the balance.

        Returns:
            True when the account information was retrieved, False otherwise.
        """
        try:
            account = self._get_json(self.ACCOUNT_URL, {'api_key': self.api_key})
        except (requests.RequestException, ValueError) as e:
            print(f"Error checking API status: {e}")
            return False

        print("\nSerpAPI Status:")
        print(f" Plan: {account.get('plan_name')}")
        print(f" Searches left: {account.get('total_searches_left')}")
        print(f" Used this month: {account.get('this_month_usage')}")
        return True


def main():
    """Test SerpAPI scraper"""
    scraper = SerpAPINewsScraper()

    # Check API status
    scraper.check_api_credits()

    # Test with a sample stock
    test_stocks = [
        {'symbol': 'AAPL', 'name': 'Apple Inc.'},
    ]

    scraper.scrape_multiple_stocks(test_stocks, max_stocks=1)


if __name__ == "__main__":
    main()