Files
microcap_scrapping/scrape_serpapi.py
T

216 lines
7.0 KiB
Python
Raw Normal View History

"""
Use SerpAPI for robust news and press release scraping
Fallback option when direct scraping fails
"""
import requests
import json
import os
from datetime import datetime, timedelta
from typing import Dict, List, Any
import time
from config import SERPAPI_KEY
class SerpAPINewsScraper:
    """Collect news articles and press releases for companies through SerpAPI.

    Each company's results are written to ``<output_dir>/<ticker>_serpapi.json``.
    Network and decoding failures are reported on stdout and turned into empty
    result lists, so a single failed lookup never aborts a batch run.
    """

    # Seconds to wait on SerpAPI before giving up. ``requests`` applies no
    # timeout by default, so without this a stalled connection blocks forever.
    REQUEST_TIMEOUT = 30

    # Account endpoint that reports plan usage and remaining searches.
    ACCOUNT_URL = "https://serpapi.com/account"

    def __init__(self, output_dir="data/serpapi_news"):
        """Store the API key and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the per-ticker JSON files.
        """
        self.api_key = SERPAPI_KEY
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.base_url = "https://serpapi.com/search.json"

    # ------------------------------------------------------------------
    # Pure helpers (no network access)
    # ------------------------------------------------------------------

    @staticmethod
    def _tbs_for_days(days_back: int) -> str:
        """Map a look-back window in days to Google's ``tbs`` recency filter.

        The window is rounded up to the next coarser bucket (day, week, month,
        year); anything longer than a month uses the one-year filter.
        """
        if days_back <= 1:
            return 'qdr:d'
        if days_back <= 7:
            return 'qdr:w'
        if days_back <= 31:
            return 'qdr:m'
        return 'qdr:y'

    @staticmethod
    def _build_site_query(query: str, sites: List[str]) -> str:
        """Append a ``(site:a OR site:b)`` restriction to *query*.

        With no sites the query is returned unchanged rather than with an
        empty ``()`` group appended.
        """
        if not sites:
            return query
        site_filter = " OR ".join(f"site:{site}" for site in sites)
        return f"{query} ({site_filter})"

    @staticmethod
    def _build_company_query(ticker, company_name) -> str:
        """Return ``"<name>" OR "<ticker>"``, omitting whichever part is missing."""
        return " OR ".join(f'"{term}"' for term in (company_name, ticker) if term)

    @staticmethod
    def _parse_news_results(news_results) -> List[Dict]:
        """Convert SerpAPI ``news_results`` entries into flat article dicts.

        Entries that are not dicts are skipped. ``source`` is accepted either
        as a dict carrying ``name`` or as a plain value, so one oddly shaped
        entry cannot discard the whole batch.
        """
        scraped_at = datetime.now().isoformat()
        articles = []
        for result in news_results or []:
            if not isinstance(result, dict):
                continue
            source = result.get('source')
            source_name = source.get('name') if isinstance(source, dict) else source
            articles.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'source': source_name,
                'date': result.get('date'),
                'snippet': result.get('snippet'),
                'thumbnail': result.get('thumbnail'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at,
            })
        return articles

    @staticmethod
    def _parse_organic_results(organic_results) -> List[Dict]:
        """Convert SerpAPI ``organic_results`` entries into press-release dicts."""
        scraped_at = datetime.now().isoformat()
        press_releases = []
        for result in organic_results or []:
            if not isinstance(result, dict):
                continue
            press_releases.append({
                'title': result.get('title'),
                'link': result.get('link'),
                'snippet': result.get('snippet'),
                'displayed_link': result.get('displayed_link'),
                'date': result.get('date'),
                'scraped_via': 'SerpAPI',
                'scraped_at': scraped_at,
            })
        return press_releases

    def _redact(self, message) -> str:
        """Return ``str(message)`` with the API key masked.

        ``requests`` error messages embed the full request URL, which includes
        the ``api_key`` query parameter; masking keeps it out of logs.
        """
        text = str(message)
        key = getattr(self, 'api_key', None)
        if key:
            text = text.replace(str(key), '***')
        return text

    # ------------------------------------------------------------------
    # Network access
    # ------------------------------------------------------------------

    def _fetch_json(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """GET *url* and return the decoded JSON object.

        Raises:
            requests.RequestException: On connection errors, timeouts or a
                non-2xx status.
            ValueError: If the body is not JSON or not a JSON object.
        """
        response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
        if not isinstance(payload, dict):
            raise ValueError("unexpected non-object JSON response")
        return payload

    def search_google_news(self, query: str, days_back: int = 365) -> List[Dict]:
        """Search Google News using SerpAPI.

        Args:
            query: Search expression sent as ``q``.
            days_back: Desired look-back window in days, rounded up to Google's
                day/week/month/year recency filter. The default keeps the
                one-year window.

        Returns:
            A list of article dicts; empty on any request or decoding failure.
        """
        print(f" Searching Google News via SerpAPI: {query}...")
        params = {
            'api_key': self.api_key,
            'engine': 'google_news',
            'q': query,
            'gl': 'us',  # Country
            'hl': 'en',  # Language
            # NOTE(review): confirm that SerpAPI's google_news engine honours tbs.
            'tbs': self._tbs_for_days(days_back),
        }
        try:
            data = self._fetch_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f" Error searching Google News: {self._redact(e)}")
            return []
        articles = self._parse_news_results(data.get('news_results'))
        print(f" Found {len(articles)} articles")
        return articles

    def search_google_with_site_filter(self, query: str, sites: List[str]) -> List[Dict]:
        """Search specific sites for press releases.

        Args:
            query: Search expression to restrict.
            sites: Domains to restrict the search to; an empty list leaves the
                query unrestricted.

        Returns:
            A list of press-release dicts; empty on any request or decoding
            failure.
        """
        print(f" Searching press release sites via SerpAPI...")
        params = {
            'api_key': self.api_key,
            'engine': 'google',
            'q': self._build_site_query(query, sites),
            'tbs': 'qdr:y',  # Last year
            'num': 50,  # Number of results
        }
        try:
            data = self._fetch_json(self.base_url, params)
        except (requests.RequestException, ValueError) as e:
            print(f" Error searching press releases: {self._redact(e)}")
            return []
        press_releases = self._parse_organic_results(data.get('organic_results'))
        print(f" Found {len(press_releases)} press releases")
        return press_releases

    def get_company_news_and_pr(self, ticker: str, company_name: str) -> Dict[str, Any]:
        """Get comprehensive news and PR for a company and save it as JSON.

        Args:
            ticker: Stock symbol; also used to name the output file.
            company_name: Company name used in the search queries.

        Returns:
            A dict with ``ticker``, ``company_name``, ``scraped_at``,
            ``news_articles`` and ``press_releases``.
        """
        print(f"\n🔍 Fetching news & PR via SerpAPI for {ticker} - {company_name}")
        data = {
            'ticker': ticker,
            'company_name': company_name,
            'scraped_at': datetime.now().isoformat(),
            'news_articles': [],
            'press_releases': []
        }
        company_query = self._build_company_query(ticker, company_name)

        # Search Google News
        news_articles = self.search_google_news(f'{company_query} stock earnings financial')
        data['news_articles'] = news_articles
        time.sleep(2)  # Rate limiting

        # Search press release sites
        pr_sites = [
            'globenewswire.com',
            'prnewswire.com',
            'newswire.ca',
            'businesswire.com',
            'stockhouse.com'
        ]
        press_releases = self.search_google_with_site_filter(company_query, pr_sites)
        data['press_releases'] = press_releases

        # Save to file. Path separators in a ticker (e.g. "BRK/B") would
        # otherwise point into a directory that does not exist.
        safe_ticker = str(ticker).replace('/', '_').replace('\\', '_')
        output_file = os.path.join(self.output_dir, f"{safe_ticker}_serpapi.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        print(f"✅ Saved SerpAPI data: {len(news_articles)} news, {len(press_releases)} PR")
        return data

    def scrape_multiple_stocks(self, stock_list: List[Dict], max_stocks: int = None):
        """Scrape news and PR for multiple stocks.

        Args:
            stock_list: Dicts carrying ``symbol`` and ``name`` keys.
            max_stocks: Upper bound on how many entries to process; ``None``
                means no limit and ``0`` processes nothing.

        Returns:
            The list of per-company result dicts, in input order. Entries
            without a ``symbol`` are skipped.
        """
        print("=" * 70)
        print("SERPAPI NEWS & PRESS RELEASE SCRAPER")
        print("=" * 70)
        # Compare against None so that an explicit 0 is respected as a limit.
        if max_stocks is not None:
            stock_list = stock_list[:max_stocks]
        all_data = []
        for stock in stock_list:
            ticker = stock.get('symbol')
            if not ticker:
                print(f" Skipping entry without a symbol: {stock}")
                continue
            if all_data:
                time.sleep(3)  # Rate limiting for API, only between companies
            all_data.append(self.get_company_news_and_pr(ticker, stock.get('name')))
        print(f"\n✅ Completed scraping {len(all_data)} stocks via SerpAPI")
        return all_data

    def check_api_credits(self):
        """Report plan usage from SerpAPI's Account API.

        Queries the account endpoint instead of running a real search, so the
        check itself does not spend a search credit.

        Returns:
            ``True`` if the account details were fetched, ``False`` otherwise.
        """
        try:
            account = self._fetch_json(self.ACCOUNT_URL, {'api_key': self.api_key})
        except (requests.RequestException, ValueError) as e:
            print(f"Error checking API status: {self._redact(e)}")
            return False
        print("\nSerpAPI Status:")
        print(f" Plan: {account.get('plan_name')}")
        print(f" Searches left: {account.get('total_searches_left')}")
        print(f" Used this month: {account.get('this_month_usage')}")
        return True
def main():
    """Smoke-test the scraper: run the API status check, then scrape one sample stock."""
    # A single well-known ticker keeps the manual test to a minimal number of requests.
    sample_stocks = [{'symbol': 'AAPL', 'name': 'Apple Inc.'}]

    scraper = SerpAPINewsScraper()
    scraper.check_api_credits()
    scraper.scrape_multiple_stocks(sample_stocks, max_stocks=1)


if __name__ == "__main__":
    main()