148 lines
5.2 KiB
Python
148 lines
5.2 KiB
Python
"""RSS News Fetcher for DS Task AI News"""
|
|
import feedparser
|
|
import requests
|
|
import json
|
|
import os
|
|
from datetime import datetime
|
|
from typing import List, Dict, Any
|
|
from urllib.parse import urlparse
|
|
import hashlib
|
|
from config import settings
|
|
|
|
class NewsFetcher:
    """Fetch articles from the configured RSS feeds and store them as JSON.

    Attributes:
        raw_news_dir: Directory that receives the ``news_<timestamp>.json``
            files; read from ``settings.raw_news_dir``.
        max_articles: Maximum number of entries taken from each feed; read
            from ``settings.max_articles_per_feed``.
    """

    def __init__(self):
        """Load the fetcher settings and create the output directory."""
        self.raw_news_dir = settings.raw_news_dir
        self.max_articles = settings.max_articles_per_feed

        # Ensure directories exist
        os.makedirs(self.raw_news_dir, exist_ok=True)

    def generate_article_id(self, title: str, url: str) -> str:
        """Return a 12-character hex ID derived from ``title`` and ``url``.

        The ID is the first 12 hex digits of the MD5 digest of the title
        concatenated with the URL, so the same title/URL pair always maps
        to the same ID (this is what ``fetch_all_news`` de-duplicates on).
        """
        content = f"{title}{url}"
        return hashlib.md5(content.encode()).hexdigest()[:12]

    def clean_content(self, content: str) -> str:
        """Strip HTML tags from ``content`` and cap it at 1000 characters.

        Args:
            content: Raw summary/description text; may be empty or ``None``.

        Returns:
            The text with every ``<...>`` span removed, truncated to at most
            1000 characters. Falsy input yields an empty string.
        """
        if not content:
            return ""

        # Function-scope import: the module's top-level imports do not
        # include ``re``.
        import re

        # Remove HTML tags (basic cleaning). HTML entities are left as-is.
        content = re.sub(r'<[^>]+>', '', content)

        # Slicing leaves strings shorter than the limit untouched, so no
        # explicit length check is needed.
        return content[:1000]

    def _parse_published_date(self, entry: Any) -> datetime:
        """Return the entry's publication time, or "now" when unavailable.

        The current local time is used when the entry has no ``published``
        value, or when ``published_parsed`` is missing, ``None`` or does not
        describe a valid date.
        """
        if getattr(entry, 'published', ''):
            try:
                # Only the first six fields (year .. second) are used.
                return datetime(*entry.published_parsed[:6])
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Missing attribute, ``None``/malformed tuple or an
                # out-of-range date: fall through to the default below.
                pass
        return datetime.now()

    def _build_article(self, entry: Any, source_name: str) -> Dict[str, Any]:
        """Convert one parsed feed entry into the article dictionary.

        Args:
            entry: A feed entry exposing (optionally) ``title``, ``summary``
                or ``description``, ``link``, ``published``,
                ``published_parsed`` and ``tags`` attributes.
            source_name: Name of the feed the entry came from.

        Returns:
            A dictionary with the keys ``id``, ``title``, ``content``,
            ``url``, ``source``, ``published_date``, ``fetched_date``,
            ``categories`` and ``slug``.
        """
        title = getattr(entry, 'title', 'No Title')
        # Prefer ``summary``; fall back to ``description``, then to "".
        content = getattr(entry, 'summary', getattr(entry, 'description', ''))
        url = getattr(entry, 'link', '')
        pub_date = self._parse_published_date(entry)

        return {
            "id": self.generate_article_id(title, url),
            "title": title,
            "content": self.clean_content(content),
            "url": url,
            "source": source_name,
            "published_date": pub_date.isoformat(),
            "fetched_date": datetime.now().isoformat(),
            # The entry's ``tags`` value is passed through unchanged.
            "categories": getattr(entry, 'tags', []),
            # Lower-cased title, spaces -> hyphens, apostrophes removed,
            # limited to 50 characters.
            "slug": title.lower().replace(" ", "-").replace("'", "")[:50],
        }

    def fetch_rss_feed(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch articles from a single RSS feed.

        Args:
            feed_url: URL handed to ``feedparser.parse``.

        Returns:
            Up to ``self.max_articles`` article dictionaries. Entries that
            fail to convert are reported and skipped; if fetching the feed
            itself fails, the error is reported and an empty list returned.
        """
        try:
            print(f"Fetching from: {feed_url}")
            feed = feedparser.parse(feed_url)

            # A set ``bozo`` flag is only reported; whatever entries were
            # parsed are still processed.
            if feed.bozo:
                print(f"Warning: Feed parsing issues for {feed_url}")

            articles = []
            # Use the feed's own title, or the URL's host when it has none.
            source_name = getattr(feed.feed, 'title', urlparse(feed_url).netloc)

            for entry in feed.entries[:self.max_articles]:
                try:
                    articles.append(self._build_article(entry, source_name))
                except Exception as e:
                    # Best effort: one bad entry must not drop the feed.
                    print(f"Error processing entry: {e}")
                    continue

            print(f"Fetched {len(articles)} articles from {source_name}")
            return articles

        except Exception as e:
            # Boundary handler: a failing feed must not abort the whole run.
            print(f"Error fetching RSS feed {feed_url}: {e}")
            return []

    def fetch_all_news(self) -> List[Dict[str, Any]]:
        """Fetch news from all configured RSS feeds.

        Every entry of ``settings.rss_feeds`` is stripped of surrounding
        whitespace; blank entries are skipped.

        Returns:
            The articles of all feeds with duplicate IDs collapsed to one
            article each.
        """
        all_articles = []

        for feed_url in settings.rss_feeds:
            feed_url = feed_url.strip()
            if feed_url:
                all_articles.extend(self.fetch_rss_feed(feed_url))

        # Remove duplicates based on ID. For a repeated ID the dict keeps
        # the position of the first occurrence and the data of the last.
        unique_articles = {article['id']: article for article in all_articles}

        final_articles = list(unique_articles.values())
        print(f"Total unique articles fetched: {len(final_articles)}")

        return final_articles

    def save_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save articles to a timestamped JSON file in ``raw_news_dir``.

        Args:
            articles: JSON-serialisable article dictionaries.

        Returns:
            Path of the written ``news_<YYYYmmdd_HHMMSS>.json`` file.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"news_{timestamp}.json"
        filepath = os.path.join(self.raw_news_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            # ensure_ascii=False writes non-ASCII characters verbatim.
            json.dump(articles, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(articles)} articles to {filepath}")
        return filepath

    def fetch_and_save_news(self) -> Dict[str, Any]:
        """Fetch news from every feed and save the result to a file.

        Returns:
            On success a dictionary with ``success`` (True),
            ``articles_count``, ``filepath`` and ``articles``. When nothing
            was fetched, a dictionary with ``success`` (False),
            ``articles_count`` (0) and ``message``; no file is written.
        """
        articles = self.fetch_all_news()

        if not articles:
            return {
                "success": False,
                "articles_count": 0,
                "message": "No articles fetched"
            }

        filepath = self.save_articles(articles)
        return {
            "success": True,
            "articles_count": len(articles),
            "filepath": filepath,
            "articles": articles
        }
|
|
|
|
# Manual smoke test: running this module directly performs one full
# fetch-and-save cycle and prints the resulting summary dictionary.
if __name__ == "__main__":
    result = NewsFetcher().fetch_and_save_news()
    print(f"Result: {result}")
|