ecd24ce2a6
🚀 Major System Upgrades: - Upgraded from 10 to 15 API endpoints (50% increase) - Implemented real Sentence Transformers (all-MiniLM-L6-v2) with 384D embeddings - Added Groq LLM integration (llama3-8b-8192) for AI analysis - Built comprehensive deduplication system (1378 → 204 unique articles) - Added 3 new AI analysis endpoints: analyze-article, generate-insights, recommend-by-article-id 🤖 AI & ML Enhancements: - Replaced hash-based embeddings with genuine Sentence Transformers - Implemented offline AI model operation (no API dependencies for embeddings) - Added complete article analysis: summarization, sentiment, keyword extraction - Built multi-article insights generation with trend analysis - Enhanced semantic search with similarity scoring 🔧 Production Features: - Added intelligent duplicate detection and removal - Implemented vector index rebuilding capabilities - Enhanced RSS fetching with better error handling and timeouts - Improved search API with content inclusion control - Added comprehensive system monitoring and maintenance tools 📚 Documentation & Configuration: - Updated README.md to reflect all current features and capabilities - Added .env.example with proper configuration templates - Enhanced API documentation with working examples - Updated system architecture documentation 🎯 System Metrics: - 204 unique articles (deduplicated from 1378) - 15 fully functional API endpoints - 384-dimensional Sentence Transformers embeddings - FAISS vector database with semantic similarity search - Groq LLM integration active and operational - Production-ready with rate limiting, caching, and error handling Ready for enterprise deployment and scaling.
174 lines
6.3 KiB
Python
174 lines
6.3 KiB
Python
"""RSS News Fetcher for DS Task AI News"""
|
|
import hashlib
import json
import os
import re
from datetime import datetime
from typing import List, Dict, Any
from urllib.parse import urlparse

import feedparser
import requests

from config import settings
|
|
|
|
class NewsFetcher:
    """Fetch articles from the configured RSS feeds and persist them as JSON.

    Attributes:
        raw_news_dir: Directory that receives the ``news_<timestamp>.json``
            dumps. Read from ``settings.raw_news_dir`` and created on
            construction if it does not exist.
        max_articles: Upper bound on the number of entries taken from each
            feed. Read from ``settings.max_articles_per_feed``.
    """

    # Maximum number of characters of cleaned content kept per article.
    MAX_CONTENT_LENGTH = 1000
    # Maximum number of characters kept in an article slug.
    MAX_SLUG_LENGTH = 50
    # Seconds to wait for the feed's HTTP response before giving up.
    REQUEST_TIMEOUT = 15
    # Browser-like User-Agent sent with every feed request.
    REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Basic tag matcher; compiled once instead of on every clean_content call.
    _TAG_PATTERN = re.compile(r'<[^>]+>')

    def __init__(self):
        """Read the fetcher configuration and make sure the output dir exists."""
        self.raw_news_dir = settings.raw_news_dir
        self.max_articles = settings.max_articles_per_feed

        # Ensure directories exist
        os.makedirs(self.raw_news_dir, exist_ok=True)

    def generate_article_id(self, title: str, url: str) -> str:
        """Return a 12-character hex ID derived from the title and URL.

        The ID is the first 12 hex digits of the MD5 digest of ``title``
        immediately followed by ``url``; identical title/URL pairs therefore
        always map to the same ID, which is what de-duplication relies on.
        """
        content = f"{title}{url}"
        return hashlib.md5(content.encode()).hexdigest()[:12]

    def clean_content(self, content: str) -> str:
        """Strip HTML tags from ``content`` and cap its length.

        Args:
            content: Raw summary/description text; may be empty or ``None``.

        Returns:
            The text with every ``<...>`` tag removed, truncated to
            ``MAX_CONTENT_LENGTH`` characters. Falsy input yields ``""``.
        """
        if not content:
            return ""

        # Remove HTML tags (basic cleaning). Slicing is a no-op for text that
        # is already short enough, so no explicit length check is needed.
        return self._TAG_PATTERN.sub('', content)[:self.MAX_CONTENT_LENGTH]

    def _download_feed(self, feed_url: str) -> Any:
        """Download and parse a feed, falling back to feedparser's own fetch.

        The primary path uses ``requests`` so that custom headers and a
        timeout apply. If that fails for any reason, the URL is handed to
        ``feedparser.parse`` directly as a best-effort second attempt.
        """
        try:
            response = requests.get(
                feed_url,
                headers=self.REQUEST_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return feedparser.parse(response.content)
        except Exception as e:
            # Deliberately broad: any failure here should trigger the fallback.
            print(f"HTTP request failed, trying direct feedparser: {e}")
            return feedparser.parse(feed_url)

    @staticmethod
    def _parse_published(entry: Any) -> datetime:
        """Return the entry's publication time, or the current time.

        The current time is used when the entry has no ``published`` value or
        when its ``published_parsed`` tuple is missing, ``None`` or invalid.
        """
        # NOTE(review): feedparser documents ``published_parsed`` as UTC while
        # ``datetime.now()`` is naive local time - confirm consumers are fine
        # with the mix.
        if not getattr(entry, 'published', ''):
            return datetime.now()
        try:
            return datetime(*entry.published_parsed[:6])
        except (AttributeError, TypeError, ValueError, OverflowError):
            # Missing attribute, ``None`` value, or out-of-range date fields.
            return datetime.now()

    def _build_article(self, entry: Any, source_name: str) -> Dict[str, Any]:
        """Convert one parsed feed entry into the article dict that is stored."""
        title = getattr(entry, 'title', 'No Title')
        content = getattr(entry, 'summary', getattr(entry, 'description', ''))
        url = getattr(entry, 'link', '')
        pub_date = self._parse_published(entry)

        return {
            "id": self.generate_article_id(title, url),
            "title": title,
            "content": self.clean_content(content),
            "url": url,
            "source": source_name,
            "published_date": pub_date.isoformat(),
            "fetched_date": datetime.now().isoformat(),
            "categories": getattr(entry, 'tags', []),
            "slug": title.lower().replace(" ", "-").replace("'", "")[:self.MAX_SLUG_LENGTH],
        }

    def fetch_rss_feed(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch articles from a single RSS feed.

        Args:
            feed_url: URL of the feed to download.

        Returns:
            Up to ``max_articles`` article dicts. Entries that fail to
            convert are skipped, and any feed-level failure yields ``[]``;
            errors are reported on stdout rather than raised.
        """
        try:
            print(f"Fetching from: {feed_url}")
            feed = self._download_feed(feed_url)

            if feed.bozo:
                print(f"Warning: Feed parsing issues for {feed_url}")
                if hasattr(feed, 'bozo_exception'):
                    print(f"Bozo exception: {feed.bozo_exception}")

            # Fall back to the host name when the feed declares no title.
            source_name = getattr(feed.feed, 'title', urlparse(feed_url).netloc)

            articles = []
            for entry in feed.entries[:self.max_articles]:
                try:
                    articles.append(self._build_article(entry, source_name))
                except Exception as e:
                    # Best effort: one malformed entry must not drop the feed.
                    print(f"Error processing entry: {e}")

            print(f"Fetched {len(articles)} articles from {source_name}")

            # If no articles but feed parsed successfully, it might be due to no new content
            if not articles and not feed.bozo:
                print(f"No new articles found in {source_name} (feed is valid)")

            return articles

        except Exception as e:
            print(f"Error fetching RSS feed {feed_url}: {e}")
            return []

    def fetch_all_news(self) -> List[Dict[str, Any]]:
        """Fetch news from all configured RSS feeds.

        Iterates ``settings.rss_feeds``, ignoring blank entries, and
        de-duplicates the combined result by article ID.

        Returns:
            One article dict per distinct ID. When an ID occurs more than
            once, the last occurrence wins but keeps the position of the
            first.
        """
        all_articles = []
        for feed_url in settings.rss_feeds:
            feed_url = feed_url.strip()
            if feed_url:
                all_articles.extend(self.fetch_rss_feed(feed_url))

        # Remove duplicates based on ID
        unique_articles = {article['id']: article for article in all_articles}

        final_articles = list(unique_articles.values())
        print(f"Total unique articles fetched: {len(final_articles)}")

        return final_articles

    def save_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save articles to a timestamped JSON file.

        Args:
            articles: Article dicts to serialize.

        Returns:
            Path of the written ``news_<YYYYmmdd_HHMMSS>.json`` file inside
            ``raw_news_dir``.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"news_{timestamp}.json"

        # Normalize the path to avoid double backslashes
        raw_news_dir = os.path.normpath(self.raw_news_dir)
        filepath = os.path.normpath(os.path.join(raw_news_dir, filename))

        # Ensure directory exists
        os.makedirs(raw_news_dir, exist_ok=True)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(articles, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(articles)} articles to {filepath}")
        return filepath

    def fetch_and_save_news(self) -> Dict[str, Any]:
        """Fetch news and save it to a file.

        Returns:
            On success, a dict with ``success=True``, ``articles_count``,
            ``filepath`` and ``articles``. When nothing was fetched, a dict
            with ``success=False``, ``articles_count=0`` and ``message``; no
            file is written in that case.
        """
        articles = self.fetch_all_news()

        if not articles:
            return {
                "success": False,
                "articles_count": 0,
                "message": "No articles fetched"
            }

        filepath = self.save_articles(articles)
        return {
            "success": True,
            "articles_count": len(articles),
            "filepath": filepath,
            "articles": articles
        }
|
|
|
|
# Manual smoke test: run one fetch-and-save cycle when executed as a script.
if __name__ == "__main__":
    outcome = NewsFetcher().fetch_and_save_news()
    print(f"Result: {outcome}")
|