Files
ds_task_ai_news_bolade/backend/news_fetcher.py
T
boladeE 82fe3608d2 Refactor backend configuration and enhance news fetching functionality
- Introduced a Config dataclass in config.py to manage API keys, RSS feeds, and directory paths more effectively.
- Updated the NewsFetcher class to include retry logic for fetching articles from RSS feeds.
- Modified the EmbeddingGenerator and NewsRecommender classes to utilize the new configuration structure.
- Enhanced main.py to implement API token verification for secure access to news fetching and recommendations.
2025-04-16 17:55:36 +01:00

203 lines
8.3 KiB
Python

import feedparser
import json
import os
import logging
from datetime import datetime
from typing import List, Dict, Any, Optional
from bs4 import BeautifulSoup
import re
import time
from config import config
from embeddings import EmbeddingGenerator
from vector_store import VectorStore
# Logging setup for this module.
#
# The root logger is configured at INFO level with two handlers: a
# StreamHandler (which defaults to sys.stderr) and a FileHandler writing to
# 'news_fetcher.log', a relative path resolved against the current working
# directory. Note that logging.basicConfig() is a no-op when the root logger
# already has handlers, so an application that configured logging earlier
# keeps its own setup.
logging.basicConfig(
    handlers=[logging.StreamHandler(), logging.FileHandler('news_fetcher.log')],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by everything in this module.
logger = logging.getLogger('NewsFetcher')
class NewsFetcher:
    """Fetch articles from RSS feeds, embed them and store the results.

    The pipeline (see ``process``) reads every feed listed in
    ``config.rss_feeds``, strips HTML from each entry summary, writes the raw
    and processed articles to timestamped JSON files and finally hands the
    articles to the vector store.
    """

    def __init__(
        self,
        embedding_generator: Optional["EmbeddingGenerator"] = None,
        vector_store: Optional["VectorStore"] = None,
        max_retries: int = 3,
        retry_delay: int = 5,
    ):
        """Set up the fetcher.

        Args:
            embedding_generator: Object exposing ``process_articles``; a new
                ``EmbeddingGenerator`` is created when omitted.
            vector_store: Object exposing ``upsert_articles``; a new
                ``VectorStore`` is created when omitted.
            max_retries: Maximum number of attempts per feed.
            retry_delay: Seconds to sleep between attempts.
        """
        self.feeds = config.rss_feeds
        # Compare against None explicitly: an injected collaborator that
        # happens to be falsy (e.g. an empty container-like store) must not
        # be silently swapped for a fresh default instance.
        self.embedding_generator = (
            embedding_generator if embedding_generator is not None else EmbeddingGenerator()
        )
        self.vector_store = vector_store if vector_store is not None else VectorStore()
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        logger.info("NewsFetcher initialized with %d RSS feeds", len(self.feeds))

    def clean_html_content(self, html_content: str) -> str:
        """Return the plain text of ``html_content`` with whitespace collapsed.

        ``<script>`` and ``<style>`` elements are dropped before the text is
        extracted. Empty or ``None`` input yields an empty string.
        """
        if not html_content:
            return ""
        logger.debug("Cleaning HTML content of length %d", len(html_content))
        soup = BeautifulSoup(html_content, 'html.parser')
        # Script and style bodies are not readable content.
        for element in soup(["script", "style"]):
            element.decompose()
        # str.split() with no argument splits on any whitespace run and drops
        # leading/trailing whitespace, so this collapses everything to single
        # spaces in one pass.
        cleaned_text = " ".join(soup.get_text().split())
        logger.debug("Cleaned text length: %d", len(cleaned_text))
        return cleaned_text

    @staticmethod
    def _build_article(entry, source: str, raw_content: str, clean_content: str) -> Dict[str, Any]:
        """Convert one feed entry (a mapping supporting ``.get``) to an article dict."""
        # Only .get() is used so that an entry lacking a title, or a tag
        # lacking a term, does not raise and abort the whole feed.
        terms = (tag.get("term") for tag in entry.get("tags", []))
        return {
            "title": entry.get("title", ""),
            "raw_content": raw_content,  # original HTML summary
            "content": clean_content,  # cleaned plain text
            "link": entry.get("link", ""),
            "published": entry.get("published", datetime.now().isoformat()),
            "source": source,
            "categories": [term for term in terms if term],
            "id": entry.get("id", entry.get("link", "")),
        }

    def fetch_rss_news(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch the articles of a single RSS feed, retrying on failure.

        Up to ``self.max_retries`` attempts are made, sleeping
        ``self.retry_delay`` seconds between them. An attempt is retried when
        the feed has no entries or when any exception is raised.

        Returns:
            The list of article dicts, or an empty list when every attempt
            failed (or when ``max_retries`` is not positive).
        """
        logger.info("Fetching news from feed: %s", feed_url)
        for attempt in range(1, self.max_retries + 1):
            is_last_attempt = attempt >= self.max_retries
            try:
                feed = feedparser.parse(feed_url)
                if not feed.entries:
                    logger.warning("No entries found in feed %s (attempt %d/%d)",
                                   feed_url, attempt, self.max_retries)
                    if is_last_attempt:
                        return []
                    time.sleep(self.retry_delay)
                    continue

                source = feed.feed.get("title", "Unknown")
                # The list is created per attempt so that an attempt failing
                # halfway cannot leak partial results (duplicates) into a
                # later successful attempt.
                articles = []
                for entry in feed.entries:
                    raw_content = entry.get("summary", "")
                    clean_content = self.clean_html_content(raw_content)
                    articles.append(
                        self._build_article(entry, source, raw_content, clean_content)
                    )
                logger.info("Fetched %d articles from %s", len(articles), feed_url)
                return articles
            except Exception as e:
                # Deliberately broad: fetching is best-effort, one bad feed
                # must not stop the remaining feeds from being processed.
                logger.error("Error fetching from %s (attempt %d/%d): %s",
                             feed_url, attempt, self.max_retries, str(e))
                if is_last_attempt:
                    logger.error("Failed to fetch from %s after %d attempts",
                                 feed_url, self.max_retries)
                    return []
                time.sleep(self.retry_delay)
        # Reached only when max_retries <= 0; keep the return type a list.
        return []

    def fetch_all_news(self) -> List[Dict[str, Any]]:
        """Fetch news from all configured RSS feeds and return them as one list."""
        logger.info("Starting to fetch news from all %d feeds", len(self.feeds))
        all_articles = []
        for feed_url in self.feeds:
            articles = self.fetch_rss_news(feed_url)
            all_articles.extend(articles)
            logger.info("Successfully fetched %d articles from %s", len(articles), feed_url)
        logger.info("Total articles fetched: %d", len(all_articles))
        return all_articles

    @staticmethod
    def _timestamped_path(directory: str, prefix: str) -> str:
        """Return ``<directory>/<prefix>_<YYYYmmdd_HHMMSS>.json``, creating the directory."""
        # Create the target directory up front so the first run on a fresh
        # checkout does not fail with FileNotFoundError.
        os.makedirs(directory, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return os.path.join(directory, f"{prefix}_{timestamp}.json")

    @staticmethod
    def _write_json(filepath: str, payload: Any) -> None:
        """Write ``payload`` to ``filepath`` as indented UTF-8 JSON."""
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

    def save_raw_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save raw articles to a timestamped JSON file and return its path."""
        filepath = self._timestamped_path(config.raw_news_dir, "raw_news")
        logger.info("Saving %d raw articles to %s", len(articles), filepath)
        self._write_json(filepath, articles)
        logger.info("Raw articles saved successfully")
        return filepath

    def save_processed_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save processed articles to a timestamped JSON file and return its path.

        Each article is written without its ``raw_content`` key; the input
        dicts themselves are not modified.
        """
        filepath = self._timestamped_path(config.processed_news_dir, "processed_news")
        processed_articles = [
            {key: value for key, value in article.items() if key != 'raw_content'}
            for article in articles
        ]
        logger.info("Saving %d processed articles to %s", len(processed_articles), filepath)
        self._write_json(filepath, processed_articles)
        logger.info("Processed articles saved successfully")
        return filepath

    def process(self) -> Dict[str, Any]:
        """Run the full pipeline: fetch, save raw, embed, save processed, store.

        Returns:
            A dict with ``status`` ("success" or "error") and ``message``.
            Unless no articles were fetched, it also carries
            ``raw_filepath``, ``processed_filepath`` and ``article_count``.
        """
        logger.info("Starting news processing pipeline")

        logger.info("Step 1: Fetching articles from RSS feeds")
        articles = self.fetch_all_news()
        if not articles:
            logger.warning("No articles found during fetching")
            return {"status": "error", "message": "No articles found"}

        logger.info("Step 2: Saving raw articles")
        raw_filepath = self.save_raw_articles(articles)

        logger.info("Step 3: Generating embeddings for %d articles", len(articles))
        # Presumably returns the articles enriched with embeddings; the exact
        # shape is defined by the embedding generator, not here.
        articles_with_embeddings = self.embedding_generator.process_articles(articles)
        logger.info("Embeddings generated successfully")

        logger.info("Step 4: Saving processed articles with embeddings")
        processed_filepath = self.save_processed_articles(articles_with_embeddings)

        logger.info("Step 5: Storing articles in vector database")
        success = self.vector_store.upsert_articles(articles_with_embeddings)
        if success:
            logger.info("Articles successfully stored in vector database")
        else:
            logger.error("Failed to store articles in vector database")

        result = {
            "status": "success" if success else "error",
            "message": "Articles processed and stored successfully" if success else "Failed to store articles",
            "raw_filepath": raw_filepath,
            "processed_filepath": processed_filepath,
            "article_count": len(articles),
        }
        logger.info("News processing pipeline completed with status: %s", result["status"])
        return result