backend/news_fetcher.py

import feedparser
import json
import os
import logging
from datetime import datetime
from typing import List, Dict, Any
from config import RSS_FEEDS, RAW_NEWS_DIR, PROCESSED_NEWS_DIR
from embeddings import EmbeddingGenerator
from vector_store import VectorStore
from bs4 import BeautifulSoup
import re

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('news_fetcher.log')
    ]
)
logger = logging.getLogger('NewsFetcher')

class NewsFetcher:
    def __init__(self):
        self.feeds = RSS_FEEDS
        self.embedding_generator = EmbeddingGenerator()
        self.vector_store = VectorStore()
        logger.info("NewsFetcher initialized with %d RSS feeds", len(self.feeds))

    def clean_html_content(self, html_content: str) -> str:
        """Clean HTML content and extract plain text."""
        logger.debug("Cleaning HTML content of length %d", len(html_content))
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text content
        text = soup.get_text()
        
        # Clean up whitespace
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = ' '.join(chunk for chunk in chunks if chunk)
        
        # Remove extra spaces
        text = re.sub(r'\s+', ' ', text)
        
        cleaned_text = text.strip()
        logger.debug("Cleaned text length: %d", len(cleaned_text))
        return cleaned_text

    def fetch_rss_news(self, feed_url: str) -> List[Dict[str, Any]]:
        """Fetch news articles from a single RSS feed."""
        logger.info("Fetching news from feed: %s", feed_url)
        feed = feedparser.parse(feed_url)
        articles = []
        
        for entry in feed.entries:
            # Get raw content with HTML
            raw_content = entry.get("summary", "")
            
            # Clean HTML content
            clean_content = self.clean_html_content(raw_content)
            
            article = {
                "title": entry.title,
                "raw_content": raw_content,  # Store original HTML content
                "content": clean_content,     # Store cleaned text content
                "link": entry.get("link", ""),
                "published": entry.get("published", datetime.now().isoformat()),
                "source": feed.feed.get("title", "Unknown"),
                "categories": [tag.term for tag in entry.get("tags", [])],
                "id": entry.get("id", entry.get("link", "")),
            }
            articles.append(article)
        
        logger.info("Fetched %d articles from %s", len(articles), feed_url)
        return articles

    def fetch_all_news(self) -> List[Dict[str, Any]]:
        """Fetch news from all configured RSS feeds."""
        logger.info("Starting to fetch news from all %d feeds", len(self.feeds))
        all_articles = []
        
        for feed_url in self.feeds:
            try:
                articles = self.fetch_rss_news(feed_url)
                all_articles.extend(articles)
                logger.info("Successfully fetched %d articles from %s", len(articles), feed_url)
            except Exception as e:
                logger.error("Error fetching from %s: %s", feed_url, str(e))
        
        logger.info("Total articles fetched: %d", len(all_articles))
        return all_articles

    def save_raw_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save raw articles to a JSON file."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"raw_news_{timestamp}.json"
        filepath = os.path.join(RAW_NEWS_DIR, filename)
        
        logger.info("Saving %d raw articles to %s", len(articles), filepath)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)
        
        logger.info("Raw articles saved successfully")
        return filepath

    def save_processed_articles(self, articles: List[Dict[str, Any]]) -> str:
        """Save processed articles with embeddings to a JSON file."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"processed_news_{timestamp}.json"
        filepath = os.path.join(PROCESSED_NEWS_DIR, filename)
        
        # Create a copy of articles without raw_content for processed storage
        processed_articles = []
        for article in articles:
            processed_article = article.copy()
            processed_article.pop('raw_content', None)  # Remove raw_content from processed articles
            processed_articles.append(processed_article)
        
        logger.info("Saving %d processed articles to %s", len(processed_articles), filepath)
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(processed_articles, f, ensure_ascii=False, indent=2)
        
        logger.info("Processed articles saved successfully")
        return filepath

    def process(self) -> Dict[str, Any]:
        """Main process to fetch, process, and store news articles."""
        logger.info("Starting news processing pipeline")
        
        # Fetch articles
        logger.info("Step 1: Fetching articles from RSS feeds")
        articles = self.fetch_all_news()
        if not articles:
            logger.warning("No articles found during fetching")
            return {"status": "error", "message": "No articles found"}

        # Save raw articles
        logger.info("Step 2: Saving raw articles")
        raw_filepath = self.save_raw_articles(articles)

        # Generate embeddings
        logger.info("Step 3: Generating embeddings for %d articles", len(articles))
        articles_with_embeddings = self.embedding_generator.process_articles(articles)
        logger.info("Embeddings generated successfully")

        # Save processed articles
        logger.info("Step 4: Saving processed articles with embeddings")
        processed_filepath = self.save_processed_articles(articles_with_embeddings)

        # Store in vector database
        logger.info("Step 5: Storing articles in vector database")
        success = self.vector_store.upsert_articles(articles_with_embeddings)
        
        if success:
            logger.info("Articles successfully stored in vector database")
        else:
            logger.error("Failed to store articles in vector database")

        result = {
            "status": "success" if success else "error",
            "message": "Articles processed and stored successfully" if success else "Failed to store articles",
            "raw_filepath": raw_filepath,
            "processed_filepath": processed_filepath,
            "article_count": len(articles)
        }
        
        logger.info("News processing pipeline completed with status: %s", result["status"])
        return result
Add backend functionality for news fetching, processing, and recommendations 2025-04-14 21:44:43 +01:00			`import feedparser`
			`import json`
			`import os`
			`import logging`
			`from datetime import datetime`
			`from typing import List, Dict, Any`
			`from config import RSS_FEEDS, RAW_NEWS_DIR, PROCESSED_NEWS_DIR`
			`from embeddings import EmbeddingGenerator`
			`from vector_store import VectorStore`
			`from bs4 import BeautifulSoup`
			`import re`

			`# Configure logging`
			`logging.basicConfig(`
			`level=logging.INFO,`
			`format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',`
			`handlers=[`
			`logging.StreamHandler(),`
			`logging.FileHandler('news_fetcher.log')`
			`]`
			`)`
			`logger = logging.getLogger('NewsFetcher')`

			`class NewsFetcher:`
			`def __init__(self):`
			`self.feeds = RSS_FEEDS`
			`self.embedding_generator = EmbeddingGenerator()`
			`self.vector_store = VectorStore()`
			`logger.info("NewsFetcher initialized with %d RSS feeds", len(self.feeds))`

			`def clean_html_content(self, html_content: str) -> str:`
			`"""Clean HTML content and extract plain text."""`
			`logger.debug("Cleaning HTML content of length %d", len(html_content))`
			`# Parse HTML with BeautifulSoup`
			`soup = BeautifulSoup(html_content, 'html.parser')`

			`# Remove script and style elements`
			`for script in soup(["script", "style"]):`
			`script.decompose()`

			`# Get text content`
			`text = soup.get_text()`

			`# Clean up whitespace`
			`lines = (line.strip() for line in text.splitlines())`
			`chunks = (phrase.strip() for line in lines for phrase in line.split(" "))`
			`text = ' '.join(chunk for chunk in chunks if chunk)`

			`# Remove extra spaces`
			`text = re.sub(r'\s+', ' ', text)`

			`cleaned_text = text.strip()`
			`logger.debug("Cleaned text length: %d", len(cleaned_text))`
			`return cleaned_text`

			`def fetch_rss_news(self, feed_url: str) -> List[Dict[str, Any]]:`
			`"""Fetch news articles from a single RSS feed."""`
			`logger.info("Fetching news from feed: %s", feed_url)`
			`feed = feedparser.parse(feed_url)`
			`articles = []`

			`for entry in feed.entries:`
			`# Get raw content with HTML`
			`raw_content = entry.get("summary", "")`

			`# Clean HTML content`
			`clean_content = self.clean_html_content(raw_content)`

			`article = {`
			`"title": entry.title,`
			`"raw_content": raw_content, # Store original HTML content`
			`"content": clean_content, # Store cleaned text content`
			`"link": entry.get("link", ""),`
			`"published": entry.get("published", datetime.now().isoformat()),`
			`"source": feed.feed.get("title", "Unknown"),`
			`"categories": [tag.term for tag in entry.get("tags", [])],`
			`"id": entry.get("id", entry.get("link", "")),`
			`}`
			`articles.append(article)`

			`logger.info("Fetched %d articles from %s", len(articles), feed_url)`
			`return articles`

			`def fetch_all_news(self) -> List[Dict[str, Any]]:`
			`"""Fetch news from all configured RSS feeds."""`
			`logger.info("Starting to fetch news from all %d feeds", len(self.feeds))`
			`all_articles = []`

			`for feed_url in self.feeds:`
			`try:`
			`articles = self.fetch_rss_news(feed_url)`
			`all_articles.extend(articles)`
			`logger.info("Successfully fetched %d articles from %s", len(articles), feed_url)`
			`except Exception as e:`
			`logger.error("Error fetching from %s: %s", feed_url, str(e))`

			`logger.info("Total articles fetched: %d", len(all_articles))`
			`return all_articles`

			`def save_raw_articles(self, articles: List[Dict[str, Any]]) -> str:`
			`"""Save raw articles to a JSON file."""`
			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`filename = f"raw_news_{timestamp}.json"`
			`filepath = os.path.join(RAW_NEWS_DIR, filename)`

			`logger.info("Saving %d raw articles to %s", len(articles), filepath)`
			`with open(filepath, "w", encoding="utf-8") as f:`
			`json.dump(articles, f, ensure_ascii=False, indent=2)`

			`logger.info("Raw articles saved successfully")`
			`return filepath`

			`def save_processed_articles(self, articles: List[Dict[str, Any]]) -> str:`
			`"""Save processed articles with embeddings to a JSON file."""`
			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`filename = f"processed_news_{timestamp}.json"`
			`filepath = os.path.join(PROCESSED_NEWS_DIR, filename)`

			`# Create a copy of articles without raw_content for processed storage`
			`processed_articles = []`
			`for article in articles:`
			`processed_article = article.copy()`
			`processed_article.pop('raw_content', None) # Remove raw_content from processed articles`
			`processed_articles.append(processed_article)`

			`logger.info("Saving %d processed articles to %s", len(processed_articles), filepath)`
			`with open(filepath, "w", encoding="utf-8") as f:`
			`json.dump(processed_articles, f, ensure_ascii=False, indent=2)`

			`logger.info("Processed articles saved successfully")`
			`return filepath`

			`def process(self) -> Dict[str, Any]:`
			`"""Main process to fetch, process, and store news articles."""`
			`logger.info("Starting news processing pipeline")`

			`# Fetch articles`
			`logger.info("Step 1: Fetching articles from RSS feeds")`
			`articles = self.fetch_all_news()`
			`if not articles:`
			`logger.warning("No articles found during fetching")`
			`return {"status": "error", "message": "No articles found"}`

			`# Save raw articles`
			`logger.info("Step 2: Saving raw articles")`
			`raw_filepath = self.save_raw_articles(articles)`

			`# Generate embeddings`
			`logger.info("Step 3: Generating embeddings for %d articles", len(articles))`
			`articles_with_embeddings = self.embedding_generator.process_articles(articles)`
			`logger.info("Embeddings generated successfully")`

			`# Save processed articles`
			`logger.info("Step 4: Saving processed articles with embeddings")`
			`processed_filepath = self.save_processed_articles(articles_with_embeddings)`

			`# Store in vector database`
			`logger.info("Step 5: Storing articles in vector database")`
			`success = self.vector_store.upsert_articles(articles_with_embeddings)`

			`if success:`
			`logger.info("Articles successfully stored in vector database")`
			`else:`
			`logger.error("Failed to store articles in vector database")`

			`result = {`
			`"status": "success" if success else "error",`
			`"message": "Articles processed and stored successfully" if success else "Failed to store articles",`
			`"raw_filepath": raw_filepath,`
			`"processed_filepath": processed_filepath,`
			`"article_count": len(articles)`
			`}`

			`logger.info("News processing pipeline completed with status: %s", result["status"])`
			`return result`