# DS_Task_AI_News/backend/news_fetcher.py

# news_fetcher.py: RSS news fetching with similarity-search and LLM-based duplicate detection

import feedparser
import json
import os
from datetime import datetime
from typing import List, Dict, Optional
from .config import Config
from .embeddings import get_query_embedding
from .vector_store import VectorDB
import groq
import numpy as np

# Shared Groq client used by NewsFetcher.check_llm_duplicate for the LLM-based
# duplicate check; it is created once, at import time, from Config.GROQ_API_KEY.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)


class NewsFetcher:
    """Detect duplicate news articles with vector similarity and an LLM check.

    ``is_duplicate_article`` combines two checks: a nearest-neighbour search
    against the articles already held in the vector store, followed by a
    title comparison delegated to the Groq chat model.
    """

    # Words in the model's reply that turn a mention of "DUPLICATE" into a
    # negative verdict (e.g. "NOT DUPLICATE", "NON-DUPLICATE").
    _NEGATIONS = frozenset({"NOT", "NON", "NO", "UNIQUE"})

    # Characters stripped from each word of the reply before it is compared.
    _PUNCTUATION = "'\"`.,:;!*()[]"

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Store the vector store and the similarity cut-off.

        Args:
            vector_db: Vector store to search. May be None until a real store
                is assigned; every check reports "not a duplicate" meanwhile.
            similarity_threshold: Similarity (in ``(0, 1]``) above which a
                stored article counts as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    @staticmethod
    def _parse_llm_verdict(reply: Optional[str]) -> bool:
        """Return True only when the model's reply affirms a duplicate."""
        if not reply:
            return False

        # Split into bare upper-case words so that quoting, punctuation and
        # hyphenation ("NON-DUPLICATE") do not hide a negation.
        words = {
            token.strip(NewsFetcher._PUNCTUATION)
            for token in reply.upper().replace("-", " ").split()
        }
        if words & NewsFetcher._NEGATIONS:
            return False
        return any(word.startswith("DUPLICATE") for word in words)

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its 'title' and
                'content' values are embedded together.

        Returns:
            True if duplicate found, False otherwise (including when no
            vector store is set, the store is empty, or no embedding could
            be produced).
        """
        if self.vector_db is None or not self.vector_db.articles:
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article['content']}"
        query_embedding = get_query_embedding(search_text)

        # Explicit emptiness test: the embedding type is not visible here, and
        # ``not embedding`` raises ValueError for a multi-element NumPy array.
        if query_embedding is None or len(query_embedding) == 0:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5)

        for similar_article in similar_articles:
            # NOTE(review): the score is treated as an L2 distance, as the
            # original comment states for FAISS - confirm against VectorDB.search.
            distance = similar_article.get('similarity_score')

            # A hit without a score says nothing about closeness, and a
            # negative value cannot be a distance (-1 would divide by zero).
            if distance is None or distance < 0:
                continue

            # Map distance [0, inf) onto similarity (0, 1].
            similarity = 1 / (1 + distance)
            if similarity > self.similarity_threshold:
                return True

        return False

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if the model affirms a duplicate, False otherwise. Any
            failure while calling the model is printed and reported as
            False, so the check is best-effort.
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join([f"- {title}" for title in existing_titles])

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )

            # The content may be None; the parser treats that as "not a duplicate".
            return self._parse_llm_verdict(response.choices[0].message.content)

        except Exception as e:
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check; must carry 'title' and 'content'.

        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity
        if self.check_similarity_duplicate(article):
            return True

        # Without a vector store there is nothing to compare against.
        if self.vector_db is None:
            return False

        # Then check with LLM
        # NOTE(review): every stored title is sent in one prompt, so the prompt
        # grows with the store - consider limiting it to the nearest matches.
        existing_titles = [art['title'] for art in self.vector_db.articles]
        return self.check_llm_duplicate(article['title'], existing_titles)


# Module-level NewsFetcher used by fetch_all_news(). It is created without a
# vector store (None); fetch_all_news() assigns the real one before checking articles.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)


def fetch_rss_news(feed_url):
    """Fetch one RSS feed and convert its entries into article dicts.

    Args:
        feed_url: URL of the RSS feed to parse.

    Returns:
        A list of dicts with the keys 'title', 'content', 'date', 'slug',
        'categories', 'tags', 'url' and 'source'. Entries without a title
        are skipped; the other optional fields default to ''.
    """
    feed = feedparser.parse(feed_url)

    # Slug rule: spaces become hyphens, commas and periods are dropped.
    slug_table = str.maketrans({" ": "-", ",": None, ".": None})
    articles = []

    for entry in feed.entries:
        # Read the title with a default like every other field: a plain
        # attribute access raises AttributeError on a title-less entry, which
        # would make the caller lose the whole feed.
        title = getattr(entry, 'title', '')
        if not title:
            continue

        articles.append({
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            "slug": title.lower().translate(slug_table),
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url,
        })

    return articles


def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection.

    Every feed in Config.RSS_FEEDS is fetched. An article is dropped when it
    repeats the URL or title of an article already accepted in this run, or
    when news_fetcher.is_duplicate_article() reports it as a duplicate of an
    article in the vector store.

    Returns:
        The list of accepted article dicts, in feed order.
    """
    all_articles = []

    # Set the vector_db instance for news_fetcher
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db

    # The vector store is not updated while fetching, so articles repeated
    # across feeds in the same run are tracked here.
    seen_urls = set()
    seen_titles = set()

    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
            continue

        for article in articles:
            title = article.get('title') or ''
            url = article.get('url') or ''
            title_key = title.strip().casefold()

            if (url and url in seen_urls) or (title_key and title_key in seen_titles):
                print(f"Skipping duplicate article: {title}")
                continue

            # Isolate failures per article so one bad check does not discard
            # the articles already accepted from this feed.
            try:
                duplicate = news_fetcher.is_duplicate_article(article)
            except Exception as e:
                # The article could not be verified, so it is left out.
                print(f"Error checking article from {feed_url}: {str(e)}")
                continue

            if duplicate:
                print(f"Skipping duplicate article: {title}")
                continue

            if url:
                seen_urls.add(url)
            if title_key:
                seen_titles.add(title_key)
            all_articles.append(article)

    return all_articles


def save_raw_news(articles):
    """Save raw news articles to a timestamped JSON file.

    Args:
        articles: JSON-serialisable list of article dicts.

    Returns:
        Path of the file written: ``news_<YYYYmmdd_HHMMSS>.json`` inside
        Config.RAW_NEWS_PATH, which is created if missing.
    """
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Join instead of concatenating: without a trailing separator on the
    # configured directory, concatenation would put the file next to the
    # directory rather than inside it.
    filename = os.path.join(Config.RAW_NEWS_PATH, f"news_{timestamp}.json")

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename


def save_processed_news(articles):
    """Save processed news articles to a timestamped JSON file.

    Args:
        articles: JSON-serialisable list of processed article dicts.

    Returns:
        Path of the file written: ``processed_news_<YYYYmmdd_HHMMSS>.json``
        inside Config.PROCESSED_NEWS_PATH, which is created if missing.
    """
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Join instead of concatenating: without a trailing separator on the
    # configured directory, concatenation would put the file next to the
    # directory rather than inside it.
    filename = os.path.join(
        Config.PROCESSED_NEWS_PATH, f"processed_news_{timestamp}.json"
    )

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename