# backend/news_fetcher.py

# news_fetcher.py: RSS news fetching with similarity-search and LLM duplicate detection

import feedparser
import json
import os
from datetime import datetime
from typing import List, Dict, Optional
from .config import Config
from .embeddings import get_query_embedding
from .vector_store import VectorDB
import groq
import numpy as np

# Module-level Groq client, created once at import time with the API key from
# Config. NewsFetcher.check_llm_duplicate uses it for the LLM title comparison.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)


class NewsFetcher:
    """Detect duplicate news articles via vector similarity and an LLM title check.

    The fetcher holds a vector store (anything exposing an ``articles``
    collection and a ``search(embedding, k=...)`` method) and a similarity
    threshold. The store may be ``None`` (no store attached yet); in that
    case every duplicate check reports "not a duplicate" instead of raising.
    """

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Vector store to search for similar articles, or None
                if it will be assigned later.
            similarity_threshold: Similarity (in (0, 1]) above which a stored
                article counts as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _is_similar_enough(self, distance) -> bool:
        """Return True if a search distance maps to a similarity above the threshold.

        The distance is converted with ``1 / (1 + distance)``. A missing or
        non-numeric distance is never a match; previously a missing score was
        defaulted to 0, which reads as a perfect match.
        """
        if distance is None:
            return False
        try:
            distance = float(distance)
        except (TypeError, ValueError):
            return False
        # Clamp negatives to 0: avoids a ZeroDivisionError at -1 and keeps the
        # similarity within (0, 1]. NOTE(review): tiny negative distances are
        # presumably rounding noise for identical vectors - confirm.
        distance = max(distance, 0.0)
        return 1.0 / (1.0 + distance) > self.similarity_threshold

    @staticmethod
    def _is_duplicate_verdict(reply: Optional[str]) -> bool:
        """Interpret the LLM reply; True only for an un-negated 'DUPLICATE'.

        A plain substring test would classify "NOT A DUPLICATE" as a
        duplicate, so the text preceding the keyword is checked for a
        negation or for the word 'UNIQUE'.
        """
        verdict = (reply or "").upper()
        prefix, keyword, _ = verdict.partition("DUPLICATE")
        if not keyword:
            return False
        negations = ("UNIQUE", "NOT", "NON", "N'T", "NO ")
        return not any(marker in prefix for marker in negations)

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; must have a 'title',
                'content' is optional.

        Returns:
            True if duplicate found, False otherwise (including when no
            vector store is attached, the store is empty, or no embedding
            could be produced).
        """
        if self.vector_db is None or not self.vector_db.articles:
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article.get('content', '')}"
        query_embedding = get_query_embedding(search_text)

        # Explicit None/empty test: plain truthiness would raise ValueError
        # if the embedding is a multi-element numpy array.
        if query_embedding is None or len(query_embedding) == 0:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5) or ()

        # The 'similarity_score' field is treated as a distance (the original
        # comment states FAISS returns L2 distance) and converted to a similarity.
        return any(
            self._is_similar_enough(match.get('similarity_score'))
            for match in similar_articles
        )

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if duplicate found, False otherwise. Any error from the LLM
            call is printed and treated as "not a duplicate" (fail-open).
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join([f"- {title}" for title in existing_titles])

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )

            return self._is_duplicate_verdict(response.choices[0].message.content)

        except Exception as e:
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity
        if self.check_similarity_duplicate(article):
            return True

        # Without a vector store there are no existing titles to compare.
        if self.vector_db is None:
            return False

        # Then check with LLM. Note that every stored title is sent in the prompt.
        existing_titles = [art['title'] for art in self.vector_db.articles]
        return self.check_llm_duplicate(article['title'], existing_titles)


# Shared module-level NewsFetcher. It is created without a vector store (None);
# fetch_all_news() assigns the real vector_db before running duplicate checks.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)


def fetch_rss_news(feed_url):
    """Fetch news from an RSS feed and convert each entry to an article dict.

    Args:
        feed_url: URL of the RSS feed to parse.

    Returns:
        List of article dicts with keys 'title', 'content', 'date', 'slug',
        'categories', 'tags', 'url' and 'source'. Entries without a usable
        title are skipped, since the title drives both the slug and the
        duplicate checks.
    """
    feed = feedparser.parse(feed_url)
    articles = []

    for entry in feed.entries:
        # Read the title defensively, like the other optional fields: a single
        # title-less entry would otherwise raise and lose the whole feed.
        title = getattr(entry, 'title', '')
        if not title or not title.strip():
            continue

        article = {
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            # Slug: lowercase, spaces to hyphens, commas and periods removed.
            "slug": title.lower().replace(" ", "-").replace(",", "").replace(".", ""),
            # Categories and tags are fixed values, not derived from the feed.
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url
        }
        articles.append(article)

    return articles


def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection.

    Each article is checked against the vector store through
    ``news_fetcher.is_duplicate_article``. The store is not updated while
    this function runs, so articles are also de-duplicated against each
    other within the run by exact (case-insensitive) title and by URL.

    Returns:
        List of unique article dicts across all feeds in Config.RSS_FEEDS.
        A feed that raises during fetching or checking is reported and
        contributes no articles.
    """
    all_articles = []

    # Set the vector_db instance for news_fetcher (imported here rather than
    # at module top level)
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db

    # Title/URL keys of articles already accepted during this run.
    seen_keys = set()

    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)

            # Filter out duplicates
            unique_articles = []
            # Keys are staged per feed and only committed once the feed has
            # succeeded, so a failed feed does not mask later copies.
            feed_keys = set()
            for article in articles:
                keys = set()
                title_key = article['title'].strip().casefold()
                if title_key:
                    keys.add(("title", title_key))
                if article.get('url'):
                    keys.add(("url", article['url']))

                already_seen = bool(keys & seen_keys) or bool(keys & feed_keys)
                if already_seen or news_fetcher.is_duplicate_article(article):
                    print(f"Skipping duplicate article: {article['title']}")
                    continue

                unique_articles.append(article)
                feed_keys |= keys

            all_articles.extend(unique_articles)
            seen_keys |= feed_keys

        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")

    return all_articles


def save_raw_news(articles):
    """Save raw news articles to a timestamped JSON file.

    Args:
        articles: JSON-serializable list of article dicts.

    Returns:
        Path of the written file, ``news_<YYYYmmdd_HHMMSS>.json`` inside
        Config.RAW_NEWS_PATH (the directory is created if missing).
    """
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory whether or not the
    # configured path ends with a separator.
    filename = os.path.join(Config.RAW_NEWS_PATH, f"news_{timestamp}.json")

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename


def save_processed_news(articles):
    """Save processed news articles to a timestamped JSON file.

    Args:
        articles: JSON-serializable list of processed article dicts.

    Returns:
        Path of the written file, ``processed_news_<YYYYmmdd_HHMMSS>.json``
        inside Config.PROCESSED_NEWS_PATH (the directory is created if missing).
    """
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory whether or not the
    # configured path ends with a separator.
    filename = os.path.join(
        Config.PROCESSED_NEWS_PATH, f"processed_news_{timestamp}.json"
    )

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00			`# Updated newsfetcher.py with similarity search and LLM duplicate detection`

Initial project setup 2025-07-07 22:08:02 +01:00			`import feedparser`
update all endpoints 2025-07-08 19:57:35 +01:00			`import json`
			`import os`
			`from datetime import datetime`
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00			`from typing import List, Dict, Optional`
update all endpoints 2025-07-08 19:57:35 +01:00			`from .config import Config`
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00			`from .embeddings import get_query_embedding`
			`from .vector_store import VectorDB`
			`import groq`
			`import numpy as np`

			`# Initialize Groq client for duplicate detection`
			`groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)`


			`class NewsFetcher:`
			`"""News fetcher with duplicate detection capabilities"""`

			`def __init__(self, vector_db: VectorDB, similarity_threshold: float = 0.8):`
			`self.vector_db = vector_db`
			`self.similarity_threshold = similarity_threshold`

			`def check_similarity_duplicate(self, article: Dict) -> bool:`
			`"""`
			`Check if article is a duplicate using similarity search`

			`Args:`
			`article: Article to check for duplicates`

			`Returns:`
			`True if duplicate found, False otherwise`
			`"""`
			`if not self.vector_db.articles:`
			`return False`

			`# Create search text from title and content`
			`search_text = f"{article['title']} {article['content']}"`
			`query_embedding = get_query_embedding(search_text)`

			`if not query_embedding:`
			`return False`

			`# Search for similar articles`
			`similar_articles = self.vector_db.search(query_embedding, k=5)`

			`# Check if any similar article exceeds threshold`
			`for similar_article in similar_articles:`
			`similarity_score = similar_article.get('similarity_score', 0)`
			`# Convert distance to similarity (FAISS returns L2 distance)`
			`similarity = 1 / (1 + similarity_score)`

			`if similarity > self.similarity_threshold:`
			`return True`

			`return False`

			`def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:`
			`"""`
			`Check if titles are duplicates using LLM comparison`

			`Args:`
			`new_title: New article title`
			`existing_titles: List of existing article titles`

			`Returns:`
			`True if duplicate found, False otherwise`
			`"""`
			`if not existing_titles:`
			`return False`

			`try:`
			`# Create prompt for LLM comparison`
			`titles_text = "\n".join([f"- {title}" for title in existing_titles])`

			`response = groq_client.chat.completions.create(`
			`model=Config.GROQ_MODEL,`
			`messages=[`
			`{`
			`"role": "system",`
			`"content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."`
			`},`
			`{`
			`"role": "user",`
			`"content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"`
			`}`
			`],`
			`max_tokens=10,`
			`temperature=0.1`
			`)`

			`result = response.choices[0].message.content.strip().upper()`
			`return "DUPLICATE" in result`

			`except Exception as e:`
			`print(f"Error checking LLM duplicate: {str(e)}")`
			`return False`

			`def is_duplicate_article(self, article: Dict) -> bool:`
			`"""`
			`Check if article is duplicate using both similarity and LLM methods`

			`Args:`
			`article: Article to check`

			`Returns:`
			`True if duplicate, False otherwise`
			`"""`
			`# First check similarity`
			`if self.check_similarity_duplicate(article):`
			`return True`

			`# Then check with LLM`
			`existing_titles = [art['title'] for art in self.vector_db.articles]`
			`if self.check_llm_duplicate(article['title'], existing_titles):`
			`return True`

			`return False`


			`# Initialize news fetcher instance`
			`news_fetcher = NewsFetcher(None, similarity_threshold=0.8)`
update all endpoints 2025-07-08 19:57:35 +01:00
Initial project setup 2025-07-07 22:08:02 +01:00
update all endpoints 2025-07-08 19:57:35 +01:00			`def fetch_rss_news(feed_url):`
			`"""Fetch news from RSS feed"""`
			`feed = feedparser.parse(feed_url)`
Initial project setup 2025-07-07 22:08:02 +01:00			`articles = []`
update all endpoints 2025-07-08 19:57:35 +01:00
Initial project setup 2025-07-07 22:08:02 +01:00			`for entry in feed.entries:`
update all endpoints 2025-07-08 19:57:35 +01:00			`article = {`
Initial project setup 2025-07-07 22:08:02 +01:00			`"title": entry.title,`
update all endpoints 2025-07-08 19:57:35 +01:00			`"content": getattr(entry, 'summary', ''),`
			`"date": getattr(entry, 'published', ''),`
			`"slug": entry.title.lower().replace(" ", "-").replace(",", "").replace(".", ""),`
			`"categories": ["Technology", "AI and Innovation"],`
			`"tags": ["AI", "Technology", "Innovation"],`
			`"url": getattr(entry, 'link', ''),`
			`"source": feed_url`
			`}`
			`articles.append(article)`

			`return articles`


			`def fetch_all_news():`
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00			`"""Fetch news from all RSS feeds with duplicate detection"""`
update all endpoints 2025-07-08 19:57:35 +01:00			`all_articles = []`
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00
			`# Set the vector_db instance for news_fetcher`
			`from .recommender import vector_db`
			`news_fetcher.vector_db = vector_db`
update all endpoints 2025-07-08 19:57:35 +01:00
			`for feed_url in Config.RSS_FEEDS:`
			`try:`
			`articles = fetch_rss_news(feed_url)`
update recommender and news_fetcher 2025-07-24 16:35:04 +01:00
			`# Filter out duplicates`
			`unique_articles = []`
			`for article in articles:`
			`if not news_fetcher.is_duplicate_article(article):`
			`unique_articles.append(article)`
			`else:`
			`print(f"Skipping duplicate article: {article['title']}")`

			`all_articles.extend(unique_articles)`

update all endpoints 2025-07-08 19:57:35 +01:00			`except Exception as e:`
			`print(f"Error fetching from {feed_url}: {str(e)}")`

			`return all_articles`


			`def save_raw_news(articles):`
			`"""Save raw news articles to file"""`
			`os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)`

			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`filename = f"{Config.RAW_NEWS_PATH}news_{timestamp}.json"`

			`with open(filename, 'w') as f:`
			`json.dump(articles, f, indent=2)`

			`return filename`


			`def save_processed_news(articles):`
			`"""Save processed news articles to file"""`
			`os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)`

			`timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")`
			`filename = f"{Config.PROCESSED_NEWS_PATH}processed_news_{timestamp}.json"`

			`with open(filename, 'w') as f:`
			`json.dump(articles, f, indent=2)`

update recommender and news_fetcher 2025-07-24 16:35:04 +01:00			`return filename`