# Updated newsfetcher.py with similarity search and LLM duplicate detection
"""Fetch news from RSS feeds, filter out duplicates and save the results as JSON.

Duplicate detection runs in two stages: a vector-similarity lookup against the
articles already held by the vector store, followed by an LLM comparison of the
new title against the stored titles.
"""
import json
import os
from datetime import datetime
from typing import Dict, List, Optional

import feedparser
import groq
import numpy as np

from .config import Config
from .embeddings import get_query_embedding
from .vector_store import VectorDB

# Initialize Groq client for duplicate detection
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)

# Translation table used to build slugs: spaces become dashes, commas and
# periods are dropped (same result as the chained ``str.replace`` calls).
_SLUG_TABLE = str.maketrans({" ": "-", ",": None, ".": None})


def _has_embedding(embedding) -> bool:
    """Return True when *embedding* holds at least one value.

    A plain truthiness test raises ``ValueError`` for multi-element numpy
    arrays, so emptiness is checked through ``len`` where possible.
    """
    if embedding is None:
        return False
    try:
        return len(embedding) > 0
    except TypeError:
        # Unsized value (e.g. a ``False`` failure sentinel): fall back to truthiness.
        return bool(embedding)


class NewsFetcher:
    """News fetcher with duplicate detection capabilities"""

    def __init__(self, vector_db: Optional[VectorDB], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store holding the already-known articles. May be None
                and assigned later (the module-level instance does this).
            similarity_threshold: Similarity above which an article counts
                as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; must provide the
                'title' and 'content' keys.

        Returns:
            True if duplicate found, False otherwise
        """
        # Nothing to compare against: no store attached yet, or store is empty.
        if self.vector_db is None or not self.vector_db.articles:
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article['content']}"
        query_embedding = get_query_embedding(search_text)
        if not _has_embedding(query_embedding):
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5)

        for similar_article in similar_articles:
            distance = similar_article.get('similarity_score')
            # A hit without a score must not be treated as a perfect match
            # (a default of 0 would yield similarity 1.0), and a negative
            # value is not a valid distance (-1 would divide by zero).
            if distance is None or distance < 0:
                continue
            # Convert distance to similarity; the score is treated as an L2
            # distance (per the original note that FAISS returns L2 distance).
            similarity = 1 / (1 + distance)
            if similarity > self.similarity_threshold:
                return True
        return False

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if duplicate found, False otherwise. Any error raised while
            calling the LLM is printed and reported as "not a duplicate".
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join(f"- {title}" for title in existing_titles)
            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )
            # The message content may be None; treat that as an empty answer.
            result = (response.choices[0].message.content or "").strip().upper()
            return "DUPLICATE" in result
        except Exception as e:
            # Best-effort check: a failing LLM call must not block ingestion.
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # Without a vector store there is nothing to be a duplicate of.
        if self.vector_db is None:
            return False

        # First check similarity
        if self.check_similarity_duplicate(article):
            return True

        # Then check with LLM
        existing_titles = [art['title'] for art in self.vector_db.articles]
        return self.check_llm_duplicate(article['title'], existing_titles)


# Initialize news fetcher instance (vector_db is attached in fetch_all_news)
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)


def fetch_rss_news(feed_url):
    """Fetch news from RSS feed

    Args:
        feed_url: URL of the RSS feed to parse.

    Returns:
        List of article dicts with the keys 'title', 'content', 'date',
        'slug', 'categories', 'tags', 'url' and 'source'. Entries without
        a title are skipped.
    """
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        title = getattr(entry, 'title', '')
        if not title:
            # One malformed entry must not discard the rest of the feed.
            continue
        articles.append({
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            "slug": title.lower().translate(_SLUG_TABLE),
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url,
        })
    return articles


def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection

    Articles are checked against the vector store and, additionally, against
    the titles already accepted during this run, so the same headline coming
    from two feeds is returned only once.

    Returns:
        List of unique article dicts gathered from Config.RSS_FEEDS. A feed
        that raises is reported via print and skipped.
    """
    all_articles = []

    # Set the vector_db instance for news_fetcher
    # NOTE(review): imported here rather than at module level, presumably to
    # avoid a circular import with .recommender — confirm.
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db

    seen_titles = set()  # normalized titles accepted so far in this run
    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)

            # Filter out duplicates; results are committed only after the
            # whole feed succeeded, so a failing feed leaves no stale keys.
            unique_articles = []
            feed_titles = set()
            for article in articles:
                title_key = article['title'].strip().casefold()
                already_seen = title_key in seen_titles or title_key in feed_titles
                if already_seen or news_fetcher.is_duplicate_article(article):
                    print(f"Skipping duplicate article: {article['title']}")
                    continue
                feed_titles.add(title_key)
                unique_articles.append(article)

            all_articles.extend(unique_articles)
            seen_titles.update(feed_titles)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
    return all_articles


def _save_articles(directory, prefix, articles):
    """Write *articles* as JSON to ``<directory>/<prefix>_<timestamp>.json``.

    The directory is created if needed. ``os.path.join`` is used so the
    configured path works with or without a trailing separator.

    Returns:
        Path of the file that was written.
    """
    os.makedirs(directory, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(directory, f"{prefix}_{timestamp}.json")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)
    return filename


def save_raw_news(articles):
    """Save raw news articles to file

    Args:
        articles: JSON-serializable list of article dicts.

    Returns:
        Path of the written file inside Config.RAW_NEWS_PATH.
    """
    return _save_articles(Config.RAW_NEWS_PATH, "news", articles)


def save_processed_news(articles):
    """Save processed news articles to file

    Args:
        articles: JSON-serializable list of article dicts.

    Returns:
        Path of the written file inside Config.PROCESSED_NEWS_PATH.
    """
    return _save_articles(Config.PROCESSED_NEWS_PATH, "processed_news", articles)