update recommender and news_fetcher

2025-07-24 16:35:04 +01:00
parent f28755e1fd
commit 8d2a277afe
4 changed files with 330 additions and 36 deletions
@@ -1,8 +1,126 @@
+# Updated newsfetcher.py with similarity search and LLM duplicate detection
+
 import feedparser
 import json
 import os
 from datetime import datetime
+from typing import List, Dict, Optional
 from .config import Config
+from .embeddings import get_query_embedding
+from .vector_store import VectorDB
+import groq
+import numpy as np
+
+# Initialize Groq client for duplicate detection.
+# Created once when this module is imported, using Config.GROQ_API_KEY, and
+# shared by NewsFetcher.check_llm_duplicate for its chat-completion calls.
+groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
+
+
+class NewsFetcher:
+    """Detect duplicate news articles before they are stored.
+
+    Two checks are combined: an embedding similarity search against
+    ``vector_db`` and a title comparison delegated to the Groq LLM.
+    ``vector_db`` is a plain attribute and may be attached after
+    construction; while it is ``None`` every check reports "not a duplicate".
+    """
+
+    def __init__(self, vector_db: Optional[VectorDB], similarity_threshold: float = 0.8):
+        """
+        Args:
+            vector_db: Store exposing ``articles`` and ``search(embedding, k=...)``.
+                May be None and assigned later.
+            similarity_threshold: A search hit counts as a duplicate when its
+                converted similarity (``1 / (1 + distance)``) is strictly
+                greater than this value.
+        """
+        self.vector_db = vector_db
+        self.similarity_threshold = similarity_threshold
+
+    def _stored_articles(self) -> List[Dict]:
+        """Return the articles held by the vector store, or [] if none is attached."""
+        if self.vector_db is None:
+            return []
+        return self.vector_db.articles or []
+
+    def check_similarity_duplicate(self, article: Dict, k: int = 5) -> bool:
+        """
+        Check if article is a duplicate using similarity search
+
+        Args:
+            article: Article to check for duplicates; its 'title' and
+                'content' values (missing ones are treated as empty) form
+                the search text.
+            k: Number of nearest stored articles to inspect.
+
+        Returns:
+            True if duplicate found, False otherwise
+        """
+        if not self._stored_articles():
+            return False
+
+        # Create search text from title and content. Missing fields must not
+        # raise: the caller wraps a whole feed in one try/except, so a single
+        # incomplete entry would otherwise discard every article of that feed.
+        title = article.get('title') or ''
+        content = article.get('content') or ''
+        search_text = f"{title} {content}"
+        if not search_text.strip():
+            return False
+
+        query_embedding = get_query_embedding(search_text)
+
+        # Explicit None/length test instead of truthiness: `not embedding`
+        # raises ValueError when the embedding is a multi-element NumPy array.
+        # NOTE(review): assumes the embedding is None or a sized sequence.
+        if query_embedding is None or len(query_embedding) == 0:
+            return False
+
+        # Search for similar articles
+        for hit in self.vector_db.search(query_embedding, k=k):
+            distance = hit.get('similarity_score')
+            if distance is None:
+                # No score reported: do not treat it as distance 0, which
+                # would convert to similarity 1.0 and flag a false duplicate.
+                continue
+
+            # Convert distance to similarity (FAISS returns L2 distance).
+            # Clamp at 0 so rounding noise or an unexpected negative value
+            # cannot produce a division by zero.
+            similarity = 1 / (1 + max(float(distance), 0.0))
+
+            if similarity > self.similarity_threshold:
+                return True
+
+        return False
+
+    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
+        """
+        Check if titles are duplicates using LLM comparison
+
+        Args:
+            new_title: New article title
+            existing_titles: List of existing article titles
+
+        Returns:
+            True if duplicate found, False otherwise. Also False when either
+            input is empty or when the LLM call fails (the error is printed).
+        """
+        if not new_title or not existing_titles:
+            return False
+
+        normalized_new = new_title.strip().casefold()
+        if not normalized_new:
+            return False
+
+        # An identical title (ignoring case and surrounding whitespace) is a
+        # duplicate without asking the model; skip the API round-trip.
+        if any(
+            normalized_new == title.strip().casefold()
+            for title in existing_titles
+            if title
+        ):
+            return True
+
+        try:
+            # Create prompt for LLM comparison
+            titles_text = "\n".join([f"- {title}" for title in existing_titles])
+
+            response = groq_client.chat.completions.create(
+                model=Config.GROQ_MODEL,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
+                    },
+                    {
+                        "role": "user",
+                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
+                    }
+                ],
+                max_tokens=10,
+                temperature=0.1
+            )
+
+            # The message content can be absent; treat that as "no verdict".
+            verdict = (response.choices[0].message.content or "").strip().upper()
+            return "DUPLICATE" in verdict
+
+        except Exception as e:
+            # Best-effort check: a failed LLM call must not block ingestion,
+            # so report the error and fall back to "not a duplicate".
+            print(f"Error checking LLM duplicate: {str(e)}")
+            return False
+
+    def is_duplicate_article(self, article: Dict) -> bool:
+        """
+        Check if article is duplicate using both similarity and LLM methods
+
+        Args:
+            article: Article to check
+
+        Returns:
+            True if duplicate, False otherwise
+        """
+        # First check similarity
+        if self.check_similarity_duplicate(article):
+            return True
+
+        # Then check with LLM, comparing against every stored title.
+        # Stored articles without a title are left out of the comparison.
+        existing_titles = [
+            stored.get('title')
+            for stored in self._stored_articles()
+            if stored.get('title')
+        ]
+        return self.check_llm_duplicate(article.get('title') or '', existing_titles)
+
+
+# Initialize news fetcher instance
+# NOTE: vector_db is None at this point; fetch_all_news() assigns the store
+# imported from .recommender to news_fetcher.vector_db before filtering.
+news_fetcher = NewsFetcher(None, similarity_threshold=0.8)


 def fetch_rss_news(feed_url):
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):


 def fetch_all_news():
-    """Fetch news from all RSS feeds"""
+    """Fetch news from all RSS feeds with duplicate detection"""
    all_articles = []
+    
+    # Set the vector_db instance for news_fetcher
+    from .recommender import vector_db
+    news_fetcher.vector_db = vector_db

    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)
-            all_articles.extend(articles)
+            
+            # Filter out duplicates
+            unique_articles = []
+            for article in articles:
+                if not news_fetcher.is_duplicate_article(article):
+                    unique_articles.append(article)
+                else:
+                    print(f"Skipping duplicate article: {article['title']}")
+            
+            all_articles.extend(unique_articles)
+            
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")

@@ -63,4 +195,4 @@ def save_processed_news(articles):
    with open(filename, 'w') as f:
        json.dump(articles, f, indent=2)

-    return filename
+    return filename