update recommender and news_fetcher
This commit is contained in:
+135
-3
@@ -1,8 +1,126 @@
|
||||
# Updated newsfetcher.py with similarity search and LLM duplicate detection
|
||||
|
||||
import feedparser
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
from .config import Config
|
||||
from .embeddings import get_query_embedding
|
||||
from .vector_store import VectorDB
|
||||
import groq
|
||||
import numpy as np
|
||||
|
||||
# Initialize Groq client for duplicate detection
|
||||
# NOTE(review): the client is built when this module is imported and reads
# Config.GROQ_API_KEY at that moment — confirm the key is always configured
# before import, or consider constructing the client lazily.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
||||
|
||||
|
||||
class NewsFetcher:
    """Detect duplicate news articles via vector similarity and an LLM title check.

    An article is first compared against the vector store; only when no
    sufficiently similar stored article is found is the (slower, remote) LLM
    title comparison consulted.
    """

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store exposing ``articles`` and ``search(embedding, k=...)``.
                May be ``None`` and assigned later; until then every similarity
                check reports "not a duplicate".
            similarity_threshold: Similarity (in ``(0, 1]``) above which a
                stored article counts as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _is_similar_hit(self, hit: Dict) -> bool:
        """Return True if a single search hit is close enough to be a duplicate.

        The hit's ``similarity_score`` is treated as a distance (smaller means
        closer) and mapped to a similarity via ``1 / (1 + distance)``.
        """
        distance = hit.get('similarity_score')
        if distance is None:
            # A hit without a score must not default to distance 0, which
            # would be read as an exact match.
            return False
        # Clamp rounding noise (tiny negative distances) so the conversion
        # never divides by zero or yields a similarity above 1.
        distance = max(distance, 0.0)
        return 1 / (1 + distance) > self.similarity_threshold

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its ``title`` and
                ``content`` are embedded together.

        Returns:
            True if duplicate found, False otherwise
        """
        if self.vector_db is None or not self.vector_db.articles:
            return False

        # Create search text from title and content
        search_text = f"{article.get('title', '')} {article.get('content', '')}"
        if not search_text.strip():
            return False

        query_embedding = get_query_embedding(search_text)
        # Explicit None/length test: plain truthiness raises ValueError if the
        # embedding is a multi-element NumPy array.
        if query_embedding is None or len(query_embedding) == 0:
            return False

        # Search for similar articles and check whether any exceeds the threshold
        similar_articles = self.vector_db.search(query_embedding, k=5) or []
        return any(self._is_similar_hit(hit) for hit in similar_articles)

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if duplicate found, False otherwise (including when the
            LLM call fails, so a service error never drops an article).
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            # NOTE(review): every existing title is sent on each call, so the
            # prompt grows with the store — consider bounding it.
            titles_text = "\n".join(f"- {title}" for title in existing_titles)

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    },
                ],
                max_tokens=10,
                temperature=0.1,
            )

            # The message content may be None; treat that as "no verdict".
            result = (response.choices[0].message.content or "").strip().upper()
            return "DUPLICATE" in result

        except Exception as e:
            # Best-effort check at a remote-service boundary: report and
            # fall back to "not a duplicate".
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity (local and cheap)
        if self.check_similarity_duplicate(article):
            return True

        # Then check with LLM, comparing against every stored title
        stored = self.vector_db.articles if self.vector_db is not None else []
        existing_titles = [art['title'] for art in stored if art.get('title')]
        return self.check_llm_duplicate(article.get('title', ''), existing_titles)
|
||||
|
||||
|
||||
# Initialize news fetcher instance
|
||||
# The vector store is passed as None here; fetch_all_news assigns
# news_fetcher.vector_db before it runs any duplicate checks.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
|
||||
|
||||
|
||||
def fetch_rss_news(feed_url):
|
||||
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):
|
||||
|
||||
|
||||
def fetch_all_news():
|
||||
"""Fetch news from all RSS feeds"""
|
||||
"""Fetch news from all RSS feeds with duplicate detection"""
|
||||
all_articles = []
|
||||
|
||||
# Set the vector_db instance for news_fetcher
|
||||
from .recommender import vector_db
|
||||
news_fetcher.vector_db = vector_db
|
||||
|
||||
for feed_url in Config.RSS_FEEDS:
|
||||
try:
|
||||
articles = fetch_rss_news(feed_url)
|
||||
all_articles.extend(articles)
|
||||
|
||||
# Filter out duplicates
|
||||
unique_articles = []
|
||||
for article in articles:
|
||||
if not news_fetcher.is_duplicate_article(article):
|
||||
unique_articles.append(article)
|
||||
else:
|
||||
print(f"Skipping duplicate article: {article['title']}")
|
||||
|
||||
all_articles.extend(unique_articles)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error fetching from {feed_url}: {str(e)}")
|
||||
|
||||
@@ -63,4 +195,4 @@ def save_processed_news(articles):
|
||||
with open(filename, 'w') as f:
|
||||
json.dump(articles, f, indent=2)
|
||||
|
||||
return filename
|
||||
return filename
|
||||
Reference in New Issue
Block a user