Files
2025-07-24 16:35:04 +01:00

198 lines
6.4 KiB
Python

# Updated newsfetcher.py with similarity search and LLM duplicate detection
import feedparser
import json
import os
from datetime import datetime
from typing import List, Dict, Optional
from .config import Config
from .embeddings import get_query_embedding
from .vector_store import VectorDB
import groq
import numpy as np
# Module-level Groq client, created once at import time with the API key from
# Config. NewsFetcher.check_llm_duplicate uses it for the title comparison.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
class NewsFetcher:
    """Detect duplicate news articles against the articles held in a vector store.

    Two checks are combined: an embedding similarity search against the
    store, and an LLM comparison of the new title with the stored titles.
    """

    def __init__(self, vector_db: "Optional[VectorDB]", similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store exposing ``articles`` and ``search(embedding, k=...)``.
                May be None until a store is attached; while it is None the
                duplicate checks report "not a duplicate" instead of raising.
            similarity_threshold: A search hit counts as a duplicate when its
                similarity, computed as ``1 / (1 + distance)``, is strictly
                greater than this value.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _stored_articles(self) -> List[Dict]:
        """Return the store's articles, or an empty list when there is no store."""
        if self.vector_db is None:
            return []
        return self.vector_db.articles or []

    @staticmethod
    def _is_missing_embedding(embedding) -> bool:
        """Return True when ``embedding`` is None or empty.

        Uses an explicit length check rather than truthiness, because the
        truth value of a multi-element array is ambiguous and raises.
        """
        if embedding is None:
            return True
        try:
            return len(embedding) == 0
        except TypeError:
            # Unsized value (e.g. a scalar): fall back to plain truthiness.
            return not embedding

    def _is_similar_match(self, hit: Dict) -> bool:
        """Return True when a search hit is closer than the similarity threshold.

        The hit's ``similarity_score`` is treated as a distance (the search
        backend is expected to return an L2 distance) and mapped to a
        similarity with ``1 / (1 + distance)``.
        """
        distance = hit.get('similarity_score')
        if distance is None:
            # A hit without a score carries no evidence; do not treat it as
            # a perfect match.
            return False
        # Clamp negatives (rounding noise) so the mapping stays within (0, 1]
        # and can never divide by zero.
        similarity = 1 / (1 + max(distance, 0.0))
        return bool(similarity > self.similarity_threshold)

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search
        Args:
            article: Article to check for duplicates; must contain the keys
                'title' and 'content'
        Returns:
            True if duplicate found, False otherwise (including when the
            store is empty or missing, or no embedding could be produced)
        """
        if not self._stored_articles():
            return False
        # Create search text from title and content
        search_text = f"{article['title']} {article['content']}"
        query_embedding = get_query_embedding(search_text)
        if self._is_missing_embedding(query_embedding):
            return False
        # Search for similar articles and test each hit against the threshold
        similar_articles = self.vector_db.search(query_embedding, k=5)
        return any(self._is_similar_match(hit) for hit in similar_articles)

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison
        Args:
            new_title: New article title
            existing_titles: List of existing article titles
        Returns:
            True if the model's answer contains 'DUPLICATE', False otherwise.
            Any error while calling the model is printed and treated as
            "not a duplicate" (best effort).
        """
        if not existing_titles:
            return False
        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join(f"- {title}" for title in existing_titles)
            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )
            # The message content can be None; treat that as an empty answer.
            content = response.choices[0].message.content or ""
            result = content.strip().upper()
            return "DUPLICATE" in result
        except Exception as e:
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods
        Args:
            article: Article to check; must contain 'title' and 'content'
        Returns:
            True if duplicate, False otherwise
        """
        # Similarity search first; the LLM is only consulted when it finds nothing.
        if self.check_similarity_duplicate(article):
            return True
        existing_titles = [art['title'] for art in self._stored_articles()]
        return self.check_llm_duplicate(article['title'], existing_titles)
# Shared module-level fetcher. It is created without a vector store (None);
# fetch_all_news() assigns news_fetcher.vector_db before running duplicate checks.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
def fetch_rss_news(feed_url):
    """Fetch one RSS feed and convert its entries into article dicts.

    Args:
        feed_url: Feed location passed to ``feedparser.parse``; also stored
            as each article's ``source``.

    Returns:
        A list of dicts with the keys title, content, date, slug,
        categories, tags, url and source. Entries without a title are
        skipped; a missing summary, published date or link becomes ''.
    """
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        # Feed elements are optional; an entry without a title would raise
        # on attribute access and abort the whole feed, so skip it instead.
        title = getattr(entry, 'title', '')
        if not title:
            continue
        articles.append({
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            # Slug: lower-cased title, spaces to hyphens, commas/periods removed.
            "slug": title.lower().replace(" ", "-").replace(",", "").replace(".", ""),
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url,
        })
    return articles
def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection.

    Iterates over ``Config.RSS_FEEDS``, drops every article that
    ``news_fetcher.is_duplicate_article`` flags, and returns the rest.

    Returns:
        A list of the non-duplicate article dicts from all feeds. A failure
        while processing a feed is printed and ends that feed, but articles
        already accepted from it are kept.
    """
    all_articles = []
    # Imported here rather than at module top, then bound to the shared
    # fetcher, which is created without a vector store.
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db
    for feed_url in Config.RSS_FEEDS:
        try:
            for article in fetch_rss_news(feed_url):
                if news_fetcher.is_duplicate_article(article):
                    print(f"Skipping duplicate article: {article['title']}")
                else:
                    # Append immediately so a later failure in this feed
                    # does not discard articles that already passed.
                    all_articles.append(article)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
    return all_articles
def save_raw_news(articles):
    """Save raw news articles to a timestamped JSON file.

    Args:
        articles: JSON-serializable collection of articles.

    Returns:
        Path of the written file, ``news_<YYYYmmdd_HHMMSS>.json`` inside
        ``Config.RAW_NEWS_PATH`` (the directory is created if missing).
    """
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory whether or not the
    # configured path ends with a separator.
    filename = os.path.join(Config.RAW_NEWS_PATH, f"news_{timestamp}.json")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)
    return filename
def save_processed_news(articles):
    """Save processed news articles to a timestamped JSON file.

    Args:
        articles: JSON-serializable collection of processed articles.

    Returns:
        Path of the written file, ``processed_news_<YYYYmmdd_HHMMSS>.json``
        inside ``Config.PROCESSED_NEWS_PATH`` (the directory is created if
        missing).
    """
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory whether or not the
    # configured path ends with a separator.
    filename = os.path.join(
        Config.PROCESSED_NEWS_PATH, f"processed_news_{timestamp}.json"
    )
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)
    return filename