Files

198 lines
6.4 KiB
Python
Raw Permalink Normal View History

2025-07-24 16:35:04 +01:00
# newsfetcher.py: RSS news fetching with similarity-search and LLM-based duplicate detection
2025-07-07 22:08:02 +01:00
import feedparser
2025-07-08 19:57:35 +01:00
import json
import os
from datetime import datetime
2025-07-24 16:35:04 +01:00
from typing import List, Dict, Optional
2025-07-08 19:57:35 +01:00
from .config import Config
2025-07-24 16:35:04 +01:00
from .embeddings import get_query_embedding
from .vector_store import VectorDB
import groq
import numpy as np
# Module-level Groq client, created once at import time from Config.GROQ_API_KEY.
# NewsFetcher.check_llm_duplicate uses it for the LLM-based duplicate check.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
class NewsFetcher:
    """News fetcher with duplicate detection capabilities.

    Two independent checks are combined: an embedding similarity search
    against the articles already held by ``vector_db``, and an LLM comparison
    of the new title against the stored titles.
    """

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store exposing ``articles`` and ``search(embedding, k=...)``.
                May be None until a store is attached; while it is None every
                check reports "not a duplicate".
            similarity_threshold: Minimum similarity (in ``(0, 1]``, derived
                from the search distance) above which an article counts as
                a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _existing_articles(self):
        """Return the stored articles, or an empty list when no store is attached."""
        if self.vector_db is None:
            return []
        return self.vector_db.articles or []

    @staticmethod
    def _is_duplicate_verdict(reply: Optional[str]) -> bool:
        """Interpret the model's short reply as a duplicate / not-duplicate verdict.

        A plain substring test for "DUPLICATE" would also accept negated
        answers such as "NOT DUPLICATE" or "NON-DUPLICATE", so negations and
        replies that also mention "UNIQUE" are rejected.
        """
        text = (reply or "").upper()
        if "DUPLICATE" not in text:
            return False
        words = text.replace("-", " ").split()
        return "UNIQUE" not in text and "NOT" not in words and "NON" not in words

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search
        Args:
            article: Article to check for duplicates; must contain 'title',
                'content' is optional
        Returns:
            True if duplicate found, False otherwise
        """
        if not self._existing_articles():
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article.get('content', '')}"
        query_embedding = get_query_embedding(search_text)
        # Explicit None/length test: ``not embedding`` raises ValueError when
        # the embedding is a multi-element numpy array.
        if query_embedding is None or len(query_embedding) == 0:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5)

        # Check if any similar article exceeds threshold
        for similar_article in similar_articles:
            # Despite its name, 'similarity_score' is treated as a distance
            # (smaller means closer); presumably the L2 distance from FAISS.
            distance = similar_article.get('similarity_score')
            if distance is None:
                # A hit without a score must not be read as distance 0,
                # which would make it look like an exact match.
                continue
            # Convert distance to similarity in (0, 1]
            similarity = 1 / (1 + distance)
            if similarity > self.similarity_threshold:
                return True
        return False

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison
        Args:
            new_title: New article title
            existing_titles: List of existing article titles
        Returns:
            True if duplicate found, False otherwise (also False when the
            LLM call fails, so an API error never blocks an article)
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join([f"- {title}" for title in existing_titles])
            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )
            # The message content can be None; treat that as "no verdict".
            return self._is_duplicate_verdict(response.choices[0].message.content)
        except Exception as e:
            # Best effort: report the failure and let the article through.
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods
        Args:
            article: Article to check
        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity
        if self.check_similarity_duplicate(article):
            return True

        # Then check with LLM.
        # NOTE(review): every stored title is sent, so the prompt grows with
        # the size of the store.
        existing_titles = [art['title'] for art in self._existing_articles()]
        return self.check_llm_duplicate(article['title'], existing_titles)
# Module-level NewsFetcher instance. It starts without a vector store (None);
# fetch_all_news() assigns the real store to news_fetcher.vector_db before use.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
2025-07-08 19:57:35 +01:00
2025-07-07 22:08:02 +01:00
2025-07-08 19:57:35 +01:00
def fetch_rss_news(feed_url):
    """Fetch news from RSS feed.

    Args:
        feed_url: URL of the RSS feed, passed to ``feedparser.parse``.

    Returns:
        A list of article dicts with the keys 'title', 'content', 'date',
        'slug', 'categories', 'tags', 'url' and 'source'. Entries without a
        title are skipped.
    """
    feed = feedparser.parse(feed_url)
    articles = []

    for entry in feed.entries:
        # Read the title defensively like every other field: one entry
        # without a title must not abort the whole feed.
        title = getattr(entry, 'title', '')
        if not title:
            continue

        article = {
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            # Slug: lower-cased title, spaces become hyphens, commas and
            # periods are dropped.
            "slug": title.lower().replace(" ", "-").replace(",", "").replace(".", ""),
            # Categories and tags are the same fixed values for every article.
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url
        }
        articles.append(article)
    return articles
def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection.

    Iterates over ``Config.RSS_FEEDS``, fetches each feed and keeps only the
    articles that ``news_fetcher`` does not report as duplicates. A feed that
    fails to fetch is reported and skipped.

    Returns:
        List of unique article dicts collected from all feeds.
    """
    all_articles = []

    # Set the vector_db instance for news_fetcher.
    # The import is kept inside the function as in the original code
    # (presumably to avoid a circular import with .recommender; confirm).
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db

    # Titles accepted during this run. The vector store is only read here,
    # so without this the same headline arriving from two feeds in one run
    # would be accepted twice.
    seen_titles = set()

    for feed_url in Config.RSS_FEEDS:
        # Keep the try body limited to the fetch, so a later failure cannot
        # discard articles that were already accepted from this feed.
        try:
            articles = fetch_rss_news(feed_url)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
            continue

        # Filter out duplicates
        for article in articles:
            title_key = article['title'].strip().casefold()
            if title_key in seen_titles:
                print(f"Skipping duplicate article: {article['title']}")
                continue

            try:
                duplicate = news_fetcher.is_duplicate_article(article)
            except Exception as e:
                # Best effort, like the LLM check itself: a failing check
                # lets the article through instead of dropping it.
                print(f"Error checking duplicate for {article['title']}: {str(e)}")
                duplicate = False

            if duplicate:
                print(f"Skipping duplicate article: {article['title']}")
                continue

            seen_titles.add(title_key)
            all_articles.append(article)
    return all_articles
def save_raw_news(articles):
    """Save raw news articles to file.

    Writes ``articles`` as indented JSON to a timestamped file
    ``news_<YYYYmmdd_HHMMSS>.json`` inside ``Config.RAW_NEWS_PATH``, creating
    the directory if needed.

    Args:
        articles: JSON-serializable list of article dicts.

    Returns:
        Path of the file that was written.
    """
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join gives the same path as before when RAW_NEWS_PATH ends with
    # a separator, and still puts the file inside the directory when it does not.
    filename = os.path.join(Config.RAW_NEWS_PATH, f"news_{timestamp}.json")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)
    return filename
def save_processed_news(articles):
    """Save processed news articles to file.

    Writes ``articles`` as indented JSON to a timestamped file
    ``processed_news_<YYYYmmdd_HHMMSS>.json`` inside
    ``Config.PROCESSED_NEWS_PATH``, creating the directory if needed.

    Args:
        articles: JSON-serializable list of article dicts.

    Returns:
        Path of the file that was written.
    """
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join gives the same path as before when PROCESSED_NEWS_PATH ends
    # with a separator, and still puts the file inside the directory when it
    # does not.
    filename = os.path.join(Config.PROCESSED_NEWS_PATH, f"processed_news_{timestamp}.json")
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)
    return filename