198 lines
6.4 KiB
Python
198 lines
6.4 KiB
Python
# Updated newsfetcher.py with similarity search and LLM duplicate detection
|
|
|
|
import feedparser
|
|
import json
|
|
import os
|
|
from datetime import datetime
|
|
from typing import List, Dict, Optional
|
|
from .config import Config
|
|
from .embeddings import get_query_embedding
|
|
from .vector_store import VectorDB
|
|
import groq
|
|
import numpy as np
|
|
|
|
# Initialize Groq client for duplicate detection
|
|
# Module-level Groq client, constructed at import time from
# Config.GROQ_API_KEY. NewsFetcher.check_llm_duplicate sends its chat
# completion requests through this object.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
|
|
|
|
|
class NewsFetcher:
    """Detect duplicate news articles.

    Two checks are combined: an embedding similarity search against the
    attached vector store, and an LLM comparison of the new title against
    the titles already held by that store.
    """

    # System prompt for the LLM title comparison. The reply is expected to
    # contain either 'DUPLICATE' or 'UNIQUE'.
    _SYSTEM_PROMPT = (
        "You are a duplicate detection system. "
        "Compare the new article title with existing titles and respond with "
        "'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's "
        "a different story. Consider different phrasings, synonyms, and "
        "variations of the same story as duplicates."
    )

    def __init__(
        self,
        vector_db: Optional["VectorDB"],
        similarity_threshold: float = 0.8,
    ):
        """
        Args:
            vector_db: Store exposing ``articles`` and ``search(embedding, k)``.
                May be None; every check then reports "not a duplicate"
                until a store is assigned to ``self.vector_db``.
            similarity_threshold: A search hit whose converted similarity is
                strictly greater than this value counts as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _known_articles(self) -> List[Dict]:
        """Return the store's articles, or an empty list when there are none."""
        if self.vector_db is None:
            return []
        articles = self.vector_db.articles
        return articles if articles else []

    def _has_close_match(self, candidates: List[Dict]) -> bool:
        """Return True if any search hit is more similar than the threshold.

        Each hit's ``similarity_score`` is treated as a distance (the
        original code comment says FAISS returns L2 distance here -- not
        verifiable from this file) and mapped to ``1 / (1 + distance)``.
        """
        for candidate in candidates:
            distance = candidate.get('similarity_score')
            if distance is None:
                # A hit without a score carries no evidence. Defaulting it
                # to 0 would make it look like a perfect match.
                continue
            # Clamp so a slightly negative value cannot hit a zero divisor.
            similarity = 1 / (1 + max(distance, 0))
            if similarity > self.similarity_threshold:
                return True
        return False

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its 'title' and
                'content' values (empty string when absent) form the query.

        Returns:
            True if duplicate found, False otherwise. Also False when no
            store is attached, the store is empty, or no embedding could
            be produced.
        """
        if not self._known_articles():
            return False

        # Create search text from title and content
        search_text = f"{article.get('title', '')} {article.get('content', '')}"
        query_embedding = get_query_embedding(search_text)

        # Explicit emptiness test: plain truthiness raises ValueError on a
        # multi-element numpy array.
        if query_embedding is None or len(query_embedding) == 0:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5)
        return self._has_close_match(similar_articles or [])

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if the model's reply contains 'DUPLICATE', False otherwise.
            Any error during the request is printed and reported as False.
        """
        if not new_title or not existing_titles:
            return False

        try:
            # NOTE(review): every existing title is sent on each call, so
            # the prompt grows with the store -- consider limiting it to
            # the nearest search hits.
            titles_text = "\n".join(f"- {title}" for title in existing_titles)

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {"role": "system", "content": self._SYSTEM_PROMPT},
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}",
                    },
                ],
                max_tokens=10,
                temperature=0.1,
            )

            content = response.choices[0].message.content
            if not content:
                # An empty or missing reply is treated as "not a duplicate".
                return False
            return "DUPLICATE" in content.strip().upper()

        except Exception as e:
            # Best effort: a failed LLM call must not block ingestion.
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        known_articles = self._known_articles()
        if not known_articles:
            # Nothing to compare against (no store attached, or it is empty).
            return False

        # First check similarity
        if self.check_similarity_duplicate(article):
            return True

        # Then check with LLM, ignoring stored records that have no title.
        existing_titles = [
            title
            for title in (known.get('title') for known in known_articles)
            if title
        ]
        return self.check_llm_duplicate(article.get('title', ''), existing_titles)
|
|
|
|
|
|
# Initialize news fetcher instance
|
|
# Shared module-level fetcher. It is created without a vector store (None);
# fetch_all_news() assigns news_fetcher.vector_db before running any checks.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
|
|
|
|
|
|
def fetch_rss_news(feed_url):
    """Fetch news from RSS feed

    Args:
        feed_url: URL handed to ``feedparser.parse``; it is also recorded as
            each article's ``source``.

    Returns:
        A list of article dicts with keys ``title``, ``content``, ``date``,
        ``slug``, ``categories``, ``tags``, ``url`` and ``source``. Entries
        without a title are skipped.
    """
    feed = feedparser.parse(feed_url)

    # Single-pass equivalent of
    # .replace(" ", "-").replace(",", "").replace(".", "").
    slug_table = str.maketrans({" ": "-", ",": None, ".": None})

    articles = []
    for entry in feed.entries:
        title = getattr(entry, 'title', '')
        if not title:
            # The title drives both the slug and duplicate detection. One
            # untitled entry should not raise and cost the caller the
            # whole feed.
            continue

        articles.append({
            "title": title,
            "content": getattr(entry, 'summary', ''),
            "date": getattr(entry, 'published', ''),
            "slug": title.lower().translate(slug_table),
            # Categories and tags are fixed values, not read from the feed.
            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url,
        })

    return articles
|
|
|
|
|
|
def fetch_all_news():
    """Fetch news from all RSS feeds with duplicate detection

    Each article from every feed in ``Config.RSS_FEEDS`` is kept only if
    ``news_fetcher.is_duplicate_article`` does not flag it and no article
    with the same title (ignoring case and surrounding whitespace) was
    already accepted earlier in this run.

    Returns:
        The list of accepted article dicts, in feed order. A feed that
        raises is reported via ``print`` and contributes nothing.
    """
    # Set the vector_db instance for news_fetcher
    # NOTE(review): imported inside the function, presumably to avoid a
    # circular import with .recommender -- confirm before moving it.
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db

    all_articles = []
    # Titles accepted so far in this run. The vector store is not updated
    # here, so it cannot catch the same story arriving from two feeds.
    seen_titles = set()

    for feed_url in Config.RSS_FEEDS:
        try:
            unique_articles = []
            # Kept per feed and merged only on success, so a feed that
            # fails midway leaves no phantom "seen" titles behind.
            feed_titles = set()

            for article in fetch_rss_news(feed_url):
                title_key = article['title'].strip().casefold()
                already_seen = title_key in seen_titles or title_key in feed_titles
                if already_seen or news_fetcher.is_duplicate_article(article):
                    print(f"Skipping duplicate article: {article['title']}")
                    continue
                feed_titles.add(title_key)
                unique_articles.append(article)

            all_articles.extend(unique_articles)
            seen_titles |= feed_titles

        except Exception as e:
            # Best effort: one broken feed must not stop the others.
            print(f"Error fetching from {feed_url}: {str(e)}")

    return all_articles
|
|
|
|
|
|
def save_raw_news(articles):
    """Save raw news articles to file

    Args:
        articles: JSON-serializable collection of article dicts.

    Returns:
        Path of the written file, ``news_<YYYYmmdd_HHMMSS>.json`` inside
        ``Config.RAW_NEWS_PATH`` (created if missing).
    """
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory created above even
    # when RAW_NEWS_PATH has no trailing separator. With one, the result
    # is unchanged.
    filename = os.path.join(Config.RAW_NEWS_PATH, f"news_{timestamp}.json")

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename
|
|
|
|
|
|
def save_processed_news(articles):
    """Save processed news articles to file

    Args:
        articles: JSON-serializable collection of processed article dicts.

    Returns:
        Path of the written file, ``processed_news_<YYYYmmdd_HHMMSS>.json``
        inside ``Config.PROCESSED_NEWS_PATH`` (created if missing).
    """
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # os.path.join keeps the file inside the directory created above even
    # when PROCESSED_NEWS_PATH has no trailing separator. With one, the
    # result is unchanged.
    filename = os.path.join(
        Config.PROCESSED_NEWS_PATH, f"processed_news_{timestamp}.json"
    )

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(articles, f, indent=2)

    return filename