2025-07-24 16:35:04 +01:00
# Updated newsfetcher.py with similarity search and LLM duplicate detection
2025-07-07 22:08:02 +01:00
import feedparser
2025-07-08 19:57:35 +01:00
import json
import os
from datetime import datetime
2025-07-24 16:35:04 +01:00
from typing import List , Dict , Optional
2025-07-08 19:57:35 +01:00
from . config import Config
2025-07-24 16:35:04 +01:00
from . embeddings import get_query_embedding
from . vector_store import VectorDB
import groq
import numpy as np
# Initialize Groq client for duplicate detection
groq_client = groq . Groq ( api_key = Config . GROQ_API_KEY )
class NewsFetcher :
""" News fetcher with duplicate detection capabilities """
def __init__ ( self , vector_db : VectorDB , similarity_threshold : float = 0.8 ) :
self . vector_db = vector_db
self . similarity_threshold = similarity_threshold
def check_similarity_duplicate ( self , article : Dict ) - > bool :
"""
Check if article is a duplicate using similarity search
Args:
article: Article to check for duplicates
Returns:
True if duplicate found, False otherwise
"""
if not self . vector_db . articles :
return False
# Create search text from title and content
search_text = f " { article [ ' title ' ] } { article [ ' content ' ] } "
query_embedding = get_query_embedding ( search_text )
if not query_embedding :
return False
# Search for similar articles
similar_articles = self . vector_db . search ( query_embedding , k = 5 )
# Check if any similar article exceeds threshold
for similar_article in similar_articles :
similarity_score = similar_article . get ( ' similarity_score ' , 0 )
# Convert distance to similarity (FAISS returns L2 distance)
similarity = 1 / ( 1 + similarity_score )
if similarity > self . similarity_threshold :
return True
return False
def check_llm_duplicate ( self , new_title : str , existing_titles : List [ str ] ) - > bool :
"""
Check if titles are duplicates using LLM comparison
Args:
new_title: New article title
existing_titles: List of existing article titles
Returns:
True if duplicate found, False otherwise
"""
if not existing_titles :
return False
try :
# Create prompt for LLM comparison
titles_text = " \n " . join ( [ f " - { title } " for title in existing_titles ] )
response = groq_client . chat . completions . create (
model = Config . GROQ_MODEL ,
messages = [
{
" role " : " system " ,
" content " : " You are a duplicate detection system. Compare the new article title with existing titles and respond with ' DUPLICATE ' if they refer to the same news story, or ' UNIQUE ' if it ' s a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates. "
} ,
{
" role " : " user " ,
" content " : f " New title: { new_title } \n \n Existing titles: \n { titles_text } "
}
] ,
max_tokens = 10 ,
temperature = 0.1
)
result = response . choices [ 0 ] . message . content . strip ( ) . upper ( )
return " DUPLICATE " in result
except Exception as e :
print ( f " Error checking LLM duplicate: { str ( e ) } " )
return False
def is_duplicate_article ( self , article : Dict ) - > bool :
"""
Check if article is duplicate using both similarity and LLM methods
Args:
article: Article to check
Returns:
True if duplicate, False otherwise
"""
# First check similarity
if self . check_similarity_duplicate ( article ) :
return True
# Then check with LLM
existing_titles = [ art [ ' title ' ] for art in self . vector_db . articles ]
if self . check_llm_duplicate ( article [ ' title ' ] , existing_titles ) :
return True
return False
# Initialize news fetcher instance
news_fetcher = NewsFetcher ( None , similarity_threshold = 0.8 )
2025-07-08 19:57:35 +01:00
2025-07-07 22:08:02 +01:00
2025-07-08 19:57:35 +01:00
def fetch_rss_news ( feed_url ) :
""" Fetch news from RSS feed """
feed = feedparser . parse ( feed_url )
2025-07-07 22:08:02 +01:00
articles = [ ]
2025-07-08 19:57:35 +01:00
2025-07-07 22:08:02 +01:00
for entry in feed . entries :
2025-07-08 19:57:35 +01:00
article = {
2025-07-07 22:08:02 +01:00
" title " : entry . title ,
2025-07-08 19:57:35 +01:00
" content " : getattr ( entry , ' summary ' , ' ' ) ,
" date " : getattr ( entry , ' published ' , ' ' ) ,
" slug " : entry . title . lower ( ) . replace ( " " , " - " ) . replace ( " , " , " " ) . replace ( " . " , " " ) ,
" categories " : [ " Technology " , " AI and Innovation " ] ,
" tags " : [ " AI " , " Technology " , " Innovation " ] ,
" url " : getattr ( entry , ' link ' , ' ' ) ,
" source " : feed_url
}
articles . append ( article )
return articles
def fetch_all_news ( ) :
2025-07-24 16:35:04 +01:00
""" Fetch news from all RSS feeds with duplicate detection """
2025-07-08 19:57:35 +01:00
all_articles = [ ]
2025-07-24 16:35:04 +01:00
# Set the vector_db instance for news_fetcher
from . recommender import vector_db
news_fetcher . vector_db = vector_db
2025-07-08 19:57:35 +01:00
for feed_url in Config . RSS_FEEDS :
try :
articles = fetch_rss_news ( feed_url )
2025-07-24 16:35:04 +01:00
# Filter out duplicates
unique_articles = [ ]
for article in articles :
if not news_fetcher . is_duplicate_article ( article ) :
unique_articles . append ( article )
else :
print ( f " Skipping duplicate article: { article [ ' title ' ] } " )
all_articles . extend ( unique_articles )
2025-07-08 19:57:35 +01:00
except Exception as e :
print ( f " Error fetching from { feed_url } : { str ( e ) } " )
return all_articles
def save_raw_news ( articles ) :
""" Save raw news articles to file """
os . makedirs ( Config . RAW_NEWS_PATH , exist_ok = True )
timestamp = datetime . now ( ) . strftime ( " % Y % m %d _ % H % M % S " )
filename = f " { Config . RAW_NEWS_PATH } news_ { timestamp } .json "
with open ( filename , ' w ' ) as f :
json . dump ( articles , f , indent = 2 )
return filename
def save_processed_news ( articles ) :
""" Save processed news articles to file """
os . makedirs ( Config . PROCESSED_NEWS_PATH , exist_ok = True )
timestamp = datetime . now ( ) . strftime ( " % Y % m %d _ % H % M % S " )
filename = f " { Config . PROCESSED_NEWS_PATH } processed_news_ { timestamp } .json "
with open ( filename , ' w ' ) as f :
json . dump ( articles , f , indent = 2 )
2025-07-24 16:35:04 +01:00
return filename