From 8d2a277afebfa5da5dba399b479a6c372099fd06 Mon Sep 17 00:00:00 2001 From: Ayomide Date: Thu, 24 Jul 2025 16:35:04 +0100 Subject: [PATCH] update recommender and news_fetcher --- .gitignore | 112 ++++++++++++++++++++++++++++++++ backend/main.py | 28 ++++++-- backend/news_fetcher.py | 138 +++++++++++++++++++++++++++++++++++++++- backend/recommender.py | 88 ++++++++++++++++--------- 4 files changed, 330 insertions(+), 36 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fe1f278 --- /dev/null +++ b/.gitignore @@ -0,0 +1,112 @@ +# .gitignore for DS Task AI News Project + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Jupyter Notebook +.ipynb_checkpoints + +### Environment ### +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Virtual environment +pythonenv* + +### IDE ### +# VS Code +.vscode/ +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json + +# PyCharm +.idea/ +*.iml +*.ipr +*.iws + +### Data Files ### +# Raw and processed news +data/raw_news/ +data/processed_news/ +*.csv +*.json +*.parquet +*.feather +*.pkl +*.pickle +*.db +*.sqlite + +# Vector database files +*.faiss +*.index +*.bin +*.vec + +### Logs ### +*.log +logs/ + +### OS Generated ### +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +### Groq/Cohere Cache ### +.cache/ +model_cache/ + +### Test Files ### +test_output/ +benchmark_results/ + +### Documentation ### +docs/_build/ \ No newline at end of file diff --git a/backend/main.py b/backend/main.py index cbfb1dc..15f1279 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,7 +1,7 @@ from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news -from .recommender import recommend_similar, process_articles_for_vector_db +from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender from .recommender import analyze_article_with_groq from .recommender import get_personalized_recommendations, vector_db from .vector_store import VectorDB @@ -41,7 +41,7 @@ async def root(): @app.get("/fetch-news") async def fetch_news(): - """Fetch news from RSS feeds""" + """Fetch news from RSS feeds with duplicate detection""" try: articles = fetch_all_news() @@ -74,7 +74,7 @@ async def fetch_news(): @app.get("/recommend-news") async def recommend_news(article_id: str): - """Retrieve similar news based on the selected article""" + """Retrieve similar news based on the selected article (backward compatibility)""" try: recommendations = recommend_similar(article_id) @@ -91,6 +91,25 @@ async def recommend_news(article_id: str): raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") +@app.get("/recommend-by-text") +async def recommend_by_text(text_description: str, top_n: int = 3): + """Recommend articles based on text description""" + try: + recommendations = news_recommender.recommend_by_text(text_description, top_n) + + if not recommendations: + raise HTTPException(status_code=404, detail="No recommendations found") + + return { + "text_description": text_description, + "recommendations": recommendations, + "count": len(recommendations) + } + + except Exception as e: + raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") + + @app.get("/analyze-article") async def analyze_article(article_id: str): """Analyze article using Groq LLM""" @@ -134,6 +153,7 @@ async def health_check(): """Health check endpoint""" return {"status": "healthy", "database_articles": len(vector_db.articles)} + if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/backend/news_fetcher.py b/backend/news_fetcher.py index 9d8bab0..cb5f3dd 100644 --- a/backend/news_fetcher.py +++ b/backend/news_fetcher.py @@ -1,8 +1,126 @@ +# Updated newsfetcher.py with similarity search and LLM duplicate detection + import feedparser import json import os from datetime import datetime +from typing import List, Dict, Optional from .config import Config +from .embeddings import get_query_embedding +from .vector_store import VectorDB +import groq +import numpy as np + +# Initialize Groq client for duplicate detection +groq_client = groq.Groq(api_key=Config.GROQ_API_KEY) + + +class NewsFetcher: + """News fetcher with duplicate detection capabilities""" + + def __init__(self, vector_db: VectorDB, similarity_threshold: float = 0.8): + self.vector_db = vector_db + self.similarity_threshold = similarity_threshold + + def check_similarity_duplicate(self, article: Dict) -> bool: + """ + Check if article is a duplicate using similarity search + + Args: + article: Article to check for duplicates + + Returns: + True if duplicate found, False otherwise + """ + if not self.vector_db.articles: + return False + + # Create search text from title and content + search_text = f"{article['title']} {article['content']}" + query_embedding = get_query_embedding(search_text) + + if not query_embedding: + return False + + # Search for similar articles + similar_articles = self.vector_db.search(query_embedding, k=5) + + # Check if any similar article exceeds threshold + for similar_article in similar_articles: + similarity_score = similar_article.get('similarity_score', 0) + # Convert distance to similarity (FAISS returns L2 distance) + similarity = 1 / (1 + similarity_score) + + if similarity > self.similarity_threshold: + return True + + return False + + def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool: + """ + Check if titles are duplicates using LLM comparison + + Args: + new_title: New article title + existing_titles: List of existing article titles + + Returns: + True if duplicate found, False otherwise + """ + if not existing_titles: + return False + + try: + # Create prompt for LLM comparison + titles_text = "\n".join([f"- {title}" for title in existing_titles]) + + response = groq_client.chat.completions.create( + model=Config.GROQ_MODEL, + messages=[ + { + "role": "system", + "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates." + }, + { + "role": "user", + "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}" + } + ], + max_tokens=10, + temperature=0.1 + ) + + result = response.choices[0].message.content.strip().upper() + return "DUPLICATE" in result + + except Exception as e: + print(f"Error checking LLM duplicate: {str(e)}") + return False + + def is_duplicate_article(self, article: Dict) -> bool: + """ + Check if article is duplicate using both similarity and LLM methods + + Args: + article: Article to check + + Returns: + True if duplicate, False otherwise + """ + # First check similarity + if self.check_similarity_duplicate(article): + return True + + # Then check with LLM + existing_titles = [art['title'] for art in self.vector_db.articles] + if self.check_llm_duplicate(article['title'], existing_titles): + return True + + return False + + +# Initialize news fetcher instance +news_fetcher = NewsFetcher(None, similarity_threshold=0.8) def fetch_rss_news(feed_url): @@ -27,13 +145,27 @@ def fetch_rss_news(feed_url): def fetch_all_news(): - """Fetch news from all RSS feeds""" + """Fetch news from all RSS feeds with duplicate detection""" all_articles = [] + + # Set the vector_db instance for news_fetcher + from .recommender import vector_db + news_fetcher.vector_db = vector_db for feed_url in Config.RSS_FEEDS: try: articles = fetch_rss_news(feed_url) - all_articles.extend(articles) + + # Filter out duplicates + unique_articles = [] + for article in articles: + if not news_fetcher.is_duplicate_article(article): + unique_articles.append(article) + else: + print(f"Skipping duplicate article: {article['title']}") + + all_articles.extend(unique_articles) + except Exception as e: print(f"Error fetching from {feed_url}: {str(e)}") @@ -63,4 +195,4 @@ def save_processed_news(articles): with open(filename, 'w') as f: json.dump(articles, f, indent=2) - return filename + return filename \ No newline at end of file diff --git a/backend/recommender.py b/backend/recommender.py index 5350929..36e6875 100644 --- a/backend/recommender.py +++ b/backend/recommender.py @@ -1,3 +1,4 @@ +from typing import List, Dict, Optional from .embeddings import get_embeddings, get_query_embedding, rerank_results from .vector_store import VectorDB import groq @@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY) vector_db = VectorDB() +class NewsRecommender: + """News recommendation system using vector similarity search""" + + def __init__(self, vector_db: VectorDB): + self.vector_db = vector_db + + def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]: + """ + Recommend articles based on text description + + Args: + text_description: Text description to find similar articles for + top_n: Number of recommendations to return + + Returns: + List of recommended articles + """ + query_embedding = get_query_embedding(text_description) + if not query_embedding: + return [] + + # Search for similar articles + similar_articles = self.vector_db.search(query_embedding, k=top_n) + + # Re-rank results for better relevance + if similar_articles: + documents = [f"{art['title']} {art['content']}" for + art in similar_articles] + reranked = rerank_results(text_description, documents) + + if reranked: + # Reorder recommendations based on reranking + reordered = [] + for result in reranked: + if result.index < len(similar_articles): + reordered.append(similar_articles[result.index]) + return reordered + + return similar_articles + + def get_personalized_recommendations(self, user_interests: str, top_n: + int = 5) -> List[Dict]: + """Get personalized recommendations based on user interests""" + return self.recommend_by_text(user_interests, top_n) + + +# Initialize recommender instance +news_recommender = NewsRecommender(vector_db) + + def process_articles_for_vector_db(articles): """Process articles and add to vector database""" if not articles: @@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3): # Get embedding for the article article_text = f"{article['title']} {article['content']}" - query_embedding = get_query_embedding(article_text) - if not query_embedding: - return [] - - # Search for similar articles - similar_articles = vector_db.search(query_embedding, k=top_n + 1) + # Use the new recommender with text description + recommendations = news_recommender.recommend_by_text(article_text, top_n + 1) # Filter out the original article - recommendations = [art for art in similar_articles if art.get('slug') != article_id] + filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id] - return recommendations[:top_n] + return filtered_recommendations[:top_n] def analyze_article_with_groq(article_text): @@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text): messages=[ { "role": "system", - "content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article." + "content": "You are an AI news analyst. Provide insights, " + "key points, and sentiment analysis for the given article." }, { "role": "user", @@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text): def get_personalized_recommendations(user_interests, top_n=5): """Get personalized recommendations based on user interests""" - query_embedding = get_query_embedding(user_interests) - if not query_embedding: - return [] - - recommendations = vector_db.search(query_embedding, k=top_n) - - # Re-rank results for better relevance - if recommendations: - documents = [f"{art['title']} {art['content']}" for art in recommendations] - reranked = rerank_results(user_interests, documents) - - if reranked: - # Reorder recommendations based on reranking - reordered = [] - for result in reranked: - if result.index < len(recommendations): - reordered.append(recommendations[result.index]) - return reordered - - return recommendations + return news_recommender.get_personalized_recommendations(user_interests, + top_n)