update recommender and news_fetcher

This commit is contained in:
Ayomide
2025-07-24 16:35:04 +01:00
parent f28755e1fd
commit 8d2a277afe
4 changed files with 330 additions and 36 deletions
+112
View File
@@ -0,0 +1,112 @@
# .gitignore for DS Task AI News Project
#
# Rules are evaluated top to bottom and the LAST matching pattern wins, so
# re-include ("!") rules must come after every pattern that could ignore
# the same path.

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Jupyter Notebook
.ipynb_checkpoints

### Environment ###
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Virtual environment
pythonenv*

### IDE ###
# VS Code
# Ignore the directory's contents rather than the directory itself: git
# cannot re-include a file whose parent directory is excluded. The shared
# settings files are re-included at the end of the "Data Files" section,
# after the blanket *.json rule.
.vscode/*

# PyCharm
.idea/
*.iml
*.ipr
*.iws

### Data Files ###
# Raw and processed news
data/raw_news/
data/processed_news/
*.csv
*.json
*.parquet
*.feather
*.pkl
*.pickle
*.db
*.sqlite

# Keep shared VS Code settings tracked. These must follow both ".vscode/*"
# and "*.json" above, otherwise either rule would ignore them again.
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json

# Vector database files
*.faiss
*.index
*.bin
*.vec

### Logs ###
*.log
logs/

### OS Generated ###
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db

### Groq/Cohere Cache ###
.cache/
model_cache/

### Test Files ###
test_output/
benchmark_results/

### Documentation ###
docs/_build/
+24 -4
View File
@@ -1,7 +1,7 @@
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
from .recommender import recommend_similar, process_articles_for_vector_db
from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender
from .recommender import analyze_article_with_groq
from .recommender import get_personalized_recommendations, vector_db
from .vector_store import VectorDB
@@ -41,7 +41,7 @@ async def root():
@app.get("/fetch-news")
async def fetch_news():
"""Fetch news from RSS feeds"""
"""Fetch news from RSS feeds with duplicate detection"""
try:
articles = fetch_all_news()
@@ -74,7 +74,7 @@ async def fetch_news():
@app.get("/recommend-news")
async def recommend_news(article_id: str):
"""Retrieve similar news based on the selected article"""
"""Retrieve similar news based on the selected article (backward compatibility)"""
try:
recommendations = recommend_similar(article_id)
@@ -91,6 +91,25 @@ async def recommend_news(article_id: str):
raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.get("/recommend-by-text")
async def recommend_by_text(text_description: str, top_n: int = 3):
    """Recommend articles based on a free-text description.

    Args:
        text_description: Text passed to the recommender as the query.
        top_n: Number of recommendations requested from the recommender.

    Returns:
        A dict with the echoed ``text_description``, the list of
        ``recommendations`` and their ``count``.

    Raises:
        HTTPException: 404 when the recommender returns nothing, 500 when
            the recommender call itself raises.
    """
    try:
        recommendations = news_recommender.recommend_by_text(text_description, top_n)
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error getting recommendations: {str(e)}",
        ) from e

    # Raised outside the try block on purpose: HTTPException derives from
    # Exception, so raising it inside would be caught by the handler above
    # and reported to the client as a 500 instead of a 404.
    if not recommendations:
        raise HTTPException(status_code=404, detail="No recommendations found")

    return {
        "text_description": text_description,
        "recommendations": recommendations,
        "count": len(recommendations),
    }
@app.get("/analyze-article")
async def analyze_article(article_id: str):
"""Analyze article using Groq LLM"""
@@ -134,6 +153,7 @@ async def health_check():
"""Health check endpoint"""
return {"status": "healthy", "database_articles": len(vector_db.articles)}
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
uvicorn.run(app, host="0.0.0.0", port=8000)
+135 -3
View File
@@ -1,8 +1,126 @@
# Updated news_fetcher.py with similarity search and LLM duplicate detection
import feedparser
import json
import os
from datetime import datetime
from typing import List, Dict, Optional
from .config import Config
from .embeddings import get_query_embedding
from .vector_store import VectorDB
import groq
import numpy as np
# Module-level Groq client, created once at import time with the API key
# from Config.GROQ_API_KEY. NewsFetcher.check_llm_duplicate uses it for the
# LLM-based title comparison.
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
class NewsFetcher:
    """News fetcher helper with duplicate detection capabilities.

    An article is treated as a duplicate when either a vector similarity
    search against ``vector_db`` returns a sufficiently close hit, or the
    LLM judges its title to describe the same story as an existing title.
    """

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store searched for already-known articles. May be
                None (the module-level instance is created that way and is
                wired up later); every check then reports "not a duplicate".
            similarity_threshold: A search hit counts as a duplicate when
                ``1 / (1 + distance)`` is strictly greater than this value.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its ``title`` and
                ``content`` are concatenated into the search text.

        Returns:
            True if duplicate found, False otherwise (including when no
            vector database is attached, it holds no articles, or no
            embedding could be produced).
        """
        # Guard against a fetcher that has not been given a database yet.
        if self.vector_db is None or not self.vector_db.articles:
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article['content']}"
        query_embedding = get_query_embedding(search_text)
        if not query_embedding:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5) or []

        for similar_article in similar_articles:
            # NOTE(review): the score is treated as an L2 distance, as the
            # original code assumed of the FAISS-backed store; confirm
            # against VectorDB.search.
            distance = similar_article.get('similarity_score')
            if distance is None:
                # A hit without a score cannot be judged. Defaulting it to
                # 0 would convert to similarity 1.0 and flag every such
                # hit as a duplicate.
                continue

            denominator = 1 + distance
            if denominator <= 0:
                # distance <= -1 is not a valid distance; skipping avoids a
                # ZeroDivisionError at exactly -1 and matches the previous
                # "not a duplicate" outcome for anything below it.
                continue

            # Convert distance to similarity
            if 1 / denominator > self.similarity_threshold:
                return True

        return False

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if the model's reply contains "DUPLICATE", False otherwise.
            Any error raised while calling the model is printed and treated
            as "not a duplicate" so that fetching can continue.
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join([f"- {title}" for title in existing_titles])
            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )
            # The message content may be None; treat that as an empty reply.
            result = (response.choices[0].message.content or "").strip().upper()
            return "DUPLICATE" in result
        except Exception as e:
            # Deliberate best-effort: a failed LLM call must not block ingestion.
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity; the LLM is only consulted when it misses.
        if self.check_similarity_duplicate(article):
            return True

        if self.vector_db is None:
            return False

        # Then check with LLM.
        # NOTE: every stored title is sent on each call, so the prompt grows
        # with the size of the database.
        existing_titles = [art['title'] for art in self.vector_db.articles]
        return self.check_llm_duplicate(article['title'], existing_titles)
# Module-level fetcher instance used by fetch_all_news. It is created
# without a vector database (None); fetch_all_news assigns
# news_fetcher.vector_db before running any duplicate checks.
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
def fetch_rss_news(feed_url):
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):
def fetch_all_news():
"""Fetch news from all RSS feeds"""
"""Fetch news from all RSS feeds with duplicate detection"""
all_articles = []
# Set the vector_db instance for news_fetcher
from .recommender import vector_db
news_fetcher.vector_db = vector_db
for feed_url in Config.RSS_FEEDS:
try:
articles = fetch_rss_news(feed_url)
all_articles.extend(articles)
# Filter out duplicates
unique_articles = []
for article in articles:
if not news_fetcher.is_duplicate_article(article):
unique_articles.append(article)
else:
print(f"Skipping duplicate article: {article['title']}")
all_articles.extend(unique_articles)
except Exception as e:
print(f"Error fetching from {feed_url}: {str(e)}")
@@ -63,4 +195,4 @@ def save_processed_news(articles):
with open(filename, 'w') as f:
json.dump(articles, f, indent=2)
return filename
return filename
+59 -29
View File
@@ -1,3 +1,4 @@
from typing import List, Dict, Optional
from .embeddings import get_embeddings, get_query_embedding, rerank_results
from .vector_store import VectorDB
import groq
@@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
vector_db = VectorDB()
class NewsRecommender:
    """Recommend news articles via vector similarity search plus re-ranking."""

    def __init__(self, vector_db: VectorDB):
        self.vector_db = vector_db

    def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]:
        """
        Find stored articles that match a free-text description.

        Args:
            text_description: Query text to embed and search with
            top_n: How many nearest articles to request from the store

        Returns:
            Matching articles, in re-ranked order when re-ranking produced
            a result, otherwise in the order returned by the search. Empty
            list when no query embedding could be produced.
        """
        embedding = get_query_embedding(text_description)
        if not embedding:
            return []

        candidates = self.vector_db.search(embedding, k=top_n)
        if not candidates:
            # Nothing to re-rank; hand back the search result untouched.
            return candidates

        texts = [f"{item['title']} {item['content']}" for item in candidates]
        ranking = rerank_results(text_description, texts)
        if not ranking:
            return candidates

        # Map each re-ranked hit back to its article, ignoring indices that
        # fall outside the candidate list.
        limit = len(candidates)
        return [candidates[hit.index] for hit in ranking if hit.index < limit]

    def get_personalized_recommendations(self, user_interests: str, top_n: int = 5) -> List[Dict]:
        """Recommend articles for a user by treating their interests as the query text."""
        return self.recommend_by_text(user_interests, top_n)
# Module-level recommender instance, bound to the module's vector_db
news_recommender = NewsRecommender(vector_db)
def process_articles_for_vector_db(articles):
"""Process articles and add to vector database"""
if not articles:
@@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3):
# Get embedding for the article
article_text = f"{article['title']} {article['content']}"
query_embedding = get_query_embedding(article_text)
if not query_embedding:
return []
# Search for similar articles
similar_articles = vector_db.search(query_embedding, k=top_n + 1)
# Use the new recommender with text description
recommendations = news_recommender.recommend_by_text(article_text, top_n + 1)
# Filter out the original article
recommendations = [art for art in similar_articles if art.get('slug') != article_id]
filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id]
return recommendations[:top_n]
return filtered_recommendations[:top_n]
def analyze_article_with_groq(article_text):
@@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text):
messages=[
{
"role": "system",
"content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
"content": "You are an AI news analyst. Provide insights, "
"key points, and sentiment analysis for the given article."
},
{
"role": "user",
@@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text):
def get_personalized_recommendations(user_interests, top_n=5):
"""Get personalized recommendations based on user interests"""
query_embedding = get_query_embedding(user_interests)
if not query_embedding:
return []
recommendations = vector_db.search(query_embedding, k=top_n)
# Re-rank results for better relevance
if recommendations:
documents = [f"{art['title']} {art['content']}" for art in recommendations]
reranked = rerank_results(user_interests, documents)
if reranked:
# Reorder recommendations based on reranking
reordered = []
for result in reranked:
if result.index < len(recommendations):
reordered.append(recommendations[result.index])
return reordered
return recommendations
return news_recommender.get_personalized_recommendations(user_interests,
top_n)