update recommender and news_fetcher
This commit is contained in:
+112
@@ -0,0 +1,112 @@
|
||||
# .gitignore for DS Task AI News Project
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
### Environment ###
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Virtual environment
|
||||
pythonenv*
|
||||
|
||||
### IDE ###
|
||||
# VS Code
|
||||
.vscode/
|
||||
!.vscode/settings.json
|
||||
!.vscode/tasks.json
|
||||
!.vscode/launch.json
|
||||
!.vscode/extensions.json
|
||||
|
||||
# PyCharm
|
||||
.idea/
|
||||
*.iml
|
||||
*.ipr
|
||||
*.iws
|
||||
|
||||
### Data Files ###
|
||||
# Raw and processed news
|
||||
data/raw_news/
|
||||
data/processed_news/
|
||||
*.csv
|
||||
*.json
|
||||
*.parquet
|
||||
*.feather
|
||||
*.pkl
|
||||
*.pickle
|
||||
*.db
|
||||
*.sqlite
|
||||
|
||||
# Vector database files
|
||||
*.faiss
|
||||
*.index
|
||||
*.bin
|
||||
*.vec
|
||||
|
||||
### Logs ###
|
||||
*.log
|
||||
logs/
|
||||
|
||||
### OS Generated ###
|
||||
.DS_Store
|
||||
.DS_Store?
|
||||
._*
|
||||
.Spotlight-V100
|
||||
.Trashes
|
||||
ehthumbs.db
|
||||
Thumbs.db
|
||||
|
||||
### Groq/Cohere Cache ###
|
||||
.cache/
|
||||
model_cache/
|
||||
|
||||
### Test Files ###
|
||||
test_output/
|
||||
benchmark_results/
|
||||
|
||||
### Documentation ###
|
||||
docs/_build/
|
||||
+23
-3
@@ -1,7 +1,7 @@
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
|
||||
from .recommender import recommend_similar, process_articles_for_vector_db
|
||||
from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender
|
||||
from .recommender import analyze_article_with_groq
|
||||
from .recommender import get_personalized_recommendations, vector_db
|
||||
from .vector_store import VectorDB
|
||||
@@ -41,7 +41,7 @@ async def root():
|
||||
|
||||
@app.get("/fetch-news")
|
||||
async def fetch_news():
|
||||
"""Fetch news from RSS feeds"""
|
||||
"""Fetch news from RSS feeds with duplicate detection"""
|
||||
try:
|
||||
articles = fetch_all_news()
|
||||
|
||||
@@ -74,7 +74,7 @@ async def fetch_news():
|
||||
|
||||
@app.get("/recommend-news")
|
||||
async def recommend_news(article_id: str):
|
||||
"""Retrieve similar news based on the selected article"""
|
||||
"""Retrieve similar news based on the selected article (backward compatibility)"""
|
||||
try:
|
||||
recommendations = recommend_similar(article_id)
|
||||
|
||||
@@ -91,6 +91,25 @@ async def recommend_news(article_id: str):
|
||||
raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/recommend-by-text")
async def recommend_by_text(text_description: str, top_n: int = 3):
    """Recommend articles based on text description.

    Args:
        text_description: Free-text query used to look up similar articles.
        top_n: Number of recommendations requested from the recommender.

    Returns:
        A dict echoing ``text_description`` together with the
        ``recommendations`` list and its ``count``.

    Raises:
        HTTPException: 404 when the recommender returns nothing, 500 when
            the recommender itself raises.
    """
    # Only the recommender call is guarded. HTTPException is itself an
    # Exception, so raising the 404 inside this ``try`` would let the
    # blanket handler turn it into a 500.
    try:
        recommendations = news_recommender.recommend_by_text(text_description, top_n)
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error getting recommendations: {str(e)}",
        ) from e

    if not recommendations:
        raise HTTPException(status_code=404, detail="No recommendations found")

    return {
        "text_description": text_description,
        "recommendations": recommendations,
        "count": len(recommendations),
    }
|
||||
|
||||
|
||||
@app.get("/analyze-article")
|
||||
async def analyze_article(article_id: str):
|
||||
"""Analyze article using Groq LLM"""
|
||||
@@ -134,6 +153,7 @@ async def health_check():
|
||||
"""Health check endpoint"""
|
||||
return {"status": "healthy", "database_articles": len(vector_db.articles)}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
+134
-2
@@ -1,8 +1,126 @@
|
||||
# Updated news_fetcher.py with similarity search and LLM duplicate detection
|
||||
|
||||
import feedparser
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
from .config import Config
|
||||
from .embeddings import get_query_embedding
|
||||
from .vector_store import VectorDB
|
||||
import groq
|
||||
import numpy as np
|
||||
|
||||
# Initialize Groq client for duplicate detection
|
||||
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
||||
|
||||
|
||||
class NewsFetcher:
    """News fetcher with duplicate detection capabilities.

    An article is reported as a duplicate when either the vector-similarity
    lookup against ``vector_db`` or the LLM comparison of titles says so.
    """

    def __init__(self, vector_db: "Optional[VectorDB]",
                 similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store of already-indexed articles. May be ``None``
                until a store is assigned to ``self.vector_db``; while it is
                ``None`` no article is reported as a duplicate.
            similarity_threshold: Similarity value (see
                ``_distance_to_similarity``) that a search hit must exceed
                to count as a duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _known_articles(self) -> List[Dict]:
        """Return the indexed articles, or an empty list when no store is set."""
        if self.vector_db is None:
            return []
        return self.vector_db.articles

    @staticmethod
    def _distance_to_similarity(distance: float) -> float:
        """Map a distance to a similarity score: 0 -> 1.0, larger -> closer to 0."""
        return 1 / (1 + distance)

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its ``title`` and
                ``content`` entries form the search text.

        Returns:
            True if duplicate found, False otherwise
        """
        if not self._known_articles():
            return False

        # Missing keys are treated as empty text instead of raising KeyError.
        search_text = f"{article.get('title', '')} {article.get('content', '')}"
        if not search_text.strip():
            return False

        query_embedding = get_query_embedding(search_text)
        if not query_embedding:
            return False

        # Look at the five nearest stored articles.
        similar_articles = self.vector_db.search(query_embedding, k=5)

        # The stored 'similarity_score' is handled as a distance
        # (presumably the L2 distance reported by FAISS -- confirm against
        # VectorDB.search), so it is converted before comparing.
        # NOTE(review): a hit without 'similarity_score' defaults to
        # distance 0, i.e. it is treated as an exact match -- confirm this
        # is intended.
        return any(
            self._distance_to_similarity(hit.get('similarity_score', 0))
            > self.similarity_threshold
            for hit in similar_articles
        )

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if the model's reply contains ``DUPLICATE``; False when
            there are no existing titles, the reply says otherwise, or the
            request fails.
        """
        if not existing_titles:
            return False

        try:
            # One bullet per known title.
            titles_text = "\n".join(f"- {title}" for title in existing_titles)

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": (
                            "You are a duplicate detection system. Compare the new "
                            "article title with existing titles and respond with "
                            "'DUPLICATE' if they refer to the same news story, or "
                            "'UNIQUE' if it's a different story. Consider different "
                            "phrasings, synonyms, and variations of the same story "
                            "as duplicates."
                        ),
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}",
                    },
                ],
                # The reply is expected to be a single keyword.
                max_tokens=10,
                temperature=0.1,
            )

            result = response.choices[0].message.content.strip().upper()
            return "DUPLICATE" in result

        except Exception as e:
            # Best effort: a failed LLM call must not block ingestion, so
            # the article is treated as unique.
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # First the similarity lookup.
        if self.check_similarity_duplicate(article):
            return True

        # Then the LLM title comparison, skipping entries without a title.
        new_title = article.get('title')
        if not new_title:
            return False
        candidate_titles = [art.get('title') for art in self._known_articles()]
        existing_titles = [title for title in candidate_titles if title]
        return self.check_llm_duplicate(new_title, existing_titles)
|
||||
|
||||
|
||||
# Initialize news fetcher instance
|
||||
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
|
||||
|
||||
|
||||
def fetch_rss_news(feed_url):
|
||||
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):
|
||||
|
||||
|
||||
def fetch_all_news():
|
||||
"""Fetch news from all RSS feeds"""
|
||||
"""Fetch news from all RSS feeds with duplicate detection"""
|
||||
all_articles = []
|
||||
|
||||
# Set the vector_db instance for news_fetcher
|
||||
from .recommender import vector_db
|
||||
news_fetcher.vector_db = vector_db
|
||||
|
||||
for feed_url in Config.RSS_FEEDS:
|
||||
try:
|
||||
articles = fetch_rss_news(feed_url)
|
||||
all_articles.extend(articles)
|
||||
|
||||
# Filter out duplicates
|
||||
unique_articles = []
|
||||
for article in articles:
|
||||
if not news_fetcher.is_duplicate_article(article):
|
||||
unique_articles.append(article)
|
||||
else:
|
||||
print(f"Skipping duplicate article: {article['title']}")
|
||||
|
||||
all_articles.extend(unique_articles)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error fetching from {feed_url}: {str(e)}")
|
||||
|
||||
|
||||
+59
-29
@@ -1,3 +1,4 @@
|
||||
from typing import List, Dict, Optional
|
||||
from .embeddings import get_embeddings, get_query_embedding, rerank_results
|
||||
from .vector_store import VectorDB
|
||||
import groq
|
||||
@@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
||||
vector_db = VectorDB()
|
||||
|
||||
|
||||
class NewsRecommender:
    """News recommendation system using vector similarity search"""

    def __init__(self, vector_db: "VectorDB"):
        """
        Args:
            vector_db: Store that is searched for similar articles.
        """
        self.vector_db = vector_db

    def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]:
        """
        Recommend articles based on text description

        Args:
            text_description: Text description to find similar articles for
            top_n: Number of recommendations to return

        Returns:
            List of recommended articles, in reranked order when the
            reranker returns results and in search order otherwise. Empty
            when the request is blank, ``top_n`` is not positive, no
            embedding is produced, or nothing is found.
        """
        # Nothing sensible can be asked of the embedding/search backends
        # for a blank query or a non-positive result count.
        if top_n <= 0 or not text_description or not text_description.strip():
            return []

        query_embedding = get_query_embedding(text_description)
        if not query_embedding:
            return []

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=top_n)
        if not similar_articles:
            return []

        # Re-rank results for better relevance. Missing keys are treated as
        # empty text instead of raising KeyError.
        documents = [
            f"{art.get('title', '')} {art.get('content', '')}"
            for art in similar_articles
        ]
        reranked = rerank_results(text_description, documents)
        if not reranked:
            return similar_articles

        # Each rerank result carries the position of its document in
        # ``documents``; out-of-range positions (including negative ones,
        # which would otherwise wrap around) are skipped.
        candidate_count = len(similar_articles)
        return [
            similar_articles[result.index]
            for result in reranked
            if 0 <= result.index < candidate_count
        ]

    def get_personalized_recommendations(self, user_interests: str,
                                         top_n: int = 5) -> List[Dict]:
        """Get personalized recommendations based on user interests"""
        return self.recommend_by_text(user_interests, top_n)
|
||||
|
||||
|
||||
# Initialize recommender instance
|
||||
news_recommender = NewsRecommender(vector_db)
|
||||
|
||||
|
||||
def process_articles_for_vector_db(articles):
|
||||
"""Process articles and add to vector database"""
|
||||
if not articles:
|
||||
@@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3):
|
||||
|
||||
# Get embedding for the article
|
||||
article_text = f"{article['title']} {article['content']}"
|
||||
query_embedding = get_query_embedding(article_text)
|
||||
|
||||
if not query_embedding:
|
||||
return []
|
||||
|
||||
# Search for similar articles
|
||||
similar_articles = vector_db.search(query_embedding, k=top_n + 1)
|
||||
# Use the new recommender with text description
|
||||
recommendations = news_recommender.recommend_by_text(article_text, top_n + 1)
|
||||
|
||||
# Filter out the original article
|
||||
recommendations = [art for art in similar_articles if art.get('slug') != article_id]
|
||||
filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id]
|
||||
|
||||
return recommendations[:top_n]
|
||||
return filtered_recommendations[:top_n]
|
||||
|
||||
|
||||
def analyze_article_with_groq(article_text):
|
||||
@@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text):
|
||||
messages=[
|
||||
{
|
||||
"role": "system",
|
||||
"content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
|
||||
"content": "You are an AI news analyst. Provide insights, "
|
||||
"key points, and sentiment analysis for the given article."
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
@@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text):
|
||||
|
||||
def get_personalized_recommendations(user_interests, top_n=5):
    """Get personalized recommendations based on user interests.

    Module-level wrapper that delegates to the shared ``news_recommender``
    instance, so the embed/search/rerank logic lives in one place.

    Args:
        user_interests: Free-text description of what the user cares about.
        top_n: Number of recommendations to request.

    Returns:
        Whatever ``news_recommender.get_personalized_recommendations``
        returns for the same arguments.
    """
    return news_recommender.get_personalized_recommendations(user_interests,
                                                             top_n)
|
||||
|
||||
Reference in New Issue
Block a user