update recommender and news_fetcher
This commit is contained in:
+112
@@ -0,0 +1,112 @@
|
|||||||
|
# .gitignore for DS Task AI News Project
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
### Environment ###
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Virtual environment
|
||||||
|
pythonenv*
|
||||||
|
|
||||||
|
### IDE ###
|
||||||
|
# VS Code
|
||||||
|
.vscode/
|
||||||
|
!.vscode/settings.json
|
||||||
|
!.vscode/tasks.json
|
||||||
|
!.vscode/launch.json
|
||||||
|
!.vscode/extensions.json
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
.idea/
|
||||||
|
*.iml
|
||||||
|
*.ipr
|
||||||
|
*.iws
|
||||||
|
|
||||||
|
### Data Files ###
|
||||||
|
# Raw and processed news
|
||||||
|
data/raw_news/
|
||||||
|
data/processed_news/
|
||||||
|
*.csv
|
||||||
|
*.json
|
||||||
|
*.parquet
|
||||||
|
*.feather
|
||||||
|
*.pkl
|
||||||
|
*.pickle
|
||||||
|
*.db
|
||||||
|
*.sqlite
|
||||||
|
|
||||||
|
# Vector database files
|
||||||
|
*.faiss
|
||||||
|
*.index
|
||||||
|
*.bin
|
||||||
|
*.vec
|
||||||
|
|
||||||
|
### Logs ###
|
||||||
|
*.log
|
||||||
|
logs/
|
||||||
|
|
||||||
|
### OS Generated ###
|
||||||
|
.DS_Store
|
||||||
|
.DS_Store?
|
||||||
|
._*
|
||||||
|
.Spotlight-V100
|
||||||
|
.Trashes
|
||||||
|
ehthumbs.db
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
### Groq/Cohere Cache ###
|
||||||
|
.cache/
|
||||||
|
model_cache/
|
||||||
|
|
||||||
|
### Test Files ###
|
||||||
|
test_output/
|
||||||
|
benchmark_results/
|
||||||
|
|
||||||
|
### Documentation ###
|
||||||
|
docs/_build/
|
||||||
+23
-3
@@ -1,7 +1,7 @@
|
|||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
|
from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
|
||||||
from .recommender import recommend_similar, process_articles_for_vector_db
|
from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender
|
||||||
from .recommender import analyze_article_with_groq
|
from .recommender import analyze_article_with_groq
|
||||||
from .recommender import get_personalized_recommendations, vector_db
|
from .recommender import get_personalized_recommendations, vector_db
|
||||||
from .vector_store import VectorDB
|
from .vector_store import VectorDB
|
||||||
@@ -41,7 +41,7 @@ async def root():
|
|||||||
|
|
||||||
@app.get("/fetch-news")
|
@app.get("/fetch-news")
|
||||||
async def fetch_news():
|
async def fetch_news():
|
||||||
"""Fetch news from RSS feeds"""
|
"""Fetch news from RSS feeds with duplicate detection"""
|
||||||
try:
|
try:
|
||||||
articles = fetch_all_news()
|
articles = fetch_all_news()
|
||||||
|
|
||||||
@@ -74,7 +74,7 @@ async def fetch_news():
|
|||||||
|
|
||||||
@app.get("/recommend-news")
|
@app.get("/recommend-news")
|
||||||
async def recommend_news(article_id: str):
|
async def recommend_news(article_id: str):
|
||||||
"""Retrieve similar news based on the selected article"""
|
"""Retrieve similar news based on the selected article (backward compatibility)"""
|
||||||
try:
|
try:
|
||||||
recommendations = recommend_similar(article_id)
|
recommendations = recommend_similar(article_id)
|
||||||
|
|
||||||
@@ -91,6 +91,25 @@ async def recommend_news(article_id: str):
|
|||||||
raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
|
raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/recommend-by-text")
async def recommend_by_text(text_description: str, top_n: int = 3):
    """Recommend articles based on a free-text description.

    Args:
        text_description: Text to find similar articles for.
        top_n: Number of recommendations requested from the recommender.

    Returns:
        A dict echoing ``text_description`` together with the
        ``recommendations`` list and its ``count``.

    Raises:
        HTTPException: 404 when the recommender returns nothing, 500 when
            the recommender call itself fails.
    """
    # Only the recommender call is guarded: wrapping the 404 below in the
    # same try/except would catch it as a generic Exception and re-raise it
    # as a 500, hiding the "nothing found" outcome from clients.
    try:
        recommendations = news_recommender.recommend_by_text(text_description, top_n)
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error getting recommendations: {str(e)}",
        ) from e

    if not recommendations:
        raise HTTPException(status_code=404, detail="No recommendations found")

    return {
        "text_description": text_description,
        "recommendations": recommendations,
        "count": len(recommendations),
    }
|
||||||
|
|
||||||
|
|
||||||
@app.get("/analyze-article")
|
@app.get("/analyze-article")
|
||||||
async def analyze_article(article_id: str):
|
async def analyze_article(article_id: str):
|
||||||
"""Analyze article using Groq LLM"""
|
"""Analyze article using Groq LLM"""
|
||||||
@@ -134,6 +153,7 @@ async def health_check():
|
|||||||
"""Health check endpoint"""
|
"""Health check endpoint"""
|
||||||
return {"status": "healthy", "database_articles": len(vector_db.articles)}
|
return {"status": "healthy", "database_articles": len(vector_db.articles)}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import uvicorn
|
import uvicorn
|
||||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||||
+134
-2
@@ -1,8 +1,126 @@
|
|||||||
|
# Updated newsfetcher.py with similarity search and LLM duplicate detection
|
||||||
|
|
||||||
import feedparser
|
import feedparser
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import List, Dict, Optional
|
||||||
from .config import Config
|
from .config import Config
|
||||||
|
from .embeddings import get_query_embedding
|
||||||
|
from .vector_store import VectorDB
|
||||||
|
import groq
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Initialize Groq client for duplicate detection
|
||||||
|
groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
||||||
|
|
||||||
|
|
||||||
|
class NewsFetcher:
    """News fetcher with duplicate detection capabilities.

    Duplicates are detected in two stages: an embedding similarity search
    against the vector database, followed by an LLM comparison of the new
    title against the titles already stored.
    """

    def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
        """
        Args:
            vector_db: Store of already-known articles. May be None until a
                database is attached; in that state nothing is considered
                a duplicate.
            similarity_threshold: A search hit whose converted similarity is
                strictly greater than this value marks the article as a
                duplicate.
        """
        self.vector_db = vector_db
        self.similarity_threshold = similarity_threshold

    def _known_articles(self) -> List[Dict]:
        """Return the stored articles, or an empty list when no database is attached."""
        if self.vector_db is None:
            return []
        return self.vector_db.articles or []

    def check_similarity_duplicate(self, article: Dict) -> bool:
        """
        Check if article is a duplicate using similarity search

        Args:
            article: Article to check for duplicates; its 'title' and
                'content' entries are read.

        Returns:
            True if duplicate found, False otherwise (including when no
            database is attached, the database is empty, or no embedding
            could be produced).
        """
        if not self._known_articles():
            return False

        # Create search text from title and content
        search_text = f"{article['title']} {article['content']}"
        query_embedding = get_query_embedding(search_text)

        if not query_embedding:
            return False

        # Search for similar articles
        similar_articles = self.vector_db.search(query_embedding, k=5)

        # The 'similarity_score' field is treated as a distance and mapped
        # into (0, 1] via 1 / (1 + d), so smaller distances give values
        # closer to 1.
        # NOTE(review): presumably an L2 distance from FAISS - confirm
        # against VectorDB.search. A hit without a score defaults to 0 and
        # therefore counts as an exact match.
        return any(
            1 / (1 + hit.get('similarity_score', 0)) > self.similarity_threshold
            for hit in similar_articles
        )

    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
        """
        Check if titles are duplicates using LLM comparison

        Args:
            new_title: New article title
            existing_titles: List of existing article titles

        Returns:
            True if the model's reply contains 'DUPLICATE', False otherwise.
            Any error raised while calling the model is printed and treated
            as "not a duplicate" (best effort).
        """
        if not existing_titles:
            return False

        try:
            # Create prompt for LLM comparison
            titles_text = "\n".join([f"- {title}" for title in existing_titles])

            response = groq_client.chat.completions.create(
                model=Config.GROQ_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                    },
                    {
                        "role": "user",
                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                    }
                ],
                max_tokens=10,
                temperature=0.1
            )

            result = response.choices[0].message.content.strip().upper()
            return "DUPLICATE" in result

        except Exception as e:
            print(f"Error checking LLM duplicate: {str(e)}")
            return False

    def is_duplicate_article(self, article: Dict) -> bool:
        """
        Check if article is duplicate using both similarity and LLM methods

        Args:
            article: Article to check

        Returns:
            True if duplicate, False otherwise
        """
        # First check similarity; the LLM is only consulted when that
        # check does not already flag the article.
        if self.check_similarity_duplicate(article):
            return True

        # Then check with LLM
        existing_titles = [art['title'] for art in self._known_articles()]
        return self.check_llm_duplicate(article['title'], existing_titles)
|
||||||
|
|
||||||
|
|
||||||
|
# Initialize news fetcher instance
|
||||||
|
news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
|
||||||
|
|
||||||
|
|
||||||
def fetch_rss_news(feed_url):
|
def fetch_rss_news(feed_url):
|
||||||
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):
|
|||||||
|
|
||||||
|
|
||||||
def fetch_all_news():
|
def fetch_all_news():
|
||||||
"""Fetch news from all RSS feeds"""
|
"""Fetch news from all RSS feeds with duplicate detection"""
|
||||||
all_articles = []
|
all_articles = []
|
||||||
|
|
||||||
|
# Set the vector_db instance for news_fetcher
|
||||||
|
from .recommender import vector_db
|
||||||
|
news_fetcher.vector_db = vector_db
|
||||||
|
|
||||||
for feed_url in Config.RSS_FEEDS:
|
for feed_url in Config.RSS_FEEDS:
|
||||||
try:
|
try:
|
||||||
articles = fetch_rss_news(feed_url)
|
articles = fetch_rss_news(feed_url)
|
||||||
all_articles.extend(articles)
|
|
||||||
|
# Filter out duplicates
|
||||||
|
unique_articles = []
|
||||||
|
for article in articles:
|
||||||
|
if not news_fetcher.is_duplicate_article(article):
|
||||||
|
unique_articles.append(article)
|
||||||
|
else:
|
||||||
|
print(f"Skipping duplicate article: {article['title']}")
|
||||||
|
|
||||||
|
all_articles.extend(unique_articles)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error fetching from {feed_url}: {str(e)}")
|
print(f"Error fetching from {feed_url}: {str(e)}")
|
||||||
|
|
||||||
|
|||||||
+59
-29
@@ -1,3 +1,4 @@
|
|||||||
|
from typing import List, Dict, Optional
|
||||||
from .embeddings import get_embeddings, get_query_embedding, rerank_results
|
from .embeddings import get_embeddings, get_query_embedding, rerank_results
|
||||||
from .vector_store import VectorDB
|
from .vector_store import VectorDB
|
||||||
import groq
|
import groq
|
||||||
@@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
|
|||||||
vector_db = VectorDB()
|
vector_db = VectorDB()
|
||||||
|
|
||||||
|
|
||||||
|
class NewsRecommender:
    """Recommend news articles through vector similarity search."""

    def __init__(self, vector_db: VectorDB):
        # Database that is queried for nearest-neighbour articles.
        self.vector_db = vector_db

    def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]:
        """
        Find stored articles that match a free-text description.

        Args:
            text_description: Text used both as the search query and as the
                re-ranking query.
            top_n: Number of hits requested from the vector search.

        Returns:
            The search hits, re-ordered by the re-ranker when it returns a
            result; otherwise the hits in search order. An empty list when
            no query embedding is available.
        """
        embedding = get_query_embedding(text_description)
        if not embedding:
            return []

        candidates = self.vector_db.search(embedding, k=top_n)
        if not candidates:
            return candidates

        # Re-rank on title + content for better relevance.
        texts = [f"{art['title']} {art['content']}" for art in candidates]
        ranking = rerank_results(text_description, texts)
        if not ranking:
            return candidates

        # Each ranking entry points back at a candidate through `.index`;
        # entries pointing past the end of the candidate list are dropped.
        return [
            candidates[hit.index]
            for hit in ranking
            if hit.index < len(candidates)
        ]

    def get_personalized_recommendations(self, user_interests: str, top_n: int = 5) -> List[Dict]:
        """Recommend articles for a user by using their interests as the search text."""
        return self.recommend_by_text(user_interests, top_n)
|
||||||
|
|
||||||
|
|
||||||
|
# Initialize recommender instance
|
||||||
|
news_recommender = NewsRecommender(vector_db)
|
||||||
|
|
||||||
|
|
||||||
def process_articles_for_vector_db(articles):
|
def process_articles_for_vector_db(articles):
|
||||||
"""Process articles and add to vector database"""
|
"""Process articles and add to vector database"""
|
||||||
if not articles:
|
if not articles:
|
||||||
@@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3):
|
|||||||
|
|
||||||
# Get embedding for the article
|
# Get embedding for the article
|
||||||
article_text = f"{article['title']} {article['content']}"
|
article_text = f"{article['title']} {article['content']}"
|
||||||
query_embedding = get_query_embedding(article_text)
|
|
||||||
|
|
||||||
if not query_embedding:
|
# Use the new recommender with text description
|
||||||
return []
|
recommendations = news_recommender.recommend_by_text(article_text, top_n + 1)
|
||||||
|
|
||||||
# Search for similar articles
|
|
||||||
similar_articles = vector_db.search(query_embedding, k=top_n + 1)
|
|
||||||
|
|
||||||
# Filter out the original article
|
# Filter out the original article
|
||||||
recommendations = [art for art in similar_articles if art.get('slug') != article_id]
|
filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id]
|
||||||
|
|
||||||
return recommendations[:top_n]
|
return filtered_recommendations[:top_n]
|
||||||
|
|
||||||
|
|
||||||
def analyze_article_with_groq(article_text):
|
def analyze_article_with_groq(article_text):
|
||||||
@@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text):
|
|||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
"role": "system",
|
"role": "system",
|
||||||
"content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
|
"content": "You are an AI news analyst. Provide insights, "
|
||||||
|
"key points, and sentiment analysis for the given article."
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text):
|
|||||||
|
|
||||||
def get_personalized_recommendations(user_interests, top_n=5):
|
def get_personalized_recommendations(user_interests, top_n=5):
|
||||||
"""Get personalized recommendations based on user interests"""
|
"""Get personalized recommendations based on user interests"""
|
||||||
query_embedding = get_query_embedding(user_interests)
|
return news_recommender.get_personalized_recommendations(user_interests,
|
||||||
if not query_embedding:
|
top_n)
|
||||||
return []
|
|
||||||
|
|
||||||
recommendations = vector_db.search(query_embedding, k=top_n)
|
|
||||||
|
|
||||||
# Re-rank results for better relevance
|
|
||||||
if recommendations:
|
|
||||||
documents = [f"{art['title']} {art['content']}" for art in recommendations]
|
|
||||||
reranked = rerank_results(user_interests, documents)
|
|
||||||
|
|
||||||
if reranked:
|
|
||||||
# Reorder recommendations based on reranking
|
|
||||||
reordered = []
|
|
||||||
for result in reranked:
|
|
||||||
if result.index < len(recommendations):
|
|
||||||
reordered.append(recommendations[result.index])
|
|
||||||
return reordered
|
|
||||||
|
|
||||||
return recommendations
|
|
||||||
|
|||||||
Reference in New Issue
Block a user