update recommender and news_fetcher

2025-07-24 16:35:04 +01:00
parent f28755e1fd
commit 8d2a277afe
4 changed files with 330 additions and 36 deletions
@@ -0,0 +1,112 @@
 # .gitignore for DS Task AI News Project
 ### Python ###
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 # Distribution / packaging
 .Python
 build/
 develop-eggs/
 dist/
 downloads/
 eggs/
 .eggs/
 lib/
 lib64/
 parts/
 sdist/
 var/
 wheels/
 *.egg-info/
 .installed.cfg
 *.egg
 MANIFEST
 # Installer logs
 pip-log.txt
 pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
 .nox/
 .coverage
 *.cover
 *.py,cover
 .hypothesis/
 .pytest_cache/
 # Jupyter Notebook
 .ipynb_checkpoints
 ### Environment ###
 .env
 .venv
 env/
 venv/
 ENV/
 env.bak/
 venv.bak/
 # Virtual environment
 pythonenv*
 ### IDE ###
 # VS Code
 .vscode/
 !.vscode/settings.json
 !.vscode/tasks.json
 !.vscode/launch.json
 !.vscode/extensions.json
 # PyCharm
 .idea/
 *.iml
 *.ipr
 *.iws
 ### Data Files ###
 # Raw and processed news
 data/raw_news/
 data/processed_news/
 *.csv
 *.json
 *.parquet
 *.feather
 *.pkl
 *.pickle
 *.db
 *.sqlite
 # Vector database files
 *.faiss
 *.index
 *.bin
 *.vec
 ### Logs ###
 *.log
 logs/
 ### OS Generated ###
 .DS_Store
 .DS_Store?
 ._*
 .Spotlight-V100
 .Trashes
 ehthumbs.db
 Thumbs.db
 ### Groq/Cohere Cache ###
 .cache/
 model_cache/
 ### Test Files ###
 test_output/
 benchmark_results/
 ### Documentation ###
 docs/_build/
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
-from .recommender import recommend_similar, process_articles_for_vector_db
+from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender
 from .recommender import analyze_article_with_groq
 from .recommender import get_personalized_recommendations, vector_db
 from .vector_store import VectorDB
@@ -41,7 +41,7 @@ async def root():
@app.get("/fetch-news")
 async def fetch_news():
-    """Fetch news from RSS feeds"""
+    """Fetch news from RSS feeds with duplicate detection"""
    try:
        articles = fetch_all_news()
@@ -74,7 +74,7 @@ async def fetch_news():
@app.get("/recommend-news")
 async def recommend_news(article_id: str):
-    """Retrieve similar news based on the selected article"""
+    """Retrieve similar news based on the selected article (backward compatibility)"""
    try:
        recommendations = recommend_similar(article_id)
@@ -91,6 +91,25 @@ async def recommend_news(article_id: str):
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.get("/recommend-by-text")
async def recommend_by_text(text_description: str, top_n: int = 3):
    """Recommend articles that match a free-text description.

    Args:
        text_description: Query text passed to
            ``news_recommender.recommend_by_text``.
        top_n: Number of recommendations requested from the recommender.

    Returns:
        A dict echoing ``text_description`` together with the
        ``recommendations`` list and its ``count``.

    Raises:
        HTTPException: 404 when the recommender returns nothing; 500 when the
            recommender call itself raises.
    """
    try:
        recommendations = news_recommender.recommend_by_text(text_description, top_n)
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error getting recommendations: {str(e)}",
        ) from e

    # Raised outside the try block: inside it, the generic handler would
    # catch this HTTPException and report the empty result as a 500.
    if not recommendations:
        raise HTTPException(status_code=404, detail="No recommendations found")

    return {
        "text_description": text_description,
        "recommendations": recommendations,
        "count": len(recommendations),
    }
@app.get("/analyze-article")
 async def analyze_article(article_id: str):
    """Analyze article using Groq LLM"""
@@ -134,6 +153,7 @@ async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "database_articles": len(vector_db.articles)}
 # Script entry point: serve `app` with uvicorn when this module is run directly.
 if __name__ == "__main__":
    # Imported here so uvicorn is only needed when launching this way
    import uvicorn
    # Listens on all interfaces (0.0.0.0), port 8000
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -1,8 +1,126 @@
 # news_fetcher.py: RSS news fetching with similarity search and LLM duplicate detection
 import feedparser
 import json
 import os
 from datetime import datetime
 from typing import List, Dict, Optional
 from .config import Config
 from .embeddings import get_query_embedding
 from .vector_store import VectorDB
 import groq
 import numpy as np
 # Groq client used for LLM-based duplicate detection of article titles.
 # Created at import time with the API key from Config.GROQ_API_KEY.
 groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
 class NewsFetcher:
     """Detect whether an incoming article duplicates one already stored.

     Two checks are combined: a vector-similarity lookup against
     ``vector_db`` and an LLM comparison of the article title against the
     titles already in the store.
     """

     def __init__(self, vector_db: Optional["VectorDB"], similarity_threshold: float = 0.8):
         """Store the vector store and the duplicate threshold.

         Args:
             vector_db: Store exposing ``articles`` and
                 ``search(embedding, k=...)``. May be ``None``; every check
                 then reports "not a duplicate" until a store is assigned.
             similarity_threshold: Converted similarity (see
                 ``check_similarity_duplicate``) above which a stored article
                 counts as a duplicate.
         """
         self.vector_db = vector_db
         self.similarity_threshold = similarity_threshold

     def check_similarity_duplicate(self, article: Dict) -> bool:
         """Check whether a stored article is close enough to be a duplicate.

         Args:
             article: Article dict; ``article['title']`` and
                 ``article['content']`` are read.

         Returns:
             True if any of the 5 nearest stored articles exceeds
             ``similarity_threshold``, False otherwise (including when there
             is no store, the store is empty, or no embedding is produced).
         """
         # No store attached yet, or nothing stored: nothing to duplicate
         if self.vector_db is None or not self.vector_db.articles:
             return False

         # Create search text from title and content
         search_text = f"{article['title']} {article['content']}"
         query_embedding = get_query_embedding(search_text)
         if not query_embedding:
             return False

         for candidate in self.vector_db.search(query_embedding, k=5):
             distance = candidate.get('similarity_score')
             if distance is None:
                 # A hit without a score cannot be compared. Defaulting it to
                 # 0 would read as an exact match and drop the article.
                 continue
             # 'similarity_score' is treated as a distance and mapped to a
             # similarity via 1 / (1 + d).
             # NOTE(review): the original comment says FAISS returns L2
             # distance here; confirm against VectorDB.search.
             if 1 / (1 + distance) > self.similarity_threshold:
                 return True
         return False

     def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
         """Ask the Groq model whether ``new_title`` repeats an existing story.

         Args:
             new_title: Title of the incoming article.
             existing_titles: Titles to compare against.

         Returns:
             True if the model's reply contains 'DUPLICATE'. False if the
             reply does not, if ``existing_titles`` is empty, or if the
             request fails (the error is printed, not raised).
         """
         if not existing_titles:
             return False
         try:
             # NOTE(review): every title passed in goes into one prompt, so
             # the prompt grows with the number of stored articles.
             titles_text = "\n".join([f"- {title}" for title in existing_titles])
             response = groq_client.chat.completions.create(
                 model=Config.GROQ_MODEL,
                 messages=[
                     {
                         "role": "system",
                         "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
                     },
                     {
                         "role": "user",
                         "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
                     }
                 ],
                 max_tokens=10,
                 temperature=0.1
             )
             result = response.choices[0].message.content.strip().upper()
             return "DUPLICATE" in result
         except Exception as e:
             # Best effort: a failed LLM call must not block ingestion
             print(f"Error checking LLM duplicate: {str(e)}")
             return False

     def is_duplicate_article(self, article: Dict) -> bool:
         """Run the similarity check, then the LLM title check.

         Args:
             article: Article dict; ``article['title']`` and
                 ``article['content']`` are read.

         Returns:
             True if either check reports a duplicate, False otherwise.
         """
         # Similarity check first; the LLM call is only made when it finds nothing
         if self.check_similarity_duplicate(article):
             return True

         # Without a store there are no existing titles to compare against
         if self.vector_db is None:
             return False

         existing_titles = [art['title'] for art in self.vector_db.articles]
         return self.check_llm_duplicate(article['title'], existing_titles)
 # Module-level fetcher used by fetch_all_news(). It is created without a
 # vector store (None); fetch_all_news() assigns news_fetcher.vector_db
 # before running any duplicate checks.
 news_fetcher = NewsFetcher(None, similarity_threshold=0.8)
 def fetch_rss_news(feed_url):
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):
 def fetch_all_news():
-    """Fetch news from all RSS feeds"""
+    """Fetch news from all RSS feeds with duplicate detection"""
    all_articles = []
    # Set the vector_db instance for news_fetcher
    from .recommender import vector_db
    news_fetcher.vector_db = vector_db
    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)
-            all_articles.extend(articles)
+            
            # Filter out duplicates
            unique_articles = []
            for article in articles:
                if not news_fetcher.is_duplicate_article(article):
                    unique_articles.append(article)
                else:
                    print(f"Skipping duplicate article: {article['title']}")
            all_articles.extend(unique_articles)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
@@ -1,3 +1,4 @@
 from typing import List, Dict, Optional
 from .embeddings import get_embeddings, get_query_embedding, rerank_results
 from .vector_store import VectorDB
 import groq
@@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
 vector_db = VectorDB()
 class NewsRecommender:
     """Recommend stored articles via vector search followed by re-ranking."""

     def __init__(self, vector_db: VectorDB):
         # Store queried by every recommendation call
         self.vector_db = vector_db

     def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]:
         """Return stored articles that best match ``text_description``.

         Args:
             text_description: Free text to embed and search with.
             top_n: Number of nearest articles requested from the store.

         Returns:
             The search hits in re-ranked order when re-ranking yields a
             result, the raw search hits otherwise, or an empty list when no
             query embedding is produced.
         """
         embedding = get_query_embedding(text_description)
         if not embedding:
             return []

         candidates = self.vector_db.search(embedding, k=top_n)
         if not candidates:
             return candidates

         # Re-rank on the same "title content" text used elsewhere for search
         texts = [f"{hit['title']} {hit['content']}" for hit in candidates]
         ranking = rerank_results(text_description, texts)
         if not ranking:
             return candidates

         # Keep only rankings that point at an existing candidate
         limit = len(candidates)
         return [candidates[entry.index] for entry in ranking if entry.index < limit]

     def get_personalized_recommendations(self, user_interests: str, top_n: int = 5) -> List[Dict]:
         """Recommend articles for ``user_interests``; delegates to ``recommend_by_text``."""
         return self.recommend_by_text(user_interests, top_n)
 # Module-level recommender built on this module's `vector_db`; the
 # function-style helpers below delegate to it.
 news_recommender = NewsRecommender(vector_db)
 def process_articles_for_vector_db(articles):
    """Process articles and add to vector database"""
    if not articles:
@@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3):
    # Get embedding for the article
    article_text = f"{article['title']} {article['content']}"
    query_embedding = get_query_embedding(article_text)
-    if not query_embedding:
+    # Use the new recommender with text description
-        return []
+    recommendations = news_recommender.recommend_by_text(article_text, top_n + 1)
    # Search for similar articles
    similar_articles = vector_db.search(query_embedding, k=top_n + 1)
    # Filter out the original article
-    recommendations = [art for art in similar_articles if art.get('slug') != article_id]
+    filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id]
-    return recommendations[:top_n]
+    return filtered_recommendations[:top_n]
 def analyze_article_with_groq(article_text):
@@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text):
            messages=[
                {
                    "role": "system",
-                    "content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
+                    "content": "You are an AI news analyst. Provide insights, "
                    "key points, and sentiment analysis for the given article."
                },
                {
                    "role": "user",
@@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text):
 def get_personalized_recommendations(user_interests, top_n=5):
    """Get personalized recommendations based on user interests"""
-    query_embedding = get_query_embedding(user_interests)
+    return news_recommender.get_personalized_recommendations(user_interests,
-    if not query_embedding:
+                                                             top_n)
        return []
    recommendations = vector_db.search(query_embedding, k=top_n)
    # Re-rank results for better relevance
    if recommendations:
        documents = [f"{art['title']} {art['content']}" for art in recommendations]
        reranked = rerank_results(user_interests, documents)
        if reranked:
            # Reorder recommendations based on reranking
            reordered = []
            for result in reranked:
                if result.index < len(recommendations):
                    reordered.append(recommendations[result.index])
            return reordered
    return recommendations