update recommender and news_fetcher

2025-07-24 16:35:04 +01:00
parent f28755e1fd
commit 8d2a277afe
4 changed files with 330 additions and 36 deletions
@@ -0,0 +1,112 @@
+# .gitignore for DS Task AI News Project
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+### Environment ###
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Virtual environment
+pythonenv*
+
+### IDE ###
+# VS Code
+# Ignore the directory's contents (not the directory itself): git cannot
+# re-include a file whose parent directory is excluded.
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+
+# PyCharm
+.idea/
+*.iml
+*.ipr
+*.iws
+
+### Data Files ###
+# Raw and processed news
+data/raw_news/
+data/processed_news/
+*.csv
+*.json
+*.parquet
+*.feather
+*.pkl
+*.pickle
+*.db
+*.sqlite
+
+# The last matching pattern wins, so the shared VS Code files must be
+# re-included again after the blanket *.json rule above.
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+
+# Vector database files
+*.faiss
+*.index
+*.bin
+*.vec
+
+### Logs ###
+*.log
+logs/
+
+### OS Generated ###
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+### Groq/Cohere Cache ###
+.cache/
+model_cache/
+
+### Test Files ###
+test_output/
+benchmark_results/
+
+### Documentation ###
+docs/_build/
@@ -1,7 +1,7 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
-from .recommender import recommend_similar, process_articles_for_vector_db
+from .recommender import recommend_similar, process_articles_for_vector_db, news_recommender
 from .recommender import analyze_article_with_groq
 from .recommender import get_personalized_recommendations, vector_db
 from .vector_store import VectorDB
@@ -41,7 +41,7 @@ async def root():

@app.get("/fetch-news")
 async def fetch_news():
-    """Fetch news from RSS feeds"""
+    """Fetch news from RSS feeds with duplicate detection"""
    try:
        articles = fetch_all_news()

@@ -74,7 +74,7 @@ async def fetch_news():

@app.get("/recommend-news")
 async def recommend_news(article_id: str):
-    """Retrieve similar news based on the selected article"""
+    """Retrieve similar news based on the selected article (backward compatibility)"""
    try:
        recommendations = recommend_similar(article_id)

@@ -91,6 +91,25 @@ async def recommend_news(article_id: str):
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")


+@app.get("/recommend-by-text")
+async def recommend_by_text(text_description: str, top_n: int = 3):
+    """Recommend articles that match a free-text description.
+
+    Args:
+        text_description: Free text used as the similarity query.
+        top_n: Number of recommendations requested (default 3).
+
+    Returns:
+        A dict echoing ``text_description`` together with the
+        ``recommendations`` list and its ``count``.
+
+    Raises:
+        HTTPException: 404 when the recommender returns nothing; 500 when the
+            recommender call itself raises.
+    """
+    # Keep the try body to the one call that can fail unexpectedly.
+    try:
+        recommendations = news_recommender.recommend_by_text(text_description, top_n)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") from e
+
+    # Raised outside the try block: HTTPException is an Exception subclass, so
+    # raising it inside would be caught above and reported as a 500.
+    if not recommendations:
+        raise HTTPException(status_code=404, detail="No recommendations found")
+
+    return {
+        "text_description": text_description,
+        "recommendations": recommendations,
+        "count": len(recommendations)
+    }
+
+
@app.get("/analyze-article")
 async def analyze_article(article_id: str):
    """Analyze article using Groq LLM"""
@@ -134,6 +153,7 @@ async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "database_articles": len(vector_db.articles)}

+
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -1,8 +1,126 @@
+# news_fetcher.py: RSS news fetching with similarity-search and LLM duplicate detection
+
 import feedparser
 import json
 import os
 from datetime import datetime
+from typing import List, Dict, Optional
 from .config import Config
+from .embeddings import get_query_embedding
+from .vector_store import VectorDB
+import groq
+import numpy as np
+
+# Module-level Groq client, created at import time from Config.GROQ_API_KEY.
+# NewsFetcher.check_llm_duplicate uses it for the title-comparison request.
+groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
+
+
+class NewsFetcher:
+    """Duplicate detection for newly fetched news articles.
+
+    Two checks are available: an embedding similarity search against the
+    vector database, and an LLM comparison of the new title with the titles
+    already stored in that database.
+    """
+
+    def __init__(self, vector_db: Optional[VectorDB], similarity_threshold: float = 0.8):
+        """
+        Args:
+            vector_db: Store exposing ``articles`` and ``search()``. May be
+                None until a real instance is assigned to ``self.vector_db``;
+                while it is None every duplicate check returns False.
+            similarity_threshold: A search result whose converted similarity
+                is strictly greater than this value counts as a duplicate.
+        """
+        self.vector_db = vector_db
+        self.similarity_threshold = similarity_threshold
+
+    def _has_articles(self) -> bool:
+        """Return True when a vector DB is attached and holds at least one article."""
+        return self.vector_db is not None and bool(self.vector_db.articles)
+
+    def check_similarity_duplicate(self, article: Dict) -> bool:
+        """
+        Check if article is a duplicate using similarity search
+
+        Args:
+            article: Article to check for duplicates; its ``title`` and
+                ``content`` values form the query text
+
+        Returns:
+            True if duplicate found, False otherwise
+        """
+        if not self._has_articles():
+            return False
+
+        # Create search text from title and content
+        search_text = f"{article['title']} {article['content']}"
+        query_embedding = get_query_embedding(search_text)
+
+        if not query_embedding:
+            return False
+
+        # Search for similar articles
+        similar_articles = self.vector_db.search(query_embedding, k=5)
+
+        for similar_article in similar_articles:
+            # NOTE(review): 'similarity_score' is treated as a distance
+            # (smaller = closer), as the original code assumed of FAISS L2
+            # results -- confirm against VectorDB.search.
+            distance = similar_article.get('similarity_score')
+            if distance is None:
+                # A missing score must not default to distance 0: that would
+                # convert to similarity 1.0 and flag every unscored result
+                # as a duplicate.
+                continue
+
+            # Map distance to a similarity in (0, 1]; distance 0 -> 1.0
+            similarity = 1 / (1 + distance)
+
+            if similarity > self.similarity_threshold:
+                return True
+
+        return False
+
+    def check_llm_duplicate(self, new_title: str, existing_titles: List[str]) -> bool:
+        """
+        Check if titles are duplicates using LLM comparison
+
+        Args:
+            new_title: New article title
+            existing_titles: List of existing article titles
+
+        Returns:
+            True if the model's reply contains 'DUPLICATE'. False when there
+            are no existing titles, when the reply does not contain it, or
+            when the request fails (best-effort: errors are printed and the
+            article is treated as unique).
+        """
+        if not existing_titles:
+            return False
+
+        try:
+            # One bullet per existing title, all sent in a single prompt
+            titles_text = "\n".join([f"- {title}" for title in existing_titles])
+
+            response = groq_client.chat.completions.create(
+                model=Config.GROQ_MODEL,
+                messages=[
+                    {
+                        "role": "system",
+                        "content": "You are a duplicate detection system. Compare the new article title with existing titles and respond with 'DUPLICATE' if they refer to the same news story, or 'UNIQUE' if it's a different story. Consider different phrasings, synonyms, and variations of the same story as duplicates."
+                    },
+                    {
+                        "role": "user",
+                        "content": f"New title: {new_title}\n\nExisting titles:\n{titles_text}"
+                    }
+                ],
+                max_tokens=10,
+                temperature=0.1
+            )
+
+            result = response.choices[0].message.content.strip().upper()
+            return "DUPLICATE" in result
+
+        except Exception as e:
+            print(f"Error checking LLM duplicate: {str(e)}")
+            return False
+
+    def is_duplicate_article(self, article: Dict) -> bool:
+        """
+        Check if article is duplicate using both similarity and LLM methods
+
+        Args:
+            article: Article to check
+
+        Returns:
+            True if duplicate, False otherwise (including when no vector DB is
+            attached or it holds no articles yet)
+        """
+        # Nothing stored means nothing to be a duplicate of; this also avoids
+        # an AttributeError while vector_db is still None.
+        if not self._has_articles():
+            return False
+
+        # First check similarity
+        if self.check_similarity_duplicate(article):
+            return True
+
+        # Then check with LLM, comparing against every stored title
+        existing_titles = [art['title'] for art in self.vector_db.articles]
+        return self.check_llm_duplicate(article['title'], existing_titles)
+
+
+# Module-level fetcher instance. It is created without a vector DB (None);
+# fetch_all_news() assigns news_fetcher.vector_db before it runs duplicate checks.
+news_fetcher = NewsFetcher(None, similarity_threshold=0.8)


 def fetch_rss_news(feed_url):
@@ -27,13 +145,27 @@ def fetch_rss_news(feed_url):


 def fetch_all_news():
-    """Fetch news from all RSS feeds"""
+    """Fetch news from all RSS feeds with duplicate detection"""
    all_articles = []
    
+    # Set the vector_db instance for news_fetcher
+    from .recommender import vector_db
+    news_fetcher.vector_db = vector_db
+
    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)
-            all_articles.extend(articles)
+            
+            # Filter out duplicates
+            unique_articles = []
+            for article in articles:
+                if not news_fetcher.is_duplicate_article(article):
+                    unique_articles.append(article)
+                else:
+                    print(f"Skipping duplicate article: {article['title']}")
+            
+            all_articles.extend(unique_articles)
+            
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")

@@ -1,3 +1,4 @@
+from typing import List, Dict, Optional
 from .embeddings import get_embeddings, get_query_embedding, rerank_results
 from .vector_store import VectorDB
 import groq
@@ -10,6 +11,56 @@ groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
 vector_db = VectorDB()


+class NewsRecommender:
+    """Text-driven article recommender backed by a vector database"""
+
+    def __init__(self, vector_db: VectorDB):
+        self.vector_db = vector_db
+
+    def recommend_by_text(self, text_description: str, top_n: int = 3) -> List[Dict]:
+        """
+        Find stored articles that match a free-text description
+
+        Args:
+            text_description: Query text to embed and search with
+            top_n: Number of nearest articles to request from the vector DB
+
+        Returns:
+            Matching articles in reranked order when reranking yields
+            results, otherwise in the order returned by the search; an empty
+            list when no query embedding is produced
+        """
+        embedding = get_query_embedding(text_description)
+        if not embedding:
+            return []
+
+        candidates = self.vector_db.search(embedding, k=top_n)
+        if not candidates:
+            return candidates
+
+        # Rerank on the same title + content text the candidates carry
+        texts = [f"{entry['title']} {entry['content']}" for entry in candidates]
+        ranking = rerank_results(text_description, texts)
+        if not ranking:
+            return candidates
+
+        # Keep only rerank hits whose index points into the candidate list
+        limit = len(candidates)
+        return [candidates[hit.index] for hit in ranking if hit.index < limit]
+
+    def get_personalized_recommendations(
+        self, user_interests: str, top_n: int = 5
+    ) -> List[Dict]:
+        """Recommend articles for a user by treating the interests as query text"""
+        return self.recommend_by_text(user_interests, top_n)
+
+
+# Initialize recommender instance
+news_recommender = NewsRecommender(vector_db)
+
+
 def process_articles_for_vector_db(articles):
    """Process articles and add to vector database"""
    if not articles:
@@ -35,18 +86,14 @@ def recommend_similar(article_id, top_n=3):

    # Get embedding for the article
    article_text = f"{article['title']} {article['content']}"
-    query_embedding = get_query_embedding(article_text)

-    if not query_embedding:
-        return []
-
-    # Search for similar articles
-    similar_articles = vector_db.search(query_embedding, k=top_n + 1)
+    # Use the new recommender with text description
+    recommendations = news_recommender.recommend_by_text(article_text, top_n + 1)

    # Filter out the original article
-    recommendations = [art for art in similar_articles if art.get('slug') != article_id]
+    filtered_recommendations = [art for art in recommendations if art.get('slug') != article_id]

-    return recommendations[:top_n]
+    return filtered_recommendations[:top_n]


 def analyze_article_with_groq(article_text):
@@ -57,7 +104,8 @@ def analyze_article_with_groq(article_text):
            messages=[
                {
                    "role": "system",
-                    "content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
+                    "content": "You are an AI news analyst. Provide insights, "
+                    "key points, and sentiment analysis for the given article."
                },
                {
                    "role": "user",
@@ -75,23 +123,5 @@ def analyze_article_with_groq(article_text):

 def get_personalized_recommendations(user_interests, top_n=5):
    """Get personalized recommendations based on user interests"""
-    query_embedding = get_query_embedding(user_interests)
-    if not query_embedding:
-        return []
-
-    recommendations = vector_db.search(query_embedding, k=top_n)
-
-    # Re-rank results for better relevance
-    if recommendations:
-        documents = [f"{art['title']} {art['content']}" for art in recommendations]
-        reranked = rerank_results(user_interests, documents)
-
-        if reranked:
-            # Reorder recommendations based on reranking
-            reordered = []
-            for result in reranked:
-                if result.index < len(recommendations):
-                    reordered.append(recommendations[result.index])
-            return reordered
-
-    return recommendations
+    return news_recommender.get_personalized_recommendations(user_interests,
+                                                             top_n)