Add backend functionality for news fetching, processing, and recommendations

- Implemented NewsFetcher class to fetch articles from RSS feeds and clean HTML content. - Added EmbeddingGenerator for generating embeddings using Cohere API. - Created VectorStore for storing and retrieving articles using Pinecone. - Developed NewsRecommender for analyzing articles and generating insights with Groq. - Set up FastAPI application with endpoints for fetching news and providing recommendations. - Configured logging for better traceability and debugging. - Updated .gitignore to include environment variables and data directories. - Added requirements.txt for project dependencies.
2025-04-14 21:44:43 +01:00
parent 042f2386a0
commit e3d00bb4dc
8 changed files with 590 additions and 4 deletions
@@ -1,9 +1,43 @@
+# Environment variables
+.env
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
 # Virtual Environment
 .venv

-# Environment Variables
-.env

-# vscode settings
-.vscode
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Data directories
+data/raw_news/
+data/processed_news/
+
+# Logs
+*.log

@@ -0,0 +1,38 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+
+# Construct the path to the .env file
+# dotenv_path = os.path.join(os.path.dirname(__file__), '..', '.env')
+
+# Load environment variables from the specified path
+load_dotenv()
+
+# API Keys
+COHERE_API_KEY = os.getenv("COHERE_API_KEY")
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
+
+# Pinecone Configuration
+PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME", "news-articles")
+
+# News Sources
+RSS_FEEDS = [
+    "https://feeds.feedburner.com/TechCrunch/",
+    # "https://www.theverge.com/rss/index.xml",
+    # "https://www.wired.com/feed/rss",
+    # "https://www.technologyreview.com/feed/",
+]
+
+# Vector Database Settings
+VECTOR_DIMENSION = 4096  # Cohere embedding dimension
+TOP_K_RESULTS = 5
+
+# Data Directories
+RAW_NEWS_DIR = "data/raw_news"
+PROCESSED_NEWS_DIR = "data/processed_news"
+
+# Create directories if they don't exist
+os.makedirs(RAW_NEWS_DIR, exist_ok=True)
+os.makedirs(PROCESSED_NEWS_DIR, exist_ok=True)
@@ -0,0 +1,50 @@
+import cohere
+from typing import List, Dict, Any
+from config import COHERE_API_KEY
+
+class EmbeddingGenerator:
+    def __init__(self):
+        self.client = cohere.Client(COHERE_API_KEY)
+
+    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
+        """Generate embeddings for a list of texts using Cohere."""
+        try:
+            response = self.client.embed(
+                texts=texts,
+                model="embed-english-v3.0",
+                input_type="search_document"
+            )
+            return response.embeddings
+        except Exception as e:
+            print(f"Error generating embeddings: {str(e)}")
+            return []
+
+    def process_articles(self, articles: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Process articles and add embeddings to them."""
+        # Prepare texts for embedding
+        texts = [
+            f"{article['title']} {article['content']}" 
+            for article in articles
+        ]
+        
+        # Generate embeddings
+        embeddings = self.generate_embeddings(texts)
+        
+        # Add embeddings to articles
+        for article, embedding in zip(articles, embeddings):
+            article["embedding"] = embedding
+        
+        return articles
+
+    def get_query_embedding(self, query: str) -> List[float]:
+        """Generate embedding for a search query."""
+        try:
+            response = self.client.embed(
+                texts=[query],
+                model="embed-english-v3.0",
+                input_type="search_query"
+            )
+            return response.embeddings[0]
+        except Exception as e:
+            print(f"Error generating query embedding: {str(e)}")
+            return []
@@ -0,0 +1,112 @@
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from typing import List, Dict, Any
+import json
+import os
+
+from news_fetcher import NewsFetcher
+from embeddings import EmbeddingGenerator
+from vector_store import VectorStore
+from recommender import NewsRecommender
+from config import RAW_NEWS_DIR, PROCESSED_NEWS_DIR
+
+app = FastAPI(title="DS Task AI News API")
+
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Initialize components
+news_fetcher = NewsFetcher()
+embedding_generator = EmbeddingGenerator()
+vector_store = VectorStore()
+recommender = NewsRecommender()
+
+@app.get("/")
+async def root():
+    """Root endpoint returning API information."""
+    return {
+        "name": "DS Task AI News API",
+        "version": "1.0.0",
+        "description": "AI-powered news retrieval and recommendation system"
+    }
+
+@app.get("/fetch-news")
+async def fetch_news():
+    """Fetch news from RSS feeds and store in vector database."""
+    try:
+        result = news_fetcher.process()
+        if result["status"] == "error":
+            raise HTTPException(status_code=404, detail=result["message"])
+        
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/recommend-news")
+async def recommend_news(article_id: str = None, query: str = None):
+    """Get news recommendations based on article ID or search query."""
+    try:
+        if article_id:
+            # Get article from vector store
+            article = vector_store.search_similar([0] * 4096, top_k=1)  # Placeholder vector
+            if not article:
+                raise HTTPException(status_code=404, detail="Article not found")
+            
+            # Generate query embedding from article content
+            query_embedding = embedding_generator.get_query_embedding(
+                f"{article[0]['title']} {article[0]['content']}"
+            )
+        elif query:
+            # Generate query embedding from search query
+            query_embedding = embedding_generator.get_query_embedding(query)
+        else:
+            raise HTTPException(
+                status_code=400,
+                detail="Either article_id or query parameter is required"
+            )
+
+        # Search for similar articles
+        similar_articles = vector_store.search_similar(query_embedding)
+        if not similar_articles:
+            raise HTTPException(status_code=404, detail="No similar articles found")
+
+        # Generate insights for the articles
+        insights = recommender.analyze_articles(similar_articles)
+
+        return {
+            "articles": similar_articles,
+            "insights": insights
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/article/{article_id}")
+async def get_article(article_id: str):
+    """Get a specific article and its summary."""
+    try:
+        # Search for the article
+        articles = vector_store.search_similar([0] * 4096, top_k=1)  # Placeholder vector
+        if not articles:
+            raise HTTPException(status_code=404, detail="Article not found")
+
+        article = articles[0]
+        
+        # Generate summary
+        summary = recommender.generate_summary(article)
+
+        return {
+            "article": article,
+            "summary": summary
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -0,0 +1,178 @@
+import feedparser
+import json
+import os
+import logging
+from datetime import datetime
+from typing import List, Dict, Any
+from config import RSS_FEEDS, RAW_NEWS_DIR, PROCESSED_NEWS_DIR
+from embeddings import EmbeddingGenerator
+from vector_store import VectorStore
+from bs4 import BeautifulSoup
+import re
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('news_fetcher.log')
+    ]
+)
+logger = logging.getLogger('NewsFetcher')
+
+class NewsFetcher:
+    def __init__(self):
+        self.feeds = RSS_FEEDS
+        self.embedding_generator = EmbeddingGenerator()
+        self.vector_store = VectorStore()
+        logger.info("NewsFetcher initialized with %d RSS feeds", len(self.feeds))
+
+    def clean_html_content(self, html_content: str) -> str:
+        """Clean HTML content and extract plain text."""
+        logger.debug("Cleaning HTML content of length %d", len(html_content))
+        # Parse HTML with BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+        
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
+        
+        # Get text content
+        text = soup.get_text()
+        
+        # Clean up whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+        
+        # Remove extra spaces
+        text = re.sub(r'\s+', ' ', text)
+        
+        cleaned_text = text.strip()
+        logger.debug("Cleaned text length: %d", len(cleaned_text))
+        return cleaned_text
+
+    def fetch_rss_news(self, feed_url: str) -> List[Dict[str, Any]]:
+        """Fetch news articles from a single RSS feed."""
+        logger.info("Fetching news from feed: %s", feed_url)
+        feed = feedparser.parse(feed_url)
+        articles = []
+        
+        for entry in feed.entries:
+            # Get raw content with HTML
+            raw_content = entry.get("summary", "")
+            
+            # Clean HTML content
+            clean_content = self.clean_html_content(raw_content)
+            
+            article = {
+                "title": entry.title,
+                "raw_content": raw_content,  # Store original HTML content
+                "content": clean_content,     # Store cleaned text content
+                "link": entry.get("link", ""),
+                "published": entry.get("published", datetime.now().isoformat()),
+                "source": feed.feed.get("title", "Unknown"),
+                "categories": [tag.term for tag in entry.get("tags", [])],
+                "id": entry.get("id", entry.get("link", "")),
+            }
+            articles.append(article)
+        
+        logger.info("Fetched %d articles from %s", len(articles), feed_url)
+        return articles
+
+    def fetch_all_news(self) -> List[Dict[str, Any]]:
+        """Fetch news from all configured RSS feeds."""
+        logger.info("Starting to fetch news from all %d feeds", len(self.feeds))
+        all_articles = []
+        
+        for feed_url in self.feeds:
+            try:
+                articles = self.fetch_rss_news(feed_url)
+                all_articles.extend(articles)
+                logger.info("Successfully fetched %d articles from %s", len(articles), feed_url)
+            except Exception as e:
+                logger.error("Error fetching from %s: %s", feed_url, str(e))
+        
+        logger.info("Total articles fetched: %d", len(all_articles))
+        return all_articles
+
+    def save_raw_articles(self, articles: List[Dict[str, Any]]) -> str:
+        """Save raw articles to a JSON file."""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"raw_news_{timestamp}.json"
+        filepath = os.path.join(RAW_NEWS_DIR, filename)
+        
+        logger.info("Saving %d raw articles to %s", len(articles), filepath)
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(articles, f, ensure_ascii=False, indent=2)
+        
+        logger.info("Raw articles saved successfully")
+        return filepath
+
+    def save_processed_articles(self, articles: List[Dict[str, Any]]) -> str:
+        """Save processed articles with embeddings to a JSON file."""
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"processed_news_{timestamp}.json"
+        filepath = os.path.join(PROCESSED_NEWS_DIR, filename)
+        
+        # Create a copy of articles without raw_content for processed storage
+        processed_articles = []
+        for article in articles:
+            processed_article = article.copy()
+            processed_article.pop('raw_content', None)  # Remove raw_content from processed articles
+            processed_articles.append(processed_article)
+        
+        logger.info("Saving %d processed articles to %s", len(processed_articles), filepath)
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(processed_articles, f, ensure_ascii=False, indent=2)
+        
+        logger.info("Processed articles saved successfully")
+        return filepath
+
+    def process(self) -> Dict[str, Any]:
+        """Main process to fetch, process, and store news articles."""
+        logger.info("Starting news processing pipeline")
+        
+        # Fetch articles
+        logger.info("Step 1: Fetching articles from RSS feeds")
+        articles = self.fetch_all_news()
+        if not articles:
+            logger.warning("No articles found during fetching")
+            return {"status": "error", "message": "No articles found"}
+
+        # Save raw articles
+        logger.info("Step 2: Saving raw articles")
+        raw_filepath = self.save_raw_articles(articles)
+
+        # Generate embeddings
+        logger.info("Step 3: Generating embeddings for %d articles", len(articles))
+        articles_with_embeddings = self.embedding_generator.process_articles(articles)
+        logger.info("Embeddings generated successfully")
+
+        # Save processed articles
+        logger.info("Step 4: Saving processed articles with embeddings")
+        processed_filepath = self.save_processed_articles(articles_with_embeddings)
+
+        # Store in vector database
+        logger.info("Step 5: Storing articles in vector database")
+        success = self.vector_store.upsert_articles(articles_with_embeddings)
+        
+        if success:
+            logger.info("Articles successfully stored in vector database")
+        else:
+            logger.error("Failed to store articles in vector database")
+
+        result = {
+            "status": "success" if success else "error",
+            "message": "Articles processed and stored successfully" if success else "Failed to store articles",
+            "raw_filepath": raw_filepath,
+            "processed_filepath": processed_filepath,
+            "article_count": len(articles)
+        }
+        
+        logger.info("News processing pipeline completed with status: %s", result["status"])
+        return result
+
+news_fetcher = NewsFetcher()
+print(news_fetcher.process())
@@ -0,0 +1,75 @@
+from groq import Groq
+from typing import List, Dict, Any
+from config import GROQ_API_KEY
+
+class NewsRecommender:
+    def __init__(self):
+        self.client = Groq(api_key=GROQ_API_KEY)
+
+    def analyze_articles(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Analyze a set of articles using Groq to generate insights."""
+        try:
+            # Prepare the prompt
+            articles_text = "\n\n".join([
+                f"Title: {article['title']}\nContent: {article['content']}"
+                for article in articles
+            ])
+            
+            prompt = f"""Analyze these news articles and provide insights:
+
+{articles_text}
+
+Please provide:
+1. Main themes and topics
+2. Key insights and trends
+3. Potential implications
+4. Related areas of interest
+
+Format the response as a JSON with these keys: themes, insights, implications, related_areas"""
+
+            # Get completion from Groq
+            completion = self.client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": "You are a news analyst providing insights about technology and AI news."},
+                    {"role": "user", "content": prompt}
+                ],
+                model="mixtral-8x7b-32768",
+                temperature=0.7,
+                max_tokens=1000
+            )
+
+            # Parse and return the analysis
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"Error analyzing articles: {str(e)}")
+            return {
+                "themes": [],
+                "insights": [],
+                "implications": [],
+                "related_areas": []
+            }
+
+    def generate_summary(self, article: Dict[str, Any]) -> str:
+        """Generate a summary of a single article using Groq."""
+        try:
+            prompt = f"""Summarize this news article:
+
+Title: {article['title']}
+Content: {article['content']}
+
+Please provide a concise summary focusing on the key points and implications."""
+
+            completion = self.client.chat.completions.create(
+                messages=[
+                    {"role": "system", "content": "You are a news summarizer providing concise summaries of technology and AI news."},
+                    {"role": "user", "content": prompt}
+                ],
+                model="mixtral-8x7b-32768",
+                temperature=0.5,
+                max_tokens=500
+            )
+
+            return completion.choices[0].message.content
+        except Exception as e:
+            print(f"Error generating summary: {str(e)}")
+            return "Unable to generate summary."
@@ -0,0 +1,11 @@
+fastapi==0.109.2
+uvicorn==0.27.1
+feedparser==6.0.10
+cohere==4.47
+pinecone-client==3.0.2
+python-dotenv==1.0.1
+groq==0.4.2
+pydantic==2.6.3
+python-multipart==0.0.9
+httpx==0.27.0
+beautifulsoup4==4.12.3
@@ -0,0 +1,88 @@
+from pinecone import Pinecone, ServerlessSpec
+from typing import List, Dict, Any
+from config import (
+    PINECONE_API_KEY,
+    PINECONE_ENVIRONMENT,
+    PINECONE_INDEX_NAME,
+    VECTOR_DIMENSION,
+    TOP_K_RESULTS
+)
+
+class VectorStore:
+    def __init__(self):
+        self.pinecone = Pinecone(api_key=PINECONE_API_KEY)
+        self.index_name = PINECONE_INDEX_NAME
+        self._ensure_index()
+
+    def _ensure_index(self):
+        """Ensure the Pinecone index exists, create if it doesn't."""
+        if self.index_name not in self.pinecone.list_indexes().names():
+            self.pinecone.create_index(
+                name=self.index_name,
+                dimension=VECTOR_DIMENSION,
+                metric="cosine",
+                spec=ServerlessSpec(cloud="aws", region="us-east-1")
+            )
+        self.index = self.pinecone.Index(self.index_name)
+
+    def upsert_articles(self, articles: List[Dict[str, Any]]) -> bool:
+        """Upsert articles to the vector store."""
+        try:
+            vectors = []
+            for article in articles:
+                if "embedding" not in article:
+                    continue
+                    
+                vector = {
+                    "id": article["id"],
+                    "values": article["embedding"],
+                    "metadata": {
+                        "title": article["title"],
+                        "content": article["content"],
+                        "link": article["link"],
+                        "published": article["published"],
+                        "source": article["source"],
+                        "categories": article["categories"]
+                    }
+                }
+                vectors.append(vector)
+
+            if vectors:
+                self.index.upsert(vectors=vectors)
+            return True
+        except Exception as e:
+            print(f"Error upserting articles: {str(e)}")
+            return False
+
+    def search_similar(self, query_embedding: List[float], top_k: int = TOP_K_RESULTS) -> List[Dict[str, Any]]:
+        """Search for similar articles using the query embedding."""
+        try:
+            results = self.index.query(
+                vector=query_embedding,
+                top_k=top_k,
+                include_metadata=True
+            )
+            
+            articles = []
+            for match in results.matches:
+                article = {
+                    "id": match.id,
+                    "score": match.score,
+                    **match.metadata
+                }
+                articles.append(article)
+            
+            return articles
+        except Exception as e:
+            print(f"Error searching similar articles: {str(e)}")
+            return []
+
+    def delete_article(self, article_id: str) -> bool:
+        """Delete an article from the vector store."""
+        try:
+            self.index.delete(ids=[article_id])
+            return True
+        except Exception as e:
+            print(f"Error deleting article: {str(e)}")
+            return False
+