update all endpoints

2025-07-08 19:57:35 +01:00
parent b76a3e75f3
commit f1af775950
7 changed files with 400 additions and 54 deletions
@@ -3,11 +3,18 @@ from dotenv import load_dotenv
 load_dotenv()
 class Config:
    COHERE_API_KEY = os.getenv("COHERE_API_KEY")
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    RSS_FEEDS = [
        "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
-        "https://feeds.bbci.co.uk/news/technology/rss.xml"
+        "https://feeds.bbci.co.uk/news/technology/rss.xml",
        "https://feeds.a.dj.com/rss/RSSMarketsMain.xml",
        "https://rss.cnn.com/rss/edition.rss"
    ]
    VECTOR_DB_PATH = "data/vector_db.index"
    RAW_NEWS_PATH = "data/raw_news/"
    PROCESSED_NEWS_PATH = "data/processed_news/"
    EMBEDDING_MODEL = "embed-english-v3.0"
    GROQ_MODEL = "llama3-8b-8192"
@@ -1,8 +1,47 @@
 import cohere
-from backend.config import Config
+from .config import Config
 co = cohere.Client(Config.COHERE_API_KEY)
 def get_embeddings(texts):
-    response = co.embed(texts=texts, model="embed-english-v3.0")
+    """Generate embeddings using Cohere"""
    try:
        response = co.embed(
            texts=texts,
            model=Config.EMBEDDING_MODEL,
            input_type="search_document"
        )
        return response.embeddings
    except Exception as e:
        print(f"Error generating embeddings: {str(e)}")
        return None
 def get_query_embedding(query):
    """Generate embedding for search query"""
    try:
        response = co.embed(
            texts=[query],
            model=Config.EMBEDDING_MODEL,
            input_type="search_query"
        )
        return response.embeddings[0]
    except Exception as e:
        print(f"Error generating query embedding: {str(e)}")
        return None
 def rerank_results(query, documents):
    """Re-rank search results using Cohere"""
    try:
        response = co.rerank(
            model="rerank-english-v2.0",
            query=query,
            documents=documents,
            top_n=5
        )
        return response.results
    except Exception as e:
        print(f"Error reranking results: {str(e)}")
        return []
@@ -1,20 +1,139 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
-from backend.news_fetcher import fetch_news
+from fastapi.middleware.cors import CORSMiddleware
-from backend.recommender import recommend_similar
+from .news_fetcher import fetch_all_news, save_raw_news, save_processed_news
-from backend.config import Config
+from .recommender import recommend_similar, process_articles_for_vector_db
 from .recommender import analyze_article_with_groq
 from .recommender import get_personalized_recommendations, vector_db
 from .vector_store import VectorDB
 from .config import Config
 import os
 app = FastAPI(title="DS Task AI News", version="1.0.0")
 # Add CORS middleware
 app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )
@app.on_event("startup")
 async def startup_event():
    """Initialize the application"""
    # Create necessary directories
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)
    os.makedirs("data", exist_ok=True)
    # Load existing vector database if available
    if os.path.exists(Config.VECTOR_DB_PATH):
        vector_db.load_index(Config.VECTOR_DB_PATH)
@app.get("/")
 async def root():
    """Root endpoint"""
    return {"message": "DS Task AI News API", "version": "1.0.0"}
 app = FastAPI()
@app.get("/fetch-news")
-async def get_latest_news():
+async def fetch_news():
-    all_news = []
+    """Fetch news from RSS feeds"""
-    for feed in Config.RSS_FEEDS:
+    try:
-        all_news.extend(fetch_news(feed))
+        articles = fetch_all_news()
-    return {"news": all_news}
+
        if not articles:
            raise HTTPException(status_code=404, detail="No articles found")
        # Save raw news
        raw_file = save_raw_news(articles)
        # Process articles for vector database
        process_articles_for_vector_db(articles)
        # Save processed news
        processed_file = save_processed_news(articles)
        # Save vector database
        vector_db.save_index(Config.VECTOR_DB_PATH)
        return {
            "message": "News fetched successfully",
            "articles_count": len(articles),
            "raw_file": raw_file,
            "processed_file": processed_file,
            "articles": articles
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching news: {str(e)}")
-@app.get("/recommend")
+@app.get("/recommend-news")
 async def recommend_news(article_id: str):
-    sample_text = "AI breakthroughs in 2024"
+    """Retrieve similar news based on the selected article"""
-    similar_ids = recommend_similar(sample_text)
+    try:
-    return {"similar_articles": similar_ids}
+        recommendations = recommend_similar(article_id)
        if not recommendations:
            raise HTTPException(status_code=404, detail="No recommendations found")
        return {
            "article_id": article_id,
            "recommendations": recommendations,
            "count": len(recommendations)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.get("/analyze-article")
 async def analyze_article(article_id: str):
    """Analyze article using Groq LLM"""
    try:
        article = vector_db.get_article_by_id(article_id)
        if not article:
            raise HTTPException(status_code=404, detail="Article not found")
        article_text = f"{article['title']} {article['content']}"
        analysis = analyze_article_with_groq(article_text)
        return {
            "article_id": article_id,
            "article": article,
            "analysis": analysis
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error analyzing article: {str(e)}")
@app.get("/personalized-recommendations")
 async def personalized_recommendations(interests: str):
    """Get personalized recommendations based on user interests"""
    try:
        recommendations = get_personalized_recommendations(interests)
        return {
            "interests": interests,
            "recommendations": recommendations,
            "count": len(recommendations)
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting personalized recommendations: {str(e)}")
@app.get("/health")
 async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "database_articles": len(vector_db.articles)}
 if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
@@ -1,26 +1,66 @@
 # backend/news_fetcher.py
 from datetime import datetime
 import feedparser
 import json
 import os
 from datetime import datetime
 from .config import Config
-def fetch_news(rss_url):
+
-    feed = feedparser.parse(rss_url)
+def fetch_rss_news(feed_url):
    """Fetch news from RSS feed"""
    feed = feedparser.parse(feed_url)
    articles = []
    for entry in feed.entries:
        try:
            # Try parsing with timezone first
            pub_date = datetime.strptime(entry.published, "%a, %d %b %Y %H:%M:%S %z")
        except ValueError:
            try:
                # Fallback to GMT format without timezone
                pub_date = datetime.strptime(entry.published, "%a, %d %b %Y %H:%M:%S %Z")
            except ValueError:
                # Final fallback - use current time if parsing fails
                pub_date = datetime.now()
-        articles.append({
+    for entry in feed.entries:
        article = {
            "title": entry.title,
-            "content": entry.description,
+            "content": getattr(entry, 'summary', ''),
-            "published": pub_date,
+            "date": getattr(entry, 'published', ''),
-            "source": rss_url
+            "slug": entry.title.lower().replace(" ", "-").replace(",", "").replace(".", ""),
-        })
+            "categories": ["Technology", "AI and Innovation"],
            "tags": ["AI", "Technology", "Innovation"],
            "url": getattr(entry, 'link', ''),
            "source": feed_url
        }
        articles.append(article)
    return articles
 def fetch_all_news():
    """Fetch news from all RSS feeds"""
    all_articles = []
    for feed_url in Config.RSS_FEEDS:
        try:
            articles = fetch_rss_news(feed_url)
            all_articles.extend(articles)
        except Exception as e:
            print(f"Error fetching from {feed_url}: {str(e)}")
    return all_articles
 def save_raw_news(articles):
    """Save raw news articles to file"""
    os.makedirs(Config.RAW_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{Config.RAW_NEWS_PATH}news_{timestamp}.json"
    with open(filename, 'w') as f:
        json.dump(articles, f, indent=2)
    return filename
 def save_processed_news(articles):
    """Save processed news articles to file"""
    os.makedirs(Config.PROCESSED_NEWS_PATH, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{Config.PROCESSED_NEWS_PATH}processed_news_{timestamp}.json"
    with open(filename, 'w') as f:
        json.dump(articles, f, indent=2)
    return filename
@@ -1,8 +1,97 @@
-from backend.embeddings import get_embeddings
+from .embeddings import get_embeddings, get_query_embedding, rerank_results
-from backend.vector_store import VectorDB
+from .vector_store import VectorDB
 import groq
 from .config import Config
-db = VectorDB()
+# Initialize Groq client
 groq_client = groq.Groq(api_key=Config.GROQ_API_KEY)
-def recommend_similar(article_text, top_k=3):
+# Vector database instance
-    query_embed = get_embeddings([article_text])[0]
+vector_db = VectorDB()
-    return db.search(query_embed, k=top_k)
+
 def process_articles_for_vector_db(articles):
    """Process articles and add to vector database"""
    if not articles:
        return
    # Extract text content for embedding
    texts = []
    for article in articles:
        text = f"{article['title']} {article['content']}"
        texts.append(text)
    # Generate embeddings
    embeddings = get_embeddings(texts)
    if embeddings:
        vector_db.add_vectors(embeddings, articles)
 def recommend_similar(article_id, top_n=3):
    """Recommend similar articles based on article ID"""
    article = vector_db.get_article_by_id(article_id)
    if not article:
        return []
    # Get embedding for the article
    article_text = f"{article['title']} {article['content']}"
    query_embedding = get_query_embedding(article_text)
    if not query_embedding:
        return []
    # Search for similar articles
    similar_articles = vector_db.search(query_embedding, k=top_n + 1)
    # Filter out the original article
    recommendations = [art for art in similar_articles if art.get('slug') != article_id]
    return recommendations[:top_n]
 def analyze_article_with_groq(article_text):
    """Analyze article using Groq LLM"""
    try:
        response = groq_client.chat.completions.create(
            model=Config.GROQ_MODEL,
            messages=[
                {
                    "role": "system",
                    "content": "You are an AI news analyst. Provide insights, key points, and sentiment analysis for the given article."
                },
                {
                    "role": "user",
                    "content": f"Analyze this news article: {article_text}"
                }
            ],
            max_tokens=500,
            temperature=0.3
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error analyzing article with Groq: {str(e)}")
        return "Analysis unavailable"
 def get_personalized_recommendations(user_interests, top_n=5):
    """Get personalized recommendations based on user interests"""
    query_embedding = get_query_embedding(user_interests)
    if not query_embedding:
        return []
    recommendations = vector_db.search(query_embedding, k=top_n)
    # Re-rank results for better relevance
    if recommendations:
        documents = [f"{art['title']} {art['content']}" for art in recommendations]
        reranked = rerank_results(user_interests, documents)
        if reranked:
            # Reorder recommendations based on reranking
            reordered = []
            for result in reranked:
                if result.index < len(recommendations):
                    reordered.append(recommendations[result.index])
            return reordered
    return recommendations
@@ -1,14 +1,59 @@
 import numpy as np
 import faiss
-from backend.config import Config
+import os
 class VectorDB:
    def __init__(self):
-        self.index = faiss.IndexFlatL2(768)  # Cohere embedding dim
+        self.index = None
        self.articles = []
        self.dimension = 1024  # Cohere embedding dimension
-    def add_vectors(self, ids, embeddings):
+    def initialize_index(self):
-        self.index.add(np.array(embeddings).astype('float32'))
+        """Initialize FAISS index"""
        self.index = faiss.IndexFlatL2(self.dimension)
    def add_vectors(self, embeddings, articles):
        """Add vectors to the database"""
        if self.index is None:
            self.initialize_index()
        embeddings_array = np.array(embeddings).astype('float32')
        self.index.add(embeddings_array)
        self.articles.extend(articles)
    def search(self, query_embedding, k=5):
-        distances, indices = self.index.search(np.array([query_embedding]), k)
+        """Search for similar vectors"""
-        return indices[0]
+        if self.index is None or self.index.ntotal == 0:
            return []
        query_array = np.array([query_embedding]).astype('float32')
        distances, indices = self.index.search(query_array, k)
        results = []
        for i, idx in enumerate(indices[0]):
            if idx < len(self.articles):
                result = self.articles[idx].copy()
                result['similarity_score'] = float(distances[0][i])
                results.append(result)
        return results
    def save_index(self, filepath):
        """Save the index to file"""
        if self.index is not None:
            faiss.write_index(self.index, filepath)
    def load_index(self, filepath):
        """Load the index from file"""
        if os.path.exists(filepath):
            self.index = faiss.read_index(filepath)
    def get_article_by_id(self, article_id):
        """Get article by ID/slug"""
        for article in self.articles:
            if (article.get('slug') == article_id or
                article.get('title') == article_id or
                article.get('title', '').lower().replace(' ', '-').replace(',', '').replace('.', '') == article_id):
                return article
        return None
@@ -0,0 +1,7 @@
 # API Endpoints
 1. **GET /fetch-news** - Fetches news from RSS feeds
 2. **GET /recommend-news** - Retrieves similar news based on the selected article
 3. **GET /analyze-article** - Analyzes article using Groq LLM
 4. **GET /personalized-recommendations** - Get personalized recommendations
 5. **GET /health** - Health check endpoint