"""FastAPI backend for DS Task AI News""" from fastapi import FastAPI, HTTPException, Query, Request from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel from typing import List, Dict, Any, Optional import uvicorn import time from collections import defaultdict from datetime import datetime from config import settings from news_fetcher import NewsFetcher from recommender import NewsRecommender from ai_analyzer import AIAnalyzer # Groq integration try: from groq import Groq groq_client = Groq(api_key=settings.groq_api_key) if settings.groq_api_key else None groq_available = groq_client is not None if groq_available: print("✅ Groq LLM service initialized") else: print("⚠️ Groq API key not provided") except Exception as e: print(f"⚠️ Groq initialization failed: {e}") groq_client = None groq_available = False # Initialize FastAPI app app = FastAPI( title="DS Task AI News API", description="AI-powered news retrieval and recommendation system", version="1.0.0" ) # Add CORS middleware app.add_middleware( CORSMiddleware, allow_origins=["*"], # In production, specify actual origins allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) # Initialize components news_fetcher = NewsFetcher() recommender = NewsRecommender() ai_analyzer = AIAnalyzer() # Simple rate limiter rate_limit_storage = defaultdict(list) RATE_LIMIT_REQUESTS = 100 # requests per minute RATE_LIMIT_WINDOW = 60 # seconds def check_rate_limit(client_ip: str) -> bool: """Check if client has exceeded rate limit""" current_time = time.time() # Clean old requests rate_limit_storage[client_ip] = [ req_time for req_time in rate_limit_storage[client_ip] if current_time - req_time < RATE_LIMIT_WINDOW ] # Check if limit exceeded if len(rate_limit_storage[client_ip]) >= RATE_LIMIT_REQUESTS: return False # Add current request rate_limit_storage[client_ip].append(current_time) return True # Pydantic models class NewsQuery(BaseModel): query: str top_k: int = 5 class InterestsQuery(BaseModel): interests: List[str] top_k: int = 10 class SearchQuery(BaseModel): query: str source: Optional[str] = None date_from: Optional[str] = None date_to: Optional[str] = None top_k: int = 10 include_content: bool = False # API Endpoints @app.get("/") async def root(): """Health check endpoint""" return { "message": "DS Task AI News API is running!", "version": "1.0.0", "status": "healthy" } @app.get("/health") async def health_check(): """Detailed health check""" stats = recommender.get_store_stats() return { "status": "healthy", "vector_store": stats, "settings": { "embedding_model": settings.embedding_model, "vector_db_type": settings.vector_db_type, "rss_feeds_count": len(settings.rss_feeds) } } @app.post("/fetch-news") async def fetch_news(): """Fetch news from RSS feeds and add to vector store""" try: # Fetch news articles result = news_fetcher.fetch_and_save_news() if not result["success"]: raise HTTPException(status_code=500, detail=result.get("message", "Failed to fetch news")) # Add articles to vector store articles = result["articles"] store_result = recommender.add_articles_to_store(articles) if not store_result["success"]: raise HTTPException(status_code=500, detail=store_result.get("message", "Failed to add articles to store")) return { "success": True, "message": "News fetched and processed successfully", "articles_fetched": result["articles_count"], "articles_stored": store_result["articles_added"], "total_articles": store_result["total_articles"] } except Exception as e: raise HTTPException(status_code=500, detail=f"Error fetching news: {str(e)}") @app.post("/recommend-by-query") async def recommend_by_query(query_data: NewsQuery): """Get news recommendations based on text query""" try: recommendations = recommender.recommend_by_query(query_data.query, query_data.top_k) return { "success": True, "query": query_data.query, "recommendations": recommendations, "count": len(recommendations) } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") @app.post("/recommend-by-interests") async def recommend_by_interests(interests_data: InterestsQuery): """Get news recommendations based on user interests""" try: recommendations = recommender.recommend_by_interests(interests_data.interests, interests_data.top_k) return { "success": True, "interests": interests_data.interests, "recommendations": recommendations, "count": len(recommendations) } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") @app.get("/trending") async def get_trending_news(top_k: int = Query(10, description="Number of trending articles to return")): """Get trending news articles""" try: trending = recommender.get_trending_articles(top_k) return { "success": True, "trending_articles": trending, "count": len(trending) } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting trending news: {str(e)}") @app.get("/articles") async def get_all_articles( source: Optional[str] = Query(None, description="Filter by news source"), limit: int = Query(50, description="Maximum number of articles to return"), offset: int = Query(0, description="Number of articles to skip for pagination"), category: Optional[str] = Query(None, description="Filter by article category"), date_from: Optional[str] = Query(None, description="Filter articles from this date (YYYY-MM-DD)"), date_to: Optional[str] = Query(None, description="Filter articles to this date (YYYY-MM-DD)") ): """Get all articles with pagination and advanced filtering""" try: # Get all articles first all_articles = recommender.vector_store.get_all_articles() # Apply filters filtered_articles = all_articles # Filter by source if source: filtered_articles = [a for a in filtered_articles if a.get('source', '').lower() == source.lower()] # Filter by category (if articles have categories) if category: filtered_articles = [a for a in filtered_articles if category.lower() in [cat.lower() for cat in a.get('categories', [])]] # Filter by date range if date_from or date_to: from datetime import datetime def parse_date(date_str): try: return datetime.fromisoformat(date_str.replace('Z', '+00:00')) except: try: return datetime.strptime(date_str, '%Y-%m-%d') except: return None if date_from: from_date = parse_date(date_from) if from_date: filtered_articles = [a for a in filtered_articles if parse_date(a.get('published_date', '')) and parse_date(a.get('published_date', '')) >= from_date] if date_to: to_date = parse_date(date_to) if to_date: filtered_articles = [a for a in filtered_articles if parse_date(a.get('published_date', '')) and parse_date(a.get('published_date', '')) <= to_date] # Sort by published date (newest first) filtered_articles = sorted(filtered_articles, key=lambda x: x.get('published_date', ''), reverse=True) # Calculate pagination total_count = len(filtered_articles) start_idx = offset end_idx = offset + limit paginated_articles = filtered_articles[start_idx:end_idx] # Calculate pagination metadata has_next = end_idx < total_count has_prev = offset > 0 total_pages = (total_count + limit - 1) // limit # Ceiling division current_page = (offset // limit) + 1 return { "success": True, "articles": paginated_articles, "pagination": { "total_count": total_count, "count": len(paginated_articles), "limit": limit, "offset": offset, "current_page": current_page, "total_pages": total_pages, "has_next": has_next, "has_prev": has_prev, "next_offset": end_idx if has_next else None, "prev_offset": max(0, offset - limit) if has_prev else None }, "filters": { "source": source, "category": category, "date_from": date_from, "date_to": date_to } } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting articles: {str(e)}") @app.post("/search") async def search_articles(search_data: SearchQuery, request: Request): """Advanced search with multiple filters and semantic similarity""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Get semantic search results first semantic_results = recommender.search_articles(search_data.query, {}, search_data.top_k * 2) # Apply additional filters filtered_results = semantic_results # Filter by source if search_data.source: filtered_results = [r for r in filtered_results if r.get('source', '').lower() == search_data.source.lower()] # Filter by date range if search_data.date_from or search_data.date_to: from datetime import datetime def parse_date(date_str): try: return datetime.fromisoformat(date_str.replace('Z', '+00:00')) except: try: return datetime.strptime(date_str, '%Y-%m-%d') except: return None if search_data.date_from: from_date = parse_date(search_data.date_from) if from_date: filtered_results = [r for r in filtered_results if parse_date(r.get('published_date', '')) and parse_date(r.get('published_date', '')) >= from_date] if search_data.date_to: to_date = parse_date(search_data.date_to) if to_date: filtered_results = [r for r in filtered_results if parse_date(r.get('published_date', '')) and parse_date(r.get('published_date', '')) <= to_date] # Limit results to requested amount final_results = filtered_results[:search_data.top_k] # Optionally exclude content for lighter responses if not search_data.include_content: for result in final_results: if 'content' in result: del result['content'] return { "success": True, "query": search_data.query, "filters": { "source": search_data.source, "date_from": search_data.date_from, "date_to": search_data.date_to }, "results": final_results, "count": len(final_results), "total_semantic_matches": len(semantic_results), "filtered_matches": len(filtered_results) } except Exception as e: raise HTTPException(status_code=500, detail=f"Error searching articles: {str(e)}") @app.get("/stats") async def get_stats(): """Get system statistics""" try: stats = recommender.get_store_stats() # Add RSS feed information stats['rss_feeds'] = settings.rss_feeds stats['embedding_model'] = settings.embedding_model stats['groq_available'] = groq_available return { "success": True, "statistics": stats } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting stats: {str(e)}") # AI Analysis Endpoints @app.get("/ai-status") async def get_ai_status(): """Get AI analyzer status and capabilities""" try: status = ai_analyzer.get_status() return { "success": True, "ai_status": status } except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting AI status: {str(e)}") @app.post("/analyze-article") async def analyze_article(request: Request, article_data: dict): """Analyze a specific article with AI (sentiment, keywords, summary)""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Validate input if not article_data or 'id' not in article_data: raise HTTPException(status_code=400, detail="Article ID is required") article_id = article_data['id'] # Get article from vector store articles = recommender.vector_store.articles_metadata article = None for a in articles: if a.get('id') == article_id: article = a break if not article: raise HTTPException(status_code=404, detail="Article not found") # Perform AI analysis analysis = {} # Get summary summary = ai_analyzer.summarize_article(article) analysis['summary'] = summary # Get sentiment analysis sentiment = ai_analyzer.analyze_sentiment(article) analysis['sentiment'] = sentiment # Get keywords keywords = ai_analyzer.extract_keywords(article) analysis['keywords'] = keywords return { "success": True, "article_id": article_id, "article_title": article.get('title', ''), "analysis": analysis, "analyzed_at": datetime.now().isoformat() } except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error analyzing article: {str(e)}") @app.post("/generate-insights") async def generate_insights(request: Request, insights_data: dict = None): """Generate insights from recent articles using AI analysis""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Get parameters limit = insights_data.get('limit', 20) if insights_data else 20 source = insights_data.get('source') if insights_data else None # Get recent articles articles = recommender.vector_store.articles_metadata # Filter by source if specified if source: articles = [a for a in articles if a.get('source', '').lower() == source.lower()] # Get most recent articles sorted_articles = sorted(articles, key=lambda x: x.get('added_date', ''), reverse=True) recent_articles = sorted_articles[:limit] if not recent_articles: return { "success": True, "insights": { "trends": [], "key_developments": [], "implications": "No recent articles found for analysis" }, "article_count": 0, "analyzed_at": datetime.now().isoformat() } # Generate insights using AI insights = ai_analyzer.generate_insights(recent_articles) return { "success": True, "insights": insights, "article_count": len(recent_articles), "source_filter": source, "analyzed_at": datetime.now().isoformat() } except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error generating insights: {str(e)}") @app.get("/recommend-by-article-id/{article_id}") async def recommend_by_article_id(article_id: str, request: Request, top_k: int = Query(5, description="Number of recommendations")): """Get recommendations based on a specific article ID""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Find the article articles = recommender.vector_store.articles_metadata source_article = None source_index = None for i, article in enumerate(articles): if article.get('id') == article_id: source_article = article source_index = i break if not source_article: raise HTTPException(status_code=404, detail="Article not found") # Get article embedding from vector store if recommender.vector_store.index is None: raise HTTPException(status_code=500, detail="Vector index not available") # Get the embedding for this article article_embedding = recommender.vector_store.index.reconstruct(source_index) # Find similar articles similar_results = recommender.vector_store.search_similar( article_embedding.reshape(1, -1), top_k + 1 # +1 to exclude the source article ) # Filter out the source article recommendations = [r for r in similar_results if r.get('id') != article_id][:top_k] return { "success": True, "source_article": { "id": source_article.get('id'), "title": source_article.get('title'), "source": source_article.get('source') }, "recommendations": recommendations, "count": len(recommendations) } except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}") @app.post("/rebuild-index") async def rebuild_vector_index(request: Request): """Rebuild the vector index from existing metadata""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Check if we have metadata if not recommender.vector_store.articles_metadata: raise HTTPException(status_code=400, detail="No articles metadata found") articles_count = len(recommender.vector_store.articles_metadata) # Create articles list from metadata articles = [] for meta in recommender.vector_store.articles_metadata: article = { 'id': meta.get('id'), 'title': meta.get('title', ''), 'content': meta.get('content', ''), 'url': meta.get('url'), 'source': meta.get('source'), 'published_date': meta.get('published_date'), 'added_date': meta.get('added_date') } articles.append(article) # Generate embeddings using the embedding generator from embeddings import EmbeddingGenerator embedding_gen = EmbeddingGenerator() embeddings = embedding_gen.generate_embeddings(articles) # Create new index and add articles recommender.vector_store.create_index(embeddings.shape[1]) recommender.vector_store.add_articles(articles, embeddings) recommender.vector_store.save_index() return { "success": True, "message": "Vector index rebuilt successfully", "articles_processed": articles_count, "embedding_dimension": embeddings.shape[1] } except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error rebuilding index: {str(e)}") @app.post("/remove-duplicates") async def remove_duplicates(request: Request): """Remove duplicate articles from the vector store""" try: # Rate limiting client_ip = request.client.host if not check_rate_limit(client_ip): raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.") # Get current stats original_count = len(recommender.vector_store.articles_metadata) # Remove duplicates recommender.vector_store.remove_duplicates() # Save the cleaned index recommender.vector_store.save_index() # Get new stats new_count = len(recommender.vector_store.articles_metadata) duplicates_removed = original_count - new_count return { "success": True, "message": "Duplicates removed successfully", "original_count": original_count, "new_count": new_count, "duplicates_removed": duplicates_removed } except HTTPException: raise except Exception as e: raise HTTPException(status_code=500, detail=f"Error removing duplicates: {str(e)}") # Run the application if __name__ == "__main__": uvicorn.run( "main:app", host=settings.host, port=settings.port, reload=settings.debug )