Files
DS_TASK_AI_VIEWS/backend/main.py
T
Aherobo Ovie Victor ecd24ce2a6 feat: Complete AI transformation to production-ready system
🚀 Major System Upgrades:
- Upgraded from 10 to 15 API endpoints (50% increase)
- Implemented real Sentence Transformers (all-MiniLM-L6-v2) with 384D embeddings
- Added Groq LLM integration (llama3-8b-8192) for AI analysis
- Built comprehensive deduplication system (1378 → 204 unique articles)
- Added 3 new AI analysis endpoints: analyze-article, generate-insights, recommend-by-article-id

🤖 AI & ML Enhancements:
- Replaced hash-based embeddings with genuine Sentence Transformers
- Implemented offline AI model operation (no API dependencies for embeddings)
- Added complete article analysis: summarization, sentiment, keyword extraction
- Built multi-article insights generation with trend analysis
- Enhanced semantic search with similarity scoring

🔧 Production Features:
- Added intelligent duplicate detection and removal
- Implemented vector index rebuilding capabilities
- Enhanced RSS fetching with better error handling and timeouts
- Improved search API with content inclusion control
- Added comprehensive system monitoring and maintenance tools

📚 Documentation & Configuration:
- Updated README.md to reflect all current features and capabilities
- Added .env.example with proper configuration templates
- Enhanced API documentation with working examples
- Updated system architecture documentation

🎯 System Metrics:
- 204 unique articles (deduplicated from 1378)
- 15 fully functional API endpoints
- 384-dimensional Sentence Transformers embeddings
- FAISS vector database with semantic similarity search
- Groq LLM integration active and operational
- Production-ready with rate limiting, caching, and error handling

Ready for enterprise deployment and scaling.
2025-07-09 12:31:24 +01:00

652 lines
22 KiB
Python

"""FastAPI backend for DS Task AI News"""
from fastapi import FastAPI, HTTPException, Query, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import uvicorn
import time
from collections import defaultdict
from datetime import datetime
from config import settings
from news_fetcher import NewsFetcher
from recommender import NewsRecommender
from ai_analyzer import AIAnalyzer
# Groq integration (optional): any exception raised while importing the package
# or constructing the client leaves the integration disabled.
groq_client = None
groq_available = False
try:
    from groq import Groq

    if settings.groq_api_key:
        groq_client = Groq(api_key=settings.groq_api_key)
    groq_available = groq_client is not None
    print(
        "✅ Groq LLM service initialized"
        if groq_available
        else "⚠️ Groq API key not provided"
    )
except Exception as e:
    print(f"⚠️ Groq initialization failed: {e}")
    # Reset both flags so a half-finished setup is never exposed.
    groq_client = None
    groq_available = False
# Initialize FastAPI app
app = FastAPI(
    title="DS Task AI News API",
    description="AI-powered news retrieval and recommendation system",
    version="1.0.0",
)

# Add CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive -- in production, specify actual origins.
_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)

# Initialize components (shared by every endpoint below)
news_fetcher = NewsFetcher()
recommender = NewsRecommender()
ai_analyzer = AIAnalyzer()
# Simple in-memory rate limiter: per-client list of accepted request timestamps
rate_limit_storage = defaultdict(list)
RATE_LIMIT_REQUESTS = 100  # requests per minute
RATE_LIMIT_WINDOW = 60  # seconds


def check_rate_limit(client_ip: str) -> bool:
    """Record a request for *client_ip* and report whether it is allowed.

    Timestamps older than ``RATE_LIMIT_WINDOW`` seconds are discarded first.
    Returns ``False`` (without recording the request) when the client already
    has ``RATE_LIMIT_REQUESTS`` requests inside the window, ``True`` otherwise.
    """
    now = time.time()

    # Keep only the timestamps that still fall inside the sliding window.
    recent = [
        stamp for stamp in rate_limit_storage[client_ip]
        if now - stamp < RATE_LIMIT_WINDOW
    ]
    rate_limit_storage[client_ip] = recent

    allowed = len(recent) < RATE_LIMIT_REQUESTS
    if allowed:
        # Only accepted requests count against the quota.
        recent.append(now)
    return allowed
# Pydantic models
# Request body for POST /recommend-by-query.
class NewsQuery(BaseModel):
    # Free-text query passed to the recommender.
    query: str
    # Number of recommendations requested (defaults to 5).
    top_k: int = 5
# Request body for POST /recommend-by-interests.
class InterestsQuery(BaseModel):
    # Interest strings passed to the recommender.
    interests: List[str]
    # Number of recommendations requested (defaults to 10).
    top_k: int = 10
# Request body for POST /search.
class SearchQuery(BaseModel):
    # Text used for the semantic search.
    query: str
    # Optional exact (case-insensitive) source filter.
    source: Optional[str] = None
    # Optional inclusive date bounds, given as date strings.
    date_from: Optional[str] = None
    date_to: Optional[str] = None
    # Maximum number of results returned (defaults to 10).
    top_k: int = 10
    # When False, the 'content' field is left out of each result.
    include_content: bool = False
# API Endpoints
@app.get("/")
async def root():
    """Health check endpoint"""
    # Static liveness payload; none of the backend components are consulted.
    payload = {
        "message": "DS Task AI News API is running!",
        "version": "1.0.0",
        "status": "healthy",
    }
    return payload
@app.get("/health")
async def health_check():
    """Detailed health check"""
    # Store statistics come straight from the recommender.
    store_stats = recommender.get_store_stats()

    # Subset of the configuration echoed back to the caller.
    active_settings = {
        "embedding_model": settings.embedding_model,
        "vector_db_type": settings.vector_db_type,
        "rss_feeds_count": len(settings.rss_feeds),
    }

    return {
        "status": "healthy",
        "vector_store": store_stats,
        "settings": active_settings,
    }
@app.post("/fetch-news")
async def fetch_news():
    """Fetch news from RSS feeds and add to vector store

    Returns:
        A dict with ``success``, ``message``, ``articles_fetched``,
        ``articles_stored`` and ``total_articles``.

    Raises:
        HTTPException: 500 when the fetcher or the store reports failure, or
            when an unexpected error occurs.
    """
    try:
        # Fetch news articles
        result = news_fetcher.fetch_and_save_news()
        if not result["success"]:
            raise HTTPException(
                status_code=500,
                detail=result.get("message", "Failed to fetch news"),
            )

        # Add articles to vector store
        store_result = recommender.add_articles_to_store(result["articles"])
        if not store_result["success"]:
            raise HTTPException(
                status_code=500,
                detail=store_result.get("message", "Failed to add articles to store"),
            )

        return {
            "success": True,
            "message": "News fetched and processed successfully",
            "articles_fetched": result["articles_count"],
            "articles_stored": store_result["articles_added"],
            "total_articles": store_result["total_articles"],
        }
    except HTTPException:
        # Errors raised deliberately above already carry the intended detail;
        # pass them through instead of re-wrapping them in the generic handler.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error fetching news: {str(e)}") from e
@app.post("/recommend-by-query")
async def recommend_by_query(query_data: NewsQuery):
    """Get news recommendations based on text query"""
    try:
        query_text = query_data.query
        matches = recommender.recommend_by_query(query_text, query_data.top_k)
        # Echo the query back so the caller can correlate the response.
        response = {
            "success": True,
            "query": query_text,
            "recommendations": matches,
            "count": len(matches),
        }
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.post("/recommend-by-interests")
async def recommend_by_interests(interests_data: InterestsQuery):
    """Get news recommendations based on user interests"""
    try:
        topics = interests_data.interests
        picks = recommender.recommend_by_interests(topics, interests_data.top_k)
        # Echo the interests back alongside the recommendations.
        response = {
            "success": True,
            "interests": topics,
            "recommendations": picks,
            "count": len(picks),
        }
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.get("/trending")
async def get_trending_news(top_k: int = Query(10, description="Number of trending articles to return")):
    """Get trending news articles"""
    try:
        # Selection of "trending" articles is delegated entirely to the recommender.
        trending_items = recommender.get_trending_articles(top_k)
        return dict(
            success=True,
            trending_articles=trending_items,
            count=len(trending_items),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting trending news: {str(e)}")
@app.get("/articles")
async def get_all_articles(
    source: Optional[str] = Query(None, description="Filter by news source"),
    limit: int = Query(50, description="Maximum number of articles to return"),
    offset: int = Query(0, description="Number of articles to skip for pagination"),
    category: Optional[str] = Query(None, description="Filter by article category"),
    date_from: Optional[str] = Query(None, description="Filter articles from this date (YYYY-MM-DD)"),
    date_to: Optional[str] = Query(None, description="Filter articles to this date (YYYY-MM-DD)")
):
    """Get all articles with pagination and advanced filtering

    Args:
        source: Case-insensitive exact match on the article's ``source``.
        limit: Page size; must be at least 1.
        offset: Number of articles to skip; must not be negative.
        category: Case-insensitive match against the article's ``categories``.
        date_from: Inclusive lower bound on ``published_date``.
        date_to: Inclusive upper bound on ``published_date``.

    Returns:
        A dict with the page of ``articles``, ``pagination`` metadata and the
        ``filters`` that were applied.

    Raises:
        HTTPException: 400 for an invalid ``limit``/``offset``; 500 on any
            unexpected error.
    """
    from datetime import datetime, timezone

    def parse_date(date_str):
        """Return *date_str* as a timezone-aware datetime, or None if unparseable."""
        if not isinstance(date_str, str) or not date_str:
            return None
        try:
            parsed = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            try:
                parsed = datetime.strptime(date_str, '%Y-%m-%d')
            except ValueError:
                return None
        # Naive values (e.g. a bare YYYY-MM-DD) are treated as UTC so they can
        # be compared with offset-aware timestamps without raising TypeError.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    try:
        # limit == 0 would divide by zero below; negative values yield
        # meaningless slices, so both are rejected up front.
        if limit < 1:
            raise HTTPException(status_code=400, detail="limit must be at least 1")
        if offset < 0:
            raise HTTPException(status_code=400, detail="offset must not be negative")

        # Get all articles first
        filtered_articles = recommender.vector_store.get_all_articles()

        # Filter by source (a missing or None source never matches)
        if source:
            wanted_source = source.lower()
            filtered_articles = [
                a for a in filtered_articles
                if (a.get('source') or '').lower() == wanted_source
            ]

        # Filter by category (if articles have categories)
        if category:
            wanted_category = category.lower()
            filtered_articles = [
                a for a in filtered_articles
                if wanted_category in [cat.lower() for cat in (a.get('categories') or [])]
            ]

        # Filter by date range. An unparseable bound is ignored; once a bound
        # applies, articles whose own date cannot be parsed are dropped.
        from_date = parse_date(date_from) if date_from else None
        to_date = parse_date(date_to) if date_to else None
        if from_date or to_date:
            def in_range(article):
                published = parse_date(article.get('published_date'))
                if published is None:
                    return False
                if from_date and published < from_date:
                    return False
                if to_date and published > to_date:
                    return False
                return True

            filtered_articles = [a for a in filtered_articles if in_range(a)]

        # Sort by published date (newest first); None sorts as an empty string
        filtered_articles = sorted(
            filtered_articles,
            key=lambda x: x.get('published_date') or '',
            reverse=True,
        )

        # Calculate pagination
        total_count = len(filtered_articles)
        end_idx = offset + limit
        paginated_articles = filtered_articles[offset:end_idx]

        # Calculate pagination metadata
        has_next = end_idx < total_count
        has_prev = offset > 0
        total_pages = (total_count + limit - 1) // limit  # Ceiling division
        current_page = (offset // limit) + 1

        return {
            "success": True,
            "articles": paginated_articles,
            "pagination": {
                "total_count": total_count,
                "count": len(paginated_articles),
                "limit": limit,
                "offset": offset,
                "current_page": current_page,
                "total_pages": total_pages,
                "has_next": has_next,
                "has_prev": has_prev,
                "next_offset": end_idx if has_next else None,
                "prev_offset": max(0, offset - limit) if has_prev else None
            },
            "filters": {
                "source": source,
                "category": category,
                "date_from": date_from,
                "date_to": date_to
            }
        }
    except HTTPException:
        # Keep the 400 responses raised above instead of converting them to 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting articles: {str(e)}") from e
@app.post("/search")
async def search_articles(search_data: SearchQuery, request: Request):
    """Advanced search with multiple filters and semantic similarity

    Args:
        search_data: Query text plus optional source/date filters, result
            count and the ``include_content`` switch.
        request: Incoming request; its client host keys the rate limiter.

    Returns:
        A dict with the filtered ``results``, their ``count``, the applied
        ``filters`` and the match totals before/after filtering.

    Raises:
        HTTPException: 429 when the client is rate limited; 500 on any
            unexpected error.
    """
    from datetime import datetime, timezone

    def parse_date(date_str):
        """Return *date_str* as a timezone-aware datetime, or None if unparseable."""
        if not isinstance(date_str, str) or not date_str:
            return None
        try:
            parsed = datetime.fromisoformat(date_str.replace('Z', '+00:00'))
        except ValueError:
            try:
                parsed = datetime.strptime(date_str, '%Y-%m-%d')
            except ValueError:
                return None
        # Naive values (e.g. a bare YYYY-MM-DD) are treated as UTC so they can
        # be compared with offset-aware timestamps without raising TypeError.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    try:
        # Rate limiting
        client_ip = request.client.host
        if not check_rate_limit(client_ip):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Get semantic search results first; twice the requested amount is
        # fetched so that filtering still leaves enough candidates.
        semantic_results = recommender.search_articles(search_data.query, {}, search_data.top_k * 2)

        # Apply additional filters
        filtered_results = semantic_results

        # Filter by source (a missing or None source never matches)
        if search_data.source:
            wanted_source = search_data.source.lower()
            filtered_results = [
                r for r in filtered_results
                if (r.get('source') or '').lower() == wanted_source
            ]

        # Filter by date range. An unparseable bound is ignored; once a bound
        # applies, results whose own date cannot be parsed are dropped.
        from_date = parse_date(search_data.date_from) if search_data.date_from else None
        to_date = parse_date(search_data.date_to) if search_data.date_to else None
        if from_date or to_date:
            def in_range(item):
                published = parse_date(item.get('published_date'))
                if published is None:
                    return False
                if from_date and published < from_date:
                    return False
                if to_date and published > to_date:
                    return False
                return True

            filtered_results = [r for r in filtered_results if in_range(r)]

        # Limit results to requested amount
        final_results = filtered_results[:search_data.top_k]

        # Optionally exclude content for lighter responses. Copies are built
        # rather than deleting the key in place, so dicts owned by the
        # recommender are never modified by a search request.
        if not search_data.include_content:
            final_results = [
                {key: value for key, value in result.items() if key != 'content'}
                for result in final_results
            ]

        return {
            "success": True,
            "query": search_data.query,
            "filters": {
                "source": search_data.source,
                "date_from": search_data.date_from,
                "date_to": search_data.date_to
            },
            "results": final_results,
            "count": len(final_results),
            "total_semantic_matches": len(semantic_results),
            "filtered_matches": len(filtered_results)
        }
    except HTTPException:
        # Without this clause the 429 above would be swallowed by the generic
        # handler and reported to the client as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error searching articles: {str(e)}") from e
@app.get("/stats")
async def get_stats():
    """Get system statistics"""
    try:
        stats = recommender.get_store_stats()

        # Add RSS feed, embedding model and Groq availability information
        extras = (
            ('rss_feeds', settings.rss_feeds),
            ('embedding_model', settings.embedding_model),
            ('groq_available', groq_available),
        )
        for key, value in extras:
            stats[key] = value

        return {"success": True, "statistics": stats}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting stats: {str(e)}")
# AI Analysis Endpoints
@app.get("/ai-status")
async def get_ai_status():
    """Get AI analyzer status and capabilities"""
    try:
        # The analyzer reports its own status; it is returned as-is.
        return {"success": True, "ai_status": ai_analyzer.get_status()}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting AI status: {str(e)}")
@app.post("/analyze-article")
async def analyze_article(request: Request, article_data: dict):
    """Analyze a specific article with AI (sentiment, keywords, summary)"""
    try:
        # Rate limiting
        if not check_rate_limit(request.client.host):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Validate input
        if not (article_data and 'id' in article_data):
            raise HTTPException(status_code=400, detail="Article ID is required")
        article_id = article_data['id']

        # Get article from vector store (first metadata entry with a matching id)
        article = next(
            (
                candidate
                for candidate in recommender.vector_store.articles_metadata
                if candidate.get('id') == article_id
            ),
            None,
        )
        if not article:
            raise HTTPException(status_code=404, detail="Article not found")

        # Perform AI analysis: summary, then sentiment, then keywords
        analysis = {
            'summary': ai_analyzer.summarize_article(article),
            'sentiment': ai_analyzer.analyze_sentiment(article),
            'keywords': ai_analyzer.extract_keywords(article),
        }

        return {
            "success": True,
            "article_id": article_id,
            "article_title": article.get('title', ''),
            "analysis": analysis,
            "analyzed_at": datetime.now().isoformat(),
        }
    except HTTPException:
        # 4xx responses raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error analyzing article: {str(e)}")
@app.post("/generate-insights")
async def generate_insights(request: Request, insights_data: dict = None):
    """Generate insights from recent articles using AI analysis

    Args:
        request: Incoming request; its client host keys the rate limiter.
        insights_data: Optional JSON body. ``limit`` (non-negative integer,
            default 20, ``null`` for no limit) caps the number of articles
            analysed; ``source`` (string) restricts them to one news source.

    Returns:
        A dict with the generated ``insights``, the ``article_count``, the
        ``source_filter`` applied and an ``analyzed_at`` timestamp.

    Raises:
        HTTPException: 429 when rate limited; 400 for a malformed ``limit`` or
            ``source``; 500 on any unexpected error.
    """
    try:
        # Rate limiting
        client_ip = request.client.host
        if not check_rate_limit(client_ip):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Get parameters
        params = insights_data or {}
        limit = params.get('limit', 20)
        source = params.get('source')

        # The body is untyped JSON: reject values that would otherwise crash
        # the slice/filter below (or, for a negative limit, silently slice
        # from the wrong end).
        if limit is not None and (
            isinstance(limit, bool) or not isinstance(limit, int) or limit < 0
        ):
            raise HTTPException(status_code=400, detail="limit must be a non-negative integer")
        if source is not None and not isinstance(source, str):
            raise HTTPException(status_code=400, detail="source must be a string")

        # Get recent articles
        articles = recommender.vector_store.articles_metadata

        # Filter by source if specified (a missing or None source never matches)
        if source:
            wanted_source = source.lower()
            articles = [a for a in articles if (a.get('source') or '').lower() == wanted_source]

        # Get most recent articles; a None added_date sorts as an empty string
        sorted_articles = sorted(articles, key=lambda x: x.get('added_date') or '', reverse=True)
        recent_articles = sorted_articles[:limit]

        if not recent_articles:
            return {
                "success": True,
                "insights": {
                    "trends": [],
                    "key_developments": [],
                    "implications": "No recent articles found for analysis"
                },
                "article_count": 0,
                # Same key as the non-empty response so clients see one shape.
                "source_filter": source,
                "analyzed_at": datetime.now().isoformat()
            }

        # Generate insights using AI
        insights = ai_analyzer.generate_insights(recent_articles)

        return {
            "success": True,
            "insights": insights,
            "article_count": len(recent_articles),
            "source_filter": source,
            "analyzed_at": datetime.now().isoformat()
        }
    except HTTPException:
        # 4xx responses raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating insights: {str(e)}") from e
@app.get("/recommend-by-article-id/{article_id}")
async def recommend_by_article_id(article_id: str, request: Request, top_k: int = Query(5, description="Number of recommendations")):
    """Get recommendations based on a specific article ID"""
    try:
        # Rate limiting
        if not check_rate_limit(request.client.host):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Find the article and its position in the metadata list
        located = next(
            (
                (position, candidate)
                for position, candidate in enumerate(recommender.vector_store.articles_metadata)
                if candidate.get('id') == article_id
            ),
            None,
        )
        source_index, source_article = located if located else (None, None)
        if not source_article:
            raise HTTPException(status_code=404, detail="Article not found")

        # Get article embedding from vector store
        if recommender.vector_store.index is None:
            raise HTTPException(status_code=500, detail="Vector index not available")

        # NOTE(review): the metadata position is used as the vector id, which
        # presumes metadata and index entries share the same order -- confirm.
        article_embedding = recommender.vector_store.index.reconstruct(source_index)

        # Find similar articles; one extra is requested because the source
        # article is filtered out of the results below.
        similar_results = recommender.vector_store.search_similar(
            article_embedding.reshape(1, -1),
            top_k + 1,
        )
        recommendations = [
            hit for hit in similar_results if hit.get('id') != article_id
        ][:top_k]

        source_summary = {
            "id": source_article.get('id'),
            "title": source_article.get('title'),
            "source": source_article.get('source'),
        }
        return {
            "success": True,
            "source_article": source_summary,
            "recommendations": recommendations,
            "count": len(recommendations),
        }
    except HTTPException:
        # 4xx/5xx responses raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting recommendations: {str(e)}")
@app.post("/rebuild-index")
async def rebuild_vector_index(request: Request):
    """Rebuild the vector index from existing metadata"""
    try:
        # Rate limiting
        if not check_rate_limit(request.client.host):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Check if we have metadata
        if not recommender.vector_store.articles_metadata:
            raise HTTPException(status_code=400, detail="No articles metadata found")
        articles_count = len(recommender.vector_store.articles_metadata)

        # Create articles list from metadata.
        # NOTE(review): only these seven fields are carried over; any other
        # metadata keys are not part of the rebuilt articles -- confirm intended.
        articles = [
            {
                'id': meta.get('id'),
                'title': meta.get('title', ''),
                'content': meta.get('content', ''),
                'url': meta.get('url'),
                'source': meta.get('source'),
                'published_date': meta.get('published_date'),
                'added_date': meta.get('added_date'),
            }
            for meta in recommender.vector_store.articles_metadata
        ]

        # Generate embeddings using the embedding generator
        from embeddings import EmbeddingGenerator
        embeddings = EmbeddingGenerator().generate_embeddings(articles)
        dimension = embeddings.shape[1]

        # Create new index, add articles, then persist
        recommender.vector_store.create_index(dimension)
        recommender.vector_store.add_articles(articles, embeddings)
        recommender.vector_store.save_index()

        return {
            "success": True,
            "message": "Vector index rebuilt successfully",
            "articles_processed": articles_count,
            "embedding_dimension": embeddings.shape[1],
        }
    except HTTPException:
        # 4xx responses raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error rebuilding index: {str(e)}")
@app.post("/remove-duplicates")
async def remove_duplicates(request: Request):
    """Remove duplicate articles from the vector store"""
    try:
        # Rate limiting
        if not check_rate_limit(request.client.host):
            raise HTTPException(status_code=429, detail="Rate limit exceeded. Please try again later.")

        # Article count before the cleanup
        original_count = len(recommender.vector_store.articles_metadata)

        # Remove duplicates, then save the cleaned index
        recommender.vector_store.remove_duplicates()
        recommender.vector_store.save_index()

        # Article count after the cleanup
        new_count = len(recommender.vector_store.articles_metadata)

        return {
            "success": True,
            "message": "Duplicates removed successfully",
            "original_count": original_count,
            "new_count": new_count,
            "duplicates_removed": original_count - new_count,
        }
    except HTTPException:
        # The 429 raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error removing duplicates: {str(e)}")
# Run the application
if __name__ == "__main__":
    # Host, port and auto-reload all come from the project settings.
    _server_options = {
        "host": settings.host,
        "port": settings.port,
        "reload": settings.debug,
    }
    uvicorn.run("main:app", **_server_options)