Update README and backend functionality for improved news application

- Enhanced README.md with a clearer project overview, features, technologies used, and installation instructions.
- Updated vector dimension in config.py from 4096 to 1024 for Cohere embeddings.
- Modified main.py to serve HTML responses for the home page, news fetching, and recommendations.
- Improved error handling and ensured articles have links in the responses.
- Cleaned up news_fetcher.py by removing unnecessary print statements.
- Updated recommender.py to refine insights generation and summary extraction.
- Added Jinja2 for templating and improved the project structure for better organization.
- Included API documentation for better understanding of endpoints and usage.
This commit is contained in:
boladeE
2025-04-15 11:59:39 +01:00
parent e3d00bb4dc
commit bc485b44b8
14 changed files with 957 additions and 108 deletions
+1 -1
View File
@@ -26,7 +26,7 @@ RSS_FEEDS = [
]
# Vector Database Settings
VECTOR_DIMENSION = 4096 # Cohere embedding dimension
VECTOR_DIMENSION = 1024 # Cohere embedding dimension
TOP_K_RESULTS = 5
# Data Directories
+68 -21
View File
@@ -1,5 +1,7 @@
from fastapi import FastAPI, HTTPException
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.responses import HTMLResponse
from typing import List, Dict, Any
import json
import os
@@ -12,6 +14,19 @@ from config import RAW_NEWS_DIR, PROCESSED_NEWS_DIR
app = FastAPI(title="DS Task AI News API")
# Configure templates
templates = Jinja2Templates(directory="backend/templates")
# Add custom filters
def from_json(value):
    """Parse a JSON document into a Python object (registered as a Jinja2 filter).

    Args:
        value: JSON text as ``str``, ``bytes`` or ``bytearray``.

    Returns:
        The decoded Python object, or ``None`` when ``value`` is not valid
        JSON, is of a type ``json.loads`` does not accept, or is a byte
        string that cannot be decoded as text. Note that the JSON literal
        ``null`` also decodes to ``None``.
    """
    try:
        return json.loads(value)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for undecodable bytes input, so a bad
        # value can never abort template rendering.
        return None
templates.env.filters["from_json"] = from_json
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
@@ -27,34 +42,51 @@ embedding_generator = EmbeddingGenerator()
vector_store = VectorStore()
recommender = NewsRecommender()
@app.get("/")
async def root():
"""Root endpoint returning API information."""
return {
"name": "DS Task AI News API",
"version": "1.0.0",
"description": "AI-powered news retrieval and recommendation system"
}
@app.get("/", response_class=HTMLResponse)
async def root(request: Request):
    """Serve the home page rendered from the ``home.html`` template."""
    # The template context carries only the incoming request object.
    page_context = {"request": request}
    return templates.TemplateResponse("home.html", page_context)
@app.get("/fetch-news", response_class=HTMLResponse)
async def fetch_news(request: Request):
    """Fetch news from RSS feeds and render the latest processed articles.

    Runs ``news_fetcher.process()``, loads the first file of
    ``PROCESSED_NEWS_DIR`` in reverse-sorted name order and renders its
    articles with the ``news.html`` template.

    Raises:
        HTTPException: 404 when the pipeline reports ``status == "error"``
            or no processed file exists; 500 for any other failure.
    """
    try:
        result = news_fetcher.process()
        if result["status"] == "error":
            raise HTTPException(status_code=404, detail=result["message"])

        # Get the latest processed articles.
        # NOTE(review): "latest" relies on file names sorting chronologically
        # - confirm against the naming scheme used by the fetcher.
        processed_files = sorted(os.listdir(PROCESSED_NEWS_DIR), reverse=True)
        if not processed_files:
            raise HTTPException(status_code=404, detail="No processed articles found")

        latest_file = os.path.join(PROCESSED_NEWS_DIR, processed_files[0])
        with open(latest_file, 'r', encoding='utf-8') as f:
            articles = json.load(f)

        # Ensure each article has a link
        for article in articles:
            if 'link' not in article or not article['link']:
                # If no link is available, use the article ID as a fallback
                article['link'] = f"/article/{article.get('id', '')}"

        return templates.TemplateResponse(
            "news.html",
            {"request": request, "articles": articles}
        )
    except HTTPException:
        # Deliberate HTTP errors (the 404s above) must reach the client
        # unchanged instead of being re-wrapped as a generic 500 below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/recommend-news")
async def recommend_news(article_id: str = None, query: str = None):
@app.get("/recommend-news", response_class=HTMLResponse)
async def recommend_news(request: Request, article_id: str = None, query: str = None):
"""Get news recommendations based on article ID or search query."""
try:
if article_id:
# Get article from vector store
article = vector_store.search_similar([0] * 4096, top_k=1) # Placeholder vector
article = vector_store.search_similar([0] * 1024, top_k=1) # Placeholder vector with correct dimension
if not article:
raise HTTPException(status_code=404, detail="Article not found")
@@ -76,13 +108,23 @@ async def recommend_news(article_id: str = None, query: str = None):
if not similar_articles:
raise HTTPException(status_code=404, detail="No similar articles found")
# Ensure each article has a link
for article in similar_articles:
if 'link' not in article or not article['link']:
# If no link is available, use the article ID as a fallback
article['link'] = f"/article/{article.get('id', '')}"
# Generate insights for the articles
insights = recommender.analyze_articles(similar_articles)
return {
"articles": similar_articles,
"insights": insights
}
return templates.TemplateResponse(
"recommendations.html",
{
"request": request,
"articles": similar_articles,
"insights": insights
}
)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@@ -91,12 +133,17 @@ async def get_article(article_id: str):
"""Get a specific article and its summary."""
try:
# Search for the article
articles = vector_store.search_similar([0] * 4096, top_k=1) # Placeholder vector
articles = vector_store.search_similar([0] * 1024, top_k=1) # Placeholder vector with correct dimension
if not articles:
raise HTTPException(status_code=404, detail="Article not found")
article = articles[0]
# Ensure the article has a link
if 'link' not in article or not article['link']:
# If no link is available, use the article ID as a fallback
article['link'] = f"/article/{article.get('id', '')}"
# Generate summary
summary = recommender.generate_summary(article)
@@ -109,4 +156,4 @@ async def get_article(article_id: str):
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
uvicorn.run(app, host="localhost", port=8000)
-2
View File
@@ -174,5 +174,3 @@ class NewsFetcher:
logger.info("News processing pipeline completed with status: %s", result["status"])
return result
news_fetcher = NewsFetcher()
print(news_fetcher.process())
+28 -6
View File
@@ -1,6 +1,7 @@
from groq import Groq
from typing import List, Dict, Any
from config import GROQ_API_KEY
import json
class NewsRecommender:
def __init__(self):
@@ -11,7 +12,7 @@ class NewsRecommender:
try:
# Prepare the prompt
articles_text = "\n\n".join([
f"Title: {article['title']}\nContent: {article['content']}"
f"Title: {article['title']}"
for article in articles
])
@@ -33,13 +34,34 @@ Format the response as a JSON with these keys: themes, insights, implications, r
{"role": "system", "content": "You are a news analyst providing insights about technology and AI news."},
{"role": "user", "content": prompt}
],
model="mixtral-8x7b-32768",
model="llama3-70b-8192",
temperature=0.7,
max_tokens=1000
max_tokens=500
)
# Parse and return the analysis
return completion.choices[0].message.content
response_text = completion.choices[0].message.content
# Try to extract JSON from the response if it's wrapped in markdown code blocks
if "```json" in response_text:
json_str = response_text.split("```json")[1].split("```")[0].strip()
try:
return json.loads(json_str)
except json.JSONDecodeError:
pass
elif "```" in response_text:
json_str = response_text.split("```")[1].split("```")[0].strip()
try:
return json.loads(json_str)
except json.JSONDecodeError:
pass
# If we couldn't extract JSON, try to parse the entire response
try:
return json.loads(response_text)
except json.JSONDecodeError:
# If all parsing attempts fail, return the raw text
return response_text
except Exception as e:
print(f"Error analyzing articles: {str(e)}")
return {
@@ -64,9 +86,9 @@ Please provide a concise summary focusing on the key points and implications."""
{"role": "system", "content": "You are a news summarizer providing concise summaries of technology and AI news."},
{"role": "user", "content": prompt}
],
model="mixtral-8x7b-32768",
model="llama3-70b-8192",
temperature=0.5,
max_tokens=500
max_tokens=250
)
return completion.choices[0].message.content
+1
View File
@@ -9,3 +9,4 @@ pydantic==2.6.3
python-multipart==0.0.9
httpx==0.27.0
beautifulsoup4==4.12.3
jinja2==3.1.2
+34
View File
@@ -0,0 +1,34 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{% block title %}DS Task AI News{% endblock %}</title>
<link href="https://cdn.jsdelivr.net/npm/tailwindcss@2.2.19/dist/tailwind.min.css" rel="stylesheet">
<style>
.article-card {
transition: transform 0.2s;
}
.article-card:hover {
transform: translateY(-5px);
}
</style>
</head>
<body class="bg-gray-100 min-h-screen">
<nav class="bg-blue-600 text-white p-4">
<div class="container mx-auto">
<h1 class="text-2xl font-bold">DS Task AI News</h1>
</div>
</nav>
<main class="container mx-auto px-4 py-8">
{% block content %}{% endblock %}
</main>
<footer class="bg-gray-800 text-white p-4 mt-8">
<div class="container mx-auto text-center">
<p>&copy; 2024 DS Task AI News. All rights reserved.</p>
</div>
</footer>
</body>
</html>
+54
View File
@@ -0,0 +1,54 @@
{% extends "base.html" %}
{% block title %}Home - DS Task AI News{% endblock %}
{% block content %}
<div class="max-w-4xl mx-auto">
<div class="text-center mb-12">
<h1 class="text-4xl font-bold text-gray-800 mb-4">Welcome to DS Task AI News</h1>
<p class="text-xl text-gray-600">Your AI-powered news retrieval and recommendation system</p>
</div>
<div class="grid grid-cols-1 md:grid-cols-2 gap-8">
<!-- Fetch News Card -->
<div class="bg-white rounded-lg shadow-md overflow-hidden hover:shadow-lg transition-shadow duration-300">
<div class="p-6">
<h2 class="text-2xl font-semibold text-gray-800 mb-4">Latest News</h2>
<p class="text-gray-600 mb-6">View the latest news articles fetched from our RSS feeds.</p>
<a href="/fetch-news" class="inline-block bg-blue-600 text-white px-6 py-3 rounded-md font-medium hover:bg-blue-700 transition-colors duration-300">
View Latest News
</a>
</div>
</div>
<!-- Recommend News Card -->
<div class="bg-white rounded-lg shadow-md overflow-hidden hover:shadow-lg transition-shadow duration-300">
<div class="p-6">
<h2 class="text-2xl font-semibold text-gray-800 mb-4">News Recommendations</h2>
<p class="text-gray-600 mb-6">Get personalized news recommendations based on your interests.</p>
<div class="space-y-4">
<a href="/recommend-news?query=technology" class="block bg-blue-600 text-white px-6 py-3 rounded-md font-medium hover:bg-blue-700 transition-colors duration-300 text-center">
Technology News
</a>
<a href="/recommend-news?query=artificial%20intelligence" class="block bg-blue-600 text-white px-6 py-3 rounded-md font-medium hover:bg-blue-700 transition-colors duration-300 text-center">
AI News
</a>
</div>
</div>
</div>
</div>
<div class="mt-12 bg-white rounded-lg shadow-md p-6">
<h2 class="text-2xl font-semibold text-gray-800 mb-4">About This Application</h2>
<p class="text-gray-600 mb-4">
This application uses AI to fetch, process, and recommend news articles. It leverages:
</p>
<ul class="list-disc list-inside text-gray-600 space-y-2">
<li>RSS feeds for news collection</li>
<li>Cohere embeddings for semantic understanding</li>
<li>Pinecone vector database for efficient retrieval</li>
<li>AI-powered analysis for personalized recommendations</li>
</ul>
</div>
</div>
{% endblock %}
+42
View File
@@ -0,0 +1,42 @@
{% extends "base.html" %}
{% block title %}Latest News - DS Task AI News{% endblock %}
{% block content %}
<div class="space-y-6">
<h2 class="text-3xl font-bold text-gray-800 mb-6">Latest News Articles</h2>
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
{% for article in articles %}
<article class="article-card bg-white rounded-lg shadow-md overflow-hidden">
<div class="p-6">
<h3 class="text-xl font-semibold text-gray-800 mb-2">
<a href="{{ article.link }}" target="_blank" class="hover:text-blue-600">
{{ article.title }}
</a>
</h3>
<p class="text-gray-600 mb-4">{{ article.content[:200] }}...</p>
<div class="flex justify-between items-center text-sm text-gray-500">
<span>{{ article.source }}</span>
<span>{{ article.published }}</span>
</div>
{% if article.categories %}
<div class="mt-4 flex flex-wrap gap-2">
{% for category in article.categories %}
<span class="px-2 py-1 bg-blue-100 text-blue-800 rounded-full text-xs">
{{ category }}
</span>
{% endfor %}
</div>
{% endif %}
<div class="mt-4">
<a href="{{ article.link }}" target="_blank" class="inline-block bg-blue-600 text-white px-4 py-2 rounded-md font-medium hover:bg-blue-700 transition-colors duration-300">
Read More
</a>
</div>
</div>
</article>
{% endfor %}
</div>
</div>
{% endblock %}
+157
View File
@@ -0,0 +1,157 @@
{% extends "base.html" %}
{% block title %}Recommended News - DS Task AI News{% endblock %}
{% block content %}
<div class="space-y-8">
<div class="bg-white rounded-lg shadow-md p-6 mb-8">
<h2 class="text-2xl font-bold text-gray-800 mb-4">AI Insights</h2>
<div class="prose max-w-none">
{% if insights %}
{% if insights is string %}
{# If insights is a string (JSON or markdown), try to parse it #}
{% set insights_data = insights | from_json %}
{% if insights_data %}
<div class="space-y-6">
{% if insights_data.themes %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Themes</h3>
<ul class="list-disc list-inside space-y-1">
{% for theme in insights_data.themes %}
<li class="text-gray-700">{{ theme }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights_data.insights %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Key Insights</h3>
<ul class="list-disc list-inside space-y-1">
{% for insight in insights_data.insights %}
<li class="text-gray-700">{{ insight }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights_data.implications %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Implications</h3>
<ul class="list-disc list-inside space-y-1">
{% for implication in insights_data.implications %}
<li class="text-gray-700">{{ implication }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights_data.related_areas %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Related Areas</h3>
<div class="flex flex-wrap gap-2">
{% for area in insights_data.related_areas %}
<span class="px-3 py-1 bg-blue-100 text-blue-800 rounded-full text-sm">
{{ area }}
</span>
{% endfor %}
</div>
</div>
{% endif %}
</div>
{% else %}
{# If parsing failed, display the raw insights #}
<div class="whitespace-pre-wrap">{{ insights }}</div>
{% endif %}
{% else %}
{# If insights is already a dict/object #}
<div class="space-y-6">
{% if insights.themes %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Themes</h3>
<ul class="list-disc list-inside space-y-1">
{% for theme in insights.themes %}
<li class="text-gray-700">{{ theme }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights.insights %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Key Insights</h3>
<ul class="list-disc list-inside space-y-1">
{% for insight in insights.insights %}
<li class="text-gray-700">{{ insight }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights.implications %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Implications</h3>
<ul class="list-disc list-inside space-y-1">
{% for implication in insights.implications %}
<li class="text-gray-700">{{ implication }}</li>
{% endfor %}
</ul>
</div>
{% endif %}
{% if insights.related_areas %}
<div>
<h3 class="text-xl font-semibold text-gray-800 mb-2">Related Areas</h3>
<div class="flex flex-wrap gap-2">
{% for area in insights.related_areas %}
<span class="px-3 py-1 bg-blue-100 text-blue-800 rounded-full text-sm">
{{ area }}
</span>
{% endfor %}
</div>
</div>
{% endif %}
</div>
{% endif %}
{% else %}
<p class="text-gray-600">No insights available for these articles.</p>
{% endif %}
</div>
</div>
<h2 class="text-3xl font-bold text-gray-800 mb-6">Recommended Articles</h2>
<div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
{% for article in articles %}
<article class="article-card bg-white rounded-lg shadow-md overflow-hidden">
<div class="p-6">
<h3 class="text-xl font-semibold text-gray-800 mb-2">
<a href="{{ article.link }}" target="_blank" class="hover:text-blue-600">
{{ article.title }}
</a>
</h3>
<p class="text-gray-600 mb-4">{{ article.content[:200] }}...</p>
<div class="flex justify-between items-center text-sm text-gray-500">
<span>{{ article.source }}</span>
<span>{{ article.published }}</span>
</div>
{% if article.categories %}
<div class="mt-4 flex flex-wrap gap-2">
{% for category in article.categories %}
<span class="px-2 py-1 bg-blue-100 text-blue-800 rounded-full text-xs">
{{ category }}
</span>
{% endfor %}
</div>
{% endif %}
<div class="mt-4">
<a href="{{ article.link }}" target="_blank" class="inline-block bg-blue-600 text-white px-4 py-2 rounded-md font-medium hover:bg-blue-700 transition-colors duration-300">
Read More
</a>
</div>
</div>
</article>
{% endfor %}
</div>
</div>
{% endblock %}
+4 -1
View File
@@ -2,7 +2,6 @@ from pinecone import Pinecone, ServerlessSpec
from typing import List, Dict, Any
from config import (
PINECONE_API_KEY,
PINECONE_ENVIRONMENT,
PINECONE_INDEX_NAME,
VECTOR_DIMENSION,
TOP_K_RESULTS
@@ -16,13 +15,17 @@ class VectorStore:
def _ensure_index(self):
    """Bind ``self.index`` to the configured Pinecone index, creating the index first if it is absent."""
    known_indexes = self.pinecone.list_indexes().names()
    index_exists = self.index_name in known_indexes

    if not index_exists:
        # No index with this name yet: create one sized by VECTOR_DIMENSION.
        self.pinecone.create_index(
            name=self.index_name,
            dimension=VECTOR_DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(cloud="aws", region="us-east-1"),
        )
        print(f"Created new index '{self.index_name}' with dimension {VECTOR_DIMENSION}")

    # Whether pre-existing or freshly created, keep a handle to the index.
    self.index = self.pinecone.Index(self.index_name)
def upsert_articles(self, articles: List[Dict[str, Any]]) -> bool: