ecd24ce2a6
🚀 Major System Upgrades: - Upgraded from 10 to 15 API endpoints (50% increase) - Implemented real Sentence Transformers (all-MiniLM-L6-v2) with 384D embeddings - Added Groq LLM integration (llama3-8b-8192) for AI analysis - Built comprehensive deduplication system (1378 → 204 unique articles) - Added 3 new AI analysis endpoints: analyze-article, generate-insights, recommend-by-article-id 🤖 AI & ML Enhancements: - Replaced hash-based embeddings with genuine Sentence Transformers - Implemented offline AI model operation (no API dependencies for embeddings) - Added complete article analysis: summarization, sentiment, keyword extraction - Built multi-article insights generation with trend analysis - Enhanced semantic search with similarity scoring 🔧 Production Features: - Added intelligent duplicate detection and removal - Implemented vector index rebuilding capabilities - Enhanced RSS fetching with better error handling and timeouts - Improved search API with content inclusion control - Added comprehensive system monitoring and maintenance tools 📚 Documentation & Configuration: - Updated README.md to reflect all current features and capabilities - Added .env.example with proper configuration templates - Enhanced API documentation with working examples - Updated system architecture documentation 🎯 System Metrics: - 204 unique articles (deduplicated from 1378) - 15 fully functional API endpoints - 384-dimensional Sentence Transformers embeddings - FAISS vector database with semantic similarity search - Groq LLM integration active and operational - Production-ready with rate limiting, caching, and error handling Ready for enterprise deployment and scaling.
58 lines
2.0 KiB
Python
58 lines
2.0 KiB
Python
"""Configuration settings for DS Task AI News"""
|
|
import os
|
|
from typing import List
|
|
from pydantic_settings import BaseSettings
|
|
from dotenv import load_dotenv
|
|
|
|
# Load variables from a .env file (if one is present) before the Settings
# class body below is evaluated: its field defaults call os.getenv() at
# class-definition time, so the environment must be populated first.
load_dotenv()
|
|
|
|
def _default_data_path(*parts: str) -> str:
    """Return a path inside the ``data`` directory of the project.

    The project root is taken to be the directory two levels above this
    file (the parent of the directory that contains it).

    Args:
        *parts: Path components appended below ``<project root>/data``.

    Returns:
        The joined absolute path as a string.
    """
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(project_root, "data", *parts)


class Settings(BaseSettings):
    """Configuration settings for DS Task AI News.

    Field defaults that use ``os.getenv`` are read from the environment once,
    when this class body is evaluated. The ``@property`` members
    (``rss_feeds``, ``raw_news_dir``, ``processed_news_dir``,
    ``vector_index_path``) re-read the environment on every access and are
    not declared as fields.

    NOTE(review): ``BaseSettings`` may additionally populate fields from the
    environment when the class is instantiated -- confirm against the
    pydantic-settings documentation for the installed version.
    """

    # API Keys (empty string when the variable is not set)
    cohere_api_key: str = os.getenv("COHERE_API_KEY", "")
    groq_api_key: str = os.getenv("GROQ_API_KEY", "")

    # Vector Database
    vector_db_type: str = os.getenv("VECTOR_DB_TYPE", "faiss")
    vector_dimension: int = int(os.getenv("VECTOR_DIMENSION", "384"))

    # RSS Feeds
    @property
    def rss_feeds(self) -> List[str]:
        """Return the RSS feed URLs to fetch.

        Reads the comma-separated ``RSS_FEEDS`` environment variable, falling
        back to three built-in feeds when it is not set. Surrounding
        whitespace is stripped and empty entries are dropped.
        """
        feeds_str = os.getenv(
            "RSS_FEEDS",
            "https://feeds.bbci.co.uk/news/technology/rss.xml,"
            "https://techcrunch.com/feed/,"
            "https://www.wired.com/feed/rss"
        )
        return [feed.strip() for feed in feeds_str.split(",") if feed.strip()]

    # Server Settings
    host: str = os.getenv("HOST", "0.0.0.0")
    port: int = int(os.getenv("PORT", "8000"))
    # Only the literal string "true" (any letter case) enables this default.
    debug: bool = os.getenv("DEBUG", "true").lower() == "true"

    # Data Storage (default paths live under <project root>/data)
    @property
    def raw_news_dir(self) -> str:
        """Return ``RAW_NEWS_DIR`` from the environment, or ``data/raw_news``."""
        return os.getenv("RAW_NEWS_DIR", _default_data_path("raw_news"))

    @property
    def processed_news_dir(self) -> str:
        """Return ``PROCESSED_NEWS_DIR`` from the environment, or ``data/processed_news``."""
        return os.getenv("PROCESSED_NEWS_DIR", _default_data_path("processed_news"))

    @property
    def vector_index_path(self) -> str:
        """Return ``VECTOR_INDEX_PATH`` from the environment, or ``data/news_vectors.faiss``."""
        return os.getenv("VECTOR_INDEX_PATH", _default_data_path("news_vectors.faiss"))

    # Embedding Model (will download automatically on first use)
    embedding_model: str = "all-MiniLM-L6-v2"

    # News Processing
    max_articles_per_feed: int = 50
    similarity_threshold: float = 0.1  # Very low threshold for maximum recall
|
|
|
|
# Module-level Settings instance, constructed once when this module is imported.
settings = Settings()
|