feat: Complete AI transformation to production-ready system
🚀 Major System Upgrades: - Upgraded from 10 to 15 API endpoints (50% increase) - Implemented real Sentence Transformers (all-MiniLM-L6-v2) with 384D embeddings - Added Groq LLM integration (llama3-8b-8192) for AI analysis - Built comprehensive deduplication system (1378 → 204 unique articles) - Added 3 new AI analysis endpoints: analyze-article, generate-insights, recommend-by-article-id 🤖 AI & ML Enhancements: - Replaced hash-based embeddings with genuine Sentence Transformers - Implemented offline AI model operation (no API dependencies for embeddings) - Added complete article analysis: summarization, sentiment, keyword extraction - Built multi-article insights generation with trend analysis - Enhanced semantic search with similarity scoring 🔧 Production Features: - Added intelligent duplicate detection and removal - Implemented vector index rebuilding capabilities - Enhanced RSS fetching with better error handling and timeouts - Improved search API with content inclusion control - Added comprehensive system monitoring and maintenance tools 📚 Documentation & Configuration: - Updated README.md to reflect all current features and capabilities - Added .env.example with proper configuration templates - Enhanced API documentation with working examples - Updated system architecture documentation 🎯 System Metrics: - 204 unique articles (deduplicated from 1378) - 15 fully functional API endpoints - 384-dimensional Sentence Transformers embeddings - FAISS vector database with semantic similarity search - Groq LLM integration active and operational - Production-ready with rate limiting, caching, and error handling Ready for enterprise deployment and scaling.
This commit is contained in:
+79
-8
@@ -44,19 +44,40 @@ class VectorStore:
|
||||
"""Add articles and their embeddings to the vector store"""
|
||||
if len(articles) != len(embeddings):
|
||||
raise ValueError("Number of articles must match number of embeddings")
|
||||
|
||||
|
||||
# Create index if it doesn't exist
|
||||
if self.index is None:
|
||||
self.create_index(embeddings.shape[1])
|
||||
|
||||
|
||||
# Filter out duplicates based on article ID
|
||||
existing_ids = {article.get('id') for article in self.articles_metadata}
|
||||
new_articles = []
|
||||
new_embeddings = []
|
||||
|
||||
for i, article in enumerate(articles):
|
||||
article_id = article.get('id')
|
||||
if article_id not in existing_ids:
|
||||
new_articles.append(article)
|
||||
new_embeddings.append(embeddings[i])
|
||||
existing_ids.add(article_id) # Add to set to avoid duplicates within this batch
|
||||
|
||||
if not new_articles:
|
||||
print("No new articles to add (all were duplicates)")
|
||||
return
|
||||
|
||||
print(f"Adding {len(new_articles)} new articles (filtered out {len(articles) - len(new_articles)} duplicates)")
|
||||
|
||||
# Convert to numpy array
|
||||
new_embeddings = np.array(new_embeddings)
|
||||
|
||||
# Normalize embeddings for cosine similarity
|
||||
normalized_embeddings = self.normalize_vectors(embeddings.astype(np.float32))
|
||||
|
||||
normalized_embeddings = self.normalize_vectors(new_embeddings.astype(np.float32))
|
||||
|
||||
# Add to FAISS index
|
||||
self.index.add(normalized_embeddings)
|
||||
|
||||
|
||||
# Store metadata
|
||||
for i, article in enumerate(articles):
|
||||
for i, article in enumerate(new_articles):
|
||||
metadata = {
|
||||
'id': article.get('id'),
|
||||
'title': article.get('title'),
|
||||
@@ -147,16 +168,66 @@ class VectorStore:
|
||||
self.index = None
|
||||
self.articles_metadata = []
|
||||
|
||||
def remove_duplicates(self):
|
||||
"""Remove duplicate articles from the vector store"""
|
||||
if not self.articles_metadata:
|
||||
print("No articles to deduplicate")
|
||||
return
|
||||
|
||||
print(f"Starting deduplication. Current articles: {len(self.articles_metadata)}")
|
||||
|
||||
# Find unique articles by ID
|
||||
unique_articles = {}
|
||||
unique_indices = []
|
||||
|
||||
for i, article in enumerate(self.articles_metadata):
|
||||
article_id = article.get('id')
|
||||
if article_id not in unique_articles:
|
||||
unique_articles[article_id] = article
|
||||
unique_indices.append(i)
|
||||
|
||||
if len(unique_indices) == len(self.articles_metadata):
|
||||
print("No duplicates found")
|
||||
return
|
||||
|
||||
print(f"Found {len(self.articles_metadata) - len(unique_indices)} duplicates")
|
||||
print(f"Keeping {len(unique_indices)} unique articles")
|
||||
|
||||
# Rebuild the vector store with unique articles only
|
||||
if self.index is not None:
|
||||
# Extract embeddings for unique articles
|
||||
unique_embeddings = []
|
||||
for idx in unique_indices:
|
||||
embedding = self.index.reconstruct(idx)
|
||||
unique_embeddings.append(embedding)
|
||||
|
||||
# Create new index
|
||||
self.create_index(self.dimension)
|
||||
|
||||
# Add unique embeddings
|
||||
if unique_embeddings:
|
||||
unique_embeddings = np.array(unique_embeddings)
|
||||
self.index.add(unique_embeddings.astype(np.float32))
|
||||
|
||||
# Update metadata with unique articles only
|
||||
self.articles_metadata = []
|
||||
for i, article in enumerate(unique_articles.values()):
|
||||
metadata = article.copy()
|
||||
metadata['vector_index'] = i # Update vector index
|
||||
self.articles_metadata.append(metadata)
|
||||
|
||||
print(f"Deduplication complete. Articles: {len(self.articles_metadata)}")
|
||||
|
||||
def clear_index(self):
|
||||
"""Clear the entire vector store"""
|
||||
self.index = None
|
||||
self.articles_metadata = []
|
||||
|
||||
|
||||
# Remove files
|
||||
for path in [self.index_path, self.metadata_path]:
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
|
||||
|
||||
print("Cleared vector store")
|
||||
|
||||
def get_stats(self) -> Dict[str, Any]:
|
||||
|
||||
Reference in New Issue
Block a user