ecd24ce2a6
🚀 Major System Upgrades: - Upgraded from 10 to 15 API endpoints (50% increase) - Implemented real Sentence Transformers (all-MiniLM-L6-v2) with 384D embeddings - Added Groq LLM integration (llama3-8b-8192) for AI analysis - Built comprehensive deduplication system (1378 → 204 unique articles) - Added 3 new AI analysis endpoints: analyze-article, generate-insights, recommend-by-article-id 🤖 AI & ML Enhancements: - Replaced hash-based embeddings with genuine Sentence Transformers - Implemented offline AI model operation (no API dependencies for embeddings) - Added complete article analysis: summarization, sentiment, keyword extraction - Built multi-article insights generation with trend analysis - Enhanced semantic search with similarity scoring 🔧 Production Features: - Added intelligent duplicate detection and removal - Implemented vector index rebuilding capabilities - Enhanced RSS fetching with better error handling and timeouts - Improved search API with content inclusion control - Added comprehensive system monitoring and maintenance tools 📚 Documentation & Configuration: - Updated README.md to reflect all current features and capabilities - Added .env.example with proper configuration templates - Enhanced API documentation with working examples - Updated system architecture documentation 🎯 System Metrics: - 204 unique articles (deduplicated from 1378) - 15 fully functional API endpoints - 384-dimensional Sentence Transformers embeddings - FAISS vector database with semantic similarity search - Groq LLM integration active and operational - Production-ready with rate limiting, caching, and error handling Ready for enterprise deployment and scaling.
273 lines
10 KiB
Python
273 lines
10 KiB
Python
"""Vector database operations using FAISS"""
|
|
import os
|
|
import json
|
|
import pickle
|
|
import time
|
|
import numpy as np
|
|
import faiss
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
from datetime import datetime
|
|
from config import settings
|
|
|
|
class VectorStore:
    """FAISS-backed store of article embeddings with parallel metadata.

    Vectors live in a ``faiss.IndexFlatIP`` index and are L2-normalized
    before insertion, so inner-product scores behave as cosine similarity.
    ``articles_metadata`` is a list kept in insertion order; the position of
    an entry in that list is used as its row id in the FAISS index (see
    ``search_similar``). Both structures are persisted side by side on disk.
    """

    def __init__(self):
        """Read configuration from ``settings`` and load any saved store."""
        self.index_path = settings.vector_index_path
        self.metadata_path = self._metadata_path_for(self.index_path)
        self.dimension = settings.vector_dimension

        # FAISS index is created lazily on first insert (or loaded below).
        self.index = None
        self.articles_metadata = []

        # Simple in-memory cache for frequent queries: key -> (value, timestamp)
        self._cache = {}
        self._cache_ttl = 300  # seconds (5 minutes)

        # Load existing index if available
        self.load_index()

    @staticmethod
    def _metadata_path_for(index_path: str) -> str:
        """Derive the metadata pickle path that sits next to ``index_path``.

        ``.faiss`` is swapped for ``_metadata.pkl`` exactly as before. When
        the index path contains no ``.faiss`` the substitution is a no-op and
        both files would share one path (the metadata dump would overwrite
        the index), so the suffix is appended instead.
        """
        candidate = index_path.replace('.faiss', '_metadata.pkl')
        if candidate == index_path:
            candidate = index_path + '_metadata.pkl'
        return candidate

    def create_index(self, dimension: int):
        """Create a new, empty FAISS index and reset the metadata list.

        Args:
            dimension: Length of the vectors the index will hold.
        """
        # IndexFlatIP scores by inner product; vectors are normalized before
        # being added, which makes the score a cosine similarity.
        self.index = faiss.IndexFlatIP(dimension)
        self.articles_metadata = []
        # Keep the reported dimension in step with the real index.
        self.dimension = dimension
        print(f"Created new FAISS index with dimension {dimension}")

    def normalize_vectors(self, vectors: np.ndarray) -> np.ndarray:
        """Return ``vectors`` scaled to unit L2 norm, row by row.

        Args:
            vectors: 2-D array with one vector per row.

        Returns:
            Array of the same shape; all-zero rows are returned unchanged.
        """
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        norms[norms == 0] = 1  # Avoid division by zero
        return vectors / norms

    def add_articles(self, articles: List[Dict[str, Any]], embeddings: np.ndarray):
        """Add articles and their embeddings, skipping already-known ids.

        An article is a duplicate when its ``'id'`` matches one already in
        the store or one seen earlier in the same batch. New vectors are
        normalized and appended to the index, their metadata is recorded, and
        the store is saved to disk.

        Args:
            articles: Article dicts; ``id``, ``title``, ``content``, ``url``,
                ``source`` and ``published_date`` are read with ``.get``.
            embeddings: One embedding per article, in the same order.

        Raises:
            ValueError: If ``articles`` and ``embeddings`` differ in length.
        """
        if len(articles) != len(embeddings):
            raise ValueError("Number of articles must match number of embeddings")

        # Filter out duplicates based on article ID
        known_ids = {entry.get('id') for entry in self.articles_metadata}
        new_articles = []
        new_rows = []
        for row, article in enumerate(articles):
            article_id = article.get('id')
            if article_id in known_ids:
                continue
            new_articles.append(article)
            new_rows.append(row)
            known_ids.add(article_id)  # also dedupes within this batch

        if not new_articles:
            print("No new articles to add (all were duplicates)")
            return

        print(f"Adding {len(new_articles)} new articles (filtered out {len(articles) - len(new_articles)} duplicates)")

        # FAISS only accepts float32 input.
        vectors = np.asarray(embeddings, dtype=np.float32)[new_rows]

        # Create the index only once there is something to put in it.
        if self.index is None:
            self.create_index(vectors.shape[1])

        # Normalize embeddings for cosine similarity, then add to FAISS.
        self.index.add(self.normalize_vectors(vectors))

        # Store metadata in the same order the vectors were added.
        for article in new_articles:
            self.articles_metadata.append({
                'id': article.get('id'),
                'title': article.get('title'),
                # Truncate for storage; tolerate an explicit None content.
                'content': (article.get('content') or '')[:200],
                'url': article.get('url'),
                'source': article.get('source'),
                'published_date': article.get('published_date'),
                'added_date': datetime.now().isoformat(),
                # Position of this entry, used as its row id in FAISS.
                'vector_index': len(self.articles_metadata),
            })

        print(f"Added {len(new_articles)} articles to vector store")
        print(f"Total articles in store: {len(self.articles_metadata)}")

        # Cached query results may no longer reflect the store contents.
        self._clear_cache()

        # Save the updated index
        self.save_index()

    def search_similar(self, query_embedding: np.ndarray, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return metadata of the stored articles most similar to a query.

        Args:
            query_embedding: A single query vector.
            top_k: Maximum number of results to return.

        Returns:
            Copies of the matching metadata dicts, each with an added
            ``'similarity_score'`` float, in the order FAISS returned them.
            Empty when the store is empty or ``top_k`` is not positive.
        """
        if self.index is None or not self.articles_metadata or top_k <= 0:
            return []

        # FAISS requires float32; normalize so the score is a cosine similarity.
        query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
        query = self.normalize_vectors(query)

        # Search in FAISS
        limit = min(top_k, len(self.articles_metadata))
        similarities, indices = self.index.search(query, limit)

        results = []
        for similarity, idx in zip(similarities[0], indices[0]):
            # FAISS pads missing hits with -1; also skip rows lacking metadata.
            if 0 <= idx < len(self.articles_metadata):
                hit = self.articles_metadata[idx].copy()
                hit['similarity_score'] = float(similarity)
                # Always include results (threshold removed for better recall)
                results.append(hit)

        return results

    def get_article_by_id(self, article_id: str) -> Optional[Dict[str, Any]]:
        """Return the first metadata dict whose ``'id'`` matches, else None."""
        for article in self.articles_metadata:
            if article.get('id') == article_id:
                return article
        return None

    def get_all_articles(self) -> List[Dict[str, Any]]:
        """Return a shallow copy of the metadata list."""
        return self.articles_metadata.copy()

    def save_index(self):
        """Write the FAISS index and metadata to disk (best effort).

        Failures are reported with ``print`` rather than raised.
        """
        try:
            # Ensure directory exists. A bare filename has no directory part,
            # and os.makedirs('') would raise, so skip it in that case.
            directory = os.path.dirname(self.index_path)
            if directory:
                os.makedirs(directory, exist_ok=True)

            # Save FAISS index
            if self.index is not None:
                faiss.write_index(self.index, self.index_path)

            # Save metadata
            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.articles_metadata, f)

            print(f"Saved vector store to {self.index_path}")

        except Exception as e:
            print(f"Error saving vector store: {e}")

    def load_index(self):
        """Load the FAISS index and metadata from disk when the files exist.

        On any error the store is reset to empty. Failures are reported with
        ``print`` rather than raised.
        """
        try:
            # Load FAISS index
            if os.path.exists(self.index_path):
                self.index = faiss.read_index(self.index_path)
                print(f"Loaded FAISS index from {self.index_path}")

            # Load metadata
            if os.path.exists(self.metadata_path):
                # NOTE(security): pickle.load can execute arbitrary code; the
                # metadata file must only ever come from a trusted location.
                with open(self.metadata_path, 'rb') as f:
                    self.articles_metadata = pickle.load(f)
                print(f"Loaded {len(self.articles_metadata)} articles metadata")

            # Search maps FAISS row ids onto metadata positions, so a size
            # mismatch means results can point at the wrong article.
            if self.index is not None and self.index.ntotal != len(self.articles_metadata):
                print(
                    f"Warning: FAISS index holds {self.index.ntotal} vectors but "
                    f"{len(self.articles_metadata)} metadata entries were loaded"
                )

        except Exception as e:
            print(f"Error loading vector store: {e}")
            # Create new index if loading fails
            self.index = None
            self.articles_metadata = []

    def remove_duplicates(self):
        """Drop entries that repeat an earlier article id.

        The first occurrence of each id is kept. When an index exists it is
        rebuilt from the kept vectors, and every kept entry gets a fresh
        ``'vector_index'``. The result is not written to disk here; call
        ``save_index`` to persist it.
        """
        if not self.articles_metadata:
            print("No articles to deduplicate")
            return

        print(f"Starting deduplication. Current articles: {len(self.articles_metadata)}")

        # Find unique articles by ID (dict preserves first-seen order).
        unique_articles = {}
        unique_indices = []
        for position, article in enumerate(self.articles_metadata):
            article_id = article.get('id')
            if article_id not in unique_articles:
                unique_articles[article_id] = article
                unique_indices.append(position)

        if len(unique_indices) == len(self.articles_metadata):
            print("No duplicates found")
            return

        print(f"Found {len(self.articles_metadata) - len(unique_indices)} duplicates")
        print(f"Keeping {len(unique_indices)} unique articles")

        # Rebuild the vector store with unique articles only
        if self.index is not None:
            # Pull the kept vectors out before the old index is replaced.
            unique_embeddings = [self.index.reconstruct(idx) for idx in unique_indices]

            # Rebuild with the dimension of the index actually in use, which
            # may differ from the configured one if the index was loaded.
            self.create_index(self.index.d)

            # Stored vectors are already normalized; add them as they are.
            if unique_embeddings:
                self.index.add(np.array(unique_embeddings).astype(np.float32))

        # Update metadata with unique articles only
        self.articles_metadata = []
        for new_position, article in enumerate(unique_articles.values()):
            entry = article.copy()
            entry['vector_index'] = new_position  # Update vector index
            self.articles_metadata.append(entry)

        # Row ids changed, so cached results are stale.
        self._clear_cache()

        print(f"Deduplication complete. Articles: {len(self.articles_metadata)}")

    def clear_index(self):
        """Empty the store in memory and delete its files from disk."""
        self.index = None
        self.articles_metadata = []
        self._clear_cache()

        # Remove files
        for path in (self.index_path, self.metadata_path):
            if os.path.exists(path):
                os.remove(path)

        print("Cleared vector store")

    def get_stats(self) -> Dict[str, Any]:
        """Return summary statistics about the store.

        Returns:
            Dict with ``total_articles``, ``index_dimension``,
            ``index_exists``, ``index_size`` and ``last_updated`` (the
            greatest ``added_date`` string, or None when the store is empty).
        """
        has_index = self.index is not None
        added_dates = [entry.get('added_date') or '' for entry in self.articles_metadata]
        return {
            'total_articles': len(self.articles_metadata),
            # Report the real index dimension when an index is present.
            'index_dimension': self.index.d if has_index else self.dimension,
            'index_exists': has_index,
            'index_size': self.index.ntotal if has_index else 0,
            'last_updated': max(added_dates) if added_dates else None,
        }

    def _get_cache_key(self, operation: str, *args) -> str:
        """Build a cache key from an operation name and its arguments.

        The key is the MD5 hex digest of ``operation`` and the ``str`` of
        each argument joined by colons; it is an identifier, not a security
        measure.
        """
        import hashlib

        key_data = f"{operation}:{':'.join(map(str, args))}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def _get_from_cache(self, key: str) -> Optional[Any]:
        """Return the cached value for ``key``, or None if absent or expired.

        Expired entries are removed as a side effect.
        """
        entry = self._cache.get(key)
        if entry is None:
            return None

        cached_data, timestamp = entry
        if time.time() - timestamp < self._cache_ttl:
            return cached_data

        del self._cache[key]
        return None

    def _set_cache(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key`` together with the current time."""
        self._cache[key] = (value, time.time())

    def _clear_cache(self) -> None:
        """Remove every cache entry."""
        self._cache.clear()
|
|
|
|
# Manual smoke test: run this module directly to print the store statistics.
if __name__ == "__main__":
    # Constructing the store loads any saved index and metadata from disk.
    print(f"Vector store stats: {VectorStore().get_stats()}")
|