Files
DS_TASK_AI_VIEWS/backend/embeddings.py
T

225 lines
8.6 KiB
Python

"""Embeddings generation for DS Task AI News"""
import os
import numpy as np
from typing import List, Dict, Any, Optional
try:
from sentence_transformers import SentenceTransformer
SENTENCE_TRANSFORMERS_AVAILABLE = True
except ImportError:
SENTENCE_TRANSFORMERS_AVAILABLE = False
print("⚠️ Sentence Transformers not available")
try:
import cohere
COHERE_AVAILABLE = True
except ImportError:
COHERE_AVAILABLE = False
print("⚠️ Cohere not available")
from config import settings
class EmbeddingGenerator:
def __init__(self):
self.cohere_client = None
self.sentence_model = None
self.use_cohere = COHERE_AVAILABLE and bool(settings.cohere_api_key)
self.model_loaded = False
self.dimension = settings.vector_dimension
# Initialize embedding model
if self.use_cohere:
try:
self.cohere_client = cohere.Client(settings.cohere_api_key)
print("✅ Using Cohere for embeddings")
self.model_loaded = True
except Exception as e:
print(f"❌ Cohere initialization failed: {e}")
self.use_cohere = False
if not self.use_cohere:
# Always start with simple embeddings for immediate functionality
print("⚡ Using fast hash-based embeddings for immediate startup")
self.model_loaded = True # Simple embeddings are always ready
# Note: Sentence Transformers available for future enhancement
def _load_sentence_model(self):
"""Lazy load sentence transformer model"""
if not self.model_loaded and SENTENCE_TRANSFORMERS_AVAILABLE:
try:
print("📥 Loading Sentence Transformer model (this may take a moment)...")
self.sentence_model = SentenceTransformer(settings.embedding_model)
self.model_loaded = True
print("✅ Sentence Transformer model loaded successfully")
except Exception as e:
print(f"❌ Failed to load Sentence Transformer: {e}")
self.sentence_model = None
self.model_loaded = False
def _simple_text_to_vector(self, text: str) -> np.ndarray:
"""Convert text to a simple vector using basic hashing (fallback method)"""
words = text.lower().split()
vector = np.zeros(self.dimension)
for i, word in enumerate(words[:50]): # Use first 50 words
hash_val = hash(word) % self.dimension
vector[hash_val] += 1.0 / (i + 1) # Weight by position
# Normalize
norm = np.linalg.norm(vector)
if norm > 0:
vector = vector / norm
return vector
def create_article_text(self, article: Dict[str, Any]) -> str:
"""Combine article fields into text for embedding"""
title = article.get('title', '')
content = article.get('content', '')
source = article.get('source', '')
# Combine with weights (title is more important)
text = f"{title}. {content}"
if source:
text += f" Source: {source}"
return text.strip()
def generate_embeddings_cohere(self, texts: List[str]) -> np.ndarray:
"""Generate embeddings using Cohere"""
try:
response = self.cohere_client.embed(
texts=texts,
model='embed-english-v3.0',
input_type='search_document'
)
return np.array(response.embeddings)
except Exception as e:
print(f"Cohere embedding error: {e}")
raise
def generate_embeddings_sentence_transformer(self, texts: List[str]) -> np.ndarray:
"""Generate embeddings using Sentence Transformers"""
try:
if not self.model_loaded and SENTENCE_TRANSFORMERS_AVAILABLE:
self._load_sentence_model()
if self.sentence_model is None:
# Use simple hash-based embeddings as fallback
print("⚠️ Using simple hash-based embeddings (Sentence Transformers not available)")
embeddings = []
for text in texts:
embedding = self._simple_text_to_vector(text)
embeddings.append(embedding)
return np.array(embeddings)
embeddings = self.sentence_model.encode(texts, convert_to_numpy=True)
return embeddings
except Exception as e:
print(f"❌ Sentence Transformer embedding error: {e}")
# Use simple embeddings as fallback
print("⚠️ Falling back to simple hash-based embeddings")
embeddings = []
for text in texts:
embedding = self._simple_text_to_vector(text)
embeddings.append(embedding)
return np.array(embeddings)
def generate_embeddings(self, articles: List[Dict[str, Any]]) -> np.ndarray:
"""Generate embeddings for articles"""
if not articles:
return np.array([])
# Create texts for embedding
texts = [self.create_article_text(article) for article in articles]
print(f"Generating embeddings for {len(texts)} articles...")
# Generate embeddings
if self.use_cohere:
embeddings = self.generate_embeddings_cohere(texts)
else:
embeddings = self.generate_embeddings_sentence_transformer(texts)
print(f"Generated embeddings shape: {embeddings.shape}")
return embeddings
def generate_query_embedding(self, query: str) -> np.ndarray:
"""Generate embedding for a search query"""
if self.use_cohere:
try:
response = self.cohere_client.embed(
texts=[query],
model='embed-english-v3.0',
input_type='search_query'
)
return np.array(response.embeddings[0])
except Exception as e:
print(f"Cohere query embedding error: {e}")
# Fallback to simple embeddings
return self._simple_text_to_vector(query)
else:
if self.sentence_model is not None:
return self.sentence_model.encode([query], convert_to_numpy=True)[0]
else:
# Use simple hash-based embeddings
return self._simple_text_to_vector(query)
def compute_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
"""Compute cosine similarity between two embeddings"""
# Normalize embeddings
norm1 = np.linalg.norm(embedding1)
norm2 = np.linalg.norm(embedding2)
if norm1 == 0 or norm2 == 0:
return 0.0
# Cosine similarity
similarity = np.dot(embedding1, embedding2) / (norm1 * norm2)
return float(similarity)
def find_similar_articles(self, query_embedding: np.ndarray,
article_embeddings: np.ndarray,
articles: List[Dict[str, Any]],
top_k: int = 5) -> List[Dict[str, Any]]:
"""Find most similar articles to query"""
if len(article_embeddings) == 0:
return []
similarities = []
for i, article_embedding in enumerate(article_embeddings):
similarity = self.compute_similarity(query_embedding, article_embedding)
similarities.append((similarity, i))
# Sort by similarity (descending)
similarities.sort(reverse=True)
# Get top-k results
results = []
for similarity, idx in similarities[:top_k]:
if similarity >= settings.similarity_threshold:
article = articles[idx].copy()
article['similarity_score'] = similarity
results.append(article)
return results
# Test function
if __name__ == "__main__":
# Test with sample articles
sample_articles = [
{
"title": "AI Revolution in Healthcare",
"content": "Artificial intelligence is transforming medical diagnosis and treatment.",
"source": "TechNews"
},
{
"title": "Climate Change Solutions",
"content": "New technologies are being developed to combat global warming.",
"source": "ScienceDaily"
}
]
generator = EmbeddingGenerator()
embeddings = generator.generate_embeddings(sample_articles)
print(f"Test embeddings shape: {embeddings.shape}")