82fe3608d2
- Introduced a Config dataclass in config.py to manage API keys, RSS feeds, and directory paths more effectively. - Updated the NewsFetcher class to include retry logic for fetching articles from RSS feeds. - Modified the EmbeddingGenerator and NewsRecommender classes to utilize the new configuration structure. - Enhanced main.py to implement API token verification for secure access to news fetching and recommendations.
87 lines
3.2 KiB
Python
87 lines
3.2 KiB
Python
from pinecone import Pinecone, ServerlessSpec
|
|
from typing import List, Dict, Any, Optional
|
|
from config import config
|
|
|
|
class VectorStore:
    """Wrapper around one Pinecone index that stores article embeddings.

    The index name, vector dimension and default result count are read from
    the project-level ``config`` object. The index is created during
    construction when it does not exist yet.
    """

    # Article keys copied verbatim into each vector's metadata, in this order.
    _METADATA_FIELDS = ("title", "content", "link", "published", "source", "categories")

    def __init__(self, pinecone_client: "Optional[Pinecone]" = None):
        """Bind to a Pinecone client and make sure the configured index exists.

        Args:
            pinecone_client: Client to use. When omitted (or falsy), a new
                client is built from ``config.pinecone_api_key``.
        """
        self.pinecone = pinecone_client or Pinecone(api_key=config.pinecone_api_key)
        self.index_name = config.pinecone_index_name
        self._ensure_index()

    def _ensure_index(self):
        """Ensure the Pinecone index exists, create it if it doesn't.

        Always finishes by binding ``self.index`` to the index handle.
        """
        if self.index_name not in self.pinecone.list_indexes().names():
            # Create a new index with the configured dimension.
            self.pinecone.create_index(
                name=self.index_name,
                dimension=config.vector_dimension,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1"),
            )
            print(f"Created new index '{self.index_name}' with dimension {config.vector_dimension}")

        self.index = self.pinecone.Index(self.index_name)

    def upsert_articles(self, articles: List[Dict[str, Any]], batch_size: int = 100) -> bool:
        """Upsert articles to the vector store.

        Articles without an ``"embedding"`` key are skipped. Every remaining
        article must provide ``"id"`` plus all metadata fields listed in
        ``_METADATA_FIELDS``.

        Args:
            articles: Article dicts to store.
            batch_size: Maximum number of vectors sent per upsert request.
                Values below 1 are treated as 1.

        Returns:
            ``True`` when every batch was sent (including the case where
            there was nothing to send), ``False`` if any error occurred. The
            error is printed, not raised. If a later batch fails, earlier
            batches have already been written.
        """
        try:
            # Build every vector up front so a malformed article aborts the
            # call before anything is sent to Pinecone.
            vectors = [
                {
                    "id": article["id"],
                    "values": article["embedding"],
                    "metadata": {field: article[field] for field in self._METADATA_FIELDS},
                }
                for article in articles
                if "embedding" in article
            ]

            # Pinecone limits the size of a single upsert request, so send the
            # vectors in bounded chunks instead of one unbounded call.
            step = max(1, batch_size)
            for start in range(0, len(vectors), step):
                self.index.upsert(vectors=vectors[start:start + step])
            return True
        except Exception as e:
            print(f"Error upserting articles: {str(e)}")
            return False

    def search_similar(self, query_embedding: List[float], top_k: Optional[int] = None) -> List[Dict[str, Any]]:
        """Search for similar articles using the query embedding.

        Args:
            query_embedding: Vector to search with.
            top_k: Number of matches to request. When ``None`` (or 0),
                ``config.top_k_results`` is used.

        Returns:
            One dict per match containing ``"id"``, ``"score"`` and the
            match's metadata entries. An empty list is returned if the query
            fails; the error is printed, not raised.
        """
        try:
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k or config.top_k_results,
                include_metadata=True,
            )
            # A match may carry no metadata; treat that as an empty mapping so
            # one such record does not discard the whole result set.
            return [
                {"id": match.id, "score": match.score, **(match.metadata or {})}
                for match in results.matches
            ]
        except Exception as e:
            print(f"Error searching similar articles: {str(e)}")
            return []

    def delete_article(self, article_id: str) -> bool:
        """Delete an article from the vector store.

        Args:
            article_id: ID of the vector to delete.

        Returns:
            ``True`` if the delete call completed, ``False`` if it raised
            (the error is printed).
        """
        try:
            self.index.delete(ids=[article_id])
            return True
        except Exception as e:
            print(f"Error deleting article: {str(e)}")
            return False
|
|
|