bc485b44b8
- Enhanced README.md with a clearer project overview, features, technologies used, and installation instructions. - Updated vector dimension in config.py from 4096 to 1024 for Cohere embeddings. - Modified main.py to serve HTML responses for the home page, news fetching, and recommendations. - Improved error handling and ensured articles have links in the responses. - Cleaned up news_fetcher.py by removing unnecessary print statements. - Updated recommender.py to refine insights generation and summary extraction. - Added Jinja2 for templating and improved the project structure for better organization. - Included API documentation for better understanding of endpoints and usage.
92 lines
3.2 KiB
Python
92 lines
3.2 KiB
Python
from pinecone import Pinecone, ServerlessSpec
|
|
from typing import List, Dict, Any
|
|
from config import (
|
|
PINECONE_API_KEY,
|
|
PINECONE_INDEX_NAME,
|
|
VECTOR_DIMENSION,
|
|
TOP_K_RESULTS
|
|
)
|
|
|
|
class VectorStore:
    """Wrapper around a Pinecone index that stores article embeddings.

    On construction the configured index is created if it does not already
    exist, and a handle to it is kept in ``self.index``. All public methods
    report failures through their return value (``False`` / ``[]``) after
    printing the error, rather than raising.
    """

    # Number of vectors sent per upsert request. Sending everything in one
    # call can exceed Pinecone's request-size limit once article content is
    # included as metadata; 100 is a conservative chunk size.
    UPSERT_BATCH_SIZE = 100

    # Article fields copied into each vector's metadata.
    METADATA_FIELDS = (
        "title",
        "content",
        "link",
        "published",
        "source",
        "categories",
    )

    def __init__(self):
        """Create the Pinecone client and make sure the index is available."""
        self.pinecone = Pinecone(api_key=PINECONE_API_KEY)
        self.index_name = PINECONE_INDEX_NAME
        self._ensure_index()

    def _ensure_index(self):
        """Ensure the Pinecone index exists, creating it if it doesn't.

        A missing index is created as a serverless index (AWS, us-east-1)
        with cosine similarity and ``VECTOR_DIMENSION`` dimensions. In both
        cases ``self.index`` is bound to the index afterwards.
        """
        if self.index_name not in self.pinecone.list_indexes().names():
            self.pinecone.create_index(
                name=self.index_name,
                dimension=VECTOR_DIMENSION,
                metric="cosine",
                spec=ServerlessSpec(cloud="aws", region="us-east-1")
            )
            print(f"Created new index '{self.index_name}' with dimension {VECTOR_DIMENSION}")

        self.index = self.pinecone.Index(self.index_name)

    def upsert_articles(self, articles: List[Dict[str, Any]]) -> bool:
        """Upsert articles to the vector store.

        Args:
            articles: Article dicts. Each usable article needs an ``"id"``
                and an ``"embedding"``; the fields listed in
                ``METADATA_FIELDS`` are stored as metadata when present.

        Returns:
            ``True`` when every batch was written (or there was nothing to
            write), ``False`` if any request failed. Batches written before
            a failure are not rolled back.
        """
        try:
            vectors = []
            for article in articles:
                # Articles without an embedding or an id cannot be indexed;
                # skip them instead of failing the whole call.
                if article.get("embedding") is None or "id" not in article:
                    continue

                # Only copy fields that are present and not None, so one
                # incomplete article no longer aborts the entire upsert
                # (Pinecone metadata is documented as not accepting nulls).
                metadata = {
                    field: article[field]
                    for field in self.METADATA_FIELDS
                    if article.get(field) is not None
                }
                vectors.append({
                    "id": article["id"],
                    "values": article["embedding"],
                    "metadata": metadata,
                })

            # Send in fixed-size chunks to stay under the request-size limit.
            for start in range(0, len(vectors), self.UPSERT_BATCH_SIZE):
                batch = vectors[start:start + self.UPSERT_BATCH_SIZE]
                self.index.upsert(vectors=batch)
            return True
        except Exception as e:
            print(f"Error upserting articles: {str(e)}")
            return False

    def search_similar(self, query_embedding: List[float], top_k: int = TOP_K_RESULTS) -> List[Dict[str, Any]]:
        """Search for similar articles using the query embedding.

        Args:
            query_embedding: Embedding vector to search with.
            top_k: Maximum number of matches to request.

        Returns:
            One dict per match containing ``"id"``, ``"score"`` and the
            match's stored metadata fields, in the order returned by the
            index. An empty list is returned if the query fails.
        """
        try:
            results = self.index.query(
                vector=query_embedding,
                top_k=top_k,
                include_metadata=True
            )

            return [
                {
                    "id": match.id,
                    "score": match.score,
                    # A match may carry no metadata; treat that as empty
                    # rather than letting the unpacking raise TypeError and
                    # discard every result.
                    **(match.metadata or {}),
                }
                for match in results.matches
            ]
        except Exception as e:
            print(f"Error searching similar articles: {str(e)}")
            return []

    def delete_article(self, article_id: str) -> bool:
        """Delete an article from the vector store.

        Args:
            article_id: ID of the vector to delete.

        Returns:
            ``True`` if the delete request completed, ``False`` if it raised.
        """
        try:
            self.index.delete(ids=[article_id])
            return True
        except Exception as e:
            print(f"Error deleting article: {str(e)}")
            return False
|
|
|