feat: Implement Pinecone vector store integration

- Update config.py with Pinecone settings and model configurations
- Implement VectorStore class with Pinecone backend
- Add comprehensive vector operations (add, search, delete)
- Set up proper error handling and metadata management
- Add .gitignore for Python project
This commit is contained in:
boladeE
2025-04-16 23:09:52 +01:00
commit 859c17aad8
27 changed files with 2820 additions and 0 deletions
+113
View File
@@ -0,0 +1,113 @@
from typing import Dict, List
import json
import os
from transformers import pipeline
import numpy as np
class BrandStyleChecker:
    """Score text against the brand's tone using sentiment and keyword heuristics.

    check_alignment() combines three signals: the sentiment reported by a
    DistilBERT SST-2 classifier, the share of configured tone keywords found
    in the text, and a small set of word-based tone checks.
    """

    # Tone words of this length or shorter must match a whole word; longer
    # ones match as a word prefix so inflections ("experts", "helpful") count.
    # Raw substring matching made "us" fire on "business" and "we" on "answer".
    _WHOLE_WORD_MAX_LEN = 3

    def __init__(self):
        self.style_guidelines = self._load_style_guidelines()
        self.classifier = pipeline(
            "text-classification",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1
        )
        self.tone_keywords = self._load_tone_keywords()

    def _load_style_guidelines(self) -> Dict:
        """Load brand style guidelines from file, or return built-in defaults."""
        guidelines_path = 'data/style_guidelines/brand_guidelines.json'
        if os.path.exists(guidelines_path):
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(guidelines_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {
            "tone": "professional yet approachable",
            "voice": "confident and authoritative",
            "key_phrases": [],
            "avoided_phrases": [],
            "brand_values": []
        }

    def _load_tone_keywords(self) -> Dict[str, List[str]]:
        """Load tone keywords for analysis, or return built-in defaults."""
        keywords_path = 'data/style_guidelines/tone_keywords.json'
        if os.path.exists(keywords_path):
            with open(keywords_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return {
            "professional": ["expert", "professional", "industry", "experience"],
            "approachable": ["friendly", "helpful", "understand", "support"],
            "confident": ["guaranteed", "proven", "success", "expertise"],
            "authoritative": ["leading", "best", "premier", "trusted"]
        }

    def check_alignment(self, content: str) -> float:
        """
        Check how well the content aligns with the brand style.

        Args:
            content: The content to check

        Returns:
            Alignment score between 0 and 1
        """
        scores = [
            self._check_sentiment(content),
            self._check_keywords(content),
            self._check_tone_consistency(content),
        ]
        # Weighted average: sentiment, keyword presence, tone consistency.
        weights = [0.3, 0.4, 0.3]
        return float(np.average(scores, weights=weights))

    def _check_sentiment(self, content: str) -> float:
        """Return the classifier's probability that the content is positive."""
        # truncation=True keeps copy longer than the model's input limit from
        # raising inside the pipeline.
        result = self.classifier(content, truncation=True)[0]
        # Positive sentiment is treated as the desired outcome.
        return result['score'] if result['label'] == 'POSITIVE' else 1 - result['score']

    def _check_keywords(self, content: str) -> float:
        """Return the fraction of configured tone keywords present in the content."""
        content_lower = content.lower()
        total_keywords = 0
        found_keywords = 0
        for keywords in self.tone_keywords.values():
            for keyword in keywords:
                total_keywords += 1
                # Lowercase the keyword too, so mixed-case entries in the JSON
                # file can match. Substring matching is kept here so that
                # multi-word phrases in the keyword file still work.
                if keyword.lower() in content_lower:
                    found_keywords += 1
        return found_keywords / total_keywords if total_keywords > 0 else 0.0

    @staticmethod
    def _tokenize(text: str) -> set:
        """Split text into a set of lowercase alphanumeric words."""
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return set(cleaned.split())

    @classmethod
    def _mentions_any(cls, tokens: set, keywords) -> bool:
        """Tell whether any keyword occurs among the tokens (see _WHOLE_WORD_MAX_LEN)."""
        for keyword in keywords:
            if len(keyword) <= cls._WHOLE_WORD_MAX_LEN:
                if keyword in tokens:
                    return True
            elif any(token.startswith(keyword) for token in tokens):
                return True
        return False

    def _check_tone_consistency(self, content: str) -> float:
        """Return the fraction of tone checks the content satisfies.

        This is a simplified heuristic; a fine-tuned model would be more
        reliable.
        """
        tokens = self._tokenize(content)
        tone_checks = (
            ("we", "our", "us"),                  # professional
            ("you", "your", "help", "support"),   # approachable
            ("guarantee", "proven", "expert"),    # confident
        )
        tone_matches = sum(
            1 for keywords in tone_checks if self._mentions_any(tokens, keywords)
        )
        return tone_matches / len(tone_checks)
+47
View File
@@ -0,0 +1,47 @@
from pydantic_settings import BaseSettings
from typing import Optional
import os
from dotenv import load_dotenv
load_dotenv()
class Settings(BaseSettings):
    """Application configuration, read from the environment and the .env file.

    Every field can be overridden by an environment variable of the same
    (case-sensitive) name.
    """

    # API Keys
    COHERE_API_KEY: str = os.getenv('COHERE_API_KEY', '')
    PINECONE_API_KEY: str = os.getenv('PINECONE_API_KEY', '')

    # Model Settings
    # NOTE(review): presumably the base checkpoint behind FINETUNED_MODEL_PATH — confirm.
    MODEL_NAME: str = "facebook/opt-350m"
    EMBEDDING_MODEL: str = "embed-english-v3.0"

    # Vector Store Settings
    # Must equal the embedding width: embed-english-v3.0 returns 1024-dimensional
    # vectors, so an index created with 768 would reject every upsert and query.
    VECTOR_DIMENSION: int = 1024
    MAX_SEARCH_RESULTS: int = 5

    # Pinecone Settings
    PINECONE_ENVIRONMENT: str = os.getenv('PINECONE_ENVIRONMENT', 'us-west1-gcp')
    PINECONE_INDEX_NAME: str = os.getenv('PINECONE_INDEX_NAME', 'marketing-assistant')

    # Content Generation Settings
    MAX_CONTENT_LENGTH: int = 500
    TEMPERATURE: float = 0.7
    TOP_P: float = 0.9

    # Brand Style Settings
    BRAND_GUIDELINES_PATH: str = "data/style_guidelines/brand_guidelines.json"
    TONE_KEYWORDS_PATH: str = "data/style_guidelines/tone_keywords.json"

    # Storage Settings
    VECTOR_STORE_PATH: str = "data/vector_store"
    PAST_CAMPAIGNS_PATH: str = "data/past_campaigns"
    USER_QUERIES_PATH: str = "data/user_queries"

    # Finetuned Model Settings
    FINETUNED_MODEL_PATH: str = "../finetuned_model"

    class Config:
        env_file = ".env"
        case_sensitive = True
# Module-level Settings instance, built once at import time so that other
# modules can share it via `from config import settings`.
settings = Settings()
+98
View File
@@ -0,0 +1,98 @@
from transformers import pipeline
from typing import List, Optional
import torch
from finetuned_model import finetuned_model
class MarketingCopywriter:
    """Thin wrapper that produces marketing copy with the finetuned model."""

    def __init__(self):
        # Use the finetuned model instead of the default GPT-2
        self.model = finetuned_model

    def generate(
        self,
        prompt: str,
        content_type: str,
        similar_content: List[str],
        tone: Optional[str] = None,
        target_audience: Optional[str] = None,
    ) -> str:
        """
        Generate one piece of marketing copy.

        Args:
            prompt: The main prompt for content generation
            content_type: Type of content (email, social media, etc.)
            similar_content: Example texts to include as context
            tone: Optional tone specification
            target_audience: Optional target audience specification. This is
                accepted because generate_marketing_copy() passes it; without
                the parameter that call raised TypeError.

        Returns:
            The first generated text, cleaned up, or "" when the model
            returned nothing.
        """
        generated_texts = self.model.generate_with_context(
            prompt=prompt,
            content_type=content_type,
            similar_content=similar_content,
            tone=tone,
            target_audience=target_audience,
            max_length=500,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9
        )
        if not generated_texts:
            return ""
        return self._post_process(generated_texts[0])

    def _build_context(
        self,
        prompt: str,
        content_type: str,
        similar_content: List[str],
        tone: Optional[str],
        target_audience: Optional[str]
    ) -> str:
        """Assemble the context string: type, tone, audience, examples, request."""
        lines = [f"Content Type: {content_type}"]
        if tone:
            lines.append(f"Tone: {tone}")
        if target_audience:
            lines.append(f"Target Audience: {target_audience}")
        lines.append("\nSimilar Content Examples:")
        # Use only the top 3 similar content pieces
        lines.extend(f"- {content}" for content in similar_content[:3])
        lines.append(f"\nGenerate marketing copy for: {prompt}")
        return "\n".join(lines) + "\n"

    def _post_process(self, text: str) -> str:
        """Clean up generated text (currently only trims surrounding whitespace)."""
        return text.strip()
# Shared MarketingCopywriter instance, created at import time; the
# module-level generate_marketing_copy() function delegates to it.
copywriter = MarketingCopywriter()
def generate_marketing_copy(
    prompt: str,
    content_type: str,
    similar_content: List[str],
    tone: Optional[str] = None,
    target_audience: Optional[str] = None
) -> str:
    """
    Produce marketing copy by delegating to the shared copywriter instance.

    Args:
        prompt: What the copy should be about
        content_type: Kind of content to write (email, social media, etc.)
        similar_content: Related texts supplied as context
        tone: Desired tone, if any
        target_audience: Intended audience, if any

    Returns:
        The generated marketing copy
    """
    # Every argument is forwarded by keyword, unchanged.
    request = {
        "prompt": prompt,
        "content_type": content_type,
        "similar_content": similar_content,
        "tone": tone,
        "target_audience": target_audience,
    }
    return copywriter.generate(**request)
# Manual smoke test. Guarded so that importing this module (as main.py does)
# no longer runs a full model inference whose result was thrown away.
if __name__ == "__main__":
    print(
        generate_marketing_copy(
            prompt="Help me write a blog post about the benefits of using our product",
            content_type="blog post",
            similar_content=[],
            tone="",
            target_audience=""
        )
    )
+55
View File
@@ -0,0 +1,55 @@
import cohere
import numpy as np
from typing import List, Union
import os
from dotenv import load_dotenv
load_dotenv()
class CohereEmbeddings:
    """Client wrapper that turns text into Cohere embedding vectors."""

    def __init__(self, model: str = 'embed-english-v3.0'):
        """
        Create the Cohere client from the COHERE_API_KEY environment variable.

        Args:
            model: Name of the Cohere embedding model to call

        Raises:
            ValueError: If COHERE_API_KEY is not set
        """
        self.api_key = os.getenv('COHERE_API_KEY')
        if not self.api_key:
            raise ValueError("COHERE_API_KEY environment variable is not set")
        self.client = cohere.Client(self.api_key)
        self.model = model

    def generate(
        self,
        text: Union[str, List[str]],
        input_type: str = 'search_document'
    ) -> np.ndarray:
        """
        Generate embeddings for the given text using Cohere.

        Args:
            text: Single text string or list of texts
            input_type: Cohere input type. Keep 'search_document' for content
                being indexed; pass 'search_query' when embedding a query.

        Returns:
            2-D numpy array with one row per input text. A single string still
            yields a (1, dim) array. An empty list yields an empty array and
            makes no API call.
        """
        if isinstance(text, str):
            text = [text]
        if not text:
            return np.empty((0, 0))

        response = self.client.embed(
            texts=text,
            model=self.model,
            input_type=input_type
        )
        return np.array(response.embeddings)

    def generate_batch(
        self,
        texts: List[str],
        batch_size: int = 96,
        input_type: str = 'search_document'
    ) -> List[np.ndarray]:
        """
        Generate embeddings for a large batch of texts.

        Args:
            texts: List of texts to generate embeddings for
            batch_size: Number of texts sent per API call
            input_type: Cohere input type forwarded to generate()

        Returns:
            List with one embedding vector (numpy array) per input text

        Raises:
            ValueError: If batch_size is not positive
        """
        if batch_size <= 0:
            # A negative step would make range() empty and silently return [].
            raise ValueError("batch_size must be a positive integer")

        all_embeddings = []
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            all_embeddings.extend(self.generate(batch, input_type=input_type))
        return all_embeddings
+158
View File
@@ -0,0 +1,158 @@
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import List, Optional, Dict, Any
class FinetunedModel:
    """Loads a finetuned causal language model and generates text with it."""

    def __init__(self, model_path: str = "../finetuned_model"):
        """
        Initialize the finetuned model.

        Args:
            model_path: Path to the finetuned model directory
        """
        self.model_path = model_path
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        print(f"Loading finetuned model from {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)
        self.model.to(self.device)

        # Set pad token if not set
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def generate(
        self,
        prompt: str,
        max_length: int = 200,
        num_return_sequences: int = 1,
        temperature: float = 0.7,
        top_p: float = 0.9,
        **kwargs
    ) -> List[str]:
        """
        Generate text using the finetuned model.

        Args:
            prompt: The prompt to generate text from
            max_length: Maximum number of newly generated tokens. The prompt is
                not counted, so long context prompts no longer eat the budget.
            num_return_sequences: Number of sequences to generate
            temperature: Sampling temperature (higher = more random)
            top_p: Nucleus sampling parameter
            **kwargs: Additional arguments to pass to the model. An explicit
                max_new_tokens here takes precedence over max_length.

        Returns:
            List of generated text sequences, without the prompt
        """
        formatted_prompt = f"Prompt: {prompt}\nCompletion:"

        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        prompt_token_count = inputs["input_ids"].shape[1]

        # Hugging Face's max_length includes the prompt tokens; max_new_tokens
        # bounds only the completion, which is what this API promises.
        kwargs.setdefault("max_new_tokens", max_length)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                num_return_sequences=num_return_sequences,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                **kwargs
            )

        # Each output row starts with the prompt tokens. Decode only what
        # follows them rather than splitting on "Completion:", which dropped
        # text whenever the model emitted that marker itself.
        return [
            self.tokenizer.decode(
                output[prompt_token_count:], skip_special_tokens=True
            ).strip()
            for output in outputs
        ]

    def generate_with_context(
        self,
        prompt: str,
        content_type: str,
        similar_content: List[str],
        tone: Optional[str] = None,
        target_audience: Optional[str] = None,
        max_length: int = 200,
        num_return_sequences: int = 1,
        temperature: float = 0.7,
        top_p: float = 0.9,
        **kwargs
    ) -> List[str]:
        """
        Generate text with additional context.

        Args:
            prompt: The main prompt for content generation
            content_type: Type of content (email, social media, etc.)
            similar_content: List of similar content for context
            tone: Optional tone specification
            target_audience: Optional target audience specification
            max_length: Maximum number of newly generated tokens
            num_return_sequences: Number of sequences to generate
            temperature: Sampling temperature (higher = more random)
            top_p: Nucleus sampling parameter
            **kwargs: Additional arguments to pass to the model

        Returns:
            List of generated text sequences
        """
        context = self._build_context(
            prompt, content_type, similar_content, tone, target_audience
        )
        return self.generate(
            prompt=context,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            temperature=temperature,
            top_p=top_p,
            **kwargs
        )

    def _build_context(
        self,
        prompt: str,
        content_type: str,
        similar_content: List[str],
        tone: Optional[str],
        target_audience: Optional[str]
    ) -> str:
        """
        Build a context string for the model.

        Args:
            prompt: The main prompt for content generation
            content_type: Type of content (email, social media, etc.)
            similar_content: List of similar content for context
            tone: Optional tone specification (skipped when empty)
            target_audience: Optional target audience (skipped when empty)

        Returns:
            Context string for the model
        """
        lines = [f"Content Type: {content_type}"]
        if tone:
            lines.append(f"Tone: {tone}")
        if target_audience:
            lines.append(f"Target Audience: {target_audience}")
        lines.append("\nSimilar Content Examples:")
        # Use only the top 3 similar content pieces
        lines.extend(f"- {content}" for content in similar_content[:3])
        lines.append(f"\nGenerate marketing copy for: {prompt}")
        return "\n".join(lines) + "\n"
# Module-level FinetunedModel built at import time with the default
# model_path; constructing it loads the tokenizer and model (see __init__).
finetuned_model = FinetunedModel()
+93
View File
@@ -0,0 +1,93 @@
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional, List
import uvicorn
from copywriter import generate_marketing_copy
from vector_store import VectorStore
from embeddings import CohereEmbeddings
from brand_style import BrandStyleChecker
from config import Settings
from finetuned_model import finetuned_model
# FastAPI application object; the route decorators below register on it.
app = FastAPI(title="Marketing Assistant AI")
# Service objects created once at import time and shared by all requests.
# NOTE(review): this builds a second Settings instance rather than reusing
# config.settings — confirm whether that is intended.
settings = Settings()
vector_store = VectorStore()
embeddings = CohereEmbeddings()
brand_checker = BrandStyleChecker()
# Request body for POST /generate-copy.
class CopyRequest(BaseModel):
    prompt: str  # what the copy should be about
    content_type: str  # forwarded to generate_marketing_copy as content_type
    tone: Optional[str] = None
    target_audience: Optional[str] = None
# Response body for POST /generate-copy.
class CopyResponse(BaseModel):
    content: str  # the generated marketing copy
    confidence_score: float  # currently a fixed placeholder set by the endpoint
    brand_alignment_score: float  # result of BrandStyleChecker.check_alignment
# Request body for POST /direct-model; the fields are passed to
# finetuned_model.generate by the endpoint.
class DirectModelRequest(BaseModel):
    prompt: str
    max_length: Optional[int] = 200
    num_return_sequences: Optional[int] = 1
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
# Response body for POST /direct-model.
class DirectModelResponse(BaseModel):
    generated_texts: List[str]  # one entry per generated sequence
@app.post("/generate-copy", response_model=CopyResponse)
async def create_marketing_copy(request: CopyRequest):
    """
    Generate marketing copy for the request and score its brand alignment.

    The prompt is embedded, similar stored content is retrieved as context,
    the copy is generated, and the result is checked against the brand style.
    Any failure is reported as HTTP 500 with the error text.
    """
    # NOTE(review): the calls below are blocking and run inside an async
    # handler; consider a plain `def` endpoint so FastAPI uses its threadpool.
    try:
        # embeddings.generate() wraps a single string in a list and returns a
        # 2-D array, so take the only row and hand the store a flat list.
        prompt_embedding = embeddings.generate(request.prompt)
        query_vector = prompt_embedding[0].tolist()

        # search() returns dicts (id, content, score, ...); the generator
        # expects plain strings, so keep only the stored text.
        matches = vector_store.search(query_vector)
        similar_content = [match['content'] for match in matches]

        content = generate_marketing_copy(
            prompt=request.prompt,
            content_type=request.content_type,
            similar_content=similar_content,
            tone=request.tone,
            target_audience=request.target_audience
        )

        brand_alignment = brand_checker.check_alignment(content)

        return CopyResponse(
            content=content,
            confidence_score=0.85,  # This should be calculated based on model confidence
            brand_alignment_score=brand_alignment
        )
    except HTTPException:
        # Do not re-wrap deliberate HTTP errors as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.post("/direct-model", response_model=DirectModelResponse)
async def direct_model_inference(request: DirectModelRequest):
    """
    Direct inference using the finetuned model without using the vector store or other components.
    This endpoint is useful for testing the model directly.
    """
    try:
        # The optional fields accept null. Drop those so the model's own
        # defaults apply instead of forwarding None.
        options = {
            "max_length": request.max_length,
            "num_return_sequences": request.num_return_sequences,
            "temperature": request.temperature,
            "top_p": request.top_p,
        }
        options = {name: value for name, value in options.items() if value is not None}

        generated_texts = finetuned_model.generate(prompt=request.prompt, **options)
        return DirectModelResponse(generated_texts=generated_texts)
    except HTTPException:
        # Do not re-wrap deliberate HTTP errors as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    # Liveness probe: always answers with a fixed payload and checks nothing.
    payload = {"status": "healthy"}
    return payload
# When executed as a script, serve the app with uvicorn on localhost:8000
# with auto-reload enabled.
if __name__ == "__main__":
    uvicorn.run("main:app", host="localhost", port=8000, reload=True)
+12
View File
@@ -0,0 +1,12 @@
fastapi==0.104.1
uvicorn==0.24.0
cohere==4.37
faiss-cpu==1.7.4
python-dotenv==1.0.0
pydantic==2.4.2
numpy==1.24.3
transformers==4.35.2
torch==2.1.1
python-multipart==0.0.6
PyPDF2==3.0.1
pycryptodome==3.17.1
+97
View File
@@ -0,0 +1,97 @@
import pinecone
from typing import List, Dict, Any, Optional
import uuid
from config import settings
class VectorStore:
    """Pinecone-backed store for content text and its metadata.

    All records live in the "content" namespace of the index named by
    settings.PINECONE_INDEX_NAME.
    """

    def __init__(self):
        """Connect to Pinecone and create the index when it does not exist yet."""
        pinecone.init(
            api_key=settings.PINECONE_API_KEY,
            environment=settings.PINECONE_ENVIRONMENT
        )

        if settings.PINECONE_INDEX_NAME not in pinecone.list_indexes():
            pinecone.create_index(
                name=settings.PINECONE_INDEX_NAME,
                dimension=settings.VECTOR_DIMENSION,
                metric="cosine"
            )

        self.index = pinecone.Index(settings.PINECONE_INDEX_NAME)

    @staticmethod
    def _as_flat_vector(vector) -> List[float]:
        """Convert a list, tuple or numpy array (optionally a single row) to a flat list."""
        if hasattr(vector, "tolist"):
            vector = vector.tolist()
        vector = list(vector)
        # An embedding produced for one text may arrive as [[...]]; unwrap it.
        if len(vector) == 1 and hasattr(vector[0], "__len__"):
            return VectorStore._as_flat_vector(vector[0])
        return vector

    @staticmethod
    def _format_match(match, include_score: bool) -> Dict[str, Any]:
        """Flatten one Pinecone match into a dict of id, content and metadata."""
        metadata = match.metadata or {}
        entry = {'id': match.id, 'content': metadata.get('content', '')}
        if include_score:
            entry['score'] = match.score
        entry.update({k: v for k, v in metadata.items() if k != 'content'})
        return entry

    def add_content(
        self,
        content: str,
        metadata: Optional[Dict[str, Any]] = None,
        embedding: Optional[List[float]] = None
    ) -> str:
        """
        Add content to the vector store with optional metadata.

        Args:
            content: Text to store (kept in the record's metadata)
            metadata: Extra metadata; the caller's dict is not modified
            embedding: Vector for the content. When omitted, an all-zero
                placeholder is stored as before, which cannot be ranked
                meaningfully — pass a real embedding so search can find
                the record.

        Returns:
            The ID of the added content.
        """
        content_id = str(uuid.uuid4())

        # Copy so the caller's dict does not gain a 'content' key.
        record = dict(metadata) if metadata else {}
        record['content'] = content

        if embedding is None:
            vector = [0] * settings.VECTOR_DIMENSION
        else:
            vector = self._as_flat_vector(embedding)

        self.index.upsert(
            vectors=[(content_id, vector, record)],
            namespace="content"
        )
        return content_id

    def search(self, query_vector: List[float], top_k: int = settings.MAX_SEARCH_RESULTS) -> List[Dict[str, Any]]:
        """
        Search for similar content using a query vector.

        Args:
            query_vector: Query embedding; a flat list or a numpy array
                (including a single-row 2-D array) is accepted
            top_k: Maximum number of matches to return

        Returns:
            A list of dicts with 'id', 'content', 'score' and any other
            stored metadata.
        """
        results = self.index.query(
            vector=self._as_flat_vector(query_vector),
            top_k=top_k,
            include_metadata=True,
            namespace="content"
        )
        return [self._format_match(match, include_score=True) for match in results.matches]

    def get_all_content(self, limit: int = 10000) -> List[Dict[str, Any]]:
        """
        Retrieve stored content (up to `limit` records) from the vector store.

        Implemented as a similarity query with an all-zero vector, so the
        order of the results carries no meaning.
        NOTE(review): Pinecone may cap top_k lower when metadata is included —
        confirm against the API limits.
        """
        results = self.index.query(
            vector=[0] * settings.VECTOR_DIMENSION,
            top_k=limit,
            include_metadata=True,
            namespace="content"
        )
        return [self._format_match(match, include_score=False) for match in results.matches]

    def delete_content(self, content_id: str) -> bool:
        """
        Delete content from the vector store by ID.

        Returns True if successful, False otherwise (the error is printed).
        """
        try:
            self.index.delete(ids=[content_id], namespace="content")
            return True
        except Exception as e:
            print(f"Error deleting content: {e}")
            return False