feat: Implement Pinecone vector store integration

- Update config.py with Pinecone settings and model configurations
- Implement VectorStore class with Pinecone backend
- Add comprehensive vector operations (add, search, delete)
- Set up proper error handling and metadata management
- Add .gitignore for Python project
2025-04-16 23:09:52 +01:00
commit 859c17aad8
27 changed files with 2820 additions and 0 deletions
@@ -0,0 +1,113 @@
+from typing import Dict, List
+import json
+import os
+from transformers import pipeline
+import numpy as np
+
+class BrandStyleChecker:
+    """Score how well a piece of content matches the brand's style.
+
+    The score blends three signals: classifier sentiment, coverage of the
+    configured tone keywords, and a small rule-based tone check.
+    """
+
+    def __init__(self):
+        self.style_guidelines = self._load_style_guidelines()
+        self.classifier = pipeline(
+            "text-classification",
+            model="distilbert-base-uncased-finetuned-sst-2-english",
+            device=-1
+        )
+        self.tone_keywords = self._load_tone_keywords()
+
+    def _load_style_guidelines(self) -> Dict:
+        """Load brand style guidelines from file, or return built-in defaults."""
+        guidelines_path = 'data/style_guidelines/brand_guidelines.json'
+        if os.path.exists(guidelines_path):
+            with open(guidelines_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        return {
+            "tone": "professional yet approachable",
+            "voice": "confident and authoritative",
+            "key_phrases": [],
+            "avoided_phrases": [],
+            "brand_values": []
+        }
+
+    def _load_tone_keywords(self) -> Dict[str, List[str]]:
+        """Load tone keywords from file, or return built-in defaults."""
+        keywords_path = 'data/style_guidelines/tone_keywords.json'
+        if os.path.exists(keywords_path):
+            with open(keywords_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        return {
+            "professional": ["expert", "professional", "industry", "experience"],
+            "approachable": ["friendly", "helpful", "understand", "support"],
+            "confident": ["guaranteed", "proven", "success", "expertise"],
+            "authoritative": ["leading", "best", "premier", "trusted"]
+        }
+
+    @staticmethod
+    def _mentions(text_lower: str, term: str, allow_suffix: bool = False) -> bool:
+        """Return True when ``term`` occurs in ``text_lower`` at a word boundary.
+
+        Args:
+            text_lower: Text to search, already lower-cased.
+            term: Word or phrase to look for (compared case-insensitively).
+            allow_suffix: When True the term only has to *start* a word, so
+                "expert" also matches "experts"; when False the whole word
+                must match, so "us" does not match "business".
+        """
+        import re  # local import: the module's import block does not provide it
+
+        pattern = r"(?<!\w)" + re.escape(term.lower())
+        if not allow_suffix:
+            pattern += r"(?!\w)"
+        return re.search(pattern, text_lower) is not None
+
+    def check_alignment(self, content: str) -> float:
+        """
+        Check how well the content aligns with the brand style.
+
+        Args:
+            content: The content to check
+
+        Returns:
+            Alignment score between 0 and 1
+        """
+        scores = [
+            self._check_sentiment(content),
+            self._check_keywords(content),
+            self._check_tone_consistency(content),
+        ]
+
+        # Weighted average; the weights line up with the order of `scores`.
+        weights = [0.3, 0.4, 0.3]
+        return float(np.average(scores, weights=weights))
+
+    def _check_sentiment(self, content: str) -> float:
+        """Return the classifier's probability that the content is positive."""
+        # truncation=True keeps inputs longer than the model's maximum
+        # sequence length from raising inside the pipeline.
+        result = self.classifier(content, truncation=True)[0]
+        # The pipeline reports the score of the winning label, so a NEGATIVE
+        # result is flipped to express "how positive" on the same 0..1 scale.
+        return result['score'] if result['label'] == 'POSITIVE' else 1 - result['score']
+
+    def _check_keywords(self, content: str) -> float:
+        """Return the fraction of configured tone keywords found in the content.
+
+        Keywords match case-insensitively at the start of a word, so "expert"
+        counts for "experts" but "best" does not count for "asbestos".
+        """
+        content_lower = content.lower()
+        keywords = [kw for group in self.tone_keywords.values() for kw in group]
+        if not keywords:
+            return 0.0
+        found = sum(
+            1 for kw in keywords
+            if self._mentions(content_lower, kw, allow_suffix=True)
+        )
+        return found / len(keywords)
+
+    def _check_tone_consistency(self, content: str) -> float:
+        """Return the fraction of rule-based tone checks the content passes.
+
+        This is a simplified heuristic; a fine-tuned model would be more
+        accurate. Pronouns must match as whole words (otherwise "business"
+        would satisfy "us" and "your" would satisfy "our"), while the other
+        cue words may carry a suffix ("guarantee" matches "guaranteed").
+        """
+        content_lower = content.lower()
+        # One (whole_words, word_stems) pair per tone: professional,
+        # approachable, confident.
+        checks = [
+            (("we", "our", "us"), ()),
+            (("you", "your"), ("help", "support")),
+            ((), ("guarantee", "proven", "expert")),
+        ]
+        passed = sum(
+            1 for whole_words, stems in checks
+            if any(self._mentions(content_lower, w) for w in whole_words)
+            or any(self._mentions(content_lower, s, allow_suffix=True) for s in stems)
+        )
+        return passed / len(checks)
@@ -0,0 +1,47 @@
+from pydantic_settings import BaseSettings
+from typing import Optional
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Central application configuration. Values come from the defaults below and
+# can be overridden through environment variables or the ".env" file named in
+# the inner Config class (names are matched case-sensitively).
+class Settings(BaseSettings):
+    # API Keys
+    COHERE_API_KEY: str = os.getenv('COHERE_API_KEY', '')
+    PINECONE_API_KEY: str = os.getenv('PINECONE_API_KEY', '')
+    
+    # Model Settings
+    MODEL_NAME: str = "facebook/opt-350m"  # Using the finetuned model
+    EMBEDDING_MODEL: str = "embed-english-v3.0"
+    
+    # Vector Store Settings
+    # Must equal the embedding width of EMBEDDING_MODEL: Cohere's
+    # embed-english-v3.0 returns 1024-dimensional vectors, and the Pinecone
+    # index is created with this dimension.
+    VECTOR_DIMENSION: int = 1024
+    MAX_SEARCH_RESULTS: int = 5
+    
+    # Pinecone Settings
+    PINECONE_ENVIRONMENT: str = os.getenv('PINECONE_ENVIRONMENT', 'us-west1-gcp')
+    PINECONE_INDEX_NAME: str = os.getenv('PINECONE_INDEX_NAME', 'marketing-assistant')
+    
+    # Content Generation Settings
+    MAX_CONTENT_LENGTH: int = 500
+    TEMPERATURE: float = 0.7
+    TOP_P: float = 0.9
+    
+    # Brand Style Settings
+    BRAND_GUIDELINES_PATH: str = "data/style_guidelines/brand_guidelines.json"
+    TONE_KEYWORDS_PATH: str = "data/style_guidelines/tone_keywords.json"
+    
+    # Storage Settings
+    VECTOR_STORE_PATH: str = "data/vector_store"
+    PAST_CAMPAIGNS_PATH: str = "data/past_campaigns"
+    USER_QUERIES_PATH: str = "data/user_queries"
+    
+    # Finetuned Model Settings
+    FINETUNED_MODEL_PATH: str = "../finetuned_model"
+    
+    class Config:
+        env_file = ".env"
+        case_sensitive = True
+
+# Create global settings instance.
+# Instantiated once at import time; other modules can share it via
+# `from config import settings`.
+settings = Settings() 
@@ -0,0 +1,98 @@
+from transformers import pipeline
+from typing import List, Optional
+import torch
+from finetuned_model import finetuned_model
+
+class MarketingCopywriter:
+    """Generate marketing copy by delegating to the shared finetuned model."""
+
+    def __init__(self):
+        # Use the finetuned model instead of the default GPT-2
+        self.model = finetuned_model
+
+    def generate(
+        self,
+        prompt: str,
+        content_type: str,
+        similar_content: List[str],
+        tone: Optional[str] = None,
+        target_audience: Optional[str] = None,
+    ) -> str:
+        """
+        Generate one piece of marketing copy.
+
+        Args:
+            prompt: The main prompt for content generation
+            content_type: Type of content (email, social media, etc.)
+            similar_content: Example texts to include as context
+            tone: Optional tone specification
+            target_audience: Optional target audience specification
+
+        Returns:
+            The first generated text after post-processing, or an empty
+            string when the model returns nothing.
+        """
+        # target_audience is accepted and forwarded so the module-level
+        # generate_marketing_copy() wrapper, which always passes it, no longer
+        # fails with a TypeError.
+        generated_texts = self.model.generate_with_context(
+            prompt=prompt,
+            content_type=content_type,
+            similar_content=similar_content,
+            tone=tone,
+            target_audience=target_audience,
+            max_length=500,
+            num_return_sequences=1,
+            temperature=0.7,
+            top_p=0.9
+        )
+
+        if not generated_texts:
+            return ""
+        return self._post_process(generated_texts[0])
+
+    def _build_context(
+        self,
+        prompt: str,
+        content_type: str,
+        similar_content: List[str],
+        tone: Optional[str],
+        target_audience: Optional[str]
+    ) -> str:
+        """Assemble the textual context block that precedes the request."""
+        lines = [f"Content Type: {content_type}"]
+        if tone:
+            lines.append(f"Tone: {tone}")
+        if target_audience:
+            lines.append(f"Target Audience: {target_audience}")
+
+        lines.append("\nSimilar Content Examples:")
+        # Only the first three examples are used.
+        lines.extend(f"- {content}" for content in similar_content[:3])
+
+        lines.append(f"\nGenerate marketing copy for: {prompt}")
+        return "\n".join(lines) + "\n"
+
+    def _post_process(self, text: str) -> str:
+        """Clean up generated text (currently: trim surrounding whitespace)."""
+        return text.strip()
+
+# Initialize the copywriter.
+# Module-level instance used by generate_marketing_copy() below; it wraps the
+# finetuned_model object imported at the top of this module.
+copywriter = MarketingCopywriter()
+
+def generate_marketing_copy(
+    prompt: str,
+    content_type: str,
+    similar_content: List[str],
+    tone: Optional[str] = None,
+    target_audience: Optional[str] = None
+) -> str:
+    """
+    Produce marketing copy through the module-level copywriter instance.
+
+    Args:
+        prompt: What the copy should be about
+        content_type: Kind of content requested (email, social media, etc.)
+        similar_content: Example texts supplied as context
+        tone: Desired tone, if any
+        target_audience: Intended audience, if any
+
+    Returns:
+        The text returned by the copywriter's generate() method
+    """
+    # Collect the arguments once and forward them by keyword.
+    generation_request = {
+        "prompt": prompt,
+        "content_type": content_type,
+        "similar_content": similar_content,
+        "tone": tone,
+        "target_audience": target_audience,
+    }
+    return copywriter.generate(**generation_request)
+
+
+if __name__ == "__main__":
+    # Manual smoke test. Guarded so that importing this module (for example
+    # from the API server) does not trigger a model generation as a side
+    # effect; the result is printed instead of being discarded.
+    print(
+        generate_marketing_copy(
+            prompt="Help me write a blog post about the benefits of using our product",
+            content_type="blog post",
+            similar_content=[],
+            tone="",
+            target_audience=""
+        )
+    )
@@ -0,0 +1,55 @@
+import cohere
+import numpy as np
+from typing import List, Union
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
+
+class CohereEmbeddings:
+    """Generate text embeddings through the Cohere API."""
+
+    def __init__(self):
+        self.api_key = os.getenv('COHERE_API_KEY')
+        if not self.api_key:
+            raise ValueError("COHERE_API_KEY environment variable is not set")
+        self.client = cohere.Client(self.api_key)
+
+    def generate(
+        self,
+        text: Union[str, List[str]],
+        input_type: str = 'search_document'
+    ) -> np.ndarray:
+        """
+        Generate embeddings for the given text using Cohere.
+
+        Args:
+            text: Single text string or list of texts
+            input_type: Value passed to Cohere as ``input_type``. Keep the
+                default for texts being stored; pass 'search_query' when
+                embedding a query to search against stored documents.
+
+        Returns:
+            numpy array of embeddings, one row per input text (a single
+            string therefore yields one row)
+        """
+        if isinstance(text, str):
+            text = [text]
+
+        response = self.client.embed(
+            texts=text,
+            model='embed-english-v3.0',
+            input_type=input_type
+        )
+
+        return np.array(response.embeddings)
+
+    def generate_batch(
+        self,
+        texts: List[str],
+        batch_size: int = 96,
+        input_type: str = 'search_document'
+    ) -> List[np.ndarray]:
+        """
+        Generate embeddings for a large batch of texts.
+
+        Args:
+            texts: List of texts to generate embeddings for
+            batch_size: Number of texts sent per API request
+            input_type: Forwarded to generate() for every request
+
+        Returns:
+            List with one embedding array per input text, in input order
+
+        Raises:
+            ValueError: If batch_size is not a positive integer
+        """
+        # A negative step would make range() empty and silently return no
+        # embeddings, so reject bad sizes up front.
+        if batch_size <= 0:
+            raise ValueError("batch_size must be a positive integer")
+
+        all_embeddings = []
+        for start in range(0, len(texts), batch_size):
+            batch = texts[start:start + batch_size]
+            # extend() adds the rows of the 2-D result one by one.
+            all_embeddings.extend(self.generate(batch, input_type=input_type))
+
+        return all_embeddings
@@ -0,0 +1,158 @@
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from typing import List, Optional, Dict, Any
+
+class FinetunedModel:
+    """Wrapper around a locally stored causal language model and its tokenizer."""
+
+    def __init__(self, model_path: str = "../finetuned_model"):
+        """
+        Initialize the finetuned model.
+
+        Args:
+            model_path: Path to the finetuned model directory
+        """
+        self.model_path = model_path
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        print(f"Loading finetuned model from {model_path}")
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.model = AutoModelForCausalLM.from_pretrained(model_path)
+        self.model.to(self.device)
+
+        # Set pad token if not set
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def generate(
+        self,
+        prompt: str,
+        max_length: int = 200,
+        num_return_sequences: int = 1,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        **kwargs
+    ) -> List[str]:
+        """
+        Generate text using the finetuned model.
+
+        Args:
+            prompt: The prompt to generate text from
+            max_length: Maximum number of tokens to generate. The prompt is
+                not counted, so a long prompt no longer uses up the budget.
+            num_return_sequences: Number of sequences to generate
+            temperature: Sampling temperature (higher = more random)
+            top_p: Nucleus sampling parameter
+            **kwargs: Additional arguments to pass to the model. An explicit
+                ``max_new_tokens`` here takes precedence over ``max_length``.
+
+        Returns:
+            List of generated completions with the prompt removed
+        """
+        # Format the prompt
+        formatted_prompt = f"Prompt: {prompt}\nCompletion:"
+
+        # Tokenize the prompt and move the tensors to the model's device
+        inputs = self.tokenizer(formatted_prompt, return_tensors="pt")
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        prompt_token_count = inputs["input_ids"].shape[-1]
+
+        # Express the budget as new tokens: a total-length limit would be
+        # consumed by the prompt itself when context makes the prompt long.
+        kwargs.setdefault("max_new_tokens", max_length)
+
+        # Generate text
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                num_return_sequences=num_return_sequences,
+                temperature=temperature,
+                top_p=top_p,
+                do_sample=True,
+                pad_token_id=self.tokenizer.eos_token_id,
+                **kwargs
+            )
+
+        # Each output row starts with the prompt tokens. Slicing them off is
+        # robust even when the prompt or the completion contains the literal
+        # text "Completion:", where splitting the decoded string is not.
+        return [
+            self.tokenizer.decode(
+                output[prompt_token_count:], skip_special_tokens=True
+            ).strip()
+            for output in outputs
+        ]
+
+    def generate_with_context(
+        self,
+        prompt: str,
+        content_type: str,
+        similar_content: List[str],
+        tone: Optional[str] = None,
+        target_audience: Optional[str] = None,
+        max_length: int = 200,
+        num_return_sequences: int = 1,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        **kwargs
+    ) -> List[str]:
+        """
+        Generate text with additional context.
+
+        Args:
+            prompt: The main prompt for content generation
+            content_type: Type of content (email, social media, etc.)
+            similar_content: List of similar content for context
+            tone: Optional tone specification
+            target_audience: Optional target audience specification
+            max_length: Maximum number of tokens to generate
+            num_return_sequences: Number of sequences to generate
+            temperature: Sampling temperature (higher = more random)
+            top_p: Nucleus sampling parameter
+            **kwargs: Additional arguments to pass to the model
+
+        Returns:
+            List of generated text sequences
+        """
+        # Build the context, then reuse the plain generation path
+        context = self._build_context(prompt, content_type, similar_content, tone, target_audience)
+
+        return self.generate(
+            prompt=context,
+            max_length=max_length,
+            num_return_sequences=num_return_sequences,
+            temperature=temperature,
+            top_p=top_p,
+            **kwargs
+        )
+
+    def _build_context(
+        self,
+        prompt: str,
+        content_type: str,
+        similar_content: List[str],
+        tone: Optional[str],
+        target_audience: Optional[str]
+    ) -> str:
+        """
+        Build a context string for the model.
+
+        Args:
+            prompt: The main prompt for content generation
+            content_type: Type of content (email, social media, etc.)
+            similar_content: List of similar content for context
+            tone: Optional tone specification (omitted when empty)
+            target_audience: Optional target audience specification
+                (omitted when empty)
+
+        Returns:
+            Context string for the model
+        """
+        lines = [f"Content Type: {content_type}"]
+        if tone:
+            lines.append(f"Tone: {tone}")
+        if target_audience:
+            lines.append(f"Target Audience: {target_audience}")
+
+        lines.append("\nSimilar Content Examples:")
+        # Use top 3 similar content pieces
+        lines.extend(f"- {content}" for content in similar_content[:3])
+
+        lines.append(f"\nGenerate marketing copy for: {prompt}")
+        return "\n".join(lines) + "\n"
+
+# Initialize the model.
+# NOTE: this runs at import time, so importing this module loads the tokenizer
+# and model from the default path ("../finetuned_model") and prints a loading
+# message.
+finetuned_model = FinetunedModel() 
@@ -0,0 +1,93 @@
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from typing import Optional, List
+import uvicorn
+from copywriter import generate_marketing_copy
+from vector_store import VectorStore
+from embeddings import CohereEmbeddings
+from brand_style import BrandStyleChecker
+from config import Settings
+from finetuned_model import finetuned_model
+
+# Application object and the shared service instances used by the endpoints
+# below. All of them are constructed once, when this module is imported.
+app = FastAPI(title="Marketing Assistant AI")
+# NOTE(review): the config module also appears to expose a ready-made
+# `settings` instance; this creates a separate one — confirm which is intended.
+settings = Settings()
+vector_store = VectorStore()
+embeddings = CohereEmbeddings()
+brand_checker = BrandStyleChecker()
+
+# Request body for POST /generate-copy.
+class CopyRequest(BaseModel):
+    prompt: str  # what the copy should be about
+    content_type: str  # kind of content requested; passed through to the generator
+    tone: Optional[str] = None  # optional; forwarded unchanged
+    target_audience: Optional[str] = None  # optional; forwarded unchanged
+
+# Response body for POST /generate-copy.
+class CopyResponse(BaseModel):
+    content: str  # the generated marketing copy
+    confidence_score: float  # currently filled with a fixed value by the endpoint
+    brand_alignment_score: float  # result of BrandStyleChecker.check_alignment
+
+# Request body for POST /direct-model; the fields are passed as keyword
+# arguments of the same names to finetuned_model.generate().
+class DirectModelRequest(BaseModel):
+    prompt: str
+    # NOTE(review): the Optional[...] annotations below also admit an explicit
+    # null, which would be forwarded to the model as None — confirm intended.
+    max_length: Optional[int] = 200
+    num_return_sequences: Optional[int] = 1
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 0.9
+
+# Response body for POST /direct-model.
+class DirectModelResponse(BaseModel):
+    generated_texts: List[str]  # one entry per generated sequence
+
+# Endpoint flow: embed the prompt, look up similar stored content, generate
+# copy with that content as context, then score the result for brand alignment.
+# Any failure is reported to the client as HTTP 500 with the error text.
+@app.post("/generate-copy", response_model=CopyResponse)
+async def create_marketing_copy(request: CopyRequest):
+    try:
+        # Generate embeddings for the prompt
+        prompt_embedding = embeddings.generate(request.prompt)
+
+        # generate() returns a 2-D array with one row per input text, while
+        # the vector store expects a flat list of floats for a single query.
+        query_vector = prompt_embedding[0].tolist()
+
+        # Retrieve similar content from vector store
+        matches = vector_store.search(query_vector)
+
+        # search() returns dictionaries (id, content, score, metadata); the
+        # generator expects only the text of each example.
+        similar_content = [match['content'] for match in matches]
+
+        # Generate marketing copy
+        content = generate_marketing_copy(
+            prompt=request.prompt,
+            content_type=request.content_type,
+            similar_content=similar_content,
+            tone=request.tone,
+            target_audience=request.target_audience
+        )
+
+        # Check brand alignment
+        brand_alignment = brand_checker.check_alignment(content)
+
+        return CopyResponse(
+            content=content,
+            confidence_score=0.85,  # This should be calculated based on model confidence
+            brand_alignment_score=brand_alignment
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+@app.post("/direct-model", response_model=DirectModelResponse)
+async def direct_model_inference(request: DirectModelRequest):
+    """
+    Direct inference using the finetuned model without using the vector store or other components.
+    This endpoint is useful for testing the model directly.
+    """
+    try:
+        # Map the request fields onto the model's keyword arguments.
+        generation_args = {
+            "prompt": request.prompt,
+            "max_length": request.max_length,
+            "num_return_sequences": request.num_return_sequences,
+            "temperature": request.temperature,
+            "top_p": request.top_p,
+        }
+        return DirectModelResponse(
+            generated_texts=finetuned_model.generate(**generation_args)
+        )
+    except Exception as exc:
+        # Surface any failure to the client as a 500 carrying the error text.
+        raise HTTPException(status_code=500, detail=str(exc))
+
+@app.get("/health")
+async def health_check():
+    # Liveness probe: always reports a fixed "healthy" status.
+    return dict(status="healthy")
+
+# Development entry point: serve the app on localhost:8000 with auto-reload
+# when this file is executed directly.
+if __name__ == "__main__":
+    uvicorn.run("main:app", host="localhost", port=8000, reload=True) 
@@ -0,0 +1,12 @@
+fastapi==0.104.1
+uvicorn==0.24.0
+cohere==4.37
+faiss-cpu==1.7.4
+# The vector store module imports `pinecone` and calls pinecone.init(), the
+# 2.x client API.
+pinecone-client==2.2.4
+python-dotenv==1.0.0
+pydantic==2.4.2
+# The config module imports BaseSettings from pydantic_settings, which is a
+# separate package under pydantic 2.
+pydantic-settings==2.1.0
+numpy==1.24.3
+transformers==4.35.2
+torch==2.1.1
+python-multipart==0.0.6
+PyPDF2==3.0.1
+pycryptodome==3.17.1
@@ -0,0 +1,97 @@
+import pinecone
+from typing import List, Dict, Any, Optional
+import uuid
+from config import settings
+
+class VectorStore:
+    """Store and retrieve content in a Pinecone index.
+
+    All records live in the "content" namespace; the content text itself is
+    kept in each record's metadata under the "content" key.
+    """
+
+    _NAMESPACE = "content"
+
+    def __init__(self):
+        # Initialize Pinecone
+        pinecone.init(
+            api_key=settings.PINECONE_API_KEY,
+            environment=settings.PINECONE_ENVIRONMENT
+        )
+
+        # Create the index on first use, then connect to it
+        if settings.PINECONE_INDEX_NAME not in pinecone.list_indexes():
+            pinecone.create_index(
+                name=settings.PINECONE_INDEX_NAME,
+                dimension=settings.VECTOR_DIMENSION,
+                metric="cosine"
+            )
+
+        self.index = pinecone.Index(settings.PINECONE_INDEX_NAME)
+
+    @staticmethod
+    def _as_float_list(vector) -> List[float]:
+        """Convert a vector given as a list or array into a flat Python list.
+
+        Array-like inputs exposing ``tolist()`` are converted first; a single
+        nested row (shape ``(1, dim)``) is unwrapped.
+        """
+        values = vector.tolist() if hasattr(vector, "tolist") else list(vector)
+        if len(values) == 1 and isinstance(values[0], (list, tuple)):
+            values = list(values[0])
+        return values
+
+    @staticmethod
+    def _format_match(match, include_score: bool) -> Dict[str, Any]:
+        """Flatten one query match into a dictionary.
+
+        Key order is id, content, (score), then the remaining metadata. A
+        record without a "content" entry yields an empty string instead of
+        raising KeyError.
+        """
+        metadata = dict(match.metadata or {})
+        record = {'id': match.id, 'content': metadata.pop('content', '')}
+        if include_score:
+            record['score'] = match.score
+        record.update(metadata)
+        return record
+
+    def add_content(
+        self,
+        content: str,
+        metadata: Optional[Dict[str, Any]] = None,
+        vector: Optional[List[float]] = None
+    ) -> str:
+        """
+        Add content to the vector store with optional metadata.
+
+        Args:
+            content: Text to store
+            metadata: Extra metadata to store alongside; not modified
+            vector: Embedding of ``content``. Supply it so the record can be
+                ranked by search(). When omitted, the previous behaviour is
+                kept and an all-zero placeholder is sent, which carries no
+                similarity information.
+
+        Returns:
+            The ID of the added content.
+        """
+        content_id = str(uuid.uuid4())
+
+        # Copy so the caller's dictionary is not mutated.
+        stored_metadata = dict(metadata) if metadata else {}
+        stored_metadata['content'] = content
+
+        if vector is None:
+            values = [0] * settings.VECTOR_DIMENSION
+        else:
+            values = self._as_float_list(vector)
+
+        # Upsert the vector with metadata
+        self.index.upsert(
+            vectors=[(content_id, values, stored_metadata)],
+            namespace=self._NAMESPACE
+        )
+
+        return content_id
+
+    def search(self, query_vector: List[float], top_k: int = settings.MAX_SEARCH_RESULTS) -> List[Dict[str, Any]]:
+        """
+        Search for similar content using a query vector.
+
+        Args:
+            query_vector: Query embedding, as a list or array of floats
+            top_k: Maximum number of matches to return
+
+        Returns:
+            A list of dictionaries containing id, content, score and metadata.
+        """
+        results = self.index.query(
+            vector=self._as_float_list(query_vector),
+            top_k=top_k,
+            include_metadata=True,
+            namespace=self._NAMESPACE
+        )
+
+        return [self._format_match(match, include_score=True) for match in results.matches]
+
+    def get_all_content(self) -> List[Dict[str, Any]]:
+        """
+        Retrieve all content from the vector store.
+
+        Implemented as a similarity query with an all-zero vector and a large
+        top_k, so at most 10000 records are returned.
+        NOTE(review): confirm the index accepts an all-zero query vector and
+        this top_k together with include_metadata.
+        """
+        results = self.index.query(
+            vector=[0] * settings.VECTOR_DIMENSION,
+            top_k=10000,  # Adjust based on your needs
+            include_metadata=True,
+            namespace=self._NAMESPACE
+        )
+
+        return [self._format_match(match, include_score=False) for match in results.matches]
+
+    def delete_content(self, content_id: str) -> bool:
+        """
+        Delete content from the vector store by ID.
+
+        Returns:
+            True if the delete call succeeded, False if it raised.
+        """
+        # Best-effort by design: failures are reported via the return value.
+        try:
+            self.index.delete(ids=[content_id], namespace=self._NAMESPACE)
+            return True
+        except Exception as e:
+            print(f"Error deleting content: {e}")
+            return False