feat: Implement Pinecone vector store integration

- Update config.py with Pinecone settings and model configurations
- Implement VectorStore class with Pinecone backend
- Add comprehensive vector operations (add, search, delete)
- Set up proper error handling and metadata management
- Add .gitignore for Python project
2025-04-16 23:09:52 +01:00
commit 859c17aad8
27 changed files with 2820 additions and 0 deletions
@@ -0,0 +1,91 @@
+import json
+import re
+import unicodedata
+from typing import Dict, List, Any
+
+def clean_text(text: str) -> str:
+    """
+    Clean text by dropping control characters, normalizing quotes, and fixing spacing.
+
+    Whitespace runs (tabs and newlines included) are collapsed to a single
+    space before control characters are stripped, so a line break still
+    separates two words instead of gluing them together.
+
+    Args:
+        text: The text to clean
+
+    Returns:
+        Cleaned text with leading and trailing whitespace removed
+    """
+    # Collapse whitespace first: tab, newline and carriage return are also
+    # control characters and would otherwise be deleted outright below.
+    text = re.sub(r'\s+', ' ', text)
+
+    # Drop every remaining character in a Unicode "Other" category
+    # (control, format, surrogate, private use, unassigned).
+    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')
+
+    # Normalize curly quotes to ASCII. Code points are spelled out numerically
+    # so the mapping survives tools that rewrite typographic quotes.
+    text = text.translate({
+        0x201C: '"',  # left double quotation mark
+        0x201D: '"',  # right double quotation mark
+        0x2018: "'",  # left single quotation mark
+        0x2019: "'",  # right single quotation mark
+    })
+
+    # Fix spacing around punctuation: no whitespace before . , ! ? and exactly
+    # one space between such a mark and a following capital letter.
+    text = re.sub(r'\s+([.,!?])', r'\1', text)
+    text = re.sub(r'([.,!?])\s*([A-Z])', r'\1 \2', text)
+
+    # Stripping characters above can leave two spaces side by side.
+    text = re.sub(r'\s+', ' ', text)
+
+    return text.strip()
+
+def clean_dataset_item(item: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Clean every string field of a single dataset item.
+
+    The item is modified in place and the same dict is returned. Values that
+    are not strings (None, numbers, nested containers) are left untouched, so
+    a missing or null 'prompt' / 'completion' no longer raises.
+
+    Args:
+        item: The dataset item to clean
+
+    Returns:
+        The same dataset item, with all top-level string values cleaned
+    """
+    # One pass covers 'prompt', 'completion' and any metadata field alike.
+    # Only values are reassigned, never keys, so mutating while iterating
+    # over items() is safe.
+    for key, value in item.items():
+        if isinstance(value, str):
+            item[key] = clean_text(value)
+
+    return item
+
+def process_dataset(input_file: str, output_file: str) -> None:
+    """
+    Process the dataset file, cleaning all items and writing to a new file.
+
+    Args:
+        input_file: Path to input dataset file (UTF-8 JSON holding an array of items)
+        output_file: Path to output cleaned dataset file
+
+    Raises:
+        ValueError: If the top-level JSON value in input_file is not an array
+    """
+    # Read the input dataset
+    with open(input_file, 'r', encoding='utf-8') as f:
+        dataset = json.load(f)
+
+    # A JSON object would be iterated by key and fail later with an unrelated
+    # error; reject anything that is not a list up front.
+    if not isinstance(dataset, list):
+        raise ValueError(
+            f"Expected a JSON array of items in {input_file}, "
+            f"got {type(dataset).__name__}"
+        )
+
+    # Clean each item
+    cleaned_dataset = [clean_dataset_item(item) for item in dataset]
+
+    # Write the cleaned dataset; ensure_ascii=False keeps non-ASCII text readable
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(cleaned_dataset, f, indent=2, ensure_ascii=False)
+
+    print(f"Cleaned dataset saved to {output_file}")
+    print(f"Processed {len(cleaned_dataset)} items")
+
+if __name__ == "__main__":
+    # Script entry point: clean the stage-1 book content dataset and write
+    # the result next to it.
+    input_file, output_file = (
+        "datasets/stage1_book_content.json",
+        "datasets/stage1_book_content_cleaned.json",
+    )
+    process_dataset(input_file=input_file, output_file=output_file)