# clean_dataset.py

import json
import re
import unicodedata
from typing import Dict, List, Any

def clean_text(text: str) -> str:
    """
    Clean text by removing control characters, normalizing quotes, and fixing spacing.

    Tabs and line breaks are converted to single spaces (not deleted), curly
    quotes are mapped to their ASCII equivalents, whitespace before
    ``. , ! ?`` is removed, and a single space is enforced between such
    punctuation and a following ASCII capital letter.

    Args:
        text: The text to clean

    Returns:
        Cleaned text with leading/trailing whitespace stripped
    """
    # Turn layout whitespace into spaces *before* stripping control characters:
    # tabs and newlines are themselves category Cc, so deleting them first
    # would glue neighbouring words together ("a\nb" -> "ab").
    text = re.sub(r'[\t\n\r\f\v]+', ' ', text)

    # Drop every remaining code point in a Unicode "Other" category (Cc, Cf,
    # Cs, Co, Cn). Doing this before the spacing fixes means an invisible
    # character sitting between a space and a punctuation mark cannot hide
    # the space from the patterns below.
    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')

    # Normalize typographic quotes to ASCII. Code points are spelled out
    # numerically so the mapping survives any re-encoding of this file.
    text = text.translate({
        0x201C: '"',  # left double quotation mark
        0x201D: '"',  # right double quotation mark
        0x2018: "'",  # left single quotation mark
        0x2019: "'",  # right single quotation mark
    })

    # Fix spacing around punctuation
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    text = re.sub(r'([.,!?])\s*([A-Z])', r'\1 \2', text)

    # Collapse any remaining whitespace runs (including Unicode spaces)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def clean_dataset_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean a single dataset item.

    Every string-valued field (including 'prompt' and 'completion') is passed
    through clean_text exactly once; values of any other type are left
    untouched, so a missing or null prompt no longer raises.

    Args:
        item: The dataset item to clean (modified in place)

    Returns:
        The same dataset item, with its string values cleaned
    """
    # Only existing keys are reassigned, so the dict's size never changes
    # while it is being iterated.
    for key, value in item.items():
        if isinstance(value, str):
            item[key] = clean_text(value)

    return item

def process_dataset(input_file: str, output_file: str) -> None:
    """
    Load a JSON dataset, clean every item, and write the result to a new file.

    Args:
        input_file: Path to input dataset file
        output_file: Path to output cleaned dataset file
    """
    # Load the raw items
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_items = json.load(source)

    # Run each item through the cleaner, preserving order
    cleaned_dataset = list(map(clean_dataset_item, raw_items))

    # Persist the cleaned items as pretty-printed, non-escaped JSON
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(cleaned_dataset, sink, indent=2, ensure_ascii=False)

    item_count = len(cleaned_dataset)
    print(f"Cleaned dataset saved to {output_file}")
    print(f"Processed {item_count} items")

if __name__ == "__main__":
    # Script entry point: clean the stage-1 book content dataset and write
    # the result next to it.
    process_dataset(
        "datasets/stage1_book_content.json",
        "datasets/stage1_book_content_cleaned.json",
    )
feat: Implement Pinecone vector store integration 2025-04-16 23:09:52 +01:00			`import json`
			`import re`
			`import unicodedata`
			`from typing import Dict, List, Any`

			`def clean_text(text: str) -> str:`
			`"""`
			`Clean text by removing special characters, normalizing quotes, and fixing formatting.`

			`Args:`
			`text: The text to clean`

			`Returns:`
			`Cleaned text`
			`"""`
			`# Remove null bytes and control characters`
			`text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)`

			`# Normalize quotes`
			`text = text.replace('"', '"').replace('"', '"')`
			`text = text.replace(''', "'").replace(''', "'")`

			`# Fix spacing around punctuation`
			`text = re.sub(r'\s+([.,!?])', r'\1', text)`
			`text = re.sub(r'([.,!?])\s*([A-Z])', r'\1 \2', text)`

			`# Normalize whitespace`
			`text = re.sub(r'\s+', ' ', text)`

			`# Fix common formatting issues`
			`text = text.replace(' .', '.')`
			`text = text.replace(' ,', ',')`
			`text = text.replace(' !', '!')`
			`text = text.replace(' ?', '?')`

			`# Remove any remaining special characters`
			`text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')`

			`return text.strip()`

			`def clean_dataset_item(item: Dict[str, Any]) -> Dict[str, Any]:`
			`"""`
			`Clean a single dataset item.`

			`Args:`
			`item: The dataset item to clean`

			`Returns:`
			`Cleaned dataset item`
			`"""`
			`# Clean the prompt`
			`if 'prompt' in item:`
			`item['prompt'] = clean_text(item['prompt'])`

			`# Clean the completion`
			`if 'completion' in item:`
			`item['completion'] = clean_text(item['completion'])`

			`# Clean any metadata fields`
			`for key, value in item.items():`
			`if isinstance(value, str):`
			`item[key] = clean_text(value)`

			`return item`

			`def process_dataset(input_file: str, output_file: str) -> None:`
			`"""`
			`Process the dataset file, cleaning all items and writing to a new file.`

			`Args:`
			`input_file: Path to input dataset file`
			`output_file: Path to output cleaned dataset file`
			`"""`
			`# Read the input dataset`
			`with open(input_file, 'r', encoding='utf-8') as f:`
			`dataset = json.load(f)`

			`# Clean each item`
			`cleaned_dataset = [clean_dataset_item(item) for item in dataset]`

			`# Write the cleaned dataset`
			`with open(output_file, 'w', encoding='utf-8') as f:`
			`json.dump(cleaned_dataset, f, indent=2, ensure_ascii=False)`

			`print(f"Cleaned dataset saved to {output_file}")`
			`print(f"Processed {len(cleaned_dataset)} items")`

			`if __name__ == "__main__":`
			`input_file = "datasets/stage1_book_content.json"`
			`output_file = "datasets/stage1_book_content_cleaned.json"`
			`process_dataset(input_file, output_file)`