import json
import re
import unicodedata
from typing import Dict, List, Any

# Control characters that act as whitespace. They are turned into a plain
# space (instead of being deleted) so that words separated only by a newline
# or a tab are not glued together.
_WHITESPACE_CONTROLS_RE = re.compile(r'[\t\n\r\x0b\x0c]+')
# Whitespace directly in front of sentence punctuation.
_SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,!?])')
# Sentence punctuation followed (with or without whitespace) by a capital.
_PUNCT_BEFORE_CAPITAL_RE = re.compile(r'([.,!?])\s*([A-Z])')
# Any run of whitespace.
_WHITESPACE_RUN_RE = re.compile(r'\s+')

# Typographic quotes mapped to their ASCII equivalents. Written as escapes so
# the mapping survives editors/pipelines that rewrite "smart" quotes.
_QUOTE_TABLE = str.maketrans({
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark
})


def clean_text(text: str) -> str:
    """
    Clean text by removing special characters, normalizing quotes, and
    fixing formatting.

    The steps, in order:

    1. Tabs, newlines, carriage returns, vertical tabs and form feeds are
       replaced by a single space.
    2. Every remaining character whose Unicode general category starts with
       ``C`` (control, format, surrogate, private use, unassigned) is removed.
    3. Curly double/single quotes are replaced by ``"`` and ``'``.
    4. Whitespace before ``.``, ``,``, ``!`` or ``?`` is removed, and exactly
       one space is placed between such punctuation and a following ASCII
       capital letter.
    5. Runs of whitespace are collapsed to one space and the result is
       stripped.

    Args:
        text: The text to clean

    Returns:
        Cleaned text
    """
    # Keep word boundaries: whitespace-like controls become a space first.
    text = _WHITESPACE_CONTROLS_RE.sub(' ', text)

    # Drop all "Other" (C*) characters before normalizing spacing, so that an
    # invisible character sitting between a space and punctuation cannot
    # leave stray or doubled spaces behind.
    text = ''.join(
        char for char in text if unicodedata.category(char)[0] != 'C'
    )

    # Normalize quotes
    text = text.translate(_QUOTE_TABLE)

    # Fix spacing around punctuation.
    # NOTE: the second rule is a heuristic; it also inserts a space inside
    # dotted abbreviations written with capitals (e.g. "U.S." -> "U. S.").
    text = _SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)
    text = _PUNCT_BEFORE_CAPITAL_RE.sub(r'\1 \2', text)

    # Normalize whitespace
    text = _WHITESPACE_RUN_RE.sub(' ', text)

    return text.strip()


def clean_dataset_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean a single dataset item.

    Every top-level string value (including 'prompt' and 'completion' when
    present) is passed through clean_text. Non-string values, such as
    numbers, None, lists or nested dicts, are left untouched.

    Args:
        item: The dataset item to clean. It is modified in place.

    Returns:
        The same dict object, with its string values cleaned
    """
    # Only existing keys are reassigned, so the dict does not change size
    # while it is being iterated.
    for key, value in item.items():
        if isinstance(value, str):
            item[key] = clean_text(value)

    return item


def process_dataset(input_file: str, output_file: str) -> None:
    """
    Process the dataset file, cleaning all items and writing to a new file.

    Args:
        input_file: Path to input dataset file
        output_file: Path to output cleaned dataset file

    Raises:
        ValueError: If the top-level JSON value in input_file is not an array
    """
    # Read the input dataset
    with open(input_file, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    # Fail early with a clear message instead of an obscure error further
    # down when the file holds something other than a list of items.
    if not isinstance(dataset, list):
        raise ValueError(
            f"Expected a JSON array of items in {input_file}, "
            f"got {type(dataset).__name__}"
        )

    # Clean each item
    cleaned_dataset = [clean_dataset_item(item) for item in dataset]

    # Write the cleaned dataset; ensure_ascii=False keeps non-ASCII text
    # readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_dataset, f, indent=2, ensure_ascii=False)

    print(f"Cleaned dataset saved to {output_file}")
    print(f"Processed {len(cleaned_dataset)} items")


if __name__ == "__main__":
    input_file = "datasets/stage1_book_content.json"
    output_file = "datasets/stage1_book_content_cleaned.json"

    process_dataset(input_file, output_file)