91 lines
2.6 KiB
Python
91 lines
2.6 KiB
Python
|
|
import json
|
||
|
|
import re
|
||
|
|
import unicodedata
|
||
|
|
from typing import Dict, List, Any
|
||
|
|
|
||
|
|
def clean_text(text: str) -> str:
    """
    Clean text by removing special characters, normalizing quotes, and fixing formatting.

    The steps run in this order:

    1. Tabs, newlines and other whitespace control characters become plain
       spaces, so words on either side of a line break stay separated.
    2. Every remaining character whose Unicode general category starts with
       ``C`` (control, format, surrogate, private-use, unassigned) is dropped.
    3. Typographic quotes are replaced by their ASCII equivalents.
    4. Whitespace before ``. , ! ?`` is removed, and exactly one space is
       placed between such a mark and a following ASCII capital letter.
    5. Runs of whitespace collapse to a single space and the ends are trimmed.

    Args:
        text: The text to clean

    Returns:
        Cleaned text
    """
    # Whitespace control characters must be converted *before* the control
    # character filter below; deleting them outright would glue the
    # neighbouring words together ("hello\nworld" -> "helloworld").
    text = re.sub(r'[\t\n\r\f\v]', ' ', text)

    # Drop control/format characters up front so that removing an invisible
    # character sitting between two spaces cannot leave a double space behind
    # after whitespace has been collapsed.
    text = ''.join(char for char in text if unicodedata.category(char)[0] != 'C')

    # Normalize quotes. Escapes are used instead of literal curly quotes so
    # the mapping survives editors and tools that rewrite them to ASCII.
    text = text.replace('\u201c', '"').replace('\u201d', '"')
    text = text.replace('\u2018', "'").replace('\u2019', "'")

    # Fix spacing around punctuation.
    text = re.sub(r'\s+([.,!?])', r'\1', text)
    # NOTE: this also splits dotted capitals ("U.S.A" -> "U. S. A").
    text = re.sub(r'([.,!?])\s*([A-Z])', r'\1 \2', text)

    # Normalize whitespace.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
|
||
|
|
|
||
|
|
def clean_dataset_item(item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Clean a single dataset item.

    Every top-level string value (including ``prompt`` and ``completion``
    when present) is passed through ``clean_text`` exactly once. Values of
    any other type are left untouched; nested containers are not traversed.

    Args:
        item: The dataset item to clean. It is modified in place.

    Returns:
        The same dict object, with its string values cleaned
    """
    # A single pass covers 'prompt', 'completion' and any metadata fields
    # alike; cleaning the two main fields separately first would only run
    # them through clean_text twice.
    # Only existing keys are reassigned, so the dict never changes size
    # while it is being iterated.
    for key, value in item.items():
        if isinstance(value, str):
            item[key] = clean_text(value)

    return item
|
||
|
|
|
||
|
|
def process_dataset(input_file: str, output_file: str) -> None:
    """
    Load a JSON dataset, clean each item, and save the result to a new file.

    The input is parsed as UTF-8 JSON and iterated item by item (presumably a
    top-level list of dicts; verify against the data). The cleaned items are
    written as a JSON list with two-space indentation and non-ASCII
    characters kept as-is, then a short summary is printed.

    Args:
        input_file: Path to input dataset file
        output_file: Path to output cleaned dataset file
    """
    # Load the raw items.
    with open(input_file, 'r', encoding='utf-8') as source:
        dataset = json.load(source)

    # Run every item through the cleaner before the output file is opened,
    # so a failure here leaves any existing output untouched.
    cleaned_dataset = list(map(clean_dataset_item, dataset))

    # Persist the cleaned items.
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(cleaned_dataset, sink, indent=2, ensure_ascii=False)

    print(f"Cleaned dataset saved to {output_file}")
    print(f"Processed {len(cleaned_dataset)} items")
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Script entry point: clean the stage-1 dataset using fixed paths that
    # are resolved relative to the current working directory.
    input_file, output_file = (
        "datasets/stage1_book_content.json",
        "datasets/stage1_book_content_cleaned.json",
    )
    process_dataset(input_file, output_file)
|