#!/usr/bin/env python3
"""
Test script for the styling data processor
"""

import json
import os
import sys

# Make the directory containing this script importable so that the
# `pipelines` package resolves when the script is run directly.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from pipelines.styling.data_processor import (
    StylingDataPipeline,
    create_custom_config,
    create_huggingface_config,
)


def _formal_style_samples():
    """Return a fresh list of two instruction/input/output sample records."""
    return [
        {
            "instruction": "Rewrite in formal style",
            "input": "Hey, what's up?",
            "output": "Hello, how are you?",
        },
        {
            "instruction": "Rewrite in formal style",
            "input": "This is really cool!",
            "output": "This is quite impressive.",
        },
    ]


def test_styling_pipeline():
    """Test the styling data processor with custom data.

    Walks through three steps, printing progress for each:

    1. Load a configuration from ``./configs/styling/formal.yaml``.
    2. Build a custom dataset configuration and print its fields.
    3. Write a three-record JSONL file under ``./data/raw/styling/`` and
       run the pipeline on it.

    Errors in steps 1 and 3 are caught and printed rather than raised, so
    the function always runs to the final "Test completed!" message.
    """
    print("Testing Styling Data Processor")
    print("=" * 50)

    # Initialize the pipeline
    pipeline = StylingDataPipeline()

    # Example 1: Load configuration from YAML
    print("\n1. Loading configuration from YAML...")
    try:
        yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
        print(" ✅ YAML config loaded successfully!")
        print(f" Output directory: {yaml_config.output_dir}")
        print(f" Instruction: {yaml_config.instruction}")
        print(f" Input field: {yaml_config.input_field}")
        print(f" Output field: {yaml_config.output_field}")
    except Exception as e:
        # Best-effort: a missing or invalid YAML file must not stop the
        # remaining examples from running.
        print(f" ❌ Error loading YAML config: {e}")
        yaml_config = None

    # Example 2: Create custom dataset configuration
    print("\n2. Creating custom dataset configuration...")
    custom_config = create_custom_config(
        data_path="./data/raw/styling/formal_dataset.jsonl",
        data_format="jsonl",
        input_field="text",
        output_field="styled_text",
        instruction="Rewrite the following text in a formal style",
        max_samples=1000,
        min_length=10,
        max_length=256,
        clean_text=True,
        lowercase=False,
        output_format="alpaca",
    )
    print(f" Input field: {custom_config.input_field} (maps to 'input')")
    print(f" Output field: {custom_config.output_field} (maps to 'output')")
    print(f" Instruction: {custom_config.instruction}")
    print(f" Max samples: {custom_config.max_samples}")

    # Example 3: Test with sample data (if available)
    print("\n3. Testing pipeline with sample data...")

    # Create a sample dataset for testing
    sample_data = [
        {
            "input": "Hey, what's up? How are you doing today?",
            "output": "Hello, how are you doing today?",
        },
        {
            "input": "This is really cool stuff!",
            "output": "This is quite impressive material.",
        },
        {
            "input": "I'm gonna go to the store later.",
            "output": "I will go to the store later.",
        },
    ]

    # Save sample data to test file (one JSON object per line)
    test_file = "./data/raw/styling/test_formal.jsonl"
    os.makedirs(os.path.dirname(test_file), exist_ok=True)
    with open(test_file, 'w', encoding='utf-8') as f:
        for item in sample_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f" Created test file: {test_file}")

    # Test the pipeline with the sample data
    try:
        test_config = create_custom_config(
            data_path=test_file,
            data_format="jsonl",
            input_field="input",
            output_field="output",
            instruction="Rewrite the following text in a formal style",
            max_samples=10,
            output_format="alpaca",
        )

        print(" Running pipeline...")
        result = pipeline.run_pipeline(
            test_config,
            output_format="alpaca",
            save_splits=True,
            create_hf_dataset=True,
            save_hf_dataset=True,
        )

        print(" ✅ Pipeline completed successfully!")
        print(f" Total samples: {result['analysis']['overall']['total_samples']}")
        print(f" Split sizes: {result['analysis']['overall']['split_sizes']}")
        print(f" Output directory: {result['output_dir']}")

        # Show HuggingFace dataset info if created
        if 'hf_dataset' in result:
            hf_dataset = result['hf_dataset']
            print(f" HuggingFace dataset created with {len(hf_dataset)} entries")
            print(f" Dataset features: {hf_dataset.features}")

            # Show save path if saved to disk
            if 'hf_dataset_path' in result:
                print(f" Dataset saved to: {result['hf_dataset_path']}")

            # Show formatted example
            if len(hf_dataset) > 0:
                print(" Example formatted text:")
                print(f" {hf_dataset[0]['text'][:200]}...")

        # Show sample processed data
        print("\n Sample processed data:")
        for split_name, split_data in result['data'].items():
            if split_data:
                print(f" {split_name} split:")
                # Show first 2 items
                for i, item in enumerate(split_data[:2], start=1):
                    print(f" Item {i}:")
                    print(f" Instruction: {item['instruction']}")
                    print(f" Input: {item['input'][:50]}...")
                    print(f" Output: {item['output'][:50]}...")
                # Only the first non-empty split is previewed.
                break

    except Exception as e:
        # Best-effort: report the failure and still print the footer below.
        print(f" ❌ Error running pipeline: {e}")

    print("\n" + "=" * 50)
    print("Test completed!")


def test_hf_dataset_save_load():
    """Test HuggingFace dataset save and load functionality.

    Converts two sample records to a dataset, saves it to
    ``./data/processed/styling/test_hf_dataset``, loads it back and prints
    a preview of each entry.

    Returns:
        The dataset produced by ``convert_to_hf_dataset`` (the in-memory
        one, not the copy loaded back from disk).
    """
    print("\nTesting HuggingFace Dataset Save/Load")
    print("=" * 50)

    # Imported here rather than at module level so the default entry point
    # (test_styling_pipeline) does not depend on these helpers.
    from pipelines.styling.data_processor import (
        save_hf_dataset_to_disk,
        load_hf_dataset_from_disk,
    )

    # Create a sample dataset for testing
    sample_data = _formal_style_samples()

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction="Rewrite in formal style",
    )

    # Convert to HuggingFace dataset
    pipeline = StylingDataPipeline()
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")

    # Test saving to disk
    save_path = "./data/processed/styling/test_hf_dataset"
    print(f"\nSaving dataset to: {save_path}")
    success = save_hf_dataset_to_disk(hf_dataset, save_path)

    if success:
        print("✅ Dataset saved successfully!")

        # Test loading from disk
        print(f"\nLoading dataset from: {save_path}")
        loaded_dataset = load_hf_dataset_from_disk(save_path)

        if loaded_dataset is not None:
            print("✅ Dataset loaded successfully!")
            print(f"Loaded dataset has {len(loaded_dataset)} entries")
            print(f"Features: {loaded_dataset.features}")

            # Show sample data; index access is kept because only len()
            # and [] are known to be supported by the returned object.
            print("\nSample loaded data:")
            for i in range(len(loaded_dataset)):
                print(f" Entry {i+1}: {loaded_dataset[i]['text'][:100]}...")
        else:
            print("❌ Failed to load dataset")
    else:
        print("❌ Failed to save dataset")

    return hf_dataset


def test_hf_dataset_conversion():
    """Test the HuggingFace dataset conversion.

    Converts two sample records with ``convert_to_hf_dataset`` and prints
    the dataset size, its features, and the first 150 characters of each
    entry's ``text`` field.

    Returns:
        The dataset produced by ``convert_to_hf_dataset``.
    """
    print("\nTesting HuggingFace Dataset Conversion")
    print("=" * 50)

    pipeline = StylingDataPipeline()

    # Sample data with instruction field
    sample_data = _formal_style_samples()

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction="Rewrite in formal style",
    )

    # Convert to HuggingFace dataset
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)

    print(f"HuggingFace dataset created with {len(hf_dataset)} entries")
    print(f"Dataset features: {hf_dataset.features}")

    # Show formatted examples
    print("\nFormatted examples:")
    for i in range(len(hf_dataset)):
        print(f" Example {i+1}:")
        print(f" {hf_dataset[i]['text'][:150]}...")
        print()

    # Test the dataset can be used for training
    print("Dataset ready for training!")
    print(f"Number of training examples: {len(hf_dataset)}")

    return hf_dataset


if __name__ == "__main__":
    test_styling_pipeline()
    # test_hf_dataset_save_load()
    # test_hf_dataset_conversion()