252 lines
8.6 KiB
Python
252 lines
8.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
Test script for the styling data processor
"""
|
|
|
|
import sys
|
|
import os
|
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
|
|
|
from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config
|
|
|
|
def _write_jsonl(path, records):
    """Write *records* to *path* as UTF-8 JSON Lines (one JSON object per line).

    The parent directory of *path* is created first when the path has one;
    a bare filename is written to the current directory as-is.
    """
    import json

    parent = os.path.dirname(path)
    if parent:
        # os.makedirs('') raises, so only create a directory when one is named.
        os.makedirs(parent, exist_ok=True)

    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )


def _report_pipeline_result(result):
    """Print a human-readable summary of the mapping returned by ``run_pipeline``.

    Reads ``result['analysis']['overall']``, ``result['output_dir']`` and
    ``result['data']``; the ``'hf_dataset'`` and ``'hf_dataset_path'`` keys
    are reported only when present.
    """
    print(" ✅ Pipeline completed successfully!")
    print(f" Total samples: {result['analysis']['overall']['total_samples']}")
    print(f" Split sizes: {result['analysis']['overall']['split_sizes']}")
    print(f" Output directory: {result['output_dir']}")

    # Show HuggingFace dataset info if created
    if 'hf_dataset' in result:
        hf_dataset = result['hf_dataset']
        print(f" HuggingFace dataset created with {len(hf_dataset)} entries")
        print(f" Dataset features: {hf_dataset.features}")

        # Show save path if saved to disk
        if 'hf_dataset_path' in result:
            print(f" Dataset saved to: {result['hf_dataset_path']}")

        # Show formatted example.
        # NOTE(review): the original indentation was lost; this check is kept
        # inside the 'hf_dataset' branch because it reads hf_dataset — confirm.
        if len(hf_dataset) > 0:
            print(" Example formatted text:")
            print(f" {hf_dataset[0]['text'][:200]}...")

    # Show sample processed data
    print("\n Sample processed data:")
    for split_name, split_data in result['data'].items():
        if not split_data:
            continue
        print(f" {split_name} split:")
        for i, item in enumerate(split_data[:2]):  # Show first 2 items
            print(f" Item {i+1}:")
            print(f" Instruction: {item['instruction']}")
            print(f" Input: {item['input'][:50]}...")
            print(f" Output: {item['output'][:50]}...")
        # NOTE(review): break placement reconstructed (indentation was lost);
        # as written, only the first non-empty split is shown — confirm.
        break


def test_styling_pipeline():
    """Test the styling data processor with custom data.

    Walks through three steps, printing progress as it goes:

    1. Load a configuration from ``./configs/styling/formal.yaml``.
    2. Build a custom dataset configuration with ``create_custom_config``.
    3. Write a three-record JSONL sample file, run the pipeline on it and
       print a summary of the result.

    Failures in steps 1 and 3 are caught and printed rather than raised, so
    this function reports problems on stdout instead of failing.
    """
    print("Testing Styling Data Processor")
    print("=" * 50)

    # Initialize the pipeline
    pipeline = StylingDataPipeline()

    # Example 1: Load configuration from YAML
    print("\n1. Loading configuration from YAML...")
    try:
        yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
        print(" ✅ YAML config loaded successfully!")
        print(f" Output directory: {yaml_config.output_dir}")
        print(f" Instruction: {yaml_config.instruction}")
        print(f" Input field: {yaml_config.input_field}")
        print(f" Output field: {yaml_config.output_field}")
    except Exception as e:
        # Best-effort: a missing or invalid YAML file must not stop the rest
        # of the walkthrough.
        print(f" ❌ Error loading YAML config: {e}")

    # Example 2: Create custom dataset configuration
    print("\n2. Creating custom dataset configuration...")
    custom_config = create_custom_config(
        data_path="./data/raw/styling/formal_dataset.jsonl",
        data_format="jsonl",
        input_field="text",
        output_field="styled_text",
        instruction="Rewrite the following text in a formal style",
        max_samples=1000,
        min_length=10,
        max_length=256,
        clean_text=True,
        lowercase=False,
        output_format="alpaca",
    )

    print(f" Input field: {custom_config.input_field} (maps to 'input')")
    print(f" Output field: {custom_config.output_field} (maps to 'output')")
    print(f" Instruction: {custom_config.instruction}")
    print(f" Max samples: {custom_config.max_samples}")

    # Example 3: Test with sample data (if available)
    print("\n3. Testing pipeline with sample data...")

    # Create a sample dataset for testing
    sample_data = [
        {
            "input": "Hey, what's up? How are you doing today?",
            "output": "Hello, how are you doing today?",
        },
        {
            "input": "This is really cool stuff!",
            "output": "This is quite impressive material.",
        },
        {
            "input": "I'm gonna go to the store later.",
            "output": "I will go to the store later.",
        },
    ]

    # Save sample data to test file
    test_file = "./data/raw/styling/test_formal.jsonl"
    _write_jsonl(test_file, sample_data)
    print(f" Created test file: {test_file}")

    # Test the pipeline with the sample data
    try:
        test_config = create_custom_config(
            data_path=test_file,
            data_format="jsonl",
            input_field="input",
            output_field="output",
            instruction="Rewrite the following text in a formal style",
            max_samples=10,
            output_format="alpaca",
        )

        print(" Running pipeline...")
        result = pipeline.run_pipeline(
            test_config,
            output_format="alpaca",
            save_splits=True,
            create_hf_dataset=True,
            save_hf_dataset=True,
        )

        _report_pipeline_result(result)
    except Exception as e:
        # Best-effort: report the failure and still print the closing banner.
        print(f" ❌ Error running pipeline: {e}")

    print("\n" + "=" * 50)
    print("Test completed!")
|
|
|
|
def test_hf_dataset_save_load():
    """Test HuggingFace dataset save and load functionality.

    Converts two in-memory instruction/input/output records with
    ``StylingDataPipeline.convert_to_hf_dataset``, saves the result to
    ``./data/processed/styling/test_hf_dataset`` with
    ``save_hf_dataset_to_disk``, reloads it with
    ``load_hf_dataset_from_disk`` and prints what was read back.

    Returns:
        The dataset produced by ``convert_to_hf_dataset`` (not the reloaded
        copy), regardless of whether saving or loading succeeded.
    """
    # Imported here, as in the original, so the helpers are only required
    # when this particular check is run.
    from pipelines.styling.data_processor import save_hf_dataset_to_disk, load_hf_dataset_from_disk

    print("\nTesting HuggingFace Dataset Save/Load")
    print("=" * 50)

    # Create a sample dataset for testing
    sample_data = [
        {
            "instruction": "Rewrite in formal style",
            "input": "Hey, what's up?",
            "output": "Hello, how are you?",
        },
        {
            "instruction": "Rewrite in formal style",
            "input": "This is really cool!",
            "output": "This is quite impressive.",
        },
    ]

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction="Rewrite in formal style",
    )

    # Convert to HuggingFace dataset
    pipeline = StylingDataPipeline()
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)

    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")

    # Test saving to disk
    save_path = "./data/processed/styling/test_hf_dataset"
    print(f"\nSaving dataset to: {save_path}")

    success = save_hf_dataset_to_disk(hf_dataset, save_path)
    if success:
        print("✅ Dataset saved successfully!")

        # Test loading from disk
        print(f"\nLoading dataset from: {save_path}")
        loaded_dataset = load_hf_dataset_from_disk(save_path)

        if loaded_dataset is not None:
            print("✅ Dataset loaded successfully!")
            print(f"Loaded dataset has {len(loaded_dataset)} entries")
            print(f"Features: {loaded_dataset.features}")

            # Show sample data.  Integer indexing is kept deliberately: it is
            # the only access pattern the original relies on for this object.
            print("\nSample loaded data:")
            for i in range(len(loaded_dataset)):
                print(f" Entry {i+1}: {loaded_dataset[i]['text'][:100]}...")
        else:
            print("❌ Failed to load dataset")
    else:
        print("❌ Failed to save dataset")

    return hf_dataset
|
|
|
|
def test_hf_dataset_conversion():
    """Test the HuggingFace dataset conversion.

    Converts two in-memory instruction/input/output records with
    ``StylingDataPipeline.convert_to_hf_dataset`` and prints the entry
    count, the dataset features and the first 150 characters of each
    entry's ``'text'`` field.

    Returns:
        The dataset produced by ``convert_to_hf_dataset``.
    """
    print("\nTesting HuggingFace Dataset Conversion")
    print("=" * 50)

    pipeline = StylingDataPipeline()

    # Sample data with instruction field
    sample_data = [
        {
            "instruction": "Rewrite in formal style",
            "input": "Hey, what's up?",
            "output": "Hello, how are you?",
        },
        {
            "instruction": "Rewrite in formal style",
            "input": "This is really cool!",
            "output": "This is quite impressive.",
        },
    ]

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction="Rewrite in formal style",
    )

    # Convert to HuggingFace dataset
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)

    print(f"HuggingFace dataset created with {len(hf_dataset)} entries")
    print(f"Dataset features: {hf_dataset.features}")

    # Show formatted examples.  Integer indexing is kept deliberately: it is
    # the only access pattern the original relies on for this object.
    print("\nFormatted examples:")
    for i in range(len(hf_dataset)):
        print(f" Example {i+1}:")
        print(f" {hf_dataset[i]['text'][:150]}...")
        print()

    # Test the dataset can be used for training
    print("Dataset ready for training!")
    print(f"Number of training examples: {len(hf_dataset)}")

    return hf_dataset
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: only the end-to-end pipeline check runs by default.
    # The two HuggingFace dataset checks below are left disabled, as in the
    # original script; uncomment a line to run that check as well.
    test_styling_pipeline()
    # test_hf_dataset_save_load()
    # test_hf_dataset_conversion()
|