added style mimicking pipelines
This commit is contained in:
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test script for the styling data processor
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config
|
||||
|
||||
def test_styling_pipeline():
    """Run the styling data processor end to end on a small custom dataset.

    Three steps are exercised in order: loading a YAML configuration,
    building a custom configuration in code, and running the pipeline on a
    three-record JSONL file that this function writes to disk first.
    Failures in the YAML and pipeline steps are reported on stdout instead
    of being raised.
    """
    import json

    rule = "=" * 50
    print("Testing Styling Data Processor")
    print(rule)

    # Initialize the pipeline
    pipeline = StylingDataPipeline()

    # Example 1: Load configuration from YAML
    print("\n1. Loading configuration from YAML...")
    try:
        yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
        print(" ✅ YAML config loaded successfully!")
        # Attributes are read one at a time so a missing one only stops
        # the printout from that point on.
        for label, attr_name in (
            (" Output directory", "output_dir"),
            (" Instruction", "instruction"),
            (" Input field", "input_field"),
            (" Output field", "output_field"),
        ):
            print(f"{label}: {getattr(yaml_config, attr_name)}")
    except Exception as e:
        print(f" ❌ Error loading YAML config: {e}")
        yaml_config = None

    # Example 2: Create custom dataset configuration
    print("\n2. Creating custom dataset configuration...")
    custom_settings = {
        "data_path": "./data/raw/styling/formal_dataset.jsonl",
        "data_format": "jsonl",
        "input_field": "text",
        "output_field": "styled_text",
        "instruction": "Rewrite the following text in a formal style",
        "max_samples": 1000,
        "min_length": 10,
        "max_length": 256,
        "clean_text": True,
        "lowercase": False,
        "output_format": "alpaca",
    }
    custom_config = create_custom_config(**custom_settings)

    print(f" Input field: {custom_config.input_field} (maps to 'input')")
    print(f" Output field: {custom_config.output_field} (maps to 'output')")
    print(f" Instruction: {custom_config.instruction}")
    print(f" Max samples: {custom_config.max_samples}")

    # Example 3: Test with sample data (if available)
    print("\n3. Testing pipeline with sample data...")

    # Informal/formal sentence pairs used as the test dataset
    sentence_pairs = (
        ("Hey, what's up? How are you doing today?", "Hello, how are you doing today?"),
        ("This is really cool stuff!", "This is quite impressive material."),
        ("I'm gonna go to the store later.", "I will go to the store later."),
    )
    sample_data = [
        {"input": informal, "output": formal}
        for informal, formal in sentence_pairs
    ]

    # Write the records as JSON Lines so the pipeline has a file to read
    test_file = "./data/raw/styling/test_formal.jsonl"
    os.makedirs(os.path.dirname(test_file), exist_ok=True)
    with open(test_file, 'w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in sample_data
        )

    print(f" Created test file: {test_file}")

    # Test the pipeline with the sample data
    try:
        test_config = create_custom_config(
            data_path=test_file,
            data_format="jsonl",
            input_field="input",
            output_field="output",
            instruction="Rewrite the following text in a formal style",
            max_samples=10,
            output_format="alpaca",
        )

        print(" Running pipeline...")
        result = pipeline.run_pipeline(
            test_config,
            output_format="alpaca",
            save_splits=True,
            create_hf_dataset=True,
            save_hf_dataset=True,
        )

        print(" ✅ Pipeline completed successfully!")
        overall = result['analysis']['overall']
        print(f" Total samples: {overall['total_samples']}")
        print(f" Split sizes: {overall['split_sizes']}")
        print(f" Output directory: {result['output_dir']}")

        # Show HuggingFace dataset info if created
        if 'hf_dataset' in result:
            hf_dataset = result['hf_dataset']
            print(f" HuggingFace dataset created with {len(hf_dataset)} entries")
            print(f" Dataset features: {hf_dataset.features}")

            # Show save path if saved to disk
            if 'hf_dataset_path' in result:
                print(f" Dataset saved to: {result['hf_dataset_path']}")

            # Show formatted example
            if len(hf_dataset):
                print(" Example formatted text:")
                print(f" {hf_dataset[0]['text'][:200]}...")

        # Show sample processed data: only the first non-empty split
        print("\n Sample processed data:")
        for split_name, split_data in result['data'].items():
            if not split_data:
                continue
            print(f" {split_name} split:")
            for position, item in enumerate(split_data[:2], start=1):  # first 2 items
                print(f" Item {position}:")
                print(f" Instruction: {item['instruction']}")
                print(f" Input: {item['input'][:50]}...")
                print(f" Output: {item['output'][:50]}...")
            break

    except Exception as e:
        print(f" ❌ Error running pipeline: {e}")

    print("\n" + rule)
    print("Test completed!")
|
||||
|
||||
def test_hf_dataset_save_load():
    """Round-trip a small HuggingFace dataset through save and load.

    Two instruction/input/output records are converted with
    ``StylingDataPipeline.convert_to_hf_dataset``, saved with
    ``save_hf_dataset_to_disk`` and read back with
    ``load_hf_dataset_from_disk``. Progress and failures are printed.

    Returns:
        The dataset produced by the conversion step (not the reloaded one).
    """
    print("\nTesting HuggingFace Dataset Save/Load")
    print("=" * 50)

    from pipelines.styling.data_processor import save_hf_dataset_to_disk, load_hf_dataset_from_disk

    # Create a sample dataset for testing
    instruction = "Rewrite in formal style"
    sample_data = [
        {"instruction": instruction, "input": informal, "output": formal}
        for informal, formal in (
            ("Hey, what's up?", "Hello, how are you?"),
            ("This is really cool!", "This is quite impressive."),
        )
    ]

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction=instruction,
    )

    # Convert to HuggingFace dataset
    pipeline = StylingDataPipeline()
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)

    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")

    # Test saving to disk
    save_path = "./data/processed/styling/test_hf_dataset"
    print(f"\nSaving dataset to: {save_path}")

    success = save_hf_dataset_to_disk(hf_dataset, save_path)
    if not success:
        print("❌ Failed to save dataset")
        return hf_dataset

    print("✅ Dataset saved successfully!")

    # Test loading from disk
    print(f"\nLoading dataset from: {save_path}")
    loaded_dataset = load_hf_dataset_from_disk(save_path)
    if loaded_dataset is None:
        print("❌ Failed to load dataset")
        return hf_dataset

    print("✅ Dataset loaded successfully!")
    print(f"Loaded dataset has {len(loaded_dataset)} entries")
    print(f"Features: {loaded_dataset.features}")

    # Show sample data
    print("\nSample loaded data:")
    for index in range(len(loaded_dataset)):
        print(f" Entry {index + 1}: {loaded_dataset[index]['text'][:100]}...")

    return hf_dataset
|
||||
|
||||
def test_hf_dataset_conversion():
    """Convert two sample records to a HuggingFace dataset and print them.

    The records already carry an ``instruction`` field. After conversion the
    entry count, the dataset features and the first 150 characters of each
    entry's ``text`` are printed.

    Returns:
        The dataset returned by ``StylingDataPipeline.convert_to_hf_dataset``.
    """
    print("\nTesting HuggingFace Dataset Conversion")
    print("=" * 50)

    pipeline = StylingDataPipeline()

    # Sample data with instruction field
    instruction = "Rewrite in formal style"
    sample_data = [
        {"instruction": instruction, "input": informal, "output": formal}
        for informal, formal in (
            ("Hey, what's up?", "Hello, how are you?"),
            ("This is really cool!", "This is quite impressive."),
        )
    ]

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction=instruction,
    )

    # Convert to HuggingFace dataset
    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
    entry_count = len(hf_dataset)

    print(f"HuggingFace dataset created with {entry_count} entries")
    print(f"Dataset features: {hf_dataset.features}")

    # Show formatted examples
    print("\nFormatted examples:")
    for index in range(entry_count):
        print(f" Example {index + 1}:")
        print(f" {hf_dataset[index]['text'][:150]}...")
        print()

    # Test the dataset can be used for training
    print("Dataset ready for training!")
    print(f"Number of training examples: {entry_count}")

    return hf_dataset
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: only the end-to-end pipeline check is run.
    test_styling_pipeline()
    # The two HuggingFace dataset checks are disabled here; uncomment to run.
    # test_hf_dataset_save_load()
    # test_hf_dataset_conversion()
|
||||
Reference in New Issue
Block a user