# Source: DS-LLM-TEMPLATE-FINETUNING/.ipynb_checkpoints/test-checkpoint.py
# (Python, 252 lines, 8.6 KiB; last changed 2025-08-28 22:41:56 +00:00)
#!/usr/bin/env python3
"""
Test script for the styling data processor
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config
def test_styling_pipeline():
    """Run the styling data pipeline on a tiny sample and print a report.

    The walkthrough has three steps: load a YAML configuration, build a
    custom configuration, then write a three-record JSONL file and run the
    pipeline on it. Errors raised while loading the YAML file or while
    running the pipeline are caught and printed; errors raised anywhere
    else propagate to the caller.
    """
    import json

    rule = "=" * 50
    print("Testing Styling Data Processor")
    print(rule)

    pipeline = StylingDataPipeline()

    # Step 1: configuration read from a YAML file.
    print("\n1. Loading configuration from YAML...")
    try:
        formal_cfg = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
        print(" ✅ YAML config loaded successfully!")
        print(f" Output directory: {formal_cfg.output_dir}")
        print(f" Instruction: {formal_cfg.instruction}")
        print(f" Input field: {formal_cfg.input_field}")
        print(f" Output field: {formal_cfg.output_field}")
    except Exception as err:
        print(f" ❌ Error loading YAML config: {err}")
        formal_cfg = None

    # Step 2: configuration assembled in code; it is only inspected here,
    # never handed to the pipeline.
    print("\n2. Creating custom dataset configuration...")
    custom_settings = dict(
        data_path="./data/raw/styling/formal_dataset.jsonl",
        data_format="jsonl",
        input_field="text",
        output_field="styled_text",
        instruction="Rewrite the following text in a formal style",
        max_samples=1000,
        min_length=10,
        max_length=256,
        clean_text=True,
        lowercase=False,
        output_format="alpaca",
    )
    handmade_cfg = create_custom_config(**custom_settings)
    print(f" Input field: {handmade_cfg.input_field} (maps to 'input')")
    print(f" Output field: {handmade_cfg.output_field} (maps to 'output')")
    print(f" Instruction: {handmade_cfg.instruction}")
    print(f" Max samples: {handmade_cfg.max_samples}")

    # Step 3: write a small informal -> formal fixture and process it.
    print("\n3. Testing pipeline with sample data...")
    informal_formal_pairs = [
        ("Hey, what's up? How are you doing today?", "Hello, how are you doing today?"),
        ("This is really cool stuff!", "This is quite impressive material."),
        ("I'm gonna go to the store later.", "I will go to the store later."),
    ]
    records = [
        {"input": informal, "output": formal}
        for informal, formal in informal_formal_pairs
    ]

    fixture_path = "./data/raw/styling/test_formal.jsonl"
    os.makedirs(os.path.dirname(fixture_path), exist_ok=True)
    with open(fixture_path, 'w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )
    print(f" Created test file: {fixture_path}")

    try:
        fixture_cfg = create_custom_config(
            data_path=fixture_path,
            data_format="jsonl",
            input_field="input",
            output_field="output",
            instruction="Rewrite the following text in a formal style",
            max_samples=10,
            output_format="alpaca",
        )
        print(" Running pipeline...")
        result = pipeline.run_pipeline(
            fixture_cfg,
            output_format="alpaca",
            save_splits=True,
            create_hf_dataset=True,
            save_hf_dataset=True,
        )
        print(" ✅ Pipeline completed successfully!")

        overall = result['analysis']['overall']
        print(f" Total samples: {overall['total_samples']}")
        print(f" Split sizes: {overall['split_sizes']}")
        print(f" Output directory: {result['output_dir']}")

        # The HuggingFace dataset entries are optional in the result.
        if 'hf_dataset' in result:
            hf_dataset = result['hf_dataset']
            print(f" HuggingFace dataset created with {len(hf_dataset)} entries")
            print(f" Dataset features: {hf_dataset.features}")
            if 'hf_dataset_path' in result:
                print(f" Dataset saved to: {result['hf_dataset_path']}")
            if len(hf_dataset) != 0:
                print(" Example formatted text:")
                print(f" {hf_dataset[0]['text'][:200]}...")

        # Preview at most two items from the first non-empty split only.
        print("\n Sample processed data:")
        for split_name, split_items in result['data'].items():
            if not split_items:
                continue
            print(f" {split_name} split:")
            for position, entry in enumerate(split_items[:2], start=1):
                print(f" Item {position}:")
                print(f" Instruction: {entry['instruction']}")
                print(f" Input: {entry['input'][:50]}...")
                print(f" Output: {entry['output'][:50]}...")
            break
    except Exception as err:
        print(f" ❌ Error running pipeline: {err}")

    print("\n" + rule)
    print("Test completed!")
def test_hf_dataset_save_load():
    """Round-trip a two-record HuggingFace dataset through disk.

    Builds the dataset with ``StylingDataPipeline.convert_to_hf_dataset``,
    saves it, reloads it, and prints the outcome of each stage.

    Returns:
        The in-memory dataset that was created, whether or not the save or
        load stage succeeded.
    """
    print("\nTesting HuggingFace Dataset Save/Load")
    print("=" * 50)

    from pipelines.styling.data_processor import (
        save_hf_dataset_to_disk,
        load_hf_dataset_from_disk,
    )

    instruction = "Rewrite in formal style"
    informal_formal_pairs = [
        ("Hey, what's up?", "Hello, how are you?"),
        ("This is really cool!", "This is quite impressive."),
    ]
    records = [
        {"instruction": instruction, "input": informal, "output": formal}
        for informal, formal in informal_formal_pairs
    ]

    # The data path is a placeholder; the records above are passed directly.
    config = create_custom_config(data_path="dummy", instruction=instruction)

    pipeline = StylingDataPipeline()
    hf_dataset = pipeline.convert_to_hf_dataset(records, config)
    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")

    target_dir = "./data/processed/styling/test_hf_dataset"
    print(f"\nSaving dataset to: {target_dir}")
    saved = save_hf_dataset_to_disk(hf_dataset, target_dir)
    if not saved:
        print("❌ Failed to save dataset")
        return hf_dataset
    print("✅ Dataset saved successfully!")

    print(f"\nLoading dataset from: {target_dir}")
    reloaded = load_hf_dataset_from_disk(target_dir)
    if reloaded is None:
        print("❌ Failed to load dataset")
        return hf_dataset

    print("✅ Dataset loaded successfully!")
    print(f"Loaded dataset has {len(reloaded)} entries")
    print(f"Features: {reloaded.features}")

    print("\nSample loaded data:")
    for idx in range(len(reloaded)):
        print(f" Entry {idx + 1}: {reloaded[idx]['text'][:100]}...")

    return hf_dataset
def test_hf_dataset_conversion():
    """Convert two instruction records to a HuggingFace dataset and print it.

    Prints the entry count, the dataset features, and the first 150
    characters of each entry's ``text`` field.

    Returns:
        The dataset produced by ``StylingDataPipeline.convert_to_hf_dataset``.
    """
    print("\nTesting HuggingFace Dataset Conversion")
    print("=" * 50)

    pipeline = StylingDataPipeline()

    instruction = "Rewrite in formal style"
    informal_formal_pairs = [
        ("Hey, what's up?", "Hello, how are you?"),
        ("This is really cool!", "This is quite impressive."),
    ]
    records = [
        {"instruction": instruction, "input": informal, "output": formal}
        for informal, formal in informal_formal_pairs
    ]

    # The data path is a placeholder; the records above are passed directly.
    config = create_custom_config(data_path="dummy", instruction=instruction)

    hf_dataset = pipeline.convert_to_hf_dataset(records, config)
    total = len(hf_dataset)
    print(f"HuggingFace dataset created with {total} entries")
    print(f"Dataset features: {hf_dataset.features}")

    print("\nFormatted examples:")
    for number in range(1, total + 1):
        print(f" Example {number}:")
        print(f" {hf_dataset[number - 1]['text'][:150]}...")
        print()

    print("Dataset ready for training!")
    print(f"Number of training examples: {total}")
    return hf_dataset
if __name__ == "__main__":
    # Only the end-to-end pipeline walkthrough runs by default. Uncomment the
    # calls below to also run the HuggingFace save/load and conversion checks.
    test_styling_pipeline()
    # test_hf_dataset_save_load()
    # test_hf_dataset_conversion()