#!/usr/bin/env python3
|
|
"""
|
|
Data processor script that uses YAML configurations.
|
|
This provides a more flexible and maintainable approach than command-line arguments alone.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import subprocess
|
|
from pathlib import Path
|
|
from utils.config.config_manager import ConfigManager
|
|
|
|
def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run the classification data processor in a subprocess with a YAML config.

    The command line is ``<interpreter> pipelines/classification/data_processor.py
    --config <config_path>`` followed by one ``--flag value`` pair per
    keyword override.

    Args:
        config_path: Path of the YAML configuration file, forwarded to the
            data processor through ``--config``.
        **cli_overrides: Extra options forwarded to the processor. Underscores
            in each key are replaced by dashes (``max_samples`` becomes
            ``--max-samples``) and each value is converted with ``str()``.
            Overrides whose value is ``None`` are skipped.

    Returns:
        ``True`` when the subprocess exits with status 0; ``False`` when it
        exits with a non-zero status or cannot be started at all.
    """
    print(f"=== Running with YAML config: {config_path} ===")

    # Launch the child with the interpreter that is running this script.
    # A bare "python" may resolve to a different installation, or to nothing
    # at all on systems that only ship "python3".
    cmd = [
        sys.executable, "pipelines/classification/data_processor.py",
        "--config", str(config_path),
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    # Keep the try body down to the one call that can raise.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The process could not be started (e.g. missing executable).
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
|
|
|
|
def run_classification_examples():
    """Run the two example classification jobs driven by YAML configs.

    The emotion example always runs, with ``max_samples`` and ``output_dir``
    passed as overrides. The custom-dataset example runs only when
    ``data/classification/train.jsonl`` exists; otherwise a skip notice is
    printed.
    """
    # Example 1: Emotion classification
    print("=== Example 1: Emotion Classification ===")
    emotion_overrides = {
        "max_samples": 500,  # Override YAML value
        "output_dir": "./data/emotion_small",
    }
    if run_with_yaml_config("configs/classification/emotion.yaml", **emotion_overrides):
        print("✅ Emotion classification completed!")

    # Example 2: Custom dataset (if available)
    print("\n=== Example 2: Custom Dataset ===")
    custom_data = "data/classification/train.jsonl"
    if not os.path.exists(custom_data):
        print("⚠️ Custom dataset not found, skipping...")
        return

    custom_ok = run_with_yaml_config(
        "configs/classification/custom.yaml",  # You can create this
        data_source="custom",
        data_path=custom_data,
        output_dir="./data/custom_processed",
    )
    if custom_ok:
        print("✅ Custom dataset processing completed!")
|
|
|
|
def create_custom_config():
    """Write a sample custom classification config to disk.

    The file is written to ``configs/classification/custom.yaml`` relative to
    the current working directory. Missing parent directories are created, and
    an existing file at that path is overwritten. The YAML has four top-level
    sections (``task``, ``data``, ``processing``, ``output``), each holding
    its settings as nested keys.
    """
    # Settings are indented under their section header so the YAML parses
    # as a nested mapping rather than a flat list of keys.
    custom_config = """task:
  name: "classification"
  type: "sequence_classification"

data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1

processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000

output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""

    config_path = "configs/classification/custom.yaml"
    # open() does not create directories; make sure the target folder exists.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")
|
|
|
|
def show_yaml_benefits():
    """Print a summary of YAML-config benefits followed by usage examples.

    Everything goes to standard output; nothing is returned.
    """
    # Each entry is one printed line; an empty string yields a blank line.
    report_lines = (
        "=== YAML Configuration Benefits ===",
        "",
        "1. **Separation of Concerns**:",
        " - Configuration separate from code",
        " - Easy to version control configs",
        " - No need to modify scripts for different experiments",
        "",
        "2. **Flexibility**:",
        " - Can override YAML values with CLI args",
        " - Multiple configs for different experiments",
        " - Easy to share and reproduce experiments",
        "",
        "3. **Maintainability**:",
        " - All parameters in one place",
        " - Easy to understand and modify",
        " - Self-documenting configurations",
        "",
        "4. **Scalability**:",
        " - Easy to add new parameters",
        " - Hierarchical configuration structure",
        " - Support for complex nested configurations",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/run_data_processor_yaml.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/run_data_processor_yaml.py --config configs/classification/emotion.yaml --max-samples 500",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/run_data_processor_yaml.py --data-source huggingface --dataset-name dair-ai/emotion",
    )
    for text in report_lines:
        print(text)
|
|
|
|
def main():
    """Dispatch on the first command-line argument.

    With no argument, an overview and usage text is printed. The options
    ``examples``, ``create-config`` and ``benefits`` run the matching
    function; any other value prints an "Unknown option" message followed
    by a usage hint.
    """
    cli_args = sys.argv[1:]

    # No sub-command given: print the overview and stop.
    if not cli_args:
        overview = (
            "YAML-Based Data Processor",
            "========================",
            "",
            "This script demonstrates using YAML configurations instead of",
            "command-line arguments for better flexibility and maintainability.",
            "",
            "Usage:",
            " python scripts/run_data_processor_yaml.py examples # Run examples",
            " python scripts/run_data_processor_yaml.py create-config # Create custom config",
            " python scripts/run_data_processor_yaml.py benefits # Show YAML benefits",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        for text in overview:
            print(text)
        return

    option = cli_args[0]
    if option not in ("examples", "create-config", "benefits"):
        print(f"Unknown option: {option}")
        print("Use: python scripts/run_data_processor_yaml.py [examples|create-config|benefits]")
        return

    # Map each recognised option to the function that implements it.
    handlers = {
        "examples": run_classification_examples,
        "create-config": create_custom_config,
        "benefits": show_yaml_benefits,
    }
    handlers[option]()
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()