Files

168 lines
5.7 KiB
Python
Raw Permalink Normal View History

2025-08-06 22:45:37 +01:00
#!/usr/bin/env python3
"""
Data processor script that uses YAML configurations.
This provides a more flexible and maintainable approach than command-line arguments alone.
"""
import sys
import os
import subprocess
from pathlib import Path
from utils.config.config_manager import ConfigManager
def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run the classification data processor with a YAML configuration.

    Builds a command line for ``pipelines/classification/data_processor.py``,
    runs it as a child process and prints the captured output.

    Args:
        config_path: Path to the YAML file, forwarded as ``--config``.
        **cli_overrides: Extra options forwarded to the data processor. Each
            ``some_key=value`` pair becomes ``--some-key value``; pairs whose
            value is ``None`` are skipped.

    Returns:
        ``True`` when the child process exits with status 0, ``False`` when
        it exits with a non-zero status or cannot be started at all.
    """
    print(f"=== Running with YAML config: {config_path} ===")

    # Use the interpreter that is running this script instead of whatever
    # "python" resolves to on PATH (it may be absent or another environment).
    cmd = [
        sys.executable or "python",
        "pipelines/classification/data_processor.py",
        "--config", str(config_path),
    ]

    # Translate keyword overrides into "--dashed-option value" pairs.
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        # Output is captured, so surface whatever the child wrote before it
        # failed; otherwise that diagnostic information would be lost.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The process could not be launched at all (e.g. missing executable).
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
def run_classification_examples():
    """Run the two demo jobs that drive the data processor from YAML configs.

    The first job always runs against the emotion config with two values
    overridden on the command line. The second job runs only when the custom
    training file exists on disk; otherwise a skip notice is printed.
    """
    # --- Example 1: emotion classification (keyword values override YAML) ---
    print("=== Example 1: Emotion Classification ===")
    emotion_ok = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        max_samples=500,
        output_dir="./data/emotion_small",
    )
    if emotion_ok:
        print("✅ Emotion classification completed!")

    # --- Example 2: custom dataset, attempted only if the file is present ---
    print("\n=== Example 2: Custom Dataset ===")
    custom_train_file = "data/classification/train.jsonl"
    if not os.path.exists(custom_train_file):
        print("⚠️ Custom dataset not found, skipping...")
        return

    custom_ok = run_with_yaml_config(
        "configs/classification/custom.yaml",  # can be generated beforehand
        data_source="custom",
        data_path=custom_train_file,
        output_dir="./data/custom_processed",
    )
    if custom_ok:
        print("✅ Custom dataset processing completed!")
def create_custom_config():
    """Write a sample custom-dataset configuration file.

    The file is written to ``configs/classification/custom.yaml`` relative to
    the current working directory. Missing parent directories are created and
    an existing file at that path is overwritten. The YAML is grouped into
    four nested sections: ``task``, ``data``, ``processing`` and ``output``.
    """
    # Spelled out line by line so the two-space nesting under each section
    # is explicit; without it every section would parse as empty.
    custom_config = (
        "task:\n"
        '  name: "classification"\n'
        '  type: "sequence_classification"\n'
        "data:\n"
        '  source: "custom"\n'
        '  data_format: "jsonl"\n'
        '  input_field: "text"\n'
        '  label_field: "label"\n'
        "  max_samples: 1000\n"
        "  train_split: 0.8\n"
        "  validation_split: 0.1\n"
        "  test_split: 0.1\n"
        "processing:\n"
        "  clean_text: true\n"
        "  lowercase: true\n"
        "  min_length: 10\n"
        "  max_length: 1000\n"
        "output:\n"
        '  output_dir: "./data/custom_processed"\n'
        '  output_format: "classification"\n'
    )

    config_path = "configs/classification/custom.yaml"
    config_file = Path(config_path)

    # A fresh checkout may not have the target directory yet; create it so
    # the write does not fail with FileNotFoundError.
    config_file.parent.mkdir(parents=True, exist_ok=True)
    config_file.write_text(custom_config, encoding="utf-8")

    print(f"✅ Created custom config: {config_path}")
def show_yaml_benefits():
    """Print the benefits of YAML configurations and sample command lines.

    Output goes to stdout: four numbered benefit sections with three bullet
    points each, followed by three usage examples. The example commands
    target ``pipelines/classification/data_processor.py``, the script that
    accepts ``--config`` and the override flags; this wrapper script itself
    only understands the ``examples``, ``create-config`` and ``benefits``
    sub-commands.
    """
    benefits = (
        ("Separation of Concerns", (
            "Configuration separate from code",
            "Easy to version control configs",
            "No need to modify scripts for different experiments",
        )),
        ("Flexibility", (
            "Can override YAML values with CLI args",
            "Multiple configs for different experiments",
            "Easy to share and reproduce experiments",
        )),
        ("Maintainability", (
            "All parameters in one place",
            "Easy to understand and modify",
            "Self-documenting configurations",
        )),
        ("Scalability", (
            "Easy to add new parameters",
            "Hierarchical configuration structure",
            "Support for complex nested configurations",
        )),
    )

    # The processor script is the one that parses these flags, so the
    # examples must invoke it rather than this wrapper.
    processor = "python pipelines/classification/data_processor.py"
    usage_examples = (
        ("1. Use YAML config only:",
         f"{processor} --config configs/classification/emotion.yaml"),
        ("2. Override YAML values:",
         f"{processor} --config configs/classification/emotion.yaml --max-samples 500"),
        ("3. Use CLI only (backward compatibility):",
         f"{processor} --data-source huggingface --dataset-name dair-ai/emotion"),
    )

    print("=== YAML Configuration Benefits ===")
    print()
    for number, (title, points) in enumerate(benefits, start=1):
        print(f"{number}. **{title}**:")
        for point in points:
            print(f" - {point}")
        print()

    print("=== Usage Examples ===")
    for heading, command in usage_examples:
        # Blank line before each example; none after the last one.
        print()
        print(heading)
        print(f" {command}")
def main():
    """Dispatch on the first command-line argument.

    Recognised sub-commands are ``examples``, ``create-config`` and
    ``benefits``. With no argument an overview and usage summary is printed.

    Returns:
        ``0`` after the overview or a recognised sub-command, ``1`` when the
        first argument is not a recognised sub-command, so that shells and
        CI jobs can detect misuse.
    """
    args = sys.argv[1:]

    if not args:
        print("YAML-Based Data Processor")
        print("========================")
        print()
        print("This script demonstrates using YAML configurations instead of")
        print("command-line arguments for better flexibility and maintainability.")
        print()
        print("Usage:")
        print(" python scripts/run_data_processor_yaml.py examples # Run examples")
        print(" python scripts/run_data_processor_yaml.py create-config # Create custom config")
        print(" python scripts/run_data_processor_yaml.py benefits # Show YAML benefits")
        print()
        print("Benefits of YAML configurations:")
        print(" ✅ Easier to manage complex configurations")
        print(" ✅ Version control friendly")
        print(" ✅ Self-documenting")
        print(" ✅ Can still override with CLI args")
        print(" ✅ Better for team collaboration")
        return 0

    # Only the first argument is inspected; anything after it is ignored.
    option = args[0]
    if option == "examples":
        run_classification_examples()
    elif option == "create-config":
        create_custom_config()
    elif option == "benefits":
        show_yaml_benefits()
    else:
        print(f"Unknown option: {option}")
        print("Use: python scripts/run_data_processor_yaml.py [examples|create-config|benefits]")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    sys.exit(main())