#!/usr/bin/env python3
"""
Data processor script that uses YAML configurations.

This provides a more flexible and maintainable approach than command-line
arguments alone.
"""

import sys
import os
import subprocess
from pathlib import Path

from utils.config.config_manager import ConfigManager


def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run the classification data processor in a subprocess with a YAML config.

    Args:
        config_path: Path of the YAML file, forwarded to the processor
            as ``--config <config_path>``.
        **cli_overrides: Extra options forwarded as ``--option-name value``.
            Underscores in the keyword name become hyphens and the value is
            converted with ``str()``. Overrides whose value is ``None`` are
            skipped.

    Returns:
        ``True`` if the processor exited with status 0; ``False`` if it
        exited with a non-zero status or could not be started at all.
    """
    print(f"=== Running with YAML config: {config_path} ===")

    cmd = [
        # Launch the child with the interpreter that is running this script;
        # a bare "python" may be missing from PATH or be a different install.
        sys.executable or "python",
        "pipelines/classification/data_processor.py",
        "--config",
        str(config_path),
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        # Output is captured, so anything the child printed before failing
        # would otherwise be lost.
        if e.stdout:
            print(f"Output: {e.stdout}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The process could not be started (e.g. interpreter not found).
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True


def run_classification_examples() -> bool:
    """Run the classification examples with YAML configs.

    Example 1 always runs; example 2 runs only when
    ``data/classification/train.jsonl`` exists.

    Returns:
        ``True`` if every example that was attempted succeeded, ``False``
        otherwise. A skipped example does not count as a failure.
    """
    all_succeeded = True

    # Example 1: Emotion classification
    print("=== Example 1: Emotion Classification ===")
    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        max_samples=500,  # Override YAML value
        output_dir="./data/emotion_small",
    )

    if success:
        print("✅ Emotion classification completed!")
    else:
        all_succeeded = False

    # Example 2: Custom dataset (if available)
    print("\n=== Example 2: Custom Dataset ===")
    if os.path.exists("data/classification/train.jsonl"):
        success = run_with_yaml_config(
            "configs/classification/custom.yaml",  # You can create this
            data_source="custom",
            data_path="data/classification/train.jsonl",
            output_dir="./data/custom_processed",
        )

        if success:
            print("✅ Custom dataset processing completed!")
        else:
            all_succeeded = False
    else:
        print("⚠️ Custom dataset not found, skipping...")

    return all_succeeded


def create_custom_config() -> None:
    """Write a sample custom configuration to configs/classification/custom.yaml.

    The parent directory is created if it does not exist. An existing file at
    that path is overwritten.
    """
    # NOTE(review): the newlines and nesting of this YAML were reconstructed
    # from a whitespace-collapsed source -- confirm against the schema the
    # data processor expects.
    custom_config = """task:
  name: "classification"
  type: "sequence_classification"
data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000
output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""

    config_path = "configs/classification/custom.yaml"
    # open(..., 'w') fails with FileNotFoundError when the directory is
    # missing, so make sure it exists first.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")


def show_yaml_benefits() -> None:
    """Print the benefits of YAML configurations followed by usage examples."""
    print("=== YAML Configuration Benefits ===")
    print()
    print("1. **Separation of Concerns**:")
    print(" - Configuration separate from code")
    print(" - Easy to version control configs")
    print(" - No need to modify scripts for different experiments")
    print()
    print("2. **Flexibility**:")
    print(" - Can override YAML values with CLI args")
    print(" - Multiple configs for different experiments")
    print(" - Easy to share and reproduce experiments")
    print()
    print("3. **Maintainability**:")
    print(" - All parameters in one place")
    print(" - Easy to understand and modify")
    print(" - Self-documenting configurations")
    print()
    print("4. **Scalability**:")
    print(" - Easy to add new parameters")
    print(" - Hierarchical configuration structure")
    print(" - Support for complex nested configurations")
    print()
    print("=== Usage Examples ===")
    print()
    print("1. Use YAML config only:")
    print(" python scripts/run_data_processor_yaml.py --config configs/classification/emotion.yaml")
    print()
    print("2. Override YAML values:")
    print(" python scripts/run_data_processor_yaml.py --config configs/classification/emotion.yaml --max-samples 500")
    print()
    print("3. Use CLI only (backward compatibility):")
    print(" python scripts/run_data_processor_yaml.py --data-source huggingface --dataset-name dair-ai/emotion")


def main() -> int:
    """Dispatch on the first command-line argument.

    Recognised arguments are ``examples``, ``create-config`` and
    ``benefits``. With no argument an overview and usage text is printed.

    Returns:
        Process exit status: 0 on success, 1 when the option is unknown or
        when an example run fails.
    """
    if len(sys.argv) > 1:
        option = sys.argv[1]
        if option == "examples":
            return 0 if run_classification_examples() else 1
        if option == "create-config":
            create_custom_config()
            return 0
        if option == "benefits":
            show_yaml_benefits()
            return 0

        print(f"Unknown option: {option}")
        print("Use: python scripts/run_data_processor_yaml.py [examples|create-config|benefits]")
        return 1

    print("YAML-Based Data Processor")
    print("========================")
    print()
    print("This script demonstrates using YAML configurations instead of")
    print("command-line arguments for better flexibility and maintainability.")
    print()
    print("Usage:")
    print(" python scripts/run_data_processor_yaml.py examples # Run examples")
    print(" python scripts/run_data_processor_yaml.py create-config # Create custom config")
    print(" python scripts/run_data_processor_yaml.py benefits # Show YAML benefits")
    print()
    print("Benefits of YAML configurations:")
    print(" ✅ Easier to manage complex configurations")
    print(" ✅ Version control friendly")
    print(" ✅ Self-documenting")
    print(" ✅ Can still override with CLI args")
    print(" ✅ Better for team collaboration")
    return 0


if __name__ == "__main__":
    # Propagate failures to the shell instead of always exiting with 0.
    sys.exit(main())