#!/usr/bin/env python3
"""
Classification Data Processor Script

Uses YAML configurations for flexible and maintainable data processing.

This file is a thin command-line wrapper: every code path that does real work
launches ``pipelines/classification/data_processor.py`` as a child process and
reports whether it succeeded.
"""

import argparse
import os
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List

# Pipeline entry point, relative to the current working directory.
_PIPELINE_SCRIPT = "pipelines/classification/data_processor.py"

# Location written by create_custom_config() and read by run_custom_example().
_CUSTOM_CONFIG_PATH = "configs/classification/custom.yaml"

# Template written by create_custom_config().
# NOTE(review): the section nesting below was reconstructed from a
# whitespace-collapsed copy of this file -- confirm it against the schema the
# pipeline expects.
_CUSTOM_CONFIG_YAML = """task:
  name: "classification"
  type: "sequence_classification"

data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1

processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000

output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""


def _pipeline_command() -> List[str]:
    """Return the argv prefix that launches the pipeline script."""
    # Use the interpreter running this script so the child process sees the
    # same virtualenv; a bare "python" may be another interpreter or missing.
    return [sys.executable or "python", _PIPELINE_SCRIPT]


def _build_cli_args(options: Dict[str, Any]) -> List[str]:
    """Translate ``{option_name: value}`` pairs into command-line arguments.

    ``None`` values are dropped, ``True`` becomes a bare ``--flag``, ``False``
    is dropped, and anything else becomes ``--option-name value``.
    Underscores in option names are converted to hyphens.
    """
    cli_args: List[str] = []
    for name, value in options.items():
        if value is None:
            continue
        flag = f"--{name.replace('_', '-')}"
        if isinstance(value, bool):
            # Boolean options are bare switches: present when True, absent
            # when False.
            if value:
                cli_args.append(flag)
        else:
            cli_args.extend([flag, str(value)])
    return cli_args


def _run_pipeline(cmd: List[str]) -> bool:
    """Run *cmd*, print its captured output, and return True on success.

    The child's stdout is printed only after it exits because its output is
    captured. On a non-zero exit status the child's stderr is printed instead.
    """
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started.
        print(f"❌ Error running data processor: {e}")
        return False
    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True


def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run data processor with YAML configuration.

    Args:
        config_path: Path passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra pipeline options keyed by their Python-style
            name (``max_samples`` -> ``--max-samples``). ``None`` values are
            skipped; booleans are emitted as bare flags when true, matching
            how handle_direct_args() forwards them.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    cmd = _pipeline_command() + ["--config", config_path]
    cmd.extend(_build_cli_args(cli_overrides))

    print(f"Command: {' '.join(cmd)}")
    print()

    return _run_pipeline(cmd)


def run_emotion_example() -> bool:
    """Run emotion classification example.

    Returns:
        True if the pipeline succeeded, False otherwise.
    """
    print("=== Emotion Classification Example ===")

    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        max_samples=500,  # Override YAML value
        output_dir="./data/emotion_small",
    )

    if success:
        print("✅ Emotion classification data processing completed!")
    else:
        print("❌ Emotion classification failed!")
    return success


def run_custom_example() -> bool:
    """Run custom dataset example.

    The example is skipped when ``data/classification/train.jsonl`` does not
    exist.

    Returns:
        False if the pipeline ran and failed; True if it succeeded or the
        example was skipped (a missing optional dataset is not a failure).
    """
    print("\n=== Custom Dataset Example ===")

    data_path = "data/classification/train.jsonl"
    if not os.path.exists(data_path):
        print("⚠️ Custom dataset not found, skipping...")
        return True

    success = run_with_yaml_config(
        _CUSTOM_CONFIG_PATH,
        data_source="custom",
        data_path=data_path,
        output_dir="./data/custom_processed",
    )

    if success:
        print("✅ Custom dataset processing completed!")
    else:
        print("❌ Custom dataset processing failed!")
    return success


def create_custom_config():
    """Create a custom configuration file.

    Writes the template to ``configs/classification/custom.yaml`` (relative to
    the current working directory), overwriting any existing file. The parent
    directory is created if it is missing.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    config_path = _CUSTOM_CONFIG_PATH
    # open(..., 'w') does not create intermediate directories, so create them
    # first; otherwise a fresh checkout fails with FileNotFoundError.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(_CUSTOM_CONFIG_YAML)

    print(f"✅ Created custom config: {config_path}")


def show_usage():
    """Show usage examples."""
    print("=== Classification Data Processor Usage ===")
    print()
    print("1. Use YAML config only:")
    print("   python scripts/classification/data_processor.py --config configs/classification/emotion.yaml")
    print()
    print("2. Override YAML values:")
    print("   python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500")
    print()
    print("3. Use CLI only (backward compatibility):")
    print("   python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion")
    print()
    print("4. Run examples:")
    print("   python scripts/classification/data_processor.py examples")
    print()
    print("5. Create custom config:")
    print("   python scripts/classification/data_processor.py create-config")


def handle_direct_args() -> bool:
    """Handle direct command-line arguments by passing them to the pipeline.

    Parses ``sys.argv`` with the option set declared below and forwards every
    option that has a value (including the ``--log-level`` default) to the
    pipeline script.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Classification Data Processor")

    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Build command to call the pipeline, adding all arguments that were
    # provided (store_true flags are added only when set).
    cmd = _pipeline_command() + _build_cli_args(vars(args))

    print(f"Running: {' '.join(cmd)}")
    print()

    return _run_pipeline(cmd)


def _print_overview():
    """Print the banner shown when the script is run without arguments."""
    print("Classification Data Processor")
    print("============================")
    print()
    print("This script processes classification datasets using YAML configurations.")
    print()
    print("Usage:")
    print("  python scripts/classification/data_processor.py examples       # Run examples")
    print("  python scripts/classification/data_processor.py emotion        # Run emotion example")
    print("  python scripts/classification/data_processor.py custom         # Run custom example")
    print("  python scripts/classification/data_processor.py create-config  # Create custom config")
    print("  python scripts/classification/data_processor.py help           # Show usage")
    print()
    print("Direct pipeline usage:")
    print("  python scripts/classification/data_processor.py --config configs/classification/emotion.yaml")
    print("  python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion")
    print()
    print("Benefits of YAML configurations:")
    print("  ✅ Easier to manage complex configurations")
    print("  ✅ Version control friendly")
    print("  ✅ Self-documenting")
    print("  ✅ Can still override with CLI args")
    print("  ✅ Better for team collaboration")


def main() -> int:
    """Main function.

    Dispatches on the first command-line argument: one of the subcommands
    ``examples``, ``emotion``, ``custom``, ``create-config`` or ``help``;
    anything else is parsed as pipeline options and passed through. With no
    arguments an overview is printed.

    Returns:
        Process exit status: 0 on success, 1 if a pipeline run failed.
    """
    # Check if any command-line arguments were provided
    if len(sys.argv) <= 1:
        _print_overview()
        return 0

    command = sys.argv[1]
    if command == "examples":
        # Run both examples even if the first one fails, then report.
        emotion_ok = run_emotion_example()
        custom_ok = run_custom_example()
        succeeded = emotion_ok and custom_ok
    elif command == "emotion":
        succeeded = run_emotion_example()
    elif command == "custom":
        succeeded = run_custom_example()
    elif command == "create-config":
        create_custom_config()
        succeeded = True
    elif command == "help":
        show_usage()
        succeeded = True
    else:
        # Handle direct arguments (pass through to pipeline)
        succeeded = handle_direct_args()

    return 0 if succeeded else 1


if __name__ == "__main__":
    # Propagate pipeline failures to the shell instead of always exiting 0.
    sys.exit(main())