# DS-LLM-TEMPLATE-FINETUNING/scripts/classification/data_processor.py

#!/usr/bin/env python3
"""
Classification Data Processor Script
Uses YAML configurations for flexible and maintainable data processing.
"""

import sys
import os
import subprocess
import argparse
from pathlib import Path

def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification pipeline with a YAML config plus CLI overrides.

    The pipeline script ``pipelines/classification/data_processor.py`` is
    launched as a child process; the path is relative, so it is resolved
    against the current working directory.

    Args:
        config_path: Path to the YAML configuration file, forwarded as
            ``--config``.
        **cli_overrides: Extra options forwarded to the pipeline. Keys are
            converted from ``snake_case`` to ``--kebab-case``. ``None``
            values are skipped, ``True`` is sent as a bare switch, ``False``
            is omitted, and any other value is stringified and passed after
            its flag.

    Returns:
        True if the pipeline exited with status 0, False if it exited with a
        non-zero status or could not be started at all.
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    # Reuse the interpreter running this script so the child sees the same
    # environment; a bare "python" may be absent or a different installation.
    cmd = [
        sys.executable or "python", "pipelines/classification/data_processor.py",
        "--config", str(config_path),
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None:
            continue
        flag = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # Boolean options are switches: "--flag True" would be rejected
            # by a store_true argument, so emit the bare flag or nothing.
            if value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        if e.stdout:
            # Whatever the child printed before failing is often the best clue.
            print(e.stdout)
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    if result.stderr:
        # stderr was captured too; surface it so warnings are not lost.
        print(result.stderr)
    return True

def run_emotion_example():
    """Demonstrate the processor on the emotion classification config.

    Invokes ``run_with_yaml_config`` with
    ``configs/classification/emotion.yaml``, overriding ``max_samples`` and
    ``output_dir``, then prints a one-line success or failure message.
    """
    print("=== Emotion Classification Example ===")

    # Keyword overrides passed alongside the YAML config.
    overrides = {
        "max_samples": 500,
        "output_dir": "./data/emotion_small",
    }
    ok = run_with_yaml_config("configs/classification/emotion.yaml", **overrides)

    print(
        "✅ Emotion classification data processing completed!"
        if ok
        else "❌ Emotion classification failed!"
    )

def run_custom_example():
    """Demonstrate the processor on a local custom dataset, if one exists.

    When ``data/classification/train.jsonl`` is absent (relative to the
    current working directory) the example is skipped with a warning.
    Otherwise ``run_with_yaml_config`` is invoked with
    ``configs/classification/custom.yaml`` and a success or failure message
    is printed.
    """
    print("\n=== Custom Dataset Example ===")

    train_file = "data/classification/train.jsonl"

    # Guard clause: nothing to process without the training file.
    if not os.path.exists(train_file):
        print("⚠️  Custom dataset not found, skipping...")
        return

    ok = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_source="custom",
        data_path=train_file,
        output_dir="./data/custom_processed",
    )

    if ok:
        outcome = "✅ Custom dataset processing completed!"
    else:
        outcome = "❌ Custom dataset processing failed!"
    print(outcome)

def create_custom_config():
    """Write a starter YAML config to ``configs/classification/custom.yaml``.

    The path is relative to the current working directory. Missing parent
    directories are created first, and an existing file at that path is
    overwritten. The file is written as UTF-8.
    """
    custom_config = """task:
  name: "classification"
  type: "sequence_classification"

data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1

processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000

output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""

    config_path = "configs/classification/custom.yaml"

    # Without this, open() raises FileNotFoundError on a fresh checkout where
    # the configs/classification directory has not been created yet.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)

    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")

def show_usage():
    """Print numbered example invocations of this script to stdout."""
    script = "python scripts/classification/data_processor.py"
    emotion_cfg = "configs/classification/emotion.yaml"

    # Each entry is one output line; empty strings are the blank separators.
    usage_lines = [
        "=== Classification Data Processor Usage ===",
        "",
        "1. Use YAML config only:",
        f"   {script} --config {emotion_cfg}",
        "",
        "2. Override YAML values:",
        f"   {script} --config {emotion_cfg} --max-samples 500",
        "",
        "3. Use CLI only (backward compatibility):",
        f"   {script} --data-source huggingface --dataset-name dair-ai/emotion",
        "",
        "4. Run examples:",
        f"   {script} examples",
        "",
        "5. Create custom config:",
        f"   {script} create-config",
    ]
    print("\n".join(usage_lines))

def handle_direct_args(argv=None):
    """Parse pipeline-style CLI options and forward them to the pipeline.

    The options are parsed locally (so typos are rejected before a child
    process is started) and then re-emitted on the command line of
    ``pipelines/classification/data_processor.py``, which is resolved
    relative to the current working directory.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        True if the pipeline exited with status 0, False if it exited with a
        non-zero status or could not be started at all.

    Raises:
        SystemExit: Raised by argparse on invalid options or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Classification Data Processor")

    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    # NOTE(review): because of default="INFO", --log-level is always forwarded,
    # even when the user did not pass it — confirm that is intended if the
    # YAML config can also set a log level.
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args(argv)

    # Reuse the interpreter running this script so the child sees the same
    # environment; a bare "python" may be absent or a different installation.
    cmd = [sys.executable or "python", "pipelines/classification/data_processor.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # store_true switches: bare flag when set, nothing otherwise
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        if e.stdout:
            # Whatever the child printed before failing is often the best clue.
            print(e.stdout)
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    if result.stderr:
        # stderr was captured too; surface it so warnings are not lost.
        print(result.stderr)
    return True

def main():
    """Dispatch on the first command-line argument and return an exit status.

    With no arguments an overview is printed. The words ``examples``,
    ``emotion``, ``custom``, ``create-config`` and ``help`` select the
    matching helper; anything else is treated as pipeline options and handed
    to ``handle_direct_args``.

    Returns:
        0 normally, 1 when ``handle_direct_args`` reports that the pipeline
        run failed. The example subcommands do not report a result, so they
        always yield 0.
    """
    # Check if any command-line arguments were provided
    if len(sys.argv) <= 1:
        overview = (
            "Classification Data Processor",
            "============================",
            "",
            "This script processes classification datasets using YAML configurations.",
            "",
            "Usage:",
            "  python scripts/classification/data_processor.py examples     # Run examples",
            "  python scripts/classification/data_processor.py emotion      # Run emotion example",
            "  python scripts/classification/data_processor.py custom       # Run custom example",
            "  python scripts/classification/data_processor.py create-config # Create custom config",
            "  python scripts/classification/data_processor.py help         # Show usage",
            "",
            "Direct pipeline usage:",
            "  python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
            "  python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
            "",
            "Benefits of YAML configurations:",
            "  ✅ Easier to manage complex configurations",
            "  ✅ Version control friendly",
            "  ✅ Self-documenting",
            "  ✅ Can still override with CLI args",
            "  ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return 0

    command = sys.argv[1]

    # Handle subcommands
    if command == "examples":
        run_emotion_example()
        run_custom_example()
    elif command == "emotion":
        run_emotion_example()
    elif command == "custom":
        run_custom_example()
    elif command == "create-config":
        create_custom_config()
    elif command == "help":
        show_usage()
    else:
        # Handle direct arguments (pass through to pipeline). Propagate a
        # failed run as a non-zero status so shells and CI can detect it.
        if handle_direct_args() is False:
            return 1

    return 0

if __name__ == "__main__":
    sys.exit(main())