DS-LLM-TEMPLATE-FINETUNING/scripts/styling/data_processor.py

#!/usr/bin/env python3
"""
Styling data processor script that uses YAML configurations.
This provides a flexible and maintainable approach for style transfer tasks.
"""

import sys
import os
import subprocess
import argparse
from pathlib import Path

def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the styling pipeline script with a YAML config plus CLI overrides.

    Builds a command line for ``pipelines/styling/data_processor.py``, runs it
    as a child process with its output captured, and prints the captured
    stdout when the child succeeds.

    Args:
        config_path: Value passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra pipeline options. A keyword ``some_key`` becomes
            the option ``--some-key``. ``None`` values are skipped. Booleans
            are treated as switches: ``True`` emits the bare flag and ``False``
            emits nothing. Any other value is passed as
            ``--some-key <str(value)>``.

    Returns:
        True if the child exited with status 0; False if it exited with a
        non-zero status or could not be started at all.
    """
    print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")

    # sys.executable keeps the child on the interpreter (and virtualenv) that
    # is running this script; a bare "python" may be absent from PATH or may
    # resolve to a different installation.
    cmd = [
        sys.executable, "pipelines/styling/data_processor.py",
        "--config", str(config_path),
    ]

    for key, value in cli_overrides.items():
        if value is None:
            continue
        flag = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # The boolean options declared by the CLI parser in this file are
            # store_true switches, which take no value: "--flag True" would be
            # rejected as an unrecognized argument.
            if value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched (missing, not
        # executable, ...). Report it like any other failed run.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True

def run_styling_examples():
    """Run the bundled styling examples through ``run_with_yaml_config``.

    Example 1 always runs the formal-style config with two overrides.
    Example 2 runs only when ``data/raw/styling/custom_dataset.jsonl`` exists
    relative to the current working directory; otherwise it is skipped with a
    notice.

    Returns:
        True if every example that was actually run succeeded, False if at
        least one failed. A skipped example does not count as a failure.
    """
    all_succeeded = True

    # Example 1: Formal style transfer
    print("=== Example 1: Formal Style Transfer ===")
    success = run_with_yaml_config(
        "configs/styling/formal.yaml",
        max_samples=1000,  # Override YAML value
        output_format="alpaca",
    )
    if success:
        print("✅ Formal style transfer completed!")
    else:
        all_succeeded = False

    # Example 2: Custom styling dataset (if available)
    print("\n=== Example 2: Custom Styling Dataset ===")
    custom_dataset = "data/raw/styling/custom_dataset.jsonl"
    if not os.path.exists(custom_dataset):
        print("⚠️  Custom styling dataset not found, skipping...")
        print("   You can create one with the 'create-sample-data' option")
        return all_succeeded

    success = run_with_yaml_config(
        "configs/styling/formal.yaml",  # Use formal config as base
        data_source="custom",
        data_path=custom_dataset,
        instruction="Rewrite the following text in a casual, friendly style",
        output_dir="./data/processed/styling/casual",
    )
    if success:
        print("✅ Custom styling dataset processing completed!")
    else:
        all_succeeded = False

    return all_succeeded

def create_sample_styling_data():
    """Write a small JSONL dataset of text / styled_text pairs for testing.

    The file is written to ``data/raw/styling/sample_formal.jsonl`` relative
    to the current working directory; missing parent directories are created
    and an existing file is overwritten. Every line is one JSON object with
    the keys ``text`` and ``styled_text``. A short summary is printed once the
    file has been written.
    """
    import json

    # (original text, restyled text) pairs; turned into dicts below.
    pairs = (
        ("Hey, what's up? How are you doing today?",
         "Hello, how are you doing today?"),
        ("This is really cool stuff!",
         "This is quite impressive material."),
        ("I'm gonna go to the store later.",
         "I will go to the store later."),
        ("What's the deal with this?",
         "What is the situation regarding this matter?"),
        ("That's totally awesome!",
         "That is quite remarkable!"),
    )
    records = [{"text": source, "styled_text": styled} for source, styled in pairs]

    target_dir = Path("data/raw/styling")
    target_dir.mkdir(parents=True, exist_ok=True)
    sample_file = target_dir / "sample_formal.jsonl"

    # One JSON document per line; ensure_ascii=False keeps non-ASCII text readable.
    with sample_file.open('w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

    summary = (
        f"✅ Created sample styling dataset: {sample_file}",
        f"   Contains {len(records)} examples",
        "   Format: text → styled_text",
        "   Ready to use with configs/styling/formal.yaml",
    )
    for message in summary:
        print(message)

def create_custom_styling_config():
    """Write a ready-made "professional business style" YAML config to disk.

    Creates ``configs/styling/`` relative to the current working directory if
    it does not exist, then writes ``professional.yaml`` there, overwriting
    any existing file. Prints the path of the file written. Returns nothing.
    """
    # NOTE: learning_rate is spelled "3.0e-5" rather than "3e-5". YAML 1.1
    # loaders such as PyYAML only resolve exponent notation to a float when
    # the mantissa contains a decimal point; "3e-5" would be loaded as the
    # string "3e-5". The dotted form is a float under YAML 1.1 and 1.2 alike.
    custom_config = """task:
  name: "styling"
  type: "style_transfer"

data:
  source: "custom"
  input_field: "text"
  output_field: "styled_text"
  instruction: "Rewrite the following text in a professional business style"
  data_format: "jsonl"
  max_length: 512
  min_length: 10
  clean_text: true
  lowercase: false
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "alpaca"
  output_dir: "./data/processed/styling/professional"

model:
  name: "t5-base"
  max_length: 512

training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 3.0e-5
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler_type: "linear"

inference:
  batch_size: 32
  max_new_tokens: 128
  temperature: 0.8
"""

    config_path = "configs/styling/professional.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding so the file does not depend on the platform default.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom styling config: {config_path}")
    print("   This config is set up for professional business style transfer")

def handle_direct_args(argv=None):
    """Parse pipeline options and forward them to the styling pipeline script.

    The options declared here mirror the ones forwarded to
    ``pipelines/styling/data_processor.py``. Every option that was supplied
    (or has a default, such as ``--log-level``) is passed on to the child
    process; ``store_true`` switches are forwarded only when set.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour callers that pass
            no argument get.

    Returns:
        True if the child exited with status 0; False if it exited with a
        non-zero status or could not be started at all.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Styling Data Processor")

    # Add all the same arguments as the styling pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--output-field", type=str, help="Output field name")
    parser.add_argument("--instruction", type=str, help="Style instruction")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # HuggingFace dataset options
    parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset")
    parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args(argv)

    # sys.executable keeps the child on the interpreter (and virtualenv) that
    # is running this script; a bare "python" may be absent from PATH or may
    # resolve to a different installation.
    cmd = [sys.executable, "pipelines/styling/data_processor.py"]

    # Rebuild the option list from the parsed namespace. argparse stores
    # "--some-opt" under the attribute "some_opt", so map it back.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue  # option not supplied and has no default
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # store_true switch: bare flag, only when set
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched; report it like any
        # other failed run instead of crashing with a traceback.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True

def show_styling_features():
    """Print the feature overview and usage examples for this script.

    Output only: writes a fixed block of text to stdout and returns nothing.
    An empty string in the table below stands for a blank output line.
    """
    overview = (
        "=== Styling Data Processor Features ===",
        "",
        "1. **Style Transfer Tasks**:",
        "   - Formal vs. Informal style",
        "   - Professional vs. Casual tone",
        "   - Academic vs. Conversational",
        "   - Any custom style instruction",
        "",
        "2. **Data Formats Supported**:",
        "   - HuggingFace datasets",
        "   - Custom JSONL/CSV/JSON files",
        "   - Automatic train/validation/test splits",
        "",
        "3. **Output Formats**:",
        "   - Raw styling format (input/output)",
        "   - Alpaca format (instruction/input/output)",
        "   - HuggingFace dataset format",
        "",
        "4. **Advanced Features**:",
        "   - Configurable field mapping",
        "   - Text preprocessing options",
        "   - Automatic dataset saving/loading",
        "   - YAML configuration support",
        "",
    )
    usage = (
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        "   python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
        "",
        "2. Override YAML values:",
        "   python scripts/styling/data_processor.py --config configs/styling/formal.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        "   python scripts/styling/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        "   python scripts/styling/data_processor.py create-config",
    )
    # A single print of the joined lines emits the same text as one print
    # call per line would.
    print("\n".join(overview + usage))

def _print_usage():
    """Print the help text shown when the script is run without arguments."""
    print("Styling Data Processor")
    print("=====================")
    print()
    print("This script runs the styling data processor for style transfer tasks.")
    print("It supports both YAML configurations and command-line overrides.")
    print()
    print("Usage:")
    print("  python scripts/styling/data_processor.py examples           # Run examples")
    print("  python scripts/styling/data_processor.py create-sample-data # Create sample dataset")
    print("  python scripts/styling/data_processor.py create-config      # Create custom config")
    print("  python scripts/styling/data_processor.py features           # Show features")
    print()
    print("Direct pipeline usage:")
    print("  python scripts/styling/data_processor.py --config configs/styling/formal.yaml")
    print("  python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl")
    print()
    print("Key Features:")
    print("  ✅ Style transfer with custom instructions")
    print("  ✅ Multiple data source support")
    print("  ✅ YAML configuration files")
    print("  ✅ CLI argument overrides")
    print("  ✅ Automatic data splitting")
    print("  ✅ HuggingFace dataset export")


def main():
    """Dispatch to a subcommand, or forward raw options to the pipeline.

    Reads ``sys.argv``:

    * no arguments: print the usage text;
    * first argument is ``examples``, ``create-sample-data``,
      ``create-config`` or ``features``: run that subcommand;
    * anything else: treat the arguments as pipeline options and hand them to
      ``handle_direct_args``.

    Returns:
        A process exit status: 1 when the selected handler returned ``False``,
        otherwise 0.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        _print_usage()
        return 0

    subcommands = {
        "examples": run_styling_examples,
        "create-sample-data": create_sample_styling_data,
        "create-config": create_custom_styling_config,
        "features": show_styling_features,
    }
    # Anything that is not a known subcommand is passed through to the pipeline.
    handler = subcommands.get(cli_args[0], handle_direct_args)

    # Handlers that report nothing (return None) count as success; only an
    # explicit False is a failure, so a failed pipeline run is visible to
    # shells and CI through the exit status.
    return 1 if handler() is False else 0


if __name__ == "__main__":
    sys.exit(main())