# DS-LLM-TEMPLATE-FINETUNING/scripts/instruct/data_processor.py

#!/usr/bin/env python3
"""
Instruct data processor script that uses YAML configurations.
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
"""

import sys
import os
import subprocess
import argparse
from pathlib import Path

def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run the instruct pipeline script with a YAML configuration.

    Builds a command line for ``pipelines/instruct/data_processor.py`` (a path
    relative to the current working directory), appends the overrides as
    ``--key value`` pairs and runs it as a child process, capturing its
    output.

    Args:
        config_path: Path to the YAML configuration file, forwarded to the
            pipeline as ``--config``.
        **cli_overrides: Extra pipeline options. Underscores in the keyword
            name become dashes (``max_samples`` -> ``--max-samples``) and the
            value is passed through ``str()``. Entries whose value is ``None``
            are skipped.

    Returns:
        ``True`` if the child process exited with status 0; ``False`` if it
        exited with a non-zero status or could not be started at all.
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" may resolve to a different environment (or to nothing
    # at all on systems that only ship "python3").
    cmd = [
        sys.executable or "python", "pipelines/instruct/data_processor.py",
        "--config", str(config_path),
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # The pipeline started but reported failure through its exit status.
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be executed (missing, not
        # executable, ...); report it like any other failure instead of
        # letting the exception escape.
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True

def run_instruct_examples():
    """Run the two bundled example jobs through ``run_with_yaml_config``.

    Example 1 always runs with the code-reasoning config and two overrides.
    Example 2 reuses the same config for a custom conversation file and is
    skipped, with a hint, when that file does not exist.
    """
    base_config = "configs/instruct/code_reasoning.yaml"
    custom_dataset = "data/raw/instruct/custom_conversations.jsonl"

    # Example 1: code reasoning instruction tuning (overrides the YAML values).
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    code_reasoning_ok = run_with_yaml_config(
        base_config,
        max_samples=1000,
        output_format="conversation",
    )
    if code_reasoning_ok:
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: custom conversation dataset, only if the file is present.
    print("\n=== Example 2: Custom Conversation Dataset ===")
    if not os.path.exists(custom_dataset):
        # NOTE(review): create_sample_instruct_data() in this file writes
        # data/raw/instruct/code_reasoning.jsonl, not the file checked here,
        # so the hint below may not produce this dataset - confirm.
        print("⚠️  Custom conversation dataset not found, skipping...")
        print("   You can create one with the 'create-sample-data' option")
        return

    custom_ok = run_with_yaml_config(
        base_config,  # the code_reasoning config serves as the base
        data_source="custom",
        data_path=custom_dataset,
        output_dir="./data/processed/instruct/custom",
    )
    if custom_ok:
        print("✅ Custom conversation dataset processing completed!")

def create_sample_instruct_data():
    """Write a small sample conversation dataset for testing.

    Creates ``data/raw/instruct`` (relative to the current working directory)
    if needed and writes ``code_reasoning.jsonl`` there: one JSON object per
    line, each holding a ``conversation`` list with one user message followed
    by one assistant message. Prints a short summary afterwards.
    """
    import json

    # (user question, assistant answer) pairs; expanded into records below.
    exchanges = [
        (
            "Can you explain what a Python list comprehension is?",
            "A Python list comprehension is a concise way to create lists. It allows you to generate a new list by applying an expression to each item in an existing iterable, optionally filtering items with a condition. The basic syntax is: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers from 0 to 8.",
        ),
        (
            "How do I handle exceptions in Python?",
            "In Python, you handle exceptions using try-except blocks. Here's the basic structure:\n\ntry:\n    # Code that might raise an exception\n    risky_code()\nexcept SpecificException as e:\n    # Handle specific exception\n    print(f'Error: {e}')\nexcept Exception as e:\n    # Handle any other exception\n    print(f'Unexpected error: {e}')\nelse:\n    # Runs if no exception occurred\n    print('Success!')\nfinally:\n    # Always runs, regardless of exceptions\n    cleanup_code()\n\nYou can catch multiple specific exceptions and handle them differently.",
        ),
        (
            "What's the difference between a function and a method in Python?",
            "In Python, the main differences between functions and methods are:\n\n1. **Definition location**: Functions are defined independently, while methods are defined inside classes.\n\n2. **Calling**: Functions are called directly by name, methods are called on objects using dot notation.\n\n3. **First parameter**: Methods automatically receive 'self' as the first parameter (the instance they're called on), functions don't have this automatic parameter.\n\nExample:\n```python\n# Function\ndef my_function(x):\n    return x * 2\n\n# Method (inside a class)\nclass MyClass:\n    def my_method(self, x):\n        return x * 2\n\n# Usage\nresult1 = my_function(5)  # Function call\nobj = MyClass()\nresult2 = obj.my_method(5)  # Method call\n```",
        ),
        (
            "How can I optimize a slow Python loop?",
            "Here are several strategies to optimize slow Python loops:\n\n1. **Use list comprehensions** instead of explicit loops when possible\n2. **Vectorize with NumPy** for numerical operations\n3. **Use built-in functions** like map(), filter(), sum()\n4. **Avoid repeated lookups** - store frequently accessed values in variables\n5. **Use sets for membership testing** instead of lists\n6. **Consider using itertools** for efficient iteration patterns\n7. **Profile your code** with cProfile to identify bottlenecks\n8. **Use numba or Cython** for compute-intensive loops\n\nExample optimization:\n```python\n# Slow\nresult = []\nfor i in range(1000000):\n    if i % 2 == 0:\n        result.append(i ** 2)\n\n# Faster\nresult = [i ** 2 for i in range(1000000) if i % 2 == 0]\n\n# Even faster with NumPy\nimport numpy as np\narr = np.arange(1000000)\nresult = (arr[arr % 2 == 0] ** 2).tolist()\n```",
        ),
        (
            "What are Python decorators and how do I use them?",
            "Python decorators are a powerful feature that allows you to modify or extend the behavior of functions or classes without permanently modifying their code. They use the @ symbol and are placed above the function definition.\n\n**Basic decorator example:**\n```python\ndef my_decorator(func):\n    def wrapper(*args, **kwargs):\n        print('Before function call')\n        result = func(*args, **kwargs)\n        print('After function call')\n        return result\n    return wrapper\n\n@my_decorator\ndef greet(name):\n    print(f'Hello, {name}!')\n\ngreet('Alice')  # Prints before, greeting, and after messages\n```\n\n**Common use cases:**\n- Timing function execution\n- Logging\n- Authentication/authorization\n- Caching results\n- Input validation\n\n**Built-in decorators:**\n- `@property` - creates getter/setter methods\n- `@staticmethod` - methods that don't need self or cls\n- `@classmethod` - methods that receive the class as first argument\n\nDecorators make code more modular and reusable by separating concerns.",
        ),
    ]

    # Key order (role, then content) is what ends up in the serialized JSON.
    records = [
        {
            "conversation": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        }
        for question, answer in exchanges
    ]

    # Make sure the target directory exists before writing.
    target_dir = Path("data/raw/instruct")
    target_dir.mkdir(parents=True, exist_ok=True)

    # One JSON document per line (JSONL); non-ASCII text is kept unescaped.
    sample_file = target_dir / "code_reasoning.jsonl"
    with sample_file.open('w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in records
        )

    print(f"✅ Created sample conversation dataset: {sample_file}")
    print(f"   Contains {len(records)} conversation examples")
    print("   Format: conversation array with role/content pairs")
    print("   Ready to use with configs/instruct/code_reasoning.yaml")

def create_custom_instruct_config():
    """Write a ready-made "general chat" instruct configuration file.

    Creates ``configs/instruct`` (relative to the current working directory)
    if needed and writes ``general_chat.yaml`` there, overwriting any existing
    file, then prints a short confirmation.
    """
    # NOTE: learning_rate is written with an explicit decimal point. YAML 1.1
    # resolvers (PyYAML among them) only recognise scientific notation as a
    # float when the mantissa contains a ".", so a bare "2e-4" would be loaded
    # as the string "2e-4" instead of a number.
    custom_config = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2.0e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""

    config_path = "configs/instruct/general_chat.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding so the file does not depend on the platform default,
    # matching how the sample dataset is written.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom instruct config: {config_path}")
    print("   This config is set up for general chat instruction tuning")

def handle_direct_args() -> bool:
    """Parse this script's command line and forward it to the instruct pipeline.

    Arguments are read from ``sys.argv`` with ``argparse``; every option whose
    parsed value is not ``None`` is passed on to
    ``pipelines/instruct/data_processor.py`` (a path relative to the current
    working directory), which is run as a child process with captured output.
    Because ``--log-level`` has a default of ``INFO`` it is always forwarded.

    Returns:
        ``True`` if the child process exited with status 0; ``False`` if it
        exited with a non-zero status or could not be started at all.
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Options accepted here and passed through to the pipeline script.
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" may resolve to a different environment (or to nothing
    # at all on systems that only ship "python3").
    cmd = [sys.executable or "python", "pipelines/instruct/data_processor.py"]

    # Forward every option that carries a value.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            # Boolean switches are forwarded as a bare flag, and only when set.
            if arg_value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # The pipeline started but reported failure through its exit status.
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be executed; report it like any
        # other failure instead of letting the exception escape.
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True

def show_instruct_features():
    """Print the feature overview and usage examples, one line per print call."""
    # An empty string stands for a blank separator line.
    overview = (
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        "   - Code reasoning and explanation",
        "   - General conversation and chat",
        "   - Question answering",
        "   - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        "   - HuggingFace conversation datasets",
        "   - Custom JSONL/JSON files with conversation arrays",
        "   - ShareGPT format with role/content structure",
        "   - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        "   - Role validation (user/assistant/system)",
        "   - Content length and quality checks",
        "   - Conversation structure validation",
        "   - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        "   - Configurable conversation field mapping",
        "   - Text preprocessing options",
        "   - Automatic dataset saving/loading",
        "   - YAML configuration support",
        "   - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        "   python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        "   python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        "   python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        "   python scripts/instruct/data_processor.py create-config",
    )
    for text in overview:
        print(text)

def main() -> int:
    """Dispatch the command line and return a process exit status.

    With no arguments the usage overview is printed. If the first argument is
    one of the subcommands (``examples``, ``create-sample-data``,
    ``create-config``, ``features``) the matching helper runs; anything else
    is treated as pipeline options and handed to ``handle_direct_args``.

    Returns:
        ``1`` when the selected handler explicitly reports failure by
        returning ``False``, otherwise ``0``. Handlers that return nothing
        count as successful.
    """
    if len(sys.argv) <= 1:
        print("Instruct Data Processor")
        print("======================")
        print()
        print("This script runs the instruct data processor for instruction fine-tuning tasks.")
        print("It supports both YAML configurations and command-line overrides.")
        print()
        print("Usage:")
        print("  python scripts/instruct/data_processor.py examples           # Run examples")
        print("  python scripts/instruct/data_processor.py create-sample-data # Create sample dataset")
        print("  python scripts/instruct/data_processor.py create-config      # Create custom config")
        print("  python scripts/instruct/data_processor.py features           # Show features")
        print()
        print("Direct pipeline usage:")
        print("  python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml")
        print("  python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl")
        print()
        print("Key Features:")
        print("  ✅ Instruction fine-tuning with conversation data")
        print("  ✅ Multiple data source support")
        print("  ✅ YAML configuration files")
        print("  ✅ CLI argument overrides")
        print("  ✅ Conversation validation and analysis")
        print("  ✅ Compatible with Unsloth chat templates")
        return 0

    subcommands = {
        "examples": run_instruct_examples,
        "create-sample-data": create_sample_instruct_data,
        "create-config": create_custom_instruct_config,
        "features": show_instruct_features,
    }
    # Anything that is not a known subcommand is passed through to the pipeline.
    handler = subcommands.get(sys.argv[1], handle_direct_args)
    outcome = handler()

    # Only an explicit False signals failure; most handlers return None.
    return 1 if outcome is False else 0


if __name__ == "__main__":
    # Propagate failures to the shell so callers and CI can detect them.
    sys.exit(main())