#!/usr/bin/env python3
"""
Instruct Training Script
Provides a command-line interface to run the instruct training pipeline
"""

import argparse
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Mapping, Optional, Sequence

# Training entry point that this wrapper launches. The path is relative, so it
# is resolved against the current working directory of the caller.
_TRAIN_SCRIPT = "pipelines/instruct/train.py"

# Configuration used by the "example" and "quick-test" commands.
_EXAMPLE_CONFIG = "configs/instruct/code_reasoning.yaml"

# Keyword override name -> command-line flag forwarded to the training script.
# Keyword arguments that are not listed here are ignored.
_OVERRIDE_FLAGS = {
    "output_dir": "--output-dir",
    "epochs": "--epochs",
    "batch_size": "--batch-size",
    "learning_rate": "--learning-rate",
    "max_steps": "--max-steps",
}

# (section title, bullet items) printed by show_training_features().
_FEATURE_SECTIONS = (
    (
        "Model Support",
        (
            "Unsloth optimized models (4x faster)",
            "LoRA fine-tuning for efficiency",
            "Support for Qwen2.5, Llama-3.1, Mistral, Phi-3",
            "Chat template integration",
        ),
    ),
    (
        "Training Features",
        (
            "SFTTrainer with conversation data",
            "Response-only training (train only on assistant responses)",
            "ShareGPT format standardization",
            "Automatic mixed precision (FP16/BF16)",
            "Gradient checkpointing for memory efficiency",
            "Configurable LoRA parameters",
        ),
    ),
    (
        "Conversation Handling",
        (
            "Multi-turn conversation support",
            "Proper chat template formatting",
            "Role-based training (user/assistant/system)",
            "Context preservation across turns",
        ),
    ),
    (
        "Configuration",
        (
            "YAML configuration files",
            "CLI argument overrides",
            "Automatic device detection",
            "Flexible LoRA configuration",
        ),
    ),
    (
        "Output",
        (
            "Saved LoRA models",
            "Training logs and checkpoints",
            "Ready for conversational inference",
        ),
    ),
)


def _build_training_command(
    config_path: str,
    dataset_path: Optional[str],
    overrides: Mapping[str, object],
) -> List[str]:
    """Return the argv list used to launch the training script."""
    # Use the interpreter that is running this wrapper so the child process
    # sees the same environment; fall back to "python" only when the
    # interpreter path is unavailable.
    cmd = [sys.executable or "python", _TRAIN_SCRIPT, "--config", config_path]

    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    for key, value in overrides.items():
        flag = _OVERRIDE_FLAGS.get(key)
        # Unknown keys and unset (None) values are not forwarded.
        if flag is not None and value is not None:
            cmd.extend([flag, str(value)])

    return cmd


def run_training_with_config(
    config_path: str, dataset_path: Optional[str] = None, **cli_overrides
) -> bool:
    """Run the instruct training pipeline with YAML configuration.

    The training script is started as a child process. Its output is not
    captured, so it is written straight to this process's stdout/stderr
    while training runs.

    Args:
        config_path: Path to the YAML configuration file, passed as --config.
        dataset_path: Optional dataset path, passed as --dataset. When it is
            empty or None the flag is omitted.
        **cli_overrides: Optional overrides. Recognised keys are output_dir,
            epochs, batch_size, learning_rate and max_steps; entries whose
            value is None, and unrecognised keys, are ignored.

    Returns:
        True when the child process exits with status 0, False when it exits
        with a non-zero status or cannot be started.
    """
    print(f"Starting instruct training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use data_path from YAML config")
    print()

    cmd = _build_training_command(config_path, dataset_path, cli_overrides)
    print(f"Running: {' '.join(cmd)}")
    print()
    # Flush our own buffered text first so it appears before the child's
    # output when stdout is redirected to a file or pipe.
    sys.stdout.flush()

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        # The child's own error output has already been written to stderr.
        print(f"Training failed: {e}")
        return False
    except OSError as e:
        # Raised when the interpreter itself cannot be executed.
        print(f"Training failed: could not start training process: {e}")
        return False

    print("Training completed successfully!")
    return True


def show_training_features() -> None:
    """Show the features of the instruct training pipeline."""
    print("=== Instruct Training Pipeline Features ===")
    for number, (title, items) in enumerate(_FEATURE_SECTIONS, start=1):
        # A blank line separates the header and each section from the next.
        print()
        print(f"{number}. **{title}**:")
        for item in items:
            print(f" - {item}")


def _require_example_config() -> Optional[str]:
    """Return the example config path, or None (after reporting) if missing."""
    if not Path(_EXAMPLE_CONFIG).exists():
        print(f"Configuration file not found: {_EXAMPLE_CONFIG}")
        print(" Please ensure the configuration file exists")
        return None
    return _EXAMPLE_CONFIG


def create_training_example() -> bool:
    """Create a training example using the code reasoning configuration.

    Returns:
        False when the example configuration file does not exist or the
        training run fails, True otherwise.
    """
    print("=== Training Example: Code Reasoning Instruction Tuning ===")
    print()

    config_path = _require_example_config()
    if config_path is None:
        return False

    print("Found required files!")
    print(f" Config: {config_path}")
    print(" Dataset: Will use data_path from YAML config")
    print(" The training pipeline will automatically:")
    print(" - Load conversation data directly from JSONL file")
    print(" - Convert to HuggingFace dataset format")
    print(" - Apply ShareGPT standardization")
    print(" - Format conversations with chat templates")
    print(" - Train the model using SFTTrainer")
    print()

    # No explicit dataset path: the training script uses data_path from the
    # YAML config.
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
        max_steps=5,  # Minimal steps for quick test
    )

    if success:
        print("Training example completed!")
        # NOTE(review): the output location below is a fixed message, not
        # read from the config - confirm it matches the YAML output_dir.
        print(" Model saved to: ./models/instruct")
        print(" Ready for conversational inference!")

    return success


def create_quick_test() -> bool:
    """Create a quick test with minimal steps for testing.

    Returns:
        False when the example configuration file does not exist or the
        training run fails, True otherwise.
    """
    print("=== Quick Test: Minimal Training Steps ===")
    print()

    config_path = _require_example_config()
    if config_path is None:
        return False

    print("Running quick test with minimal training steps...")
    print("This will load data directly from the JSONL file specified in config")

    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,  # Use data_path from YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
        max_steps=3,  # Very few steps for quick test
    )

    if success:
        print("Quick test completed!")
        print(" Model saved with minimal training")
        print(" This is just for testing the pipeline")

    return success


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line arguments and run the selected command.

    Args:
        argv: Argument list to parse. When None, argparse reads sys.argv[1:].

    Raises:
        SystemExit: With status 1 when "train" is requested without --config,
            or when the "train", "example" or "quick-test" command reports
            failure. argparse also exits on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Instruct Training Script")

    # Subcommands
    parser.add_argument(
        "command",
        choices=["train", "example", "features", "quick-test"],
        help="Command to run",
    )

    # Training arguments
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args(argv)

    if args.command == "features":
        show_training_features()
        return

    if args.command == "example":
        success = create_training_example()
    elif args.command == "quick-test":
        success = create_quick_test()
    else:
        # Only "train" remains: argparse rejects anything outside `choices`.
        if not args.config:
            print("❌ --config is required for training")
            print("Usage: python scripts/instruct/train.py train --config config.yaml")
            sys.exit(1)

        # Without --dataset the training script falls back to data_path from
        # the YAML config.
        success = run_training_with_config(
            config_path=args.config,
            dataset_path=args.dataset,
            output_dir=args.output_dir,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            max_steps=args.max_steps,
        )

    # Every command that runs training reports failure through the exit code.
    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()