# DS-LLM-TEMPLATE-FINETUNING/configs/instruct/sample.yaml

# Comprehensive Instruct Configuration
# This file defines all parameters for instruction fine-tuning using conversational data
# Organized by level: task, data processing, model, training, and inference

# Task Configuration
task:
  # Task name. Listed options: instruct, code_reasoning, general_chat.
  name: 'code_reasoning'
  # Model type. Listed options: instruction_following, conversational.
  type: 'instruction_following'

# Data Processing Configuration
data:
  # Where the data comes from: 'huggingface' or 'custom'.
  source: 'custom'
  # Location of the conversation data file.
  data_path: './data/raw/instruct/code_reasoning.jsonl'
  # File format of the data: 'jsonl' or 'json'.
  data_format: 'jsonl'

  # --- Field mapping for conversation data ---
  # Name of the field that holds the conversation array.
  conversation_field: 'conversation'

  # --- Length limits ---
  # Upper bound on text length; longer texts are truncated.
  max_length: 2048
  # Lower bound on text length; shorter texts are filtered out.
  min_length: 10

  # --- Text preprocessing ---
  # Whether to clean and normalize text.
  clean_text: true

  # --- Data splitting (each ratio lies in 0.0 to 1.0) ---
  # Share of the data used for training.
  train_split: 0.8
  # Share of the data used for validation.
  validation_split: 0.1
  # Share of the data used for testing.
  test_split: 0.1

  # --- Output ---
  # Format of the processed output: 'conversation' (chat format).
  output_format: 'conversation'
  # Directory that receives the processed data.
  output_dir: './data/processed/instruct/code_reasoning'

# Model Configuration
model:
  # HuggingFace Hub model name (optimized for instruction following).
  name: 'unsloth/Qwen2.5-72B-Instruct'
  # Maximum sequence length for tokenization.
  max_length: 2048
  # Maximum sequence length for training (RoPE scaling supported).
  max_seq_length: 2048
  # Data type: null auto-detects; float16 for Tesla T4/V100;
  # bfloat16 for Ampere+.
  dtype: null
  # Load with 4-bit quantization to reduce memory usage.
  load_in_4bit: true
  # HuggingFace token for gated models (e.g. "hf_...").
  # Do not commit a real token to version control.
  token: null

  # --- Training model parameters ---
  # Model used for training.
  training_model: 'unsloth/Qwen2.5-72B-Instruct'
  # Maximum sequence length for training.
  training_max_seq_length: 2048
  # Data type for training.
  training_dtype: null
  # 4-bit quantization for training.
  training_load_in_4bit: true

# Training Configuration
training:
  # Number of training epochs (1 is often sufficient for instruction tuning).
  num_epochs: 1
  # Training batch size (kept small for large models).
  batch_size: 1
  # Learning rate (typical for instruction tuning).
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders such
  # as PyYAML resolve a bare `2e-4` as the string "2e-4", not a float.
  learning_rate: 2.0e-4
  # Weight decay for the optimizer (prevents overfitting).
  weight_decay: 0.01
  # Warmup steps (fixed value).
  warmup_steps: 5
  # Maximum training steps (adjust based on dataset size).
  # NOTE(review): if this is passed to a HuggingFace Trainer, a positive
  # max_steps takes precedence over num_epochs - confirm against the consumer.
  max_steps: 30
  # Gradient accumulation steps.
  gradient_accumulation_steps: 4
  # Scheduler type: 'linear', 'cosine', 'polynomial'.
  lr_scheduler_type: 'linear'
  # Random seed for reproducibility.
  seed: 3407

  # --- LoRA configuration ---
  # LoRA rank (higher = more parameters).
  lora_r: 32
  # LoRA alpha (scaling factor).
  lora_alpha: 16
  # LoRA dropout (0 is optimized).
  lora_dropout: 0
  # Target modules for LoRA.
  target_modules:
    - 'q_proj'
    - 'k_proj'
    - 'v_proj'
    - 'o_proj'
    - 'gate_proj'
    - 'up_proj'
    - 'down_proj'

  # --- Output ---
  # Directory for training checkpoints.
  output_dir: './outputs'
  # Directory where the trained model is saved.
  model_output_dir: './models/instruct'

# Inference Configuration
inference:
  # Batch size used at inference time.
  batch_size: 1
  # Cap on newly generated tokens during inference.
  max_new_tokens: 128
  # Sampling temperature; higher values give more creative output.
  temperature: 1.5
  # Min-p sampling parameter.
  min_p: 0.1
  # Enable the key-value cache for faster generation.
  use_cache: true