---
# Comprehensive Instruct Configuration
#
# Defines all parameters for instruction fine-tuning on conversational data.
# Organized by level: task, data processing, model, training, and inference.

# Task Configuration
task:
  # Task name: instruct, code_reasoning, general_chat
  name: "code_reasoning"
  # Model type: instruction_following, conversational
  type: "instruction_following"

# Data Processing Configuration
data:
  # Data source: "huggingface" or "custom"
  source: "custom"
  # Path to the conversation data file
  data_path: "data/raw/swe_reasoning_dataset (3).jsonl"
  # Data format: "jsonl", "json"
  data_format: "jsonl"

  # Field Mapping for Conversation Data
  # Field name containing the conversation array
  conversation_field: "conversation"

  # Data Format & Processing
  # Maximum text length (longer texts are truncated)
  max_length: 128000
  # Minimum text length (shorter texts are filtered out)
  min_length: 10

  # Text Preprocessing
  # Clean and normalize text
  clean_text: true

  # Data Splitting
  # Each ratio is in the range 0.0 to 1.0; the three values below sum to 1.0.
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1

  # Output Configuration
  # Output format: "conversation" (chat format)
  output_format: "conversation"
  # Output directory for processed data
  output_dir: "./data/processed/instruct/code_reasoning"

# Model Configuration
model:
  # Model name from HuggingFace Hub (optimized for instruction following)
  name: "unsloth/llama-3.3-70b-instruct-bnb-4bit"
  # Maximum sequence length for tokenization
  max_length: 128000
  # Maximum sequence length for training (RoPE scaling supported)
  max_seq_length: 128000
  # Data type: null for auto detection, float16 for Tesla T4/V100,
  # bfloat16 for Ampere+
  dtype: null
  # Use 4bit quantization to reduce memory usage
  load_in_4bit: true
  # HuggingFace token for gated models (e.g., "hf_...")
  token: null

  # Training Model Parameters
  # Model to use for training
  training_model: "unsloth/llama-3.3-70b-instruct-bnb-4bit"
  # Max sequence length for training
  training_max_seq_length: 128000
  # Data type for training
  training_dtype: null
  # 4bit quantization for training
  training_load_in_4bit: true

# Training Configuration
training:
  # Number of training epochs (1 epoch is often sufficient for
  # instruction tuning)
  num_epochs: 1
  # Training batch size (small for large models)
  batch_size: 1
  # Learning rate (typical for instruction tuning).
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # resolve a bare `2e-4` as a string, whereas `2.0e-4` is a float under both
  # YAML 1.1 and YAML 1.2.
  learning_rate: 2.0e-4
  # Weight decay for optimizer (prevents overfitting)
  weight_decay: 0.01
  # Warmup steps (fixed value)
  warmup_steps: 5
  # Maximum training steps (adjust based on dataset size).
  # NOTE(review): depending on the trainer, a positive max_steps may take
  # precedence over num_epochs - confirm against the consuming code.
  max_steps: 30
  # Gradient accumulation steps
  gradient_accumulation_steps: 4
  # Scheduler type: "linear", "cosine", "polynomial"
  lr_scheduler_type: "linear"
  # Random seed for reproducibility
  seed: 3407

  # LoRA Configuration
  # LoRA rank (higher = more parameters)
  lora_r: 32
  # LoRA alpha (scaling factor)
  lora_alpha: 16
  # LoRA dropout (0 is optimized)
  lora_dropout: 0
  # Projection layers that receive LoRA adapters
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

  # Output Configuration
  # Directory for training checkpoints
  output_dir: "./outputs"
  # NOTE(review): this name mentions Qwen 2.5 while the configured model is
  # Llama 3.3 70B - confirm the name is intentional before relying on it.
  save_name: "qwen_2.5_test"
  # Directory to save the trained model
  model_output_dir: "./models/instruct"

# Inference Configuration
inference:
  # Batch size for inference
  batch_size: 1
  # Maximum new tokens to generate during inference
  max_new_tokens: 1024
  # Sampling temperature (higher = more creative)
  temperature: 1.5
  # Min-p sampling parameter
  min_p: 0.1
  # Use key-value cache for faster generation
  use_cache: true