updated instruct

2025-08-28 14:12:30 +00:00
parent d7441f4089
commit 78d519efbf
15 changed files with 3641 additions and 0 deletions
@@ -0,0 +1,78 @@
+# Comprehensive Instruct Configuration
+# This file defines all parameters for instruction fine-tuning using conversational data
+# Organized by level: task, data processing, model, training, and inference
+
+# Task settings: identifies which instruction-tuning task this file configures
+task:
+  name: 'code_reasoning'         # task name, one of: instruct, code_reasoning, general_chat
+  type: 'instruction_following'  # model type, one of: instruction_following, conversational
+
+# Data Processing Configuration: raw input location, filtering, splitting, and output
+data:
+  source: "custom"                          # Data source: "huggingface" or "custom"
+  data_path: "./data/raw/instruct/code_reasoning.jsonl"  # Path to conversation data file
+  data_format: "jsonl"                      # Data format: "jsonl" or "json"
+
+  # Field Mapping for Conversation Data
+  conversation_field: "conversation"        # Field name containing the conversation array
+
+  # Length Limits (NOTE(review): unit, characters vs. tokens, is not stated here; confirm)
+  max_length: 2048                          # Maximum text length (longer texts are truncated)
+  min_length: 10                            # Minimum text length (shorter texts are filtered out)
+
+  # Text Preprocessing
+  clean_text: true                          # Clean and normalize text
+
+  # Data Splitting (the three ratios below sum to 1.0)
+  train_split: 0.8                          # Training split ratio (0.0 to 1.0)
+  validation_split: 0.1                     # Validation split ratio (0.0 to 1.0)
+  test_split: 0.1                           # Test split ratio (0.0 to 1.0)
+
+  # Output Configuration
+  output_format: "conversation"             # Output format: "conversation" (chat format)
+  output_dir: "./data/processed/instruct/code_reasoning"  # Output directory for processed data
+
+# Model Configuration
+model:
+  name: "unsloth/Qwen2.5-72B-Instruct"      # Model name from HuggingFace Hub (optimized for instruction following)
+  max_length: 2048                          # Maximum sequence length for tokenization
+  max_seq_length: 2048                      # Maximum sequence length for training (RoPE scaling supported)
+  dtype: null                               # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
+  load_in_4bit: true                        # Use 4-bit quantization to reduce memory usage
+  token: null                               # HuggingFace token for gated models (e.g., "hf_..."); keep real tokens out of version control
+
+  # Training Model Parameters (values currently mirror name / max_seq_length / dtype / load_in_4bit above)
+  training_model: "unsloth/Qwen2.5-72B-Instruct"  # Model to use for training
+  training_max_seq_length: 2048             # Max sequence length for training
+  training_dtype: null                      # Data type for training (null, as with dtype above)
+  training_load_in_4bit: true               # 4-bit quantization for training
+
+# Training Configuration
+training:
+  num_epochs: 1                             # Number of training epochs (1 epoch is often sufficient for instruction tuning)
+  batch_size: 1                             # Training batch size (small for large models)
+  learning_rate: 2.0e-4                     # Learning rate; the decimal point is required so YAML 1.1 loaders (e.g. PyYAML) parse a float, not the string "2e-4"
+  weight_decay: 0.01                        # Weight decay for optimizer (prevents overfitting)
+  warmup_steps: 5                           # Warmup steps (fixed value)
+  max_steps: 30                             # Maximum training steps; NOTE(review): Hugging Face-style trainers let max_steps > 0 override num_epochs; confirm
+  gradient_accumulation_steps: 4            # Gradient accumulation steps
+  lr_scheduler_type: "linear"               # Scheduler type: "linear", "cosine", "polynomial"
+  seed: 3407                                # Random seed for reproducibility
+
+  # LoRA Configuration
+  lora_r: 32                                # LoRA rank (higher = more parameters)
+  lora_alpha: 16                            # LoRA alpha (scaling factor)
+  lora_dropout: 0                           # LoRA dropout (0 is optimized)
+  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+
+  # Output Configuration
+  output_dir: "./outputs"                   # Directory for training checkpoints
+  model_output_dir: "./models/instruct"     # Directory to save the trained model
+
+# Inference settings: generation-time parameters
+inference:
+  batch_size: 1        # batch size used for inference
+  max_new_tokens: 128  # upper bound on newly generated tokens during inference
+  temperature: 1.5     # sampling temperature (higher = more creative)
+  min_p: 0.1           # min-p sampling parameter
+  use_cache: true      # use the key-value cache for faster generation
@@ -0,0 +1,78 @@
+# Comprehensive Instruct Configuration
+# This file defines all parameters for instruction fine-tuning using conversational data
+# Organized by level: task, data processing, model, training, and inference
+
+# Task settings: identifies which instruction-tuning task this file configures
+task:
+  name: 'code_reasoning'         # task name, one of: instruct, code_reasoning, general_chat
+  type: 'instruction_following'  # model type, one of: instruction_following, conversational
+
+# Data Processing Configuration: raw input location, filtering, splitting, and output
+data:
+  source: "custom"                          # Data source: "huggingface" or "custom"
+  data_path: "./data/raw/instruct/code_reasoning.jsonl"  # Path to conversation data file
+  data_format: "jsonl"                      # Data format: "jsonl" or "json"
+
+  # Field Mapping for Conversation Data
+  conversation_field: "conversation"        # Field name containing the conversation array
+
+  # Length Limits (NOTE(review): unit, characters vs. tokens, is not stated here; confirm)
+  max_length: 2048                          # Maximum text length (longer texts are truncated)
+  min_length: 10                            # Minimum text length (shorter texts are filtered out)
+
+  # Text Preprocessing
+  clean_text: true                          # Clean and normalize text
+
+  # Data Splitting (the three ratios below sum to 1.0)
+  train_split: 0.8                          # Training split ratio (0.0 to 1.0)
+  validation_split: 0.1                     # Validation split ratio (0.0 to 1.0)
+  test_split: 0.1                           # Test split ratio (0.0 to 1.0)
+
+  # Output Configuration
+  output_format: "conversation"             # Output format: "conversation" (chat format)
+  output_dir: "./data/processed/instruct/code_reasoning"  # Output directory for processed data
+
+# Model Configuration
+model:
+  name: "unsloth/Qwen2.5-72B-Instruct"      # Model name from HuggingFace Hub (optimized for instruction following)
+  max_length: 2048                          # Maximum sequence length for tokenization
+  max_seq_length: 2048                      # Maximum sequence length for training (RoPE scaling supported)
+  dtype: null                               # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
+  load_in_4bit: true                        # Use 4-bit quantization to reduce memory usage
+  token: null                               # HuggingFace token for gated models (e.g., "hf_..."); keep real tokens out of version control
+
+  # Training Model Parameters (values currently mirror name / max_seq_length / dtype / load_in_4bit above)
+  training_model: "unsloth/Qwen2.5-72B-Instruct"  # Model to use for training
+  training_max_seq_length: 2048             # Max sequence length for training
+  training_dtype: null                      # Data type for training (null, as with dtype above)
+  training_load_in_4bit: true               # 4-bit quantization for training
+
+# Training Configuration
+training:
+  num_epochs: 1                             # Number of training epochs (1 epoch is often sufficient for instruction tuning)
+  batch_size: 1                             # Training batch size (small for large models)
+  learning_rate: 2.0e-4                     # Learning rate; the decimal point is required so YAML 1.1 loaders (e.g. PyYAML) parse a float, not the string "2e-4"
+  weight_decay: 0.01                        # Weight decay for optimizer (prevents overfitting)
+  warmup_steps: 5                           # Warmup steps (fixed value)
+  max_steps: 30                             # Maximum training steps; NOTE(review): Hugging Face-style trainers let max_steps > 0 override num_epochs; confirm
+  gradient_accumulation_steps: 4            # Gradient accumulation steps
+  lr_scheduler_type: "linear"               # Scheduler type: "linear", "cosine", "polynomial"
+  seed: 3407                                # Random seed for reproducibility
+
+  # LoRA Configuration
+  lora_r: 32                                # LoRA rank (higher = more parameters)
+  lora_alpha: 16                            # LoRA alpha (scaling factor)
+  lora_dropout: 0                           # LoRA dropout (0 is optimized)
+  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+
+  # Output Configuration
+  output_dir: "./outputs"                   # Directory for training checkpoints
+  model_output_dir: "./models/instruct"     # Directory to save the trained model
+
+# Inference settings: generation-time parameters
+inference:
+  batch_size: 1        # batch size used for inference
+  max_new_tokens: 128  # upper bound on newly generated tokens during inference
+  temperature: 1.5     # sampling temperature (higher = more creative)
+  min_p: 0.1           # min-p sampling parameter
+  use_cache: true      # use the key-value cache for faster generation