updated instruct
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
# Comprehensive Instruct Configuration
|
||||
# This file defines all parameters for instruction fine-tuning using conversational data
|
||||
# Organized by level: task, data processing, model, training, and inference
|
||||
|
||||
# Task Configuration
task:
  # Task name; one of: instruct, code_reasoning, general_chat
  name: "code_reasoning"
  # Model type; one of: instruction_following, conversational
  type: "instruction_following"
|
||||
|
||||
# Data Processing Configuration
# All keys below are nested under `data` so that max_length and output_dir
# do not collide with the same-named keys of other sections.
data:
  source: "custom"  # Data source: "huggingface" or "custom"
  # Path to the conversation data file
  data_path: "./data/raw/instruct/code_reasoning.jsonl"
  data_format: "jsonl"  # Data format: "jsonl" or "json"

  # Field Mapping for Conversation Data
  # Name of the field that contains the conversation array
  conversation_field: "conversation"

  # Data Format & Processing
  max_length: 2048  # Maximum text length (longer texts are truncated)
  min_length: 10  # Minimum text length (shorter texts are filtered out)

  # Text Preprocessing
  clean_text: true  # Clean and normalize text

  # Data Splitting (each ratio is in 0.0 to 1.0; the three values sum to 1.0)
  train_split: 0.8  # Training split ratio
  validation_split: 0.1  # Validation split ratio
  test_split: 0.1  # Test split ratio

  # Output Configuration
  output_format: "conversation"  # Output format: "conversation" (chat format)
  # Output directory for processed data
  output_dir: "./data/processed/instruct/code_reasoning"
|
||||
|
||||
# Model Configuration
# All keys below are nested under `model` so that max_length does not collide
# with the same-named key of the data section.
model:
  # Model name from HuggingFace Hub (optimized for instruction following)
  name: "unsloth/Qwen2.5-72B-Instruct"
  max_length: 2048  # Maximum sequence length for tokenization
  # Maximum sequence length for training (RoPE scaling supported)
  max_seq_length: 2048
  # Data type: null for auto detection, float16 for Tesla T4/V100,
  # bfloat16 for Ampere+
  dtype: null
  load_in_4bit: true  # Use 4bit quantization to reduce memory usage
  token: null  # HuggingFace token for gated models (e.g., "hf_...")

  # Training Model Parameters
  # NOTE(review): these four keys currently repeat the values of name,
  # max_seq_length, dtype and load_in_4bit above; presumably they must be
  # kept in sync - confirm against the code that reads this file.
  training_model: "unsloth/Qwen2.5-72B-Instruct"  # Model to use for training
  training_max_seq_length: 2048  # Max sequence length for training
  training_dtype: null  # Data type for training
  training_load_in_4bit: true  # 4bit quantization for training
|
||||
|
||||
# Training Configuration
# All keys below are nested under `training` so that batch_size and output_dir
# do not collide with the same-named keys of other sections.
training:
  # Number of training epochs (1 is often sufficient for instruction tuning)
  # NOTE(review): if these values are forwarded to Hugging Face
  # TrainingArguments, a positive max_steps takes precedence over the epoch
  # count - confirm which of the two is intended to bound the run.
  num_epochs: 1
  batch_size: 1  # Training batch size (small for large models)
  # Learning rate (typical for instruction tuning). Written with an explicit
  # decimal point because YAML 1.1 loaders such as PyYAML read a bare `2e-4`
  # as a string, not a float.
  learning_rate: 2.0e-4
  weight_decay: 0.01  # Weight decay for optimizer (prevents overfitting)
  warmup_steps: 5  # Warmup steps (fixed value)
  max_steps: 30  # Maximum training steps (adjust based on dataset size)
  gradient_accumulation_steps: 4  # Gradient accumulation steps
  lr_scheduler_type: "linear"  # Scheduler type: "linear", "cosine", "polynomial"
  seed: 3407  # Random seed for reproducibility

  # LoRA Configuration
  lora_r: 32  # LoRA rank (higher = more parameters)
  # LoRA alpha (scaling factor)
  # NOTE(review): alpha is half of lora_r here; confirm this ratio is intended.
  lora_alpha: 16
  lora_dropout: 0  # LoRA dropout (0 is optimized)
  # Names of the modules that receive LoRA adapters
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

  # Output Configuration
  output_dir: "./outputs"  # Directory for training checkpoints
  model_output_dir: "./models/instruct"  # Directory to save the trained model
|
||||
|
||||
# Inference Configuration
# All keys below are nested under `inference` so that batch_size does not
# collide with the same-named key of the training section.
inference:
  batch_size: 1  # Batch size for inference
  max_new_tokens: 128  # Maximum new tokens to generate during inference
  temperature: 1.5  # Sampling temperature (higher = more creative)
  min_p: 0.1  # Min-p sampling parameter
  use_cache: true  # Use key-value cache for faster generation
|
||||
@@ -0,0 +1,78 @@
|
||||
# Comprehensive Instruct Configuration
|
||||
# This file defines all parameters for instruction fine-tuning using conversational data
|
||||
# Organized by level: task, data processing, model, training, and inference
|
||||
|
||||
# Task Configuration
task:
  # Task name; one of: instruct, code_reasoning, general_chat
  name: "code_reasoning"
  # Model type; one of: instruction_following, conversational
  type: "instruction_following"
|
||||
|
||||
# Data Processing Configuration
# All keys below are nested under `data` so that max_length and output_dir
# do not collide with the same-named keys of other sections.
data:
  source: "custom"  # Data source: "huggingface" or "custom"
  # Path to the conversation data file
  data_path: "./data/raw/instruct/code_reasoning.jsonl"
  data_format: "jsonl"  # Data format: "jsonl" or "json"

  # Field Mapping for Conversation Data
  # Name of the field that contains the conversation array
  conversation_field: "conversation"

  # Data Format & Processing
  max_length: 2048  # Maximum text length (longer texts are truncated)
  min_length: 10  # Minimum text length (shorter texts are filtered out)

  # Text Preprocessing
  clean_text: true  # Clean and normalize text

  # Data Splitting (each ratio is in 0.0 to 1.0; the three values sum to 1.0)
  train_split: 0.8  # Training split ratio
  validation_split: 0.1  # Validation split ratio
  test_split: 0.1  # Test split ratio

  # Output Configuration
  output_format: "conversation"  # Output format: "conversation" (chat format)
  # Output directory for processed data
  output_dir: "./data/processed/instruct/code_reasoning"
|
||||
|
||||
# Model Configuration
# All keys below are nested under `model` so that max_length does not collide
# with the same-named key of the data section.
model:
  # Model name from HuggingFace Hub (optimized for instruction following)
  name: "unsloth/Qwen2.5-72B-Instruct"
  max_length: 2048  # Maximum sequence length for tokenization
  # Maximum sequence length for training (RoPE scaling supported)
  max_seq_length: 2048
  # Data type: null for auto detection, float16 for Tesla T4/V100,
  # bfloat16 for Ampere+
  dtype: null
  load_in_4bit: true  # Use 4bit quantization to reduce memory usage
  token: null  # HuggingFace token for gated models (e.g., "hf_...")

  # Training Model Parameters
  # NOTE(review): these four keys currently repeat the values of name,
  # max_seq_length, dtype and load_in_4bit above; presumably they must be
  # kept in sync - confirm against the code that reads this file.
  training_model: "unsloth/Qwen2.5-72B-Instruct"  # Model to use for training
  training_max_seq_length: 2048  # Max sequence length for training
  training_dtype: null  # Data type for training
  training_load_in_4bit: true  # 4bit quantization for training
|
||||
|
||||
# Training Configuration
# All keys below are nested under `training` so that batch_size and output_dir
# do not collide with the same-named keys of other sections.
training:
  # Number of training epochs (1 is often sufficient for instruction tuning)
  # NOTE(review): if these values are forwarded to Hugging Face
  # TrainingArguments, a positive max_steps takes precedence over the epoch
  # count - confirm which of the two is intended to bound the run.
  num_epochs: 1
  batch_size: 1  # Training batch size (small for large models)
  # Learning rate (typical for instruction tuning). Written with an explicit
  # decimal point because YAML 1.1 loaders such as PyYAML read a bare `2e-4`
  # as a string, not a float.
  learning_rate: 2.0e-4
  weight_decay: 0.01  # Weight decay for optimizer (prevents overfitting)
  warmup_steps: 5  # Warmup steps (fixed value)
  max_steps: 30  # Maximum training steps (adjust based on dataset size)
  gradient_accumulation_steps: 4  # Gradient accumulation steps
  lr_scheduler_type: "linear"  # Scheduler type: "linear", "cosine", "polynomial"
  seed: 3407  # Random seed for reproducibility

  # LoRA Configuration
  lora_r: 32  # LoRA rank (higher = more parameters)
  # LoRA alpha (scaling factor)
  # NOTE(review): alpha is half of lora_r here; confirm this ratio is intended.
  lora_alpha: 16
  lora_dropout: 0  # LoRA dropout (0 is optimized)
  # Names of the modules that receive LoRA adapters
  target_modules:
    - "q_proj"
    - "k_proj"
    - "v_proj"
    - "o_proj"
    - "gate_proj"
    - "up_proj"
    - "down_proj"

  # Output Configuration
  output_dir: "./outputs"  # Directory for training checkpoints
  model_output_dir: "./models/instruct"  # Directory to save the trained model
|
||||
|
||||
# Inference Configuration
# All keys below are nested under `inference` so that batch_size does not
# collide with the same-named key of the training section.
inference:
  batch_size: 1  # Batch size for inference
  max_new_tokens: 128  # Maximum new tokens to generate during inference
  temperature: 1.5  # Sampling temperature (higher = more creative)
  min_p: 0.1  # Min-p sampling parameter
  use_cache: true  # Use key-value cache for faster generation
|
||||
Reference in New Issue
Block a user