# Comprehensive Styling Configuration
# This file defines all parameters for formal style transfer tasks.
# Organized by section: task, data processing, model, training, and inference.

# Task Configuration
task:
  name: "styling"  # Task type: classification, completion, styling, matching
  type: "style_transfer"  # Model type: style_transfer, text_generation, etc.

# Data Processing Configuration
data:
  source: "custom"  # Data source: "huggingface" or "custom"
  data_path: "./data/raw/styling/sample_formal.jsonl"  # Path to custom data file (required for custom source)
  dataset_name: null  # HuggingFace dataset name (required for huggingface source)

  # Field Mapping
  input_field: "text"  # Field name containing source text to be styled
  output_field: "styled_text"  # Field name containing the styled/transformed text

  # Style Instruction
  instruction: "Rewrite the following text in a formal style"  # The style instruction that guides the transformation

  # Data Format & Processing
  data_format: "jsonl"  # Data format: "jsonl", "csv", "json" (for custom data)
  max_length: 256  # Maximum text length (truncate longer texts)
  min_length: 10  # Minimum text length (filter out shorter texts)

  # Text Preprocessing
  clean_text: true  # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
  lowercase: false  # Convert text to lowercase (false for formal style to preserve case)

  # Data Splitting (the three ratios below sum to 1.0)
  train_split: 0.8  # Training split ratio (0.0 to 1.0)
  validation_split: 0.1  # Validation split ratio (0.0 to 1.0)
  test_split: 0.1  # Test split ratio (0.0 to 1.0)

  # Output Configuration
  output_format: "alpaca"  # Output format: "styling" (raw), "alpaca" (instruction format)
  output_dir: "./data/processed/styling/formal"  # Output directory for processed data and HuggingFace datasets

# Model Configuration
model:
  name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # Model name from HuggingFace Hub
  max_length: 2048  # Maximum sequence length for tokenization
  max_seq_length: 2048  # Maximum sequence length for training (RoPE scaling supported)
  dtype: null  # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
  load_in_4bit: true  # Use 4bit quantization to reduce memory usage
  token: null  # HuggingFace token for gated models (e.g., "hf_...")

  # Training Model Parameters
  # NOTE(review): these currently hold the same values as name / max_seq_length /
  # dtype / load_in_4bit above - confirm which set the training code actually reads.
  training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # Model to use for training
  training_max_seq_length: 2048  # Max sequence length for training
  training_dtype: null  # Data type for training
  training_load_in_4bit: true  # 4bit quantization for training

# Training Configuration
training:
  num_epochs: 3  # Number of training epochs
  batch_size: 16  # Training batch size (adjust based on GPU memory)
  # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning).
  # Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
  # read a bare `3e-5` as the string "3e-5" rather than a float.
  learning_rate: 3.0e-5
  weight_decay: 0.01  # Weight decay for optimizer (prevents overfitting)
  warmup_ratio: 0.1  # Warmup ratio for scheduler (0.0 to 1.0)
  lr_scheduler_type: "linear"  # Scheduler type: "linear", "cosine", "polynomial"

# Inference Configuration
inference:
  batch_size: 32  # Batch size for inference (can be larger than training)
  max_new_tokens: 128  # Maximum new tokens to generate during inference
  temperature: 0.8  # Sampling temperature (lower = more deterministic, higher = more random)