# Comprehensive Styling Configuration
# This file defines all parameters for formal style transfer tasks
# Organized by level: task, data processing, model, training, and inference

# Task Configuration
task:
  # Task type; one of: classification, completion, styling, matching
  name: "styling"
  # Model type, e.g. style_transfer, text_generation
  type: "style_transfer"

# Data Processing Configuration
data:
  # Where the data comes from: "huggingface" or "custom"
  source: "custom"
  # Path to the custom data file (required when source is "custom")
  data_path: "./data/raw/styling/sample_formal.jsonl"
  # HuggingFace dataset name (required when source is "huggingface")
  dataset_name: null

  # --- Field Mapping ---
  # Name of the field holding the source text to be styled
  input_field: "text"
  # Name of the field holding the styled/transformed text
  output_field: "styled_text"

  # --- Style Instruction ---
  # Instruction that guides the transformation
  instruction: "Rewrite the following text in a formal style"

  # --- Data Format & Processing ---
  # Format of custom data: "jsonl", "csv", or "json"
  data_format: "jsonl"
  # Maximum text length (longer texts are truncated)
  max_length: 256
  # Minimum text length (shorter texts are filtered out)
  min_length: 10

  # --- Text Preprocessing ---
  # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
  clean_text: true
  # Convert text to lowercase (kept false for formal style to preserve case)
  lowercase: false

  # --- Data Splitting ---
  # Each ratio is in the range 0.0 to 1.0
  # Share of the data used for training
  train_split: 0.8
  # Share of the data used for validation
  validation_split: 0.1
  # Share of the data used for testing
  test_split: 0.1

  # --- Output Configuration ---
  # Output format: "styling" (raw) or "alpaca" (instruction format)
  output_format: "alpaca"
  # Directory for processed data and HuggingFace datasets
  output_dir: "./data/processed/styling/formal"

# Model Configuration
model:
  # Model name on the HuggingFace Hub
  name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
  # Maximum sequence length for tokenization
  max_length: 2048
  # Maximum sequence length for training (RoPE scaling supported)
  max_seq_length: 2048
  # Data type: null for auto detection, float16 for Tesla T4/V100,
  # bfloat16 for Ampere+
  dtype: null
  # Use 4bit quantization to reduce memory usage
  load_in_4bit: true
  # HuggingFace token for gated models (e.g., "hf_...")
  token: null

  # --- Training Model Parameters ---
  # Model to use for training
  training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
  # Maximum sequence length for training
  training_max_seq_length: 2048
  # Data type for training
  training_dtype: null
  # 4bit quantization for training
  training_load_in_4bit: true

# Training Configuration
# Optimizer and schedule hyperparameters for fine-tuning.
training:
  num_epochs: 3                             # Number of training epochs
  batch_size: 16                            # Training batch size (adjust based on GPU memory)
  # Keep the explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) resolve a
  # bare `3e-5` as the string "3e-5" rather than a float, while `3.0e-5` is a
  # float under both YAML 1.1 and YAML 1.2.
  learning_rate: 3.0e-5                     # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)
  weight_decay: 0.01                        # Weight decay for optimizer (prevents overfitting)
  warmup_ratio: 0.1                         # Warmup ratio for scheduler (0.0 to 1.0)
  lr_scheduler_type: "linear"               # Scheduler type: "linear", "cosine", "polynomial"

# Inference Configuration
inference:
  # Batch size for inference (can be larger than the training batch size)
  batch_size: 32
  # Maximum number of new tokens to generate during inference
  max_new_tokens: 128
  # Sampling temperature (lower = more deterministic, higher = more random)
  temperature: 0.8