# configs/styling/formal.yaml

# Comprehensive Styling Configuration
# This file defines all parameters for formal style transfer tasks
# Organized by level: task, data processing, model, training, and inference

# Task Configuration
task:
  # Task type: classification, completion, styling, matching
  name: 'styling'
  # Model type: style_transfer, text_generation, etc.
  type: 'style_transfer'

# Data Processing Configuration
data:
  # Data source: "huggingface" or "custom"
  source: 'custom'
  # Path to custom data file (required for custom source)
  data_path: './data/raw/styling/sample_formal.jsonl'
  # HuggingFace dataset name (required for huggingface source)
  dataset_name: null

  # --- Field Mapping ---
  # Field name containing source text to be styled
  input_field: 'text'
  # Field name containing the styled/transformed text
  output_field: 'styled_text'

  # --- Style Instruction ---
  # The style instruction that guides the transformation
  instruction: 'Rewrite the following text in a formal style'

  # --- Data Format & Processing ---
  # Data format: "jsonl", "csv", "json" (for custom data)
  data_format: 'jsonl'
  # Maximum text length (truncate longer texts)
  max_length: 256
  # Minimum text length (filter out shorter texts)
  min_length: 10

  # --- Text Preprocessing ---
  # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
  clean_text: true
  # Convert text to lowercase (false for formal style to preserve case)
  lowercase: false

  # --- Data Splitting ---
  # Training split ratio (0.0 to 1.0)
  train_split: 0.8
  # Validation split ratio (0.0 to 1.0)
  validation_split: 0.1
  # Test split ratio (0.0 to 1.0)
  test_split: 0.1

  # --- Output Configuration ---
  # Output format: "styling" (raw), "alpaca" (instruction format)
  output_format: 'alpaca'
  # Output directory for processed data and HuggingFace datasets
  output_dir: './data/processed/styling/formal'

# Model Configuration
model:
  # Model name from HuggingFace Hub
  name: 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
  # Maximum sequence length for tokenization
  max_length: 2048
  # Maximum sequence length for training (RoPE scaling supported)
  max_seq_length: 2048
  # Data type: null for auto detection, float16 for Tesla T4/V100,
  # bfloat16 for Ampere+
  dtype: null
  # Use 4bit quantization to reduce memory usage
  load_in_4bit: true
  # HuggingFace token for gated models (e.g., "hf_...")
  token: null

  # --- Training Model Parameters ---
  # Model to use for training
  training_model: 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
  # Max sequence length for training
  training_max_seq_length: 2048
  # Data type for training
  training_dtype: null
  # 4bit quantization for training
  training_load_in_4bit: true

# Training Configuration
training:
  num_epochs: 3                             # Number of training epochs
  batch_size: 16                            # Training batch size (adjust based on GPU memory)
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders such as
  # PyYAML resolve a bare `3e-5` as the string "3e-5" rather than a float.
  learning_rate: 3.0e-5                     # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)
  weight_decay: 0.01                        # Weight decay for optimizer (prevents overfitting)
  warmup_ratio: 0.1                         # Warmup ratio for scheduler (0.0 to 1.0)
  lr_scheduler_type: "linear"               # Scheduler type: "linear", "cosine", "polynomial"

# Inference Configuration
inference:
  # Batch size for inference (can be larger than training)
  batch_size: 32
  # Maximum new tokens to generate during inference
  max_new_tokens: 128
  # Sampling temperature (0.0 = deterministic, 1.0 = random)
  temperature: 0.8
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`# Comprehensive Styling Configuration`
			`# This file defines all parameters for formal style transfer tasks`
			`# Organized by level: task, data processing, model, training, and inference`

			`# Task Configuration`
initial setupt 2025-08-06 22:45:37 +01:00			`task:`
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`name: "styling" # Task type: classification, completion, styling, matching`
			`type: "style_transfer" # Model type: style_transfer, text_generation, etc.`
initial setupt 2025-08-06 22:45:37 +01:00
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`# Data Processing Configuration`
initial setupt 2025-08-06 22:45:37 +01:00			`data:`
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`source: "custom" # Data source: "huggingface" or "custom"`
			`data_path: "./data/raw/styling/sample_formal.jsonl" # Path to custom data file (required for custom source)`
			`dataset_name: null # HuggingFace dataset name (required for huggingface source)`

			`# Field Mapping`
			`input_field: "text" # Field name containing source text to be styled`
			`output_field: "styled_text" # Field name containing the styled/transformed text`

			`# Style Instruction`
			`instruction: "Rewrite the following text in a formal style" # The style instruction that guides the transformation`

			`# Data Format & Processing`
			`data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data)`
			`max_length: 256 # Maximum text length (truncate longer texts)`
			`min_length: 10 # Minimum text length (filter out shorter texts)`

			`# Text Preprocessing`
			`clean_text: true # Clean and normalize text (remove extra spaces, normalize quotes, etc.)`
			`lowercase: false # Convert text to lowercase (false for formal style to preserve case)`

			`# Data Splitting`
			`train_split: 0.8 # Training split ratio (0.0 to 1.0)`
			`validation_split: 0.1 # Validation split ratio (0.0 to 1.0)`
			`test_split: 0.1 # Test split ratio (0.0 to 1.0)`

			`# Output Configuration`
			`output_format: "alpaca" # Output format: "styling" (raw), "alpaca" (instruction format)`
			`output_dir: "./data/processed/styling/formal" # Output directory for processed data and HuggingFace datasets`
initial setupt 2025-08-06 22:45:37 +01:00
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`# Model Configuration`
initial setupt 2025-08-06 22:45:37 +01:00			`model:`
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model name from HuggingFace Hub`
			`max_length: 2048 # Maximum sequence length for tokenization`
			`max_seq_length: 2048 # Maximum sequence length for training (RoPE scaling supported)`
			`dtype: null # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+`
			`load_in_4bit: true # Use 4bit quantization to reduce memory usage`
			`token: null # HuggingFace token for gated models (e.g., "hf_...")`

			`# Training Model Parameters`
			`training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model to use for training`
			`training_max_seq_length: 2048 # Max sequence length for training`
			`training_dtype: null # Data type for training`
			`training_load_in_4bit: true # 4bit quantization for training`
initial setupt 2025-08-06 22:45:37 +01:00
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`# Training Configuration`
initial setupt 2025-08-06 22:45:37 +01:00			`training:`
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`num_epochs: 3 # Number of training epochs`
			`batch_size: 16 # Training batch size (adjust based on GPU memory)`
			`learning_rate: 3e-5 # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)`
			`weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting)`
			`warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)`
			`lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"`
initial setupt 2025-08-06 22:45:37 +01:00
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`# Inference Configuration`
initial setupt 2025-08-06 22:45:37 +01:00			`inference:`
added style mimicking piepelines 2025-08-13 21:17:01 +01:00			`batch_size: 32 # Batch size for inference (can be larger than training)`
			`max_new_tokens: 128 # Maximum new tokens to generate during inference`
			`temperature: 0.8 # Sampling temperature (0.0 = deterministic, 1.0 = random)`