added style mimicking pipelines

2025-08-13 21:17:01 +01:00
parent fd54d4be39
commit 710d074b47
31 changed files with 3816 additions and 46 deletions
@@ -1,29 +1,69 @@
+# Comprehensive Styling Configuration
+# This file defines all parameters for formal style transfer tasks
+# Organized by level: task, data processing, model, training, and inference
+
+# Task Configuration
 task:
-  name: "styling"
-  type: "style_transfer"
+  name: "styling"                          # Task name, one of: classification, completion, styling, matching
+  type: "style_transfer"                   # Model type: style_transfer, text_generation, etc.

+# Data Processing Configuration
 data:
-  source: "custom"
-  input_field: "text"
-  style_field: "style"
-  max_length: 256
-  train_split: 0.8
-  validation_split: 0.1
-  test_split: 0.1
+  source: "custom"                          # Data source: "huggingface" or "custom"
+  data_path: "./data/raw/styling/sample_formal.jsonl"  # Path to custom data file (required for custom source)
+  dataset_name: null                        # HuggingFace dataset name (required for huggingface source)
+
+  # Field Mapping
+  input_field: "text"                       # Field name containing source text to be styled
+  output_field: "styled_text"               # Field name containing the styled/transformed text
+
+  # Style Instruction
+  instruction: "Rewrite the following text in a formal style"  # The style instruction that guides the transformation
+
+  # Data Format & Processing
+  data_format: "jsonl"                      # Data format: "jsonl", "csv", "json" (for custom data)
+  max_length: 256                           # Maximum text length (truncate longer texts)
+  min_length: 10                            # Minimum text length (filter out shorter texts)
+
+  # Text Preprocessing
+  clean_text: true                          # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
+  lowercase: false                          # Convert text to lowercase (false for formal style to preserve case)
+
+  # Data Splitting
+  train_split: 0.8                          # Training split ratio (0.0 to 1.0)
+  validation_split: 0.1                     # Validation split ratio (0.0 to 1.0)
+  test_split: 0.1                           # Test split ratio (0.0 to 1.0)
+
+  # Output Configuration
+  output_format: "alpaca"                   # Output format: "styling" (raw), "alpaca" (instruction format)
+  output_dir: "./data/processed/styling/formal"  # Output directory for processed data and HuggingFace datasets

+# Model Configuration
 model:
-  name: "t5-base"
-  max_length: 256
+  name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # Model name from HuggingFace Hub
+  max_length: 2048                          # Maximum sequence length for tokenization
+  max_seq_length: 2048                      # Maximum sequence length for training (RoPE scaling supported)
+  dtype: null                               # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
+  load_in_4bit: true                        # Use 4-bit quantization to reduce memory usage
+  token: null                               # HuggingFace token for gated models (e.g., "hf_..."); prefer supplying it via the environment rather than committing it here
+
+  # Training Model Parameters
+  training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"  # Model to use for training (currently the same value as `name` above)
+  training_max_seq_length: 2048             # Max sequence length for training (currently the same value as `max_seq_length` above)
+  training_dtype: null                      # Data type for training (currently the same value as `dtype` above)
+  training_load_in_4bit: true               # 4-bit quantization for training (currently the same value as `load_in_4bit` above)

+# Training Configuration
 training:
-  num_epochs: 3
-  batch_size: 16
-  learning_rate: 3e-5
-  weight_decay: 0.01
-  warmup_ratio: 0.1
-  lr_scheduler_type: "linear"
+  num_epochs: 3                             # Number of training epochs
+  batch_size: 16                            # Training batch size (adjust based on GPU memory)
+  learning_rate: 3.0e-5                     # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning); keep the decimal point, YAML 1.1 loaders such as PyYAML read a bare "3e-5" as a string
+  weight_decay: 0.01                        # Weight decay for optimizer (prevents overfitting)
+  warmup_ratio: 0.1                         # Warmup ratio for scheduler (0.0 to 1.0)
+  lr_scheduler_type: "linear"               # Scheduler type: "linear", "cosine", "polynomial"

+# Inference Configuration
 inference:
-  batch_size: 32
-  max_new_tokens: 128
-  temperature: 0.8
+  batch_size: 32                            # Batch size for inference (can be larger than training)
+  max_new_tokens: 128                       # Maximum new tokens to generate during inference
+  temperature: 0.8                          # Sampling temperature (values near 0.0 are close to deterministic; higher values increase randomness)