added style mimicking pipelines

2025-08-13 21:17:01 +01:00
parent fd54d4be39
commit 710d074b47
31 changed files with 3816 additions and 46 deletions
@@ -1,6 +1,6 @@
 # Comprehensive Classification Configuration
 # This file defines all parameters for emotion classification using the dair-ai/emotion dataset
-# Organized by level: data processing, model, training, and inference
+# Organized by level: task, data processing, model, training, and inference

 # Task Configuration
 task:
@@ -15,9 +15,9 @@ data:
  data_format: "jsonl"                     # Data format: "jsonl", "csv", "json" (for custom data)
  
  # Field Mapping
-  input_field: "text"                      # Field name containing input text
-  label_field: "label"                     # Field name containing labels
-  id_field: null                           # Optional ID field name
+  input_field: "text"                      # Field name containing input text to be classified
+  label_field: "label"                     # Field name containing classification labels
+  id_field: null                           # Optional ID field name for tracking individual samples
  
  # Processing Parameters
  max_samples: 1000                        # Maximum samples to process (null for all samples)
@@ -26,54 +26,54 @@ data:
  test_split: 0.1                          # Test split ratio (0.0 to 1.0)
  
  # Text Preprocessing
-  clean_text: true                         # Clean and normalize text
-  remove_special_chars: false              # Remove special characters from text
-  lowercase: true                          # Convert text to lowercase
+  clean_text: true                         # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
+  remove_special_chars: false              # Remove special characters from text (false keeps them, which is preferred for emotion analysis)
+  lowercase: true                          # Convert text to lowercase (matches uncased models such as bert-base-uncased)
  min_length: 10                           # Minimum text length (filter out shorter texts)
  max_length: 1000                         # Maximum text length (truncate longer texts)
  
  # Label Processing
  label_encoding: "auto"                   # Label encoding: "auto", "numeric", "string"
-  multilabel: false                        # Enable multilabel classification
-  label_separator: ","                     # Separator for multilabel datasets
+  multilabel: false                        # Enable multilabel classification (false for single emotion per text)
+  label_separator: ","                     # Separator for multilabel datasets (comma-separated labels)
  
  # Output Configuration
  output_format: "classification"          # Output format: "classification", "instruction", "conversation", "qa"
-  output_dir: "./data/processed/classification/emotion"  # Specific output directory for this dataset
+  output_dir: "./data/processed/classification/emotion"  # Output directory for processed data and splits
  
  # HuggingFace Specific
-  hf_split: "train"                        # HuggingFace dataset split to use
-  hf_cache_dir: null                       # HuggingFace cache directory (null for default)
+  hf_split: "train"                        # HuggingFace dataset split to use as base
+  hf_cache_dir: null                       # HuggingFace cache directory (null for default ~/.cache/huggingface)
  
  # Split Configuration (Advanced)
  test_split_from: "train"                 # Source for test split: "train", "use_test_if_available", "use_val_if_available"
  val_split_from: "train"                  # Source for validation split: "train", "use_val_if_available"
  
  # Custom Data Specific
-  encoding: "utf-8"                        # File encoding for custom data
-  delimiter: ","                           # Delimiter for CSV files
+  encoding: "utf-8"                        # File encoding for custom data files
+  delimiter: ","                           # Delimiter for CSV files (comma for standard CSV)

 # Model Configuration
 model:
-  name: "bert-base-uncased"                # Model name from HuggingFace Hub
-  max_length: 512                          # Maximum sequence length for tokenization
-  num_labels: 6                            # Number of classification labels
+  name: "bert-base-uncased"                # Model name from HuggingFace Hub (good for text classification)
+  max_length: 512                          # Maximum sequence length for tokenization (BERT limit)
+  num_labels: 6                            # Number of classification labels (emotion categories)

 # Training Configuration
 training:
-  num_epochs: 3                            # Number of training epochs
-  batch_size: 16                           # Training batch size
-  learning_rate: 2e-5                      # Learning rate (typical range: 1e-5 to 5e-5)
-  weight_decay: 0.01                       # Weight decay for optimizer (typical range: 0.01 to 0.1)
+  num_epochs: 3                            # Number of training epochs (adjust based on dataset size)
+  batch_size: 16                           # Training batch size (adjust based on GPU memory)
+  learning_rate: 2e-5                      # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)
+  weight_decay: 0.01                       # Weight decay for optimizer (regularization to reduce overfitting; typical range: 0.01 to 0.1)
  lr_scheduler_type: "linear"              # Scheduler type: "linear", "cosine", "polynomial"
  warmup_ratio: 0.1                        # Warmup ratio for scheduler (0.0 to 1.0)
  data_dir: "./data/processed/classification/emotion"  # Directory containing train/validation/test JSONL files
-  output_dir: "./results/classification/emotion_model"  # Output directory for saved model
+  output_dir: "./results/classification/emotion_model"  # Output directory for saved model and checkpoints

 # Inference Configuration
 inference:
  model_path: "./results/classification/emotion_model"  # Path to saved model directory
-  device: "auto"                           # Device: "auto", "cuda", "cpu"
-  batch_size: 32                           # Batch size for inference
-  return_probabilities: true                # Return all class probabilities
-  return_top_k: 3                          # Return top K predictions
+  device: "auto"                           # Device: "auto", "cuda", "cpu" (auto detects best available)
+  batch_size: 32                           # Batch size for inference (can be larger than training)
+  return_probabilities: true                # Return all class probabilities (not just top prediction)
+  return_top_k: 3                          # Return top K predictions (useful for confidence analysis)