2025-08-06 22:45:37 +01:00
|
|
|
# Comprehensive Classification Configuration
|
|
|
|
|
# This file defines all parameters for emotion classification using the dair-ai/emotion dataset
|
2025-08-13 21:17:01 +01:00
|
|
|
# Organized by level: task, data processing, model, training, and inference
|
2025-08-06 22:45:37 +01:00
|
|
|
|
|
|
|
|
# Task Configuration
|
|
|
|
|
task:
  # Task type; one of: classification, completion, styling, matching.
  name: 'classification'
  # Model type, e.g. sequence_classification or token_classification.
  type: 'sequence_classification'
|
|
|
|
|
|
|
|
|
|
# Data Processing Configuration
|
|
|
|
|
data:
  # Data source: "huggingface" or "custom".
  source: 'huggingface'
  # HuggingFace dataset name (required for the huggingface source).
  dataset_name: 'dair-ai/emotion'
  # Path to a custom data file (required for the custom source).
  data_path: null
  # Format of custom data: "jsonl", "csv", or "json".
  data_format: 'jsonl'

  # Field Mapping
  # Field holding the input text to be classified.
  input_field: 'text'
  # Field holding the classification labels.
  label_field: 'label'
  # Optional ID field for tracking individual samples.
  id_field: null

  # Processing Parameters
  # Maximum number of samples to process (null processes all samples).
  max_samples: 1000
  # Training split ratio (0.0 to 1.0).
  train_split: 0.8
  # Validation split ratio (0.0 to 1.0).
  validation_split: 0.1
  # Test split ratio (0.0 to 1.0).
  test_split: 0.1

  # Text Preprocessing
  # Clean and normalize text (extra spaces, quotes, etc.).
  clean_text: true
  # Strip special characters; kept off for emotion analysis.
  remove_special_chars: false
  # Lowercase the text (standard for BERT models).
  lowercase: true
  # Minimum text length; shorter texts are filtered out.
  min_length: 10
  # Maximum text length; longer texts are truncated.
  max_length: 1000

  # Label Processing
  # Label encoding: "auto", "numeric", or "string".
  label_encoding: 'auto'
  # Multilabel classification; false means a single emotion per text.
  multilabel: false
  # Separator between labels in multilabel datasets.
  label_separator: ','

  # Output Configuration
  # Output format: "classification", "instruction", "conversation", "qa".
  output_format: 'classification'
  # Directory receiving the processed data and splits.
  output_dir: './data/processed/classification/emotion'

  # HuggingFace Specific
  # HuggingFace dataset split used as the base.
  hf_split: 'train'
  # HuggingFace cache directory (null uses ~/.cache/huggingface).
  hf_cache_dir: null

  # Split Configuration (Advanced)
  # Source of the test split: "train", "use_test_if_available",
  # or "use_val_if_available".
  test_split_from: 'train'
  # Source of the validation split: "train" or "use_val_if_available".
  val_split_from: 'train'

  # Custom Data Specific
  # File encoding for custom data files.
  encoding: 'utf-8'
  # Delimiter for CSV files (comma for standard CSV).
  delimiter: ','
|
2025-08-06 22:45:37 +01:00
|
|
|
|
|
|
|
|
# Model Configuration
|
|
|
|
|
model:
  # Model name on the HuggingFace Hub (a good fit for text classification).
  name: 'bert-base-uncased'
  # Maximum sequence length for tokenization (the BERT limit).
  max_length: 512
  # Number of classification labels, i.e. emotion categories.
  num_labels: 6
|
2025-08-06 22:45:37 +01:00
|
|
|
|
|
|
|
|
# Training Configuration
|
|
|
|
|
training:
  # Number of training epochs (adjust based on dataset size).
  num_epochs: 3
  # Training batch size (adjust based on GPU memory).
  batch_size: 16
  # Learning rate; typical fine-tuning range is 1e-5 to 5e-5.
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) resolve a bare `2e-5` as the string "2e-5", not a float.
  learning_rate: 2.0e-5
  # Weight decay for the optimizer (prevents overfitting).
  weight_decay: 0.01
  # Scheduler type: "linear", "cosine", or "polynomial".
  lr_scheduler_type: 'linear'
  # Warmup ratio for the scheduler (0.0 to 1.0).
  warmup_ratio: 0.1
  # Directory containing the train/validation/test JSONL files.
  data_dir: './data/processed/classification/emotion'
  # Output directory for the saved model and checkpoints.
  output_dir: './results/classification/emotion_model'
|
2025-08-06 22:45:37 +01:00
|
|
|
|
|
|
|
|
# Inference Configuration
|
|
|
|
|
inference:
  # Path to the saved model directory.
  model_path: './results/classification/emotion_model'
  # Device: "auto", "cuda", or "cpu" (auto detects the best available).
  device: 'auto'
  # Batch size for inference (can be larger than for training).
  batch_size: 32
  # Return all class probabilities, not just the top prediction.
  return_probabilities: true
  # Number of top predictions to return (useful for confidence analysis).
  return_top_k: 3
|