initial setupt
This commit is contained in:
@@ -0,0 +1 @@
|
||||
# Base configuration for all tasks
|
||||
@@ -0,0 +1,79 @@
|
||||
# Comprehensive Custom Dataset Classification Configuration
|
||||
# This file defines all parameters for processing custom classification datasets
|
||||
# Organized by level: data processing, model, training, and inference
|
||||
|
||||
# Task Configuration: names the task this file configures and the model type used for it
|
||||
task:
|
||||
name: "classification" # Task type; one of: classification, completion, styling, matching
|
||||
type: "sequence_classification" # Model type, e.g. sequence_classification, token_classification
|
||||
|
||||
# Data Processing Configuration: where the raw data comes from and how it is prepared
|
||||
data:
|
||||
source: "custom" # Data source: "huggingface" or "custom"
|
||||
dataset_name: null # HuggingFace dataset name (not used for custom data)
|
||||
data_path: "./data/classification/train.jsonl" # Path to custom data file (required for custom source)
|
||||
data_format: "jsonl" # Data format: "jsonl", "csv", "json"
|
||||
|
||||
# Field Mapping
|
||||
input_field: "text" # Field name containing input text
|
||||
label_field: "label" # Field name containing labels
|
||||
id_field: "id" # Optional ID field name (set to null if not available)
|
||||
|
||||
# Processing Parameters (the three split ratios below sum to 1.0)
|
||||
max_samples: 1000 # Maximum samples to process (null for all samples)
|
||||
train_split: 0.8 # Training split ratio (0.0 to 1.0)
|
||||
validation_split: 0.1 # Validation split ratio (0.0 to 1.0)
|
||||
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||
|
||||
# Text Preprocessing
|
||||
clean_text: true # Clean and normalize text
|
||||
remove_special_chars: false # Remove special characters from text
|
||||
lowercase: true # Convert text to lowercase
|
||||
min_length: 10 # Minimum text length (filter out shorter texts); NOTE(review): unit not stated, presumably characters - confirm
|
||||
max_length: 1000 # Maximum text length (truncate longer texts); separate from model.max_length, which applies to tokenization
|
||||
|
||||
# Label Processing
|
||||
label_encoding: "auto" # Label encoding: "auto", "numeric", "string"
|
||||
multilabel: false # Enable multilabel classification
|
||||
label_separator: "," # Separator for multilabel datasets (multilabel is false in this file)
|
||||
|
||||
# Output Configuration
|
||||
output_format: "classification" # Output format: "classification", "instruction", "conversation", "qa"
|
||||
output_dir: "./data/processed/classification/custom_dataset" # Specific output directory for custom dataset; same path as training.data_dir
|
||||
|
||||
# HuggingFace Specific (not used for custom data)
|
||||
hf_split: "train" # HuggingFace dataset split to use
|
||||
hf_cache_dir: null # HuggingFace cache directory (null for default)
|
||||
|
||||
# Split Configuration (Advanced)
|
||||
test_split_from: "train" # Source for test split: "train", "use_test_if_available", "use_val_if_available"
|
||||
val_split_from: "train" # Source for validation split: "train", "use_val_if_available"
|
||||
|
||||
# Custom Data Specific
|
||||
encoding: "utf-8" # File encoding for custom data
|
||||
delimiter: "," # Delimiter for CSV files (data_format is "jsonl" in this file)
|
||||
|
||||
# Model Configuration: which pretrained model to load and how many labels it predicts
|
||||
model:
|
||||
name: "bert-base-uncased" # Model name from HuggingFace Hub
|
||||
max_length: 512 # Maximum sequence length for tokenization (separate from data.max_length, which limits raw text length)
|
||||
num_labels: 3 # Number of classification labels (adjust based on your data)
|
||||
|
||||
# Training Configuration: optimizer/scheduler hyperparameters and the input/output directories
|
||||
training:
|
||||
num_epochs: 3 # Number of training epochs
|
||||
batch_size: 16 # Training batch size
|
||||
learning_rate: 2.0e-5 # Learning rate (typical range: 1e-5 to 5e-5); the decimal point keeps YAML 1.1 loaders such as PyYAML from reading it as a string
|
||||
weight_decay: 0.01 # Weight decay for optimizer (typical range: 0.01 to 0.1)
|
||||
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||
warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)
|
||||
data_dir: "./data/processed/classification/custom_dataset" # Directory containing train/validation/test JSONL files; same path as data.output_dir
|
||||
output_dir: "./results/classification/custom_model" # Output directory for saved model; same path as inference.model_path
|
||||
|
||||
# Inference Configuration: which saved model to load and what the predictions should include
|
||||
inference:
|
||||
model_path: "./results/classification/custom_model" # Path to saved model directory; same path as training.output_dir
|
||||
device: "auto" # Device: "auto", "cuda", "cpu"
|
||||
batch_size: 32 # Batch size for inference
|
||||
return_probabilities: true # Return all class probabilities
|
||||
return_top_k: 3 # Return top K predictions (3 equals model.num_labels in this file)
|
||||
@@ -0,0 +1,79 @@
|
||||
# Comprehensive Classification Configuration
|
||||
# This file defines all parameters for emotion classification using the dair-ai/emotion dataset
|
||||
# Organized by level: data processing, model, training, and inference
|
||||
|
||||
# Task Configuration: names the task this file configures and the model type used for it
|
||||
task:
|
||||
name: "classification" # Task type; one of: classification, completion, styling, matching
|
||||
type: "sequence_classification" # Model type, e.g. sequence_classification, token_classification
|
||||
|
||||
# Data Processing Configuration: where the raw data comes from and how it is prepared
|
||||
data:
|
||||
source: "huggingface" # Data source: "huggingface" or "custom"
|
||||
dataset_name: "dair-ai/emotion" # HuggingFace dataset name (required for huggingface source)
|
||||
data_path: null # Path to custom data file (required for custom source; null here because source is "huggingface")
|
||||
data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data)
|
||||
|
||||
# Field Mapping
|
||||
input_field: "text" # Field name containing input text
|
||||
label_field: "label" # Field name containing labels
|
||||
id_field: null # Optional ID field name (null: no ID field is mapped)
|
||||
|
||||
# Processing Parameters (the three split ratios below sum to 1.0)
|
||||
max_samples: 1000 # Maximum samples to process (null for all samples)
|
||||
train_split: 0.8 # Training split ratio (0.0 to 1.0)
|
||||
validation_split: 0.1 # Validation split ratio (0.0 to 1.0)
|
||||
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||
|
||||
# Text Preprocessing
|
||||
clean_text: true # Clean and normalize text
|
||||
remove_special_chars: false # Remove special characters from text
|
||||
lowercase: true # Convert text to lowercase
|
||||
min_length: 10 # Minimum text length (filter out shorter texts); NOTE(review): unit not stated, presumably characters - confirm
|
||||
max_length: 1000 # Maximum text length (truncate longer texts); separate from model.max_length, which applies to tokenization
|
||||
|
||||
# Label Processing
|
||||
label_encoding: "auto" # Label encoding: "auto", "numeric", "string"
|
||||
multilabel: false # Enable multilabel classification
|
||||
label_separator: "," # Separator for multilabel datasets (multilabel is false in this file)
|
||||
|
||||
# Output Configuration
|
||||
output_format: "classification" # Output format: "classification", "instruction", "conversation", "qa"
|
||||
output_dir: "./data/processed/classification/emotion" # Specific output directory for this dataset; same path as training.data_dir
|
||||
|
||||
# HuggingFace Specific
|
||||
hf_split: "train" # HuggingFace dataset split to use
|
||||
hf_cache_dir: null # HuggingFace cache directory (null for default)
|
||||
|
||||
# Split Configuration (Advanced)
|
||||
test_split_from: "train" # Source for test split: "train", "use_test_if_available", "use_val_if_available"
|
||||
val_split_from: "train" # Source for validation split: "train", "use_val_if_available"
|
||||
|
||||
# Custom Data Specific (source is "huggingface" in this file)
|
||||
encoding: "utf-8" # File encoding for custom data
|
||||
delimiter: "," # Delimiter for CSV files
|
||||
|
||||
# Model Configuration: which pretrained model to load and how many labels it predicts
|
||||
model:
|
||||
name: "bert-base-uncased" # Model name from HuggingFace Hub
|
||||
max_length: 512 # Maximum sequence length for tokenization (separate from data.max_length, which limits raw text length)
|
||||
num_labels: 6 # Number of classification labels; NOTE(review): presumably the label count of dair-ai/emotion - confirm
|
||||
|
||||
# Training Configuration: optimizer/scheduler hyperparameters and the input/output directories
|
||||
training:
|
||||
num_epochs: 3 # Number of training epochs
|
||||
batch_size: 16 # Training batch size
|
||||
learning_rate: 2.0e-5 # Learning rate (typical range: 1e-5 to 5e-5); the decimal point keeps YAML 1.1 loaders such as PyYAML from reading it as a string
|
||||
weight_decay: 0.01 # Weight decay for optimizer (typical range: 0.01 to 0.1)
|
||||
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||
warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)
|
||||
data_dir: "./data/processed/classification/emotion" # Directory containing train/validation/test JSONL files; same path as data.output_dir
|
||||
output_dir: "./results/classification/emotion_model" # Output directory for saved model; same path as inference.model_path
|
||||
|
||||
# Inference Configuration: which saved model to load and what the predictions should include
|
||||
inference:
|
||||
model_path: "./results/classification/emotion_model" # Path to saved model directory; same path as training.output_dir
|
||||
device: "auto" # Device: "auto", "cuda", "cpu"
|
||||
batch_size: 32 # Batch size for inference
|
||||
return_probabilities: true # Return all class probabilities
|
||||
return_top_k: 3 # Return top K predictions (3 of the 6 labels set in model.num_labels)
|
||||
@@ -0,0 +1,29 @@
|
||||
task:  # Task configuration
|
||||
name: "completion"  # Task type; the classification configs list: classification, completion, styling, matching
|
||||
type: "text_generation"  # Model type
|
||||
|
||||
data:  # Data processing configuration
|
||||
source: "huggingface"  # Data source: "huggingface" or "custom"
|
||||
dataset_name: "wikitext-2-raw-v1"  # HuggingFace dataset name. NOTE(review): this looks like a config name of the "wikitext" dataset rather than a dataset id - confirm the loader resolves it
|
||||
input_field: "text"  # Field name containing input text
|
||||
max_length: 512  # NOTE(review): unit not stated (characters vs tokens); same value as model.max_length - confirm
|
||||
train_split: 0.8  # Training split ratio; the three ratios here sum to 1.0
|
||||
validation_split: 0.1  # Validation split ratio
|
||||
test_split: 0.1  # Test split ratio
|
||||
|
||||
model:  # Model configuration
|
||||
name: "gpt2"  # Model name; presumably a HuggingFace Hub id, as in the classification configs
|
||||
max_length: 512  # Presumably the maximum sequence length for tokenization - confirm
|
||||
|
||||
training:  # Training configuration
|
||||
num_epochs: 3  # Number of training epochs
|
||||
batch_size: 8  # Training batch size
|
||||
learning_rate: 5.0e-5  # Learning rate; the decimal point keeps YAML 1.1 loaders such as PyYAML from reading it as a string
|
||||
weight_decay: 0.01  # Weight decay for optimizer
|
||||
warmup_ratio: 0.1  # Warmup ratio for scheduler (0.0 to 1.0)
|
||||
lr_scheduler_type: "linear"  # Scheduler type; the classification configs list "linear", "cosine", "polynomial"
|
||||
|
||||
inference:  # Inference configuration
|
||||
batch_size: 16  # Batch size for inference
|
||||
max_new_tokens: 100  # NOTE(review): presumably the cap on generated tokens per input - confirm against the inference code
|
||||
temperature: 0.7  # NOTE(review): presumably the sampling temperature for generation - confirm against the inference code
|
||||
@@ -0,0 +1,30 @@
|
||||
task:  # Task configuration
|
||||
name: "matching"  # Task type; the classification configs list: classification, completion, styling, matching
|
||||
type: "semantic_matching"  # Model type
|
||||
|
||||
data:  # Data processing configuration
|
||||
source: "huggingface"  # Data source: "huggingface" or "custom"
|
||||
dataset_name: "sentence-transformers/paraphrase-MiniLM-L3-v2"  # NOTE(review): this id looks like a sentence-transformers model, not a dataset - confirm the intended dataset
|
||||
input_field: "sentence1"  # Field name containing input text
|
||||
target_field: "sentence2"  # NOTE(review): presumably the field holding the second text of each pair - confirm
|
||||
label_field: "label"  # Field name containing labels
|
||||
max_length: 128  # NOTE(review): unit not stated (characters vs tokens); same value as model.max_length - confirm
|
||||
train_split: 0.8  # Training split ratio; the three ratios here sum to 1.0
|
||||
validation_split: 0.1  # Validation split ratio
|
||||
test_split: 0.1  # Test split ratio
|
||||
|
||||
model:  # Model configuration
|
||||
name: "sentence-transformers/all-MiniLM-L6-v2"  # Model name; presumably a HuggingFace Hub id, as in the classification configs
|
||||
max_length: 128  # Presumably the maximum sequence length for tokenization - confirm
|
||||
|
||||
training:  # Training configuration
|
||||
num_epochs: 3  # Number of training epochs
|
||||
batch_size: 32  # Training batch size
|
||||
learning_rate: 2.0e-5  # Learning rate; the decimal point keeps YAML 1.1 loaders such as PyYAML from reading it as a string
|
||||
weight_decay: 0.01  # Weight decay for optimizer
|
||||
warmup_ratio: 0.1  # Warmup ratio for scheduler (0.0 to 1.0)
|
||||
lr_scheduler_type: "linear"  # Scheduler type; the classification configs list "linear", "cosine", "polynomial"
|
||||
|
||||
inference:  # Inference configuration
|
||||
batch_size: 64  # Batch size for inference
|
||||
similarity_threshold: 0.5  # NOTE(review): presumably the score cutoff for treating a pair as a match - confirm against the inference code
|
||||
@@ -0,0 +1,29 @@
|
||||
task:  # Task configuration
|
||||
name: "styling"  # Task type; the classification configs list: classification, completion, styling, matching
|
||||
type: "style_transfer"  # Model type
|
||||
|
||||
data:  # Data processing configuration
|
||||
source: "custom"  # Data source: "huggingface" or "custom". NOTE(review): the custom classification config marks data_path as required for custom sources, but none is set here - confirm
|
||||
input_field: "text"  # Field name containing input text
|
||||
style_field: "style"  # NOTE(review): presumably the field naming the style of each example - confirm
|
||||
max_length: 256  # NOTE(review): unit not stated (characters vs tokens); same value as model.max_length - confirm
|
||||
train_split: 0.8  # Training split ratio; the three ratios here sum to 1.0
|
||||
validation_split: 0.1  # Validation split ratio
|
||||
test_split: 0.1  # Test split ratio
|
||||
|
||||
model:  # Model configuration
|
||||
name: "t5-base"  # Model name; presumably a HuggingFace Hub id, as in the classification configs
|
||||
max_length: 256  # Presumably the maximum sequence length for tokenization - confirm
|
||||
|
||||
training:  # Training configuration
|
||||
num_epochs: 3  # Number of training epochs
|
||||
batch_size: 16  # Training batch size
|
||||
learning_rate: 3.0e-5  # Learning rate; the decimal point keeps YAML 1.1 loaders such as PyYAML from reading it as a string
|
||||
weight_decay: 0.01  # Weight decay for optimizer
|
||||
warmup_ratio: 0.1  # Warmup ratio for scheduler (0.0 to 1.0)
|
||||
lr_scheduler_type: "linear"  # Scheduler type; the classification configs list "linear", "cosine", "polynomial"
|
||||
|
||||
inference:  # Inference configuration
|
||||
batch_size: 32  # Batch size for inference
|
||||
max_new_tokens: 128  # NOTE(review): presumably the cap on generated tokens per input - confirm against the inference code
|
||||
temperature: 0.8  # NOTE(review): presumably the sampling temperature for generation - confirm against the inference code
|
||||
Reference in New Issue
Block a user