---
# Comprehensive Custom Dataset Classification Configuration
# This file defines all parameters for processing custom classification
# datasets. Organized by level: data processing, model, training, and
# inference.

# Task Configuration
task:
  # Task type: classification, completion, styling, matching
  name: "classification"
  # Model type: sequence_classification, token_classification, etc.
  type: "sequence_classification"

# Data Processing Configuration
data:
  source: "custom"  # Data source: "huggingface" or "custom"
  dataset_name: null  # HuggingFace dataset name (not used for custom data)
  # Path to custom data file (required for custom source)
  data_path: "./data/classification/train.jsonl"
  data_format: "jsonl"  # Data format: "jsonl", "csv", "json"

  # Field Mapping
  input_field: "text"  # Field name containing input text
  label_field: "label"  # Field name containing labels
  id_field: "id"  # Optional ID field name (set to null if not available)

  # Processing Parameters
  # The three split ratios below sum to 1.0 (0.8 + 0.1 + 0.1).
  max_samples: 1000  # Maximum samples to process (null for all samples)
  train_split: 0.8  # Training split ratio (0.0 to 1.0)
  validation_split: 0.1  # Validation split ratio (0.0 to 1.0)
  test_split: 0.1  # Test split ratio (0.0 to 1.0)

  # Text Preprocessing
  clean_text: true  # Clean and normalize text
  remove_special_chars: false  # Remove special characters from text
  lowercase: true  # Convert text to lowercase
  min_length: 10  # Minimum text length (filter out shorter texts)
  max_length: 1000  # Maximum text length (truncate longer texts)

  # Label Processing
  label_encoding: "auto"  # Label encoding: "auto", "numeric", "string"
  multilabel: false  # Enable multilabel classification
  label_separator: ","  # Separator for multilabel datasets

  # Output Configuration
  # Output format: "classification", "instruction", "conversation", "qa"
  output_format: "classification"
  # Specific output directory for custom dataset
  output_dir: "./data/processed/classification/custom_dataset"

  # HuggingFace Specific (not used for custom data)
  hf_split: "train"  # HuggingFace dataset split to use
  hf_cache_dir: null  # HuggingFace cache directory (null for default)

  # Split Configuration (Advanced)
  # Source for test split: "train", "use_test_if_available",
  # "use_val_if_available"
  test_split_from: "train"
  # Source for validation split: "train", "use_val_if_available"
  val_split_from: "train"

  # Custom Data Specific
  encoding: "utf-8"  # File encoding for custom data
  delimiter: ","  # Delimiter for CSV files

# Model Configuration
model:
  name: "bert-base-uncased"  # Model name from HuggingFace Hub
  max_length: 512  # Maximum sequence length for tokenization
  # Number of classification labels (adjust based on your data)
  num_labels: 3

# Training Configuration
training:
  num_epochs: 3  # Number of training epochs
  batch_size: 16  # Training batch size
  # Learning rate (typical range: 1e-5 to 5e-5).
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `2e-5` as a string, not a float.
  learning_rate: 2.0e-5
  # Weight decay for optimizer (typical range: 0.01 to 0.1)
  weight_decay: 0.01
  # Scheduler type: "linear", "cosine", "polynomial"
  lr_scheduler_type: "linear"
  warmup_ratio: 0.1  # Warmup ratio for scheduler (0.0 to 1.0)
  # Directory containing train/validation/test JSONL files
  # (same path as data.output_dir above)
  data_dir: "./data/processed/classification/custom_dataset"
  # Output directory for saved model
  output_dir: "./results/classification/custom_model"

# Inference Configuration
inference:
  # Path to saved model directory (same path as training.output_dir above)
  model_path: "./results/classification/custom_model"
  device: "auto"  # Device: "auto", "cuda", "cpu"
  batch_size: 32  # Batch size for inference
  return_probabilities: true  # Return all class probabilities
  return_top_k: 3  # Return top K predictions