initial setup

2025-08-06 22:45:37 +01:00
commit fef3f5ae35
42 changed files with 7147 additions and 0 deletions
@@ -0,0 +1 @@
+# Base configuration for all tasks
@@ -0,0 +1,79 @@
+# Comprehensive Custom Dataset Classification Configuration
+# This file defines all parameters for processing custom classification datasets
+# Organized by level: data processing, model, training, and inference
+
+# Task Configuration
+task:
+  name: 'classification'           # Task kind; one of: classification, completion, styling, matching
+  type: 'sequence_classification'  # Model type, e.g. sequence_classification or token_classification
+
+# Data Processing Configuration
+data:
+  source: 'custom'             # Origin of the data: 'huggingface' or 'custom'
+  dataset_name: null           # HuggingFace dataset name; unused for custom data
+  data_path: './data/classification/train.jsonl'  # Custom data file; required when source is 'custom'
+  data_format: 'jsonl'         # File format: 'jsonl', 'csv', or 'json'
+
+  # Field mapping
+  input_field: 'text'          # Field that holds the input text
+  label_field: 'label'         # Field that holds the labels
+  id_field: 'id'               # Optional ID field; set to null when the data has none
+
+  # Processing parameters
+  max_samples: 1000            # Upper bound on samples processed; null means all samples
+  train_split: 0.8             # Share of data used for training (0.0 to 1.0)
+  validation_split: 0.1        # Share of data used for validation (0.0 to 1.0)
+  test_split: 0.1              # Share of data used for testing (0.0 to 1.0)
+
+  # Text preprocessing
+  clean_text: true             # Clean and normalize the text
+  remove_special_chars: false  # Strip special characters from the text
+  lowercase: true              # Lowercase the text
+  min_length: 10               # Texts shorter than this are filtered out
+  max_length: 1000             # Texts longer than this are truncated
+
+  # Label processing
+  label_encoding: 'auto'       # How labels are encoded: 'auto', 'numeric', or 'string'
+  multilabel: false            # Turn on multilabel classification
+  label_separator: ','         # Separator between labels in multilabel datasets
+
+  # Output configuration
+  output_format: 'classification'  # One of: 'classification', 'instruction', 'conversation', 'qa'
+  output_dir: './data/processed/classification/custom_dataset'  # Output directory for this custom dataset
+
+  # HuggingFace-specific settings (not used for custom data)
+  hf_split: 'train'            # Which HuggingFace dataset split to use
+  hf_cache_dir: null           # HuggingFace cache directory; null selects the default
+
+  # Split configuration (advanced)
+  test_split_from: 'train'     # Test split source: 'train', 'use_test_if_available', 'use_val_if_available'
+  val_split_from: 'train'      # Validation split source: 'train', 'use_val_if_available'
+
+  # Custom-data-specific settings
+  encoding: 'utf-8'            # Character encoding of the custom data file
+  delimiter: ','               # Field delimiter for CSV files
+
+# Model Configuration
+model:
+  name: 'bert-base-uncased'  # Model identifier on the HuggingFace Hub
+  max_length: 512            # Longest sequence length used for tokenization
+  num_labels: 3              # Count of classification labels; adjust to match your data
+
+# Training Configuration
+training:
+  num_epochs: 3                            # Number of training epochs
+  batch_size: 16                           # Training batch size
+  learning_rate: 2.0e-5                    # Typical range 1.0e-5 to 5.0e-5; keep the ".0" so YAML 1.1 loaders (e.g. PyYAML) parse a float, not a string
+  weight_decay: 0.01                       # Weight decay for optimizer (typical range: 0.01 to 0.1)
+  lr_scheduler_type: "linear"              # Scheduler type: "linear", "cosine", "polynomial"
+  warmup_ratio: 0.1                        # Warmup ratio for scheduler (0.0 to 1.0)
+  data_dir: "./data/processed/classification/custom_dataset"  # Directory containing train/validation/test JSONL files
+  output_dir: "./results/classification/custom_model"  # Output directory for saved model
+
+# Inference Configuration
+inference:
+  model_path: './results/classification/custom_model'  # Path to the saved model directory
+  device: 'auto'              # Compute device: 'auto', 'cuda', or 'cpu'
+  batch_size: 32              # Batch size used at inference time
+  return_probabilities: true  # Return the probabilities of all classes
+  return_top_k: 3             # Return the top K predictions
@@ -0,0 +1,79 @@
+# Comprehensive Classification Configuration
+# This file defines all parameters for emotion classification using the dair-ai/emotion dataset
+# Organized by level: data processing, model, training, and inference
+
+# Task Configuration
+task:
+  name: 'classification'           # Task kind; one of: classification, completion, styling, matching
+  type: 'sequence_classification'  # Model type, e.g. sequence_classification or token_classification
+
+# Data Processing Configuration
+data:
+  source: 'huggingface'            # Origin of the data: 'huggingface' or 'custom'
+  dataset_name: 'dair-ai/emotion'  # HuggingFace dataset name; required when source is 'huggingface'
+  data_path: null                  # Custom data file; only required when source is 'custom'
+  data_format: 'jsonl'             # Custom-data file format: 'jsonl', 'csv', or 'json'
+
+  # Field mapping
+  input_field: 'text'          # Field that holds the input text
+  label_field: 'label'         # Field that holds the labels
+  id_field: null               # Optional ID field name
+
+  # Processing parameters
+  max_samples: 1000            # Upper bound on samples processed; null means all samples
+  train_split: 0.8             # Share of data used for training (0.0 to 1.0)
+  validation_split: 0.1        # Share of data used for validation (0.0 to 1.0)
+  test_split: 0.1              # Share of data used for testing (0.0 to 1.0)
+
+  # Text preprocessing
+  clean_text: true             # Clean and normalize the text
+  remove_special_chars: false  # Strip special characters from the text
+  lowercase: true              # Lowercase the text
+  min_length: 10               # Texts shorter than this are filtered out
+  max_length: 1000             # Texts longer than this are truncated
+
+  # Label processing
+  label_encoding: 'auto'       # How labels are encoded: 'auto', 'numeric', or 'string'
+  multilabel: false            # Turn on multilabel classification
+  label_separator: ','         # Separator between labels in multilabel datasets
+
+  # Output configuration
+  output_format: 'classification'  # One of: 'classification', 'instruction', 'conversation', 'qa'
+  output_dir: './data/processed/classification/emotion'  # Output directory for this dataset
+
+  # HuggingFace-specific settings
+  hf_split: 'train'            # Which HuggingFace dataset split to use
+  hf_cache_dir: null           # HuggingFace cache directory; null selects the default
+
+  # Split configuration (advanced)
+  test_split_from: 'train'     # Test split source: 'train', 'use_test_if_available', 'use_val_if_available'
+  val_split_from: 'train'      # Validation split source: 'train', 'use_val_if_available'
+
+  # Custom-data-specific settings
+  encoding: 'utf-8'            # Character encoding of a custom data file
+  delimiter: ','               # Field delimiter for CSV files
+
+# Model Configuration
+model:
+  name: 'bert-base-uncased'  # Model identifier on the HuggingFace Hub
+  max_length: 512            # Longest sequence length used for tokenization
+  num_labels: 6              # Count of classification labels
+
+# Training Configuration
+training:
+  num_epochs: 3                            # Number of training epochs
+  batch_size: 16                           # Training batch size
+  learning_rate: 2.0e-5                    # Typical range 1.0e-5 to 5.0e-5; keep the ".0" so YAML 1.1 loaders (e.g. PyYAML) parse a float, not a string
+  weight_decay: 0.01                       # Weight decay for optimizer (typical range: 0.01 to 0.1)
+  lr_scheduler_type: "linear"              # Scheduler type: "linear", "cosine", "polynomial"
+  warmup_ratio: 0.1                        # Warmup ratio for scheduler (0.0 to 1.0)
+  data_dir: "./data/processed/classification/emotion"  # Directory containing train/validation/test JSONL files
+  output_dir: "./results/classification/emotion_model"  # Output directory for saved model
+
+# Inference Configuration
+inference:
+  model_path: './results/classification/emotion_model'  # Path to the saved model directory
+  device: 'auto'              # Compute device: 'auto', 'cuda', or 'cpu'
+  batch_size: 32              # Batch size used at inference time
+  return_probabilities: true  # Return the probabilities of all classes
+  return_top_k: 3             # Return the top K predictions
@@ -0,0 +1,29 @@
+task:
+  name: 'completion'       # Task kind for this config
+  type: 'text_generation'  # Model type paired with the completion task
+
+data:
+  source: 'huggingface'               # Data source is HuggingFace rather than a custom file
+  dataset_name: 'wikitext-2-raw-v1'   # NOTE(review): looks like a config of the 'wikitext' dataset; confirm the loader accepts it as a name
+  input_field: 'text'                 # Field that holds the input text
+  max_length: 512                     # Maximum text length; TODO confirm the unit (characters vs tokens)
+  train_split: 0.8                    # Training split ratio
+  validation_split: 0.1               # Validation split ratio
+  test_split: 0.1                     # Test split ratio
+
+model:
+  name: 'gpt2'     # Model identifier
+  max_length: 512  # Maximum sequence length for the model
+
+training:
+  num_epochs: 3                # Number of training epochs
+  batch_size: 8                # Training batch size
+  learning_rate: 5.0e-5        # Keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read "5e-5" as a string, not a float
+  weight_decay: 0.01           # Weight decay for the optimizer
+  warmup_ratio: 0.1            # Warmup ratio for the scheduler
+  lr_scheduler_type: "linear"  # Learning-rate scheduler type
+
+inference:
+  batch_size: 16       # Batch size used at inference time
+  max_new_tokens: 100  # Presumably the cap on generated tokens per request; verify against the consumer
+  temperature: 0.7     # Presumably the sampling temperature; verify against the consumer
@@ -0,0 +1,30 @@
+task:
+  name: 'matching'           # Task kind for this config
+  type: 'semantic_matching'  # Model type paired with the matching task
+
+data:
+  source: 'huggingface'       # Data source is HuggingFace rather than a custom file
+  dataset_name: 'sentence-transformers/paraphrase-MiniLM-L3-v2'  # NOTE(review): this ID looks like a model, not a dataset; confirm
+  input_field: 'sentence1'    # Field that holds the input text
+  target_field: 'sentence2'   # Presumably the second sentence of each pair; verify against the consumer
+  label_field: 'label'        # Field that holds the labels
+  max_length: 128             # Maximum text length; TODO confirm the unit (characters vs tokens)
+  train_split: 0.8            # Training split ratio
+  validation_split: 0.1       # Validation split ratio
+  test_split: 0.1             # Test split ratio
+
+model:
+  name: 'sentence-transformers/all-MiniLM-L6-v2'  # Model identifier
+  max_length: 128                                 # Maximum sequence length for the model
+
+training:
+  num_epochs: 3                # Number of training epochs
+  batch_size: 32               # Training batch size
+  learning_rate: 2.0e-5        # Keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read "2e-5" as a string, not a float
+  weight_decay: 0.01           # Weight decay for the optimizer
+  warmup_ratio: 0.1            # Warmup ratio for the scheduler
+  lr_scheduler_type: "linear"  # Learning-rate scheduler type
+
+inference:
+  batch_size: 64             # Batch size used at inference time
+  similarity_threshold: 0.5  # Presumably the score cutoff for declaring a match; verify against the consumer
@@ -0,0 +1,29 @@
+task:
+  name: 'styling'         # Task kind for this config
+  type: 'style_transfer'  # Model type paired with the styling task
+
+data:
+  source: 'custom'        # Custom data source; NOTE(review): no data_path is set in this block, confirm how the file is located
+  input_field: 'text'     # Field that holds the input text
+  style_field: 'style'    # Presumably the field naming the target style; verify against the consumer
+  max_length: 256         # Maximum text length; TODO confirm the unit (characters vs tokens)
+  train_split: 0.8        # Training split ratio
+  validation_split: 0.1   # Validation split ratio
+  test_split: 0.1         # Test split ratio
+
+model:
+  name: 't5-base'  # Model identifier
+  max_length: 256  # Maximum sequence length for the model
+
+training:
+  num_epochs: 3                # Number of training epochs
+  batch_size: 16               # Training batch size
+  learning_rate: 3.0e-5        # Keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read "3e-5" as a string, not a float
+  weight_decay: 0.01           # Weight decay for the optimizer
+  warmup_ratio: 0.1            # Warmup ratio for the scheduler
+  lr_scheduler_type: "linear"  # Learning-rate scheduler type
+
+inference:
+  batch_size: 32       # Batch size used at inference time
+  max_new_tokens: 128  # Presumably the cap on generated tokens per request; verify against the consumer
+  temperature: 0.8     # Presumably the sampling temperature; verify against the consumer