initial setup
This commit is contained in:
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Classification Data Processor Script
|
||||
Uses YAML configurations for flexible and maintainable data processing.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the data-processor pipeline with a YAML config plus CLI overrides.

    Args:
        config_path: Path forwarded to the pipeline via ``--config``.
        **cli_overrides: Extra options. Each key becomes a ``--kebab-case``
            flag. ``None`` and ``False`` values are skipped, ``True`` emits
            the bare flag (matching how ``store_true`` options are forwarded
            elsewhere in this script), and any other value is passed as
            ``str(value)``.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    # Launch the pipeline with the interpreter running this script instead of
    # whatever "python" resolves to on PATH (it may be absent or belong to a
    # different environment).
    cmd = [
        sys.executable, "pipelines/classification/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None or value is False:
            continue
        flag = f"--{key.replace('_', '-')}"
        if value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def run_emotion_example():
    """Process the emotion dataset example and report the outcome.

    Invokes ``run_with_yaml_config`` on ``configs/classification/emotion.yaml``
    with ``max_samples`` and ``output_dir`` overrides, then prints a success
    or failure line based on its return value.
    """
    print("=== Emotion Classification Example ===")

    overrides = {
        "max_samples": 500,  # overrides the value from the YAML file
        "output_dir": "./data/emotion_small",
    }
    succeeded = run_with_yaml_config(
        "configs/classification/emotion.yaml", **overrides
    )

    outcome = (
        "✅ Emotion classification data processing completed!"
        if succeeded
        else "❌ Emotion classification failed!"
    )
    print(outcome)
|
||||
|
||||
def run_custom_example():
    """Process a custom JSONL dataset if it is present on disk.

    When ``data/classification/train.jsonl`` does not exist, a skip notice is
    printed and nothing else happens. Otherwise ``run_with_yaml_config`` is
    called with the custom config and the result is reported.
    """
    print("\n=== Custom Dataset Example ===")

    # Guard clause: nothing to process without the training file.
    if not os.path.exists("data/classification/train.jsonl"):
        print("⚠️ Custom dataset not found, skipping...")
        return

    succeeded = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_source="custom",
        data_path="data/classification/train.jsonl",
        output_dir="./data/custom_processed",
    )

    if not succeeded:
        print("❌ Custom dataset processing failed!")
    else:
        print("✅ Custom dataset processing completed!")
|
||||
|
||||
def create_custom_config():
    """Write a starter YAML config for custom datasets.

    The file is written to ``configs/classification/custom.yaml`` relative to
    the current working directory, replacing any existing file. Missing
    parent directories are created first so the call also works in a fresh
    checkout.
    """
    # NOTE(review): the two-space nesting under each section is assumed --
    # confirm against the pipeline's config loader.
    custom_config = """task:
  name: "classification"
  type: "sequence_classification"

data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1

processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000

output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""

    config_path = "configs/classification/custom.yaml"
    # open(..., 'w') does not create directories, so make sure they exist.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")
|
||||
|
||||
def show_usage():
    """Print numbered usage examples for the data processor script."""
    usage_lines = (
        "=== Classification Data Processor Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
        "",
        "4. Run examples:",
        " python scripts/classification/data_processor.py examples",
        "",
        "5. Create custom config:",
        " python scripts/classification/data_processor.py create-config",
    )
    # One joined write yields the same text as printing each line in turn.
    print("\n".join(usage_lines))
|
||||
|
||||
def handle_direct_args():
    """Parse ``sys.argv`` and forward the given options to the pipeline script.

    The parser declares the options this wrapper forwards. Options left at
    ``None`` and boolean flags that are ``False`` are not forwarded; ``True``
    flags are forwarded bare; everything else is forwarded as
    ``--name str(value)``. Because ``--log-level`` has a non-None default it
    is always forwarded.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Classification Data Processor")

    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Run the pipeline with the interpreter executing this script rather than
    # whatever "python" resolves to on PATH.
    cmd = [sys.executable, "pipelines/classification/data_processor.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None or arg_value is False:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if arg_value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def main():
    """Dispatch on the first command-line argument and return an exit status.

    With no arguments an overview is printed. A recognised subcommand
    (``examples``, ``emotion``, ``custom``, ``create-config``, ``help``) runs
    the matching helper(s); anything else is treated as direct pipeline
    options and handed to ``handle_direct_args``.

    Returns:
        1 if any helper that ran returned ``False``, otherwise 0. Helpers
        that return ``None`` are treated as not having reported a failure.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Data Processor",
            "============================",
            "",
            "This script processes classification datasets using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/data_processor.py examples # Run examples",
            " python scripts/classification/data_processor.py emotion # Run emotion example",
            " python scripts/classification/data_processor.py custom # Run custom example",
            " python scripts/classification/data_processor.py create-config # Create custom config",
            " python scripts/classification/data_processor.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
            " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return 0

    # Subcommand name -> helpers to run, in order.
    subcommands = {
        "examples": (run_emotion_example, run_custom_example),
        "emotion": (run_emotion_example,),
        "custom": (run_custom_example,),
        "create-config": (create_custom_config,),
        "help": (show_usage,),
    }
    # Anything that is not a subcommand is passed through to the pipeline.
    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))

    # Run every helper (no short-circuit) so "examples" still attempts all of
    # them, then report failure if any helper explicitly returned False.
    outcomes = [handler() for handler in handlers]
    return 1 if any(outcome is False for outcome in outcomes) else 0


if __name__ == "__main__":
    # Propagate the status so shells and CI can detect a failed run.
    sys.exit(main())
|
||||
@@ -0,0 +1,260 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Classification Inference Script
|
||||
Uses YAML configurations for flexible and maintainable model inference.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the inference pipeline with a YAML config plus CLI overrides.

    Args:
        config_path: Path forwarded to the pipeline via ``--config``.
        **cli_overrides: Extra options. Each key becomes a ``--kebab-case``
            flag. ``None`` and ``False`` values are skipped, ``True`` emits
            the bare flag (matching how ``store_true`` options are forwarded
            elsewhere in this script), and any other value is passed as
            ``str(value)``.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    print("=== Running Classification Inference ===")
    print(f"Config: {config_path}")

    # Launch the pipeline with the interpreter running this script instead of
    # whatever "python" resolves to on PATH (it may be absent or belong to a
    # different environment).
    cmd = [
        sys.executable, "pipelines/classification/inference.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None or value is False:
            continue
        flag = f"--{key.replace('_', '-')}"
        if value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running inference: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Inference completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def run_single_text_inference():
    """Classify one hard-coded sentence with the trained emotion model.

    Returns:
        False when ``./results/emotion_model`` does not exist; otherwise the
        value returned by ``run_with_yaml_config``.
    """
    print("=== Single Text Inference ===")

    # A trained model is required; bail out early when it is missing.
    model_path = "./results/emotion_model"
    model_available = os.path.exists(model_path)
    if not model_available:
        print(f"⚠️ Model not found: {model_path}")
        print("Please train a model first using the trainer script.")
        return False

    overrides = {
        "model_path": model_path,
        "input_text": "I love this product! It's amazing.",
        "return_top_k": 3,
    }
    success = run_with_yaml_config("configs/classification/emotion.yaml", **overrides)

    print(
        "✅ Single text inference completed!"
        if success
        else "❌ Single text inference failed!"
    )
    return success
|
||||
|
||||
def run_file_inference():
    """Write a small sample input file and run file-based inference on it.

    The sample sentences are written one per line to ``sample_texts.txt`` in
    the current working directory, and the pipeline is asked to write its
    results to ``predictions.jsonl``.

    Returns:
        False when ``./results/emotion_model`` does not exist; otherwise the
        value returned by ``run_with_yaml_config``.
    """
    print("\n=== File-Based Inference ===")

    # A trained model is required; bail out early when it is missing.
    model_path = "./results/emotion_model"
    if not os.path.exists(model_path):
        print(f"⚠️ Model not found: {model_path}")
        print("Please train a model first using the trainer script.")
        return False

    # Create sample input file
    input_file = "sample_texts.txt"
    sample_texts = (
        "I love this product! It's amazing.",
        "This is terrible, I hate it.",
        "The weather is okay today.",
        "Best purchase ever made!",
    )
    with open(input_file, 'w') as f:
        f.writelines(text + '\n' for text in sample_texts)

    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        model_path=model_path,
        input_file=input_file,
        output_file="predictions.jsonl",
        batch_size=16,
    )

    if not success:
        print("❌ File-based inference failed!")
    else:
        print("✅ File-based inference completed!")
        print("Results saved to: predictions.jsonl")

    return success
|
||||
|
||||
def run_interactive_inference():
    """Run the inference pipeline without an input text or input file.

    Only ``model_path`` and ``return_top_k`` are forwarded.
    NOTE(review): presumably the pipeline prompts for input in that case --
    confirm against the pipeline script.

    Returns:
        False when ``./results/emotion_model`` does not exist; otherwise the
        value returned by ``run_with_yaml_config``.
    """
    print("\n=== Interactive Inference ===")

    # A trained model is required; bail out early when it is missing.
    model_path = "./results/emotion_model"
    model_available = os.path.exists(model_path)
    if not model_available:
        print(f"⚠️ Model not found: {model_path}")
        print("Please train a model first using the trainer script.")
        return False

    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        model_path=model_path,
        return_top_k=3,
    )

    status_line = (
        "✅ Interactive inference completed!"
        if success
        else "❌ Interactive inference failed!"
    )
    print(status_line)
    return success
|
||||
|
||||
def create_inference_config():
    """Write a starter YAML config for inference.

    The file is written to ``configs/classification/inference.yaml`` relative
    to the current working directory, replacing any existing file. Missing
    parent directories are created first so the call also works in a fresh
    checkout.
    """
    inference_config = """model_path: "./results/emotion_model"
device: "auto"
batch_size: 32
max_length: 512
return_probabilities: true
return_top_k: 3
"""

    config_path = "configs/classification/inference.yaml"
    # open(..., 'w') does not create directories, so make sure they exist.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(inference_config)

    print(f"✅ Created inference config: {config_path}")
|
||||
|
||||
def show_usage():
    """Print numbered usage examples for the inference script."""
    usage_lines = (
        "=== Classification Inference Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/inference.py --config configs/classification/inference.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/inference.py --config configs/classification/inference.yaml --input-text 'Your text here'",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
        "",
        "4. Run examples:",
        " python scripts/classification/inference.py examples",
        "",
        "5. Create inference config:",
        " python scripts/classification/inference.py create-config",
    )
    # One joined write yields the same text as printing each line in turn.
    print("\n".join(usage_lines))
|
||||
|
||||
def handle_direct_args():
    """Parse ``sys.argv`` and forward the given options to the pipeline script.

    The parser declares the options this wrapper forwards. Options left at
    ``None`` and boolean flags that are ``False`` are not forwarded; ``True``
    flags are forwarded bare; everything else is forwarded as
    ``--name str(value)``. Because ``--log-level`` has a non-None default it
    is always forwarded.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Classification Inference")

    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--model-path", type=str, help="Path to saved model directory")
    parser.add_argument("--device", choices=["auto", "cuda", "cpu"], help="Device to run inference on")
    parser.add_argument("--batch-size", type=int, help="Batch size for inference")
    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
    parser.add_argument("--return-probabilities", action="store_true", help="Return all class probabilities")
    parser.add_argument("--return-top-k", type=int, help="Return top K predictions")
    parser.add_argument("--input-text", type=str, help="Single text for prediction")
    parser.add_argument("--input-file", type=str, help="Input file path (txt or jsonl)")
    parser.add_argument("--output-file", type=str, help="Output file path for results")
    parser.add_argument("--chunk-size", type=int, help="Chunk size for large file processing")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Run the pipeline with the interpreter executing this script rather than
    # whatever "python" resolves to on PATH.
    cmd = [sys.executable, "pipelines/classification/inference.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None or arg_value is False:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if arg_value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running inference: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Inference completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def main():
    """Dispatch on the first command-line argument and return an exit status.

    With no arguments an overview is printed. A recognised subcommand
    (``examples``, ``single``, ``file``, ``interactive``, ``create-config``,
    ``help``) runs the matching helper(s); anything else is treated as direct
    pipeline options and handed to ``handle_direct_args``.

    Returns:
        1 if any helper that ran returned ``False``, otherwise 0. Helpers
        that return ``None`` are treated as not having reported a failure.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Inference",
            "=======================",
            "",
            "This script performs inference using trained classification models.",
            "",
            "Usage:",
            " python scripts/classification/inference.py examples # Run examples",
            " python scripts/classification/inference.py single # Single text inference",
            " python scripts/classification/inference.py file # File-based inference",
            " python scripts/classification/inference.py interactive # Interactive inference",
            " python scripts/classification/inference.py create-config # Create inference config",
            " python scripts/classification/inference.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/inference.py --config configs/classification/inference.yaml",
            " python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return 0

    # Subcommand name -> helpers to run, in order.
    subcommands = {
        "examples": (
            run_single_text_inference,
            run_file_inference,
            run_interactive_inference,
        ),
        "single": (run_single_text_inference,),
        "file": (run_file_inference,),
        "interactive": (run_interactive_inference,),
        "create-config": (create_inference_config,),
        "help": (show_usage,),
    }
    # Anything that is not a subcommand is passed through to the pipeline.
    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))

    # Run every helper (no short-circuit) so "examples" still attempts all of
    # them, then report failure if any helper explicitly returned False.
    outcomes = [handler() for handler in handlers]
    return 1 if any(outcome is False for outcome in outcomes) else 0


if __name__ == "__main__":
    # Propagate the status so shells and CI can detect a failed run.
    sys.exit(main())
|
||||
@@ -0,0 +1,204 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Classification Trainer Script
|
||||
Uses YAML configurations for flexible and maintainable model training.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the training pipeline with a YAML config plus CLI overrides.

    Args:
        config_path: Path forwarded to the pipeline via ``--config``.
        **cli_overrides: Extra options. Each key becomes a ``--kebab-case``
            flag. ``None`` and ``False`` values are skipped, ``True`` emits
            the bare flag (the same rule ``handle_direct_args`` applies to
            boolean values), and any other value is passed as ``str(value)``.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    print("=== Running Classification Trainer ===")
    print(f"Config: {config_path}")

    # Launch the pipeline with the interpreter running this script instead of
    # whatever "python" resolves to on PATH (it may be absent or belong to a
    # different environment).
    cmd = [
        sys.executable, "pipelines/classification/train.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None or value is False:
            continue
        flag = f"--{key.replace('_', '-')}"
        if value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running trainer: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Training completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def run_emotion_training():
    """Train the emotion classifier example and report the outcome.

    Invokes ``run_with_yaml_config`` on ``configs/classification/emotion.yaml``
    with ``num_epochs``, ``batch_size`` and ``output_dir`` overrides, then
    prints a success or failure line based on its return value.
    """
    print("=== Emotion Classification Training ===")

    overrides = {
        "num_epochs": 2,  # overrides the value from the YAML file
        "batch_size": 8,  # smaller batch for testing
        "output_dir": "./results/emotion_model",
    }
    succeeded = run_with_yaml_config(
        "configs/classification/emotion.yaml", **overrides
    )

    outcome = (
        "✅ Emotion classification training completed!"
        if succeeded
        else "❌ Emotion classification training failed!"
    )
    print(outcome)
|
||||
|
||||
def run_custom_training():
    """Train on the processed custom dataset if it is present on disk.

    When ``data/custom_processed/train.jsonl`` does not exist, a skip notice
    is printed and nothing else happens. Otherwise ``run_with_yaml_config``
    is called with the custom config and the result is reported.
    """
    print("\n=== Custom Dataset Training ===")

    # Guard clause: nothing to train on without the processed training file.
    if not os.path.exists("data/custom_processed/train.jsonl"):
        print("⚠️ Custom dataset not found, skipping...")
        return

    succeeded = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_dir="data/custom_processed",
        output_dir="./results/custom_model",
    )

    if not succeeded:
        print("❌ Custom dataset training failed!")
    else:
        print("✅ Custom dataset training completed!")
|
||||
|
||||
def create_training_config():
    """Write a starter YAML config for training.

    The file is written to ``configs/classification/training.yaml`` relative
    to the current working directory, replacing any existing file. Missing
    parent directories are created first so the call also works in a fresh
    checkout.
    """
    training_config = """model_name: "bert-base-uncased"
max_length: 512
num_epochs: 3
batch_size: 16
learning_rate: 2e-5
weight_decay: 0.01
lr_scheduler_type: "linear"
warmup_ratio: 0.1
data_dir: "./data/classification"
output_dir: "./results/classification_model"
"""

    config_path = "configs/classification/training.yaml"
    # open(..., 'w') does not create directories, so make sure they exist.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(training_config)

    print(f"✅ Created training config: {config_path}")
|
||||
|
||||
def show_usage():
    """Print numbered usage examples for the trainer script."""
    usage_lines = (
        "=== Classification Trainer Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/trainer.py --config configs/classification/emotion.yaml --num-epochs 5",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
        "",
        "4. Run examples:",
        " python scripts/classification/trainer.py examples",
        "",
        "5. Create training config:",
        " python scripts/classification/trainer.py create-config",
    )
    # One joined write yields the same text as printing each line in turn.
    print("\n".join(usage_lines))
|
||||
|
||||
def handle_direct_args():
    """Parse ``sys.argv`` and forward the given options to the pipeline script.

    The parser declares the options this wrapper forwards. Options left at
    ``None`` and boolean values that are ``False`` are not forwarded; ``True``
    values are forwarded as a bare flag; everything else is forwarded as
    ``--name str(value)``. Because ``--log-level`` has a non-None default it
    is always forwarded.

    Returns:
        True if the pipeline process exited with status 0, False if it
        exited with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Classification Trainer")

    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--model-name", type=str, help="Model name from HuggingFace Hub")
    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
    parser.add_argument("--num-epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--weight-decay", type=float, help="Weight decay for optimizer")
    parser.add_argument("--lr-scheduler-type", choices=["linear", "cosine", "polynomial"], help="Learning rate scheduler type")
    parser.add_argument("--warmup-ratio", type=float, help="Warmup ratio for scheduler")
    parser.add_argument("--data-dir", type=str, help="Directory containing train/validation/test JSONL files")
    parser.add_argument("--output-dir", type=str, help="Output directory for saved model")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Run the pipeline with the interpreter executing this script rather than
    # whatever "python" resolves to on PATH.
    cmd = [sys.executable, "pipelines/classification/train.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None or arg_value is False:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if arg_value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        # Output is captured, so it is echoed only after the process exits.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running trainer: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Training completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def main():
    """Dispatch on the first command-line argument and return an exit status.

    With no arguments an overview is printed. A recognised subcommand
    (``examples``, ``emotion``, ``custom``, ``create-config``, ``help``) runs
    the matching helper(s); anything else is treated as direct pipeline
    options and handed to ``handle_direct_args``.

    Returns:
        1 if any helper that ran returned ``False``, otherwise 0. Helpers
        that return ``None`` are treated as not having reported a failure.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Trainer",
            "====================",
            "",
            "This script trains classification models using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/trainer.py examples # Run examples",
            " python scripts/classification/trainer.py emotion # Run emotion training",
            " python scripts/classification/trainer.py custom # Run custom training",
            " python scripts/classification/trainer.py create-config # Create training config",
            " python scripts/classification/trainer.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
            " python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return 0

    # Subcommand name -> helpers to run, in order.
    subcommands = {
        "examples": (run_emotion_training, run_custom_training),
        "emotion": (run_emotion_training,),
        "custom": (run_custom_training,),
        "create-config": (create_training_config,),
        "help": (show_usage,),
    }
    # Anything that is not a subcommand is passed through to the pipeline.
    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))

    # Run every helper (no short-circuit) so "examples" still attempts all of
    # them, then report failure if any helper explicitly returned False.
    outcomes = [handler() for handler in handlers]
    return 1 if any(outcome is False for outcome in outcomes) else 0


if __name__ == "__main__":
    # Propagate the status so shells and CI can detect a failed run.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user