initial setup

This commit is contained in:
OwusuBlessing
2025-08-06 22:45:37 +01:00
commit fef3f5ae35
42 changed files with 7147 additions and 0 deletions
+225
View File
@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Classification Data Processor Script
Uses YAML configurations for flexible and maintainable data processing.
"""
import sys
import os
import subprocess
import argparse
from pathlib import Path
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification data-processor pipeline with a YAML config.

    Builds a command line for ``pipelines/classification/data_processor.py``
    (a relative path, so it is resolved against the current working
    directory), runs it as a subprocess and prints its captured output.

    Args:
        config_path: Value passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options appended after the config. Each key is
            converted to a ``--dashed-name`` option followed by
            ``str(value)``; entries whose value is ``None`` are skipped.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    # Use the interpreter that is running this script so the pipeline sees
    # the same environment; a bare "python" can resolve to a different
    # interpreter or be absent from PATH.
    cmd = [
        sys.executable or "python",
        "pipelines/classification/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured and only printed once the process has finished.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
def run_emotion_example():
    """Run the emotion classification data-processing example.

    Invokes the pipeline with ``configs/classification/emotion.yaml`` while
    overriding ``max_samples`` and ``output_dir`` from the command line.

    Returns:
        The success flag reported by ``run_with_yaml_config`` so callers can
        react to a failed run.
    """
    print("=== Emotion Classification Example ===")
    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        max_samples=500,  # Override YAML value
        output_dir="./data/emotion_small",
    )
    if success:
        print("✅ Emotion classification data processing completed!")
    else:
        print("❌ Emotion classification failed!")
    return success
def run_custom_example():
    """Run the custom-dataset data-processing example.

    The example only runs when ``data/classification/train.jsonl`` exists
    (relative to the current working directory); otherwise a warning is
    printed and the pipeline is not invoked.

    Returns:
        True if the pipeline ran and succeeded, False if it failed or the
        dataset file was not found.
    """
    print("\n=== Custom Dataset Example ===")

    # Guard clause: nothing to process without the input file.
    if not os.path.exists("data/classification/train.jsonl"):
        print("⚠️ Custom dataset not found, skipping...")
        return False

    success = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_source="custom",
        data_path="data/classification/train.jsonl",
        output_dir="./data/custom_processed",
    )
    if success:
        print("✅ Custom dataset processing completed!")
    else:
        print("❌ Custom dataset processing failed!")
    return success
def create_custom_config():
    """Write a sample custom-dataset configuration file.

    The template is written to ``configs/classification/custom.yaml``
    (relative to the current working directory), overwriting any existing
    file. The parent directory is created first so the call also works in a
    fresh checkout where ``configs/classification`` does not exist yet.
    """
    # NOTE(review): the section headers (task/data/processing/output) and
    # their settings all sit at the same indentation level in this template —
    # confirm this is the layout the pipeline's config loader expects.
    custom_config = """task:
name: "classification"
type: "sequence_classification"
data:
source: "custom"
data_format: "jsonl"
input_field: "text"
label_field: "label"
max_samples: 1000
train_split: 0.8
validation_split: 0.1
test_split: 0.1
processing:
clean_text: true
lowercase: true
min_length: 10
max_length: 1000
output:
output_dir: "./data/custom_processed"
output_format: "classification"
"""
    config_path = "configs/classification/custom.yaml"
    # open(..., 'w') does not create missing directories.
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    # Fix the encoding so the file does not depend on the platform default.
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(custom_config)
    print(f"✅ Created custom config: {config_path}")
def show_usage():
    """Print the numbered usage examples for the data-processor script."""
    usage_lines = (
        "=== Classification Data Processor Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
        "",
        "4. Run examples:",
        " python scripts/classification/data_processor.py examples",
        "",
        "5. Create custom config:",
        " python scripts/classification/data_processor.py create-config",
    )
    # One print of the joined text emits the same lines as one print per entry.
    print("\n".join(usage_lines))
def handle_direct_args():
    """Parse this script's command line and forward it to the pipeline.

    The options declared here mirror those of
    ``pipelines/classification/data_processor.py``. Every option that has a
    value is re-emitted on the pipeline command line: ``store_true`` flags
    are forwarded only when set, all other options as ``--name value``.
    ``--log-level`` has a default and is therefore always forwarded.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Classification Data Processor")
    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
    args = parser.parse_args()

    # Build command to call the pipeline. Use the running interpreter rather
    # than whatever "python" happens to be on PATH.
    cmd = [sys.executable or "python", "pipelines/classification/data_processor.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        option = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Only add flag if True
                cmd.append(option)
        else:
            cmd.extend([option, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
def main():
    """Entry point: dispatch on the first command-line argument.

    * no arguments: print an overview of the available commands;
    * ``examples`` / ``emotion`` / ``custom`` / ``create-config`` / ``help``:
      run the matching helper;
    * anything else: treat the command line as pipeline options and forward
      it via ``handle_direct_args``. If that reports failure the process
      exits with status 1 so shells and CI jobs can detect the error.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Data Processor",
            "============================",
            "",
            "This script processes classification datasets using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/data_processor.py examples # Run examples",
            " python scripts/classification/data_processor.py emotion # Run emotion example",
            " python scripts/classification/data_processor.py custom # Run custom example",
            " python scripts/classification/data_processor.py create-config # Create custom config",
            " python scripts/classification/data_processor.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
            " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return

    command = sys.argv[1]
    if command == "examples":
        run_emotion_example()
        run_custom_example()
    elif command == "emotion":
        run_emotion_example()
    elif command == "custom":
        run_custom_example()
    elif command == "create-config":
        create_custom_config()
    elif command == "help":
        show_usage()
    elif not handle_direct_args():
        # Pass-through mode: surface a pipeline failure in the exit status
        # instead of always exiting 0.
        sys.exit(1)


if __name__ == "__main__":
    main()
+260
View File
@@ -0,0 +1,260 @@
#!/usr/bin/env python3
"""
Classification Inference Script
Uses YAML configurations for flexible and maintainable model inference.
"""
import sys
import os
import subprocess
import argparse
from pathlib import Path
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification inference pipeline with a YAML config.

    Builds a command line for ``pipelines/classification/inference.py`` (a
    relative path, so it is resolved against the current working directory),
    runs it as a subprocess and prints its captured output.

    Args:
        config_path: Value passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options appended after the config. Each key is
            converted to a ``--dashed-name`` option followed by
            ``str(value)``; entries whose value is ``None`` are skipped.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    print("=== Running Classification Inference ===")
    print(f"Config: {config_path}")

    # Use the interpreter that is running this script so the pipeline sees
    # the same environment; a bare "python" can resolve to a different
    # interpreter or be absent from PATH.
    cmd = [
        sys.executable or "python",
        "pipelines/classification/inference.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured and only printed once the process has finished.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running inference: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Inference completed successfully!")
    print(result.stdout)
    return True
def run_single_text_inference():
    """Classify one hard-coded sentence with the trained emotion model.

    Returns False without invoking the pipeline when
    ``./results/emotion_model`` does not exist; otherwise returns the success
    flag reported by ``run_with_yaml_config``.
    """
    print("=== Single Text Inference ===")

    trained_model_dir = "./results/emotion_model"
    if os.path.exists(trained_model_dir):
        # Keyword order is preserved, so the generated command line is the
        # same as when the overrides are written inline.
        overrides = {
            "model_path": trained_model_dir,
            "input_text": "I love this product! It's amazing.",
            "return_top_k": 3,
        }
        ok = run_with_yaml_config("configs/classification/emotion.yaml", **overrides)
        print("✅ Single text inference completed!" if ok else "❌ Single text inference failed!")
        return ok

    # No trained model available: tell the user how to get one.
    print(f"⚠️ Model not found: {trained_model_dir}")
    print("Please train a model first using the trainer script.")
    return False
def run_file_inference():
    """Run batch inference over a small sample text file.

    Returns False without touching the filesystem when
    ``./results/emotion_model`` does not exist. Otherwise writes four sample
    sentences to ``sample_texts.txt`` (one per line), asks the pipeline to
    write predictions to ``predictions.jsonl`` and returns the success flag
    reported by ``run_with_yaml_config``.
    """
    print("\n=== File-Based Inference ===")

    trained_model_dir = "./results/emotion_model"
    if os.path.exists(trained_model_dir):
        # Create sample input file
        samples_path = "sample_texts.txt"
        with open(samples_path, 'w') as handle:
            handle.writelines(
                sentence + '\n'
                for sentence in (
                    "I love this product! It's amazing.",
                    "This is terrible, I hate it.",
                    "The weather is okay today.",
                    "Best purchase ever made!",
                )
            )

        overrides = {
            "model_path": trained_model_dir,
            "input_file": samples_path,
            "output_file": "predictions.jsonl",
            "batch_size": 16,
        }
        ok = run_with_yaml_config("configs/classification/emotion.yaml", **overrides)
        if not ok:
            print("❌ File-based inference failed!")
        else:
            print("✅ File-based inference completed!")
            print("Results saved to: predictions.jsonl")
        return ok

    # No trained model available: tell the user how to get one.
    print(f"⚠️ Model not found: {trained_model_dir}")
    print("Please train a model first using the trainer script.")
    return False
def run_interactive_inference():
    """Start the inference pipeline without an input text or input file.

    Returns False without invoking the pipeline when
    ``./results/emotion_model`` does not exist; otherwise returns the success
    flag reported by ``run_with_yaml_config``.
    """
    print("\n=== Interactive Inference ===")

    trained_model_dir = "./results/emotion_model"
    if os.path.exists(trained_model_dir):
        # NOTE(review): the pipeline is presumably interactive in this mode,
        # but run_with_yaml_config captures its stdout until it exits —
        # confirm prompts are actually visible to the user.
        overrides = {"model_path": trained_model_dir, "return_top_k": 3}
        ok = run_with_yaml_config("configs/classification/emotion.yaml", **overrides)
        print("✅ Interactive inference completed!" if ok else "❌ Interactive inference failed!")
        return ok

    # No trained model available: tell the user how to get one.
    print(f"⚠️ Model not found: {trained_model_dir}")
    print("Please train a model first using the trainer script.")
    return False
def create_inference_config():
    """Write a sample inference configuration file.

    The template is written to ``configs/classification/inference.yaml``
    (relative to the current working directory), overwriting any existing
    file. The parent directory is created first so the call also works when
    ``configs/classification`` does not exist yet.
    """
    inference_config = """model_path: "./results/emotion_model"
device: "auto"
batch_size: 32
max_length: 512
return_probabilities: true
return_top_k: 3
"""
    config_path = "configs/classification/inference.yaml"
    # open(..., 'w') does not create missing directories.
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    # Fix the encoding so the file does not depend on the platform default.
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(inference_config)
    print(f"✅ Created inference config: {config_path}")
def show_usage():
    """Print the numbered usage examples for the inference script."""
    usage_lines = (
        "=== Classification Inference Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/inference.py --config configs/classification/inference.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/inference.py --config configs/classification/inference.yaml --input-text 'Your text here'",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
        "",
        "4. Run examples:",
        " python scripts/classification/inference.py examples",
        "",
        "5. Create inference config:",
        " python scripts/classification/inference.py create-config",
    )
    # One print of the joined text emits the same lines as one print per entry.
    print("\n".join(usage_lines))
def handle_direct_args():
    """Parse this script's command line and forward it to the pipeline.

    The options declared here mirror those of
    ``pipelines/classification/inference.py``. Every option that has a value
    is re-emitted on the pipeline command line: ``store_true`` flags are
    forwarded only when set, all other options as ``--name value``.
    ``--log-level`` has a default and is therefore always forwarded.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Classification Inference")
    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--model-path", type=str, help="Path to saved model directory")
    parser.add_argument("--device", choices=["auto", "cuda", "cpu"], help="Device to run inference on")
    parser.add_argument("--batch-size", type=int, help="Batch size for inference")
    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
    parser.add_argument("--return-probabilities", action="store_true", help="Return all class probabilities")
    parser.add_argument("--return-top-k", type=int, help="Return top K predictions")
    parser.add_argument("--input-text", type=str, help="Single text for prediction")
    parser.add_argument("--input-file", type=str, help="Input file path (txt or jsonl)")
    parser.add_argument("--output-file", type=str, help="Output file path for results")
    parser.add_argument("--chunk-size", type=int, help="Chunk size for large file processing")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
    args = parser.parse_args()

    # Build command to call the pipeline. Use the running interpreter rather
    # than whatever "python" happens to be on PATH.
    cmd = [sys.executable or "python", "pipelines/classification/inference.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        option = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Only add flag if True
                cmd.append(option)
        else:
            cmd.extend([option, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running inference: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Inference completed successfully!")
    print(result.stdout)
    return True
def main():
    """Entry point: dispatch on the first command-line argument.

    * no arguments: print an overview of the available commands;
    * ``examples`` / ``single`` / ``file`` / ``interactive`` /
      ``create-config`` / ``help``: run the matching helper;
    * anything else: treat the command line as pipeline options and forward
      it via ``handle_direct_args``. If that reports failure the process
      exits with status 1 so shells and CI jobs can detect the error.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Inference",
            "=======================",
            "",
            "This script performs inference using trained classification models.",
            "",
            "Usage:",
            " python scripts/classification/inference.py examples # Run examples",
            " python scripts/classification/inference.py single # Single text inference",
            " python scripts/classification/inference.py file # File-based inference",
            " python scripts/classification/inference.py interactive # Interactive inference",
            " python scripts/classification/inference.py create-config # Create inference config",
            " python scripts/classification/inference.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/inference.py --config configs/classification/inference.yaml",
            " python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return

    command = sys.argv[1]
    if command == "examples":
        run_single_text_inference()
        run_file_inference()
        run_interactive_inference()
    elif command == "single":
        run_single_text_inference()
    elif command == "file":
        run_file_inference()
    elif command == "interactive":
        run_interactive_inference()
    elif command == "create-config":
        create_inference_config()
    elif command == "help":
        show_usage()
    elif not handle_direct_args():
        # Pass-through mode: surface a pipeline failure in the exit status
        # instead of always exiting 0.
        sys.exit(1)


if __name__ == "__main__":
    main()
+204
View File
@@ -0,0 +1,204 @@
#!/usr/bin/env python3
"""
Classification Trainer Script
Uses YAML configurations for flexible and maintainable model training.
"""
import sys
import os
import subprocess
import argparse
from pathlib import Path
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification training pipeline with a YAML config.

    Builds a command line for ``pipelines/classification/train.py`` (a
    relative path, so it is resolved against the current working directory),
    runs it as a subprocess and prints its captured output.

    Args:
        config_path: Value passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options appended after the config. Each key is
            converted to a ``--dashed-name`` option followed by
            ``str(value)``; entries whose value is ``None`` are skipped.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    print("=== Running Classification Trainer ===")
    print(f"Config: {config_path}")

    # Use the interpreter that is running this script so the pipeline sees
    # the same environment; a bare "python" can resolve to a different
    # interpreter or be absent from PATH.
    cmd = [
        sys.executable or "python",
        "pipelines/classification/train.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        # Output is captured and only printed once the process has finished.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running trainer: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Training completed successfully!")
    print(result.stdout)
    return True
def run_emotion_training():
    """Run the emotion classification training example.

    Invokes the trainer with ``configs/classification/emotion.yaml`` while
    overriding ``num_epochs``, ``batch_size`` and ``output_dir`` from the
    command line.

    Returns:
        The success flag reported by ``run_with_yaml_config`` so callers can
        react to a failed run.
    """
    print("=== Emotion Classification Training ===")
    success = run_with_yaml_config(
        "configs/classification/emotion.yaml",
        num_epochs=2,  # Override YAML value
        batch_size=8,  # Smaller batch for testing
        output_dir="./results/emotion_model",
    )
    if success:
        print("✅ Emotion classification training completed!")
    else:
        print("❌ Emotion classification training failed!")
    return success
def run_custom_training():
    """Run the custom-dataset training example.

    The example only runs when ``data/custom_processed/train.jsonl`` exists
    (relative to the current working directory); otherwise a warning is
    printed and the trainer is not invoked.

    Returns:
        True if the trainer ran and succeeded, False if it failed or the
        dataset file was not found.
    """
    print("\n=== Custom Dataset Training ===")

    # Guard clause: nothing to train on without the processed dataset.
    if not os.path.exists("data/custom_processed/train.jsonl"):
        print("⚠️ Custom dataset not found, skipping...")
        return False

    success = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_dir="data/custom_processed",
        output_dir="./results/custom_model",
    )
    if success:
        print("✅ Custom dataset training completed!")
    else:
        print("❌ Custom dataset training failed!")
    return success
def create_training_config():
    """Write a sample training configuration file.

    The template is written to ``configs/classification/training.yaml``
    (relative to the current working directory), overwriting any existing
    file. The parent directory is created first so the call also works when
    ``configs/classification`` does not exist yet.
    """
    training_config = """model_name: "bert-base-uncased"
max_length: 512
num_epochs: 3
batch_size: 16
learning_rate: 2e-5
weight_decay: 0.01
lr_scheduler_type: "linear"
warmup_ratio: 0.1
data_dir: "./data/classification"
output_dir: "./results/classification_model"
"""
    config_path = "configs/classification/training.yaml"
    # open(..., 'w') does not create missing directories.
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    # Fix the encoding so the file does not depend on the platform default.
    with open(config_path, 'w', encoding="utf-8") as f:
        f.write(training_config)
    print(f"✅ Created training config: {config_path}")
def show_usage():
    """Print the numbered usage examples for the trainer script."""
    usage_lines = (
        "=== Classification Trainer Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/trainer.py --config configs/classification/emotion.yaml --num-epochs 5",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
        "",
        "4. Run examples:",
        " python scripts/classification/trainer.py examples",
        "",
        "5. Create training config:",
        " python scripts/classification/trainer.py create-config",
    )
    # One print of the joined text emits the same lines as one print per entry.
    print("\n".join(usage_lines))
def handle_direct_args():
    """Parse this script's command line and forward it to the pipeline.

    The options declared here mirror those of
    ``pipelines/classification/train.py``. Every option that has a value is
    re-emitted on the pipeline command line as ``--name value`` (boolean
    values, if any, are forwarded as bare flags only when true).
    ``--log-level`` has a default and is therefore always forwarded.

    Returns:
        True if the pipeline exited with status 0, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Classification Trainer")
    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--model-name", type=str, help="Model name from HuggingFace Hub")
    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
    parser.add_argument("--num-epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--weight-decay", type=float, help="Weight decay for optimizer")
    parser.add_argument("--lr-scheduler-type", choices=["linear", "cosine", "polynomial"], help="Learning rate scheduler type")
    parser.add_argument("--warmup-ratio", type=float, help="Warmup ratio for scheduler")
    parser.add_argument("--data-dir", type=str, help="Directory containing train/validation/test JSONL files")
    parser.add_argument("--output-dir", type=str, help="Output directory for saved model")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
    args = parser.parse_args()

    # Build command to call the pipeline. Use the running interpreter rather
    # than whatever "python" happens to be on PATH.
    cmd = [sys.executable or "python", "pipelines/classification/train.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        option = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Only add flag if True
                cmd.append(option)
        else:
            cmd.extend([option, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running trainer: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Training completed successfully!")
    print(result.stdout)
    return True
def main():
    """Entry point: dispatch on the first command-line argument.

    * no arguments: print an overview of the available commands;
    * ``examples`` / ``emotion`` / ``custom`` / ``create-config`` / ``help``:
      run the matching helper;
    * anything else: treat the command line as pipeline options and forward
      it via ``handle_direct_args``. If that reports failure the process
      exits with status 1 so shells and CI jobs can detect the error.
    """
    if len(sys.argv) <= 1:
        overview = (
            "Classification Trainer",
            "====================",
            "",
            "This script trains classification models using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/trainer.py examples # Run examples",
            " python scripts/classification/trainer.py emotion # Run emotion training",
            " python scripts/classification/trainer.py custom # Run custom training",
            " python scripts/classification/trainer.py create-config # Create training config",
            " python scripts/classification/trainer.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
            " python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return

    command = sys.argv[1]
    if command == "examples":
        run_emotion_training()
        run_custom_training()
    elif command == "emotion":
        run_emotion_training()
    elif command == "custom":
        run_custom_training()
    elif command == "create-config":
        create_training_config()
    elif command == "help":
        show_usage()
    elif not handle_direct_args():
        # Pass-through mode: surface a pipeline failure in the exit status
        # instead of always exiting 0.
        sys.exit(1)


if __name__ == "__main__":
    main()