Files
DS-LLM-TEMPLATE-FINETUNING/scripts/classification/data_processor.py
T
OwusuBlessing fef3f5ae35 initial setupt
2025-08-06 22:45:37 +01:00

225 lines
8.8 KiB
Python

#!/usr/bin/env python3
"""
Classification Data Processor Script
Uses YAML configurations for flexible and maintainable data processing.
"""
import sys
import os
import subprocess
import argparse
from pathlib import Path
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification pipeline script with a YAML configuration.

    Builds a command line for ``pipelines/classification/data_processor.py``,
    runs it as a child process and prints the captured output.

    Args:
        config_path: Path handed to the pipeline through ``--config``.
        **cli_overrides: Extra pipeline options. Underscores in a key become
            dashes (``max_samples`` -> ``--max-samples``). ``None`` values are
            skipped. ``True`` is emitted as a bare flag and ``False`` is
            skipped, so boolean switches are not passed a stray value.

    Returns:
        True if the child process exited with status 0, otherwise False
        (non-zero exit status, or the process could not be started).
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    # Use the interpreter that is running this script so the pipeline sees
    # the same environment; fall back to PATH lookup if it is unknown.
    cmd = [
        sys.executable or "python",
        "pipelines/classification/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None or value is False:
            continue
        flag = f"--{key.replace('_', '-')}"
        if value is True:
            # Boolean switches take no value on the command line.
            cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
def run_emotion_example():
    """Run the pipeline with the emotion config and print the outcome.

    The call overrides ``max_samples`` and ``output_dir`` on top of
    ``configs/classification/emotion.yaml``.
    """
    print("=== Emotion Classification Example ===")

    overrides = {
        "max_samples": 500,  # Override YAML value
        "output_dir": "./data/emotion_small",
    }
    succeeded = run_with_yaml_config(
        "configs/classification/emotion.yaml", **overrides
    )

    outcome = (
        "✅ Emotion classification data processing completed!"
        if succeeded
        else "❌ Emotion classification failed!"
    )
    print(outcome)
def run_custom_example():
    """Run the pipeline on the local custom dataset, if it is present.

    When ``data/classification/train.jsonl`` does not exist the example is
    skipped with a warning; otherwise the pipeline is run with the custom
    config and the outcome is printed.
    """
    print("\n=== Custom Dataset Example ===")

    dataset_file = "data/classification/train.jsonl"
    if not os.path.exists(dataset_file):
        print("⚠️ Custom dataset not found, skipping...")
        return

    succeeded = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_source="custom",
        data_path=dataset_file,
        output_dir="./data/custom_processed",
    )
    if not succeeded:
        print("❌ Custom dataset processing failed!")
        return
    print("✅ Custom dataset processing completed!")
def create_custom_config():
    """Write a starter YAML configuration for a custom dataset.

    The file is written to ``configs/classification/custom.yaml`` relative to
    the current working directory, overwriting any existing file. Missing
    parent directories are created first so the command works in a fresh
    checkout.
    """
    # Settings are nested under their section keys; without the indentation
    # every section would be empty and all settings would be top-level keys.
    custom_config = """task:
  name: "classification"
  type: "sequence_classification"
data:
  source: "custom"
  data_format: "jsonl"
  input_field: "text"
  label_field: "label"
  max_samples: 1000
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
processing:
  clean_text: true
  lowercase: true
  min_length: 10
  max_length: 1000
output:
  output_dir: "./data/custom_processed"
  output_format: "classification"
"""
    config_path = "configs/classification/custom.yaml"

    # open(..., "w") does not create directories, so make them up front.
    Path(config_path).parent.mkdir(parents=True, exist_ok=True)
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")
def show_usage():
    """Print the numbered list of example invocations for this script."""
    usage_lines = [
        "=== Classification Data Processor Usage ===",
        "",
        "1. Use YAML config only:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500",
        "",
        "3. Use CLI only (backward compatibility):",
        " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
        "",
        "4. Run examples:",
        " python scripts/classification/data_processor.py examples",
        "",
        "5. Create custom config:",
        " python scripts/classification/data_processor.py create-config",
    ]
    # One print of the joined text emits the same stream as one print per line.
    print("\n".join(usage_lines))
def _build_direct_args_parser():
    """Create the argument parser that mirrors the pipeline's options."""
    parser = argparse.ArgumentParser(description="Classification Data Processor")
    # Add all the same arguments as the pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
    return parser


def _namespace_to_cli_tokens(namespace):
    """Turn parsed options back into command-line tokens for the pipeline."""
    tokens = []
    for arg_name, arg_value in vars(namespace).items():
        # Unset options are None; switches that were not given are False.
        if arg_value is None or arg_value is False:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if arg_value is True:
            tokens.append(flag)  # Only add flag if True
        else:
            tokens.extend([flag, str(arg_value)])
    return tokens


def handle_direct_args(argv=None):
    """Parse pipeline options and forward them to the pipeline script.

    Args:
        argv: Optional list of argument strings to parse. When omitted,
            ``argparse`` reads ``sys.argv[1:]``.

    Returns:
        True if the pipeline process exited with status 0, otherwise False
        (non-zero exit status, or the process could not be started).
        Invalid arguments make ``argparse`` exit the program.
    """
    args = _build_direct_args_parser().parse_args(argv)

    # Build command to call the pipeline with the interpreter running this
    # script; fall back to PATH lookup if it is unknown.
    cmd = [sys.executable or "python", "pipelines/classification/data_processor.py"]
    # Add all arguments that were provided
    cmd.extend(_namespace_to_cli_tokens(args))

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
def main():
    """Dispatch on the first command-line argument.

    With no arguments an overview is printed. A recognised subcommand runs
    its handler(s); anything else is treated as pipeline options and handed
    to ``handle_direct_args``.
    """
    cli_args = sys.argv[1:]

    # No arguments at all: print the overview and stop.
    if not cli_args:
        overview = [
            "Classification Data Processor",
            "============================",
            "",
            "This script processes classification datasets using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/data_processor.py examples # Run examples",
            " python scripts/classification/data_processor.py emotion # Run emotion example",
            " python scripts/classification/data_processor.py custom # Run custom example",
            " python scripts/classification/data_processor.py create-config # Create custom config",
            " python scripts/classification/data_processor.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
            " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        ]
        print("\n".join(overview))
        return

    # Each subcommand maps to the handlers it runs, in order.
    subcommands = {
        "examples": (run_emotion_example, run_custom_example),
        "emotion": (run_emotion_example,),
        "custom": (run_custom_example,),
        "create-config": (create_custom_config,),
        "help": (show_usage,),
    }

    handlers = subcommands.get(cli_args[0])
    if handlers is None:
        # Handle direct arguments (pass through to pipeline)
        handle_direct_args()
        return

    for handler in handlers:
        handler()
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()