#!/usr/bin/env python3
"""
Classification Data Processor Script

Uses YAML configurations for flexible and maintainable data processing.
"""
|
||
|
|
import sys
|
||
|
|
import os
|
||
|
|
import subprocess
|
||
|
|
import argparse
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the classification data-processor pipeline with a YAML config.

    Launches ``pipelines/classification/data_processor.py`` as a child
    process, passing ``--config`` followed by any overrides, and echoes the
    command and the child's captured output.

    Args:
        config_path: Path to the YAML configuration file; forwarded as the
            value of ``--config``.
        **cli_overrides: Extra pipeline options. ``some_key=value`` becomes
            ``--some-key value``. ``None`` values are skipped. Booleans are
            treated as switches (``True`` emits the bare flag, ``False``
            emits nothing), the same convention ``handle_direct_args`` uses
            when forwarding its store_true options.

    Returns:
        True if the child process exited with status 0; False if it exited
        with a non-zero status or could not be started at all.
    """
    print("=== Running Classification Data Processor ===")
    print(f"Config: {config_path}")

    cmd = [
        # Use the interpreter running this script so the child shares its
        # environment; a bare "python" may be absent from PATH or resolve to
        # a different installation.
        sys.executable or "python",
        "pipelines/classification/data_processor.py",
        "--config",
        str(config_path),
    ]

    # Append CLI overrides in the order the caller supplied them.
    for key, value in cli_overrides.items():
        if value is None:
            continue
        flag = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # A switch takes no value: "--flag True" would not parse as one.
            if value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched (missing, not
        # executable, ...); report it the same way as a failed run.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
|
||
|
|
|
||
|
|
def run_emotion_example():
    """Run the emotion-classification demo and report the outcome.

    Calls ``run_with_yaml_config`` with the emotion YAML config, overriding
    ``max_samples`` (500) and ``output_dir`` (``./data/emotion_small``), then
    prints a success or failure line. Returns nothing.
    """
    print("=== Emotion Classification Example ===")

    overrides = {
        "max_samples": 500,  # replaces the value from the YAML file
        "output_dir": "./data/emotion_small",
    }
    finished_ok = run_with_yaml_config(
        "configs/classification/emotion.yaml", **overrides
    )

    outcome = (
        "✅ Emotion classification data processing completed!"
        if finished_ok
        else "❌ Emotion classification failed!"
    )
    print(outcome)
|
||
|
|
|
||
|
|
def run_custom_example():
    """Run the custom-dataset demo if the training file is present.

    Looks for ``data/classification/train.jsonl`` relative to the current
    working directory. When it is missing, prints a skip notice and returns.
    Otherwise calls ``run_with_yaml_config`` with the custom YAML config and
    prints a success or failure line. Returns nothing.
    """
    print("\n=== Custom Dataset Example ===")

    train_file = "data/classification/train.jsonl"

    # Nothing to process without the dataset; bail out early.
    if not os.path.exists(train_file):
        print("⚠️ Custom dataset not found, skipping...")
        return

    finished_ok = run_with_yaml_config(
        "configs/classification/custom.yaml",
        data_source="custom",
        data_path=train_file,
        output_dir="./data/custom_processed",
    )

    if finished_ok:
        print("✅ Custom dataset processing completed!")
    else:
        print("❌ Custom dataset processing failed!")
|
||
|
|
|
||
|
|
def create_custom_config(config_path="configs/classification/custom.yaml"):
    """Write a template YAML configuration for a custom JSONL dataset.

    Any missing parent directories of ``config_path`` are created first, so
    the function also works in a checkout where the config directory does
    not exist yet. An existing file at ``config_path`` is overwritten.

    Args:
        config_path: Destination file. Defaults to
            ``configs/classification/custom.yaml``.

    Returns:
        None. A confirmation line naming the written file is printed.
    """
    # Options are nested under their section header with a two-space indent.
    custom_config = (
        'task:\n'
        '  name: "classification"\n'
        '  type: "sequence_classification"\n'
        '\n'
        'data:\n'
        '  source: "custom"\n'
        '  data_format: "jsonl"\n'
        '  input_field: "text"\n'
        '  label_field: "label"\n'
        '  max_samples: 1000\n'
        '  train_split: 0.8\n'
        '  validation_split: 0.1\n'
        '  test_split: 0.1\n'
        '\n'
        'processing:\n'
        '  clean_text: true\n'
        '  lowercase: true\n'
        '  min_length: 10\n'
        '  max_length: 1000\n'
        '\n'
        'output:\n'
        '  output_dir: "./data/custom_processed"\n'
        '  output_format: "classification"\n'
    )

    # os.makedirs("") raises, so only create a parent when there is one.
    parent_dir = os.path.dirname(config_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the file identical across platforms.
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(custom_config)

    print(f"✅ Created custom config: {config_path}")
|
||
|
|
|
||
|
|
def show_usage():
    """Print numbered example invocations of the data processor script.

    Emits a heading followed by five entries; each entry is a blank line, a
    numbered title, and the matching command line. Returns nothing.
    """
    recipes = (
        (
            "1. Use YAML config only:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
        ),
        (
            "2. Override YAML values:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500",
        ),
        (
            "3. Use CLI only (backward compatibility):",
            " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
        ),
        (
            "4. Run examples:",
            " python scripts/classification/data_processor.py examples",
        ),
        (
            "5. Create custom config:",
            " python scripts/classification/data_processor.py create-config",
        ),
    )

    print("=== Classification Data Processor Usage ===")
    for title, command in recipes:
        print()  # blank separator before every entry
        print(title)
        print(command)
|
||
|
|
|
||
|
|
def handle_direct_args(argv=None):
    """Parse pipeline options and forward them to the pipeline script.

    The options are parsed with argparse (which exits the process on
    invalid input) and re-emitted on the command line of
    ``pipelines/classification/data_processor.py``, run as a child process.
    Options left unset are not forwarded; store_true switches are forwarded
    only when given. ``--log-level`` has a default of ``INFO`` and is
    therefore always forwarded.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        True if the child process exited with status 0; False if it exited
        with a non-zero status or could not be started at all.
    """
    parser = argparse.ArgumentParser(description="Classification Data Processor")

    # Mirror of the options accepted by the pipeline script.
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--label-field", type=str, help="Label field name")
    parser.add_argument("--id-field", type=str, help="Optional ID field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args(argv)

    # Use the interpreter running this script so the child shares its
    # environment; a bare "python" may be absent from PATH or resolve to a
    # different installation.
    cmd = [sys.executable or "python", "pipelines/classification/data_processor.py"]

    # Re-emit every option that carries a value.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # switches are passed bare, and only when set
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched; report it the same
        # way as a failed run instead of letting the exception escape.
        print(f"❌ Error running data processor: {e}")
        return False

    print("✅ Data processing completed successfully!")
    print(result.stdout)
    return True
|
||
|
|
|
||
|
|
def main():
    """Entry point: dispatch on the first command-line argument.

    With no arguments an overview of the script is printed. The first
    argument may be one of the subcommands ``examples``, ``emotion``,
    ``custom``, ``create-config`` or ``help``; anything else is treated as
    pipeline options and handed to ``handle_direct_args``.

    Returns:
        Process exit status: 1 when pipeline options were forwarded and
        ``handle_direct_args`` reported failure, otherwise 0.
    """
    cli_args = sys.argv[1:]

    if not cli_args:
        overview = (
            "Classification Data Processor",
            "============================",
            "",
            "This script processes classification datasets using YAML configurations.",
            "",
            "Usage:",
            " python scripts/classification/data_processor.py examples # Run examples",
            " python scripts/classification/data_processor.py emotion # Run emotion example",
            " python scripts/classification/data_processor.py custom # Run custom example",
            " python scripts/classification/data_processor.py create-config # Create custom config",
            " python scripts/classification/data_processor.py help # Show usage",
            "",
            "Direct pipeline usage:",
            " python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
            " python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
            "",
            "Benefits of YAML configurations:",
            " ✅ Easier to manage complex configurations",
            " ✅ Version control friendly",
            " ✅ Self-documenting",
            " ✅ Can still override with CLI args",
            " ✅ Better for team collaboration",
        )
        print("\n".join(overview))
        return 0

    command = cli_args[0]
    if command == "examples":
        run_emotion_example()
        run_custom_example()
    elif command == "emotion":
        run_emotion_example()
    elif command == "custom":
        run_custom_example()
    elif command == "create-config":
        create_custom_config()
    elif command == "help":
        show_usage()
    else:
        # Not a subcommand: pass the options through to the pipeline and
        # surface a failed run through the exit status.
        return 0 if handle_direct_args() else 1

    return 0


if __name__ == "__main__":
    # Propagate the status so shells and CI can detect a failed run.
    sys.exit(main())
|