Files
DS-LLM-TEMPLATE-FINETUNING/scripts/styling/data_processor.py
T
2025-08-13 21:17:01 +01:00

303 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Styling data processor script that uses YAML configurations.
This provides a flexible and maintainable approach for style transfer tasks.
"""
import sys
import os
import subprocess
import argparse
from pathlib import Path
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the styling pipeline script with a YAML config plus CLI overrides.

    The pipeline (``pipelines/styling/data_processor.py``, resolved relative
    to the current working directory) is launched as a child process and its
    output is captured and printed once it finishes.

    Args:
        config_path: Value passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options forwarded to the pipeline. A keyword
            ``some_name=value`` becomes ``--some-name value``. ``None`` values
            are skipped. Booleans are treated as switches: ``True`` emits the
            bare flag and ``False`` emits nothing, matching how
            ``handle_direct_args`` forwards its ``store_true`` options.

    Returns:
        ``True`` if the child process exited with status 0, else ``False``.
    """
    print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" may be absent from PATH or belong to another environment.
    cmd = [
        sys.executable, "pipelines/styling/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None or value is False:
            continue
        flag = f"--{key.replace('_', '-')}"
        if value is True:
            cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        # Output is captured, so anything the child wrote to stdout before
        # failing would otherwise be lost.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True
def run_styling_examples():
    """Run the two bundled styling examples through ``run_with_yaml_config``.

    Example 1 always runs the formal config with two overrides. Example 2
    reuses the formal config against a custom dataset, but only when
    ``data/raw/styling/custom_dataset.jsonl`` exists; otherwise a hint is
    printed and the example is skipped.
    """
    formal_config = "configs/styling/formal.yaml"
    custom_dataset = "data/raw/styling/custom_dataset.jsonl"

    # Example 1: formal style transfer, overriding two YAML values.
    print("=== Example 1: Formal Style Transfer ===")
    formal_ok = run_with_yaml_config(
        formal_config,
        max_samples=1000,
        output_format="alpaca",
    )
    if formal_ok:
        print("✅ Formal style transfer completed!")

    # Example 2: custom dataset, skipped when the file is absent.
    print("\n=== Example 2: Custom Styling Dataset ===")
    if not os.path.exists(custom_dataset):
        print("⚠️ Custom styling dataset not found, skipping...")
        print(" You can create one with the 'create-sample-data' option")
        return

    custom_ok = run_with_yaml_config(
        formal_config,  # the formal config serves as the base
        data_source="custom",
        data_path=custom_dataset,
        instruction="Rewrite the following text in a casual, friendly style",
        output_dir="./data/processed/styling/casual",
    )
    if custom_ok:
        print("✅ Custom styling dataset processing completed!")
def create_sample_styling_data():
    """Write a five-example informal-to-formal sample dataset as JSONL.

    The file is created at ``data/raw/styling/sample_formal.jsonl`` relative
    to the current working directory; missing parent directories are created.
    Each line is a JSON object with ``text`` and ``styled_text`` keys.
    """
    import json

    # (informal source, formal rewrite) pairs
    pairs = [
        ("Hey, what's up? How are you doing today?",
         "Hello, how are you doing today?"),
        ("This is really cool stuff!",
         "This is quite impressive material."),
        ("I'm gonna go to the store later.",
         "I will go to the store later."),
        ("What's the deal with this?",
         "What is the situation regarding this matter?"),
        ("That's totally awesome!",
         "That is quite remarkable!"),
    ]
    sample_data = [
        {"text": informal, "styled_text": formal}
        for informal, formal in pairs
    ]

    target_dir = Path("data/raw/styling")
    target_dir.mkdir(parents=True, exist_ok=True)
    sample_file = target_dir / "sample_formal.jsonl"

    with open(sample_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in sample_data
        )

    print(f"✅ Created sample styling dataset: {sample_file}")
    print(f" Contains {len(sample_data)} examples")
    print(" Format: text → styled_text")
    print(" Ready to use with configs/styling/formal.yaml")
def create_custom_styling_config():
    """Write a starter YAML config for professional-style transfer.

    The file is created at ``configs/styling/professional.yaml`` relative to
    the current working directory, creating parent directories as needed and
    overwriting any existing file at that path.
    """
    # Settings are nested under their section headers. A flat layout would
    # repeat `name`, `max_length` and `batch_size` as top-level keys.
    # `learning_rate` carries an explicit decimal point because YAML 1.1
    # loaders (e.g. PyYAML) resolve a bare `3e-5` as a string, not a float.
    custom_config = """task:
  name: "styling"
  type: "style_transfer"
data:
  source: "custom"
  input_field: "text"
  output_field: "styled_text"
  instruction: "Rewrite the following text in a professional business style"
  data_format: "jsonl"
  max_length: 512
  min_length: 10
  clean_text: true
  lowercase: false
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "alpaca"
  output_dir: "./data/processed/styling/professional"
model:
  name: "t5-base"
  max_length: 512
training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 3.0e-5
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler_type: "linear"
inference:
  batch_size: 32
  max_new_tokens: 128
  temperature: 0.8
"""
    config_path = "configs/styling/professional.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom styling config: {config_path}")
    print(" This config is set up for professional business style transfer")
def handle_direct_args():
    """Parse this script's command line and forward it to the styling pipeline.

    Arguments are read from ``sys.argv``. Every option that was supplied is
    passed on to ``pipelines/styling/data_processor.py`` (resolved relative to
    the current working directory). ``store_true`` switches are forwarded only
    when set. ``--log-level`` is always forwarded because it defaults to
    ``INFO``.

    Returns:
        ``True`` if the pipeline process exited with status 0, else ``False``.
    """
    parser = argparse.ArgumentParser(description="Styling Data Processor")

    # Add all the same arguments as the styling pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--output-field", type=str, help="Output field name")
    parser.add_argument("--instruction", type=str, help="Style instruction")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # HuggingFace dataset options
    parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset")
    parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" may be absent from PATH or belong to another environment.
    cmd = [sys.executable, "pipelines/styling/data_processor.py"]

    # Forward every argument that was provided, in declaration order.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None or arg_value is False:
            continue  # option not given / switch not set
        flag = f"--{arg_name.replace('_', '-')}"
        if arg_value is True:
            cmd.append(flag)  # switches take no value
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        # Output is captured, so anything the child wrote to stdout before
        # failing would otherwise be lost.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True
def show_styling_features():
    """Print the feature overview and usage examples for this script."""
    overview = [
        "=== Styling Data Processor Features ===",
        "",
        "1. **Style Transfer Tasks**:",
        " - Formal vs. Informal style",
        " - Professional vs. Casual tone",
        " - Academic vs. Conversational",
        " - Any custom style instruction",
        "",
        "2. **Data Formats Supported**:",
        " - HuggingFace datasets",
        " - Custom JSONL/CSV/JSON files",
        " - Automatic train/validation/test splits",
        "",
        "3. **Output Formats**:",
        " - Raw styling format (input/output)",
        " - Alpaca format (instruction/input/output)",
        " - HuggingFace dataset format",
        "",
        "4. **Advanced Features**:",
        " - Configurable field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        "",
    ]
    usage = [
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/styling/data_processor.py --config configs/styling/formal.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/styling/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/styling/data_processor.py create-config",
    ]
    # One joined print emits the same text as printing each line separately.
    print("\n".join(overview + usage))
def main():
    """Dispatch the command line to a subcommand or to the styling pipeline.

    With no arguments, a usage summary is printed. If the first argument is a
    known subcommand (``examples``, ``create-sample-data``, ``create-config``,
    ``features``) the matching function runs. Anything else is treated as
    direct pipeline arguments and forwarded via ``handle_direct_args``.

    Raises:
        SystemExit: With status 1 when the forwarded pipeline run reports
            failure, so shells and CI jobs can detect it.
    """
    if len(sys.argv) <= 1:
        print("\n".join([
            "Styling Data Processor",
            "=====================",
            "",
            "This script runs the styling data processor for style transfer tasks.",
            "It supports both YAML configurations and command-line overrides.",
            "",
            "Usage:",
            " python scripts/styling/data_processor.py examples # Run examples",
            " python scripts/styling/data_processor.py create-sample-data # Create sample dataset",
            " python scripts/styling/data_processor.py create-config # Create custom config",
            " python scripts/styling/data_processor.py features # Show features",
            "",
            "Direct pipeline usage:",
            " python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
            " python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl",
            "",
            "Key Features:",
            " ✅ Style transfer with custom instructions",
            " ✅ Multiple data source support",
            " ✅ YAML configuration files",
            " ✅ CLI argument overrides",
            " ✅ Automatic data splitting",
            " ✅ HuggingFace dataset export",
        ]))
        return

    # Subcommand name -> handler
    subcommands = {
        "examples": run_styling_examples,
        "create-sample-data": create_sample_styling_data,
        "create-config": create_custom_styling_config,
        "features": show_styling_features,
    }
    handler = subcommands.get(sys.argv[1])
    if handler is not None:
        handler()
        return

    # Direct arguments are passed through to the pipeline. Its failure is
    # surfaced as a non-zero exit status instead of being dropped.
    if not handle_direct_args():
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()