303 lines
12 KiB
Python
303 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Styling data processor script that uses YAML configurations.

This provides a flexible and maintainable approach for style transfer tasks.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import subprocess
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the styling data processor pipeline with a YAML configuration.

    Launches ``pipelines/styling/data_processor.py`` as a child process, waits
    for it to finish, and prints its captured output.

    Args:
        config_path: Path to the YAML configuration file, passed as ``--config``.
        **cli_overrides: Extra pipeline options. Each keyword becomes a
            ``--kebab-case`` option. ``None`` values are skipped, ``True``
            becomes a bare flag, ``False`` is omitted, and any other value is
            forwarded as ``--option str(value)``.

    Returns:
        True if the child process exited with status 0, False if it exited
        with a non-zero status or could not be started.
    """
    print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")

    # Use the interpreter running this script so the child shares its
    # environment; a bare "python" may be missing from PATH or point at a
    # different installation. Fall back only if sys.executable is unset.
    cmd = [
        sys.executable or "python", "pipelines/styling/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None:
            continue
        option = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # Booleans are forwarded as bare flags, the same way
            # handle_direct_args forwards its store_true options.
            if value:
                cmd.append(option)
        else:
            cmd.extend([option, str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True
|
|
|
|
def run_styling_examples():
    """Run the example styling jobs driven by YAML configs.

    Example 1 runs ``configs/styling/formal.yaml`` with two overrides.
    Example 2 runs only when ``data/raw/styling/custom_dataset.jsonl`` exists;
    otherwise it is skipped and a hint is printed.
    """
    custom_dataset = "data/raw/styling/custom_dataset.jsonl"

    # Example 1: Formal style transfer
    print("=== Example 1: Formal Style Transfer ===")
    success = run_with_yaml_config(
        "configs/styling/formal.yaml",
        max_samples=1000,  # Override YAML value
        output_format="alpaca",
    )
    if success:
        print("✅ Formal style transfer completed!")

    # Example 2: Custom styling dataset (if available)
    print("\n=== Example 2: Custom Styling Dataset ===")
    if not os.path.exists(custom_dataset):
        print("⚠️ Custom styling dataset not found, skipping...")
        # 'create-sample-data' writes sample_formal.jsonl, not the file this
        # example looks for, so spell out both paths instead of implying the
        # command alone makes this example runnable.
        print(f" Expected file: {custom_dataset}")
        print(" Note: 'create-sample-data' writes data/raw/styling/sample_formal.jsonl;"
              " copy it to the path above to try this example")
        return

    success = run_with_yaml_config(
        "configs/styling/formal.yaml",  # Use formal config as base
        data_source="custom",
        data_path=custom_dataset,
        instruction="Rewrite the following text in a casual, friendly style",
        output_dir="./data/processed/styling/casual",
    )
    if success:
        print("✅ Custom styling dataset processing completed!")
|
|
|
|
def create_sample_styling_data():
    """Write a five-example JSONL styling dataset for testing.

    Each record has a ``text`` field and a ``styled_text`` field. The file is
    saved as ``data/raw/styling/sample_formal.jsonl``; the directory is
    created if it does not exist yet.
    """
    import json

    # (text, styled_text) pairs
    pairs = (
        ("Hey, what's up? How are you doing today?",
         "Hello, how are you doing today?"),
        ("This is really cool stuff!",
         "This is quite impressive material."),
        ("I'm gonna go to the store later.",
         "I will go to the store later."),
        ("What's the deal with this?",
         "What is the situation regarding this matter?"),
        ("That's totally awesome!",
         "That is quite remarkable!"),
    )
    sample_data = [
        {"text": original, "styled_text": rewritten}
        for original, rewritten in pairs
    ]

    # Make sure the target directory exists before writing.
    target_dir = Path("data/raw/styling")
    target_dir.mkdir(parents=True, exist_ok=True)
    sample_file = target_dir / "sample_formal.jsonl"

    # One JSON object per line.
    payload = "".join(
        json.dumps(record, ensure_ascii=False) + '\n' for record in sample_data
    )
    sample_file.write_text(payload, encoding='utf-8')

    print(f"✅ Created sample styling dataset: {sample_file}")
    print(f" Contains {len(sample_data)} examples")
    print(" Format: text → styled_text")
    print(" Ready to use with configs/styling/formal.yaml")
|
|
|
|
def create_custom_styling_config():
    """Create a custom styling configuration file.

    Writes a YAML config for professional business style transfer to
    ``configs/styling/professional.yaml``, creating the directory if needed.
    An existing file at that path is overwritten.
    """
    # learning_rate is written with an explicit decimal point: YAML 1.1
    # loaders such as PyYAML resolve "3e-5" as a string, while "3.0e-5" is a
    # float under both YAML 1.1 and 1.2.
    custom_config = """task:
  name: "styling"
  type: "style_transfer"

data:
  source: "custom"
  input_field: "text"
  output_field: "styled_text"
  instruction: "Rewrite the following text in a professional business style"
  data_format: "jsonl"
  max_length: 512
  min_length: 10
  clean_text: true
  lowercase: false
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "alpaca"
  output_dir: "./data/processed/styling/professional"

model:
  name: "t5-base"
  max_length: 512

training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 3.0e-5
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler_type: "linear"

inference:
  batch_size: 32
  max_new_tokens: 128
  temperature: 0.8
"""

    config_path = "configs/styling/professional.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom styling config: {config_path}")
    print(" This config is set up for professional business style transfer")
|
|
|
|
def handle_direct_args(argv=None):
    """Parse pipeline options and forward them to the styling pipeline.

    The parsed options are re-serialized onto a command line for
    ``pipelines/styling/data_processor.py``, which is run as a child process.
    Options left unset are not forwarded; ``store_true`` flags are forwarded
    only when set. ``--log-level`` has a default and is therefore always
    forwarded.

    Args:
        argv: Argument list to parse. Defaults to ``None``, in which case
            argparse reads ``sys.argv[1:]``.

    Returns:
        True if the child process exited with status 0, False if it exited
        with a non-zero status or could not be started.
    """
    parser = argparse.ArgumentParser(description="Styling Data Processor")

    # Add all the same arguments as the styling pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--output-field", type=str, help="Output field name")
    parser.add_argument("--instruction", type=str, help="Style instruction")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # HuggingFace dataset options
    parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset")
    parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args(argv)

    # Build command to call the styling pipeline. Use the interpreter running
    # this script so the child shares its environment; a bare "python" may be
    # missing from PATH or point at a different installation.
    cmd = [sys.executable or "python", "pipelines/styling/data_processor.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        option = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Only add flag if True
                cmd.append(option)
        else:
            cmd.extend([option, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True
|
|
|
|
def show_styling_features():
    """Print an overview of the processor's features and usage examples."""
    script = "python scripts/styling/data_processor.py"
    overview = (
        "=== Styling Data Processor Features ===",
        "",
        "1. **Style Transfer Tasks**:",
        " - Formal vs. Informal style",
        " - Professional vs. Casual tone",
        " - Academic vs. Conversational",
        " - Any custom style instruction",
        "",
        "2. **Data Formats Supported**:",
        " - HuggingFace datasets",
        " - Custom JSONL/CSV/JSON files",
        " - Automatic train/validation/test splits",
        "",
        "3. **Output Formats**:",
        " - Raw styling format (input/output)",
        " - Alpaca format (instruction/input/output)",
        " - HuggingFace dataset format",
        "",
        "4. **Advanced Features**:",
        " - Configurable field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        f" {script} --config configs/styling/formal.yaml",
        "",
        "2. Override YAML values:",
        f" {script} --config configs/styling/formal.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        f" {script} create-sample-data",
        "",
        "4. Create custom config:",
        f" {script} create-config",
    )
    # One print of the joined text yields the same output as a print per line.
    print("\n".join(overview))
|
|
|
|
def main():
    """Dispatch on the command line and return a process exit code.

    With no arguments, prints the usage overview. If the first argument is
    one of the known subcommands (``examples``, ``create-sample-data``,
    ``create-config``, ``features``), runs it. Anything else is treated as
    pipeline options and handed to ``handle_direct_args``.

    Returns:
        0 on success, 1 when the forwarded pipeline run reports failure.
    """
    if len(sys.argv) <= 1:
        usage = (
            "Styling Data Processor",
            "=====================",
            "",
            "This script runs the styling data processor for style transfer tasks.",
            "It supports both YAML configurations and command-line overrides.",
            "",
            "Usage:",
            " python scripts/styling/data_processor.py examples # Run examples",
            " python scripts/styling/data_processor.py create-sample-data # Create sample dataset",
            " python scripts/styling/data_processor.py create-config # Create custom config",
            " python scripts/styling/data_processor.py features # Show features",
            "",
            "Direct pipeline usage:",
            " python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
            " python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl",
            "",
            "Key Features:",
            " ✅ Style transfer with custom instructions",
            " ✅ Multiple data source support",
            " ✅ YAML configuration files",
            " ✅ CLI argument overrides",
            " ✅ Automatic data splitting",
            " ✅ HuggingFace dataset export",
        )
        for line in usage:
            print(line)
        return 0

    # Subcommands handled by this script itself.
    subcommands = {
        "examples": run_styling_examples,
        "create-sample-data": create_sample_styling_data,
        "create-config": create_custom_styling_config,
        "features": show_styling_features,
    }
    handler = subcommands.get(sys.argv[1])
    if handler is not None:
        handler()
        return 0

    # Handle direct arguments (pass through to pipeline). Propagate failure
    # so shells and CI see a non-zero exit status when the pipeline fails.
    return 0 if handle_direct_args() else 1


if __name__ == "__main__":
    sys.exit(main())
|