#!/usr/bin/env python3
"""
Styling data processor script that uses YAML configurations.
This provides a flexible and maintainable approach for style transfer tasks.
"""

import argparse
import json
import os
import shlex
import subprocess
import sys
from pathlib import Path
from typing import Any, List, Mapping, Optional, Sequence

# Pipeline entry point; resolved relative to the current working directory.
PIPELINE_SCRIPT = "pipelines/styling/data_processor.py"

# Template written by create_custom_styling_config().
# learning_rate is spelled "3.0e-5" (with a decimal point) because YAML 1.1
# loaders such as PyYAML resolve "3e-5" as a string rather than a float.
CUSTOM_CONFIG_TEMPLATE = """task:
  name: "styling"
  type: "style_transfer"
data:
  source: "custom"
  input_field: "text"
  output_field: "styled_text"
  instruction: "Rewrite the following text in a professional business style"
  data_format: "jsonl"
  max_length: 512
  min_length: 10
  clean_text: true
  lowercase: false
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "alpaca"
  output_dir: "./data/processed/styling/professional"
model:
  name: "t5-base"
  max_length: 512
training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 3.0e-5
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler_type: "linear"
inference:
  batch_size: 32
  max_new_tokens: 128
  temperature: 0.8
"""


def _pipeline_base_command() -> List[str]:
    """Return the argv prefix that launches the pipeline script."""
    # Use the interpreter running this script so the child process shares the
    # same environment; fall back to "python" if it cannot be determined.
    return [sys.executable or "python", PIPELINE_SCRIPT]


def _build_option_args(options: Mapping[str, Any]) -> List[str]:
    """Translate ``{name: value}`` pairs into ``--name value`` CLI arguments.

    ``None`` values are skipped. Booleans are treated as switches: ``True``
    emits the bare flag, ``False`` emits nothing. Underscores in names are
    replaced by hyphens.
    """
    args: List[str] = []
    for name, value in options.items():
        if value is None:
            continue
        flag = f"--{name.replace('_', '-')}"
        if isinstance(value, bool):
            if value:
                args.append(flag)
        else:
            args.extend([flag, str(value)])
    return args


def _format_command(cmd: Sequence[str]) -> str:
    """Render *cmd* for display, quoting arguments that contain spaces."""
    return " ".join(shlex.quote(part) for part in cmd)


def _run_pipeline(cmd: Sequence[str]) -> bool:
    """Run *cmd*, print its captured output, and report success as a bool."""
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True


def _print_lines(lines: Sequence[str]) -> None:
    """Print each entry of *lines* on its own line."""
    for line in lines:
        print(line)


def run_with_yaml_config(config_path: str, **cli_overrides) -> bool:
    """Run styling data processor with YAML configuration.

    Args:
        config_path: Path passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options forwarded as ``--key value``. ``None``
            values are skipped; booleans are forwarded as bare flags only
            when true.

    Returns:
        True if the pipeline process exited successfully, False otherwise.
    """
    print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")

    cmd = _pipeline_base_command() + ["--config", config_path]
    cmd.extend(_build_option_args(cli_overrides))

    print(f"Running command: {_format_command(cmd)}")
    print()
    return _run_pipeline(cmd)


def run_styling_examples() -> bool:
    """Run styling examples with YAML configs.

    Returns:
        True if every example that was attempted succeeded.
    """
    all_ok = True

    # Example 1: Formal style transfer
    print("=== Example 1: Formal Style Transfer ===")
    success = run_with_yaml_config(
        "configs/styling/formal.yaml",
        max_samples=1000,  # Override YAML value
        output_format="alpaca",
    )
    if success:
        print("✅ Formal style transfer completed!")
    all_ok = all_ok and success

    # Example 2: Custom styling dataset (if available)
    print("\n=== Example 2: Custom Styling Dataset ===")
    custom_dataset = "data/raw/styling/custom_dataset.jsonl"
    if os.path.exists(custom_dataset):
        success = run_with_yaml_config(
            "configs/styling/formal.yaml",  # Use formal config as base
            data_source="custom",
            data_path=custom_dataset,
            instruction="Rewrite the following text in a casual, friendly style",
            output_dir="./data/processed/styling/casual",
        )
        if success:
            print("✅ Custom styling dataset processing completed!")
        all_ok = all_ok and success
    else:
        print("⚠️ Custom styling dataset not found, skipping...")
        print(" You can create one with the 'create-sample-data' option")

    return all_ok


def create_sample_styling_data() -> None:
    """Create sample styling dataset for testing.

    Writes five informal/formal sentence pairs as JSON Lines to
    ``data/raw/styling/sample_formal.jsonl`` (relative to the current working
    directory), creating the directory if needed.
    """
    sample_data = [
        {
            "text": "Hey, what's up? How are you doing today?",
            "styled_text": "Hello, how are you doing today?",
        },
        {
            "text": "This is really cool stuff!",
            "styled_text": "This is quite impressive material.",
        },
        {
            "text": "I'm gonna go to the store later.",
            "styled_text": "I will go to the store later.",
        },
        {
            "text": "What's the deal with this?",
            "styled_text": "What is the situation regarding this matter?",
        },
        {
            "text": "That's totally awesome!",
            "styled_text": "That is quite remarkable!",
        },
    ]

    # Create directory structure
    data_dir = Path("data/raw/styling")
    data_dir.mkdir(parents=True, exist_ok=True)

    # Save sample data, one JSON object per line
    sample_file = data_dir / "sample_formal.jsonl"
    with open(sample_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in sample_data
        )

    print(f"✅ Created sample styling dataset: {sample_file}")
    print(f" Contains {len(sample_data)} examples")
    print(" Format: text → styled_text")
    print(" Ready to use with configs/styling/formal.yaml")


def create_custom_styling_config() -> None:
    """Create a custom styling configuration file.

    Writes the professional-style template to
    ``configs/styling/professional.yaml`` (relative to the current working
    directory), creating the directory if needed.
    """
    config_path = "configs/styling/professional.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    with open(config_path, "w", encoding="utf-8") as f:
        f.write(CUSTOM_CONFIG_TEMPLATE)

    print(f"✅ Created custom styling config: {config_path}")
    print(" This config is set up for professional business style transfer")


def handle_direct_args(argv: Optional[Sequence[str]] = None) -> bool:
    """Handle direct command-line arguments by passing them to the styling pipeline.

    Args:
        argv: Arguments to parse. Defaults to ``sys.argv[1:]`` when ``None``.

    Returns:
        True if the pipeline process exited successfully, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Styling Data Processor")

    # Add all the same arguments as the styling pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
    parser.add_argument("--input-field", type=str, help="Input field name")
    parser.add_argument("--output-field", type=str, help="Output field name")
    parser.add_argument("--instruction", type=str, help="Style instruction")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
    parser.add_argument("--min-length", type=int, help="Minimum text length")
    parser.add_argument("--max-length", type=int, help="Maximum text length")
    parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # HuggingFace dataset options
    parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset")
    parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset")

    # Logging
    parser.add_argument(
        "--log-level",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        default="INFO",
        help="Logging level",
    )

    args = parser.parse_args(argv)

    # Build command to call the styling pipeline, forwarding only the
    # arguments that were provided (or have a non-None default).
    cmd = _pipeline_base_command() + _build_option_args(vars(args))

    print(f"Running: {_format_command(cmd)}")
    print()
    return _run_pipeline(cmd)


def show_styling_features() -> None:
    """Show the features of the styling data processor."""
    _print_lines([
        "=== Styling Data Processor Features ===",
        "",
        "1. **Style Transfer Tasks**:",
        " - Formal vs. Informal style",
        " - Professional vs. Casual tone",
        " - Academic vs. Conversational",
        " - Any custom style instruction",
        "",
        "2. **Data Formats Supported**:",
        " - HuggingFace datasets",
        " - Custom JSONL/CSV/JSON files",
        " - Automatic train/validation/test splits",
        "",
        "3. **Output Formats**:",
        " - Raw styling format (input/output)",
        " - Alpaca format (instruction/input/output)",
        " - HuggingFace dataset format",
        "",
        "4. **Advanced Features**:",
        " - Configurable field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/styling/data_processor.py --config configs/styling/formal.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/styling/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/styling/data_processor.py create-config",
    ])


def _print_usage() -> None:
    """Print the top-level help shown when no arguments are given."""
    _print_lines([
        "Styling Data Processor",
        "=====================",
        "",
        "This script runs the styling data processor for style transfer tasks.",
        "It supports both YAML configurations and command-line overrides.",
        "",
        "Usage:",
        " python scripts/styling/data_processor.py examples # Run examples",
        " python scripts/styling/data_processor.py create-sample-data # Create sample dataset",
        " python scripts/styling/data_processor.py create-config # Create custom config",
        " python scripts/styling/data_processor.py features # Show features",
        "",
        "Direct pipeline usage:",
        " python scripts/styling/data_processor.py --config configs/styling/formal.yaml",
        " python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl",
        "",
        "Key Features:",
        " ✅ Style transfer with custom instructions",
        " ✅ Multiple data source support",
        " ✅ YAML configuration files",
        " ✅ CLI argument overrides",
        " ✅ Automatic data splitting",
        " ✅ HuggingFace dataset export",
    ])


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Main function.

    Dispatches on the first argument: a known subcommand runs the matching
    helper; anything else is parsed as pipeline options and forwarded. With
    no arguments the usage text is printed.

    Args:
        argv: Arguments to process. Defaults to ``sys.argv[1:]`` when ``None``.

    Returns:
        Process exit code: 0 on success, 1 if a pipeline run failed.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    if not args:
        _print_usage()
        return 0

    command = args[0]
    if command == "examples":
        return 0 if run_styling_examples() else 1
    if command == "create-sample-data":
        create_sample_styling_data()
        return 0
    if command == "create-config":
        create_custom_styling_config()
        return 0
    if command == "features":
        show_styling_features()
        return 0

    # Handle direct arguments (pass through to pipeline)
    return 0 if handle_direct_args(args) else 1


if __name__ == "__main__":
    sys.exit(main())