added style mimicking pipelines

2025-08-13 21:17:01 +01:00
parent fd54d4be39
commit 710d074b47
31 changed files with 3816 additions and 46 deletions
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+"""
+Styling Training Script
+Provides a command-line interface to run the styling training pipeline
+"""
+
+import sys
+import os
+import subprocess
+import argparse
+from pathlib import Path
+
+def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
+    """Run the styling training pipeline with YAML configuration"""
+    print(f"Starting styling training with config: {config_path}")
+    if dataset_path:
+        print(f"Training dataset: {dataset_path}")
+    else:
+        print("Training dataset: Will use output_dir from YAML config")
+    print()
+    
+    # Build command
+    cmd = ["python", "pipelines/styling/train.py", "--config", config_path]
+    
+    # Add dataset path if provided
+    if dataset_path:
+        cmd.extend(["--dataset", dataset_path])
+    
+    # Add CLI overrides
+    for key, value in cli_overrides.items():
+        if value is not None:
+            if key == "output_dir":
+                cmd.extend(["--output-dir", str(value)])
+            elif key == "epochs":
+                cmd.extend(["--epochs", str(value)])
+            elif key == "batch_size":
+                cmd.extend(["--batch-size", str(value)])
+            elif key == "learning_rate":
+                cmd.extend(["--learning-rate", str(value)])
+            elif key == "max_steps":
+                cmd.extend(["--max-steps", str(value)])
+    
+    print(f"Running: {' '.join(cmd)}")
+    print()
+    
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+        print("Training completed successfully!")
+        print(result.stdout)
+        return True
+    except subprocess.CalledProcessError as e:
+        print(f"Training failed: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+
+def show_training_features():
+    """Show the features of the styling training pipeline"""
+    print("=== Styling Training Pipeline Features ===")
+    print()
+    print("1. **Model Support**:")
+    print("   - Unsloth optimized models (4x faster)")
+    print("   - LoRA fine-tuning for efficiency")
+    print("   - Support for Llama-3.1, Mistral, Phi-3, Gemma")
+    print()
+    print("2. **Training Features**:")
+    print("   - SFTTrainer with instruction tuning")
+    print("   - Automatic mixed precision (FP16/BF16)")
+    print("   - Gradient checkpointing for memory efficiency")
+    print("   - Configurable LoRA parameters")
+    print()
+    print("3. **Configuration**:")
+    print("   - YAML configuration files")
+    print("   - CLI argument overrides")
+    print("   - Automatic device detection")
+    print()
+    print("4. **Output**:")
+    print("   - Saved LoRA models")
+    print("   - Training logs and checkpoints")
+    print("   - Ready for inference")
+
+def create_training_example():
+    """Create a training example using the formal style configuration"""
+    print("=== Training Example: Formal Style Transfer ===")
+    print()
+    
+    # Check if we have the required files
+    config_path = "configs/styling/formal.yaml"
+    
+    if not Path(config_path).exists():
+        print(f"Configuration file not found: {config_path}")
+        print("   Please run the data processor first to create the configuration")
+        return False
+    
+    print("Found required files!")
+    print(f"   Config: {config_path}")
+    print("   Dataset: Will use output_dir from YAML config")
+    print("   The training pipeline will automatically:")
+    print("   - Load data from the output_dir specified in YAML")
+    print("   - Convert JSONL files to HuggingFace dataset format")
+    print("   - Apply formatting with EOS tokens")
+    print("   - Train the model using SFTTrainer")
+    print()
+    
+    # Run training without explicit dataset path - will use YAML config
+    success = run_training_with_config(
+        config_path=config_path,
+        dataset_path=None,  # Use output_dir from YAML config
+        epochs=1,
+        batch_size=2,
+        learning_rate=2e-4
+    )
+    
+    if success:
+        print("Training example completed!")
+        print("   Model saved to: ./models/styling")
+        print("   Ready for inference!")
+    
+    return success
+
+def main():
+    """Main function"""
+    parser = argparse.ArgumentParser(description="Styling Training Script")
+    
+    # Subcommands
+    parser.add_argument("command", choices=["train", "example", "features"], 
+                       help="Command to run")
+    
+    # Training arguments
+    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
+    parser.add_argument("--dataset", type=str, help="Path to training dataset")
+    parser.add_argument("--output-dir", type=str, help="Output directory for model")
+    parser.add_argument("--epochs", type=int, help="Number of training epochs")
+    parser.add_argument("--batch-size", type=int, help="Training batch size")
+    parser.add_argument("--learning-rate", type=float, help="Learning rate")
+    parser.add_argument("--max-steps", type=int, help="Maximum training steps")
+    
+    args = parser.parse_args()
+    
+    if args.command == "features":
+        show_training_features()
+    
+    elif args.command == "example":
+        create_training_example()
+    
+    elif args.command == "train":
+        if not args.config:
+            print("❌ --config is required for training")
+            print("Usage: python scripts/styling/train.py train --config config.yaml")
+            sys.exit(1)
+        
+        # If dataset is not provided, try to use output_dir from config
+        dataset_path = args.dataset if args.dataset else None
+        
+        success = run_training_with_config(
+            config_path=args.config,
+            dataset_path=dataset_path,
+            output_dir=args.output_dir,
+            epochs=args.epochs,
+            batch_size=args.batch_size,
+            learning_rate=args.learning_rate,
+            max_steps=args.max_steps
+        )
+        
+        if not success:
+            sys.exit(1)
+
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()