instruct model setup

2025-08-28 17:57:59 +00:00
parent 77c563f358
commit d49b4ff2d5
55 changed files with 27760 additions and 326 deletions
@@ -16,7 +16,7 @@ def run_training_with_config(config_path: str, dataset_path: str = None, **cli_o
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
-        print("Training dataset: Will use output_dir from YAML config")
+        print("Training dataset: Will use data_path from YAML config")
    print()
    
    # Build command
@@ -98,28 +98,28 @@ def create_training_example():
    
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
-        print("   Please run the data processor first to create the configuration")
+        print("   Please ensure the configuration file exists")
        return False
    
    print("Found required files!")
    print(f"   Config: {config_path}")
-    print("   Dataset: Will use output_dir from YAML config")
+    print("   Dataset: Will use data_path from YAML config")
    print("   The training pipeline will automatically:")
-    print("   - Load conversation data from the output_dir specified in YAML")
-    print("   - Convert JSONL files to HuggingFace dataset format")
+    print("   - Load conversation data directly from JSONL file")
+    print("   - Convert to HuggingFace dataset format")
    print("   - Apply ShareGPT standardization")
    print("   - Format conversations with chat templates")
-    print("   - Train the model using SFTTrainer with response-only training")
+    print("   - Train the model using SFTTrainer")
    print()
    
    # Run training without explicit dataset path - will use YAML config
    success = run_training_with_config(
        config_path=config_path,
-        dataset_path=None,  # Use output_dir from YAML config
+        dataset_path=None,  # Use data_path from YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
-        max_steps=30
+        max_steps=5  # Minimal steps for quick test
    )
    
    if success:
@@ -138,19 +138,20 @@ def create_quick_test():
    
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
-        print("   Please run the data processor first to create the configuration")
+        print("   Please ensure the configuration file exists")
        return False
    
    print("Running quick test with minimal training steps...")
+    print("This will load data directly from the JSONL file specified in config")
    
    # Run training with very few steps for quick testing
    success = run_training_with_config(
        config_path=config_path,
-        dataset_path=None,
+        dataset_path=None,  # Use data_path from YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
-        max_steps=5  # Very few steps for quick test
+        max_steps=3  # Very few steps for quick test
    )
    
    if success:
@@ -16,7 +16,7 @@ def run_training_with_config(config_path: str, dataset_path: str = None, **cli_o
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
-        print("Training dataset: Will use output_dir from YAML config")
+        print("Training dataset: Will use data_path from YAML config")
    print()
    
    # Build command
@@ -98,28 +98,28 @@ def create_training_example():
    
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
-        print("   Please run the data processor first to create the configuration")
+        print("   Please ensure the configuration file exists")
        return False
    
    print("Found required files!")
    print(f"   Config: {config_path}")
-    print("   Dataset: Will use output_dir from YAML config")
+    print("   Dataset: Will use data_path from YAML config")
    print("   The training pipeline will automatically:")
-    print("   - Load conversation data from the output_dir specified in YAML")
-    print("   - Convert JSONL files to HuggingFace dataset format")
+    print("   - Load conversation data directly from JSONL file")
+    print("   - Convert to HuggingFace dataset format")
    print("   - Apply ShareGPT standardization")
    print("   - Format conversations with chat templates")
-    print("   - Train the model using SFTTrainer with response-only training")
+    print("   - Train the model using SFTTrainer")
    print()
    
    # Run training without explicit dataset path - will use YAML config
    success = run_training_with_config(
        config_path=config_path,
-        dataset_path=None,  # Use output_dir from YAML config
+        dataset_path=None,  # Use data_path from YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
-        max_steps=30
+        max_steps=5  # Minimal steps for quick test
    )
    
    if success:
@@ -138,19 +138,20 @@ def create_quick_test():
    
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
-        print("   Please run the data processor first to create the configuration")
+        print("   Please ensure the configuration file exists")
        return False
    
    print("Running quick test with minimal training steps...")
+    print("This will load data directly from the JSONL file specified in config")
    
    # Run training with very few steps for quick testing
    success = run_training_with_config(
        config_path=config_path,
-        dataset_path=None,
+        dataset_path=None,  # Use data_path from YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
-        max_steps=5  # Very few steps for quick test
+        max_steps=3  # Very few steps for quick test
    )
    
    if success: