updated instruct
This commit is contained in:
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct data processor script that uses YAML configurations.
|
||||
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the instruct data processor pipeline with a YAML configuration.

    The pipeline script ``pipelines/instruct/data_processor.py`` is started as
    a child process; the path is relative, so it is resolved against the
    current working directory.

    Args:
        config_path: Path to the YAML file, forwarded as ``--config``.
        **cli_overrides: Extra pipeline options. Each key is converted to a
            ``--kebab-case`` flag followed by ``str(value)``; entries whose
            value is ``None`` are skipped.

    Returns:
        True when the child process exits with status 0, False when it exits
        with a non-zero status or cannot be started at all.
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" is looked up on PATH and may be a different interpreter
    # than the active environment, or may not exist at all.
    cmd = [
        sys.executable, "pipelines/instruct/data_processor.py",
        "--config", config_path,
    ]

    # Append the CLI overrides, skipping the ones left unset.
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started (e.g. permission error).
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def run_instruct_examples():
    """Run the two demo invocations of the instruct data processor.

    Example 1 always runs the code reasoning configuration with two
    overrides. Example 2 reuses the same configuration for a custom JSONL
    dataset, but only when that dataset file exists; otherwise a hint is
    printed and the example is skipped.
    """
    base_config = "configs/instruct/code_reasoning.yaml"
    custom_dataset = "data/raw/instruct/custom_conversations.jsonl"

    # Example 1: code reasoning instruction tuning (max_samples overrides YAML).
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    first_ok = run_with_yaml_config(
        base_config,
        max_samples=1000,
        output_format="conversation",
    )
    if first_ok:
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: custom conversation dataset, only if the file is present.
    print("\n=== Example 2: Custom Conversation Dataset ===")
    if not os.path.exists(custom_dataset):
        print("⚠️ Custom conversation dataset not found, skipping...")
        # NOTE(review): create_sample_instruct_data in this file writes
        # code_reasoning.jsonl, not custom_conversations.jsonl - confirm the
        # hint below points users at the right command.
        print(" You can create one with the 'create-sample-data' option")
        return

    second_ok = run_with_yaml_config(
        base_config,  # the code_reasoning config serves as the base
        data_source="custom",
        data_path=custom_dataset,
        output_dir="./data/processed/instruct/custom",
    )
    if second_ok:
        print("✅ Custom conversation dataset processing completed!")
|
||||
|
||||
def create_sample_instruct_data():
    """Write a five-conversation sample dataset for testing.

    Creates ``data/raw/instruct`` (relative to the current working directory)
    if needed and writes ``code_reasoning.jsonl`` there, one JSON object per
    line. Each object has a ``conversation`` list holding one user turn and
    one assistant turn. An existing file is overwritten.
    """
    import json

    # (user question, assistant answer) pairs; expanded into records below.
    exchanges = [
        (
            "Can you explain what a Python list comprehension is?",
            "A Python list comprehension is a concise way to create lists. It allows you to generate a new list by applying an expression to each item in an existing iterable, optionally filtering items with a condition. The basic syntax is: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers from 0 to 8.",
        ),
        (
            "How do I handle exceptions in Python?",
            "In Python, you handle exceptions using try-except blocks. Here's the basic structure:\n\ntry:\n # Code that might raise an exception\n risky_code()\nexcept SpecificException as e:\n # Handle specific exception\n print(f'Error: {e}')\nexcept Exception as e:\n # Handle any other exception\n print(f'Unexpected error: {e}')\nelse:\n # Runs if no exception occurred\n print('Success!')\nfinally:\n # Always runs, regardless of exceptions\n cleanup_code()\n\nYou can catch multiple specific exceptions and handle them differently.",
        ),
        (
            "What's the difference between a function and a method in Python?",
            "In Python, the main differences between functions and methods are:\n\n1. **Definition location**: Functions are defined independently, while methods are defined inside classes.\n\n2. **Calling**: Functions are called directly by name, methods are called on objects using dot notation.\n\n3. **First parameter**: Methods automatically receive 'self' as the first parameter (the instance they're called on), functions don't have this automatic parameter.\n\nExample:\n```python\n# Function\ndef my_function(x):\n return x * 2\n\n# Method (inside a class)\nclass MyClass:\n def my_method(self, x):\n return x * 2\n\n# Usage\nresult1 = my_function(5) # Function call\nobj = MyClass()\nresult2 = obj.my_method(5) # Method call\n```",
        ),
        (
            "How can I optimize a slow Python loop?",
            "Here are several strategies to optimize slow Python loops:\n\n1. **Use list comprehensions** instead of explicit loops when possible\n2. **Vectorize with NumPy** for numerical operations\n3. **Use built-in functions** like map(), filter(), sum()\n4. **Avoid repeated lookups** - store frequently accessed values in variables\n5. **Use sets for membership testing** instead of lists\n6. **Consider using itertools** for efficient iteration patterns\n7. **Profile your code** with cProfile to identify bottlenecks\n8. **Use numba or Cython** for compute-intensive loops\n\nExample optimization:\n```python\n# Slow\nresult = []\nfor i in range(1000000):\n if i % 2 == 0:\n result.append(i ** 2)\n\n# Faster\nresult = [i ** 2 for i in range(1000000) if i % 2 == 0]\n\n# Even faster with NumPy\nimport numpy as np\narr = np.arange(1000000)\nresult = (arr[arr % 2 == 0] ** 2).tolist()\n```",
        ),
        (
            "What are Python decorators and how do I use them?",
            "Python decorators are a powerful feature that allows you to modify or extend the behavior of functions or classes without permanently modifying their code. They use the @ symbol and are placed above the function definition.\n\n**Basic decorator example:**\n```python\ndef my_decorator(func):\n def wrapper(*args, **kwargs):\n print('Before function call')\n result = func(*args, **kwargs)\n print('After function call')\n return result\n return wrapper\n\n@my_decorator\ndef greet(name):\n print(f'Hello, {name}!')\n\ngreet('Alice') # Prints before, greeting, and after messages\n```\n\n**Common use cases:**\n- Timing function execution\n- Logging\n- Authentication/authorization\n- Caching results\n- Input validation\n\n**Built-in decorators:**\n- `@property` - creates getter/setter methods\n- `@staticmethod` - methods that don't need self or cls\n- `@classmethod` - methods that receive the class as first argument\n\nDecorators make code more modular and reusable by separating concerns.",
        ),
    ]

    sample_data = [
        {
            "conversation": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        }
        for question, answer in exchanges
    ]

    # Make sure the target directory exists before writing.
    data_dir = Path("data/raw/instruct")
    data_dir.mkdir(parents=True, exist_ok=True)

    # JSONL output: one serialized conversation per line, non-ASCII kept as-is.
    sample_file = data_dir / "code_reasoning.jsonl"
    serialized = "".join(
        json.dumps(record, ensure_ascii=False) + "\n" for record in sample_data
    )
    sample_file.write_text(serialized, encoding="utf-8")

    print(f"✅ Created sample conversation dataset: {sample_file}")
    print(f" Contains {len(sample_data)} conversation examples")
    print(" Format: conversation array with role/content pairs")
    print(" Ready to use with configs/instruct/code_reasoning.yaml")
|
||||
|
||||
def create_custom_instruct_config():
    """Write a starter YAML configuration for general chat instruction tuning.

    Creates ``configs/instruct`` (relative to the current working directory)
    if needed and writes ``general_chat.yaml`` there, overwriting any existing
    file. The file has five sections - ``task``, ``data``, ``model``,
    ``training`` and ``inference`` - with their settings nested beneath them.
    """
    # Settings are indented under their section header: with every key at
    # column 0 the sections would be empty and keys such as "batch_size" or
    # "output_dir" would collide at the top level.
    # learning_rate is written as "2.0e-4" because YAML 1.1 resolvers only
    # recognise exponent floats that contain a decimal point; "2e-4" would be
    # loaded as a string by such parsers.
    custom_config = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2.0e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""

    config_path = "configs/instruct/general_chat.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding keeps the output independent of the platform locale.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom instruct config: {config_path}")
    print(" This config is set up for general chat instruction tuning")
|
||||
|
||||
def handle_direct_args():
    """Parse this script's command line and forward it to the instruct pipeline.

    The options are read from ``sys.argv`` with argparse and replayed to
    ``pipelines/instruct/data_processor.py`` (relative to the current working
    directory) in a child process. Options that were not supplied are omitted;
    ``--log-level`` has a default of ``INFO`` and is therefore always
    forwarded.

    Returns:
        True when the child process exits with status 0, False when it exits
        with a non-zero status or cannot be started at all.
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Options forwarded to the instruct pipeline.
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Use the interpreter running this script; a bare "python" is looked up
    # on PATH and may be a different interpreter or missing.
    cmd = [sys.executable, "pipelines/instruct/data_processor.py"]

    # Replay every option that has a value.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            # Boolean options are bare flags: emitted only when set.
            if arg_value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started.
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def show_instruct_features():
    """Print the feature overview and usage examples for the data processor."""
    # One entry per output line; an empty string yields a blank line.
    overview = (
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        " - Code reasoning and explanation",
        " - General conversation and chat",
        " - Question answering",
        " - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        " - HuggingFace conversation datasets",
        " - Custom JSONL/JSON files with conversation arrays",
        " - ShareGPT format with role/content structure",
        " - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        " - Role validation (user/assistant/system)",
        " - Content length and quality checks",
        " - Conversation structure validation",
        " - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        " - Configurable conversation field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        " - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/instruct/data_processor.py create-config",
    )
    for text in overview:
        print(text)
|
||||
|
||||
def main():
    """Dispatch the command line to a helper subcommand or to the pipeline.

    With no arguments the usage text is printed. When the first argument is
    one of ``examples``, ``create-sample-data``, ``create-config`` or
    ``features`` the matching helper runs. Anything else is treated as direct
    pipeline arguments and handed to ``handle_direct_args``.

    Returns:
        Process exit status: 1 when the arguments were forwarded to the
        pipeline and that run failed, 0 in every other case.
    """
    if len(sys.argv) <= 1:
        usage = (
            "Instruct Data Processor",
            "======================",
            "",
            "This script runs the instruct data processor for instruction fine-tuning tasks.",
            "It supports both YAML configurations and command-line overrides.",
            "",
            "Usage:",
            " python scripts/instruct/data_processor.py examples # Run examples",
            " python scripts/instruct/data_processor.py create-sample-data # Create sample dataset",
            " python scripts/instruct/data_processor.py create-config # Create custom config",
            " python scripts/instruct/data_processor.py features # Show features",
            "",
            "Direct pipeline usage:",
            " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
            " python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl",
            "",
            "Key Features:",
            " ✅ Instruction fine-tuning with conversation data",
            " ✅ Multiple data source support",
            " ✅ YAML configuration files",
            " ✅ CLI argument overrides",
            " ✅ Conversation validation and analysis",
            " ✅ Compatible with Unsloth chat templates",
        )
        for text in usage:
            print(text)
        return 0

    subcommand = sys.argv[1]
    if subcommand == "examples":
        run_instruct_examples()
    elif subcommand == "create-sample-data":
        create_sample_instruct_data()
    elif subcommand == "create-config":
        create_custom_instruct_config()
    elif subcommand == "features":
        show_instruct_features()
    else:
        # Direct arguments are passed through to the pipeline; report a
        # failed run through the exit status so shells and CI can detect it.
        return 0 if handle_direct_args() else 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,291 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct Inference Script
|
||||
Provides a command-line interface to run the instruct inference pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_inference_with_config(config_path: str, message: str = "", max_tokens: int = 128, stream: bool = False, interactive: bool = False):
    """Run the instruct inference pipeline with a YAML configuration.

    ``pipelines/instruct/inference.py`` (relative to the current working
    directory) is started as a child process.

    Args:
        config_path: Path to the YAML file, forwarded as ``--config``.
        message: Prompt forwarded as ``--message``; ignored when empty or
            when ``interactive`` is set.
        max_tokens: Value forwarded as ``--max-tokens``.
        stream: When true, ``--stream`` is added to the command.
        interactive: When true, ``--interactive`` is added and the child
            inherits this process's stdin/stdout/stderr instead of having
            its output captured.

    Returns:
        ``True`` after a successful interactive session, the captured stdout
        text after a successful non-interactive run, or ``None`` when the
        child exits with a non-zero status or cannot be started.
    """
    print(f"Running instruct inference with config: {config_path}")
    if interactive:
        print("Mode: Interactive chat")
    elif message:
        print(f"Message: {message}")
    print(f"Max tokens: {max_tokens}")
    print(f"Streaming: {stream}")

    # Use the interpreter running this script; a bare "python" is looked up
    # on PATH and may be a different interpreter or missing.
    cmd = [
        sys.executable, "pipelines/instruct/inference.py",
        "--config", config_path,
        "--max-tokens", str(max_tokens),
    ]

    if interactive:
        cmd.append("--interactive")
    elif message:
        cmd.extend(["--message", message])

    if stream:
        cmd.append("--stream")

    print(f"Running: {' '.join(cmd)}")

    try:
        if interactive:
            # Output is not captured so the user can talk to the child.
            subprocess.run(cmd, check=True)
            return True

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Inference failed: {e}")
        # stderr is only available when output was captured; in interactive
        # mode the child already wrote its errors straight to the terminal.
        if e.stderr is not None:
            print("Error output:")
            print(e.stderr)
        return None
    except OSError as e:
        # The interpreter itself could not be started.
        print(f"❌ Inference failed: {e}")
        return None

    print("✅ Inference completed successfully!")
    print("Output:")
    print(result.stdout)
    return result.stdout
|
||||
|
||||
def run_conversation_example(config_path: str):
    """Send five fixed example questions through the inference pipeline.

    Each question is run as a separate single-message inference with a
    256-token limit. A failure is reported per question and the loop carries
    on with the next one.

    Args:
        config_path: Path to the YAML configuration used for every run.
    """
    print("=== Conversation Example ===")
    print(f"Config: {config_path}")

    example_messages = [
        "Can you explain what recursion is in programming?",
        "How do I debug a Python program?",
        "What's the difference between a list and a tuple in Python?",
        "Can you show me how to use a for loop?",
        "What are the benefits of using functions in programming?",
    ]

    print("Running example conversations...")
    print()

    for number, message in enumerate(example_messages, start=1):
        print(f"--- Example {number} ---")
        result = run_inference_with_config(config_path, message, max_tokens=256)
        # The helper returns None on failure and the captured stdout on
        # success. Compare against None so that a successful run that
        # printed nothing (empty string) is not reported as a failure.
        if result is None:
            print(f"❌ Failed to process message {number}")
        print()

    print("✅ Conversation examples completed!")
|
||||
|
||||
def show_inference_features():
    """Print the feature overview of the instruct inference pipeline."""
    # One entry per output line; an empty string yields a blank line.
    overview = (
        "=== Instruct Inference Pipeline Features ===",
        "",
        "1. **Model Support**:",
        " - Trained LoRA models from instruct training pipeline",
        " - Automatic model loading from config",
        " - Native Unsloth inference optimization",
        " - Chat template integration",
        "",
        "2. **Inference Modes**:",
        " - Single message inference",
        " - Interactive chat session",
        " - Streaming generation",
        " - Batch conversation processing",
        "",
        "3. **Conversation Features**:",
        " - Multi-turn conversation support",
        " - Context preservation across turns",
        " - Proper role handling (user/assistant/system)",
        " - Chat history management",
        "",
        "4. **Generation Control**:",
        " - Configurable max tokens",
        " - Temperature and sampling parameters",
        " - Streaming output support",
        " - Chat template formatting",
        "",
        "5. **Interactive Features**:",
        " - Real-time chat interface",
        " - Command support (clear, stream toggle)",
        " - Conversation history tracking",
        " - Graceful exit handling",
        "",
        "6. **Usage Examples**:",
        " - Single message: --message 'your question here'",
        " - Interactive chat: --interactive",
        " - Streaming: add --stream flag",
        " - Custom tokens: --max-tokens 256",
    )
    for text in overview:
        print(text)
|
||||
|
||||
def create_inference_example():
    """Run one example inference with the code reasoning configuration.

    Returns:
        False when ``configs/instruct/code_reasoning.yaml`` does not exist or
        the inference run fails, True when the run succeeds.
    """
    print("=== Inference Example: Code Reasoning Chat ===")
    print()

    # The example cannot run without its configuration file.
    config_path = "configs/instruct/code_reasoning.yaml"
    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    print("✅ Found configuration file!")
    print(f" Config: {config_path}")
    print()

    example_message = "Can you explain what a Python decorator is and show me a simple example?"
    print(f"Example message: {example_message}")
    print()

    result = run_inference_with_config(
        config_path=config_path,
        message=example_message,
        max_tokens=256,
    )

    # The helper returns None on failure and the captured stdout on success.
    # Compare against None so that a successful run that printed nothing
    # (empty string) is not reported as a failure.
    if result is None:
        print("❌ Example inference failed!")
        return False

    print("✅ Example inference completed successfully!")
    return True
|
||||
|
||||
def start_interactive_chat(config_path: str, stream: bool = False):
    """Launch an interactive chat session through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration; must exist.
        stream: Passed on to the inference run as its ``stream`` option.

    Returns:
        False when the configuration file is missing; otherwise whatever
        ``run_inference_with_config`` returns for the interactive run.
    """
    print("=== Interactive Chat Session ===")
    print()

    config_file = Path(config_path)
    if not config_file.exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    streaming_state = "enabled" if stream else "disabled"
    print(f"Starting interactive chat with config: {config_path}")
    print(f"Streaming: {streaming_state}")
    print()

    # Hand over to the pipeline in interactive mode.
    return run_inference_with_config(
        config_path=config_path,
        interactive=True,
        stream=stream,
    )
|
||||
|
||||
def create_batch_test(config_path: str = "configs/instruct/code_reasoning.yaml"):
    """Run a fixed batch of four questions through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration used for every question.
            Defaults to the code reasoning configuration, which is what a
            call without arguments has always used.

    Returns:
        False when the configuration file is missing; otherwise True only if
        every question was processed successfully.
    """
    print("=== Batch Test: Multiple Questions ===")
    print()

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    test_questions = [
        "What is object-oriented programming?",
        "How do you handle errors in Python?",
        "Explain the concept of variables in programming.",
        "What's the difference between a compiler and an interpreter?",
    ]

    print("Running batch test with multiple questions...")
    print()

    success_count = 0
    for number, question in enumerate(test_questions, start=1):
        print(f"Question {number}: {question}")
        result = run_inference_with_config(config_path, question, max_tokens=200)
        # None marks a failed run; an empty stdout string is still a success.
        if result is not None:
            success_count += 1
        print("-" * 50)

    print(f"✅ Batch test completed: {success_count}/{len(test_questions)} questions processed successfully")
    return success_count == len(test_questions)


def main():
    """Parse the command line and run the selected inference command.

    Commands: ``infer``, ``chat``, ``batch``, ``conversation``, ``features``
    and ``example``. Without a command the usage text is printed.

    Returns:
        Process exit status: 1 when the selected command reports a failure,
        0 otherwise.
    """
    parser = argparse.ArgumentParser(description="Instruct Inference Pipeline")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Inference command
    infer_parser = subparsers.add_parser("infer", help="Run single inference")
    infer_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    infer_parser.add_argument("--message", type=str, required=True, help="Message to send to the model")
    infer_parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")
    infer_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Interactive command
    interactive_parser = subparsers.add_parser("chat", help="Start interactive chat")
    interactive_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    interactive_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Batch test command
    batch_parser = subparsers.add_parser("batch", help="Run batch test")
    batch_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Conversation example command
    conv_parser = subparsers.add_parser("conversation", help="Run conversation examples")
    conv_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Features command
    subparsers.add_parser("features", help="Show available features")

    # Example command
    subparsers.add_parser("example", help="Run example inference")

    args = parser.parse_args()

    succeeded = True
    if args.command == "infer":
        # None is the failure marker of run_inference_with_config.
        succeeded = run_inference_with_config(
            args.config,
            args.message,
            args.max_tokens,
            args.stream,
        ) is not None
    elif args.command == "chat":
        succeeded = bool(start_interactive_chat(args.config, args.stream))
    elif args.command == "batch":
        # Honour the required --config option instead of a hard-coded path.
        succeeded = create_batch_test(args.config)
    elif args.command == "conversation":
        run_conversation_example(args.config)
    elif args.command == "features":
        show_inference_features()
    elif args.command == "example":
        succeeded = bool(create_inference_example())
    else:
        usage = (
            "Instruct Inference Pipeline",
            "==========================",
            "",
            "Available commands:",
            " infer - Run single message inference",
            " chat - Start interactive chat session",
            " batch - Run batch test with multiple questions",
            " conversation - Run conversation examples",
            " features - Show available features",
            " example - Run example inference",
            "",
            "Examples:",
            " python scripts/instruct/inference.py infer --config configs/instruct/code_reasoning.yaml --message 'Explain Python loops'",
            " python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml",
            " python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml --stream",
            "",
            "Key Features:",
            " ✅ Interactive chat with conversation history",
            " ✅ Streaming generation support",
            " ✅ Multi-turn conversation handling",
            " ✅ Chat template integration",
            " ✅ Configurable generation parameters",
        )
        for text in usage:
            print(text)

    return 0 if succeeded else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct Training Script
|
||||
Provides a command-line interface to run the instruct training pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
    """Run the instruct training pipeline with a YAML configuration.

    ``pipelines/instruct/train.py`` (relative to the current working
    directory) is started as a child process.

    Args:
        config_path: Path to the YAML file, forwarded as ``--config``.
        dataset_path: Optional dataset location forwarded as ``--dataset``;
            omitted from the command when falsy.
        **cli_overrides: Optional overrides. Only ``output_dir``, ``epochs``,
            ``batch_size``, ``learning_rate`` and ``max_steps`` are
            forwarded; other keys and ``None`` values are ignored.

    Returns:
        True when the child process exits with status 0, False when it exits
        with a non-zero status or cannot be started at all.
    """
    print(f"Starting instruct training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use output_dir from YAML config")
    print()

    # Use the interpreter running this script; a bare "python" is looked up
    # on PATH and may be a different interpreter or missing.
    cmd = [sys.executable, "pipelines/instruct/train.py", "--config", config_path]

    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    # Supported overrides and the pipeline flag each one maps to.
    override_flags = {
        "output_dir": "--output-dir",
        "epochs": "--epochs",
        "batch_size": "--batch-size",
        "learning_rate": "--learning-rate",
        "max_steps": "--max-steps",
    }
    for key, value in cli_overrides.items():
        flag = override_flags.get(key)
        if flag is not None and value is not None:
            cmd.extend([flag, str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    # NOTE(review): output is captured, so nothing the child prints is shown
    # until it has finished - confirm that is acceptable for long runs.
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be started.
        print(f"Training failed: {e}")
        return False

    print("Training completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def show_training_features():
    """Print the feature overview of the instruct training pipeline."""
    # One entry per output line; an empty string yields a blank line.
    overview = (
        "=== Instruct Training Pipeline Features ===",
        "",
        "1. **Model Support**:",
        " - Unsloth optimized models (4x faster)",
        " - LoRA fine-tuning for efficiency",
        " - Support for Qwen2.5, Llama-3.1, Mistral, Phi-3",
        " - Chat template integration",
        "",
        "2. **Training Features**:",
        " - SFTTrainer with conversation data",
        " - Response-only training (train only on assistant responses)",
        " - ShareGPT format standardization",
        " - Automatic mixed precision (FP16/BF16)",
        " - Gradient checkpointing for memory efficiency",
        " - Configurable LoRA parameters",
        "",
        "3. **Conversation Handling**:",
        " - Multi-turn conversation support",
        " - Proper chat template formatting",
        " - Role-based training (user/assistant/system)",
        " - Context preservation across turns",
        "",
        "4. **Configuration**:",
        " - YAML configuration files",
        " - CLI argument overrides",
        " - Automatic device detection",
        " - Flexible LoRA configuration",
        "",
        "5. **Output**:",
        " - Saved LoRA models",
        " - Training logs and checkpoints",
        " - Ready for conversational inference",
    )
    for text in overview:
        print(text)
|
||||
|
||||
def create_training_example():
    """Run a short example training with the code reasoning configuration.

    Returns:
        False when ``configs/instruct/code_reasoning.yaml`` does not exist;
        otherwise the result of ``run_training_with_config``.
    """
    print("=== Training Example: Code Reasoning Instruction Tuning ===")
    print()

    # The example needs the configuration file to be in place.
    config_path = "configs/instruct/code_reasoning.yaml"
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    briefing = (
        "Found required files!",
        f" Config: {config_path}",
        " Dataset: Will use output_dir from YAML config",
        " The training pipeline will automatically:",
        " - Load conversation data from the output_dir specified in YAML",
        " - Convert JSONL files to HuggingFace dataset format",
        " - Apply ShareGPT standardization",
        " - Format conversations with chat templates",
        " - Train the model using SFTTrainer with response-only training",
        "",
    )
    for text in briefing:
        print(text)

    # No explicit dataset path: the YAML config's output_dir is used instead.
    outcome = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
        max_steps=30,
    )

    if outcome:
        for text in (
            "Training example completed!",
            " Model saved to: ./models/instruct",
            " Ready for conversational inference!",
        ):
            print(text)

    return outcome
|
||||
|
||||
def create_quick_test():
    """Smoke-test the training pipeline with only a few optimisation steps.

    Returns:
        False when the configuration file is missing; otherwise whatever
        ``run_training_with_config`` returns for the shortened run.
    """
    print("=== Quick Test: Minimal Training Steps ===")
    print()

    config_file = "configs/instruct/code_reasoning.yaml"

    if not Path(config_file).exists():
        print(f"Configuration file not found: {config_file}")
        print(" Please run the data processor first to create the configuration")
        return False

    print("Running quick test with minimal training steps...")

    # Five steps are enough to exercise the pipeline end to end.
    smoke_overrides = {
        "epochs": 1,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "max_steps": 5,
    }
    finished = run_training_with_config(
        config_path=config_file,
        dataset_path=None,
        **smoke_overrides,
    )

    if finished:
        for line in (
            "Quick test completed!",
            " Model saved with minimal training",
            " This is just for testing the pipeline",
        ):
            print(line)

    return finished
|
||||
|
||||
def main():
    """Parse the command line and dispatch to the requested training command.

    Commands:
        train       Run training with ``--config`` (required) and optional overrides.
        example     Run the demonstration training job.
        quick-test  Run a minimal-step smoke test.
        features    Print the feature overview.

    Exits with status 1 when the selected command reports failure, so shell
    scripts and CI jobs can detect a failed run for every command, not only
    for ``train``.
    """
    parser = argparse.ArgumentParser(description="Instruct Training Script")

    # Subcommands
    parser.add_argument("command", choices=["train", "example", "features", "quick-test"],
                        help="Command to run")

    # Training arguments
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args()

    if args.command == "features":
        show_training_features()
        return

    if args.command == "example":
        success = create_training_example()
    elif args.command == "quick-test":
        success = create_quick_test()
    else:
        # argparse ``choices`` guarantees the only remaining command is "train".
        if not args.config:
            print("❌ --config is required for training")
            print("Usage: python scripts/instruct/train.py train --config config.yaml")
            sys.exit(1)

        success = run_training_with_config(
            config_path=args.config,
            # An empty --dataset value means "use output_dir from the config".
            dataset_path=args.dataset or None,
            output_dir=args.output_dir,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            max_steps=args.max_steps,
        )

    if not success:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,320 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct data processor script that uses YAML configurations.
|
||||
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the instruct data processor pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration passed as ``--config``.
        **cli_overrides: Extra options forwarded to the pipeline. Each key is
            turned into a ``--dashed-flag`` followed by ``str(value)``;
            entries whose value is ``None`` are skipped.

    Returns:
        True when the pipeline exits with status 0, False otherwise.
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # Launch the pipeline with the interpreter running this script, so the
    # child uses the same virtual environment and does not depend on a
    # "python" executable being present on PATH.
    cmd = [
        sys.executable, "pipelines/instruct/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        # The pipeline may log to stdout; show it so the failure is diagnosable.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def run_instruct_examples():
    """Run the bundled data-processing examples with YAML configs.

    Example 1 processes the code reasoning configuration with two CLI
    overrides. Example 2 processes a custom conversation file and is skipped
    when that file does not exist.
    """
    base_config = "configs/instruct/code_reasoning.yaml"
    custom_data = "data/raw/instruct/custom_conversations.jsonl"

    # Example 1: Code reasoning instruction tuning
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    success = run_with_yaml_config(
        base_config,
        max_samples=1000,  # Override YAML value
        output_format="conversation",
    )

    if success:
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: Custom conversation dataset (if available)
    print("\n=== Example 2: Custom Conversation Dataset ===")
    if os.path.exists(custom_data):
        success = run_with_yaml_config(
            base_config,  # Use code_reasoning config as base
            data_source="custom",
            data_path=custom_data,
            output_dir="./data/processed/instruct/custom",
        )
        if success:
            print("✅ Custom conversation dataset processing completed!")
    else:
        print("⚠️ Custom conversation dataset not found, skipping...")
        # The 'create-sample-data' command writes a different file
        # (code_reasoning.jsonl), so point the user at the path that this
        # example actually checks.
        print(f" Place a JSONL conversation file at {custom_data} to enable this example")
|
||||
|
||||
def create_sample_instruct_data():
    """Write a small sample conversation dataset for testing.

    Creates ``data/raw/instruct`` if needed and writes five user/assistant
    conversations to ``code_reasoning.jsonl`` in that directory, one JSON
    object per line.
    """
    import json

    conversations = [
        {
            "conversation": [
                {"role": "user", "content": "Can you explain what a Python list comprehension is?"},
                {"role": "assistant", "content": "A Python list comprehension is a concise way to create lists. It allows you to generate a new list by applying an expression to each item in an existing iterable, optionally filtering items with a condition. The basic syntax is: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers from 0 to 8."}
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "How do I handle exceptions in Python?"},
                {"role": "assistant", "content": "In Python, you handle exceptions using try-except blocks. Here's the basic structure:\n\ntry:\n # Code that might raise an exception\n risky_code()\nexcept SpecificException as e:\n # Handle specific exception\n print(f'Error: {e}')\nexcept Exception as e:\n # Handle any other exception\n print(f'Unexpected error: {e}')\nelse:\n # Runs if no exception occurred\n print('Success!')\nfinally:\n # Always runs, regardless of exceptions\n cleanup_code()\n\nYou can catch multiple specific exceptions and handle them differently."}
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "What's the difference between a function and a method in Python?"},
                {"role": "assistant", "content": "In Python, the main differences between functions and methods are:\n\n1. **Definition location**: Functions are defined independently, while methods are defined inside classes.\n\n2. **Calling**: Functions are called directly by name, methods are called on objects using dot notation.\n\n3. **First parameter**: Methods automatically receive 'self' as the first parameter (the instance they're called on), functions don't have this automatic parameter.\n\nExample:\n```python\n# Function\ndef my_function(x):\n return x * 2\n\n# Method (inside a class)\nclass MyClass:\n def my_method(self, x):\n return x * 2\n\n# Usage\nresult1 = my_function(5) # Function call\nobj = MyClass()\nresult2 = obj.my_method(5) # Method call\n```"}
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "How can I optimize a slow Python loop?"},
                {"role": "assistant", "content": "Here are several strategies to optimize slow Python loops:\n\n1. **Use list comprehensions** instead of explicit loops when possible\n2. **Vectorize with NumPy** for numerical operations\n3. **Use built-in functions** like map(), filter(), sum()\n4. **Avoid repeated lookups** - store frequently accessed values in variables\n5. **Use sets for membership testing** instead of lists\n6. **Consider using itertools** for efficient iteration patterns\n7. **Profile your code** with cProfile to identify bottlenecks\n8. **Use numba or Cython** for compute-intensive loops\n\nExample optimization:\n```python\n# Slow\nresult = []\nfor i in range(1000000):\n if i % 2 == 0:\n result.append(i ** 2)\n\n# Faster\nresult = [i ** 2 for i in range(1000000) if i % 2 == 0]\n\n# Even faster with NumPy\nimport numpy as np\narr = np.arange(1000000)\nresult = (arr[arr % 2 == 0] ** 2).tolist()\n```"}
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "What are Python decorators and how do I use them?"},
                {"role": "assistant", "content": "Python decorators are a powerful feature that allows you to modify or extend the behavior of functions or classes without permanently modifying their code. They use the @ symbol and are placed above the function definition.\n\n**Basic decorator example:**\n```python\ndef my_decorator(func):\n def wrapper(*args, **kwargs):\n print('Before function call')\n result = func(*args, **kwargs)\n print('After function call')\n return result\n return wrapper\n\n@my_decorator\ndef greet(name):\n print(f'Hello, {name}!')\n\ngreet('Alice') # Prints before, greeting, and after messages\n```\n\n**Common use cases:**\n- Timing function execution\n- Logging\n- Authentication/authorization\n- Caching results\n- Input validation\n\n**Built-in decorators:**\n- `@property` - creates getter/setter methods\n- `@staticmethod` - methods that don't need self or cls\n- `@classmethod` - methods that receive the class as first argument\n\nDecorators make code more modular and reusable by separating concerns."}
            ]
        },
    ]

    # Make sure the target directory exists before writing.
    target_dir = Path("data/raw/instruct")
    target_dir.mkdir(parents=True, exist_ok=True)

    # One JSON document per line (JSONL); keep non-ASCII characters readable.
    target_file = target_dir / "code_reasoning.jsonl"
    with open(target_file, 'w', encoding='utf-8') as handle:
        for record in conversations:
            handle.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"✅ Created sample conversation dataset: {target_file}")
    print(f" Contains {len(conversations)} conversation examples")
    print(" Format: conversation array with role/content pairs")
    print(" Ready to use with configs/instruct/code_reasoning.yaml")
|
||||
|
||||
def create_custom_instruct_config():
    """Write a starter YAML config for general-chat instruction tuning.

    The file is written to ``configs/instruct/general_chat.yaml``; the parent
    directory is created when it does not exist.
    """
    yaml_text = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""

    target = Path("configs/instruct/general_chat.yaml")
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, 'w') as handle:
        handle.write(yaml_text)

    print(f"✅ Created custom instruct config: {target.as_posix()}")
    print(" This config is set up for general chat instruction tuning")
|
||||
|
||||
def handle_direct_args():
    """Forward command-line options from ``sys.argv`` to the instruct pipeline.

    Every option that was supplied (plus ``--log-level``, which always has a
    default) is passed on to ``pipelines/instruct/data_processor.py``.

    Returns:
        True when the pipeline exits with status 0, False otherwise.
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Add all the same arguments as the instruct pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Use the interpreter running this script so the pipeline runs in the same
    # environment and does not depend on a "python" executable on PATH.
    cmd = [sys.executable, "pipelines/instruct/data_processor.py"]

    # Add all arguments that were provided
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Only add flag if True
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        # The pipeline may log to stdout; show it so the failure is diagnosable.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def show_instruct_features():
    """Print the feature overview and usage examples for the data processor."""
    overview = (
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        " - Code reasoning and explanation",
        " - General conversation and chat",
        " - Question answering",
        " - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        " - HuggingFace conversation datasets",
        " - Custom JSONL/JSON files with conversation arrays",
        " - ShareGPT format with role/content structure",
        " - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        " - Role validation (user/assistant/system)",
        " - Content length and quality checks",
        " - Conversation structure validation",
        " - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        " - Configurable conversation field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        " - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/instruct/data_processor.py create-config",
    )
    # One print per entry; an empty entry yields a blank line.
    for line in overview:
        print(line)
|
||||
|
||||
def main():
    """Entry point: run a subcommand, forward options, or print usage.

    With no arguments the usage text is printed. A first argument of
    ``examples``, ``create-sample-data``, ``create-config`` or ``features``
    runs that subcommand. Anything else is treated as pipeline options and
    forwarded by ``handle_direct_args``; the process exits with status 1 when
    that forwarded run fails, so callers can detect the failure.
    """
    if len(sys.argv) <= 1:
        usage = (
            "Instruct Data Processor",
            "======================",
            "",
            "This script runs the instruct data processor for instruction fine-tuning tasks.",
            "It supports both YAML configurations and command-line overrides.",
            "",
            "Usage:",
            " python scripts/instruct/data_processor.py examples # Run examples",
            " python scripts/instruct/data_processor.py create-sample-data # Create sample dataset",
            " python scripts/instruct/data_processor.py create-config # Create custom config",
            " python scripts/instruct/data_processor.py features # Show features",
            "",
            "Direct pipeline usage:",
            " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
            " python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl",
            "",
            "Key Features:",
            " ✅ Instruction fine-tuning with conversation data",
            " ✅ Multiple data source support",
            " ✅ YAML configuration files",
            " ✅ CLI argument overrides",
            " ✅ Conversation validation and analysis",
            " ✅ Compatible with Unsloth chat templates",
        )
        for line in usage:
            print(line)
        return

    command = sys.argv[1]
    if command == "examples":
        run_instruct_examples()
    elif command == "create-sample-data":
        create_sample_instruct_data()
    elif command == "create-config":
        create_custom_instruct_config()
    elif command == "features":
        show_instruct_features()
    else:
        # Handle direct arguments (pass through to pipeline) and report a
        # failed pipeline run through the exit status.
        if not handle_direct_args():
            sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,291 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct Inference Script
|
||||
Provides a command-line interface to run the instruct inference pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_inference_with_config(config_path: str, message: str = "", max_tokens: int = 128, stream: bool = False, interactive: bool = False):
    """Run the instruct inference pipeline using a YAML configuration file.

    Args:
        config_path: Path to the YAML configuration passed as ``--config``.
        message: Single message to send; ignored when ``interactive`` is True.
        max_tokens: Value passed as ``--max-tokens``.
        stream: When True, ``--stream`` is added to the command.
        interactive: When True, ``--interactive`` is added and the child's
            output is not captured so the chat can use the terminal.

    Returns:
        True after a successful interactive session, the captured standard
        output (a string) after a successful non-interactive run, or None when
        the pipeline exits with a non-zero status.
    """
    print(f"Running instruct inference with config: {config_path}")
    if interactive:
        print("Mode: Interactive chat")
    elif message:
        print(f"Message: {message}")
    print(f"Max tokens: {max_tokens}")
    print(f"Streaming: {stream}")

    # Use the interpreter running this script so the pipeline runs in the same
    # environment and does not depend on a "python" executable on PATH.
    cmd = [
        sys.executable, "pipelines/instruct/inference.py",
        "--config", config_path,
        "--max-tokens", str(max_tokens),
    ]

    if interactive:
        cmd.append("--interactive")
    elif message:
        cmd.extend(["--message", message])

    if stream:
        cmd.append("--stream")

    print(f"Running: {' '.join(cmd)}")

    try:
        if interactive:
            # For interactive mode, don't capture output
            subprocess.run(cmd, check=True)
            return True

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Inference failed: {e}")
        # stderr is only available when output was captured; in interactive
        # mode the child already wrote its errors to the terminal.
        if e.stderr is not None:
            print("Error output:")
            print(e.stderr)
        return None

    print("✅ Inference completed successfully!")
    print("Output:")
    print(result.stdout)
    return result.stdout
|
||||
|
||||
def run_conversation_example(config_path: str):
    """Send a fixed list of example questions through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration used for every question.
    """
    print("=== Conversation Example ===")
    print(f"Config: {config_path}")

    prompts = (
        "Can you explain what recursion is in programming?",
        "How do I debug a Python program?",
        "What's the difference between a list and a tuple in Python?",
        "Can you show me how to use a for loop?",
        "What are the benefits of using functions in programming?",
    )

    print("Running example conversations...")
    print()

    # Number the examples from 1 for display.
    for number, prompt in enumerate(prompts, start=1):
        print(f"--- Example {number} ---")
        reply = run_inference_with_config(config_path, prompt, max_tokens=256)
        if not reply:
            print(f"❌ Failed to process message {number}")
        print()

    print("✅ Conversation examples completed!")
|
||||
|
||||
def show_inference_features():
    """Print the feature overview of the instruct inference pipeline."""
    sections = (
        ("1. **Model Support**:", (
            "Trained LoRA models from instruct training pipeline",
            "Automatic model loading from config",
            "Native Unsloth inference optimization",
            "Chat template integration",
        )),
        ("2. **Inference Modes**:", (
            "Single message inference",
            "Interactive chat session",
            "Streaming generation",
            "Batch conversation processing",
        )),
        ("3. **Conversation Features**:", (
            "Multi-turn conversation support",
            "Context preservation across turns",
            "Proper role handling (user/assistant/system)",
            "Chat history management",
        )),
        ("4. **Generation Control**:", (
            "Configurable max tokens",
            "Temperature and sampling parameters",
            "Streaming output support",
            "Chat template formatting",
        )),
        ("5. **Interactive Features**:", (
            "Real-time chat interface",
            "Command support (clear, stream toggle)",
            "Conversation history tracking",
            "Graceful exit handling",
        )),
        ("6. **Usage Examples**:", (
            "Single message: --message 'your question here'",
            "Interactive chat: --interactive",
            "Streaming: add --stream flag",
            "Custom tokens: --max-tokens 256",
        )),
    )

    print("=== Instruct Inference Pipeline Features ===")
    for heading, bullets in sections:
        # A blank line separates the title and each section from the next.
        print()
        print(heading)
        for bullet in bullets:
            print(f" - {bullet}")
|
||||
|
||||
def create_inference_example():
    """Run one example question through the code reasoning configuration.

    Returns:
        True when ``run_inference_with_config`` returns a truthy result,
        False when the configuration file is missing or the result is falsy.
    """
    print("=== Inference Example: Code Reasoning Chat ===")
    print()

    config_file = "configs/instruct/code_reasoning.yaml"

    # Guard clause: inference needs the YAML configuration.
    if not Path(config_file).exists():
        print(f"❌ Configuration file not found: {config_file}")
        print(" Please run the data processor and training first")
        return False

    print("✅ Found configuration file!")
    print(f" Config: {config_file}")
    print()

    question = "Can you explain what a Python decorator is and show me a simple example?"
    print(f"Example message: {question}")
    print()

    reply = run_inference_with_config(
        config_path=config_file,
        message=question,
        max_tokens=256,
    )

    if not reply:
        print("❌ Example inference failed!")
        return False

    print("✅ Example inference completed successfully!")
    return True
|
||||
|
||||
def start_interactive_chat(config_path: str, stream: bool = False):
    """Start an interactive chat session through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration file.
        stream: Whether to request streaming generation.

    Returns:
        False when the configuration file is missing; otherwise the result of
        ``run_inference_with_config`` in interactive mode.
    """
    print("=== Interactive Chat Session ===")
    print()

    config_missing = not Path(config_path).exists()
    if config_missing:
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    streaming_state = "enabled" if stream else "disabled"
    print(f"Starting interactive chat with config: {config_path}")
    print("Streaming:", streaming_state)
    print()

    # Hand control to the pipeline; its output goes straight to the terminal.
    return run_inference_with_config(
        config_path=config_path,
        interactive=True,
        stream=stream,
    )
|
||||
|
||||
def create_batch_test(config_path: str = "configs/instruct/code_reasoning.yaml"):
    """Run a fixed batch of test questions through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration file. Defaults to the code
            reasoning configuration, which keeps calls without arguments
            working as before.

    Returns:
        True when every question produced a truthy result, False when the
        configuration file is missing or at least one question failed.
    """
    print("=== Batch Test: Multiple Questions ===")
    print()

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    # Create a batch of test questions
    test_questions = [
        "What is object-oriented programming?",
        "How do you handle errors in Python?",
        "Explain the concept of variables in programming.",
        "What's the difference between a compiler and an interpreter?",
    ]

    print("Running batch test with multiple questions...")
    print()

    success_count = 0
    for number, question in enumerate(test_questions, start=1):
        print(f"Question {number}: {question}")
        result = run_inference_with_config(config_path, question, max_tokens=200)
        if result:
            success_count += 1
        print("-" * 50)

    print(f"✅ Batch test completed: {success_count}/{len(test_questions)} questions processed successfully")
    return success_count == len(test_questions)


def main():
    """Parse the command line and dispatch to the requested inference command.

    Without a command the usage text is printed. The ``batch`` command passes
    its ``--config`` value on to ``create_batch_test`` instead of ignoring
    it. The process exits with status 1 when the selected command reports
    failure.
    """
    parser = argparse.ArgumentParser(description="Instruct Inference Pipeline")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Inference command
    infer_parser = subparsers.add_parser("infer", help="Run single inference")
    infer_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    infer_parser.add_argument("--message", type=str, required=True, help="Message to send to the model")
    infer_parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")
    infer_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Interactive command
    interactive_parser = subparsers.add_parser("chat", help="Start interactive chat")
    interactive_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    interactive_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Batch test command
    batch_parser = subparsers.add_parser("batch", help="Run batch test")
    batch_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Conversation example command
    conv_parser = subparsers.add_parser("conversation", help="Run conversation examples")
    conv_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Features command
    subparsers.add_parser("features", help="Show available features")

    # Example command
    subparsers.add_parser("example", help="Run example inference")

    args = parser.parse_args()

    succeeded = True
    if args.command == "infer":
        # None signals a failed run; an empty string is still a successful one.
        succeeded = run_inference_with_config(
            args.config,
            args.message,
            args.max_tokens,
            args.stream,
        ) is not None
    elif args.command == "chat":
        succeeded = bool(start_interactive_chat(args.config, args.stream))
    elif args.command == "batch":
        succeeded = create_batch_test(args.config)
    elif args.command == "conversation":
        run_conversation_example(args.config)
    elif args.command == "features":
        show_inference_features()
    elif args.command == "example":
        succeeded = create_inference_example()
    else:
        usage = (
            "Instruct Inference Pipeline",
            "==========================",
            "",
            "Available commands:",
            " infer - Run single message inference",
            " chat - Start interactive chat session",
            " batch - Run batch test with multiple questions",
            " conversation - Run conversation examples",
            " features - Show available features",
            " example - Run example inference",
            "",
            "Examples:",
            " python scripts/instruct/inference.py infer --config configs/instruct/code_reasoning.yaml --message 'Explain Python loops'",
            " python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml",
            " python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml --stream",
            "",
            "Key Features:",
            " ✅ Interactive chat with conversation history",
            " ✅ Streaming generation support",
            " ✅ Multi-turn conversation handling",
            " ✅ Chat template integration",
            " ✅ Configurable generation parameters",
        )
        for line in usage:
            print(line)

    if not succeeded:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,214 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Instruct Training Script
|
||||
Provides a command-line interface to run the instruct training pipeline
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
    """Run the instruct training pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration passed as ``--config``.
        dataset_path: Optional dataset path passed as ``--dataset``; when it
            is empty or None the flag is omitted.
        **cli_overrides: Optional overrides. Recognised keys are
            ``output_dir``, ``epochs``, ``batch_size``, ``learning_rate`` and
            ``max_steps``; other keys and ``None`` values are ignored.

    Returns:
        True when the pipeline exits with status 0, False otherwise.
    """
    print(f"Starting instruct training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use output_dir from YAML config")
    print()

    # Use the interpreter running this script so the pipeline runs in the same
    # environment and does not depend on a "python" executable on PATH.
    cmd = [sys.executable, "pipelines/instruct/train.py", "--config", config_path]

    # Add dataset path if provided
    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    # Overrides the pipeline is given, mapped to their command-line flags.
    override_flags = {
        "output_dir": "--output-dir",
        "epochs": "--epochs",
        "batch_size": "--batch-size",
        "learning_rate": "--learning-rate",
        "max_steps": "--max-steps",
    }
    for key, value in cli_overrides.items():
        flag = override_flags.get(key)
        if flag is not None and value is not None:
            cmd.extend([flag, str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed: {e}")
        # The pipeline may log to stdout; show it so the failure is diagnosable.
        if e.stdout:
            print(f"Standard output: {e.stdout}")
        print(f"Error output: {e.stderr}")
        return False

    print("Training completed successfully!")
    print(result.stdout)
    return True
|
||||
|
||||
def show_training_features():
    """Print a fixed, human-readable overview of the training pipeline features.

    The overview is a banner followed by five numbered sections; each section
    is preceded by a blank line and lists its bullet items one per line.
    """
    sections = (
        ("1. **Model Support**:", (
            " - Unsloth optimized models (4x faster)",
            " - LoRA fine-tuning for efficiency",
            " - Support for Qwen2.5, Llama-3.1, Mistral, Phi-3",
            " - Chat template integration",
        )),
        ("2. **Training Features**:", (
            " - SFTTrainer with conversation data",
            " - Response-only training (train only on assistant responses)",
            " - ShareGPT format standardization",
            " - Automatic mixed precision (FP16/BF16)",
            " - Gradient checkpointing for memory efficiency",
            " - Configurable LoRA parameters",
        )),
        ("3. **Conversation Handling**:", (
            " - Multi-turn conversation support",
            " - Proper chat template formatting",
            " - Role-based training (user/assistant/system)",
            " - Context preservation across turns",
        )),
        ("4. **Configuration**:", (
            " - YAML configuration files",
            " - CLI argument overrides",
            " - Automatic device detection",
            " - Flexible LoRA configuration",
        )),
        ("5. **Output**:", (
            " - Saved LoRA models",
            " - Training logs and checkpoints",
            " - Ready for conversational inference",
        )),
    )

    print("=== Instruct Training Pipeline Features ===")
    for heading, bullets in sections:
        # A blank line separates the banner and every section from the next.
        print()
        print(heading)
        for bullet in bullets:
            print(bullet)
|
||||
|
||||
def create_training_example():
    """Run a short example training job with the code reasoning config.

    Returns False without starting training when
    ``configs/instruct/code_reasoning.yaml`` does not exist; otherwise returns
    the result of ``run_training_with_config``.
    """
    print("=== Training Example: Code Reasoning Instruction Tuning ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"

    # Guard: nothing can be trained without the configuration file.
    config_missing = not Path(config_path).exists()
    if config_missing:
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    print("Found required files!")
    print(f" Config: {config_path}")
    for line in (
        " Dataset: Will use output_dir from YAML config",
        " The training pipeline will automatically:",
        " - Load conversation data from the output_dir specified in YAML",
        " - Convert JSONL files to HuggingFace dataset format",
        " - Apply ShareGPT standardization",
        " - Format conversations with chat templates",
        " - Train the model using SFTTrainer with response-only training",
    ):
        print(line)
    print()

    # No explicit dataset path is passed: the pipeline falls back to the
    # output_dir named in the YAML config.
    example_overrides = {
        "epochs": 1,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "max_steps": 30,
    }
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        **example_overrides,
    )

    if success:
        print("Training example completed!")
        print(" Model saved to: ./models/instruct")
        print(" Ready for conversational inference!")

    return success
|
||||
|
||||
def create_quick_test():
    """Run a minimal training job (5 steps) to smoke-test the pipeline.

    Returns False without starting training when
    ``configs/instruct/code_reasoning.yaml`` does not exist; otherwise returns
    the result of ``run_training_with_config``.
    """
    print("=== Quick Test: Minimal Training Steps ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"

    # Guard: bail out early when the configuration file is absent.
    config_missing = not Path(config_path).exists()
    if config_missing:
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    print("Running quick test with minimal training steps...")

    # Very few steps: the goal is to exercise the pipeline, not to train.
    quick_overrides = {
        "epochs": 1,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "max_steps": 5,
    }
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        **quick_overrides,
    )

    if success:
        for line in (
            "Quick test completed!",
            " Model saved with minimal training",
            " This is just for testing the pipeline",
        ):
            print(line)

    return success
|
||||
|
||||
def main():
    """Parse command-line arguments and dispatch to the selected command.

    Commands:
        features:   print the pipeline feature overview.
        example:    run the code reasoning training example.
        quick-test: run a minimal training job for pipeline testing.
        train:      run training with ``--config`` (required) and optional
                    ``--dataset`` / hyper-parameter overrides.

    Exits with status 1 when ``train`` is invoked without ``--config`` or when
    any training command (``train``, ``example``, ``quick-test``) reports
    failure, so shell callers can detect the failure.
    """
    parser = argparse.ArgumentParser(description="Instruct Training Script")

    # Subcommands
    parser.add_argument("command", choices=["train", "example", "features", "quick-test"],
                        help="Command to run")

    # Training arguments
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args()

    if args.command == "features":
        # Informational only; there is no success/failure result to report.
        show_training_features()
        return

    if args.command == "example":
        success = create_training_example()
    elif args.command == "quick-test":
        success = create_quick_test()
    else:  # "train" is the only remaining choice accepted by argparse
        if not args.config:
            print("❌ --config is required for training")
            print("Usage: python scripts/instruct/train.py train --config config.yaml")
            sys.exit(1)

        success = run_training_with_config(
            config_path=args.config,
            # An omitted or empty --dataset means "use output_dir from the config".
            dataset_path=args.dataset or None,
            output_dir=args.output_dir,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            max_steps=args.max_steps,
        )

    # Report failure through the exit status for every training command, not
    # only for "train".
    if not success:
        sys.exit(1)
|
||||
|
||||
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user