updated instruct
This commit is contained in:
@@ -0,0 +1,78 @@
|
|||||||
|
# Comprehensive Instruct Configuration
|
||||||
|
# This file defines all parameters for instruction fine-tuning using conversational data
|
||||||
|
# Organized by level: task, data processing, model, training, and inference
|
||||||
|
|
||||||
|
# Task Configuration
|
||||||
|
task:
|
||||||
|
name: "code_reasoning" # Task name: instruct, code_reasoning, general_chat
|
||||||
|
type: "instruction_following" # Model type: instruction_following, conversational
|
||||||
|
|
||||||
|
# Data Processing Configuration
|
||||||
|
data:
|
||||||
|
source: "custom" # Data source: "huggingface" or "custom"
|
||||||
|
data_path: "./data/raw/instruct/code_reasoning.jsonl" # Path to conversation data file
|
||||||
|
data_format: "jsonl" # Data format: "jsonl", "json"
|
||||||
|
|
||||||
|
# Field Mapping for Conversation Data
|
||||||
|
conversation_field: "conversation" # Field name containing conversation array
|
||||||
|
|
||||||
|
# Data Format & Processing
|
||||||
|
max_length: 2048 # Maximum length in characters of a single conversation turn (longer turns are dropped by the preprocessor, not truncated)
|
||||||
|
min_length: 10 # Minimum text length (filter out shorter texts)
|
||||||
|
|
||||||
|
# Text Preprocessing
|
||||||
|
clean_text: true # Clean and normalize text
|
||||||
|
|
||||||
|
# Data Splitting
|
||||||
|
train_split: 0.8 # Training split ratio (0.0 to 1.0)
|
||||||
|
validation_split: 0.1 # Validation split ratio (0.0 to 1.0)
|
||||||
|
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_format: "conversation" # Output format: "conversation" (chat format)
|
||||||
|
output_dir: "./data/processed/instruct/code_reasoning" # Output directory for processed data
|
||||||
|
|
||||||
|
# Model Configuration
|
||||||
|
model:
|
||||||
|
name: "unsloth/Qwen2.5-72B-Instruct" # Model name from HuggingFace Hub (optimized for instruction following)
|
||||||
|
max_length: 2048 # Maximum sequence length for tokenization
|
||||||
|
max_seq_length: 2048 # Maximum sequence length for training (RoPE scaling supported)
|
||||||
|
dtype: null # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
|
||||||
|
load_in_4bit: true # Use 4bit quantization to reduce memory usage
|
||||||
|
token: null # HuggingFace token for gated models (e.g., "hf_...")
|
||||||
|
|
||||||
|
# Training Model Parameters
|
||||||
|
training_model: "unsloth/Qwen2.5-72B-Instruct" # Model to use for training
|
||||||
|
training_max_seq_length: 2048 # Max sequence length for training
|
||||||
|
training_dtype: null # Data type for training
|
||||||
|
training_load_in_4bit: true # 4bit quantization for training
|
||||||
|
|
||||||
|
# Training Configuration
|
||||||
|
training:
|
||||||
|
num_epochs: 1 # Number of training epochs (1 epoch is often sufficient for instruction tuning)
|
||||||
|
batch_size: 1 # Training batch size (small for large models)
|
||||||
|
learning_rate: 2e-4 # Learning rate (typical for instruction tuning)
|
||||||
|
weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting)
|
||||||
|
warmup_steps: 5 # Warmup steps (fixed value)
|
||||||
|
max_steps: 30 # Maximum training steps (adjust based on dataset size)
|
||||||
|
gradient_accumulation_steps: 4 # Gradient accumulation steps
|
||||||
|
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||||
|
seed: 3407 # Random seed for reproducibility
|
||||||
|
|
||||||
|
# LoRA Configuration
|
||||||
|
lora_r: 32 # LoRA rank (higher = more parameters)
|
||||||
|
lora_alpha: 16 # LoRA alpha (scaling factor)
|
||||||
|
lora_dropout: 0 # LoRA dropout (0 is optimized)
|
||||||
|
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_dir: "./outputs" # Directory for training checkpoints
|
||||||
|
model_output_dir: "./models/instruct" # Directory to save the trained model
|
||||||
|
|
||||||
|
# Inference Configuration
|
||||||
|
inference:
|
||||||
|
batch_size: 1 # Batch size for inference
|
||||||
|
max_new_tokens: 128 # Maximum new tokens to generate during inference
|
||||||
|
temperature: 1.5 # Sampling temperature (higher = more creative)
|
||||||
|
min_p: 0.1 # Min-p sampling parameter
|
||||||
|
use_cache: true # Use key-value cache for faster generation
|
||||||
@@ -0,0 +1,78 @@
|
|||||||
|
# Comprehensive Instruct Configuration
|
||||||
|
# This file defines all parameters for instruction fine-tuning using conversational data
|
||||||
|
# Organized by level: task, data processing, model, training, and inference
|
||||||
|
|
||||||
|
# Task Configuration
|
||||||
|
task:
|
||||||
|
name: "code_reasoning" # Task name: instruct, code_reasoning, general_chat
|
||||||
|
type: "instruction_following" # Model type: instruction_following, conversational
|
||||||
|
|
||||||
|
# Data Processing Configuration
|
||||||
|
data:
|
||||||
|
source: "custom" # Data source: "huggingface" or "custom"
|
||||||
|
data_path: "./data/raw/instruct/code_reasoning.jsonl" # Path to conversation data file
|
||||||
|
data_format: "jsonl" # Data format: "jsonl", "json"
|
||||||
|
|
||||||
|
# Field Mapping for Conversation Data
|
||||||
|
conversation_field: "conversation" # Field name containing conversation array
|
||||||
|
|
||||||
|
# Data Format & Processing
|
||||||
|
max_length: 2048 # Maximum length in characters of a single conversation turn (longer turns are dropped by the preprocessor, not truncated)
|
||||||
|
min_length: 10 # Minimum text length (filter out shorter texts)
|
||||||
|
|
||||||
|
# Text Preprocessing
|
||||||
|
clean_text: true # Clean and normalize text
|
||||||
|
|
||||||
|
# Data Splitting
|
||||||
|
train_split: 0.8 # Training split ratio (0.0 to 1.0)
|
||||||
|
validation_split: 0.1 # Validation split ratio (0.0 to 1.0)
|
||||||
|
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_format: "conversation" # Output format: "conversation" (chat format)
|
||||||
|
output_dir: "./data/processed/instruct/code_reasoning" # Output directory for processed data
|
||||||
|
|
||||||
|
# Model Configuration
|
||||||
|
model:
|
||||||
|
name: "unsloth/Qwen2.5-72B-Instruct" # Model name from HuggingFace Hub (optimized for instruction following)
|
||||||
|
max_length: 2048 # Maximum sequence length for tokenization
|
||||||
|
max_seq_length: 2048 # Maximum sequence length for training (RoPE scaling supported)
|
||||||
|
dtype: null # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
|
||||||
|
load_in_4bit: true # Use 4bit quantization to reduce memory usage
|
||||||
|
token: null # HuggingFace token for gated models (e.g., "hf_...")
|
||||||
|
|
||||||
|
# Training Model Parameters
|
||||||
|
training_model: "unsloth/Qwen2.5-72B-Instruct" # Model to use for training
|
||||||
|
training_max_seq_length: 2048 # Max sequence length for training
|
||||||
|
training_dtype: null # Data type for training
|
||||||
|
training_load_in_4bit: true # 4bit quantization for training
|
||||||
|
|
||||||
|
# Training Configuration
|
||||||
|
training:
|
||||||
|
num_epochs: 1 # Number of training epochs (1 epoch is often sufficient for instruction tuning)
|
||||||
|
batch_size: 1 # Training batch size (small for large models)
|
||||||
|
learning_rate: 2e-4 # Learning rate (typical for instruction tuning)
|
||||||
|
weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting)
|
||||||
|
warmup_steps: 5 # Warmup steps (fixed value)
|
||||||
|
max_steps: 30 # Maximum training steps (adjust based on dataset size)
|
||||||
|
gradient_accumulation_steps: 4 # Gradient accumulation steps
|
||||||
|
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||||
|
seed: 3407 # Random seed for reproducibility
|
||||||
|
|
||||||
|
# LoRA Configuration
|
||||||
|
lora_r: 32 # LoRA rank (higher = more parameters)
|
||||||
|
lora_alpha: 16 # LoRA alpha (scaling factor)
|
||||||
|
lora_dropout: 0 # LoRA dropout (0 is optimized)
|
||||||
|
target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_dir: "./outputs" # Directory for training checkpoints
|
||||||
|
model_output_dir: "./models/instruct" # Directory to save the trained model
|
||||||
|
|
||||||
|
# Inference Configuration
|
||||||
|
inference:
|
||||||
|
batch_size: 1 # Batch size for inference
|
||||||
|
max_new_tokens: 128 # Maximum new tokens to generate during inference
|
||||||
|
temperature: 1.5 # Sampling temperature (higher = more creative)
|
||||||
|
min_p: 0.1 # Min-p sampling parameter
|
||||||
|
use_cache: true # Use key-value cache for faster generation
|
||||||
@@ -0,0 +1,917 @@
|
|||||||
|
import json
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Union, Any, Tuple
|
||||||
|
from datasets import Dataset, load_dataset
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
import logging
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Module-wide logger, named after this module. The level is pinned to DEBUG
# here, so this logger itself does not filter out any record.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
@dataclass
class InstructConfig:
    """Settings for the instruction fine-tuning data pipeline.

    Field order is part of the constructor signature and must not change.
    """

    # --- Where the data comes from -------------------------------------
    data_source: str = "custom"          # "huggingface" or "custom"
    dataset_name: Optional[str] = None   # Hub dataset id (Hugging Face source)
    data_path: Optional[str] = None      # Local file path (custom source)
    data_format: str = "jsonl"           # "jsonl" or "json"

    # --- Field mapping --------------------------------------------------
    # Key of each raw sample that holds the list of conversation turns.
    conversation_field: str = "conversation"

    # --- Sampling and splitting ----------------------------------------
    max_samples: Optional[int] = None    # Cap on loaded samples; None = no cap
    train_split: float = 0.8
    validation_split: float = 0.1
    test_split: float = 0.1

    # --- Text preprocessing --------------------------------------------
    clean_text: bool = True              # Normalise whitespace in turn content
    min_length: int = 10                 # Lower bound on turn length (characters)
    max_length: int = 2048               # Upper bound on turn length (characters)

    # --- Output ---------------------------------------------------------
    output_format: str = "conversation"  # conversation, alpaca
    output_dir: str = "./data/processed/instruct"

    # --- Hugging Face specifics ----------------------------------------
    hf_split: str = "train"
    hf_cache_dir: Optional[str] = None   # Forwarded to load_dataset(cache_dir=...)

    # --- Split provenance ----------------------------------------------
    # NOTE(review): presumably name the split that test/validation data is
    # carved from; not referenced by the loaders visible in this file.
    test_split_from: str = "train"
    val_split_from: str = "train"

    # --- Custom data specifics -----------------------------------------
    encoding: str = "utf-8"              # Text encoding used to open local files
|
||||||
|
|
||||||
|
class ConversationValidator:
    """Validate and summarise conversation datasets organised into splits.

    Each sample is expected to be a dict that stores a list of turns; each
    turn is a dict with a ``role`` (``user``, ``assistant`` or ``system``)
    and a ``content`` entry.
    """

    _EXPECTED_SPLITS = ("train", "validation", "test")
    _VALID_ROLES = ("user", "assistant", "system")
    _PREVIEW_ITEMS = 2     # samples shown per split in the debug preview
    _PREVIEW_TURNS = 3     # turns shown per previewed sample
    _PREVIEW_CHARS = 100   # characters of content shown per previewed turn

    @staticmethod
    def _resolve_conversation_field(config: Any, is_processed: bool) -> str:
        """Return the key under which each sample stores its turns.

        The loaders' preprocessing step always emits the literal key
        ``"conversation"``, whereas raw data uses ``config.conversation_field``.
        Falls back to ``"conversation"`` when the config does not provide one.
        """
        if is_processed:
            return "conversation"
        return getattr(config, "conversation_field", None) or "conversation"

    @staticmethod
    def validate_conversation_data(data: Dict[str, List[Dict]], config: InstructConfig, is_processed: bool = False) -> Tuple[bool, List[str]]:
        """Validate conversation dataset splits.

        Args:
            data: Mapping of split name to list of samples. Must contain the
                ``train``, ``validation`` and ``test`` keys; only ``train``
                has to be non-empty.
            config: Pipeline configuration; supplies the raw conversation
                field name.
            is_processed: True when ``data`` has already been preprocessed.

        Returns:
            ``(is_valid, errors)`` where ``errors`` lists every problem found.
        """
        errors: List[str] = []

        # Check if we have the expected splits
        for split in ConversationValidator._EXPECTED_SPLITS:
            if split not in data:
                errors.append(f"Missing '{split}' split")
            elif split == "train" and not data[split]:
                errors.append("Train split cannot be empty")

        # Structural problems make per-item validation meaningless.
        if errors:
            return False, errors

        total_samples = sum(len(split_data or []) for split_data in data.values())
        logger.info(f"Validating {total_samples} total samples across all splits...")

        conversation_field = ConversationValidator._resolve_conversation_field(config, is_processed)

        for split_name, split_data in data.items():
            if not split_data:
                logger.info(f"Skipping validation for empty {split_name} split")
                continue

            logger.info(f"Validating {split_name} split with {len(split_data)} samples...")
            missing_conversation_count = 0

            for i, item in enumerate(split_data):
                if not isinstance(item, dict):
                    errors.append(f"Item must be a dict in {split_name} split, item {i}")
                    missing_conversation_count += 1
                    continue

                if conversation_field not in item:
                    errors.append(f"Missing conversation field '{conversation_field}' in {split_name} split, item {i}")
                    missing_conversation_count += 1
                    continue

                conversation = item[conversation_field]
                if not isinstance(conversation, list):
                    errors.append(f"Conversation field must be a list in {split_name} split, item {i}")
                    continue

                for j, turn in enumerate(conversation):
                    if not isinstance(turn, dict):
                        errors.append(f"Each conversation turn must be a dict in {split_name} split, item {i}, turn {j}")
                        continue

                    if "role" not in turn:
                        errors.append(f"Missing 'role' field in conversation turn {j}, {split_name} split, item {i}")
                    elif turn["role"] not in ConversationValidator._VALID_ROLES:
                        errors.append(f"Invalid role '{turn['role']}' in conversation turn {j}, {split_name} split, item {i}. Must be 'user', 'assistant', or 'system'")

                    if "content" not in turn:
                        errors.append(f"Missing 'content' field in conversation turn {j}, {split_name} split, item {i}")

            logger.info(f"{split_name} - Items missing conversation field: {missing_conversation_count}")

            ConversationValidator._log_preview(split_name, split_data, conversation_field)

        return len(errors) == 0, errors

    @staticmethod
    def _log_preview(split_name: str, split_data: List[Dict], conversation_field: str) -> None:
        """Log the first few turns of the first few samples for debugging.

        Malformed samples were already reported as validation errors, so this
        preview tolerates them instead of raising.
        """
        logger.info(f"Sample conversation from {split_name}:")
        for i, item in enumerate(split_data[:ConversationValidator._PREVIEW_ITEMS]):
            conversation = item.get(conversation_field, []) if isinstance(item, dict) else []
            if not isinstance(conversation, list):
                conversation = []
            logger.info(f"  Item {i} conversation length: {len(conversation)} turns")

            for j, turn in enumerate(conversation[:ConversationValidator._PREVIEW_TURNS]):
                if not isinstance(turn, dict):
                    continue
                role = turn.get("role", "unknown")
                content = str(turn.get("content", ""))
                if len(content) > ConversationValidator._PREVIEW_CHARS:
                    content = content[:ConversationValidator._PREVIEW_CHARS] + "..."
                logger.info(f"    Turn {j}: {role} -> '{content}'")

    @staticmethod
    def analyze_conversation_dataset(data: Dict[str, List[Dict]], config: InstructConfig, is_processed: bool = False) -> Dict[str, Any]:
        """Analyze conversation dataset characteristics across all splits.

        Args:
            data: Mapping of split name to list of samples.
            config: Pipeline configuration; supplies the raw conversation
                field name.
            is_processed: True when ``data`` has already been preprocessed.

        Returns:
            A dict with a ``splits`` entry (per-split sample count, turn
            statistics and missing-value count) and an ``overall`` entry
            (total samples, split sizes and aggregated turn statistics).
        """
        role_names = ConversationValidator._VALID_ROLES
        conversation_field = ConversationValidator._resolve_conversation_field(config, is_processed)

        analysis: Dict[str, Any] = {
            "splits": {},
            "overall": {
                "total_samples": 0,
                "split_sizes": {},
                "conversation_stats": {
                    "total_turns": 0,
                    "avg_turns_per_conversation": 0,
                    "role_distribution": dict.fromkeys(role_names, 0),
                },
            },
        }

        total_turns = 0
        total_conversations = 0
        role_counts = dict.fromkeys(role_names, 0)

        for split_name, split_data in data.items():
            if not split_data:
                # Empty splits are still reported so callers see every split.
                analysis["splits"][split_name] = {
                    "total_samples": 0,
                    "conversation_stats": {},
                    "missing_values": {},
                }
                analysis["overall"]["split_sizes"][split_name] = 0
                continue

            split_analysis: Dict[str, Any] = {
                "total_samples": len(split_data),
                "conversation_stats": {},
                "missing_values": {},
            }

            split_turns = 0
            split_role_counts = dict.fromkeys(role_names, 0)
            conversation_lengths = []
            missing_count = 0

            for item in split_data:
                conversation = item.get(conversation_field, []) if isinstance(item, dict) else []
                if not conversation:
                    missing_count += 1
                if not isinstance(conversation, list):
                    continue

                conversation_lengths.append(len(conversation))
                split_turns += len(conversation)
                for turn in conversation:
                    if isinstance(turn, dict) and turn.get("role") in split_role_counts:
                        split_role_counts[turn["role"]] += 1

            if conversation_lengths:
                split_analysis["conversation_stats"] = {
                    "total_turns": split_turns,
                    "avg_turns_per_conversation": np.mean(conversation_lengths),
                    "min_turns": min(conversation_lengths),
                    "max_turns": max(conversation_lengths),
                    "median_turns": np.median(conversation_lengths),
                    "role_distribution": split_role_counts,
                }

            split_analysis["missing_values"][conversation_field] = missing_count

            analysis["splits"][split_name] = split_analysis
            analysis["overall"]["total_samples"] += len(split_data)
            analysis["overall"]["split_sizes"][split_name] = len(split_data)

            # Accumulate overall stats
            total_turns += split_turns
            total_conversations += len(split_data)
            for role, count in split_role_counts.items():
                role_counts[role] += count

        # Overall stats keep their zero defaults when there are no samples.
        if total_conversations > 0:
            overall_stats = analysis["overall"]["conversation_stats"]
            overall_stats["total_turns"] = total_turns
            overall_stats["avg_turns_per_conversation"] = total_turns / total_conversations
            overall_stats["role_distribution"] = role_counts

        return analysis
|
||||||
|
|
||||||
|
class BaseInstructDataLoader(ABC):
    """Interface shared by all instruction data loaders.

    Concrete loaders must implement both ``load`` and ``preprocess``.
    """

    @abstractmethod
    def load(self, config: InstructConfig) -> Dict[str, List[Dict]]:
        """Load data and return a dictionary with train/val/test splits."""

    @abstractmethod
    def preprocess(self, data: Dict[str, List[Dict]], config: InstructConfig) -> Dict[str, List[Dict]]:
        """Apply preprocessing steps to every split and return the result."""
|
||||||
|
|
||||||
|
class HuggingFaceInstructDataLoader(BaseInstructDataLoader):
    """Load conversation datasets from Hugging Face Hub."""

    _VALID_ROLES = ("user", "assistant", "system")

    def load(self, config: InstructConfig) -> Dict[str, List[Dict]]:
        """Load a dataset from the Hugging Face Hub.

        The ``train`` split is mandatory. ``validation`` and ``test`` are
        taken from the dataset when it provides splits with exactly those
        names and are left empty otherwise.

        Args:
            config: Pipeline configuration; ``dataset_name`` is required,
                ``hf_cache_dir`` and ``max_samples`` are optional.

        Returns:
            Dict with ``train``, ``validation`` and ``test`` lists of samples.

        Raises:
            ValueError: If ``dataset_name`` is unset or the dataset has no
                ``train`` split.
        """
        if not config.dataset_name:
            raise ValueError("Dataset name is required for Hugging Face datasets")

        logger.info(f"Loading Hugging Face conversation dataset: {config.dataset_name}")

        try:
            dataset = load_dataset(
                config.dataset_name,
                cache_dir=config.hf_cache_dir,
            )

            available_splits = list(dataset.keys())
            logger.info(f"Available splits in dataset: {available_splits}")

            splits_data: Dict[str, List[Dict]] = {
                "train": [],
                "validation": [],
                "test": [],
            }

            # Handle train split
            if "train" not in available_splits:
                logger.error("No 'train' split found in dataset!")
                raise ValueError(f"Dataset {config.dataset_name} does not have a 'train' split")

            train_dataset = dataset["train"]
            logger.info(f"Using 'train' split with {len(train_dataset)} samples")
            splits_data["train"] = list(train_dataset)

            # Handle validation and test splits: reuse the dataset's own
            # held-out splits when present instead of always returning none.
            for split_name in ("validation", "test"):
                if split_name in available_splits:
                    split_dataset = dataset[split_name]
                    logger.info(f"Using '{split_name}' split with {len(split_dataset)} samples")
                    splits_data[split_name] = list(split_dataset)
                else:
                    logger.warning(f"No '{split_name}' split found in dataset; leaving it empty")

            # Apply max_samples limit if specified (per split)
            if config.max_samples:
                for split_name, samples in splits_data.items():
                    if samples:
                        original_size = len(samples)
                        splits_data[split_name] = samples[:config.max_samples]
                        logger.info(f"Limited {split_name} split from {original_size} to {len(splits_data[split_name])} samples")

            logger.info(f"Successfully loaded dataset {config.dataset_name}")
            return splits_data

        except Exception as e:
            # Log with the dataset name for context, then let the caller decide.
            logger.error(f"Error loading dataset {config.dataset_name}: {e}")
            raise

    def preprocess(self, data: Dict[str, List[Dict]], config: InstructConfig) -> Dict[str, List[Dict]]:
        """Apply preprocessing to every split separately.

        Items rejected by ``_preprocess_item`` are dropped and counted.

        Returns:
            Dict with the same split names mapping to the surviving items.
        """
        processed_splits: Dict[str, List[Dict]] = {}

        logger.info("=== PREPROCESSING CONVERSATION DATA ===")

        for split_name, split_data in data.items():
            logger.info(f"Processing {split_name} split with {len(split_data)} items...")

            candidates = (self._preprocess_item(item, config) for item in split_data)
            processed_data = [candidate for candidate in candidates if candidate is not None]
            skipped_count = len(split_data) - len(processed_data)

            processed_splits[split_name] = processed_data
            logger.info(f"{split_name} - Preprocessed {len(processed_data)} samples, skipped {skipped_count} samples")

        return processed_splits

    def _preprocess_item(self, item: Dict, config: InstructConfig) -> Optional[Dict]:
        """Preprocess a single conversation item.

        Keeps only well-formed turns (dict with a known ``role`` and a
        ``content`` whose stripped length lies within
        ``[config.min_length, config.max_length]`` characters).

        Returns:
            ``{"conversation": [...]}`` with the kept turns, or ``None`` when
            fewer than two turns survive or the item is malformed.
        """
        if not isinstance(item, dict):
            return None

        conversation = item.get(config.conversation_field, [])
        if not isinstance(conversation, list) or not conversation:
            return None

        valid_conversation = []
        for turn in conversation:
            if not isinstance(turn, dict):
                continue
            if "role" not in turn or "content" not in turn:
                continue
            if turn["role"] not in self._VALID_ROLES:
                continue

            # NOTE(review): out-of-range turns are dropped individually, which
            # can leave consecutive turns from the same role - confirm intended.
            content = str(turn["content"]).strip()
            if len(content) < config.min_length or len(content) > config.max_length:
                continue

            if config.clean_text:
                content = self._clean_text(content)

            valid_conversation.append({
                "role": turn["role"],
                "content": content,
            })

        # Need at least 2 turns for a conversation
        if len(valid_conversation) < 2:
            return None

        return {"conversation": valid_conversation}

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends.

        Returns an empty string for non-string input.
        """
        if not isinstance(text, str):
            return ""

        # NOTE(review): this also replaces newlines, so multi-line content
        # (e.g. code snippets) ends up on a single line - confirm intended.
        return re.sub(r'\s+', ' ', text).strip()
|
||||||
|
|
||||||
|
class CustomInstructDataLoader(BaseInstructDataLoader):
    """Load custom conversation datasets from local files."""

    _VALID_ROLES = ("user", "assistant", "system")
    _SPLIT_SEED = 42        # fixed seed so splits are reproducible
    _MAX_LOGGED_SKIPS = 3   # skipped items echoed to the log per split

    def load(self, config: InstructConfig) -> Dict[str, List[Dict]]:
        """Load a local conversation dataset and split it.

        Args:
            config: Pipeline configuration; ``data_path`` is required and
                ``data_format`` must be ``"jsonl"`` or ``"json"``.

        Returns:
            Dict with ``train``, ``validation`` and ``test`` lists of samples.

        Raises:
            ValueError: If ``data_path`` is unset or the format is unsupported.
            FileNotFoundError: If ``data_path`` does not exist.
        """
        if not config.data_path:
            raise ValueError("Data path is required for custom datasets")

        file_path = Path(config.data_path)
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        logger.info(f"Loading custom conversation dataset: {file_path}")

        if config.data_format == "jsonl":
            raw_data = self._load_jsonl(file_path, config)
        elif config.data_format == "json":
            raw_data = self._load_json(file_path, config)
        else:
            raise ValueError(f"Unsupported format: {config.data_format}")

        if config.max_samples:
            raw_data = raw_data[:config.max_samples]

        logger.info(f"Loaded {len(raw_data)} conversation samples from {file_path}")

        # Create splits from the raw data
        return self._create_splits(raw_data, config)

    def _create_splits(self, data: List[Dict], config: InstructConfig) -> Dict[str, List[Dict]]:
        """Create train/validation/test splits from raw data.

        Datasets with fewer than 3 samples go entirely to ``train``; datasets
        with fewer than 10 use fixed 0.6/0.2/0.2 ratios. ``config`` is read
        but never modified. A split whose ratio is 0 stays empty; a positive
        ratio yields at least one sample.
        """
        total_samples = len(data)
        logger.info(f"Creating splits from {total_samples} conversation samples...")

        # Handle very small datasets
        if total_samples < 3:
            logger.warning(f"Dataset has only {total_samples} samples. Using all data for training.")
            return {"train": data, "validation": [], "test": []}

        # Local copies: adjusting ratios must not leak into the caller's config.
        train_ratio = config.train_split
        val_ratio = config.validation_split
        test_ratio = config.test_split

        if total_samples < 10:
            train_ratio, val_ratio, test_ratio = 0.6, 0.2, 0.2
            logger.info(f"Small dataset detected. Adjusted split ratios to: train={train_ratio}, val={val_ratio}, test={test_ratio}")

        # A requested split gets at least one sample; a disabled one gets none.
        val_size = max(1, int(total_samples * val_ratio)) if val_ratio > 0 else 0
        test_size = max(1, int(total_samples * test_ratio)) if test_ratio > 0 else 0
        train_size = total_samples - val_size - test_size

        # Ensure train split has at least 1 sample, shrinking validation
        # first and then test, never below one sample each.
        while train_size < 1:
            if val_size > 1:
                val_size -= 1
            elif test_size > 1:
                test_size -= 1
            else:
                break
            train_size += 1

        logger.info(f"Split sizes: train={train_size}, validation={val_size}, test={test_size}")

        seed = self._SPLIT_SEED
        if val_size == 0 and test_size == 0:
            train_data, val_data, test_data = data, [], []
        elif val_size == 0:
            train_data, test_data = train_test_split(data, test_size=test_size, random_state=seed)
            val_data = []
        elif test_size == 0:
            train_data, val_data = train_test_split(data, test_size=val_size, random_state=seed)
            test_data = []
        else:
            # Full three-way split: carve off the held-out pool, then divide it.
            train_data, temp_data = train_test_split(
                data,
                test_size=val_size + test_size,
                random_state=seed,
            )
            val_data, test_data = train_test_split(
                temp_data,
                test_size=test_size,
                random_state=seed,
            )

        splits_data = {
            "train": train_data,
            "validation": val_data,
            "test": test_data,
        }

        logger.info("Created conversation splits:")
        logger.info(f"  Train: {len(splits_data['train'])} samples")
        logger.info(f"  Validation: {len(splits_data['validation'])} samples")
        logger.info(f"  Test: {len(splits_data['test'])} samples")

        return splits_data

    def _load_jsonl(self, file_path: Path, config: InstructConfig) -> List[Dict]:
        """Load a JSONL file, one JSON value per non-blank line.

        Lines that are not valid JSON are logged as warnings and skipped.
        """
        data = []
        with open(file_path, 'r', encoding=config.encoding) as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.warning(f"Invalid JSON at line {line_num}: {e}")
        return data

    def _load_json(self, file_path: Path, config: InstructConfig) -> List[Dict]:
        """Load a JSON file and return its samples as a list.

        Accepts a top-level list, a dict with a ``"data"`` entry, or any
        other single value (wrapped into a one-element list).
        """
        with open(file_path, 'r', encoding=config.encoding) as f:
            data = json.load(f)

        if isinstance(data, list):
            return data
        if isinstance(data, dict) and "data" in data:
            return data["data"]
        return [data]

    def preprocess(self, data: Dict[str, List[Dict]], config: InstructConfig) -> Dict[str, List[Dict]]:
        """Apply preprocessing to every split separately.

        Items rejected by ``_preprocess_item`` are dropped and counted; the
        first few rejected items of each split are logged for debugging.

        Returns:
            Dict with the same split names mapping to the surviving items.
        """
        processed_splits: Dict[str, List[Dict]] = {}

        logger.info("=== PREPROCESSING CUSTOM CONVERSATION DATA ===")

        for split_name, split_data in data.items():
            logger.info(f"Processing {split_name} split with {len(split_data)} items...")

            processed_data = []
            skipped_count = 0

            for i, item in enumerate(split_data):
                processed_item = self._preprocess_item(item, config)
                if processed_item is not None:
                    processed_data.append(processed_item)
                    continue

                skipped_count += 1
                if skipped_count <= self._MAX_LOGGED_SKIPS:  # Log first few skipped items
                    logger.info(f"Skipped item {i} from {split_name}: {item}")

            processed_splits[split_name] = processed_data
            logger.info(f"{split_name} - Preprocessed {len(processed_data)} samples, skipped {skipped_count} samples")

        return processed_splits

    def _preprocess_item(self, item: Dict, config: InstructConfig) -> Optional[Dict]:
        """Preprocess a single conversation item.

        Keeps only well-formed turns (dict with a known ``role`` and a
        ``content`` whose stripped length lies within
        ``[config.min_length, config.max_length]`` characters).

        Returns:
            ``{"conversation": [...]}`` with the kept turns, or ``None`` when
            fewer than two turns survive or the item is malformed.
        """
        if not isinstance(item, dict):
            return None

        conversation = item.get(config.conversation_field, [])
        if not isinstance(conversation, list) or not conversation:
            return None

        valid_conversation = []
        for turn in conversation:
            if not isinstance(turn, dict):
                continue
            if "role" not in turn or "content" not in turn:
                continue
            if turn["role"] not in self._VALID_ROLES:
                continue

            # NOTE(review): out-of-range turns are dropped individually, which
            # can leave consecutive turns from the same role - confirm intended.
            content = str(turn["content"]).strip()
            if len(content) < config.min_length or len(content) > config.max_length:
                continue

            if config.clean_text:
                content = self._clean_text(content)

            valid_conversation.append({
                "role": turn["role"],
                "content": content,
            })

        # Need at least 2 turns for a conversation
        if len(valid_conversation) < 2:
            return None

        return {"conversation": valid_conversation}

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace to one space and trim the ends.

        Returns an empty string for non-string input.
        """
        if not isinstance(text, str):
            return ""

        # NOTE(review): this also replaces newlines, so multi-line content
        # (e.g. code snippets) ends up on a single line - confirm intended.
        return re.sub(r'\s+', ' ', text).strip()
|
||||||
|
|
||||||
|
class InstructDataPipeline:
    """Main instruction fine-tuning data pipeline.

    Combines loading (HuggingFace or custom files), preprocessing,
    validation, analysis and saving of conversation data.
    """

    # Output formats that save_data knows how to write.
    _SUPPORTED_FORMATS = ("jsonl", "json")

    def __init__(self):
        self.validator = ConversationValidator()
        self.hf_loader = HuggingFaceInstructDataLoader()
        self.custom_loader = CustomInstructDataLoader()

    def create_config(
        self,
        data_source: str,
        dataset_name: Optional[str] = None,
        data_path: Optional[str] = None,
        conversation_field: str = "conversation",
        **kwargs
    ) -> InstructConfig:
        """Create an instruction configuration from explicit arguments.

        Extra keyword arguments are forwarded unchanged to ``InstructConfig``.
        """
        return InstructConfig(
            data_source=data_source,
            dataset_name=dataset_name,
            data_path=data_path,
            conversation_field=conversation_field,
            **kwargs
        )

    def load_config_from_yaml(self, yaml_path: str) -> InstructConfig:
        """Load an ``InstructConfig`` from a YAML file.

        A setting that is absent from the parsed YAML, or present with a
        null value, falls back to its default instead of being passed to
        ``InstructConfig`` as ``None``.

        Raises:
            Exception: whatever loading or construction raised; it is logged
                and re-raised unchanged.
        """
        try:
            config_dict = load_yaml_config(yaml_path)

            def setting(key: str, default: Any = None) -> Any:
                # Compare against None (not truthiness) so explicit False/0
                # values from the YAML are honoured.
                value = config_dict.get(key)
                return default if value is None else value

            config = InstructConfig(
                data_source=setting('data_source', 'custom'),
                dataset_name=setting('dataset_name'),
                data_path=setting('data_path'),
                data_format=setting('data_format', 'jsonl'),
                conversation_field=setting('conversation_field', 'conversation'),
                max_samples=setting('max_samples'),
                train_split=setting('train_split', 0.8),
                validation_split=setting('validation_split', 0.1),
                test_split=setting('test_split', 0.1),
                clean_text=setting('clean_text', True),
                min_length=setting('min_length', 10),
                max_length=setting('max_length', 2048),
                output_format=setting('output_format', 'conversation'),
                output_dir=setting('output_dir', './data/processed/instruct'),
                hf_split=setting('hf_split', 'train'),
                hf_cache_dir=setting('hf_cache_dir'),
                test_split_from=setting('test_split_from', 'train'),
                val_split_from=setting('val_split_from', 'train'),
                encoding=setting('encoding', 'utf-8')
            )

            logger.info(f"Configuration loaded from YAML: {yaml_path}")
            logger.info(f"Output directory: {config.output_dir}")
            logger.info(f"Conversation field: {config.conversation_field}")

            return config

        except Exception as e:
            logger.error(f"Error loading configuration from YAML {yaml_path}: {e}")
            raise

    def load_and_preprocess(self, config: InstructConfig) -> Tuple[Dict[str, List[Dict]], Dict[str, Any]]:
        """Load, preprocess, validate and analyse conversation data.

        Returns:
            ``(processed_splits, analysis)``: the preprocessed data keyed by
            split name, and the validator's analysis of it.

        Raises:
            ValueError: if ``config.data_source`` is neither "huggingface"
                nor "custom", or if validation of the processed data fails.
        """
        logger.info("Starting conversation data loading and preprocessing...")
        logger.info(f"Data source: {config.data_source}")

        try:
            if config.data_source == "huggingface":
                logger.info("Loading HuggingFace conversation dataset...")
                raw_splits = self.hf_loader.load(config)
                logger.info("Preprocessing HuggingFace conversation dataset...")
                processed_splits = self.hf_loader.preprocess(raw_splits, config)
            elif config.data_source == "custom":
                logger.info("Loading custom conversation dataset...")
                raw_splits = self.custom_loader.load(config)
                logger.info("Preprocessing custom conversation dataset...")
                processed_splits = self.custom_loader.preprocess(raw_splits, config)
            else:
                raise ValueError(f"Unsupported data source: {config.data_source}")

            logger.info("Conversation data loading and preprocessing completed successfully")

            # Validate processed data
            logger.info("Validating processed conversation data...")
            is_valid, errors = self.validator.validate_conversation_data(processed_splits, config, is_processed=True)
            if not is_valid:
                logger.error("Conversation data validation failed:")
                for error in errors:
                    logger.error(f"  - {error}")
                raise ValueError("Conversation data validation failed")

            logger.info("Conversation data validation passed")

            # Analyze dataset
            logger.info("Analyzing conversation dataset...")
            analysis = self.validator.analyze_conversation_dataset(processed_splits, config, is_processed=True)
            logger.info("Conversation dataset analysis completed")

            return processed_splits, analysis

        except Exception as e:
            logger.error(f"Error in load_and_preprocess: {e}")
            raise

    def save_data(self, data: Dict[str, List[Dict]], output_dir: str, format: str = "jsonl"):
        """Save processed conversation splits, one file per split.

        Args:
            data: Mapping of split name to its list of samples.
            output_dir: Directory to write into; created if missing.
            format: "jsonl" (one JSON object per line) or "json" (one
                indented JSON array per file).

        Raises:
            ValueError: if ``format`` is not supported. Checked before
                anything is written, so an unknown format no longer fails
                later on an unbound ``output_file``.
        """
        if format not in self._SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported output format: {format!r} (expected 'jsonl' or 'json')"
            )

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        for split_name, split_data in data.items():
            output_file = output_path / f"{split_name}.{format}"
            with open(output_file, 'w', encoding='utf-8') as f:
                if format == "jsonl":
                    f.writelines(
                        json.dumps(item, ensure_ascii=False) + '\n' for item in split_data
                    )
                else:
                    json.dump(split_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Saved {len(split_data)} conversation samples to {output_file}")

    def run_pipeline(
        self,
        config: InstructConfig,
        save_splits: bool = True
    ) -> Dict[str, Any]:
        """Run the complete instruction data pipeline.

        Args:
            config: Pipeline configuration.
            save_splits: When true, write the processed splits to
                ``config.output_dir`` (in the default "jsonl" format).

        Returns:
            A summary dict with the keys "config", "analysis", "splits"
            (sample count per split), "output_format", "output_dir" and
            "data" (the processed splits themselves).
        """
        logger.info("Starting instruction data pipeline...")

        # Load and preprocess data
        processed_splits, analysis = self.load_and_preprocess(config)

        # Save data if requested
        if save_splits:
            output_dir = Path(config.output_dir)
            self.save_data(processed_splits, str(output_dir))

        # Create result summary
        result = {
            "config": config,
            "analysis": analysis,
            "splits": {
                split_name: len(split_data) for split_name, split_data in processed_splits.items()
            },
            "output_format": config.output_format,
            "output_dir": config.output_dir,
            "data": processed_splits,  # Include the actual processed data
        }

        logger.info("Instruction data pipeline completed successfully!")
        return result
|
||||||
|
|
||||||
|
def load_yaml_config(config_path: str) -> Dict[str, Any]:
    """Load a YAML configuration file and flatten it into a settings dict.

    The ``task`` section contributes ``task_name`` / ``task_type``. The
    ``data`` section contributes ``data_source`` (from its ``source`` key)
    and the data-processing settings listed in ``data_keys`` below, under
    the same names.

    Only settings that are actually present with a non-null value are
    returned, so callers can rely on ``config_dict.get(key, default)``.

    Raises:
        ValueError: if the top level of the YAML document is not a mapping.
        Exception: any error from opening or parsing the file; it is logged
            and re-raised unchanged.
    """
    # Keys copied verbatim from the ``data`` section.
    data_keys = (
        'dataset_name', 'data_path', 'data_format', 'conversation_field',
        'max_samples', 'train_split', 'validation_split', 'test_split',
        'clean_text', 'min_length', 'max_length', 'output_format',
        'output_dir', 'encoding',
        # Read by the config builders elsewhere in this file, so they are
        # extracted here as well to make them configurable from YAML.
        'hf_split', 'hf_cache_dir', 'test_split_from', 'val_split_from',
    )

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document.
            yaml_data = yaml.safe_load(f) or {}

        if not isinstance(yaml_data, dict):
            raise ValueError(
                f"Top level of YAML config must be a mapping, got {type(yaml_data).__name__}"
            )

        # A section written as a bare key ("data:") parses to None.
        task_data = yaml_data.get('task') or {}
        data_config = yaml_data.get('data') or {}

        extracted = {
            'task_name': task_data.get('name'),
            'task_type': task_data.get('type'),
            'data_source': data_config.get('source'),
        }
        extracted.update((key, data_config.get(key)) for key in data_keys)

        # Drop unset values; keep explicit False / 0.
        config_dict = {key: value for key, value in extracted.items() if value is not None}

        logger.info(f"Successfully parsed YAML configuration from: {config_path}")
        logger.info(f"Extracted {len(config_dict)} configuration parameters")

        return config_dict

    except Exception as e:
        logger.error(f"Error loading YAML config from {config_path}: {e}")
        raise
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point with YAML configuration support.

    Settings come from the optional ``--config`` YAML file; any CLI option
    that was explicitly given overrides the YAML value. Exits with status 1
    if the YAML cannot be loaded or the pipeline fails.
    """
    parser = argparse.ArgumentParser(description="Instruction Data Processing Pipeline")

    # YAML configuration
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")

    # Data source arguments
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")

    # Field mapping
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")

    # Data processing
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")

    # Output configuration
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Set up logging
    logging.basicConfig(
        level=getattr(logging, args.log_level),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Load YAML config if provided
    config_dict = {}
    if args.config:
        try:
            config_dict = load_yaml_config(args.config)
        except Exception as e:
            logger.error(f"Error loading YAML config: {e}")
            sys.exit(1)

    # Override YAML config with CLI arguments. An option counts as "given"
    # when it is not None, so legitimate falsy values such as
    # "--test-split 0" are not silently dropped.
    overridable = (
        'data_source', 'dataset_name', 'data_path', 'data_format',
        'conversation_field', 'max_samples', 'train_split',
        'validation_split', 'test_split', 'output_dir',
    )
    cli_overrides = {
        key: getattr(args, key) for key in overridable if getattr(args, key) is not None
    }

    # Merge configurations
    for key, value in cli_overrides.items():
        # Only report an override when the YAML really supplied a value.
        if config_dict.get(key) is not None:
            logger.info(f"Overriding YAML config '{key}' with CLI value: {value}")
        config_dict[key] = value

    # Validate required arguments
    if not config_dict.get('data_source'):
        parser.error("--data-source is required (either in YAML config or CLI)")

    if config_dict.get('data_source') == "huggingface" and not config_dict.get('dataset_name'):
        parser.error("--dataset-name is required for HuggingFace datasets")

    if config_dict.get('data_source') == "custom" and not config_dict.get('data_path'):
        parser.error("--data-path is required for custom datasets")

    def setting(key, default=None):
        # A key stored with value None must still fall back to its default;
        # dict.get(key, default) alone would hand None to InstructConfig.
        value = config_dict.get(key)
        return default if value is None else value

    # Create configuration object
    config = InstructConfig(
        data_source=setting('data_source', 'custom'),
        dataset_name=setting('dataset_name'),
        data_path=setting('data_path'),
        data_format=setting('data_format', 'jsonl'),
        conversation_field=setting('conversation_field', 'conversation'),
        max_samples=setting('max_samples'),
        train_split=setting('train_split', 0.8),
        validation_split=setting('validation_split', 0.1),
        test_split=setting('test_split', 0.1),
        clean_text=setting('clean_text', True),
        min_length=setting('min_length', 10),
        max_length=setting('max_length', 2048),
        output_format=setting('output_format', 'conversation'),
        output_dir=setting('output_dir', './data/processed/instruct'),
        hf_split=setting('hf_split', 'train'),
        hf_cache_dir=setting('hf_cache_dir'),
        test_split_from=setting('test_split_from', 'train'),
        val_split_from=setting('val_split_from', 'train'),
        encoding=setting('encoding', 'utf-8')
    )

    # Initialize pipeline
    pipeline = InstructDataPipeline()

    try:
        print(f"Starting instruction data pipeline with {config.data_source} data source...")
        if args.config:
            print(f"Using YAML configuration: {args.config}")
        print(f"Conversation field: {config.conversation_field}")
        print()

        result = pipeline.run_pipeline(config, save_splits=True)

        overall = result['analysis']['overall']
        print(f"✅ Pipeline completed successfully!")
        print(f"   Data source: {config.data_source}")
        if config.data_source == "huggingface":
            print(f"   Dataset: {config.dataset_name}")
        else:
            print(f"   Data file: {config.data_path}")
        print(f"   Total samples: {overall['total_samples']}")
        print(f"   Split sizes: {overall['split_sizes']}")
        print(f"   Output directory: {config.output_dir}")
        print(f"   Conversation stats: {overall['conversation_stats']}")

    except Exception as e:
        print(f"❌ Error running pipeline: {e}")
        import traceback
        print("Full error traceback:")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,393 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Inference Pipeline using Trained Models
|
||||||
|
Supports conversational inference with streaming and batch processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional, List, Union
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add the project root to the path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
# Inference imports
|
||||||
|
import torch
|
||||||
|
from datasets import load_from_disk, Dataset
|
||||||
|
from unsloth import FastLanguageModel
|
||||||
|
from unsloth.chat_templates import get_chat_template
|
||||||
|
from transformers import TextStreamer
|
||||||
|
|
||||||
|
class InstructInference:
    """Chat-style inference wrapper around a trained instruction model.

    Typical use: construct with a flat config dict, call
    ``load_model_and_tokenizer()`` and ``setup_chat_template()``, then use
    ``generate_response()``, ``chat()``, ``batch_inference()`` or
    ``interactive_chat()``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Read model and generation settings from *config*.

        Missing keys fall back to the defaults written below. Nothing is
        loaded here; ``model`` and ``tokenizer`` stay ``None`` until
        ``load_model_and_tokenizer()`` is called.
        """
        self.config = config
        self.model = None
        self.tokenizer = None

        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Model parameters
        self.model_output_dir = config.get('model_output_dir', './models/instruct')
        self.base_model_name = config.get('base_model_name', 'unsloth/Qwen2.5-72B-Instruct')
        self.max_seq_length = config.get('max_seq_length', 2048)
        self.dtype = config.get('dtype', None)
        self.load_in_4bit = config.get('load_in_4bit', True)
        self.hf_token = config.get('hf_token', None)

        # Inference parameters
        self.batch_size = config.get('batch_size', 1)
        self.max_new_tokens = config.get('max_new_tokens', 128)
        self.temperature = config.get('temperature', 1.5)
        self.min_p = config.get('min_p', 0.1)
        self.use_cache = config.get('use_cache', True)

        # Chat template
        self.chat_template = config.get('chat_template', 'qwen-2.5')

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``model_output_dir``.

        Raises:
            FileNotFoundError: if the model directory does not exist.
            Exception: any loading error; it is printed and re-raised.
        """
        print("Loading trained instruction model and tokenizer...")

        try:
            # Load the saved LoRA model
            model_path = self.model_output_dir
            print(f"Loading model from: {model_path}")

            # Check if the model directory exists
            if not Path(model_path).exists():
                raise FileNotFoundError(f"Model directory not found: {model_path}")

            # Load the model directly from the saved path
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_path,
                max_seq_length=self.max_seq_length,
                dtype=self.dtype,
                load_in_4bit=self.load_in_4bit,
            )

            # Enable native 2x faster inference
            FastLanguageModel.for_inference(self.model)

            print(f"✅ Model loaded from: {model_path}")
            print(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")

        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise

    def setup_chat_template(self):
        """Attach the configured chat template to the tokenizer."""
        print("Setting up chat template...")

        try:
            self.tokenizer = get_chat_template(
                self.tokenizer,
                chat_template=self.chat_template,
            )
            print(f"✅ Chat template configured: {self.chat_template}")

        except Exception as e:
            print(f"❌ Error setting up chat template: {e}")
            raise

    def format_messages(self, messages: List[Dict[str, str]]) -> str:
        """Render *messages* into a prompt string using the chat template.

        The generation prompt is appended so the model continues with the
        assistant turn.
        """
        try:
            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,  # Add generation prompt for inference
            )

        except Exception as e:
            print(f"❌ Error formatting messages: {e}")
            raise

    def generate_response(
        self,
        messages: List[Dict[str, str]],
        max_new_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        stream: bool = False
    ) -> str:
        """Generate the assistant reply for *messages*.

        Args:
            messages: Conversation so far, as role/content dicts.
            max_new_tokens: Generation limit; ``None`` uses the configured
                default.
            temperature: Sampling temperature; ``None`` uses the configured
                default.
            stream: When true, tokens are also printed as they are produced.

        Returns:
            The newly generated text only, without the prompt. This is
            returned in streaming mode as well, so callers can record the
            reply in their conversation history.
        """
        try:
            # Compare against None so explicit falsy arguments (for example
            # temperature=0.0) are not replaced by the defaults.
            max_tokens = self.max_new_tokens if max_new_tokens is None else max_new_tokens
            temp = self.temperature if temperature is None else temperature

            # Format the messages
            formatted_prompt = self.format_messages(messages)
            print(f"Formatted prompt: {formatted_prompt[:200]}...")

            # Tokenize the input
            inputs = self.tokenizer(
                [formatted_prompt],
                return_tensors="pt"
            ).to(self.device)

            generate_kwargs = {
                "input_ids": inputs.input_ids,
                "attention_mask": inputs.get("attention_mask"),
                "max_new_tokens": max_tokens,
                "use_cache": self.use_cache,
                "temperature": temp,
                "min_p": self.min_p,
                "pad_token_id": self.tokenizer.eos_token_id,
            }

            if stream:
                print("Generating with streaming...")
                generate_kwargs["streamer"] = TextStreamer(self.tokenizer, skip_prompt=True)
            else:
                print("Generating response...")

            outputs = self.model.generate(**generate_kwargs)

            # The returned sequences start with the prompt tokens. Cut by
            # token count: the decoded text has special tokens stripped, so
            # a character offset taken from the formatted prompt would not
            # line up with it.
            prompt_token_count = inputs.input_ids.shape[1]
            new_tokens = outputs[:, prompt_token_count:]
            return self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()

        except Exception as e:
            print(f"❌ Error generating response: {e}")
            raise

    def chat(self, user_input: str, conversation_history: Optional[List[Dict[str, str]]] = None, stream: bool = False) -> str:
        """Send one user message, print the exchange and return the reply.

        ``conversation_history`` is not modified; the caller decides what to
        append to it.
        """
        try:
            history = [] if conversation_history is None else conversation_history

            # Add user input to conversation
            messages = history + [{"role": "user", "content": user_input}]

            print(f"User: {user_input}")

            if stream:
                # The streamer prints the reply itself, directly after this prefix.
                print("Assistant: ", end="", flush=True)
                return self.generate_response(messages, stream=True)

            response = self.generate_response(messages, stream=False)
            print(f"Assistant: {response}")
            return response

        except Exception as e:
            print(f"❌ Error in chat: {e}")
            raise

    def batch_inference(
        self,
        conversations: List[List[Dict[str, str]]],
        max_new_tokens: Optional[int] = None
    ) -> List[str]:
        """Generate one reply per conversation, processed one after another."""
        responses = []

        for i, messages in enumerate(conversations):
            print(f"Processing conversation {i+1}/{len(conversations)}")
            responses.append(self.generate_response(messages, max_new_tokens))

        return responses

    def interactive_chat(self):
        """Run an interactive chat loop on stdin/stdout.

        Commands: 'quit' / 'exit' / 'bye' end the session, 'clear' resets
        the history, 'stream on' / 'stream off' toggle streaming. Ctrl-C or
        end of input also ends the session.
        """
        print("🤖 Starting interactive chat session...")
        print("Type 'quit', 'exit', or 'bye' to end the conversation.")
        print("Type 'clear' to clear conversation history.")
        print("Type 'stream on' or 'stream off' to toggle streaming.")
        print("-" * 50)

        conversation_history = []
        streaming = False

        while True:
            try:
                user_input = input("\n👤 You: ").strip()
            except (KeyboardInterrupt, EOFError):
                # End of input must leave the loop; retrying would fail the
                # same way forever.
                print("\n👋 Goodbye!")
                break

            command = user_input.lower()
            if command in ('quit', 'exit', 'bye'):
                print("👋 Goodbye!")
                break
            if command == 'clear':
                conversation_history = []
                print("🗑️ Conversation history cleared.")
                continue
            if command == 'stream on':
                streaming = True
                print("🔄 Streaming enabled.")
                continue
            if command == 'stream off':
                streaming = False
                print("⏸️ Streaming disabled.")
                continue
            if not user_input:
                continue

            try:
                if streaming:
                    print("🤖 Assistant: ", end="", flush=True)
                response = self.chat(user_input, conversation_history, stream=streaming)

                # Record the real reply in both modes; a placeholder here
                # would be fed back to the model on later turns.
                conversation_history.extend([
                    {"role": "user", "content": user_input},
                    {"role": "assistant", "content": response}
                ])

            except KeyboardInterrupt:
                print("\n👋 Goodbye!")
                break
            except Exception as e:
                print(f"❌ Error: {e}")
                continue
|
||||||
|
|
||||||
|
def load_inference_config(config_path: str) -> Dict[str, Any]:
    """Load inference settings from a YAML configuration file.

    Reads the ``model`` section (``training_*`` keys), the ``training``
    section (``model_output_dir``) and the ``inference`` section
    (generation settings). A section that is missing or empty contributes
    nothing. ``chat_template`` is always set: taken from
    ``inference.chat_template`` when given, otherwise 'qwen-2.5'.

    Raises:
        ValueError: if the top level of the YAML document is not a mapping.
        Exception: any error from opening or parsing the file; it is printed
            and re-raised unchanged.
    """
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document.
            config = yaml.safe_load(f) or {}

        if not isinstance(config, dict):
            raise ValueError(
                f"Top level of YAML config must be a mapping, got {type(config).__name__}"
            )

        inference_config = {}

        # Model configuration. A section written as a bare key ("model:")
        # parses to None and is treated like a missing section.
        model_data = config.get('model')
        if model_data:
            inference_config.update({
                'base_model_name': model_data.get('training_model', 'unsloth/Qwen2.5-72B-Instruct'),
                'max_seq_length': model_data.get('training_max_seq_length', 2048),
                'dtype': model_data.get('training_dtype'),
                'load_in_4bit': model_data.get('training_load_in_4bit', True),
                'hf_token': model_data.get('training_token')
            })

        # Training configuration - to get model_output_dir
        training_data = config.get('training')
        if training_data:
            inference_config.update({
                'model_output_dir': training_data.get('model_output_dir', './models/instruct')
            })

        # Inference configuration
        inference_data = config.get('inference')
        if inference_data:
            inference_config.update({
                'batch_size': inference_data.get('batch_size', 1),
                'max_new_tokens': inference_data.get('max_new_tokens', 128),
                'temperature': inference_data.get('temperature', 1.5),
                'min_p': inference_data.get('min_p', 0.1),
                'use_cache': inference_data.get('use_cache', True)
            })

        # Chat template: configurable, Qwen chat template by default
        inference_config['chat_template'] = (
            (inference_data or {}).get('chat_template') or 'qwen-2.5'
        )

        return inference_config

    except Exception as e:
        print(f"Error loading inference config: {e}")
        raise
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for instruction-model inference.

    Loads the YAML configuration, applies CLI overrides, loads the model and
    then either answers a single ``--message`` or starts an interactive chat
    (the default when no message is given). Exits with status 1 on failure.
    """
    parser = argparse.ArgumentParser(description="Instruction Inference Pipeline")

    # Configuration
    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    parser.add_argument("--interactive", action="store_true", help="Start interactive chat session")
    parser.add_argument("--message", type=str, help="Single message to send to the model")
    parser.add_argument("--max-tokens", type=int, help="Maximum new tokens to generate")
    parser.add_argument("--stream", action="store_true", help="Enable streaming generation")
    parser.add_argument("--temperature", type=float, help="Sampling temperature")

    args = parser.parse_args()

    try:
        # Load configuration
        print(f"Loading configuration from: {args.config}")
        inference_config = load_inference_config(args.config)

        # Override with CLI arguments. Test against None so an explicitly
        # passed zero value is not silently ignored.
        if args.max_tokens is not None:
            inference_config['max_new_tokens'] = args.max_tokens
        if args.temperature is not None:
            inference_config['temperature'] = args.temperature

        print("Inference configuration:")
        for key, value in inference_config.items():
            print(f"  {key}: {value}")

        # Initialize inference
        inference = InstructInference(inference_config)

        # Load model and tokenizer
        inference.load_model_and_tokenizer()

        # Setup chat template
        inference.setup_chat_template()

        # Run inference based on mode
        if args.interactive:
            # Interactive chat mode
            inference.interactive_chat()
        elif args.message:
            # Single message mode
            print("Running single message inference...")
            messages = [{"role": "user", "content": args.message}]

            if args.stream:
                print("User:", args.message)
                print("Assistant: ", end="", flush=True)
                inference.generate_response(messages, stream=True)
            else:
                response = inference.generate_response(messages, stream=False)
                print(f"User: {args.message}")
                print(f"Assistant: {response}")
        else:
            # Default to interactive mode if no specific mode is chosen
            print("No specific mode chosen. Starting interactive chat...")
            inference.interactive_chat()

    except Exception as e:
        print(f"Inference failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,525 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Training Pipeline using Unsloth and SFTTrainer
|
||||||
|
Supports instruction fine-tuning with conversational data and LoRA fine-tuning
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional, List
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add the project root to the path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from utils.config.config_manager import ConfigManager
|
||||||
|
|
||||||
|
# Training imports
|
||||||
|
import torch
|
||||||
|
from datasets import load_from_disk, Dataset
|
||||||
|
from unsloth import FastLanguageModel, is_bfloat16_supported
|
||||||
|
from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
|
||||||
|
from trl import SFTTrainer, SFTConfig
|
||||||
|
from transformers import DataCollatorForSeq2Seq
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class InstructTrainer:
    """Instruction fine-tuning trainer using Unsloth and SFTTrainer.

    Drives the whole LoRA fine-tuning flow: load the base model, attach LoRA
    adapters, install the chat template, load and format the conversation
    dataset, restrict the loss to assistant responses, train, and save.

    Args:
        config: Flat dictionary of settings (the shape returned by
            ``load_training_config``). Any missing key falls back to the
            default shown in ``__init__``.
    """

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.model = None
        self.tokenizer = None
        self.trainer = None

        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Model parameters
        self.model_name = config.get('model_name', 'unsloth/Qwen2.5-72B-Instruct')
        self.max_seq_length = config.get('max_seq_length', 2048)
        self.dtype = config.get('dtype', None)
        self.load_in_4bit = config.get('load_in_4bit', True)
        self.hf_token = config.get('hf_token', None)

        # LoRA parameters
        self.lora_r = config.get('lora_r', 32)
        self.lora_alpha = config.get('lora_alpha', 16)
        self.lora_dropout = config.get('lora_dropout', 0)
        self.target_modules = config.get('target_modules', [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ])

        # Training arguments
        self.batch_size = config.get('batch_size', 1)
        self.gradient_accumulation_steps = config.get('gradient_accumulation_steps', 4)
        self.learning_rate = config.get('learning_rate', 2e-4)
        self.num_epochs = config.get('num_epochs', 1)
        self.max_steps = config.get('max_steps', 30)
        self.warmup_steps = config.get('warmup_steps', 5)
        self.weight_decay = config.get('weight_decay', 0.01)
        self.lr_scheduler_type = config.get('lr_scheduler_type', 'linear')
        self.seed = config.get('seed', 3407)

        # Output paths
        self.output_dir = config.get('output_dir', './outputs')
        self.model_output_dir = config.get('model_output_dir', './models/instruct')

        # Chat template
        self.chat_template = config.get('chat_template', 'qwen-2.5')

    def load_model_and_tokenizer(self):
        """Load the pre-trained model and tokenizer into ``self.model`` / ``self.tokenizer``.

        Raises:
            Exception: Whatever ``FastLanguageModel.from_pretrained`` raises,
                re-raised after being logged.
        """
        logger.info("Loading model and tokenizer...")

        try:
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=self.model_name,
                max_seq_length=self.max_seq_length,
                dtype=self.dtype,
                load_in_4bit=self.load_in_4bit,
                token=self.hf_token
            )

            logger.info(f"✅ Model loaded: {self.model_name}")
            logger.info(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")

        except Exception as e:
            logger.error(f"❌ Error loading model: {e}")
            raise

    def setup_lora(self):
        """Wrap ``self.model`` with LoRA adapters built from the configured hyperparameters.

        Raises:
            Exception: Whatever ``FastLanguageModel.get_peft_model`` raises,
                re-raised after being logged.
        """
        logger.info("Setting up LoRA configuration...")

        try:
            self.model = FastLanguageModel.get_peft_model(
                self.model,
                r=self.lora_r,
                target_modules=self.target_modules,
                lora_alpha=self.lora_alpha,
                lora_dropout=self.lora_dropout,
                bias="none",
                use_gradient_checkpointing="unsloth",
                random_state=self.seed,
                use_rslora=False,
                loftq_config=None
            )

            logger.info(f"✅ LoRA configured with r={self.lora_r}, alpha={self.lora_alpha}")

        except Exception as e:
            logger.error(f"❌ Error setting up LoRA: {e}")
            raise

    def setup_chat_template(self):
        """Replace ``self.tokenizer`` with one carrying the configured chat template.

        Raises:
            Exception: Whatever ``get_chat_template`` raises, re-raised after
                being logged.
        """
        logger.info("Setting up chat template...")

        try:
            self.tokenizer = get_chat_template(
                self.tokenizer,
                chat_template=self.chat_template,
            )

            logger.info(f"✅ Chat template configured: {self.chat_template}")

        except Exception as e:
            logger.error(f"❌ Error setting up chat template: {e}")
            raise

    def load_dataset(self, dataset_path: str) -> Dataset:
        """Load the conversation training dataset.

        Resolution order:
          1. ``dataset_path`` exists and contains ``dataset_info.json``:
             loaded with ``load_from_disk``.
          2. ``dataset_path`` exists otherwise: every line of ``train.jsonl``,
             ``validation.jsonl`` and ``test.jsonl`` found inside it is read.
          3. ``dataset_path`` does not exist locally: treated as a
             HuggingFace Hub dataset id and its ``train`` split is loaded.

        Args:
            dataset_path: Local directory or Hub dataset identifier.

        Returns:
            The loaded dataset, guaranteed to have a ``conversation`` field.

        Raises:
            ValueError: If no JSONL records are found, or the
                ``conversation`` field is missing.
        """
        logger.info(f"Loading conversation dataset from: {dataset_path}")

        try:
            data_dir = Path(dataset_path)

            if data_dir.exists():
                # Check if it's a HuggingFace dataset directory
                if (data_dir / "dataset_info.json").exists():
                    dataset = load_from_disk(dataset_path)
                    logger.info(f"Loaded HuggingFace dataset from disk: {len(dataset)} samples")
                else:
                    # Load from processed conversation data files (JSONL format)
                    logger.info("Loading from processed conversation data files...")

                    all_data = []

                    # NOTE(review): validation.jsonl and test.jsonl are merged
                    # into the training data here, so this method holds
                    # nothing out - confirm that is intended.
                    for split_file in ["train.jsonl", "validation.jsonl", "test.jsonl"]:
                        file_path = data_dir / split_file
                        if file_path.exists():
                            logger.info(f"Loading {split_file}...")
                            with open(file_path, 'r', encoding='utf-8') as f:
                                for line in f:
                                    if line.strip():
                                        all_data.append(json.loads(line))

                    if not all_data:
                        raise ValueError(f"No conversation data found in {dataset_path}")

                    # Create HuggingFace dataset
                    dataset = Dataset.from_list(all_data)
                    logger.info(f"Created HuggingFace dataset from {len(all_data)} conversation samples")
            else:
                # Hub loading is the module-level datasets.load_dataset();
                # the Dataset class has no load_dataset attribute. Imported
                # under an alias so it cannot shadow this method's own name.
                from datasets import load_dataset as load_hub_dataset

                logger.info(f"Attempting to load from HuggingFace Hub: {dataset_path}")
                dataset = load_hub_dataset(dataset_path, split="train")
                logger.info(f"Loaded from HuggingFace Hub: {len(dataset)} samples")

            logger.info(f"Dataset loaded: {len(dataset)} samples")
            logger.info(f"Dataset features: {dataset.features}")

            # Verify required fields exist for conversation data
            required_fields = ["conversation"]
            missing_fields = [field for field in required_fields if field not in dataset.features]
            if missing_fields:
                raise ValueError(f"Missing required fields in conversation dataset: {missing_fields}")

            return dataset

        except Exception as e:
            logger.error(f"Error loading conversation dataset: {e}")
            raise

    def format_dataset_for_training(self, dataset: Dataset) -> Dataset:
        """Render every conversation to a single string with the chat template.

        Args:
            dataset: Dataset with a ``conversation`` column.

        Returns:
            The dataset with an added ``text`` column holding the rendered
            conversation (untokenized, without a generation prompt).
        """
        logger.info("Formatting conversation dataset for training...")

        try:
            # Standardize the ShareGPT format
            logger.info("Standardizing ShareGPT format...")
            dataset = standardize_sharegpt(dataset)

            # Define the formatting function for chat templates
            def formatting_prompts_func(examples):
                convos = examples["conversation"]
                texts = [
                    self.tokenizer.apply_chat_template(
                        convo,
                        tokenize=False,
                        add_generation_prompt=False
                    ) for convo in convos
                ]
                return {"text": texts}

            # Apply the formatting function
            logger.info("Applying chat template formatting...")
            dataset = dataset.map(formatting_prompts_func, batched=True)

            logger.info(f"✅ Dataset formatted for training with {len(dataset)} samples")
            logger.info(f"Sample formatted text: {dataset[0]['text'][:200]}...")

            return dataset

        except Exception as e:
            logger.error(f"❌ Error formatting dataset: {e}")
            raise

    def setup_trainer(self, train_dataset: Dataset):
        """Build ``self.trainer`` (an ``SFTTrainer``) for ``train_dataset``.

        Args:
            train_dataset: Dataset with a ``text`` column, as produced by
                ``format_dataset_for_training``.
        """
        logger.info("Setting up SFTTrainer for instruction fine-tuning...")

        try:
            # SFT Configuration
            sft_config = SFTConfig(
                per_device_train_batch_size=self.batch_size,
                gradient_accumulation_steps=self.gradient_accumulation_steps,
                warmup_steps=self.warmup_steps,
                # A positive max_steps takes precedence over num_train_epochs;
                # the epoch count only applies when max_steps is not positive.
                num_train_epochs=self.num_epochs,
                max_steps=self.max_steps,
                learning_rate=self.learning_rate,
                logging_steps=1,
                optim="paged_adamw_8bit",
                weight_decay=self.weight_decay,
                lr_scheduler_type=self.lr_scheduler_type,
                seed=self.seed,
                output_dir=self.output_dir,
                report_to="none",  # Disable wandb for now
            )

            logger.info("SFT Configuration:")
            logger.info(f" batch_size: {self.batch_size}")
            logger.info(f" gradient_accumulation_steps: {self.gradient_accumulation_steps}")
            logger.info(f" warmup_steps: {self.warmup_steps}")
            logger.info(f" max_steps: {self.max_steps}")
            logger.info(f" learning_rate: {self.learning_rate}")

            # Create SFTTrainer
            self.trainer = SFTTrainer(
                model=self.model,
                tokenizer=self.tokenizer,
                train_dataset=train_dataset,
                dataset_text_field="text",
                max_seq_length=self.max_seq_length,
                data_collator=DataCollatorForSeq2Seq(tokenizer=self.tokenizer),
                packing=False,  # Disable packing for conversation data
                args=sft_config,
            )

            logger.info("✅ SFTTrainer configured successfully")

        except Exception as e:
            logger.error(f"❌ Error setting up trainer: {e}")
            import traceback
            logger.error("Full error traceback:")
            traceback.print_exc()
            raise

    def setup_response_only_training(self):
        """Wrap ``self.trainer`` so only assistant responses contribute to the loss.

        The instruction/response markers are the ``<|im_start|>`` role
        headers, hard-coded here rather than derived from
        ``self.chat_template``.
        """
        logger.info("Setting up response-only training...")

        try:
            # Configure trainer to only train on responses
            self.trainer = train_on_responses_only(
                self.trainer,
                instruction_part="<|im_start|>user\n",
                response_part="<|im_start|>assistant\n",
            )

            logger.info("✅ Response-only training configured")

        except Exception as e:
            logger.error(f"❌ Error setting up response-only training: {e}")
            raise

    def train(self, dataset_path: str):
        """Run the instruction fine-tuning process end to end.

        Args:
            dataset_path: Passed to ``load_dataset``.

        Returns:
            The value returned by ``self.trainer.train()``.

        Raises:
            Exception: Any failure from a step is logged, its traceback is
                printed, and it is re-raised.
        """
        logger.info("🚀 Starting instruction fine-tuning process...")

        try:
            # Load model and tokenizer
            logger.info("Step 1: Loading model and tokenizer...")
            self.load_model_and_tokenizer()

            # Setup LoRA
            logger.info("Step 2: Setting up LoRA...")
            self.setup_lora()

            # Setup chat template
            logger.info("Step 3: Setting up chat template...")
            self.setup_chat_template()

            # Load dataset
            logger.info(f"Step 4: Loading conversation dataset from: {dataset_path}")
            train_dataset = self.load_dataset(dataset_path)

            # Format dataset for training
            logger.info("Step 5: Formatting dataset for training...")
            formatted_dataset = self.format_dataset_for_training(train_dataset)

            # Setup trainer
            logger.info("Step 6: Setting up trainer...")
            self.setup_trainer(formatted_dataset)

            # Setup response-only training (optional but recommended for chat models)
            logger.info("Step 7: Setting up response-only training...")
            self.setup_response_only_training()

            # Start training
            logger.info("Step 8: Starting training...")
            trainer_stats = self.trainer.train()

            logger.info("✅ Instruction fine-tuning completed successfully!")
            logger.info(f"Training stats: {trainer_stats}")

            # Save the model
            self.save_model()

            return trainer_stats

        except Exception as e:
            logger.error(f"❌ Instruction fine-tuning failed: {e}")
            import traceback
            logger.error("Full error traceback:")
            traceback.print_exc()
            raise

    def save_model(self):
        """Save the model, tokenizer and a copy of the training config.

        Everything is written under ``self.model_output_dir``; the config
        goes to ``training_config.json`` with ``hf_token`` blanked out.
        """
        logger.info("Saving trained instruction model...")

        try:
            # Create output directory
            output_path = Path(self.model_output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

            # Save model and tokenizer
            self.model.save_pretrained(self.model_output_dir)
            self.tokenizer.save_pretrained(self.model_output_dir)

            # Never persist the access token next to the model artifacts:
            # the output directory is routinely shared or uploaded.
            saved_config = dict(self.config)
            if saved_config.get('hf_token'):
                saved_config['hf_token'] = None

            # Save training config
            config_path = output_path / "training_config.json"
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(saved_config, f, indent=2)

            logger.info(f"✅ Instruction model saved to: {self.model_output_dir}")
            logger.info("✅ You can now use this model for inference")

        except Exception as e:
            logger.error(f"❌ Error saving model: {e}")
            raise

    def prepare_for_inference(self):
        """Switch ``self.model`` to inference mode via ``FastLanguageModel.for_inference``."""
        logger.info("Preparing model for inference...")

        try:
            FastLanguageModel.for_inference(self.model)
            logger.info("✅ Model prepared for inference")

        except Exception as e:
            logger.error(f"❌ Error preparing for inference: {e}")
            raise
|
||||||
|
|
||||||
|
def load_training_config(yaml_path: str) -> Dict[str, Any]:
    """Load training configuration from a YAML file into a flat dictionary.

    The ``model``, ``training`` and ``data`` sections are read if present;
    keys missing inside a present section get the defaults shown below. The
    result is the flat dictionary that ``InstructTrainer`` consumes.

    Args:
        yaml_path: Path to the YAML configuration file.

    Returns:
        Flat dictionary of training settings.

    Raises:
        Exception: Any error while reading or converting values is logged
            and re-raised.
    """
    try:
        with open(yaml_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document; treat it as
            # "no sections" instead of failing on the membership tests below.
            config = yaml.safe_load(f) or {}

        training_config = {}
        trainer_output_dir = './outputs'

        # Model configuration - extract from model section
        if 'model' in config:
            model_config = config['model'] or {}
            training_config.update({
                'model_name': model_config.get('name', 'unsloth/Qwen2.5-72B-Instruct'),
                'max_seq_length': int(model_config.get('max_seq_length', 2048)),
                'dtype': model_config.get('dtype', None),
                'load_in_4bit': model_config.get('load_in_4bit', True),
                'hf_token': model_config.get('token', None)
            })

        # Training configuration - extract from training section
        if 'training' in config:
            training_data = config['training'] or {}

            # Log raw values with their types to make YAML typing issues
            # (e.g. a number parsed as a string) easy to spot.
            logger.info("Training data from YAML:")
            for key in (
                'num_epochs', 'batch_size', 'learning_rate', 'weight_decay',
                'warmup_steps', 'max_steps', 'gradient_accumulation_steps',
                'seed', 'model_output_dir',
            ):
                raw = training_data.get(key)
                logger.info(f" {key}: {raw} (type: {type(raw)})")

            training_config.update({
                'num_epochs': int(training_data.get('num_epochs', 1)),
                'batch_size': int(training_data.get('batch_size', 1)),
                'learning_rate': float(training_data.get('learning_rate', 2e-4)),
                'weight_decay': float(training_data.get('weight_decay', 0.01)),
                'warmup_steps': int(training_data.get('warmup_steps', 5)),
                'max_steps': int(training_data.get('max_steps', 30)),
                'gradient_accumulation_steps': int(training_data.get('gradient_accumulation_steps', 4)),
                'lr_scheduler_type': training_data.get('lr_scheduler_type', 'linear'),
                'seed': int(training_data.get('seed', 3407)),
                'model_output_dir': training_data.get('model_output_dir', './models/instruct'),
                # LoRA configuration
                'lora_r': int(training_data.get('lora_r', 32)),
                'lora_alpha': int(training_data.get('lora_alpha', 16)),
                'lora_dropout': float(training_data.get('lora_dropout', 0)),
                'target_modules': training_data.get('target_modules', [
                    "q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"
                ])
            })

            # Honour training.output_dir when the YAML sets it.
            trainer_output_dir = training_data.get('output_dir') or trainer_output_dir

        # Data configuration - use output_dir from data section
        if 'data' in config:
            data_config = config['data'] or {}
            output_dir = data_config.get('output_dir', './data/processed/instruct')
            training_config.update({
                'data_output_dir': output_dir,
                'dataset_path': output_dir,  # Default dataset path is the output_dir
            })

        # Output configuration
        training_config.update({
            'output_dir': trainer_output_dir,
            'chat_template': 'qwen-2.5'  # Use Qwen chat template by default
        })

        logger.info("Final training_config:")
        for key, value in training_config.items():
            # Keep the access token out of the logs.
            shown = '***' if key == 'hf_token' and value else value
            logger.info(f" {key}: {shown} (type: {type(value)})")

        return training_config

    except Exception as e:
        logger.error(f"Error loading training config: {e}")
        raise
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point for instruction fine-tuning.

    Loads the YAML configuration, applies any CLI overrides, resolves the
    dataset path (``--dataset`` first, then the YAML-derived
    ``dataset_path``) and runs ``InstructTrainer.train``. Exits with status
    1 when no dataset path is available or when training fails.
    """
    parser = argparse.ArgumentParser(description="Instruction Fine-tuning Training Pipeline")

    # Configuration
    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset (conversation data path)")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args()

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    try:
        # Load configuration
        logger.info(f"Loading configuration from: {args.config}")
        training_config = load_training_config(args.config)

        # Override with CLI arguments. Compare against None rather than
        # truthiness so an explicit 0 / 0.0 on the command line is honoured.
        if args.output_dir:
            training_config['model_output_dir'] = args.output_dir
        if args.epochs is not None:
            training_config['num_epochs'] = args.epochs
        if args.batch_size is not None:
            training_config['batch_size'] = args.batch_size
        if args.learning_rate is not None:
            training_config['learning_rate'] = args.learning_rate
        if args.max_steps is not None:
            training_config['max_steps'] = args.max_steps

        # Determine dataset path: CLI argument takes precedence, then YAML config
        dataset_path = args.dataset or training_config.get('dataset_path')
        if not dataset_path:
            logger.error("No dataset path provided. Use --dataset or ensure output_dir is set in YAML config.")
            sys.exit(1)

        logger.info("Training configuration:")
        for key, value in training_config.items():
            # Keep the access token out of the logs.
            shown = '***' if key == 'hf_token' and value else value
            logger.info(f" {key}: {shown}")
        logger.info(f" Dataset path: {dataset_path}")

        # Initialize trainer
        trainer = InstructTrainer(training_config)

        # Start training
        trainer.train(dataset_path)

        logger.info("Instruction fine-tuning completed successfully!")

    except Exception as e:
        logger.error(f"Instruction fine-tuning failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,320 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct data processor script that uses YAML configurations.
|
||||||
|
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the instruct data processor as a subprocess with a YAML configuration.

    Args:
        config_path: Path passed to the processor's ``--config`` option.
        **cli_overrides: Extra options. Each ``key=value`` whose value is not
            ``None`` becomes ``--key-with-dashes value`` on the command line.

    Returns:
        True if the processor exited with status 0, False otherwise
        (including when the process could not be started).
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # sys.executable keeps the child on the same interpreter/virtualenv as
    # this script; a bare "python" may resolve to another install or not
    # exist on PATH at all.
    cmd = [
        sys.executable, "pipelines/instruct/data_processor.py",
        "--config", config_path
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        # Show whatever the child printed before failing; it is otherwise
        # lost because the output was captured.
        if e.stdout:
            print(e.stdout)
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The process could not be launched at all.
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def run_instruct_examples():
    """Run the two bundled data-processing examples.

    Example 1 always runs the code-reasoning config with a sample cap.
    Example 2 processes a custom conversation file with the same config,
    but only when that file exists on disk.
    """
    base_config = "configs/instruct/code_reasoning.yaml"
    custom_file = "data/raw/instruct/custom_conversations.jsonl"

    # Example 1: Code reasoning instruction tuning
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    first_ok = run_with_yaml_config(
        base_config,
        max_samples=1000,  # Override YAML value
        output_format="conversation",
    )
    if first_ok:
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: Custom conversation dataset (if available)
    print("\n=== Example 2: Custom Conversation Dataset ===")
    if not os.path.exists(custom_file):
        print("⚠️ Custom conversation dataset not found, skipping...")
        print(" You can create one with the 'create-sample-data' option")
        return

    second_ok = run_with_yaml_config(
        base_config,  # Use code_reasoning config as base
        data_source="custom",
        data_path=custom_file,
        output_dir="./data/processed/instruct/custom",
    )
    if second_ok:
        print("✅ Custom conversation dataset processing completed!")
|
||||||
|
|
||||||
|
def create_sample_instruct_data():
    """Write a small sample conversation dataset for testing.

    Creates ``data/raw/instruct/code_reasoning.jsonl`` (relative to the
    current working directory) with one JSON object per line, each holding a
    two-message ``conversation`` array: a user question and an assistant
    answer.
    """
    import json

    # (user question, assistant answer) pairs.
    exchanges = [
        (
            "Can you explain what a Python list comprehension is?",
            "A Python list comprehension is a concise way to create lists. It allows you to generate a new list by applying an expression to each item in an existing iterable, optionally filtering items with a condition. The basic syntax is: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers from 0 to 8.",
        ),
        (
            "How do I handle exceptions in Python?",
            "In Python, you handle exceptions using try-except blocks. Here's the basic structure:\n\ntry:\n # Code that might raise an exception\n risky_code()\nexcept SpecificException as e:\n # Handle specific exception\n print(f'Error: {e}')\nexcept Exception as e:\n # Handle any other exception\n print(f'Unexpected error: {e}')\nelse:\n # Runs if no exception occurred\n print('Success!')\nfinally:\n # Always runs, regardless of exceptions\n cleanup_code()\n\nYou can catch multiple specific exceptions and handle them differently.",
        ),
        (
            "What's the difference between a function and a method in Python?",
            "In Python, the main differences between functions and methods are:\n\n1. **Definition location**: Functions are defined independently, while methods are defined inside classes.\n\n2. **Calling**: Functions are called directly by name, methods are called on objects using dot notation.\n\n3. **First parameter**: Methods automatically receive 'self' as the first parameter (the instance they're called on), functions don't have this automatic parameter.\n\nExample:\n```python\n# Function\ndef my_function(x):\n return x * 2\n\n# Method (inside a class)\nclass MyClass:\n def my_method(self, x):\n return x * 2\n\n# Usage\nresult1 = my_function(5) # Function call\nobj = MyClass()\nresult2 = obj.my_method(5) # Method call\n```",
        ),
        (
            "How can I optimize a slow Python loop?",
            "Here are several strategies to optimize slow Python loops:\n\n1. **Use list comprehensions** instead of explicit loops when possible\n2. **Vectorize with NumPy** for numerical operations\n3. **Use built-in functions** like map(), filter(), sum()\n4. **Avoid repeated lookups** - store frequently accessed values in variables\n5. **Use sets for membership testing** instead of lists\n6. **Consider using itertools** for efficient iteration patterns\n7. **Profile your code** with cProfile to identify bottlenecks\n8. **Use numba or Cython** for compute-intensive loops\n\nExample optimization:\n```python\n# Slow\nresult = []\nfor i in range(1000000):\n if i % 2 == 0:\n result.append(i ** 2)\n\n# Faster\nresult = [i ** 2 for i in range(1000000) if i % 2 == 0]\n\n# Even faster with NumPy\nimport numpy as np\narr = np.arange(1000000)\nresult = (arr[arr % 2 == 0] ** 2).tolist()\n```",
        ),
        (
            "What are Python decorators and how do I use them?",
            "Python decorators are a powerful feature that allows you to modify or extend the behavior of functions or classes without permanently modifying their code. They use the @ symbol and are placed above the function definition.\n\n**Basic decorator example:**\n```python\ndef my_decorator(func):\n def wrapper(*args, **kwargs):\n print('Before function call')\n result = func(*args, **kwargs)\n print('After function call')\n return result\n return wrapper\n\n@my_decorator\ndef greet(name):\n print(f'Hello, {name}!')\n\ngreet('Alice') # Prints before, greeting, and after messages\n```\n\n**Common use cases:**\n- Timing function execution\n- Logging\n- Authentication/authorization\n- Caching results\n- Input validation\n\n**Built-in decorators:**\n- `@property` - creates getter/setter methods\n- `@staticmethod` - methods that don't need self or cls\n- `@classmethod` - methods that receive the class as first argument\n\nDecorators make code more modular and reusable by separating concerns.",
        ),
    ]

    sample_data = [
        {
            "conversation": [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ]
        }
        for question, answer in exchanges
    ]

    # Create directory structure
    target_dir = Path("data/raw/instruct")
    target_dir.mkdir(parents=True, exist_ok=True)

    # Save sample data, one JSON document per line
    sample_file = target_dir / "code_reasoning.jsonl"
    with open(sample_file, 'w', encoding='utf-8') as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in sample_data
        )

    print(f"✅ Created sample conversation dataset: {sample_file}")
    print(f" Contains {len(sample_data)} conversation examples")
    print(" Format: conversation array with role/content pairs")
    print(" Ready to use with configs/instruct/code_reasoning.yaml")
|
||||||
|
|
||||||
|
def create_custom_instruct_config():
    """Write a ready-made YAML config for general chat instruction tuning.

    The file is created at ``configs/instruct/general_chat.yaml`` (relative
    to the current working directory); parent directories are created when
    missing and an existing file is overwritten.
    """
    target = Path("configs/instruct/general_chat.yaml")

    yaml_text = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""

    # Make sure configs/instruct/ exists before writing into it.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w') as handle:
        handle.write(yaml_text)

    print(f"✅ Created custom instruct config: {target.as_posix()}")
    print(" This config is set up for general chat instruction tuning")
|
||||||
|
|
||||||
|
def handle_direct_args():
    """Forward command-line arguments to the instruct data-processing pipeline.

    Parses ``sys.argv`` with the options declared below, rebuilds an
    equivalent command line for ``pipelines/instruct/data_processor.py`` and
    runs it as a child process. The child's stdout is captured and printed
    once it finishes.

    Returns:
        bool: True when the child process exits with status 0, False when it
        exits with a non-zero status.
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Add all the same arguments as the instruct pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Launch the pipeline with the interpreter that is running this script.
    # A bare "python" is resolved through PATH, which can point at a different
    # environment or be missing entirely (raising an uncaught FileNotFoundError).
    cmd = [sys.executable, "pipelines/instruct/data_processor.py"]

    # Forward only options that carry a value. --log-level always does,
    # because it has a default.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # Boolean switches are passed as a bare flag, only when set
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def show_instruct_features():
    """Print the feature overview and usage examples of the data processor."""
    overview = (
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        " - Code reasoning and explanation",
        " - General conversation and chat",
        " - Question answering",
        " - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        " - HuggingFace conversation datasets",
        " - Custom JSONL/JSON files with conversation arrays",
        " - ShareGPT format with role/content structure",
        " - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        " - Role validation (user/assistant/system)",
        " - Content length and quality checks",
        " - Conversation structure validation",
        " - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        " - Configurable conversation field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        " - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/instruct/data_processor.py create-config",
    )
    # An empty entry prints as a blank separator line.
    for text in overview:
        print(text)
|
||||||
|
|
||||||
|
def main():
    """Entry point: dispatch a subcommand, forward options, or print usage.

    * With no arguments, prints the usage overview.
    * With ``examples``, ``create-sample-data``, ``create-config`` or
      ``features`` as the first argument, runs the matching helper.
    * With anything else, treats the arguments as pipeline options and
      forwards them through ``handle_direct_args``; the process exits with
      status 1 when that call reports failure, so shells and CI can detect a
      failed pipeline run.
    """
    if len(sys.argv) <= 1:
        print("Instruct Data Processor")
        print("======================")
        print()
        print("This script runs the instruct data processor for instruction fine-tuning tasks.")
        print("It supports both YAML configurations and command-line overrides.")
        print()
        print("Usage:")
        print(" python scripts/instruct/data_processor.py examples # Run examples")
        print(" python scripts/instruct/data_processor.py create-sample-data # Create sample dataset")
        print(" python scripts/instruct/data_processor.py create-config # Create custom config")
        print(" python scripts/instruct/data_processor.py features # Show features")
        print()
        print("Direct pipeline usage:")
        print(" python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml")
        print(" python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl")
        print()
        print("Key Features:")
        print(" ✅ Instruction fine-tuning with conversation data")
        print(" ✅ Multiple data source support")
        print(" ✅ YAML configuration files")
        print(" ✅ CLI argument overrides")
        print(" ✅ Conversation validation and analysis")
        print(" ✅ Compatible with Unsloth chat templates")
        return

    command = sys.argv[1]
    if command == "examples":
        run_instruct_examples()
    elif command == "create-sample-data":
        create_sample_instruct_data()
    elif command == "create-config":
        create_custom_instruct_config()
    elif command == "features":
        show_instruct_features()
    else:
        # Not a subcommand: pass the options through to the pipeline and
        # surface a failed run as a non-zero exit status.
        if not handle_direct_args():
            sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,291 @@
|
|||||||
|
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Inference Script
|
||||||
|
Provides a command-line interface to run the instruct inference pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_inference_with_config(
    config_path: str,
    message: str = "",
    max_tokens: int = 128,
    stream: bool = False,
    interactive: bool = False,
):
    """Run ``pipelines/instruct/inference.py`` as a child process.

    Args:
        config_path: Path to the YAML configuration file, passed as ``--config``.
        message: Message passed as ``--message``; ignored when empty or when
            ``interactive`` is set.
        max_tokens: Value passed as ``--max-tokens``.
        stream: When true, ``--stream`` is added to the command.
        interactive: When true, ``--interactive`` is added and the child
            inherits this process's stdin/stdout so the user can chat with it.

    Returns:
        ``True`` after a successful interactive session, the child's captured
        stdout (a string) after a successful non-interactive run, or ``None``
        when the child exits with a non-zero status.
    """
    print(f"Running instruct inference with config: {config_path}")
    if interactive:
        print("Mode: Interactive chat")
    elif message:
        print(f"Message: {message}")
    print(f"Max tokens: {max_tokens}")
    print(f"Streaming: {stream}")

    # Use the interpreter running this script instead of whatever "python"
    # resolves to on PATH (it may be another environment, or absent on
    # systems that only ship "python3").
    cmd = [
        sys.executable, "pipelines/instruct/inference.py",
        "--config", config_path,
        "--max-tokens", str(max_tokens),
    ]

    if interactive:
        cmd.append("--interactive")
    elif message:
        cmd.extend(["--message", message])

    if stream:
        cmd.append("--stream")

    print(f"Running: {' '.join(cmd)}")

    try:
        if interactive:
            # For interactive mode, don't capture output
            subprocess.run(cmd, check=True)
            return True
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Inference failed: {e}")
        # stderr is only available when output was captured (non-interactive);
        # in interactive mode the child already wrote it to the terminal.
        if e.stderr is not None:
            print("Error output:")
            print(e.stderr)
        return None

    print("✅ Inference completed successfully!")
    print("Output:")
    print(result.stdout)
    return result.stdout
|
||||||
|
|
||||||
|
def run_conversation_example(config_path: str):
    """Send a fixed set of example questions through the inference pipeline.

    Each question is run as a separate single-message inference with
    ``max_tokens=256``. A failure is reported per message and does not stop
    the remaining examples.

    Args:
        config_path: Path to the YAML configuration file to use for every run.
    """
    print("=== Conversation Example ===")
    print(f"Config: {config_path}")

    example_messages = [
        "Can you explain what recursion is in programming?",
        "How do I debug a Python program?",
        "What's the difference between a list and a tuple in Python?",
        "Can you show me how to use a for loop?",
        "What are the benefits of using functions in programming?",
    ]

    print("Running example conversations...")
    print()

    for number, message in enumerate(example_messages, start=1):
        print(f"--- Example {number} ---")
        result = run_inference_with_config(config_path, message, max_tokens=256)
        # run_inference_with_config returns None on failure and the captured
        # stdout on success; an empty stdout string is still a success, so
        # test for None rather than truthiness.
        if result is None:
            print(f"❌ Failed to process message {number}")
        print()

    print("✅ Conversation examples completed!")
|
||||||
|
|
||||||
|
def show_inference_features():
    """Print a sectioned overview of what the inference pipeline offers."""
    sections = (
        ("1. **Model Support**:", (
            " - Trained LoRA models from instruct training pipeline",
            " - Automatic model loading from config",
            " - Native Unsloth inference optimization",
            " - Chat template integration",
        )),
        ("2. **Inference Modes**:", (
            " - Single message inference",
            " - Interactive chat session",
            " - Streaming generation",
            " - Batch conversation processing",
        )),
        ("3. **Conversation Features**:", (
            " - Multi-turn conversation support",
            " - Context preservation across turns",
            " - Proper role handling (user/assistant/system)",
            " - Chat history management",
        )),
        ("4. **Generation Control**:", (
            " - Configurable max tokens",
            " - Temperature and sampling parameters",
            " - Streaming output support",
            " - Chat template formatting",
        )),
        ("5. **Interactive Features**:", (
            " - Real-time chat interface",
            " - Command support (clear, stream toggle)",
            " - Conversation history tracking",
            " - Graceful exit handling",
        )),
        ("6. **Usage Examples**:", (
            " - Single message: --message 'your question here'",
            " - Interactive chat: --interactive",
            " - Streaming: add --stream flag",
            " - Custom tokens: --max-tokens 256",
        )),
    )

    print("=== Instruct Inference Pipeline Features ===")
    print()
    for position, (heading, bullets) in enumerate(sections):
        # Blank line between sections, but none after the last one.
        if position:
            print()
        print(heading)
        for bullet in bullets:
            print(bullet)
|
||||||
|
|
||||||
|
def create_inference_example():
    """Run one example inference with the code reasoning configuration.

    Looks for ``configs/instruct/code_reasoning.yaml`` relative to the current
    working directory and, when present, sends a single example question
    through ``run_inference_with_config`` with ``max_tokens=256``.

    Returns:
        bool: True when the inference run succeeded, False when the
        configuration file is missing or the run failed.
    """
    print("=== Inference Example: Code Reasoning Chat ===")
    print()

    # Check if we have the required files
    config_path = "configs/instruct/code_reasoning.yaml"

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    print("✅ Found configuration file!")
    print(f" Config: {config_path}")
    print()

    # Example conversation
    example_message = "Can you explain what a Python decorator is and show me a simple example?"

    print(f"Example message: {example_message}")
    print()

    # Run inference
    result = run_inference_with_config(
        config_path=config_path,
        message=example_message,
        max_tokens=256,
    )

    # Failure is signalled by None; a successful run may legitimately return
    # an empty stdout string, so do not rely on truthiness here.
    if result is None:
        print("❌ Example inference failed!")
        return False

    print("✅ Example inference completed successfully!")
    return True
|
||||||
|
|
||||||
|
def start_interactive_chat(config_path: str, stream: bool = False):
    """Launch the inference pipeline in interactive chat mode.

    Args:
        config_path: Path to the YAML configuration file.
        stream: Whether to ask the pipeline for streaming generation.

    Returns:
        False when ``config_path`` does not exist; otherwise whatever
        ``run_inference_with_config`` returns for the interactive run.
    """
    print("=== Interactive Chat Session ===")
    print()

    config_file = Path(config_path)
    if not config_file.exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    streaming_state = "enabled" if stream else "disabled"
    print(f"Starting interactive chat with config: {config_path}")
    print("Streaming:", streaming_state)
    print()

    # Hand control to the pipeline; its result is passed straight back.
    return run_inference_with_config(
        config_path=config_path,
        interactive=True,
        stream=stream,
    )
|
||||||
|
|
||||||
|
def create_batch_test(config_path: str = "configs/instruct/code_reasoning.yaml"):
    """Run a fixed batch of questions through the inference pipeline.

    Args:
        config_path: Path to the YAML configuration file. Defaults to the code
            reasoning configuration so existing zero-argument calls keep
            working.

    Returns:
        bool: True when every question was processed successfully, False when
        the configuration file is missing or at least one run failed.
    """
    print("=== Batch Test: Multiple Questions ===")
    print()

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    # Create a batch of test questions
    test_questions = [
        "What is object-oriented programming?",
        "How do you handle errors in Python?",
        "Explain the concept of variables in programming.",
        "What's the difference between a compiler and an interpreter?",
    ]

    print("Running batch test with multiple questions...")
    print()

    success_count = 0
    for number, question in enumerate(test_questions, start=1):
        print(f"Question {number}: {question}")
        result = run_inference_with_config(config_path, question, max_tokens=200)
        # None means the run failed; an empty stdout string is still a success.
        if result is not None:
            success_count += 1
        print("-" * 50)

    print(f"✅ Batch test completed: {success_count}/{len(test_questions)} questions processed successfully")
    return success_count == len(test_questions)


def main():
    """Parse the command line and dispatch to the selected inference command.

    Commands: ``infer``, ``chat``, ``batch``, ``conversation``, ``features``
    and ``example``. Without a command, an overview of the commands is
    printed. ``infer``, ``chat`` and ``batch`` exit with status 1 when the
    underlying run reports failure.
    """
    parser = argparse.ArgumentParser(description="Instruct Inference Pipeline")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Inference command
    infer_parser = subparsers.add_parser("infer", help="Run single inference")
    infer_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    infer_parser.add_argument("--message", type=str, required=True, help="Message to send to the model")
    infer_parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")
    infer_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Interactive command
    interactive_parser = subparsers.add_parser("chat", help="Start interactive chat")
    interactive_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    interactive_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Batch test command
    batch_parser = subparsers.add_parser("batch", help="Run batch test")
    batch_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Conversation example command
    conv_parser = subparsers.add_parser("conversation", help="Run conversation examples")
    conv_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Features command
    subparsers.add_parser("features", help="Show available features")

    # Example command
    subparsers.add_parser("example", help="Run example inference")

    args = parser.parse_args()

    if args.command == "infer":
        result = run_inference_with_config(
            args.config,
            args.message,
            args.max_tokens,
            args.stream,
        )
        if result is None:
            sys.exit(1)
    elif args.command == "chat":
        if not start_interactive_chat(args.config, args.stream):
            sys.exit(1)
    elif args.command == "batch":
        # Honour the required --config option instead of silently falling
        # back to the built-in default path.
        if not create_batch_test(args.config):
            sys.exit(1)
    elif args.command == "conversation":
        run_conversation_example(args.config)
    elif args.command == "features":
        show_inference_features()
    elif args.command == "example":
        create_inference_example()
    else:
        print("Instruct Inference Pipeline")
        print("==========================")
        print()
        print("Available commands:")
        print(" infer - Run single message inference")
        print(" chat - Start interactive chat session")
        print(" batch - Run batch test with multiple questions")
        print(" conversation - Run conversation examples")
        print(" features - Show available features")
        print(" example - Run example inference")
        print()
        print("Examples:")
        print(" python scripts/instruct/inference.py infer --config configs/instruct/code_reasoning.yaml --message 'Explain Python loops'")
        print(" python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml")
        print(" python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml --stream")
        print()
        print("Key Features:")
        print(" ✅ Interactive chat with conversation history")
        print(" ✅ Streaming generation support")
        print(" ✅ Multi-turn conversation handling")
        print(" ✅ Chat template integration")
        print(" ✅ Configurable generation parameters")


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Training Script
|
||||||
|
Provides a command-line interface to run the instruct training pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
    """Run ``pipelines/instruct/train.py`` as a child process.

    Args:
        config_path: Path to the YAML configuration file, passed as ``--config``.
        dataset_path: Optional dataset path passed as ``--dataset``. When
            omitted, no ``--dataset`` option is sent.
        **cli_overrides: Optional overrides. Recognised keys are
            ``output_dir``, ``epochs``, ``batch_size``, ``learning_rate`` and
            ``max_steps``; keys outside that set and ``None`` values are
            ignored.

    Returns:
        bool: True when the child exits with status 0, False otherwise.
    """
    print(f"Starting instruct training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use output_dir from YAML config")
    print()

    # Build command. sys.executable keeps the child on the same interpreter
    # and environment as this script; a bare "python" depends on PATH and may
    # be a different installation or not exist at all.
    cmd = [sys.executable, "pipelines/instruct/train.py", "--config", config_path]

    # Add dataset path if provided
    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    # Add CLI overrides
    override_flags = {
        "output_dir": "--output-dir",
        "epochs": "--epochs",
        "batch_size": "--batch-size",
        "learning_rate": "--learning-rate",
        "max_steps": "--max-steps",
    }
    for key, value in cli_overrides.items():
        flag = override_flags.get(key)
        if flag is not None and value is not None:
            cmd.extend([flag, str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        # NOTE: output is captured, so the child's log only appears after it
        # has finished.
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("Training completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def show_training_features():
    """Print a sectioned overview of what the training pipeline offers."""
    feature_sections = {
        "1. **Model Support**:": [
            " - Unsloth optimized models (4x faster)",
            " - LoRA fine-tuning for efficiency",
            " - Support for Qwen2.5, Llama-3.1, Mistral, Phi-3",
            " - Chat template integration",
        ],
        "2. **Training Features**:": [
            " - SFTTrainer with conversation data",
            " - Response-only training (train only on assistant responses)",
            " - ShareGPT format standardization",
            " - Automatic mixed precision (FP16/BF16)",
            " - Gradient checkpointing for memory efficiency",
            " - Configurable LoRA parameters",
        ],
        "3. **Conversation Handling**:": [
            " - Multi-turn conversation support",
            " - Proper chat template formatting",
            " - Role-based training (user/assistant/system)",
            " - Context preservation across turns",
        ],
        "4. **Configuration**:": [
            " - YAML configuration files",
            " - CLI argument overrides",
            " - Automatic device detection",
            " - Flexible LoRA configuration",
        ],
        "5. **Output**:": [
            " - Saved LoRA models",
            " - Training logs and checkpoints",
            " - Ready for conversational inference",
        ],
    }

    print("=== Instruct Training Pipeline Features ===")
    # Each section is preceded by one blank line; nothing follows the last.
    for heading, bullets in feature_sections.items():
        print()
        print(heading)
        for bullet in bullets:
            print(bullet)
|
||||||
|
|
||||||
|
def create_training_example():
    """Run a short example training job with the code reasoning configuration.

    Returns False when ``configs/instruct/code_reasoning.yaml`` is missing;
    otherwise returns the result of ``run_training_with_config``.
    """
    print("=== Training Example: Code Reasoning Instruction Tuning ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"
    config_missing = not Path(config_path).exists()
    if config_missing:
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    briefing = [
        "Found required files!",
        f" Config: {config_path}",
        " Dataset: Will use output_dir from YAML config",
        " The training pipeline will automatically:",
        " - Load conversation data from the output_dir specified in YAML",
        " - Convert JSONL files to HuggingFace dataset format",
        " - Apply ShareGPT standardization",
        " - Format conversations with chat templates",
        " - Train the model using SFTTrainer with response-only training",
        "",
    ]
    for text in briefing:
        print(text)

    # No explicit dataset path: the pipeline falls back to the YAML config.
    example_overrides = {
        "epochs": 1,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "max_steps": 30,
    }
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        **example_overrides,
    )

    if success:
        for text in (
            "Training example completed!",
            " Model saved to: ./models/instruct",
            " Ready for conversational inference!",
        ):
            print(text)

    return success
|
||||||
|
|
||||||
|
def create_quick_test():
    """Run a five-step training job to smoke-test the pipeline.

    Returns False when ``configs/instruct/code_reasoning.yaml`` is missing;
    otherwise returns the result of ``run_training_with_config``.
    """
    print("=== Quick Test: Minimal Training Steps ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"
    if Path(config_path).exists():
        print("Running quick test with minimal training steps...")
    else:
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    # Only a handful of optimisation steps: enough to exercise the pipeline.
    quick_overrides = dict(epochs=1, batch_size=1, learning_rate=2e-4, max_steps=5)
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        **quick_overrides,
    )

    if not success:
        return success

    print("Quick test completed!")
    print(" Model saved with minimal training")
    print(" This is just for testing the pipeline")
    return success
|
||||||
|
|
||||||
|
def main():
    """Parse the command line and run the requested training command.

    ``features``, ``example`` and ``quick-test`` run their helper directly.
    ``train`` requires ``--config`` and exits with status 1 when it is missing
    or when the training run reports failure.
    """
    cli = argparse.ArgumentParser(description="Instruct Training Script")

    # Subcommands
    cli.add_argument(
        "command",
        choices=["train", "example", "features", "quick-test"],
        help="Command to run",
    )

    # Training arguments
    training_options = (
        ("--config", str, "Path to YAML configuration file"),
        ("--dataset", str, "Path to training dataset"),
        ("--output-dir", str, "Output directory for model"),
        ("--epochs", int, "Number of training epochs"),
        ("--batch-size", int, "Training batch size"),
        ("--learning-rate", float, "Learning rate"),
        ("--max-steps", int, "Maximum training steps"),
    )
    for flag, value_type, description in training_options:
        cli.add_argument(flag, type=value_type, help=description)

    args = cli.parse_args()

    if args.command != "train":
        # The remaining commands take no options and their result is not used.
        helpers = {
            "features": show_training_features,
            "example": create_training_example,
            "quick-test": create_quick_test,
        }
        helpers[args.command]()
        return

    if not args.config:
        print("❌ --config is required for training")
        print("Usage: python scripts/instruct/train.py train --config config.yaml")
        sys.exit(1)

    # Without --dataset the pipeline is left to use output_dir from the config.
    trained = run_training_with_config(
        config_path=args.config,
        dataset_path=args.dataset or None,
        output_dir=args.output_dir,
        epochs=args.epochs,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        max_steps=args.max_steps,
    )
    if not trained:
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,320 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct data processor script that uses YAML configurations.
|
||||||
|
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the instruct data processor pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration file, passed as ``--config``.
        **cli_overrides: Extra options for the pipeline. Each key is turned
            into a ``--dashed-name`` flag followed by ``str(value)``; entries
            whose value is ``None`` are skipped.

    Returns:
        bool: True when the child exits with status 0, False otherwise.
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # Run the pipeline with the interpreter executing this script. A bare
    # "python" is looked up on PATH and may be missing (this script itself is
    # launched via a python3 shebang) or belong to another environment.
    cmd = [
        sys.executable, "pipelines/instruct/data_processor.py",
        "--config", config_path,
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def run_instruct_examples():
    """Run the two bundled data-processing examples.

    The first processes the code reasoning configuration with overrides. The
    second reuses that configuration for a custom conversation file and is
    skipped when the file does not exist.
    """
    base_config = "configs/instruct/code_reasoning.yaml"
    custom_data = "data/raw/instruct/custom_conversations.jsonl"

    # Example 1: code reasoning instruction tuning, overriding YAML values
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    first_overrides = {"max_samples": 1000, "output_format": "conversation"}
    if run_with_yaml_config(base_config, **first_overrides):
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: custom conversation dataset, only when the file is present
    print("\n=== Example 2: Custom Conversation Dataset ===")
    if not os.path.exists(custom_data):
        print("⚠️ Custom conversation dataset not found, skipping...")
        print(" You can create one with the 'create-sample-data' option")
        return

    second_overrides = {
        "data_source": "custom",
        "data_path": custom_data,
        "output_dir": "./data/processed/instruct/custom",
    }
    if run_with_yaml_config(base_config, **second_overrides):
        print("✅ Custom conversation dataset processing completed!")
|
||||||
|
|
||||||
|
def create_sample_instruct_data():
    """Write a five-example conversation dataset for testing.

    The file is written as JSONL (one JSON object per line) to
    ``data/raw/instruct/code_reasoning.jsonl``; the directory is created
    when missing and an existing file is overwritten. Every record holds a
    ``"conversation"`` list with one ``user`` and one ``assistant`` turn.

    The Python snippets embedded in the assistant answers use regular
    four-space block indentation so that the sample data contains
    syntactically valid code.
    """
    import json

    sample_data = [
        {
            "conversation": [
                {"role": "user", "content": "Can you explain what a Python list comprehension is?"},
                {"role": "assistant", "content": "A Python list comprehension is a concise way to create lists. It allows you to generate a new list by applying an expression to each item in an existing iterable, optionally filtering items with a condition. The basic syntax is: [expression for item in iterable if condition]. For example: [x**2 for x in range(10) if x%2==0] creates a list of squares of even numbers from 0 to 8."},
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "How do I handle exceptions in Python?"},
                {"role": "assistant", "content": (
                    "In Python, you handle exceptions using try-except blocks. Here's the basic structure:\n\n"
                    "try:\n"
                    "    # Code that might raise an exception\n"
                    "    risky_code()\n"
                    "except SpecificException as e:\n"
                    "    # Handle specific exception\n"
                    "    print(f'Error: {e}')\n"
                    "except Exception as e:\n"
                    "    # Handle any other exception\n"
                    "    print(f'Unexpected error: {e}')\n"
                    "else:\n"
                    "    # Runs if no exception occurred\n"
                    "    print('Success!')\n"
                    "finally:\n"
                    "    # Always runs, regardless of exceptions\n"
                    "    cleanup_code()\n\n"
                    "You can catch multiple specific exceptions and handle them differently."
                )},
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "What's the difference between a function and a method in Python?"},
                {"role": "assistant", "content": (
                    "In Python, the main differences between functions and methods are:\n\n"
                    "1. **Definition location**: Functions are defined independently, while methods are defined inside classes.\n\n"
                    "2. **Calling**: Functions are called directly by name, methods are called on objects using dot notation.\n\n"
                    "3. **First parameter**: Methods automatically receive 'self' as the first parameter (the instance they're called on), functions don't have this automatic parameter.\n\n"
                    "Example:\n"
                    "```python\n"
                    "# Function\n"
                    "def my_function(x):\n"
                    "    return x * 2\n"
                    "\n"
                    "# Method (inside a class)\n"
                    "class MyClass:\n"
                    "    def my_method(self, x):\n"
                    "        return x * 2\n"
                    "\n"
                    "# Usage\n"
                    "result1 = my_function(5) # Function call\n"
                    "obj = MyClass()\n"
                    "result2 = obj.my_method(5) # Method call\n"
                    "```"
                )},
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "How can I optimize a slow Python loop?"},
                {"role": "assistant", "content": (
                    "Here are several strategies to optimize slow Python loops:\n\n"
                    "1. **Use list comprehensions** instead of explicit loops when possible\n"
                    "2. **Vectorize with NumPy** for numerical operations\n"
                    "3. **Use built-in functions** like map(), filter(), sum()\n"
                    "4. **Avoid repeated lookups** - store frequently accessed values in variables\n"
                    "5. **Use sets for membership testing** instead of lists\n"
                    "6. **Consider using itertools** for efficient iteration patterns\n"
                    "7. **Profile your code** with cProfile to identify bottlenecks\n"
                    "8. **Use numba or Cython** for compute-intensive loops\n\n"
                    "Example optimization:\n"
                    "```python\n"
                    "# Slow\n"
                    "result = []\n"
                    "for i in range(1000000):\n"
                    "    if i % 2 == 0:\n"
                    "        result.append(i ** 2)\n"
                    "\n"
                    "# Faster\n"
                    "result = [i ** 2 for i in range(1000000) if i % 2 == 0]\n"
                    "\n"
                    "# Even faster with NumPy\n"
                    "import numpy as np\n"
                    "arr = np.arange(1000000)\n"
                    "result = (arr[arr % 2 == 0] ** 2).tolist()\n"
                    "```"
                )},
            ]
        },
        {
            "conversation": [
                {"role": "user", "content": "What are Python decorators and how do I use them?"},
                {"role": "assistant", "content": (
                    "Python decorators are a powerful feature that allows you to modify or extend the behavior of functions or classes without permanently modifying their code. They use the @ symbol and are placed above the function definition.\n\n"
                    "**Basic decorator example:**\n"
                    "```python\n"
                    "def my_decorator(func):\n"
                    "    def wrapper(*args, **kwargs):\n"
                    "        print('Before function call')\n"
                    "        result = func(*args, **kwargs)\n"
                    "        print('After function call')\n"
                    "        return result\n"
                    "    return wrapper\n"
                    "\n"
                    "@my_decorator\n"
                    "def greet(name):\n"
                    "    print(f'Hello, {name}!')\n"
                    "\n"
                    "greet('Alice') # Prints before, greeting, and after messages\n"
                    "```\n\n"
                    "**Common use cases:**\n"
                    "- Timing function execution\n"
                    "- Logging\n"
                    "- Authentication/authorization\n"
                    "- Caching results\n"
                    "- Input validation\n\n"
                    "**Built-in decorators:**\n"
                    "- `@property` - creates getter/setter methods\n"
                    "- `@staticmethod` - methods that don't need self or cls\n"
                    "- `@classmethod` - methods that receive the class as first argument\n\n"
                    "Decorators make code more modular and reusable by separating concerns."
                )},
            ]
        },
    ]

    # Create directory structure
    data_dir = Path("data/raw/instruct")
    data_dir.mkdir(parents=True, exist_ok=True)

    # Save sample data; ensure_ascii=False keeps non-ASCII text readable in
    # the file, which is why the file is opened explicitly as UTF-8.
    sample_file = data_dir / "code_reasoning.jsonl"
    with open(sample_file, 'w', encoding='utf-8') as f:
        for item in sample_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"✅ Created sample conversation dataset: {sample_file}")
    print(f" Contains {len(sample_data)} conversation examples")
    print(" Format: conversation array with role/content pairs")
    print(" Ready to use with configs/instruct/code_reasoning.yaml")
|
||||||
|
|
||||||
|
def create_custom_instruct_config():
    """Write a starter YAML config for general-chat instruction tuning.

    The template is written to ``configs/instruct/general_chat.yaml``
    (the directory is created when missing; an existing file is
    overwritten). Settings are nested under the five top-level sections
    ``task``, ``data``, ``model``, ``training`` and ``inference`` so that
    keys such as ``max_length``, ``batch_size`` and ``output_dir``, which
    occur in more than one section, do not collide.
    """
    custom_config = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""

    config_path = "configs/instruct/general_chat.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom instruct config: {config_path}")
    print(" This config is set up for general chat instruction tuning")
|
||||||
|
|
||||||
|
def handle_direct_args():
    """Parse command-line options and forward them to the instruct pipeline.

    Options are read from ``sys.argv`` and re-emitted as arguments to
    ``pipelines/instruct/data_processor.py``, which is run as a child
    process under the interpreter that is running this script
    (``sys.executable``), so the pipeline uses the same environment
    instead of whichever ``python`` happens to be first on PATH.

    Returns:
        True when the child process exits with status 0, False when it
        exits with a non-zero status (its captured stderr is printed).
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Options intended to mirror those accepted by the instruct pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging (has a default, so it is always forwarded)
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")

    args = parser.parse_args()

    # Build the command that invokes the instruct pipeline
    cmd = [sys.executable, "pipelines/instruct/data_processor.py"]

    # Forward only the options that carry a value
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        flag = f"--{arg_name.replace('_', '-')}"
        if isinstance(arg_value, bool):
            if arg_value:  # boolean options are bare flags, emitted only when True
                cmd.append(flag)
        else:
            cmd.extend([flag, str(arg_value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def show_instruct_features():
    """Print the data processor's feature overview and usage examples.

    Output goes to stdout only; nothing is returned.
    """
    overview = [
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        " - Code reasoning and explanation",
        " - General conversation and chat",
        " - Question answering",
        " - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        " - HuggingFace conversation datasets",
        " - Custom JSONL/JSON files with conversation arrays",
        " - ShareGPT format with role/content structure",
        " - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        " - Role validation (user/assistant/system)",
        " - Content length and quality checks",
        " - Conversation structure validation",
        " - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        " - Configurable conversation field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        " - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/instruct/data_processor.py create-config",
    ]
    # An empty entry stands for a blank separator line.
    print("\n".join(overview))
|
||||||
|
|
||||||
|
def main():
    """Entry point: dispatch a subcommand, forward options, or show help.

    With no arguments a usage summary is printed. When the first argument
    is one of the known subcommands the matching function runs; any other
    first argument is treated as pipeline options and handed to
    ``handle_direct_args``.
    """
    if len(sys.argv) <= 1:
        usage = [
            "Instruct Data Processor",
            "======================",
            "",
            "This script runs the instruct data processor for instruction fine-tuning tasks.",
            "It supports both YAML configurations and command-line overrides.",
            "",
            "Usage:",
            " python scripts/instruct/data_processor.py examples # Run examples",
            " python scripts/instruct/data_processor.py create-sample-data # Create sample dataset",
            " python scripts/instruct/data_processor.py create-config # Create custom config",
            " python scripts/instruct/data_processor.py features # Show features",
            "",
            "Direct pipeline usage:",
            " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
            " python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl",
            "",
            "Key Features:",
            " ✅ Instruction fine-tuning with conversation data",
            " ✅ Multiple data source support",
            " ✅ YAML configuration files",
            " ✅ CLI argument overrides",
            " ✅ Conversation validation and analysis",
            " ✅ Compatible with Unsloth chat templates",
        ]
        print("\n".join(usage))
        return

    # Subcommand name -> handler; anything else is passed through to the pipeline.
    subcommands = {
        "examples": run_instruct_examples,
        "create-sample-data": create_sample_instruct_data,
        "create-config": create_custom_instruct_config,
        "features": show_instruct_features,
    }
    action = subcommands.get(sys.argv[1], handle_direct_args)
    action()
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,291 @@
|
|||||||
|
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Inference Script
|
||||||
|
Provides a command-line interface to run the instruct inference pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_inference_with_config(config_path: str, message: str = "", max_tokens: int = 128, stream: bool = False, interactive: bool = False):
    """Run ``pipelines/instruct/inference.py`` as a child process.

    The child is started with the interpreter running this script
    (``sys.executable``) so it uses the same environment.

    Args:
        config_path: YAML configuration file passed as ``--config``.
        message: Prompt passed as ``--message``; ignored when
            ``interactive`` is true or when empty.
        max_tokens: Value passed as ``--max-tokens``.
        stream: Add the ``--stream`` flag.
        interactive: Add the ``--interactive`` flag and leave the child
            attached to the terminal (its output is not captured).

    Returns:
        ``True`` after a successful interactive session, the child's
        captured stdout (a string) after a successful non-interactive run,
        or ``None`` when the child exits with a non-zero status.
    """
    print(f"Running instruct inference with config: {config_path}")
    if interactive:
        print("Mode: Interactive chat")
    elif message:
        print(f"Message: {message}")
    print(f"Max tokens: {max_tokens}")
    print(f"Streaming: {stream}")

    cmd = [
        sys.executable, "pipelines/instruct/inference.py",
        "--config", config_path,
        "--max-tokens", str(max_tokens),
    ]

    if interactive:
        cmd.append("--interactive")
    elif message:
        cmd.extend(["--message", message])

    if stream:
        cmd.append("--stream")

    print(f"Running: {' '.join(cmd)}")

    try:
        if interactive:
            # Interactive mode must talk to the terminal directly, so the
            # child's output is deliberately not captured.
            subprocess.run(cmd, check=True)
            return True

        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        print("✅ Inference completed successfully!")
        print("Output:")
        print(result.stdout)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print(f"❌ Inference failed: {e}")
        # stderr is only available when it was captured; in interactive mode
        # the child already wrote its errors straight to the terminal.
        if e.stderr is not None:
            print("Error output:")
            print(e.stderr)
        return None
|
||||||
|
|
||||||
|
def run_conversation_example(config_path: str):
    """Send five canned programming questions through the inference runner.

    Each question is run as a separate single-message inference with
    ``max_tokens=256``. A failure line is printed for any question whose
    run yields a falsy result. Nothing is returned.
    """
    print(f"=== Conversation Example ===")
    print(f"Config: {config_path}")

    example_messages = (
        "Can you explain what recursion is in programming?",
        "How do I debug a Python program?",
        "What's the difference between a list and a tuple in Python?",
        "Can you show me how to use a for loop?",
        "What are the benefits of using functions in programming?",
    )

    print("Running example conversations...")
    print()

    for number, prompt in enumerate(example_messages, start=1):
        print(f"--- Example {number} ---")
        reply = run_inference_with_config(config_path, prompt, max_tokens=256)
        if not reply:
            print(f"❌ Failed to process message {number}")
        print()

    print("✅ Conversation examples completed!")
|
||||||
|
|
||||||
|
def show_inference_features():
    """Print the inference pipeline's feature overview.

    Output goes to stdout only; nothing is returned.
    """
    overview = [
        "=== Instruct Inference Pipeline Features ===",
        "",
        "1. **Model Support**:",
        " - Trained LoRA models from instruct training pipeline",
        " - Automatic model loading from config",
        " - Native Unsloth inference optimization",
        " - Chat template integration",
        "",
        "2. **Inference Modes**:",
        " - Single message inference",
        " - Interactive chat session",
        " - Streaming generation",
        " - Batch conversation processing",
        "",
        "3. **Conversation Features**:",
        " - Multi-turn conversation support",
        " - Context preservation across turns",
        " - Proper role handling (user/assistant/system)",
        " - Chat history management",
        "",
        "4. **Generation Control**:",
        " - Configurable max tokens",
        " - Temperature and sampling parameters",
        " - Streaming output support",
        " - Chat template formatting",
        "",
        "5. **Interactive Features**:",
        " - Real-time chat interface",
        " - Command support (clear, stream toggle)",
        " - Conversation history tracking",
        " - Graceful exit handling",
        "",
        "6. **Usage Examples**:",
        " - Single message: --message 'your question here'",
        " - Interactive chat: --interactive",
        " - Streaming: add --stream flag",
        " - Custom tokens: --max-tokens 256",
    ]
    # An empty entry stands for a blank separator line.
    print("\n".join(overview))
|
||||||
|
|
||||||
|
def create_inference_example():
    """Run one sample prompt against the code-reasoning configuration.

    Returns:
        False when ``configs/instruct/code_reasoning.yaml`` is missing or
        the inference run yields a falsy result; True otherwise.
    """
    print("=== Inference Example: Code Reasoning Chat ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"

    # Bail out early when the required configuration is absent
    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    print("✅ Found configuration file!")
    print(f" Config: {config_path}")
    print()

    example_message = "Can you explain what a Python decorator is and show me a simple example?"
    print(f"Example message: {example_message}")
    print()

    reply = run_inference_with_config(
        config_path=config_path,
        message=example_message,
        max_tokens=256,
    )

    if not reply:
        print("❌ Example inference failed!")
        return False

    print("✅ Example inference completed successfully!")
    return True
|
||||||
|
|
||||||
|
def start_interactive_chat(config_path: str, stream: bool = False):
    """Launch an interactive chat session for the given configuration.

    Args:
        config_path: YAML configuration file; must exist on disk.
        stream: Whether to request streaming generation.

    Returns:
        False when the configuration file is missing, otherwise whatever
        the interactive inference run returns.
    """
    print("=== Interactive Chat Session ===")
    print()

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    streaming_state = "enabled" if stream else "disabled"
    print(f"Starting interactive chat with config: {config_path}")
    print("Streaming:", streaming_state)
    print()

    # Hand over to the inference runner in interactive mode
    return run_inference_with_config(
        config_path=config_path,
        interactive=True,
        stream=stream,
    )
|
||||||
|
|
||||||
|
def create_batch_test(config_path: str = "configs/instruct/code_reasoning.yaml"):
    """Run a fixed set of four questions through the inference runner.

    Args:
        config_path: YAML configuration file to use. Defaults to the
            code-reasoning config, so calls without an argument behave as
            before.

    Returns:
        False when the configuration file is missing; otherwise True only
        if every question produced a truthy inference result.
    """
    print("=== Batch Test: Multiple Questions ===")
    print()

    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor and training first")
        return False

    # Batch of test questions
    test_questions = [
        "What is object-oriented programming?",
        "How do you handle errors in Python?",
        "Explain the concept of variables in programming.",
        "What's the difference between a compiler and an interpreter?",
    ]

    print("Running batch test with multiple questions...")
    print()

    success_count = 0
    for i, question in enumerate(test_questions):
        print(f"Question {i+1}: {question}")
        result = run_inference_with_config(config_path, question, max_tokens=200)
        if result:
            success_count += 1
        print("-" * 50)

    print(f"✅ Batch test completed: {success_count}/{len(test_questions)} questions processed successfully")
    return success_count == len(test_questions)


def main():
    """Parse the command line and dispatch to the selected subcommand.

    Subcommands: ``infer``, ``chat``, ``batch``, ``conversation`` (each
    requires ``--config``), plus ``features`` and ``example``. Without a
    subcommand a help summary is printed.
    """
    parser = argparse.ArgumentParser(description="Instruct Inference Pipeline")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Inference command
    infer_parser = subparsers.add_parser("infer", help="Run single inference")
    infer_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    infer_parser.add_argument("--message", type=str, required=True, help="Message to send to the model")
    infer_parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")
    infer_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Interactive command
    interactive_parser = subparsers.add_parser("chat", help="Start interactive chat")
    interactive_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    interactive_parser.add_argument("--stream", action="store_true", help="Enable streaming generation")

    # Batch test command
    batch_parser = subparsers.add_parser("batch", help="Run batch test")
    batch_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Conversation example command
    conv_parser = subparsers.add_parser("conversation", help="Run conversation examples")
    conv_parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")

    # Features command
    subparsers.add_parser("features", help="Show available features")

    # Example command
    subparsers.add_parser("example", help="Run example inference")

    args = parser.parse_args()

    if args.command == "infer":
        run_inference_with_config(
            args.config,
            args.message,
            args.max_tokens,
            args.stream
        )
    elif args.command == "chat":
        start_interactive_chat(args.config, args.stream)
    elif args.command == "batch":
        # Honour the required --config option instead of silently falling
        # back to the built-in default path.
        create_batch_test(args.config)
    elif args.command == "conversation":
        run_conversation_example(args.config)
    elif args.command == "features":
        show_inference_features()
    elif args.command == "example":
        create_inference_example()
    else:
        print("Instruct Inference Pipeline")
        print("==========================")
        print()
        print("Available commands:")
        print(" infer - Run single message inference")
        print(" chat - Start interactive chat session")
        print(" batch - Run batch test with multiple questions")
        print(" conversation - Run conversation examples")
        print(" features - Show available features")
        print(" example - Run example inference")
        print()
        print("Examples:")
        print(" python scripts/instruct/inference.py infer --config configs/instruct/code_reasoning.yaml --message 'Explain Python loops'")
        print(" python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml")
        print(" python scripts/instruct/inference.py chat --config configs/instruct/code_reasoning.yaml --stream")
        print()
        print("Key Features:")
        print(" ✅ Interactive chat with conversation history")
        print(" ✅ Streaming generation support")
        print(" ✅ Multi-turn conversation handling")
        print(" ✅ Chat template integration")
        print(" ✅ Configurable generation parameters")
|
||||||
|
|
||||||
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,214 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Instruct Training Script
|
||||||
|
Provides a command-line interface to run the instruct training pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
    """Run ``pipelines/instruct/train.py`` as a child process.

    The child is started with the interpreter running this script
    (``sys.executable``) so training uses the same environment.

    Args:
        config_path: YAML configuration file passed as ``--config``.
        dataset_path: Optional dataset location passed as ``--dataset``;
            omitted from the command when falsy.
        **cli_overrides: Optional overrides. Only ``output_dir``,
            ``epochs``, ``batch_size``, ``learning_rate`` and ``max_steps``
            are forwarded (as ``--output-dir`` etc.); other keys and
            ``None`` values are ignored.

    Returns:
        True when the child exits with status 0, False when it exits with
        a non-zero status (its captured stderr is printed).
    """
    print(f"Starting instruct training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use output_dir from YAML config")
    print()

    # Build command
    cmd = [sys.executable, "pipelines/instruct/train.py", "--config", config_path]

    # Add dataset path if provided
    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    # Add CLI overrides; each supported key maps to its dashed flag name
    forwarded_keys = {"output_dir", "epochs", "batch_size", "learning_rate", "max_steps"}
    for key, value in cli_overrides.items():
        if value is not None and key in forwarded_keys:
            cmd.extend([f"--{key.replace('_', '-')}", str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed: {e}")
        print(f"Error output: {e.stderr}")
        return False

    print("Training completed successfully!")
    print(result.stdout)
    return True
|
||||||
|
|
||||||
|
def show_training_features():
    """Print the training pipeline's feature overview.

    Output goes to stdout only; nothing is returned.
    """
    overview = [
        "=== Instruct Training Pipeline Features ===",
        "",
        "1. **Model Support**:",
        " - Unsloth optimized models (4x faster)",
        " - LoRA fine-tuning for efficiency",
        " - Support for Qwen2.5, Llama-3.1, Mistral, Phi-3",
        " - Chat template integration",
        "",
        "2. **Training Features**:",
        " - SFTTrainer with conversation data",
        " - Response-only training (train only on assistant responses)",
        " - ShareGPT format standardization",
        " - Automatic mixed precision (FP16/BF16)",
        " - Gradient checkpointing for memory efficiency",
        " - Configurable LoRA parameters",
        "",
        "3. **Conversation Handling**:",
        " - Multi-turn conversation support",
        " - Proper chat template formatting",
        " - Role-based training (user/assistant/system)",
        " - Context preservation across turns",
        "",
        "4. **Configuration**:",
        " - YAML configuration files",
        " - CLI argument overrides",
        " - Automatic device detection",
        " - Flexible LoRA configuration",
        "",
        "5. **Output**:",
        " - Saved LoRA models",
        " - Training logs and checkpoints",
        " - Ready for conversational inference",
    ]
    # An empty entry stands for a blank separator line.
    print("\n".join(overview))
|
||||||
|
|
||||||
|
def create_training_example():
    """Run a short example training job with the code-reasoning config.

    No dataset path is passed, so the training pipeline is left to locate
    its data from the YAML configuration.

    Returns:
        False when ``configs/instruct/code_reasoning.yaml`` is missing,
        otherwise the result of ``run_training_with_config``.
    """
    print("=== Training Example: Code Reasoning Instruction Tuning ===")
    print()

    config_path = "configs/instruct/code_reasoning.yaml"

    # The configuration file is the only prerequisite checked here
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    print("Found required files!")
    print(f" Config: {config_path}")
    for note in (
        " Dataset: Will use output_dir from YAML config",
        " The training pipeline will automatically:",
        " - Load conversation data from the output_dir specified in YAML",
        " - Convert JSONL files to HuggingFace dataset format",
        " - Apply ShareGPT standardization",
        " - Format conversations with chat templates",
        " - Train the model using SFTTrainer with response-only training",
    ):
        print(note)
    print()

    outcome = run_training_with_config(
        config_path=config_path,
        dataset_path=None,  # fall back to output_dir from the YAML config
        epochs=1,
        batch_size=1,
        learning_rate=2e-4,
        max_steps=30,
    )

    if outcome:
        print("Training example completed!")
        print(" Model saved to: ./models/instruct")
        print(" Ready for conversational inference!")

    return outcome
|
||||||
|
|
||||||
|
def create_quick_test():
    """Run a minimal training job to smoke-test the pipeline.

    Prints a banner, checks that ``configs/instruct/code_reasoning.yaml``
    exists, and then calls ``run_training_with_config`` with fixed overrides
    (1 epoch, batch size 1, learning rate 2e-4, 5 steps) and no explicit
    dataset path.

    Returns:
        False if the configuration file is missing; otherwise whatever
        ``run_training_with_config`` returns.
    """
    print("=== Quick Test: Minimal Training Steps ===", "", sep="\n")

    yaml_config = "configs/instruct/code_reasoning.yaml"
    config_missing = not Path(yaml_config).exists()

    if config_missing:
        print(f"Configuration file not found: {yaml_config}")
        print(" Please run the data processor first to create the configuration")
        return False

    print("Running quick test with minimal training steps...")

    # Only a handful of steps: the goal is to exercise the pipeline end to end.
    smoke_overrides = {
        "epochs": 1,
        "batch_size": 1,
        "learning_rate": 2e-4,
        "max_steps": 5,
    }
    outcome = run_training_with_config(
        config_path=yaml_config,
        dataset_path=None,
        **smoke_overrides,
    )

    if outcome:
        for done_line in (
            "Quick test completed!",
            " Model saved with minimal training",
            " This is just for testing the pipeline",
        ):
            print(done_line)

    return outcome
|
||||||
|
|
||||||
|
def main():
    """Parse command-line arguments and dispatch to the selected command.

    Commands:
        features    Call ``show_training_features()``.
        example     Call ``create_training_example()``.
        quick-test  Call ``create_quick_test()``.
        train       Call ``run_training_with_config()`` with ``--config``
                    (required) plus any optional overrides supplied on the
                    command line.

    Exits with status 1 when ``train`` is invoked without ``--config`` or when
    the selected training command returns a falsy result. Previously only
    ``train`` propagated failure; ``example`` and ``quick-test`` always exited
    with status 0 even when they reported failure.
    """
    parser = argparse.ArgumentParser(description="Instruct Training Script")

    # Subcommands
    parser.add_argument(
        "command",
        choices=["train", "example", "features", "quick-test"],
        help="Command to run",
    )

    # Training arguments (only consumed by the "train" command)
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args()

    if args.command == "features":
        # Purely informational; there is no success/failure result to report.
        show_training_features()
        return

    if args.command == "example":
        success = create_training_example()
    elif args.command == "quick-test":
        success = create_quick_test()
    else:
        # "train" is the only remaining value argparse's `choices` allows.
        if not args.config:
            print("❌ --config is required for training")
            print("Usage: python scripts/instruct/train.py train --config config.yaml")
            sys.exit(1)

        # An omitted (or empty) --dataset is passed through as None.
        dataset_path = args.dataset or None

        success = run_training_with_config(
            config_path=args.config,
            dataset_path=dataset_path,
            output_dir=args.output_dir,
            epochs=args.epochs,
            batch_size=args.batch_size,
            learning_rate=args.learning_rate,
            max_steps=args.max_steps,
        )

    # Surface failure to the shell for every training command, not just "train".
    if not success:
        sys.exit(1)
|
||||||
|
|
||||||
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user