instruct model setup

2025-08-28 17:57:59 +00:00
parent 77c563f358
commit d49b4ff2d5
55 changed files with 27760 additions and 326 deletions
@@ -20,7 +20,7 @@ from utils.config.config_manager import ConfigManager
 # Training imports
 import torch
 from datasets import load_from_disk, Dataset
-from unsloth import FastLanguageModel #is_bfloat16_supported
+from unsloth import FastLanguageModel, is_bfloat16_supported
 from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
 from trl import SFTTrainer, SFTConfig
 from transformers import DataCollatorForSeq2Seq
@@ -132,56 +132,23 @@ class InstructTrainer:
            raise
    
    def load_dataset(self, dataset_path: str) -> Dataset:
-        """Load the conversation training dataset"""
+        """Load the conversation training dataset directly from JSONL file"""
        print(f"Loading conversation dataset from: {dataset_path}")
        
        try:
-            if Path(dataset_path).exists():
-                # Check if it's a HuggingFace dataset directory
-                if (Path(dataset_path) / "dataset_info.json").exists():
-                    # Load from HuggingFace dataset directory
-                    dataset = load_from_disk(dataset_path)
-                    print(f"Loaded HuggingFace dataset from disk: {len(dataset)} samples")
-                else:
-                    # Load from processed conversation data files (JSONL format)
-                    print("Loading from processed conversation data files...")
-                    from datasets import Dataset
-                    import json
-                    
-                    all_data = []
-                    data_dir = Path(dataset_path)
-                    
-                    # Look for train.jsonl, validation.jsonl, test.jsonl
-                    for split_file in ["train.jsonl", "validation.jsonl", "test.jsonl"]:
-                        file_path = data_dir / split_file
-                        if file_path.exists():
-                            print(f"Loading {split_file}...")
-                            with open(file_path, 'r', encoding='utf-8') as f:
-                                for line in f:
-                                    if line.strip():
-                                        data = json.loads(line)
-                                        all_data.append(data)
-                    
-                    if not all_data:
-                        raise ValueError(f"No conversation data found in {dataset_path}")
-                    
-                    # Create HuggingFace dataset
-                    dataset = Dataset.from_list(all_data)
-                    print(f"Created HuggingFace dataset from {len(all_data)} conversation samples")
-            else:
-                # Try loading from HuggingFace Hub
-                print(f"Attempting to load from HuggingFace Hub: {dataset_path}")
-                dataset = Dataset.load_dataset(dataset_path, split="train")
-                print(f"Loaded from HuggingFace Hub: {len(dataset)} samples")
-            
-            print(f"Dataset loaded: {len(dataset)} samples")
-            print(f"Dataset features: {dataset.features}")
-            
-            # Verify required fields exist for conversation data
-            required_fields = ["conversation"]
-            missing_fields = [field for field in required_fields if field not in dataset.features]
-            if missing_fields:
-                raise ValueError(f"Missing required fields in conversation dataset: {missing_fields}")
+            # Read the JSONL file: each line is parsed as one JSON record
+            data = []
+            with open(dataset_path, "r", encoding="utf-8") as f:
+                for line in f:
+                    data.append(json.loads(line))
+
+            print(f"Loaded {len(data)} examples")
+
+            # Convert to HuggingFace Dataset
+            dataset = Dataset.from_list(data)
+
+            print(dataset)
+            print(dataset[0])  # Show first example
            
            return dataset
            
@@ -194,22 +161,16 @@ class InstructTrainer:
        print("Formatting conversation dataset for training...")
        
        try:
+            # Formatting function: render each conversation to text with the tokenizer's chat template
+            def formatting_prompts_func(examples):
+                convos = examples["conversation"]
+                texts = [self.tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
+                return {"text": texts}
+            
            # Standardize the ShareGPT format
            print("Standardizing ShareGPT format...")
            dataset = standardize_sharegpt(dataset)
            
-            # Define the formatting function for chat templates
-            def formatting_prompts_func(examples):
-                convos = examples["conversation"]
-                texts = [
-                    self.tokenizer.apply_chat_template(
-                        convo, 
-                        tokenize=False, 
-                        add_generation_prompt=False
-                    ) for convo in convos
-                ]
-                return {"text": texts}
-            
            # Apply the formatting function
            print("Applying chat template formatting...")
            dataset = dataset.map(formatting_prompts_func, batched=True)
@@ -277,18 +238,29 @@ class InstructTrainer:
        print("Setting up response-only training...")
        
        try:
+            # For Qwen models, we need to use the correct chat template tokens
+            # Qwen uses different tokens than Llama
+            if "qwen" in self.model_name.lower():
+                instruction_part = "<|im_start|>user\n"
+                response_part = "<|im_start|>assistant\n"
+            else:
+                # Default for other models
+                instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n"
+                response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            
            # Configure trainer to only train on responses
            self.trainer = train_on_responses_only(
                self.trainer,
-                instruction_part="<|im_start|>user\n",
-                response_part="<|im_start|>assistant\n",
+                instruction_part=instruction_part,
+                response_part=response_part,
            )
            
            print("✅ Response-only training configured")
            
        except Exception as e:
            print(f"❌ Error setting up response-only training: {e}")
-            raise
+            print("Skipping response-only training and proceeding with full training...")
+            # Don't raise the exception, just continue with regular training
    
    def train(self, dataset_path: str):
        """Run the instruction fine-tuning process"""
@@ -321,7 +293,11 @@ class InstructTrainer:
            
            # Setup response-only training (optional but recommended for chat models)
            print("Step 7: Setting up response-only training...")
-            self.setup_response_only_training()
+            try:
+                self.setup_response_only_training()
+            except Exception as e:
+                print(f"⚠️  Response-only training failed: {e}")
+                print("Continuing with full training (will train on all tokens)...")
            
            # Start training
            print("Step 8: Starting training...")
@@ -432,13 +408,12 @@ def load_training_config(yaml_path: str) -> Dict[str, Any]:
                ])
            })
        
-        # Data configuration - use output_dir from data section
+        # Data configuration - use data_path from data section
        if 'data' in config:
            data_config = config['data']
-            output_dir = data_config.get('output_dir', './data/processed/instruct')
+            data_path = data_config.get('data_path', './data/raw/instruct/code_reasoning.jsonl')
            training_config.update({
-                'data_output_dir': output_dir,
-                'dataset_path': output_dir,  # Default dataset path is the output_dir
+                'dataset_path': data_path,  # Use data_path directly for JSONL file
            })
        
        # Output configuration
@@ -20,7 +20,7 @@ from utils.config.config_manager import ConfigManager
 # Training imports
 import torch
 from datasets import load_from_disk, Dataset
-from unsloth import FastLanguageModel #is_bfloat16_supported
+from unsloth import FastLanguageModel, is_bfloat16_supported
 from unsloth.chat_templates import get_chat_template, standardize_sharegpt, train_on_responses_only
 from trl import SFTTrainer, SFTConfig
 from transformers import DataCollatorForSeq2Seq
@@ -132,56 +132,23 @@ class InstructTrainer:
            raise
    
    def load_dataset(self, dataset_path: str) -> Dataset:
-        """Load the conversation training dataset"""
+        """Load the conversation training dataset directly from JSONL file"""
        print(f"Loading conversation dataset from: {dataset_path}")
        
        try:
-            if Path(dataset_path).exists():
-                # Check if it's a HuggingFace dataset directory
-                if (Path(dataset_path) / "dataset_info.json").exists():
-                    # Load from HuggingFace dataset directory
-                    dataset = load_from_disk(dataset_path)
-                    print(f"Loaded HuggingFace dataset from disk: {len(dataset)} samples")
-                else:
-                    # Load from processed conversation data files (JSONL format)
-                    print("Loading from processed conversation data files...")
-                    from datasets import Dataset
-                    import json
-                    
-                    all_data = []
-                    data_dir = Path(dataset_path)
-                    
-                    # Look for train.jsonl, validation.jsonl, test.jsonl
-                    for split_file in ["train.jsonl", "validation.jsonl", "test.jsonl"]:
-                        file_path = data_dir / split_file
-                        if file_path.exists():
-                            print(f"Loading {split_file}...")
-                            with open(file_path, 'r', encoding='utf-8') as f:
-                                for line in f:
-                                    if line.strip():
-                                        data = json.loads(line)
-                                        all_data.append(data)
-                    
-                    if not all_data:
-                        raise ValueError(f"No conversation data found in {dataset_path}")
-                    
-                    # Create HuggingFace dataset
-                    dataset = Dataset.from_list(all_data)
-                    print(f"Created HuggingFace dataset from {len(all_data)} conversation samples")
-            else:
-                # Try loading from HuggingFace Hub
-                print(f"Attempting to load from HuggingFace Hub: {dataset_path}")
-                dataset = Dataset.load_dataset(dataset_path, split="train")
-                print(f"Loaded from HuggingFace Hub: {len(dataset)} samples")
-            
-            print(f"Dataset loaded: {len(dataset)} samples")
-            print(f"Dataset features: {dataset.features}")
-            
-            # Verify required fields exist for conversation data
-            required_fields = ["conversation"]
-            missing_fields = [field for field in required_fields if field not in dataset.features]
-            if missing_fields:
-                raise ValueError(f"Missing required fields in conversation dataset: {missing_fields}")
+            # Read the JSONL file: each line is parsed as one JSON record
+            data = []
+            with open(dataset_path, "r", encoding="utf-8") as f:
+                for line in f:
+                    data.append(json.loads(line))
+
+            print(f"Loaded {len(data)} examples")
+
+            # Convert to HuggingFace Dataset
+            dataset = Dataset.from_list(data)
+
+            print(dataset)
+            print(dataset[0])  # Show first example
            
            return dataset
            
@@ -194,22 +161,16 @@ class InstructTrainer:
        print("Formatting conversation dataset for training...")
        
        try:
+            # Formatting function: render each conversation to text with the tokenizer's chat template
+            def formatting_prompts_func(examples):
+                convos = examples["conversation"]
+                texts = [self.tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
+                return {"text": texts}
+            
            # Standardize the ShareGPT format
            print("Standardizing ShareGPT format...")
            dataset = standardize_sharegpt(dataset)
            
-            # Define the formatting function for chat templates
-            def formatting_prompts_func(examples):
-                convos = examples["conversation"]
-                texts = [
-                    self.tokenizer.apply_chat_template(
-                        convo, 
-                        tokenize=False, 
-                        add_generation_prompt=False
-                    ) for convo in convos
-                ]
-                return {"text": texts}
-            
            # Apply the formatting function
            print("Applying chat template formatting...")
            dataset = dataset.map(formatting_prompts_func, batched=True)
@@ -277,18 +238,29 @@ class InstructTrainer:
        print("Setting up response-only training...")
        
        try:
+            # For Qwen models, we need to use the correct chat template tokens
+            # Qwen uses different tokens than Llama
+            if "qwen" in self.model_name.lower():
+                instruction_part = "<|im_start|>user\n"
+                response_part = "<|im_start|>assistant\n"
+            else:
+                # Default for other models
+                instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n"
+                response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            
            # Configure trainer to only train on responses
            self.trainer = train_on_responses_only(
                self.trainer,
-                instruction_part="<|im_start|>user\n",
-                response_part="<|im_start|>assistant\n",
+                instruction_part=instruction_part,
+                response_part=response_part,
            )
            
            print("✅ Response-only training configured")
            
        except Exception as e:
            print(f"❌ Error setting up response-only training: {e}")
-            raise
+            print("Skipping response-only training and proceeding with full training...")
+            # Don't raise the exception, just continue with regular training
    
    def train(self, dataset_path: str):
        """Run the instruction fine-tuning process"""
@@ -321,7 +293,11 @@ class InstructTrainer:
            
            # Setup response-only training (optional but recommended for chat models)
            print("Step 7: Setting up response-only training...")
-            self.setup_response_only_training()
+            try:
+                self.setup_response_only_training()
+            except Exception as e:
+                print(f"⚠️  Response-only training failed: {e}")
+                print("Continuing with full training (will train on all tokens)...")
            
            # Start training
            print("Step 8: Starting training...")
@@ -432,13 +408,12 @@ def load_training_config(yaml_path: str) -> Dict[str, Any]:
                ])
            })
        
-        # Data configuration - use output_dir from data section
+        # Data configuration - use data_path from data section
        if 'data' in config:
            data_config = config['data']
-            output_dir = data_config.get('output_dir', './data/processed/instruct')
+            data_path = data_config.get('data_path', './data/raw/instruct/code_reasoning.jsonl')
            training_config.update({
-                'data_output_dir': output_dir,
-                'dataset_path': output_dir,  # Default dataset path is the output_dir
+                'dataset_path': data_path,  # Use data_path directly for JSONL file
            })
        
        # Output configuration