#!/usr/bin/env python3
"""
Instruct data processor script that uses YAML configurations.
This provides a flexible and maintainable approach for instruction fine-tuning tasks.
"""

import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

# Pipeline entry point; a relative path, so it is resolved against the
# current working directory of whoever launches this script.
PIPELINE_SCRIPT = "pipelines/instruct/data_processor.py"

# Template written by create_custom_instruct_config().
_GENERAL_CHAT_CONFIG = """# Custom Instruct Configuration
task:
  name: "general_chat"
  type: "instruction_following"

data:
  source: "custom"
  data_path: "./data/raw/instruct/general_chat.jsonl"
  data_format: "jsonl"
  conversation_field: "conversation"
  max_length: 2048
  min_length: 10
  clean_text: true
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "conversation"
  output_dir: "./data/processed/instruct/general_chat"

model:
  name: "unsloth/Qwen2.5-7B-Instruct"
  max_length: 2048
  max_seq_length: 2048
  dtype: null
  load_in_4bit: true
  token: null
  training_model: "unsloth/Qwen2.5-7B-Instruct"
  training_max_seq_length: 2048
  training_dtype: null
  training_load_in_4bit: true

training:
  num_epochs: 1
  batch_size: 1
  learning_rate: 2e-4
  weight_decay: 0.01
  warmup_steps: 5
  max_steps: 50
  gradient_accumulation_steps: 4
  lr_scheduler_type: "linear"
  seed: 3407
  lora_r: 16
  lora_alpha: 16
  lora_dropout: 0
  target_modules: ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
  output_dir: "./outputs"
  model_output_dir: "./models/instruct/general_chat"

inference:
  batch_size: 1
  max_new_tokens: 256
  temperature: 0.8
  min_p: 0.1
  use_cache: true
"""


def _option_flag(name):
    """Turn a keyword/attribute name such as ``max_samples`` into ``--max-samples``."""
    return f"--{name.replace('_', '-')}"


def _run_pipeline(cmd, label="Running"):
    """Run the pipeline command, echo its captured output, and report success.

    Args:
        cmd: Full argument list handed to ``subprocess.run`` (no shell).
        label: Prefix used when echoing the command line.

    Returns:
        True when the child exits with status 0; False when it exits
        non-zero or cannot be started at all.
    """
    print(f"{label}: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running instruct data processor: {e}")
        # Output is captured, so anything the child wrote to stdout before
        # failing would otherwise be lost.
        if e.stdout:
            print(e.stdout)
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running instruct data processor: {e}")
        return False

    print("✅ Instruct data processing completed successfully!")
    print(result.stdout)
    return True


def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run instruct data processor with YAML configuration.

    Args:
        config_path: Path passed to the pipeline's ``--config`` option.
        **cli_overrides: Extra options. Each ``name=value`` pair becomes
            ``--name value`` (underscores turned into dashes); pairs whose
            value is None are skipped.

    Returns:
        True if the pipeline process succeeded, False otherwise.
    """
    print(f"=== Running Instruct Data Processor with YAML config: {config_path} ===")

    # sys.executable keeps the child in the same interpreter/virtualenv as
    # this script instead of whatever "python" resolves to on PATH.
    cmd = [sys.executable, PIPELINE_SCRIPT, "--config", str(config_path)]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None:
            cmd.extend([_option_flag(key), str(value)])

    return _run_pipeline(cmd, label="Running command")


def run_instruct_examples():
    """Run instruct examples with YAML configs.

    Returns:
        True if every example that was attempted succeeded, False otherwise.
        A skipped example (missing dataset) does not count as a failure.
    """
    outcomes = []

    # Example 1: Code reasoning instruction tuning
    print("=== Example 1: Code Reasoning Instruction Tuning ===")
    success = run_with_yaml_config(
        "configs/instruct/code_reasoning.yaml",
        max_samples=1000,  # Override YAML value
        output_format="conversation",
    )
    outcomes.append(success)
    if success:
        print("✅ Code reasoning instruction tuning data processing completed!")

    # Example 2: Custom conversation dataset (if available)
    print("\n=== Example 2: Custom Conversation Dataset ===")
    custom_dataset = "data/raw/instruct/custom_conversations.jsonl"
    if os.path.exists(custom_dataset):
        success = run_with_yaml_config(
            "configs/instruct/code_reasoning.yaml",  # Use code_reasoning config as base
            data_source="custom",
            data_path=custom_dataset,
            output_dir="./data/processed/instruct/custom",
        )
        outcomes.append(success)
        if success:
            print("✅ Custom conversation dataset processing completed!")
    else:
        # NOTE(review): create_sample_instruct_data() writes
        # data/raw/instruct/code_reasoning.jsonl, not custom_conversations.jsonl,
        # so following the hint below does not create the file checked above.
        # Confirm which path is intended.
        print("⚠️ Custom conversation dataset not found, skipping...")
        print(" You can create one with the 'create-sample-data' option")

    return all(outcomes)


def _conversation(question, answer):
    """Build one single-turn sample record in the ``conversation`` array format."""
    return {
        "conversation": [
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }


def _sample_conversations():
    """Return the built-in sample conversations written by create_sample_instruct_data()."""
    return [
        _conversation(
            "Can you explain what a Python list comprehension is?",
            "A Python list comprehension is a concise way to create lists. "
            "It allows you to generate a new list by applying an expression to each "
            "item in an existing iterable, optionally filtering items with a condition. "
            "The basic syntax is: [expression for item in iterable if condition]. "
            "For example: [x**2 for x in range(10) if x%2==0] creates a list of "
            "squares of even numbers from 0 to 8.",
        ),
        _conversation(
            "How do I handle exceptions in Python?",
            "In Python, you handle exceptions using try-except blocks. "
            "Here's the basic structure:\n\n"
            "try:\n"
            "    # Code that might raise an exception\n"
            "    risky_code()\n"
            "except SpecificException as e:\n"
            "    # Handle specific exception\n"
            "    print(f'Error: {e}')\n"
            "except Exception as e:\n"
            "    # Handle any other exception\n"
            "    print(f'Unexpected error: {e}')\n"
            "else:\n"
            "    # Runs if no exception occurred\n"
            "    print('Success!')\n"
            "finally:\n"
            "    # Always runs, regardless of exceptions\n"
            "    cleanup_code()\n\n"
            "You can catch multiple specific exceptions and handle them differently.",
        ),
        _conversation(
            "What's the difference between a function and a method in Python?",
            "In Python, the main differences between functions and methods are:\n\n"
            "1. **Definition location**: Functions are defined independently, "
            "while methods are defined inside classes.\n\n"
            "2. **Calling**: Functions are called directly by name, "
            "methods are called on objects using dot notation.\n\n"
            "3. **First parameter**: Methods automatically receive 'self' as the first "
            "parameter (the instance they're called on), functions don't have this "
            "automatic parameter.\n\n"
            "Example:\n"
            "```python\n"
            "# Function\n"
            "def my_function(x):\n"
            "    return x * 2\n\n"
            "# Method (inside a class)\n"
            "class MyClass:\n"
            "    def my_method(self, x):\n"
            "        return x * 2\n\n"
            "# Usage\n"
            "result1 = my_function(5) # Function call\n"
            "obj = MyClass()\n"
            "result2 = obj.my_method(5) # Method call\n"
            "```",
        ),
        _conversation(
            "How can I optimize a slow Python loop?",
            "Here are several strategies to optimize slow Python loops:\n\n"
            "1. **Use list comprehensions** instead of explicit loops when possible\n"
            "2. **Vectorize with NumPy** for numerical operations\n"
            "3. **Use built-in functions** like map(), filter(), sum()\n"
            "4. **Avoid repeated lookups** - store frequently accessed values in variables\n"
            "5. **Use sets for membership testing** instead of lists\n"
            "6. **Consider using itertools** for efficient iteration patterns\n"
            "7. **Profile your code** with cProfile to identify bottlenecks\n"
            "8. **Use numba or Cython** for compute-intensive loops\n\n"
            "Example optimization:\n"
            "```python\n"
            "# Slow\n"
            "result = []\n"
            "for i in range(1000000):\n"
            "    if i % 2 == 0:\n"
            "        result.append(i ** 2)\n\n"
            "# Faster\n"
            "result = [i ** 2 for i in range(1000000) if i % 2 == 0]\n\n"
            "# Even faster with NumPy\n"
            "import numpy as np\n"
            "arr = np.arange(1000000)\n"
            "result = (arr[arr % 2 == 0] ** 2).tolist()\n"
            "```",
        ),
        _conversation(
            "What are Python decorators and how do I use them?",
            "Python decorators are a powerful feature that allows you to modify or extend "
            "the behavior of functions or classes without permanently modifying their code. "
            "They use the @ symbol and are placed above the function definition.\n\n"
            "**Basic decorator example:**\n"
            "```python\n"
            "def my_decorator(func):\n"
            "    def wrapper(*args, **kwargs):\n"
            "        print('Before function call')\n"
            "        result = func(*args, **kwargs)\n"
            "        print('After function call')\n"
            "        return result\n"
            "    return wrapper\n\n"
            "@my_decorator\n"
            "def greet(name):\n"
            "    print(f'Hello, {name}!')\n\n"
            "greet('Alice') # Prints before, greeting, and after messages\n"
            "```\n\n"
            "**Common use cases:**\n"
            "- Timing function execution\n"
            "- Logging\n"
            "- Authentication/authorization\n"
            "- Caching results\n"
            "- Input validation\n\n"
            "**Built-in decorators:**\n"
            "- `@property` - creates getter/setter methods\n"
            "- `@staticmethod` - methods that don't need self or cls\n"
            "- `@classmethod` - methods that receive the class as first argument\n\n"
            "Decorators make code more modular and reusable by separating concerns.",
        ),
    ]


def create_sample_instruct_data():
    """Create sample conversation dataset for testing.

    Writes the built-in conversations as JSON Lines (one record per line) to
    ``data/raw/instruct/code_reasoning.jsonl`` under the current working
    directory, creating the directory if needed and overwriting any
    existing file.
    """
    sample_data = _sample_conversations()

    # Create directory structure
    data_dir = Path("data/raw/instruct")
    data_dir.mkdir(parents=True, exist_ok=True)

    # Save sample data
    sample_file = data_dir / "code_reasoning.jsonl"
    with open(sample_file, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in sample_data
        )

    print(f"✅ Created sample conversation dataset: {sample_file}")
    print(f" Contains {len(sample_data)} conversation examples")
    print(" Format: conversation array with role/content pairs")
    print(" Ready to use with configs/instruct/code_reasoning.yaml")


def create_custom_instruct_config():
    """Create a custom instruct configuration file.

    Writes the general-chat YAML template to
    ``configs/instruct/general_chat.yaml`` under the current working
    directory, creating the directory if needed and overwriting any
    existing file.
    """
    config_path = "configs/instruct/general_chat.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    with open(config_path, "w", encoding="utf-8") as f:
        f.write(_GENERAL_CHAT_CONFIG)

    print(f"✅ Created custom instruct config: {config_path}")
    print(" This config is set up for general chat instruction tuning")


def handle_direct_args(argv=None):
    """Handle direct command-line arguments by passing them to the instruct pipeline.

    Args:
        argv: Argument list to parse. None (the default) makes argparse read
            ``sys.argv[1:]``.

    Returns:
        True if the pipeline process succeeded, False otherwise.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Instruct Data Processor")

    # Add all the same arguments as the instruct pipeline
    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
    parser.add_argument("--data-path", type=str, help="Path to custom data file")
    parser.add_argument("--data-format", choices=["jsonl", "json"], help="Data format")
    parser.add_argument("--conversation-field", type=str, help="Conversation field name")
    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
    parser.add_argument("--train-split", type=float, help="Training split ratio")
    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
    parser.add_argument("--test-split", type=float, help="Test split ratio")
    parser.add_argument("--output-dir", type=str, help="Output directory")

    # Logging
    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                        default="INFO", help="Logging level")

    args = parser.parse_args(argv)

    # Build command to call the instruct pipeline
    cmd = [sys.executable, PIPELINE_SCRIPT]

    # Add all arguments that were provided. --log-level has a default, so it
    # is always forwarded.
    for arg_name, arg_value in vars(args).items():
        if arg_value is None:
            continue
        if isinstance(arg_value, bool):
            # Boolean options are bare flags: present only when True.
            if arg_value:
                cmd.append(_option_flag(arg_name))
        else:
            cmd.extend([_option_flag(arg_name), str(arg_value)])

    return _run_pipeline(cmd, label="Running")


def show_instruct_features():
    """Show the features of the instruct data processor."""
    lines = [
        "=== Instruct Data Processor Features ===",
        "",
        "1. **Instruction Fine-tuning Tasks**:",
        " - Code reasoning and explanation",
        " - General conversation and chat",
        " - Question answering",
        " - Task-specific instruction following",
        "",
        "2. **Conversation Data Formats Supported**:",
        " - HuggingFace conversation datasets",
        " - Custom JSONL/JSON files with conversation arrays",
        " - ShareGPT format with role/content structure",
        " - Automatic train/validation/test splits",
        "",
        "3. **Conversation Validation**:",
        " - Role validation (user/assistant/system)",
        " - Content length and quality checks",
        " - Conversation structure validation",
        " - Turn-level statistics and analysis",
        "",
        "4. **Advanced Features**:",
        " - Configurable conversation field mapping",
        " - Text preprocessing options",
        " - Automatic dataset saving/loading",
        " - YAML configuration support",
        " - Compatible with Unsloth chat templates",
        "",
        "=== Usage Examples ===",
        "",
        "1. Use YAML config only:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        "",
        "2. Override YAML values:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml --max-samples 500",
        "",
        "3. Create sample data:",
        " python scripts/instruct/data_processor.py create-sample-data",
        "",
        "4. Create custom config:",
        " python scripts/instruct/data_processor.py create-config",
    ]
    print("\n".join(lines))


def _print_usage():
    """Print the overview shown when the script is run without arguments."""
    lines = [
        "Instruct Data Processor",
        "======================",
        "",
        "This script runs the instruct data processor for instruction fine-tuning tasks.",
        "It supports both YAML configurations and command-line overrides.",
        "",
        "Usage:",
        " python scripts/instruct/data_processor.py examples # Run examples",
        " python scripts/instruct/data_processor.py create-sample-data # Create sample dataset",
        " python scripts/instruct/data_processor.py create-config # Create custom config",
        " python scripts/instruct/data_processor.py features # Show features",
        "",
        "Direct pipeline usage:",
        " python scripts/instruct/data_processor.py --config configs/instruct/code_reasoning.yaml",
        " python scripts/instruct/data_processor.py --data-source custom --data-path ./conversations.jsonl",
        "",
        "Key Features:",
        " ✅ Instruction fine-tuning with conversation data",
        " ✅ Multiple data source support",
        " ✅ YAML configuration files",
        " ✅ CLI argument overrides",
        " ✅ Conversation validation and analysis",
        " ✅ Compatible with Unsloth chat templates",
    ]
    print("\n".join(lines))


def main(argv=None):
    """Main function.

    Dispatches on the first argument: a known subcommand (``examples``,
    ``create-sample-data``, ``create-config``, ``features``) runs the matching
    helper; anything else is parsed as pipeline options and forwarded; no
    arguments prints the usage overview.

    Args:
        argv: Arguments excluding the program name. None (the default) uses
            ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 1 when a pipeline run failed.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    if not args:
        _print_usage()
        return 0

    command = args[0]
    if command == "examples":
        return 0 if run_instruct_examples() else 1
    if command == "create-sample-data":
        create_sample_instruct_data()
        return 0
    if command == "create-config":
        create_custom_instruct_config()
        return 0
    if command == "features":
        show_instruct_features()
        return 0

    # Handle direct arguments (pass through to pipeline)
    return 0 if handle_direct_args(args) else 1


if __name__ == "__main__":
    sys.exit(main())