pipelines/styling/.ipynb_checkpoints/inference-checkpoint.py

#!/usr/bin/env python3
"""
Styling Inference Pipeline using Trained Models
Supports style transfer inference with streaming and batch processing
"""

import os
import sys
import json
import argparse
from pathlib import Path
from typing import Dict, Any, Optional, List, Union
import yaml

# Add the project root to the path
sys.path.append(str(Path(__file__).parent.parent.parent))

# Inference imports
import torch
from datasets import load_from_disk, Dataset
from unsloth import FastLanguageModel
from transformers import TextStreamer

class StylingInference:
    """Styling task inference using trained models"""
    
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.model = None
        self.tokenizer = None
        
        # Set device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        
        # Model parameters
        self.model_output_dir = config.get('model_output_dir', './models/styling')
        self.max_seq_length = config.get('max_seq_length', 2048)
        self.dtype = config.get('dtype', None)
        self.load_in_4bit = config.get('load_in_4bit', True)
        self.hf_token = config.get('hf_token', None)
        
        # Inference parameters
        self.batch_size = config.get('batch_size', 1)
        self.max_new_tokens = config.get('max_new_tokens', 128)
        self.temperature = config.get('temperature', 0.8)
        self.top_p = config.get('top_p', 0.9)
        self.do_sample = config.get('do_sample', True)
        
        # Alpaca prompt template
        self.alpaca_prompt = config.get('alpaca_prompt', """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction

### Instruction:
{}

### Input:
{}

### Response:
{}""")
        
        # Style instruction
        self.style_instruction = config.get('style_instruction', 'Rewrite the following text in a formal style')
    
    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer"""
        print("Loading trained model and tokenizer...")
        
        try:
            # Load the saved LoRA model
            model_path = self.config.get('model_output_dir', './models/styling')
            print(f"Loading model from: {model_path}")
            
            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                model_name=model_path,
                max_seq_length=self.max_seq_length,
                dtype=self.dtype,
                load_in_4bit=self.load_in_4bit,
            )
            
            # Enable native 2x faster inference
            FastLanguageModel.for_inference(self.model)
            
            print(f"✅ Model loaded from: {model_path}")
            print(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")
            
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise
    
    def format_prompt(self, instruction: str, input_text: str = "") -> str:
        """Format prompt using the same alpaca format as training"""
        # Use the exact same alpaca prompt as training
        alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction

### Instruction:
{}

### Input:
{}

### Response:
{}"""
        
        # For inference, output is empty (will be generated)
        return alpaca_prompt.format(instruction, input_text, "")
    
    def generate_text(self, instruction: str, input_text: str = "", max_new_tokens: int = 128, stream: bool = False):
        """Generate text using the trained model"""
        try:
            # Format the prompt
            prompt = self.format_prompt(instruction, input_text)
            print(f"Formatted prompt: {prompt}")
            
            # Tokenize the input
            inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
            
            if stream:
                # Streaming generation
                from transformers import TextStreamer
                text_streamer = TextStreamer(self.tokenizer)
                print("Generating with streaming...")
                _ = self.model.generate(
                    **inputs, 
                    streamer=text_streamer, 
                    max_new_tokens=max_new_tokens
                )
                return None  # Streaming output is handled by streamer
            else:
                # Non-streaming generation
                print("Generating...")
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=0.7,
                    pad_token_id=self.tokenizer.eos_token_id
                )
                
                # Decode the generated text
                generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                
                # Extract only the generated response (remove the input prompt)
                response_start = generated_text.find("### Response:")
                if response_start != -1:
                    response = generated_text[response_start + len("### Response:"):].strip()
                else:
                    response = generated_text
                
                return response
                
        except Exception as e:
            print(f"❌ Error generating text: {e}")
            raise
    
    def style_transfer(self, input_text: str, instruction: Optional[str] = None, streaming: bool = False) -> str:
        """Perform style transfer on a single input text"""
        try:
            # Use default instruction if none provided
            if instruction is None:
                instruction = self.style_instruction
            
            print(f"Style transfer prompt: {instruction}")
            print(f"Input text: {input_text}")
            
            # Format prompt
            prompt = self.format_prompt(instruction, input_text)
            
            print(f"Style transfer prompt: {prompt}")
            
            if streaming:
                print("Generating with streaming...")
                self.generate_text_streaming(prompt)
                return ""
            else:
                print("Generating text...")
                result = self.generate_text(instruction, input_text, self.max_new_tokens)
                print(f"Generated result: {result}")
                return result
                
        except Exception as e:
            print(f"❌ Error in style transfer: {e}")
            raise
    
    def generate_text_streaming(self, prompt: str, max_new_tokens: Optional[int] = None):
        """Generate text with streaming output"""
        try:
            # Tokenize input
            inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
            
            # Setup text streamer
            text_streamer = TextStreamer(self.tokenizer)
            
            # Set generation parameters
            gen_kwargs = {
                "max_new_tokens": max_new_tokens or self.max_new_tokens,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "do_sample": self.do_sample,
                "use_cache": True,
                "pad_token_id": self.tokenizer.eos_token_id
            }
            
            # Generate with streaming
            with torch.no_grad():
                _ = self.model.generate(**inputs, streamer=text_streamer, **gen_kwargs)
                
        except Exception as e:
            print(f"❌ Error in streaming generation: {e}")
    
    def batch_style_transfer(self, input_texts: List[str], instruction: Optional[str] = None) -> List[str]:
        """Perform style transfer on multiple input texts"""
        results = []
        
        for i, input_text in enumerate(input_texts):
            print(f"Processing text {i+1}/{len(input_texts)}")
            result = self.style_transfer(input_text, instruction)
            results.append(result)
        
        return results

def load_inference_config(config_path: str) -> Dict[str, Any]:
    """Load inference configuration from YAML file"""
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        
        # Extract inference configuration
        inference_config = {}
        
        # Model configuration
        if 'model' in config:
            model_data = config['model']
            inference_config.update({
                'base_model_name': model_data.get('training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),
                'max_seq_length': model_data.get('training_max_seq_length', 2048),
                'dtype': model_data.get('training_dtype'),
                'load_in_4bit': model_data.get('training_load_in_4bit', True),
                'hf_token': model_data.get('training_token')
            })
        
        # Training configuration - to get model_output_dir
        if 'training' in config:
            training_data = config['training']
            inference_config.update({
                'model_output_dir': training_data.get('model_output_dir', './models/styling')
            })
        
        # Inference configuration
        if 'inference' in config:
            inference_data = config['inference']
            inference_config.update({
                'batch_size': inference_data.get('batch_size', 1),
                'max_new_tokens': inference_data.get('max_new_tokens', 128),
                'temperature': inference_data.get('temperature', 0.8)
            })
        
        # Style configuration
        if 'data' in config:
            data_config = config['data']
            inference_config.update({
                'style_instruction': data_config.get('instruction', 'Rewrite the following text in a formal style')
            })
        
        return inference_config
        
    except Exception as e:
        print(f"Error loading inference config: {e}")
        raise

def main():
    """Main inference function"""
    parser = argparse.ArgumentParser(description="Styling Inference Pipeline")
    
    # Configuration
    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    parser.add_argument("--instruction", type=str, required=True, help="Style instruction")
    parser.add_argument("--input-text", type=str, default="", help="Input text to style")
    parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")
    parser.add_argument("--stream", action="store_true", help="Enable streaming generation")
    
    args = parser.parse_args()
    
    try:
        # Load configuration
        print(f"Loading configuration from: {args.config}")
        inference_config = load_inference_config(args.config)
        
        # Override with CLI arguments
        if args.max_tokens:
            inference_config['max_new_tokens'] = args.max_tokens
        
        print("Inference configuration:")
        for key, value in inference_config.items():
            print(f"  {key}: {value}")
        
        # Initialize inference
        inference = StylingInference(inference_config)
        
        # Load model and tokenizer
        inference.load_model_and_tokenizer()

        # Run inference
        if args.stream:
            print("Running streaming inference...")
            inference.generate_text(args.instruction, args.input_text, args.max_tokens, stream=True)
        else:
            print("Running inference...")
            result = inference.generate_text(args.instruction, args.input_text, args.max_tokens)
            print(f"✅ Generated text: {result}")
        
    except Exception as e:
        print(f"Inference failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()
updated style mimciking fine tuning 2025-08-13 23:50:20 +00:00			`#!/usr/bin/env python3`
			`"""`
			`Styling Inference Pipeline using Trained Models`
			`Supports style transfer inference with streaming and batch processing`
			`"""`

			`import os`
			`import sys`
			`import json`
			`import argparse`
			`from pathlib import Path`
			`from typing import Dict, Any, Optional, List, Union`
			`import yaml`

			`# Add the project root to the path`
			`sys.path.append(str(Path(__file__).parent.parent.parent))`

			`# Inference imports`
			`import torch`
			`from datasets import load_from_disk, Dataset`
			`from unsloth import FastLanguageModel`
			`from transformers import TextStreamer`

			`class StylingInference:`
			`"""Styling task inference using trained models"""`

			`def __init__(self, config: Dict[str, Any]):`
			`self.config = config`
			`self.model = None`
			`self.tokenizer = None`

			`# Set device`
			`self.device = "cuda" if torch.cuda.is_available() else "cpu"`
			`print(f"Using device: {self.device}")`

			`# Model parameters`
			`self.model_output_dir = config.get('model_output_dir', './models/styling')`
			`self.max_seq_length = config.get('max_seq_length', 2048)`
			`self.dtype = config.get('dtype', None)`
			`self.load_in_4bit = config.get('load_in_4bit', True)`
			`self.hf_token = config.get('hf_token', None)`

			`# Inference parameters`
			`self.batch_size = config.get('batch_size', 1)`
			`self.max_new_tokens = config.get('max_new_tokens', 128)`
			`self.temperature = config.get('temperature', 0.8)`
			`self.top_p = config.get('top_p', 0.9)`
			`self.do_sample = config.get('do_sample', True)`

			`# Alpaca prompt template`
			`self.alpaca_prompt = config.get('alpaca_prompt', """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction`

			`### Instruction:`
			`{}`

			`### Input:`
			`{}`

			`### Response:`
			`{}""")`

			`# Style instruction`
			`self.style_instruction = config.get('style_instruction', 'Rewrite the following text in a formal style')`

			`def load_model_and_tokenizer(self):`
			`"""Load the trained model and tokenizer"""`
			`print("Loading trained model and tokenizer...")`

			`try:`
			`# Load the saved LoRA model`
			`model_path = self.config.get('model_output_dir', './models/styling')`
			`print(f"Loading model from: {model_path}")`

			`self.model, self.tokenizer = FastLanguageModel.from_pretrained(`
			`model_name=model_path,`
			`max_seq_length=self.max_seq_length,`
			`dtype=self.dtype,`
			`load_in_4bit=self.load_in_4bit,`
			`)`

			`# Enable native 2x faster inference`
			`FastLanguageModel.for_inference(self.model)`

			`print(f"✅ Model loaded from: {model_path}")`
			`print(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")`

			`except Exception as e:`
			`print(f"❌ Error loading model: {e}")`
			`raise`

			`def format_prompt(self, instruction: str, input_text: str = "") -> str:`
			`"""Format prompt using the same alpaca format as training"""`
			`# Use the exact same alpaca prompt as training`
			`alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction`

			`### Instruction:`
			`{}`

			`### Input:`
			`{}`

			`### Response:`
			`{}"""`

			`# For inference, output is empty (will be generated)`
			`return alpaca_prompt.format(instruction, input_text, "")`

			`def generate_text(self, instruction: str, input_text: str = "", max_new_tokens: int = 128, stream: bool = False):`
			`"""Generate text using the trained model"""`
			`try:`
			`# Format the prompt`
			`prompt = self.format_prompt(instruction, input_text)`
			`print(f"Formatted prompt: {prompt}")`

			`# Tokenize the input`
			`inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)`

			`if stream:`
			`# Streaming generation`
			`from transformers import TextStreamer`
			`text_streamer = TextStreamer(self.tokenizer)`
			`print("Generating with streaming...")`
			`_ = self.model.generate(`
			`**inputs,`
			`streamer=text_streamer,`
			`max_new_tokens=max_new_tokens`
			`)`
			`return None # Streaming output is handled by streamer`
			`else:`
			`# Non-streaming generation`
			`print("Generating...")`
			`outputs = self.model.generate(`
			`**inputs,`
			`max_new_tokens=max_new_tokens,`
			`do_sample=True,`
			`temperature=0.7,`
			`pad_token_id=self.tokenizer.eos_token_id`
			`)`

			`# Decode the generated text`
			`generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)`

			`# Extract only the generated response (remove the input prompt)`
			`response_start = generated_text.find("### Response:")`
			`if response_start != -1:`
			`response = generated_text[response_start + len("### Response:"):].strip()`
			`else:`
			`response = generated_text`

			`return response`

			`except Exception as e:`
			`print(f"❌ Error generating text: {e}")`
			`raise`

			`def style_transfer(self, input_text: str, instruction: Optional[str] = None, streaming: bool = False) -> str:`
			`"""Perform style transfer on a single input text"""`
			`try:`
			`# Use default instruction if none provided`
			`if instruction is None:`
			`instruction = self.style_instruction`

			`print(f"Style transfer prompt: {instruction}")`
			`print(f"Input text: {input_text}")`

			`# Format prompt`
			`prompt = self.format_prompt(instruction, input_text)`

			`print(f"Style transfer prompt: {prompt}")`

			`if streaming:`
			`print("Generating with streaming...")`
			`self.generate_text_streaming(prompt)`
			`return ""`
			`else:`
			`print("Generating text...")`
			`result = self.generate_text(instruction, input_text, self.max_new_tokens)`
			`print(f"Generated result: {result}")`
			`return result`

			`except Exception as e:`
			`print(f"❌ Error in style transfer: {e}")`
			`raise`

			`def generate_text_streaming(self, prompt: str, max_new_tokens: Optional[int] = None):`
			`"""Generate text with streaming output"""`
			`try:`
			`# Tokenize input`
			`inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)`

			`# Setup text streamer`
			`text_streamer = TextStreamer(self.tokenizer)`

			`# Set generation parameters`
			`gen_kwargs = {`
			`"max_new_tokens": max_new_tokens or self.max_new_tokens,`
			`"temperature": self.temperature,`
			`"top_p": self.top_p,`
			`"do_sample": self.do_sample,`
			`"use_cache": True,`
			`"pad_token_id": self.tokenizer.eos_token_id`
			`}`

			`# Generate with streaming`
			`with torch.no_grad():`
			`_ = self.model.generate(inputs, streamer=text_streamer, gen_kwargs)`

			`except Exception as e:`
			`print(f"❌ Error in streaming generation: {e}")`

			`def batch_style_transfer(self, input_texts: List[str], instruction: Optional[str] = None) -> List[str]:`
			`"""Perform style transfer on multiple input texts"""`
			`results = []`

			`for i, input_text in enumerate(input_texts):`
			`print(f"Processing text {i+1}/{len(input_texts)}")`
			`result = self.style_transfer(input_text, instruction)`
			`results.append(result)`

			`return results`

			`def load_inference_config(config_path: str) -> Dict[str, Any]:`
			`"""Load inference configuration from YAML file"""`
			`try:`
			`with open(config_path, 'r', encoding='utf-8') as f:`
			`config = yaml.safe_load(f)`

			`# Extract inference configuration`
			`inference_config = {}`

			`# Model configuration`
			`if 'model' in config:`
			`model_data = config['model']`
			`inference_config.update({`
			`'base_model_name': model_data.get('training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),`
			`'max_seq_length': model_data.get('training_max_seq_length', 2048),`
			`'dtype': model_data.get('training_dtype'),`
			`'load_in_4bit': model_data.get('training_load_in_4bit', True),`
			`'hf_token': model_data.get('training_token')`
			`})`

			`# Training configuration - to get model_output_dir`
			`if 'training' in config:`
			`training_data = config['training']`
			`inference_config.update({`
			`'model_output_dir': training_data.get('model_output_dir', './models/styling')`
			`})`

			`# Inference configuration`
			`if 'inference' in config:`
			`inference_data = config['inference']`
			`inference_config.update({`
			`'batch_size': inference_data.get('batch_size', 1),`
			`'max_new_tokens': inference_data.get('max_new_tokens', 128),`
			`'temperature': inference_data.get('temperature', 0.8)`
			`})`

			`# Style configuration`
			`if 'data' in config:`
			`data_config = config['data']`
			`inference_config.update({`
			`'style_instruction': data_config.get('instruction', 'Rewrite the following text in a formal style')`
			`})`

			`return inference_config`

			`except Exception as e:`
			`print(f"Error loading inference config: {e}")`
			`raise`

			`def main():`
			`"""Main inference function"""`
			`parser = argparse.ArgumentParser(description="Styling Inference Pipeline")`

			`# Configuration`
			`parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")`
			`parser.add_argument("--instruction", type=str, required=True, help="Style instruction")`
			`parser.add_argument("--input-text", type=str, default="", help="Input text to style")`
			`parser.add_argument("--max-tokens", type=int, default=128, help="Maximum new tokens to generate")`
			`parser.add_argument("--stream", action="store_true", help="Enable streaming generation")`

			`args = parser.parse_args()`

			`try:`
			`# Load configuration`
			`print(f"Loading configuration from: {args.config}")`
			`inference_config = load_inference_config(args.config)`

			`# Override with CLI arguments`
			`if args.max_tokens:`
			`inference_config['max_new_tokens'] = args.max_tokens`

			`print("Inference configuration:")`
			`for key, value in inference_config.items():`
			`print(f" {key}: {value}")`

			`# Initialize inference`
			`inference = StylingInference(inference_config)`

			`# Load model and tokenizer`
			`inference.load_model_and_tokenizer()`

			`# Run inference`
			`if args.stream:`
			`print("Running streaming inference...")`
			`inference.generate_text(args.instruction, args.input_text, args.max_tokens, stream=True)`
			`else:`
			`print("Running inference...")`
			`result = inference.generate_text(args.instruction, args.input_text, args.max_tokens)`
			`print(f"✅ Generated text: {result}")`

			`except Exception as e:`
			`print(f"Inference failed: {e}")`
			`import traceback`
			`traceback.print_exc()`
			`sys.exit(1)`

			`if __name__ == "__main__":`
			`main()`