added style mimicking piepelines
This commit is contained in:
@@ -0,0 +1,191 @@
|
|||||||
|
# Quick Reference Card
|
||||||
|
|
||||||
|
## Essential Parameters (Most Common)
|
||||||
|
|
||||||
|
### Data Source & Location
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
source: "huggingface|custom" # REQUIRED: Data source type
|
||||||
|
dataset_name: "dataset/name" # REQUIRED for huggingface
|
||||||
|
data_path: "./path/to/file" # REQUIRED for custom
|
||||||
|
data_format: "jsonl|csv|json" # REQUIRED for custom
|
||||||
|
```
|
||||||
|
|
||||||
|
### Field Mapping
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
input_field: "text" # REQUIRED: Input text field
|
||||||
|
label_field: "label" # REQUIRED for classification
|
||||||
|
output_field: "styled_text" # REQUIRED for styling
|
||||||
|
instruction: "Style instruction" # REQUIRED for styling
|
||||||
|
```
|
||||||
|
|
||||||
|
### Basic Processing
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
max_samples: 1000 # Limit total samples
|
||||||
|
train_split: 0.8 # Training ratio (0.0-1.0)
|
||||||
|
validation_split: 0.1 # Validation ratio (0.0-1.0)
|
||||||
|
test_split: 0.1 # Test ratio (0.0-1.0)
|
||||||
|
output_dir: "./output/path" # Output directory
|
||||||
|
```
|
||||||
|
|
||||||
|
### Text Preprocessing
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
clean_text: true # Clean/normalize text
|
||||||
|
lowercase: true # Convert to lowercase
|
||||||
|
min_length: 10 # Minimum text length
|
||||||
|
max_length: 512 # Maximum text length
|
||||||
|
```
|
||||||
|
|
||||||
|
### Model & Training
|
||||||
|
```yaml
|
||||||
|
model:
|
||||||
|
name: "bert-base-uncased" # Model name
|
||||||
|
max_length: 512 # Max sequence length
|
||||||
|
|
||||||
|
training:
|
||||||
|
num_epochs: 3 # Training epochs
|
||||||
|
batch_size: 16 # Batch size
|
||||||
|
learning_rate: 2e-5 # Learning rate
|
||||||
|
```
|
||||||
|
|
||||||
|
## Common Configurations by Task
|
||||||
|
|
||||||
|
### Classification
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "classification"
|
||||||
|
type: "sequence_classification"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "huggingface"
|
||||||
|
dataset_name: "dair-ai/emotion"
|
||||||
|
input_field: "text"
|
||||||
|
label_field: "label"
|
||||||
|
output_format: "classification"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Styling
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "styling"
|
||||||
|
type: "style_transfer"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "custom"
|
||||||
|
data_path: "./data.jsonl"
|
||||||
|
input_field: "text"
|
||||||
|
output_field: "styled_text"
|
||||||
|
instruction: "Rewrite in formal style"
|
||||||
|
output_format: "alpaca"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Text Generation
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "completion"
|
||||||
|
type: "text_generation"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "custom"
|
||||||
|
data_path: "./prompts.jsonl"
|
||||||
|
input_field: "prompt"
|
||||||
|
output_field: "completion"
|
||||||
|
output_format: "instruction"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Quick Start Templates
|
||||||
|
|
||||||
|
### 1. HuggingFace Dataset
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "classification"
|
||||||
|
type: "sequence_classification"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "huggingface"
|
||||||
|
dataset_name: "your/dataset"
|
||||||
|
input_field: "text"
|
||||||
|
label_field: "label"
|
||||||
|
max_samples: 1000
|
||||||
|
output_dir: "./output"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Custom JSONL File
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "styling"
|
||||||
|
type: "style_transfer"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "custom"
|
||||||
|
data_path: "./your_data.jsonl"
|
||||||
|
data_format: "jsonl"
|
||||||
|
input_field: "source"
|
||||||
|
output_field: "target"
|
||||||
|
instruction: "Your style instruction"
|
||||||
|
output_dir: "./output"
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. CSV File
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "classification"
|
||||||
|
type: "sequence_classification"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "custom"
|
||||||
|
data_path: "./your_data.csv"
|
||||||
|
data_format: "csv"
|
||||||
|
input_field: "text"
|
||||||
|
label_field: "label"
|
||||||
|
delimiter: ","
|
||||||
|
output_dir: "./output"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Parameter Ranges & Recommendations
|
||||||
|
|
||||||
|
### Split Ratios
|
||||||
|
- **Total must be ≤ 1.0**
|
||||||
|
- **Common**: train=0.8, val=0.1, test=0.1
|
||||||
|
- **Small datasets**: train=0.7, val=0.15, test=0.15
|
||||||
|
|
||||||
|
### Learning Rates
|
||||||
|
- **Fine-tuning**: 1e-5 to 5e-5
|
||||||
|
- **Training from scratch**: 1e-4 to 1e-3
|
||||||
|
- **Start with**: 2e-5
|
||||||
|
|
||||||
|
### Batch Sizes
|
||||||
|
- **GPU Memory**: 8, 16, 32, 64
|
||||||
|
- **CPU**: 4, 8, 16
|
||||||
|
- **Start with**: 16
|
||||||
|
|
||||||
|
### Text Lengths
|
||||||
|
- **BERT**: 512 (max)
|
||||||
|
- **GPT-2**: 1024 (max)
|
||||||
|
- **T5**: 512 (max)
|
||||||
|
- **Start with**: 256
|
||||||
|
|
||||||
|
## Common Issues & Fixes
|
||||||
|
|
||||||
|
| Issue | Cause | Fix |
|
||||||
|
|-------|-------|-----|
|
||||||
|
| "File not found" | Wrong path | Check `data_path` and `output_dir` |
|
||||||
|
| "Memory error" | Batch too large | Reduce `batch_size` |
|
||||||
|
| "Split error" | Ratios > 1.0 | Ensure splits sum to ≤ 1.0 |
|
||||||
|
| "Poor performance" | Wrong learning rate | Try 1e-5 to 5e-5 range |
|
||||||
|
| "Slow processing" | Text too long | Reduce `max_length` |
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
```bash
|
||||||
|
# Set cache directory
|
||||||
|
export HF_HOME="./cache"
|
||||||
|
|
||||||
|
# Set output directory
|
||||||
|
export OUTPUT_DIR="./results"
|
||||||
|
|
||||||
|
# Set log level
|
||||||
|
export LOG_LEVEL="INFO"
|
||||||
|
```
|
||||||
@@ -0,0 +1,207 @@
|
|||||||
|
# Configuration Files Documentation
|
||||||
|
|
||||||
|
This directory contains YAML configuration files for different machine learning tasks. Each configuration file is organized into logical sections and includes comprehensive documentation for all parameters.
|
||||||
|
|
||||||
|
## Configuration Structure
|
||||||
|
|
||||||
|
All configuration files follow a consistent structure organized into these main sections:
|
||||||
|
|
||||||
|
### 1. Task Configuration
|
||||||
|
```yaml
|
||||||
|
task:
|
||||||
|
name: "task_type" # Task type: classification, completion, styling, matching
|
||||||
|
type: "specific_type" # Specific model/task type
|
||||||
|
```
|
||||||
|
|
||||||
|
**Available Task Types:**
|
||||||
|
- **classification**: Text classification tasks (emotion, sentiment, topic, etc.)
|
||||||
|
- **completion**: Text generation and completion tasks
|
||||||
|
- **styling**: Style transfer and text transformation tasks
|
||||||
|
- **matching**: Semantic matching and similarity tasks
|
||||||
|
|
||||||
|
### 2. Data Processing Configuration
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
# Data Source
|
||||||
|
source: "huggingface|custom" # Where to get data from
|
||||||
|
|
||||||
|
# Data Location
|
||||||
|
dataset_name: "dataset/name" # HuggingFace dataset name (for huggingface source)
|
||||||
|
data_path: "./path/to/file" # Path to custom data file (for custom source)
|
||||||
|
data_format: "jsonl|csv|json" # File format for custom data
|
||||||
|
|
||||||
|
# Field Mapping
|
||||||
|
input_field: "text" # Field containing input text
|
||||||
|
output_field: "styled_text" # Field containing output (for styling)
|
||||||
|
label_field: "label" # Field containing labels (for classification)
|
||||||
|
id_field: "id" # Optional ID field for tracking
|
||||||
|
|
||||||
|
# Processing Parameters
|
||||||
|
max_samples: 1000 # Maximum samples to process
|
||||||
|
train_split: 0.8 # Training split ratio
|
||||||
|
validation_split: 0.1 # Validation split ratio
|
||||||
|
test_split: 0.1 # Test split ratio
|
||||||
|
|
||||||
|
# Text Preprocessing
|
||||||
|
clean_text: true # Clean and normalize text
|
||||||
|
remove_special_chars: false # Remove special characters
|
||||||
|
lowercase: true # Convert to lowercase
|
||||||
|
min_length: 10 # Minimum text length
|
||||||
|
max_length: 1000 # Maximum text length
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_format: "format_type" # Output format
|
||||||
|
output_dir: "./output/path" # Output directory
|
||||||
|
```
|
||||||
|
|
||||||
|
**Data Source Types:**
|
||||||
|
- **huggingface**: Use datasets from HuggingFace Hub
|
||||||
|
- **custom**: Use local files (JSONL, CSV, JSON)
|
||||||
|
|
||||||
|
**Output Formats:**
|
||||||
|
- **classification**: Raw classification format
|
||||||
|
- **instruction**: Instruction-following format
|
||||||
|
- **conversation**: Conversational format
|
||||||
|
- **qa**: Question-answer format
|
||||||
|
- **styling**: Raw styling format
|
||||||
|
- **alpaca**: Alpaca instruction format
|
||||||
|
|
||||||
|
### 3. Model Configuration
|
||||||
|
```yaml
|
||||||
|
model:
|
||||||
|
name: "model_name" # Model from HuggingFace Hub
|
||||||
|
max_length: 512 # Maximum sequence length
|
||||||
|
num_labels: 6 # Number of labels (for classification)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Recommended Models by Task:**
|
||||||
|
- **Classification**: `bert-base-uncased`, `distilbert-base-uncased`
|
||||||
|
- **Styling**: `t5-base`, `gpt2-medium`
|
||||||
|
- **Completion**: `gpt2-medium`, `gpt2-large`
|
||||||
|
- **Matching**: `sentence-transformers/all-MiniLM-L6-v2`
|
||||||
|
|
||||||
|
### 4. Training Configuration
|
||||||
|
```yaml
|
||||||
|
training:
|
||||||
|
num_epochs: 3 # Number of training epochs
|
||||||
|
batch_size: 16 # Training batch size
|
||||||
|
learning_rate: 2e-5 # Learning rate
|
||||||
|
weight_decay: 0.01 # Weight decay
|
||||||
|
lr_scheduler_type: "linear" # Learning rate scheduler
|
||||||
|
warmup_ratio: 0.1 # Warmup ratio
|
||||||
|
data_dir: "./data/path" # Training data directory
|
||||||
|
output_dir: "./model/output" # Model output directory
|
||||||
|
```
|
||||||
|
|
||||||
|
**Learning Rate Guidelines:**
|
||||||
|
- **Fine-tuning**: 1e-5 to 5e-5
|
||||||
|
- **Training from scratch**: 1e-4 to 1e-3
|
||||||
|
|
||||||
|
**Scheduler Types:**
|
||||||
|
- **linear**: Linear decay
|
||||||
|
- **cosine**: Cosine annealing
|
||||||
|
- **polynomial**: Polynomial decay
|
||||||
|
|
||||||
|
### 5. Inference Configuration
|
||||||
|
```yaml
|
||||||
|
inference:
|
||||||
|
model_path: "./model/path" # Path to saved model
|
||||||
|
device: "auto" # Device to use
|
||||||
|
batch_size: 32 # Inference batch size
|
||||||
|
return_probabilities: true # Return probabilities
|
||||||
|
return_top_k: 3 # Return top K predictions
|
||||||
|
max_new_tokens: 128 # Max tokens to generate
|
||||||
|
temperature: 0.8 # Sampling temperature
|
||||||
|
```
|
||||||
|
|
||||||
|
**Device Options:**
|
||||||
|
- **auto**: Automatically detect best device
|
||||||
|
- **cuda**: Use GPU if available
|
||||||
|
- **cpu**: Force CPU usage
|
||||||
|
|
||||||
|
**Temperature Guidelines:**
|
||||||
|
- **0.0**: Deterministic (always same output)
|
||||||
|
- **0.7-0.9**: Balanced creativity
|
||||||
|
- **1.0+**: More random/creative
|
||||||
|
|
||||||
|
## Task-Specific Parameters
|
||||||
|
|
||||||
|
### Classification Tasks
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
label_encoding: "auto|numeric|string" # How to encode labels
|
||||||
|
multilabel: false # Multi-label vs single-label
|
||||||
|
label_separator: "," # Separator for multi-label
|
||||||
|
```
|
||||||
|
|
||||||
|
### Styling Tasks
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
instruction: "Style instruction text" # The style instruction
|
||||||
|
```
|
||||||
|
|
||||||
|
### Completion Tasks
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
prompt_template: "template" # Prompt template
|
||||||
|
completion_length: 100 # Target completion length
|
||||||
|
```
|
||||||
|
|
||||||
|
## Advanced Configuration
|
||||||
|
|
||||||
|
### HuggingFace Specific
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
hf_split: "train" # Dataset split to use
|
||||||
|
hf_cache_dir: "./cache" # Cache directory
|
||||||
|
test_split_from: "train" # Source for test split
|
||||||
|
val_split_from: "train" # Source for validation split
|
||||||
|
```
|
||||||
|
|
||||||
|
### Custom Data Specific
|
||||||
|
```yaml
|
||||||
|
data:
|
||||||
|
encoding: "utf-8" # File encoding
|
||||||
|
delimiter: "," # CSV delimiter
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage Examples
|
||||||
|
|
||||||
|
### Basic Usage
|
||||||
|
```bash
|
||||||
|
# Use YAML configuration
|
||||||
|
python scripts/task_type/data_processor.py --config configs/task_type/config.yaml
|
||||||
|
|
||||||
|
# Override specific parameters
|
||||||
|
python scripts/task_type/data_processor.py \
|
||||||
|
--config configs/task_type/config.yaml \
|
||||||
|
--max-samples 1000 \
|
||||||
|
--learning-rate 3e-5
|
||||||
|
```
|
||||||
|
|
||||||
|
### Creating Custom Configurations
|
||||||
|
1. Copy an existing config file
|
||||||
|
2. Modify parameters for your specific use case
|
||||||
|
3. Update paths and model names
|
||||||
|
4. Test with a small dataset first
|
||||||
|
|
||||||
|
## Best Practices
|
||||||
|
|
||||||
|
1. **Start with Defaults**: Use default values and adjust based on results
|
||||||
|
2. **Validate Paths**: Ensure all file paths are correct and accessible
|
||||||
|
3. **Monitor Resources**: Adjust batch sizes based on available GPU memory
|
||||||
|
4. **Test Incrementally**: Test with small datasets before full processing
|
||||||
|
5. **Version Control**: Keep configurations in version control for reproducibility
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### Common Issues:
|
||||||
|
- **File Not Found**: Check `data_path` and `output_dir` paths
|
||||||
|
- **Memory Errors**: Reduce `batch_size` or `max_length`
|
||||||
|
- **Poor Performance**: Adjust `learning_rate` or `num_epochs`
|
||||||
|
- **Split Errors**: Ensure split ratios sum to ≤ 1.0
|
||||||
|
|
||||||
|
### Getting Help:
|
||||||
|
- Check the script help: `python script.py --help`
|
||||||
|
- Review the pipeline logs for detailed error messages
|
||||||
|
- Verify YAML syntax and parameter values
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
# Comprehensive Classification Configuration
|
# Comprehensive Classification Configuration
|
||||||
# This file defines all parameters for emotion classification using the dair-ai/emotion dataset
|
# This file defines all parameters for emotion classification using the dair-ai/emotion dataset
|
||||||
# Organized by level: data processing, model, training, and inference
|
# Organized by level: task, data processing, model, training, and inference
|
||||||
|
|
||||||
# Task Configuration
|
# Task Configuration
|
||||||
task:
|
task:
|
||||||
@@ -15,9 +15,9 @@ data:
|
|||||||
data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data)
|
data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data)
|
||||||
|
|
||||||
# Field Mapping
|
# Field Mapping
|
||||||
input_field: "text" # Field name containing input text
|
input_field: "text" # Field name containing input text to be classified
|
||||||
label_field: "label" # Field name containing labels
|
label_field: "label" # Field name containing classification labels
|
||||||
id_field: null # Optional ID field name
|
id_field: null # Optional ID field name for tracking individual samples
|
||||||
|
|
||||||
# Processing Parameters
|
# Processing Parameters
|
||||||
max_samples: 1000 # Maximum samples to process (null for all samples)
|
max_samples: 1000 # Maximum samples to process (null for all samples)
|
||||||
@@ -26,54 +26,54 @@ data:
|
|||||||
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||||
|
|
||||||
# Text Preprocessing
|
# Text Preprocessing
|
||||||
clean_text: true # Clean and normalize text
|
clean_text: true # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
|
||||||
remove_special_chars: false # Remove special characters from text
|
remove_special_chars: false # Remove special characters from text (keep for emotion analysis)
|
||||||
lowercase: true # Convert text to lowercase
|
lowercase: true # Convert text to lowercase (standard for BERT models)
|
||||||
min_length: 10 # Minimum text length (filter out shorter texts)
|
min_length: 10 # Minimum text length (filter out shorter texts)
|
||||||
max_length: 1000 # Maximum text length (truncate longer texts)
|
max_length: 1000 # Maximum text length (truncate longer texts)
|
||||||
|
|
||||||
# Label Processing
|
# Label Processing
|
||||||
label_encoding: "auto" # Label encoding: "auto", "numeric", "string"
|
label_encoding: "auto" # Label encoding: "auto", "numeric", "string"
|
||||||
multilabel: false # Enable multilabel classification
|
multilabel: false # Enable multilabel classification (false for single emotion per text)
|
||||||
label_separator: "," # Separator for multilabel datasets
|
label_separator: "," # Separator for multilabel datasets (comma-separated labels)
|
||||||
|
|
||||||
# Output Configuration
|
# Output Configuration
|
||||||
output_format: "classification" # Output format: "classification", "instruction", "conversation", "qa"
|
output_format: "classification" # Output format: "classification", "instruction", "conversation", "qa"
|
||||||
output_dir: "./data/processed/classification/emotion" # Specific output directory for this dataset
|
output_dir: "./data/processed/classification/emotion" # Output directory for processed data and splits
|
||||||
|
|
||||||
# HuggingFace Specific
|
# HuggingFace Specific
|
||||||
hf_split: "train" # HuggingFace dataset split to use
|
hf_split: "train" # HuggingFace dataset split to use as base
|
||||||
hf_cache_dir: null # HuggingFace cache directory (null for default)
|
hf_cache_dir: null # HuggingFace cache directory (null for default ~/.cache/huggingface)
|
||||||
|
|
||||||
# Split Configuration (Advanced)
|
# Split Configuration (Advanced)
|
||||||
test_split_from: "train" # Source for test split: "train", "use_test_if_available", "use_val_if_available"
|
test_split_from: "train" # Source for test split: "train", "use_test_if_available", "use_val_if_available"
|
||||||
val_split_from: "train" # Source for validation split: "train", "use_val_if_available"
|
val_split_from: "train" # Source for validation split: "train", "use_val_if_available"
|
||||||
|
|
||||||
# Custom Data Specific
|
# Custom Data Specific
|
||||||
encoding: "utf-8" # File encoding for custom data
|
encoding: "utf-8" # File encoding for custom data files
|
||||||
delimiter: "," # Delimiter for CSV files
|
delimiter: "," # Delimiter for CSV files (comma for standard CSV)
|
||||||
|
|
||||||
# Model Configuration
|
# Model Configuration
|
||||||
model:
|
model:
|
||||||
name: "bert-base-uncased" # Model name from HuggingFace Hub
|
name: "bert-base-uncased" # Model name from HuggingFace Hub (good for text classification)
|
||||||
max_length: 512 # Maximum sequence length for tokenization
|
max_length: 512 # Maximum sequence length for tokenization (BERT limit)
|
||||||
num_labels: 6 # Number of classification labels
|
num_labels: 6 # Number of classification labels (emotion categories)
|
||||||
|
|
||||||
# Training Configuration
|
# Training Configuration
|
||||||
training:
|
training:
|
||||||
num_epochs: 3 # Number of training epochs
|
num_epochs: 3 # Number of training epochs (adjust based on dataset size)
|
||||||
batch_size: 16 # Training batch size
|
batch_size: 16 # Training batch size (adjust based on GPU memory)
|
||||||
learning_rate: 2e-5 # Learning rate (typical range: 1e-5 to 5e-5)
|
learning_rate: 2e-5 # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)
|
||||||
weight_decay: 0.01 # Weight decay for optimizer (typical range: 0.01 to 0.1)
|
weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting)
|
||||||
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||||
warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)
|
warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)
|
||||||
data_dir: "./data/processed/classification/emotion" # Directory containing train/validation/test JSONL files
|
data_dir: "./data/processed/classification/emotion" # Directory containing train/validation/test JSONL files
|
||||||
output_dir: "./results/classification/emotion_model" # Output directory for saved model
|
output_dir: "./results/classification/emotion_model" # Output directory for saved model and checkpoints
|
||||||
|
|
||||||
# Inference Configuration
|
# Inference Configuration
|
||||||
inference:
|
inference:
|
||||||
model_path: "./results/classification/emotion_model" # Path to saved model directory
|
model_path: "./results/classification/emotion_model" # Path to saved model directory
|
||||||
device: "auto" # Device: "auto", "cuda", "cpu"
|
device: "auto" # Device: "auto", "cuda", "cpu" (auto detects best available)
|
||||||
batch_size: 32 # Batch size for inference
|
batch_size: 32 # Batch size for inference (can be larger than training)
|
||||||
return_probabilities: true # Return all class probabilities
|
return_probabilities: true # Return all class probabilities (not just top prediction)
|
||||||
return_top_k: 3 # Return top K predictions
|
return_top_k: 3 # Return top K predictions (useful for confidence analysis)
|
||||||
|
|||||||
+60
-20
@@ -1,29 +1,69 @@
|
|||||||
|
# Comprehensive Styling Configuration
|
||||||
|
# This file defines all parameters for formal style transfer tasks
|
||||||
|
# Organized by level: task, data processing, model, training, and inference
|
||||||
|
|
||||||
|
# Task Configuration
|
||||||
task:
|
task:
|
||||||
name: "styling"
|
name: "styling" # Task type: classification, completion, styling, matching
|
||||||
type: "style_transfer"
|
type: "style_transfer" # Model type: style_transfer, text_generation, etc.
|
||||||
|
|
||||||
|
# Data Processing Configuration
|
||||||
data:
|
data:
|
||||||
source: "custom"
|
source: "custom" # Data source: "huggingface" or "custom"
|
||||||
input_field: "text"
|
data_path: "./data/raw/styling/sample_formal.jsonl" # Path to custom data file (required for custom source)
|
||||||
style_field: "style"
|
dataset_name: null # HuggingFace dataset name (required for huggingface source)
|
||||||
max_length: 256
|
|
||||||
train_split: 0.8
|
# Field Mapping
|
||||||
validation_split: 0.1
|
input_field: "text" # Field name containing source text to be styled
|
||||||
test_split: 0.1
|
output_field: "styled_text" # Field name containing the styled/transformed text
|
||||||
|
|
||||||
|
# Style Instruction
|
||||||
|
instruction: "Rewrite the following text in a formal style" # The style instruction that guides the transformation
|
||||||
|
|
||||||
|
# Data Format & Processing
|
||||||
|
data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data)
|
||||||
|
max_length: 256 # Maximum text length (truncate longer texts)
|
||||||
|
min_length: 10 # Minimum text length (filter out shorter texts)
|
||||||
|
|
||||||
|
# Text Preprocessing
|
||||||
|
clean_text: true # Clean and normalize text (remove extra spaces, normalize quotes, etc.)
|
||||||
|
lowercase: false # Convert text to lowercase (false for formal style to preserve case)
|
||||||
|
|
||||||
|
# Data Splitting
|
||||||
|
train_split: 0.8 # Training split ratio (0.0 to 1.0)
|
||||||
|
validation_split: 0.1 # Validation split ratio (0.0 to 1.0)
|
||||||
|
test_split: 0.1 # Test split ratio (0.0 to 1.0)
|
||||||
|
|
||||||
|
# Output Configuration
|
||||||
|
output_format: "alpaca" # Output format: "styling" (raw), "alpaca" (instruction format)
|
||||||
|
output_dir: "./data/processed/styling/formal" # Output directory for processed data and HuggingFace datasets
|
||||||
|
|
||||||
|
# Model Configuration
|
||||||
model:
|
model:
|
||||||
name: "t5-base"
|
name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model name from HuggingFace Hub
|
||||||
max_length: 256
|
max_length: 2048 # Maximum sequence length for tokenization
|
||||||
|
max_seq_length: 2048 # Maximum sequence length for training (RoPE scaling supported)
|
||||||
|
dtype: null # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+
|
||||||
|
load_in_4bit: true # Use 4bit quantization to reduce memory usage
|
||||||
|
token: null # HuggingFace token for gated models (e.g., "hf_...")
|
||||||
|
|
||||||
|
# Training Model Parameters
|
||||||
|
training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model to use for training
|
||||||
|
training_max_seq_length: 2048 # Max sequence length for training
|
||||||
|
training_dtype: null # Data type for training
|
||||||
|
training_load_in_4bit: true # 4bit quantization for training
|
||||||
|
|
||||||
|
# Training Configuration
|
||||||
training:
|
training:
|
||||||
num_epochs: 3
|
num_epochs: 3 # Number of training epochs
|
||||||
batch_size: 16
|
batch_size: 16 # Training batch size (adjust based on GPU memory)
|
||||||
learning_rate: 3e-5
|
learning_rate: 3e-5 # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning)
|
||||||
weight_decay: 0.01
|
weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting)
|
||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0)
|
||||||
lr_scheduler_type: "linear"
|
lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial"
|
||||||
|
|
||||||
|
# Inference Configuration
|
||||||
inference:
|
inference:
|
||||||
batch_size: 32
|
batch_size: 32 # Batch size for inference (can be larger than training)
|
||||||
max_new_tokens: 128
|
max_new_tokens: 128 # Maximum new tokens to generate during inference
|
||||||
temperature: 0.8
|
temperature: 0.8 # Sampling temperature (0.0 = deterministic, 1.0 = random)
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to the store later.", "output": "I will go to the store later."}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"}
|
||||||
Binary file not shown.
@@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
"citation": "",
|
||||||
|
"description": "",
|
||||||
|
"features": {
|
||||||
|
"instruction": {
|
||||||
|
"dtype": "string",
|
||||||
|
"_type": "Value"
|
||||||
|
},
|
||||||
|
"input": {
|
||||||
|
"dtype": "string",
|
||||||
|
"_type": "Value"
|
||||||
|
},
|
||||||
|
"output": {
|
||||||
|
"dtype": "string",
|
||||||
|
"_type": "Value"
|
||||||
|
},
|
||||||
|
"text": {
|
||||||
|
"dtype": "string",
|
||||||
|
"_type": "Value"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"homepage": "",
|
||||||
|
"license": ""
|
||||||
|
}
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
{
|
||||||
|
"_data_files": [
|
||||||
|
{
|
||||||
|
"filename": "data-00000-of-00001.arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"_fingerprint": "4e028847697e7b16",
|
||||||
|
"_format_columns": null,
|
||||||
|
"_format_kwargs": {},
|
||||||
|
"_format_type": null,
|
||||||
|
"_output_all_columns": false,
|
||||||
|
"_split": null
|
||||||
|
}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "That's totally awesome!", "output": "That is quite remarkable!"}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to the store later.", "output": "I will go to the store later."}
|
||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"}
|
||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "What's the deal with this?", "output": "What is the situation regarding this matter?"}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "That's totally awesome!", "output": "That is quite remarkable!"}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to the store later.", "output": "I will go to the store later."}
|
||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"}
|
||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "What's the deal with this?", "output": "What is the situation regarding this matter?"}
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."}
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{"text": "Hey, what's up? How are you doing today?", "styled_text": "Hello, how are you doing today?"}
|
||||||
|
{"text": "This is really cool stuff!", "styled_text": "This is quite impressive material."}
|
||||||
|
{"text": "I'm gonna go to the store later.", "styled_text": "I will go to the store later."}
|
||||||
|
{"text": "What's the deal with this?", "styled_text": "What is the situation regarding this matter?"}
|
||||||
|
{"text": "That's totally awesome!", "styled_text": "That is quite remarkable!"}
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
{"input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"}
|
||||||
|
{"input": "This is really cool stuff!", "output": "This is quite impressive material."}
|
||||||
|
{"input": "I'm gonna go to the store later.", "output": "I will go to the store later."}
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
{"text": "Hello world", "styled_text": "Greetings, world."}
|
||||||
|
{"styled_text": "This is a formal greeting."}
|
||||||
|
{"text": "How are you?", "styled_text": "How are you doing?"}
|
||||||
|
{"text": null, "styled_text": "Empty input example."}
|
||||||
|
{"styled_text": "Another example with no input."}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,346 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Styling Inference Pipeline using Trained Models
|
||||||
|
Supports style transfer inference with streaming and batch processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional, List, Union
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add the project root to the path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from utils.config.config_manager import ConfigManager
|
||||||
|
from utils.logging.logging import setup_logging
|
||||||
|
|
||||||
|
# Inference imports
|
||||||
|
import torch
|
||||||
|
from datasets import load_from_disk, Dataset
|
||||||
|
from unsloth import FastLanguageModel
|
||||||
|
from transformers import TextStreamer
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class StylingInference:
|
||||||
|
"""Styling task inference using trained models"""
|
||||||
|
|
||||||
|
def __init__(self, config: Dict[str, Any]):
|
||||||
|
self.config = config
|
||||||
|
self.model = None
|
||||||
|
self.tokenizer = None
|
||||||
|
|
||||||
|
# Set device
|
||||||
|
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
logger.info(f"Using device: {self.device}")
|
||||||
|
|
||||||
|
# Model parameters
|
||||||
|
self.model_path = config.get('model_path')
|
||||||
|
self.max_seq_length = config.get('max_seq_length', 2048)
|
||||||
|
self.dtype = config.get('dtype', None)
|
||||||
|
self.load_in_4bit = config.get('load_in_4bit', True)
|
||||||
|
self.hf_token = config.get('hf_token', None)
|
||||||
|
|
||||||
|
# Inference parameters
|
||||||
|
self.batch_size = config.get('batch_size', 1)
|
||||||
|
self.max_new_tokens = config.get('max_new_tokens', 128)
|
||||||
|
self.temperature = config.get('temperature', 0.8)
|
||||||
|
self.top_p = config.get('top_p', 0.9)
|
||||||
|
self.do_sample = config.get('do_sample', True)
|
||||||
|
|
||||||
|
# Alpaca prompt template
|
||||||
|
self.alpaca_prompt = config.get('alpaca_prompt', """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction
|
||||||
|
|
||||||
|
### Instruction:
|
||||||
|
{}
|
||||||
|
|
||||||
|
### Input:
|
||||||
|
{}
|
||||||
|
|
||||||
|
### Response:
|
||||||
|
{}""")
|
||||||
|
|
||||||
|
# Style instruction
|
||||||
|
self.style_instruction = config.get('style_instruction', 'Rewrite the following text in a formal style')
|
||||||
|
|
||||||
|
def load_model_and_tokenizer(self):
|
||||||
|
"""Load the trained model and tokenizer"""
|
||||||
|
logger.info("Loading model and tokenizer...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
if self.model_path and Path(self.model_path).exists():
|
||||||
|
# Load local trained model
|
||||||
|
logger.info(f"Loading local model from: {self.model_path}")
|
||||||
|
self.model, self.tokenizer = FastLanguageModel.from_pretrained(
|
||||||
|
model_name=self.model_path,
|
||||||
|
max_seq_length=self.max_seq_length,
|
||||||
|
dtype=self.dtype,
|
||||||
|
load_in_4bit=self.load_in_4bit,
|
||||||
|
token=self.hf_token
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Load base model from HuggingFace Hub
|
||||||
|
logger.info(f"Loading base model: {self.config.get('base_model_name', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit')}")
|
||||||
|
self.model, self.tokenizer = FastLanguageModel.from_pretrained(
|
||||||
|
model_name=self.config.get('base_model_name', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),
|
||||||
|
max_seq_length=self.max_seq_length,
|
||||||
|
dtype=self.dtype,
|
||||||
|
load_in_4bit=self.load_in_4bit,
|
||||||
|
token=self.hf_token
|
||||||
|
)
|
||||||
|
|
||||||
|
# Prepare for inference
|
||||||
|
FastLanguageModel.for_inference(self.model)
|
||||||
|
|
||||||
|
logger.info(f"✅ Model loaded successfully")
|
||||||
|
logger.info(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error loading model: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def format_prompt(self, instruction: str, input_text: str, output: str = "") -> str:
|
||||||
|
"""Format the prompt using Alpaca template"""
|
||||||
|
return self.alpaca_prompt.format(instruction, input_text, output)
|
||||||
|
|
||||||
|
def generate_text(self, prompt: str, max_new_tokens: Optional[int] = None) -> str:
|
||||||
|
"""Generate text from a single prompt"""
|
||||||
|
try:
|
||||||
|
# Tokenize input
|
||||||
|
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
|
||||||
|
|
||||||
|
# Set generation parameters
|
||||||
|
gen_kwargs = {
|
||||||
|
"max_new_tokens": max_new_tokens or self.max_new_tokens,
|
||||||
|
"temperature": self.temperature,
|
||||||
|
"top_p": self.top_p,
|
||||||
|
"do_sample": self.do_sample,
|
||||||
|
"use_cache": True,
|
||||||
|
"pad_token_id": self.tokenizer.eos_token_id
|
||||||
|
}
|
||||||
|
|
||||||
|
# Generate
|
||||||
|
with torch.no_grad():
|
||||||
|
outputs = self.model.generate(**inputs, **gen_kwargs)
|
||||||
|
|
||||||
|
# Decode
|
||||||
|
generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
|
||||||
|
|
||||||
|
# Extract only the generated part (remove input prompt)
|
||||||
|
if prompt in generated_text:
|
||||||
|
generated_text = generated_text[len(prompt):].strip()
|
||||||
|
|
||||||
|
return generated_text
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error generating text: {e}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
def style_transfer(self, input_text: str, instruction: Optional[str] = None, streaming: bool = False) -> str:
|
||||||
|
"""Perform style transfer on input text"""
|
||||||
|
if instruction is None:
|
||||||
|
instruction = self.style_instruction
|
||||||
|
|
||||||
|
# Format prompt
|
||||||
|
prompt = self.format_prompt(instruction, input_text, "")
|
||||||
|
|
||||||
|
logger.info(f"Style transfer prompt: {prompt}")
|
||||||
|
|
||||||
|
if streaming:
|
||||||
|
logger.info("Generating with streaming...")
|
||||||
|
self.generate_text_streaming(prompt)
|
||||||
|
return ""
|
||||||
|
else:
|
||||||
|
logger.info("Generating text...")
|
||||||
|
result = self.generate_text(prompt)
|
||||||
|
logger.info(f"Generated result: {result}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
def generate_text_streaming(self, prompt: str, max_new_tokens: Optional[int] = None):
|
||||||
|
"""Generate text with streaming output"""
|
||||||
|
try:
|
||||||
|
# Tokenize input
|
||||||
|
inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
|
||||||
|
|
||||||
|
# Setup text streamer
|
||||||
|
text_streamer = TextStreamer(self.tokenizer)
|
||||||
|
|
||||||
|
# Set generation parameters
|
||||||
|
gen_kwargs = {
|
||||||
|
"max_new_tokens": max_new_tokens or self.max_new_tokens,
|
||||||
|
"temperature": self.temperature,
|
||||||
|
"top_p": self.top_p,
|
||||||
|
"do_sample": self.do_sample,
|
||||||
|
"use_cache": True,
|
||||||
|
"pad_token_id": self.tokenizer.eos_token_id
|
||||||
|
}
|
||||||
|
|
||||||
|
# Generate with streaming
|
||||||
|
with torch.no_grad():
|
||||||
|
_ = self.model.generate(**inputs, streamer=text_streamer, **gen_kwargs)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error in streaming generation: {e}")
|
||||||
|
|
||||||
|
def batch_style_transfer(self, input_texts: List[str], instruction: Optional[str] = None) -> List[str]:
|
||||||
|
"""Perform style transfer on multiple input texts"""
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for i, input_text in enumerate(input_texts):
|
||||||
|
logger.info(f"Processing text {i+1}/{len(input_texts)}")
|
||||||
|
result = self.style_transfer(input_text, instruction)
|
||||||
|
results.append(result)
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
def load_inference_config(config_path: str) -> Dict[str, Any]:
|
||||||
|
"""Load inference configuration from YAML file"""
|
||||||
|
try:
|
||||||
|
with open(config_path, 'r', encoding='utf-8') as f:
|
||||||
|
config = yaml.safe_load(f)
|
||||||
|
|
||||||
|
# Extract inference configuration
|
||||||
|
inference_config = {}
|
||||||
|
|
||||||
|
# Model configuration
|
||||||
|
if 'model' in config:
|
||||||
|
model_data = config['model']
|
||||||
|
inference_config.update({
|
||||||
|
'base_model_name': model_data.get('training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),
|
||||||
|
'max_seq_length': model_data.get('training_max_seq_length', 2048),
|
||||||
|
'dtype': model_data.get('training_dtype'),
|
||||||
|
'load_in_4bit': model_data.get('training_load_in_4bit', True),
|
||||||
|
'hf_token': model_data.get('training_token')
|
||||||
|
})
|
||||||
|
|
||||||
|
# Inference configuration
|
||||||
|
if 'inference' in config:
|
||||||
|
inference_data = config['inference']
|
||||||
|
inference_config.update({
|
||||||
|
'batch_size': inference_data.get('batch_size', 1),
|
||||||
|
'max_new_tokens': inference_data.get('max_new_tokens', 128),
|
||||||
|
'temperature': inference_data.get('temperature', 0.8)
|
||||||
|
})
|
||||||
|
|
||||||
|
# Style configuration
|
||||||
|
if 'data' in config:
|
||||||
|
data_config = config['data']
|
||||||
|
inference_config.update({
|
||||||
|
'style_instruction': data_config.get('instruction', 'Rewrite the following text in a formal style')
|
||||||
|
})
|
||||||
|
|
||||||
|
return inference_config
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error loading inference config: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main inference function"""
|
||||||
|
parser = argparse.ArgumentParser(description="Styling Inference Pipeline")
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
|
||||||
|
parser.add_argument("--model-path", type=str, help="Path to trained model (optional, uses base model if not provided)")
|
||||||
|
|
||||||
|
# Inference modes
|
||||||
|
parser.add_argument("--text", type=str, help="Single text to style transfer")
|
||||||
|
parser.add_argument("--input-file", type=str, help="File containing texts to process (one per line)")
|
||||||
|
|
||||||
|
# Generation parameters
|
||||||
|
parser.add_argument("--max-tokens", type=int, help="Maximum new tokens to generate")
|
||||||
|
parser.add_argument("--temperature", type=float, help="Sampling temperature")
|
||||||
|
parser.add_argument("--streaming", action="store_true", help="Enable streaming generation")
|
||||||
|
parser.add_argument("--instruction", type=str, help="Custom style instruction")
|
||||||
|
|
||||||
|
# Output
|
||||||
|
parser.add_argument("--output-file", type=str, help="Output file for results")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
setup_logging()
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Load configuration
|
||||||
|
logger.info(f"Loading configuration from: {args.config}")
|
||||||
|
inference_config = load_inference_config(args.config)
|
||||||
|
|
||||||
|
# Override with CLI arguments
|
||||||
|
if args.model_path:
|
||||||
|
inference_config['model_path'] = args.model_path
|
||||||
|
if args.max_tokens:
|
||||||
|
inference_config['max_new_tokens'] = args.max_tokens
|
||||||
|
if args.temperature:
|
||||||
|
inference_config['temperature'] = args.temperature
|
||||||
|
if args.instruction:
|
||||||
|
inference_config['style_instruction'] = args.instruction
|
||||||
|
|
||||||
|
logger.info("Inference configuration:")
|
||||||
|
for key, value in inference_config.items():
|
||||||
|
logger.info(f" {key}: {value}")
|
||||||
|
|
||||||
|
# Initialize inference
|
||||||
|
inferencer = StylingInference(inference_config)
|
||||||
|
|
||||||
|
# Load model
|
||||||
|
inferencer.load_model_and_tokenizer()
|
||||||
|
|
||||||
|
# Run inference based on mode
|
||||||
|
if args.text:
|
||||||
|
# Single text inference
|
||||||
|
logger.info("Running single text inference...")
|
||||||
|
result = inferencer.style_transfer(args.text, args.instruction, args.streaming)
|
||||||
|
if not args.streaming:
|
||||||
|
print(f"\nGenerated text: {result}")
|
||||||
|
|
||||||
|
elif args.input_file:
|
||||||
|
# Batch file inference
|
||||||
|
logger.info("Running batch file inference...")
|
||||||
|
with open(args.input_file, 'r', encoding='utf-8') as f:
|
||||||
|
input_texts = [line.strip() for line in f if line.strip()]
|
||||||
|
|
||||||
|
results = inferencer.batch_style_transfer(input_texts, args.instruction)
|
||||||
|
|
||||||
|
# Save results
|
||||||
|
output_file = args.output_file or f"{Path(args.input_file).stem}_styled.txt"
|
||||||
|
with open(output_file, 'w', encoding='utf-8') as f:
|
||||||
|
for input_text, result in zip(input_texts, results):
|
||||||
|
f.write(f"Input: {input_text}\n")
|
||||||
|
f.write(f"Output: {result}\n")
|
||||||
|
f.write("-" * 50 + "\n")
|
||||||
|
|
||||||
|
logger.info(f"✅ Results saved to: {output_file}")
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Interactive mode
|
||||||
|
logger.info("Entering interactive mode. Type 'quit' to exit.")
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
user_input = input("\nEnter text to style (or 'quit'): ").strip()
|
||||||
|
if user_input.lower() == 'quit':
|
||||||
|
break
|
||||||
|
|
||||||
|
if user_input:
|
||||||
|
result = inferencer.style_transfer(user_input, args.instruction, args.streaming)
|
||||||
|
if not args.streaming:
|
||||||
|
print(f"\nStyled text: {result}")
|
||||||
|
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing input: {e}")
|
||||||
|
|
||||||
|
logger.info("🎉 Inference completed successfully!")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Inference failed: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,446 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Styling Training Pipeline using Unsloth and SFTTrainer
|
||||||
|
Supports style transfer tasks with LoRA fine-tuning
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
# Add the project root to the path
|
||||||
|
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
from utils.config.config_manager import ConfigManager
|
||||||
|
#from utils.logging.logging import setup_logging
|
||||||
|
|
||||||
|
# Training imports
|
||||||
|
import torch
|
||||||
|
from datasets import load_from_disk, Dataset
|
||||||
|
from unsloth import FastLanguageModel, is_bfloat16_supported
|
||||||
|
from trl import SFTTrainer
|
||||||
|
from transformers import TrainingArguments
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
class StylingTrainer:
|
||||||
|
"""Styling task trainer using Unsloth and SFTTrainer"""
|
||||||
|
|
||||||
|
def __init__(self, config: Dict[str, Any]):
|
||||||
|
self.config = config
|
||||||
|
self.model = None
|
||||||
|
self.tokenizer = None
|
||||||
|
self.trainer = None
|
||||||
|
|
||||||
|
# Set device
|
||||||
|
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
logger.info(f"Using device: {self.device}")
|
||||||
|
|
||||||
|
# Training parameters
|
||||||
|
self.max_seq_length = config.get('max_seq_length', 2048)
|
||||||
|
self.dtype = config.get('dtype', None)
|
||||||
|
self.load_in_4bit = config.get('load_in_4bit', True)
|
||||||
|
self.hf_token = config.get('hf_token', None)
|
||||||
|
|
||||||
|
# LoRA parameters
|
||||||
|
self.lora_r = config.get('lora_r', 16)
|
||||||
|
self.lora_alpha = config.get('lora_alpha', 16)
|
||||||
|
self.lora_dropout = config.get('lora_dropout', 0)
|
||||||
|
self.target_modules = config.get('target_modules', [
|
||||||
|
"q_proj", "k_proj", "v_proj", "o_proj",
|
||||||
|
"gate_proj", "up_proj", "down_proj"
|
||||||
|
])
|
||||||
|
|
||||||
|
# Training arguments
|
||||||
|
self.batch_size = config.get('batch_size', 2)
|
||||||
|
self.gradient_accumulation_steps = config.get('gradient_accumulation_steps', 4)
|
||||||
|
self.learning_rate = config.get('learning_rate', 2e-4)
|
||||||
|
self.num_epochs = config.get('num_epochs', 1)
|
||||||
|
self.max_steps = config.get('max_steps', None)
|
||||||
|
self.warmup_steps = config.get('warmup_steps', 5)
|
||||||
|
self.weight_decay = config.get('weight_decay', 0.01)
|
||||||
|
self.seed = config.get('seed', 3407)
|
||||||
|
|
||||||
|
# Output paths
|
||||||
|
self.output_dir = config.get('output_dir', './outputs')
|
||||||
|
self.model_output_dir = config.get('model_output_dir', './models/styling')
|
||||||
|
|
||||||
|
def load_model_and_tokenizer(self):
|
||||||
|
"""Load the pre-trained model and tokenizer"""
|
||||||
|
logger.info("Loading model and tokenizer...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.model, self.tokenizer = FastLanguageModel.from_pretrained(
|
||||||
|
model_name=self.config['model_name'],
|
||||||
|
max_seq_length=self.max_seq_length,
|
||||||
|
dtype=self.dtype,
|
||||||
|
load_in_4bit=self.load_in_4bit,
|
||||||
|
token=self.hf_token
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"✅ Model loaded: {self.config['model_name']}")
|
||||||
|
logger.info(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error loading model: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def setup_lora(self):
|
||||||
|
"""Setup LoRA for efficient fine-tuning"""
|
||||||
|
logger.info("Setting up LoRA configuration...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.model = FastLanguageModel.get_peft_model(
|
||||||
|
self.model,
|
||||||
|
r=self.lora_r,
|
||||||
|
target_modules=self.target_modules,
|
||||||
|
lora_alpha=self.lora_alpha,
|
||||||
|
lora_dropout=self.lora_dropout,
|
||||||
|
bias="none",
|
||||||
|
use_gradient_checkpointing="unsloth",
|
||||||
|
random_state=self.seed,
|
||||||
|
use_rslora=False,
|
||||||
|
loftq_config=None
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"✅ LoRA configured with r={self.lora_r}, alpha={self.lora_alpha}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error setting up LoRA: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def load_dataset(self, dataset_path: str) -> Dataset:
|
||||||
|
"""Load the training dataset"""
|
||||||
|
logger.info(f"Loading dataset from: {dataset_path}")
|
||||||
|
|
||||||
|
try:
|
||||||
|
if Path(dataset_path).exists():
|
||||||
|
# Check if it's a HuggingFace dataset directory
|
||||||
|
if (Path(dataset_path) / "dataset_info.json").exists():
|
||||||
|
# Load from HuggingFace dataset directory
|
||||||
|
dataset = load_from_disk(dataset_path)
|
||||||
|
logger.info(f"Loaded HuggingFace dataset from disk: {len(dataset)} samples")
|
||||||
|
else:
|
||||||
|
# Load from processed data files (JSONL format)
|
||||||
|
logger.info("Loading from processed data files...")
|
||||||
|
from datasets import Dataset
|
||||||
|
import json
|
||||||
|
|
||||||
|
all_data = []
|
||||||
|
data_dir = Path(dataset_path)
|
||||||
|
|
||||||
|
# Look for train.jsonl, validation.jsonl, test.jsonl
|
||||||
|
for split_file in ["train.jsonl", "validation.jsonl", "test.jsonl"]:
|
||||||
|
file_path = data_dir / split_file
|
||||||
|
if file_path.exists():
|
||||||
|
logger.info(f"Loading {split_file}...")
|
||||||
|
with open(file_path, 'r', encoding='utf-8') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.strip():
|
||||||
|
data = json.loads(line)
|
||||||
|
all_data.append(data)
|
||||||
|
|
||||||
|
if not all_data:
|
||||||
|
raise ValueError(f"No data found in {dataset_path}")
|
||||||
|
|
||||||
|
# Create HuggingFace dataset
|
||||||
|
dataset = Dataset.from_list(all_data)
|
||||||
|
logger.info(f"Created HuggingFace dataset from {len(all_data)} samples")
|
||||||
|
else:
|
||||||
|
# Try loading from HuggingFace Hub
|
||||||
|
logger.info(f"Attempting to load from HuggingFace Hub: {dataset_path}")
|
||||||
|
dataset = Dataset.load_dataset(dataset_path, split="train")
|
||||||
|
logger.info(f"Loaded from HuggingFace Hub: {len(dataset)} samples")
|
||||||
|
|
||||||
|
logger.info(f"Dataset loaded: {len(dataset)} samples")
|
||||||
|
logger.info(f"Dataset features: {dataset.features}")
|
||||||
|
|
||||||
|
# Verify required fields exist
|
||||||
|
required_fields = ["instruction", "input", "output"]
|
||||||
|
missing_fields = [field for field in required_fields if field not in dataset.features]
|
||||||
|
if missing_fields:
|
||||||
|
raise ValueError(f"Missing required fields in dataset: {missing_fields}")
|
||||||
|
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error loading dataset: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def setup_trainer(self, train_dataset: Dataset):
|
||||||
|
"""Setup the SFTTrainer"""
|
||||||
|
logger.info("Setting up SFTTrainer...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# First, map the dataset to create the text field with EOS token
|
||||||
|
def formatting_prompts_func(examples):
|
||||||
|
instructions = examples["instruction"]
|
||||||
|
inputs = examples["input"]
|
||||||
|
outputs = examples["output"]
|
||||||
|
texts = []
|
||||||
|
|
||||||
|
for instruction, input_text, output in zip(instructions, inputs, outputs):
|
||||||
|
# Must add EOS_TOKEN, otherwise your generation will go on forever!
|
||||||
|
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction
|
||||||
|
|
||||||
|
### Instruction:
|
||||||
|
{}
|
||||||
|
|
||||||
|
### Input:
|
||||||
|
{}
|
||||||
|
|
||||||
|
### Response:
|
||||||
|
{}"""
|
||||||
|
text = alpaca_prompt.format(instruction, input_text, output) + self.tokenizer.eos_token
|
||||||
|
texts.append(text)
|
||||||
|
|
||||||
|
return {"text": texts}
|
||||||
|
|
||||||
|
# Apply the formatting function to create the text field
|
||||||
|
logger.info("Mapping dataset to create text field with EOS token...")
|
||||||
|
formatted_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=train_dataset.column_names)
|
||||||
|
|
||||||
|
logger.info(f"Dataset mapped successfully. New features: {formatted_dataset.features}")
|
||||||
|
logger.info(f"Sample text field: {formatted_dataset[0]['text'][:100]}...")
|
||||||
|
|
||||||
|
# Training arguments
|
||||||
|
training_args = TrainingArguments(
|
||||||
|
per_device_train_batch_size=self.batch_size,
|
||||||
|
gradient_accumulation_steps=self.gradient_accumulation_steps,
|
||||||
|
warmup_steps=self.warmup_steps,
|
||||||
|
num_train_epochs=self.num_epochs,
|
||||||
|
max_steps=self.max_steps,
|
||||||
|
learning_rate=self.learning_rate,
|
||||||
|
fp16=not is_bfloat16_supported(),
|
||||||
|
bf16=is_bfloat16_supported(),
|
||||||
|
logging_steps=1,
|
||||||
|
optim="adamw_8bit",
|
||||||
|
weight_decay=self.weight_decay,
|
||||||
|
lr_scheduler_type="linear",
|
||||||
|
seed=self.seed,
|
||||||
|
output_dir=self.output_dir,
|
||||||
|
report_to="none", # Disable wandb for now
|
||||||
|
save_strategy="epoch",
|
||||||
|
save_total_limit=2,
|
||||||
|
evaluation_strategy="no", # No validation for now
|
||||||
|
load_best_model_at_end=False,
|
||||||
|
remove_unused_columns=False,
|
||||||
|
dataloader_pin_memory=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create trainer with the formatted dataset
|
||||||
|
self.trainer = SFTTrainer(
|
||||||
|
model=self.model,
|
||||||
|
tokenizer=self.tokenizer,
|
||||||
|
train_dataset=formatted_dataset, # Use the formatted dataset
|
||||||
|
dataset_text_field="text", # The field we just created
|
||||||
|
max_seq_length=self.max_seq_length,
|
||||||
|
dataset_num_proc=2,
|
||||||
|
packing=False, # Can make training 5x faster for short sequences
|
||||||
|
args=training_args
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info("SFTTrainer configured successfully")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error setting up trainer: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def train(self, dataset_path: str):
|
||||||
|
"""Run the training process"""
|
||||||
|
logger.info("🚀 Starting training process...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Load model and tokenizer
|
||||||
|
self.load_model_and_tokenizer()
|
||||||
|
|
||||||
|
# Setup LoRA
|
||||||
|
self.setup_lora()
|
||||||
|
|
||||||
|
# Load dataset
|
||||||
|
train_dataset = self.load_dataset(dataset_path)
|
||||||
|
|
||||||
|
# Setup trainer
|
||||||
|
self.setup_trainer(train_dataset)
|
||||||
|
|
||||||
|
# Start training
|
||||||
|
logger.info("Starting training...")
|
||||||
|
trainer_stats = self.trainer.train()
|
||||||
|
|
||||||
|
logger.info("✅ Training completed successfully!")
|
||||||
|
logger.info(f"Training stats: {trainer_stats}")
|
||||||
|
|
||||||
|
# Save the model
|
||||||
|
self.save_model()
|
||||||
|
|
||||||
|
return trainer_stats
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Training failed: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def save_model(self):
|
||||||
|
"""Save the trained model"""
|
||||||
|
logger.info("Saving trained model...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Create output directory
|
||||||
|
Path(self.model_output_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Save model and tokenizer
|
||||||
|
self.model.save_pretrained(self.model_output_dir)
|
||||||
|
self.tokenizer.save_pretrained(self.model_output_dir)
|
||||||
|
|
||||||
|
# Save training config
|
||||||
|
config_path = Path(self.model_output_dir) / "training_config.json"
|
||||||
|
with open(config_path, 'w') as f:
|
||||||
|
json.dump(self.config, f, indent=2)
|
||||||
|
|
||||||
|
logger.info(f"✅ Model saved to: {self.model_output_dir}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error saving model: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def prepare_for_inference(self):
|
||||||
|
"""Prepare model for inference"""
|
||||||
|
logger.info("Preparing model for inference...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
FastLanguageModel.for_inference(self.model)
|
||||||
|
logger.info("✅ Model prepared for inference")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"❌ Error preparing for inference: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def load_training_config(config_path: str) -> Dict[str, Any]:
|
||||||
|
"""Load training configuration from YAML file"""
|
||||||
|
try:
|
||||||
|
with open(config_path, 'r', encoding='utf-8') as f:
|
||||||
|
config = yaml.safe_load(f)
|
||||||
|
|
||||||
|
# Extract training configuration
|
||||||
|
training_config = {}
|
||||||
|
|
||||||
|
# Model configuration
|
||||||
|
if 'model' in config:
|
||||||
|
model_data = config['model']
|
||||||
|
training_config.update({
|
||||||
|
'model_name': model_data.get('training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),
|
||||||
|
'max_seq_length': model_data.get('training_max_seq_length', 2048),
|
||||||
|
'dtype': model_data.get('training_dtype'),
|
||||||
|
'load_in_4bit': model_data.get('training_load_in_4bit', True),
|
||||||
|
'hf_token': model_data.get('training_token')
|
||||||
|
})
|
||||||
|
|
||||||
|
# Training configuration
|
||||||
|
if 'training' in config:
|
||||||
|
training_data = config['training']
|
||||||
|
training_config.update({
|
||||||
|
'num_epochs': training_data.get('num_epochs', 3),
|
||||||
|
'batch_size': training_data.get('batch_size', 2),
|
||||||
|
'learning_rate': training_data.get('learning_rate', 2e-4),
|
||||||
|
'weight_decay': training_data.get('weight_decay', 0.01),
|
||||||
|
'warmup_ratio': training_data.get('warmup_ratio', 0.1),
|
||||||
|
'lr_scheduler_type': training_data.get('lr_scheduler_type', 'linear')
|
||||||
|
})
|
||||||
|
|
||||||
|
# Data configuration - use output_dir from data section
|
||||||
|
if 'data' in config:
|
||||||
|
data_config = config['data']
|
||||||
|
output_dir = data_config.get('output_dir', './data/processed/styling')
|
||||||
|
training_config.update({
|
||||||
|
'data_output_dir': output_dir,
|
||||||
|
'dataset_path': output_dir, # Default dataset path is the output_dir
|
||||||
|
'style_instruction': data_config.get('instruction', 'Rewrite the following text in a formal style')
|
||||||
|
})
|
||||||
|
|
||||||
|
# LoRA configuration
|
||||||
|
training_config.update({
|
||||||
|
'lora_r': 16,
|
||||||
|
'lora_alpha': 16,
|
||||||
|
'lora_dropout': 0,
|
||||||
|
'target_modules': [
|
||||||
|
"q_proj", "k_proj", "v_proj", "o_proj",
|
||||||
|
"gate_proj", "up_proj", "down_proj"
|
||||||
|
],
|
||||||
|
'gradient_accumulation_steps': 4,
|
||||||
|
'max_steps': None,
|
||||||
|
'warmup_steps': 5,
|
||||||
|
'seed': 3407,
|
||||||
|
'output_dir': './outputs',
|
||||||
|
'model_output_dir': './models/styling'
|
||||||
|
})
|
||||||
|
|
||||||
|
return training_config
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error loading training config: {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main training function"""
|
||||||
|
parser = argparse.ArgumentParser(description="Styling Training Pipeline")
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
|
||||||
|
parser.add_argument("--dataset", type=str, help="Path to training dataset (HF dataset path or local path)")
|
||||||
|
parser.add_argument("--output-dir", type=str, help="Output directory for model")
|
||||||
|
parser.add_argument("--epochs", type=int, help="Number of training epochs")
|
||||||
|
parser.add_argument("--batch-size", type=int, help="Training batch size")
|
||||||
|
parser.add_argument("--learning-rate", type=float, help="Learning rate")
|
||||||
|
parser.add_argument("--max-steps", type=int, help="Maximum training steps")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Setup logging
|
||||||
|
# setup_logging() # Commented out as per user's change
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Load configuration
|
||||||
|
logger.info(f"Loading configuration from: {args.config}")
|
||||||
|
training_config = load_training_config(args.config)
|
||||||
|
|
||||||
|
# Override with CLI arguments
|
||||||
|
if args.output_dir:
|
||||||
|
training_config['model_output_dir'] = args.output_dir
|
||||||
|
if args.epochs:
|
||||||
|
training_config['num_epochs'] = args.epochs
|
||||||
|
if args.batch_size:
|
||||||
|
training_config['batch_size'] = args.batch_size
|
||||||
|
if args.learning_rate:
|
||||||
|
training_config['learning_rate'] = args.learning_rate
|
||||||
|
if args.max_steps:
|
||||||
|
training_config['max_steps'] = args.max_steps
|
||||||
|
|
||||||
|
# Determine dataset path: CLI argument takes precedence, then YAML config
|
||||||
|
dataset_path = args.dataset or training_config.get('dataset_path')
|
||||||
|
if not dataset_path:
|
||||||
|
logger.error("No dataset path provided. Use --dataset or ensure output_dir is set in YAML config.")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
logger.info("Training configuration:")
|
||||||
|
for key, value in training_config.items():
|
||||||
|
logger.info(f" {key}: {value}")
|
||||||
|
logger.info(f" Dataset path: {dataset_path}")
|
||||||
|
|
||||||
|
# Initialize trainer
|
||||||
|
trainer = StylingTrainer(training_config)
|
||||||
|
|
||||||
|
# Start training
|
||||||
|
trainer.train(dataset_path)
|
||||||
|
|
||||||
|
logger.info("Training completed successfully!")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Training failed: {e}")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
"""
|
||||||
|
Styling Scripts Package
|
||||||
|
Provides command-line interfaces for styling data processing, training, and inference
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .data_processor import (
|
||||||
|
run_with_yaml_config,
|
||||||
|
run_styling_examples,
|
||||||
|
create_sample_styling_data,
|
||||||
|
create_custom_styling_config,
|
||||||
|
show_styling_features
|
||||||
|
)
|
||||||
|
|
||||||
|
from .train import (
|
||||||
|
run_training_with_config,
|
||||||
|
create_training_example,
|
||||||
|
show_training_features
|
||||||
|
)
|
||||||
|
|
||||||
|
from .inference import (
|
||||||
|
run_inference_with_config,
|
||||||
|
create_inference_example,
|
||||||
|
run_batch_inference_example,
|
||||||
|
show_inference_features
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Data processing
|
||||||
|
'run_with_yaml_config',
|
||||||
|
'run_styling_examples',
|
||||||
|
'create_sample_styling_data',
|
||||||
|
'create_custom_styling_config',
|
||||||
|
'show_styling_features',
|
||||||
|
|
||||||
|
# Training
|
||||||
|
'run_training_with_config',
|
||||||
|
'create_training_example',
|
||||||
|
'show_training_features',
|
||||||
|
|
||||||
|
# Inference
|
||||||
|
'run_inference_with_config',
|
||||||
|
'create_inference_example',
|
||||||
|
'run_batch_inference_example',
|
||||||
|
'show_inference_features'
|
||||||
|
]
|
||||||
@@ -0,0 +1,302 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Styling data processor script that uses YAML configurations.
|
||||||
|
This provides a flexible and maintainable approach for style transfer tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_with_yaml_config(config_path: str, **cli_overrides):
|
||||||
|
"""Run styling data processor with YAML configuration"""
|
||||||
|
print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")
|
||||||
|
|
||||||
|
cmd = [
|
||||||
|
"python", "pipelines/styling/data_processor.py",
|
||||||
|
"--config", config_path
|
||||||
|
]
|
||||||
|
|
||||||
|
# Add CLI overrides
|
||||||
|
for key, value in cli_overrides.items():
|
||||||
|
if value is not None:
|
||||||
|
cmd.extend([f"--{key.replace('_', '-')}", str(value)])
|
||||||
|
|
||||||
|
print(f"Running command: {' '.join(cmd)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||||
|
print("✅ Styling data processing completed successfully!")
|
||||||
|
print(result.stdout)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"❌ Error running styling data processor: {e}")
|
||||||
|
print(f"Error output: {e.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def run_styling_examples():
|
||||||
|
"""Run styling examples with YAML configs"""
|
||||||
|
|
||||||
|
# Example 1: Formal style transfer
|
||||||
|
print("=== Example 1: Formal Style Transfer ===")
|
||||||
|
success = run_with_yaml_config(
|
||||||
|
"configs/styling/formal.yaml",
|
||||||
|
max_samples=1000, # Override YAML value
|
||||||
|
output_format="alpaca"
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
print("✅ Formal style transfer completed!")
|
||||||
|
|
||||||
|
# Example 2: Custom styling dataset (if available)
|
||||||
|
print("\n=== Example 2: Custom Styling Dataset ===")
|
||||||
|
if os.path.exists("data/raw/styling/custom_dataset.jsonl"):
|
||||||
|
success = run_with_yaml_config(
|
||||||
|
"configs/styling/formal.yaml", # Use formal config as base
|
||||||
|
data_source="custom",
|
||||||
|
data_path="data/raw/styling/custom_dataset.jsonl",
|
||||||
|
instruction="Rewrite the following text in a casual, friendly style",
|
||||||
|
output_dir="./data/processed/styling/casual"
|
||||||
|
)
|
||||||
|
if success:
|
||||||
|
print("✅ Custom styling dataset processing completed!")
|
||||||
|
else:
|
||||||
|
print("⚠️ Custom styling dataset not found, skipping...")
|
||||||
|
print(" You can create one with the 'create-sample-data' option")
|
||||||
|
|
||||||
|
def create_sample_styling_data():
|
||||||
|
"""Create sample styling dataset for testing"""
|
||||||
|
sample_data = [
|
||||||
|
{
|
||||||
|
"text": "Hey, what's up? How are you doing today?",
|
||||||
|
"styled_text": "Hello, how are you doing today?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "This is really cool stuff!",
|
||||||
|
"styled_text": "This is quite impressive material."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "I'm gonna go to the store later.",
|
||||||
|
"styled_text": "I will go to the store later."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "What's the deal with this?",
|
||||||
|
"styled_text": "What is the situation regarding this matter?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"text": "That's totally awesome!",
|
||||||
|
"styled_text": "That is quite remarkable!"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Create directory structure
|
||||||
|
data_dir = Path("data/raw/styling")
|
||||||
|
data_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Save sample data
|
||||||
|
import json
|
||||||
|
sample_file = data_dir / "sample_formal.jsonl"
|
||||||
|
with open(sample_file, 'w', encoding='utf-8') as f:
|
||||||
|
for item in sample_data:
|
||||||
|
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||||
|
|
||||||
|
print(f"✅ Created sample styling dataset: {sample_file}")
|
||||||
|
print(f" Contains {len(sample_data)} examples")
|
||||||
|
print(f" Format: text → styled_text")
|
||||||
|
print(f" Ready to use with configs/styling/formal.yaml")
|
||||||
|
|
||||||
|
def create_custom_styling_config():
|
||||||
|
"""Create a custom styling configuration file"""
|
||||||
|
custom_config = """task:
|
||||||
|
name: "styling"
|
||||||
|
type: "style_transfer"
|
||||||
|
|
||||||
|
data:
|
||||||
|
source: "custom"
|
||||||
|
input_field: "text"
|
||||||
|
output_field: "styled_text"
|
||||||
|
instruction: "Rewrite the following text in a professional business style"
|
||||||
|
data_format: "jsonl"
|
||||||
|
max_length: 512
|
||||||
|
min_length: 10
|
||||||
|
clean_text: true
|
||||||
|
lowercase: false
|
||||||
|
train_split: 0.8
|
||||||
|
validation_split: 0.1
|
||||||
|
test_split: 0.1
|
||||||
|
output_format: "alpaca"
|
||||||
|
output_dir: "./data/processed/styling/professional"
|
||||||
|
|
||||||
|
model:
|
||||||
|
name: "t5-base"
|
||||||
|
max_length: 512
|
||||||
|
|
||||||
|
training:
|
||||||
|
num_epochs: 3
|
||||||
|
batch_size: 16
|
||||||
|
learning_rate: 3e-5
|
||||||
|
weight_decay: 0.01
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
lr_scheduler_type: "linear"
|
||||||
|
|
||||||
|
inference:
|
||||||
|
batch_size: 32
|
||||||
|
max_new_tokens: 128
|
||||||
|
temperature: 0.8
|
||||||
|
"""
|
||||||
|
|
||||||
|
config_path = "configs/styling/professional.yaml"
|
||||||
|
os.makedirs(os.path.dirname(config_path), exist_ok=True)
|
||||||
|
|
||||||
|
with open(config_path, 'w') as f:
|
||||||
|
f.write(custom_config)
|
||||||
|
|
||||||
|
print(f"✅ Created custom styling config: {config_path}")
|
||||||
|
print(" This config is set up for professional business style transfer")
|
||||||
|
|
||||||
|
def handle_direct_args():
|
||||||
|
"""Handle direct command-line arguments by passing them to the styling pipeline"""
|
||||||
|
parser = argparse.ArgumentParser(description="Styling Data Processor")
|
||||||
|
|
||||||
|
# Add all the same arguments as the styling pipeline
|
||||||
|
parser.add_argument("--config", type=str, help="Path to YAML configuration file")
|
||||||
|
parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
|
||||||
|
parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
|
||||||
|
parser.add_argument("--data-path", type=str, help="Path to custom data file")
|
||||||
|
parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
|
||||||
|
parser.add_argument("--input-field", type=str, help="Input field name")
|
||||||
|
parser.add_argument("--output-field", type=str, help="Output field name")
|
||||||
|
parser.add_argument("--instruction", type=str, help="Style instruction")
|
||||||
|
parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
|
||||||
|
parser.add_argument("--train-split", type=float, help="Training split ratio")
|
||||||
|
parser.add_argument("--validation-split", type=float, help="Validation split ratio")
|
||||||
|
parser.add_argument("--test-split", type=float, help="Test split ratio")
|
||||||
|
parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
|
||||||
|
parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
|
||||||
|
parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
|
||||||
|
parser.add_argument("--min-length", type=int, help="Minimum text length")
|
||||||
|
parser.add_argument("--max-length", type=int, help="Maximum text length")
|
||||||
|
parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format")
|
||||||
|
parser.add_argument("--output-dir", type=str, help="Output directory")
|
||||||
|
|
||||||
|
# HuggingFace dataset options
|
||||||
|
parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset")
|
||||||
|
parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset")
|
||||||
|
|
||||||
|
# Logging
|
||||||
|
parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Build command to call the styling pipeline
|
||||||
|
cmd = ["python", "pipelines/styling/data_processor.py"]
|
||||||
|
|
||||||
|
# Add all arguments that were provided
|
||||||
|
for arg_name, arg_value in vars(args).items():
|
||||||
|
if arg_value is not None:
|
||||||
|
if isinstance(arg_value, bool):
|
||||||
|
if arg_value: # Only add flag if True
|
||||||
|
cmd.append(f"--{arg_name.replace('_', '-')}")
|
||||||
|
else:
|
||||||
|
cmd.extend([f"--{arg_name.replace('_', '-')}", str(arg_value)])
|
||||||
|
|
||||||
|
print(f"Running: {' '.join(cmd)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||||
|
print("✅ Styling data processing completed successfully!")
|
||||||
|
print(result.stdout)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"❌ Error running styling data processor: {e}")
|
||||||
|
print(f"Error output: {e.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def show_styling_features():
|
||||||
|
"""Show the features of the styling data processor"""
|
||||||
|
print("=== Styling Data Processor Features ===")
|
||||||
|
print()
|
||||||
|
print("1. **Style Transfer Tasks**:")
|
||||||
|
print(" - Formal vs. Informal style")
|
||||||
|
print(" - Professional vs. Casual tone")
|
||||||
|
print(" - Academic vs. Conversational")
|
||||||
|
print(" - Any custom style instruction")
|
||||||
|
print()
|
||||||
|
print("2. **Data Formats Supported**:")
|
||||||
|
print(" - HuggingFace datasets")
|
||||||
|
print(" - Custom JSONL/CSV/JSON files")
|
||||||
|
print(" - Automatic train/validation/test splits")
|
||||||
|
print()
|
||||||
|
print("3. **Output Formats**:")
|
||||||
|
print(" - Raw styling format (input/output)")
|
||||||
|
print(" - Alpaca format (instruction/input/output)")
|
||||||
|
print(" - HuggingFace dataset format")
|
||||||
|
print()
|
||||||
|
print("4. **Advanced Features**:")
|
||||||
|
print(" - Configurable field mapping")
|
||||||
|
print(" - Text preprocessing options")
|
||||||
|
print(" - Automatic dataset saving/loading")
|
||||||
|
print(" - YAML configuration support")
|
||||||
|
print()
|
||||||
|
print("=== Usage Examples ===")
|
||||||
|
print()
|
||||||
|
print("1. Use YAML config only:")
|
||||||
|
print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml")
|
||||||
|
print()
|
||||||
|
print("2. Override YAML values:")
|
||||||
|
print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml --max-samples 500")
|
||||||
|
print()
|
||||||
|
print("3. Create sample data:")
|
||||||
|
print(" python scripts/styling/data_processor.py create-sample-data")
|
||||||
|
print()
|
||||||
|
print("4. Create custom config:")
|
||||||
|
print(" python scripts/styling/data_processor.py create-config")
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main function"""
|
||||||
|
if len(sys.argv) > 1:
|
||||||
|
# Check if it's a subcommand
|
||||||
|
if sys.argv[1] in ["examples", "create-sample-data", "create-config", "features"]:
|
||||||
|
# Handle subcommands
|
||||||
|
if sys.argv[1] == "examples":
|
||||||
|
run_styling_examples()
|
||||||
|
elif sys.argv[1] == "create-sample-data":
|
||||||
|
create_sample_styling_data()
|
||||||
|
elif sys.argv[1] == "create-config":
|
||||||
|
create_custom_styling_config()
|
||||||
|
elif sys.argv[1] == "features":
|
||||||
|
show_styling_features()
|
||||||
|
else:
|
||||||
|
# Handle direct arguments (pass through to pipeline)
|
||||||
|
handle_direct_args()
|
||||||
|
else:
|
||||||
|
print("Styling Data Processor")
|
||||||
|
print("=====================")
|
||||||
|
print()
|
||||||
|
print("This script runs the styling data processor for style transfer tasks.")
|
||||||
|
print("It supports both YAML configurations and command-line overrides.")
|
||||||
|
print()
|
||||||
|
print("Usage:")
|
||||||
|
print(" python scripts/styling/data_processor.py examples # Run examples")
|
||||||
|
print(" python scripts/styling/data_processor.py create-sample-data # Create sample dataset")
|
||||||
|
print(" python scripts/styling/data_processor.py create-config # Create custom config")
|
||||||
|
print(" python scripts/styling/data_processor.py features # Show features")
|
||||||
|
print()
|
||||||
|
print("Direct pipeline usage:")
|
||||||
|
print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml")
|
||||||
|
print(" python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl")
|
||||||
|
print()
|
||||||
|
print("Key Features:")
|
||||||
|
print(" ✅ Style transfer with custom instructions")
|
||||||
|
print(" ✅ Multiple data source support")
|
||||||
|
print(" ✅ YAML configuration files")
|
||||||
|
print(" ✅ CLI argument overrides")
|
||||||
|
print(" ✅ Automatic data splitting")
|
||||||
|
print(" ✅ HuggingFace dataset export")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,223 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Styling Inference Script
|
||||||
|
Provides a command-line interface to run the styling inference pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_inference_with_config(config_path: str, **cli_overrides):
|
||||||
|
"""Run the styling inference pipeline with YAML configuration"""
|
||||||
|
print(f"🚀 Starting styling inference with config: {config_path}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Build command
|
||||||
|
cmd = ["python", "pipelines/styling/inference.py", "--config", config_path]
|
||||||
|
|
||||||
|
# Add CLI overrides
|
||||||
|
for key, value in cli_overrides.items():
|
||||||
|
if value is not None:
|
||||||
|
if key == "model_path":
|
||||||
|
cmd.extend(["--model-path", str(value)])
|
||||||
|
elif key == "text":
|
||||||
|
cmd.extend(["--text", str(value)])
|
||||||
|
elif key == "input_file":
|
||||||
|
cmd.extend(["--input-file", str(value)])
|
||||||
|
elif key == "max_tokens":
|
||||||
|
cmd.extend(["--max-tokens", str(value)])
|
||||||
|
elif key == "temperature":
|
||||||
|
cmd.extend(["--temperature", str(value)])
|
||||||
|
elif key == "instruction":
|
||||||
|
cmd.extend(["--instruction", str(value)])
|
||||||
|
elif key == "output_file":
|
||||||
|
cmd.extend(["--output-file", str(value)])
|
||||||
|
elif key == "streaming":
|
||||||
|
cmd.append("--streaming")
|
||||||
|
|
||||||
|
print(f"Running: {' '.join(cmd)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||||
|
print("✅ Inference completed successfully!")
|
||||||
|
print(result.stdout)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"❌ Inference failed: {e}")
|
||||||
|
print(f"Error output: {e.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def show_inference_features():
|
||||||
|
"""Show the features of the styling inference pipeline"""
|
||||||
|
print("=== Styling Inference Pipeline Features ===")
|
||||||
|
print()
|
||||||
|
print("1. **Model Support**:")
|
||||||
|
print(" - Trained LoRA models")
|
||||||
|
print(" - Base models from HuggingFace Hub")
|
||||||
|
print(" - Automatic model loading and preparation")
|
||||||
|
print()
|
||||||
|
print("2. **Inference Modes**:")
|
||||||
|
print(" - Single text inference")
|
||||||
|
print(" - Batch file processing")
|
||||||
|
print(" - Interactive mode")
|
||||||
|
print(" - Streaming generation")
|
||||||
|
print()
|
||||||
|
print("3. **Generation Control**:")
|
||||||
|
print(" - Configurable temperature and top-p")
|
||||||
|
print(" - Adjustable max tokens")
|
||||||
|
print(" - Custom style instructions")
|
||||||
|
print()
|
||||||
|
print("4. **Output Options**:")
|
||||||
|
print(" - Console output")
|
||||||
|
print(" - File output")
|
||||||
|
print(" - Streaming real-time generation")
|
||||||
|
|
||||||
|
def create_inference_example():
|
||||||
|
"""Create an inference example using the formal style configuration"""
|
||||||
|
print("=== Inference Example: Formal Style Transfer ===")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Check if we have the required files
|
||||||
|
config_path = "configs/styling/formal.yaml"
|
||||||
|
|
||||||
|
if not Path(config_path).exists():
|
||||||
|
print(f"❌ Configuration file not found: {config_path}")
|
||||||
|
print(" Please run the data processor first to create the configuration")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("✅ Found configuration file!")
|
||||||
|
print(f" Config: {config_path}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Example text
|
||||||
|
example_text = "Hey, what's up? I'm gonna go grab some food later."
|
||||||
|
|
||||||
|
print(f"📝 Example text: {example_text}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Run inference
|
||||||
|
success = run_inference_with_config(
|
||||||
|
config_path=config_path,
|
||||||
|
text=example_text,
|
||||||
|
instruction="Rewrite the following text in a formal style"
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
print("🎉 Inference example completed!")
|
||||||
|
|
||||||
|
return success
|
||||||
|
|
||||||
|
def create_test_file():
|
||||||
|
"""Create a test file with sample texts for batch inference"""
|
||||||
|
test_file = "test_texts.txt"
|
||||||
|
|
||||||
|
test_texts = [
|
||||||
|
"Hey, what's up? How are you doing today?",
|
||||||
|
"I'm gonna go to the store later to get some stuff.",
|
||||||
|
"This is pretty cool, right?",
|
||||||
|
"Can you help me out with this?",
|
||||||
|
"Thanks a lot for your help!"
|
||||||
|
]
|
||||||
|
|
||||||
|
with open(test_file, 'w', encoding='utf-8') as f:
|
||||||
|
for text in test_texts:
|
||||||
|
f.write(text + '\n')
|
||||||
|
|
||||||
|
print(f"✅ Created test file: {test_file}")
|
||||||
|
print(f" Contains {len(test_texts)} sample texts")
|
||||||
|
return test_file
|
||||||
|
|
||||||
|
def run_batch_inference_example():
|
||||||
|
"""Run a batch inference example"""
|
||||||
|
print("=== Batch Inference Example ===")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Create test file
|
||||||
|
test_file = create_test_file()
|
||||||
|
|
||||||
|
# Check configuration
|
||||||
|
config_path = "configs/styling/formal.yaml"
|
||||||
|
if not Path(config_path).exists():
|
||||||
|
print(f"❌ Configuration file not found: {config_path}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("✅ Running batch inference...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Run batch inference
|
||||||
|
success = run_inference_with_config(
|
||||||
|
config_path=config_path,
|
||||||
|
input_file=test_file,
|
||||||
|
output_file="styled_results.txt",
|
||||||
|
instruction="Rewrite the following text in a formal style"
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
print("🎉 Batch inference completed!")
|
||||||
|
print(" Results saved to: styled_results.txt")
|
||||||
|
|
||||||
|
return success
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main function"""
|
||||||
|
parser = argparse.ArgumentParser(description="Styling Inference Script")
|
||||||
|
|
||||||
|
# Subcommands
|
||||||
|
parser.add_argument("command", choices=["infer", "example", "batch", "features"],
|
||||||
|
help="Command to run")
|
||||||
|
|
||||||
|
# Inference arguments
|
||||||
|
parser.add_argument("--config", type=str, help="Path to YAML configuration file")
|
||||||
|
parser.add_argument("--model-path", type=str, help="Path to trained model")
|
||||||
|
parser.add_argument("--text", type=str, help="Single text to style transfer")
|
||||||
|
parser.add_argument("--input-file", type=str, help="File containing texts to process")
|
||||||
|
parser.add_argument("--max-tokens", type=int, help="Maximum new tokens to generate")
|
||||||
|
parser.add_argument("--temperature", type=float, help="Sampling temperature")
|
||||||
|
parser.add_argument("--instruction", type=str, help="Custom style instruction")
|
||||||
|
parser.add_argument("--output-file", type=str, help="Output file for results")
|
||||||
|
parser.add_argument("--streaming", action="store_true", help="Enable streaming generation")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.command == "features":
|
||||||
|
show_inference_features()
|
||||||
|
|
||||||
|
elif args.command == "example":
|
||||||
|
create_inference_example()
|
||||||
|
|
||||||
|
elif args.command == "batch":
|
||||||
|
run_batch_inference_example()
|
||||||
|
|
||||||
|
elif args.command == "infer":
|
||||||
|
if not args.config:
|
||||||
|
print("❌ --config is required for inference")
|
||||||
|
print("Usage: python scripts/styling/inference.py infer --config config.yaml [options]")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# Check if we have input
|
||||||
|
if not args.text and not args.input_file:
|
||||||
|
print("❌ Either --text or --input-file is required")
|
||||||
|
print("Usage: python scripts/styling/inference.py infer --config config.yaml --text 'your text'")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
success = run_inference_with_config(
|
||||||
|
config_path=args.config,
|
||||||
|
model_path=args.model_path,
|
||||||
|
text=args.text,
|
||||||
|
input_file=args.input_file,
|
||||||
|
max_tokens=args.max_tokens,
|
||||||
|
temperature=args.temperature,
|
||||||
|
instruction=args.instruction,
|
||||||
|
output_file=args.output_file,
|
||||||
|
streaming=args.streaming
|
||||||
|
)
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Styling Training Script
|
||||||
|
Provides a command-line interface to run the styling training pipeline
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
|
||||||
|
"""Run the styling training pipeline with YAML configuration"""
|
||||||
|
print(f"Starting styling training with config: {config_path}")
|
||||||
|
if dataset_path:
|
||||||
|
print(f"Training dataset: {dataset_path}")
|
||||||
|
else:
|
||||||
|
print("Training dataset: Will use output_dir from YAML config")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Build command
|
||||||
|
cmd = ["python", "pipelines/styling/train.py", "--config", config_path]
|
||||||
|
|
||||||
|
# Add dataset path if provided
|
||||||
|
if dataset_path:
|
||||||
|
cmd.extend(["--dataset", dataset_path])
|
||||||
|
|
||||||
|
# Add CLI overrides
|
||||||
|
for key, value in cli_overrides.items():
|
||||||
|
if value is not None:
|
||||||
|
if key == "output_dir":
|
||||||
|
cmd.extend(["--output-dir", str(value)])
|
||||||
|
elif key == "epochs":
|
||||||
|
cmd.extend(["--epochs", str(value)])
|
||||||
|
elif key == "batch_size":
|
||||||
|
cmd.extend(["--batch-size", str(value)])
|
||||||
|
elif key == "learning_rate":
|
||||||
|
cmd.extend(["--learning-rate", str(value)])
|
||||||
|
elif key == "max_steps":
|
||||||
|
cmd.extend(["--max-steps", str(value)])
|
||||||
|
|
||||||
|
print(f"Running: {' '.join(cmd)}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
try:
|
||||||
|
result = subprocess.run(cmd, check=True, capture_output=True, text=True)
|
||||||
|
print("Training completed successfully!")
|
||||||
|
print(result.stdout)
|
||||||
|
return True
|
||||||
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(f"Training failed: {e}")
|
||||||
|
print(f"Error output: {e.stderr}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
def show_training_features():
|
||||||
|
"""Show the features of the styling training pipeline"""
|
||||||
|
print("=== Styling Training Pipeline Features ===")
|
||||||
|
print()
|
||||||
|
print("1. **Model Support**:")
|
||||||
|
print(" - Unsloth optimized models (4x faster)")
|
||||||
|
print(" - LoRA fine-tuning for efficiency")
|
||||||
|
print(" - Support for Llama-3.1, Mistral, Phi-3, Gemma")
|
||||||
|
print()
|
||||||
|
print("2. **Training Features**:")
|
||||||
|
print(" - SFTTrainer with instruction tuning")
|
||||||
|
print(" - Automatic mixed precision (FP16/BF16)")
|
||||||
|
print(" - Gradient checkpointing for memory efficiency")
|
||||||
|
print(" - Configurable LoRA parameters")
|
||||||
|
print()
|
||||||
|
print("3. **Configuration**:")
|
||||||
|
print(" - YAML configuration files")
|
||||||
|
print(" - CLI argument overrides")
|
||||||
|
print(" - Automatic device detection")
|
||||||
|
print()
|
||||||
|
print("4. **Output**:")
|
||||||
|
print(" - Saved LoRA models")
|
||||||
|
print(" - Training logs and checkpoints")
|
||||||
|
print(" - Ready for inference")
|
||||||
|
|
||||||
|
def create_training_example():
|
||||||
|
"""Create a training example using the formal style configuration"""
|
||||||
|
print("=== Training Example: Formal Style Transfer ===")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Check if we have the required files
|
||||||
|
config_path = "configs/styling/formal.yaml"
|
||||||
|
|
||||||
|
if not Path(config_path).exists():
|
||||||
|
print(f"Configuration file not found: {config_path}")
|
||||||
|
print(" Please run the data processor first to create the configuration")
|
||||||
|
return False
|
||||||
|
|
||||||
|
print("Found required files!")
|
||||||
|
print(f" Config: {config_path}")
|
||||||
|
print(" Dataset: Will use output_dir from YAML config")
|
||||||
|
print(" The training pipeline will automatically:")
|
||||||
|
print(" - Load data from the output_dir specified in YAML")
|
||||||
|
print(" - Convert JSONL files to HuggingFace dataset format")
|
||||||
|
print(" - Apply formatting with EOS tokens")
|
||||||
|
print(" - Train the model using SFTTrainer")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Run training without explicit dataset path - will use YAML config
|
||||||
|
success = run_training_with_config(
|
||||||
|
config_path=config_path,
|
||||||
|
dataset_path=None, # Use output_dir from YAML config
|
||||||
|
epochs=1,
|
||||||
|
batch_size=2,
|
||||||
|
learning_rate=2e-4
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
print("Training example completed!")
|
||||||
|
print(" Model saved to: ./models/styling")
|
||||||
|
print(" Ready for inference!")
|
||||||
|
|
||||||
|
return success
|
||||||
|
|
||||||
|
def main():
|
||||||
|
"""Main function"""
|
||||||
|
parser = argparse.ArgumentParser(description="Styling Training Script")
|
||||||
|
|
||||||
|
# Subcommands
|
||||||
|
parser.add_argument("command", choices=["train", "example", "features"],
|
||||||
|
help="Command to run")
|
||||||
|
|
||||||
|
# Training arguments
|
||||||
|
parser.add_argument("--config", type=str, help="Path to YAML configuration file")
|
||||||
|
parser.add_argument("--dataset", type=str, help="Path to training dataset")
|
||||||
|
parser.add_argument("--output-dir", type=str, help="Output directory for model")
|
||||||
|
parser.add_argument("--epochs", type=int, help="Number of training epochs")
|
||||||
|
parser.add_argument("--batch-size", type=int, help="Training batch size")
|
||||||
|
parser.add_argument("--learning-rate", type=float, help="Learning rate")
|
||||||
|
parser.add_argument("--max-steps", type=int, help="Maximum training steps")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
if args.command == "features":
|
||||||
|
show_training_features()
|
||||||
|
|
||||||
|
elif args.command == "example":
|
||||||
|
create_training_example()
|
||||||
|
|
||||||
|
elif args.command == "train":
|
||||||
|
if not args.config:
|
||||||
|
print("❌ --config is required for training")
|
||||||
|
print("Usage: python scripts/styling/train.py train --config config.yaml")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
# If dataset is not provided, try to use output_dir from config
|
||||||
|
dataset_path = args.dataset if args.dataset else None
|
||||||
|
|
||||||
|
success = run_training_with_config(
|
||||||
|
config_path=args.config,
|
||||||
|
dataset_path=dataset_path,
|
||||||
|
output_dir=args.output_dir,
|
||||||
|
epochs=args.epochs,
|
||||||
|
batch_size=args.batch_size,
|
||||||
|
learning_rate=args.learning_rate,
|
||||||
|
max_steps=args.max_steps
|
||||||
|
)
|
||||||
|
|
||||||
|
if not success:
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
@@ -0,0 +1,251 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Test script for the styling data processor
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
|
||||||
|
from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config
|
||||||
|
|
||||||
|
def test_styling_pipeline():
|
||||||
|
"""Test the styling data processor with custom data"""
|
||||||
|
|
||||||
|
print("Testing Styling Data Processor")
|
||||||
|
print("=" * 50)
|
||||||
|
|
||||||
|
# Initialize the pipeline
|
||||||
|
pipeline = StylingDataPipeline()
|
||||||
|
|
||||||
|
# Example 1: Load configuration from YAML
|
||||||
|
print("\n1. Loading configuration from YAML...")
|
||||||
|
try:
|
||||||
|
yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
|
||||||
|
print(f" ✅ YAML config loaded successfully!")
|
||||||
|
print(f" Output directory: {yaml_config.output_dir}")
|
||||||
|
print(f" Instruction: {yaml_config.instruction}")
|
||||||
|
print(f" Input field: {yaml_config.input_field}")
|
||||||
|
print(f" Output field: {yaml_config.output_field}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ Error loading YAML config: {e}")
|
||||||
|
yaml_config = None
|
||||||
|
|
||||||
|
# Example 2: Create custom dataset configuration
|
||||||
|
print("\n2. Creating custom dataset configuration...")
|
||||||
|
custom_config = create_custom_config(
|
||||||
|
data_path="./data/raw/styling/formal_dataset.jsonl",
|
||||||
|
data_format="jsonl",
|
||||||
|
input_field="text",
|
||||||
|
output_field="styled_text",
|
||||||
|
instruction="Rewrite the following text in a formal style",
|
||||||
|
max_samples=1000,
|
||||||
|
min_length=10,
|
||||||
|
max_length=256,
|
||||||
|
clean_text=True,
|
||||||
|
lowercase=False,
|
||||||
|
output_format="alpaca"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f" Input field: {custom_config.input_field} (maps to 'input')")
|
||||||
|
print(f" Output field: {custom_config.output_field} (maps to 'output')")
|
||||||
|
print(f" Instruction: {custom_config.instruction}")
|
||||||
|
print(f" Max samples: {custom_config.max_samples}")
|
||||||
|
|
||||||
|
# Example 3: Test with sample data (if available)
|
||||||
|
print("\n3. Testing pipeline with sample data...")
|
||||||
|
|
||||||
|
# Create a sample dataset for testing
|
||||||
|
sample_data = [
|
||||||
|
{
|
||||||
|
"input": "Hey, what's up? How are you doing today?",
|
||||||
|
"output": "Hello, how are you doing today?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"input": "This is really cool stuff!",
|
||||||
|
"output": "This is quite impressive material."
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"input": "I'm gonna go to the store later.",
|
||||||
|
"output": "I will go to the store later."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Save sample data to test file
|
||||||
|
import json
|
||||||
|
test_file = "./data/raw/styling/test_formal.jsonl"
|
||||||
|
os.makedirs(os.path.dirname(test_file), exist_ok=True)
|
||||||
|
|
||||||
|
with open(test_file, 'w', encoding='utf-8') as f:
|
||||||
|
for item in sample_data:
|
||||||
|
f.write(json.dumps(item, ensure_ascii=False) + '\n')
|
||||||
|
|
||||||
|
print(f" Created test file: {test_file}")
|
||||||
|
|
||||||
|
# Test the pipeline with the sample data
|
||||||
|
try:
|
||||||
|
test_config = create_custom_config(
|
||||||
|
data_path=test_file,
|
||||||
|
data_format="jsonl",
|
||||||
|
input_field="input",
|
||||||
|
output_field="output",
|
||||||
|
instruction="Rewrite the following text in a formal style",
|
||||||
|
max_samples=10,
|
||||||
|
output_format="alpaca"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(" Running pipeline...")
|
||||||
|
result = pipeline.run_pipeline(test_config, output_format="alpaca", save_splits=True, create_hf_dataset=True, save_hf_dataset=True)
|
||||||
|
|
||||||
|
print(" ✅ Pipeline completed successfully!")
|
||||||
|
print(f" Total samples: {result['analysis']['overall']['total_samples']}")
|
||||||
|
print(f" Split sizes: {result['analysis']['overall']['split_sizes']}")
|
||||||
|
print(f" Output directory: {result['output_dir']}")
|
||||||
|
|
||||||
|
# Show HuggingFace dataset info if created
|
||||||
|
if 'hf_dataset' in result:
|
||||||
|
hf_dataset = result['hf_dataset']
|
||||||
|
print(f" HuggingFace dataset created with {len(hf_dataset)} entries")
|
||||||
|
print(f" Dataset features: {hf_dataset.features}")
|
||||||
|
|
||||||
|
# Show save path if saved to disk
|
||||||
|
if 'hf_dataset_path' in result:
|
||||||
|
print(f" Dataset saved to: {result['hf_dataset_path']}")
|
||||||
|
|
||||||
|
# Show formatted example
|
||||||
|
if len(hf_dataset) > 0:
|
||||||
|
print(f" Example formatted text:")
|
||||||
|
print(f" {hf_dataset[0]['text'][:200]}...")
|
||||||
|
|
||||||
|
# Show sample processed data
|
||||||
|
print("\n Sample processed data:")
|
||||||
|
for split_name, split_data in result['data'].items():
|
||||||
|
if split_data:
|
||||||
|
print(f" {split_name} split:")
|
||||||
|
for i, item in enumerate(split_data[:2]): # Show first 2 items
|
||||||
|
print(f" Item {i+1}:")
|
||||||
|
print(f" Instruction: {item['instruction']}")
|
||||||
|
print(f" Input: {item['input'][:50]}...")
|
||||||
|
print(f" Output: {item['output'][:50]}...")
|
||||||
|
break
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f" ❌ Error running pipeline: {e}")
|
||||||
|
|
||||||
|
print("\n" + "=" * 50)
|
||||||
|
print("Test completed!")
|
||||||
|
|
||||||
|
def test_hf_dataset_save_load():
|
||||||
|
"""Test HuggingFace dataset save and load functionality"""
|
||||||
|
|
||||||
|
print("\nTesting HuggingFace Dataset Save/Load")
|
||||||
|
print("=" * 50)
|
||||||
|
|
||||||
|
from pipelines.styling.data_processor import save_hf_dataset_to_disk, load_hf_dataset_from_disk
|
||||||
|
|
||||||
|
# Create a sample dataset for testing
|
||||||
|
sample_data = [
|
||||||
|
{
|
||||||
|
"instruction": "Rewrite in formal style",
|
||||||
|
"input": "Hey, what's up?",
|
||||||
|
"output": "Hello, how are you?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"instruction": "Rewrite in formal style",
|
||||||
|
"input": "This is really cool!",
|
||||||
|
"output": "This is quite impressive."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Test configuration
|
||||||
|
config = create_custom_config(
|
||||||
|
data_path="dummy",
|
||||||
|
instruction="Rewrite in formal style"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert to HuggingFace dataset
|
||||||
|
pipeline = StylingDataPipeline()
|
||||||
|
hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
|
||||||
|
|
||||||
|
print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")
|
||||||
|
|
||||||
|
# Test saving to disk
|
||||||
|
save_path = "./data/processed/styling/test_hf_dataset"
|
||||||
|
print(f"\nSaving dataset to: {save_path}")
|
||||||
|
|
||||||
|
success = save_hf_dataset_to_disk(hf_dataset, save_path)
|
||||||
|
if success:
|
||||||
|
print("✅ Dataset saved successfully!")
|
||||||
|
|
||||||
|
# Test loading from disk
|
||||||
|
print(f"\nLoading dataset from: {save_path}")
|
||||||
|
loaded_dataset = load_hf_dataset_from_disk(save_path)
|
||||||
|
|
||||||
|
if loaded_dataset is not None:
|
||||||
|
print("✅ Dataset loaded successfully!")
|
||||||
|
print(f"Loaded dataset has {len(loaded_dataset)} entries")
|
||||||
|
print(f"Features: {loaded_dataset.features}")
|
||||||
|
|
||||||
|
# Show sample data
|
||||||
|
print("\nSample loaded data:")
|
||||||
|
for i in range(len(loaded_dataset)):
|
||||||
|
print(f" Entry {i+1}: {loaded_dataset[i]['text'][:100]}...")
|
||||||
|
else:
|
||||||
|
print("❌ Failed to load dataset")
|
||||||
|
else:
|
||||||
|
print("❌ Failed to save dataset")
|
||||||
|
|
||||||
|
return hf_dataset
|
||||||
|
|
||||||
|
def test_hf_dataset_conversion():
|
||||||
|
"""Test the HuggingFace dataset conversion"""
|
||||||
|
|
||||||
|
print("\nTesting HuggingFace Dataset Conversion")
|
||||||
|
print("=" * 50)
|
||||||
|
|
||||||
|
pipeline = StylingDataPipeline()
|
||||||
|
|
||||||
|
# Sample data with instruction field
|
||||||
|
sample_data = [
|
||||||
|
{
|
||||||
|
"instruction": "Rewrite in formal style",
|
||||||
|
"input": "Hey, what's up?",
|
||||||
|
"output": "Hello, how are you?"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"instruction": "Rewrite in formal style",
|
||||||
|
"input": "This is really cool!",
|
||||||
|
"output": "This is quite impressive."
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
# Test configuration
|
||||||
|
config = create_custom_config(
|
||||||
|
data_path="dummy",
|
||||||
|
instruction="Rewrite in formal style"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Convert to HuggingFace dataset
|
||||||
|
hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
|
||||||
|
|
||||||
|
print(f"HuggingFace dataset created with {len(hf_dataset)} entries")
|
||||||
|
print(f"Dataset features: {hf_dataset.features}")
|
||||||
|
|
||||||
|
# Show formatted examples
|
||||||
|
print("\nFormatted examples:")
|
||||||
|
for i in range(len(hf_dataset)):
|
||||||
|
print(f" Example {i+1}:")
|
||||||
|
print(f" {hf_dataset[i]['text'][:150]}...")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Test the dataset can be used for training
|
||||||
|
print("Dataset ready for training!")
|
||||||
|
print(f"Number of training examples: {len(hf_dataset)}")
|
||||||
|
|
||||||
|
return hf_dataset
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
test_styling_pipeline()
|
||||||
|
# test_hf_dataset_save_load()
|
||||||
|
# test_hf_dataset_conversion()
|
||||||
Reference in New Issue
Block a user