From 710d074b47f0ab65e74ef9304f64622167610c95 Mon Sep 17 00:00:00 2001 From: OwusuBlessing Date: Wed, 13 Aug 2025 21:17:01 +0100 Subject: [PATCH] added style mimicking pipelines --- configs/QUICK_REFERENCE.md | 191 +++ configs/README.md | 207 +++ configs/classification/emotion.yaml | 52 +- configs/styling/formal.yaml | 80 +- data/alpaca/test.jsonl | 1 + data/alpaca/train.jsonl | 1 + data/alpaca/validation.jsonl | 1 + data/hf_dataset/data-00000-of-00001.arrow | Bin 0 -> 2208 bytes data/hf_dataset/dataset_info.json | 24 + data/hf_dataset/state.json | 13 + .../styling/formal/alpaca/test.jsonl | 1 + .../styling/formal/alpaca/train.jsonl | 3 + .../styling/formal/alpaca/validation.jsonl | 1 + data/processed/styling/formal/test.jsonl | 1 + data/processed/styling/formal/train.jsonl | 3 + .../processed/styling/formal/validation.jsonl | 1 + data/raw/styling/sample_formal.jsonl | 5 + data/raw/styling/test_formal.jsonl | 3 + data/raw/styling/test_missing_fields.jsonl | 5 + .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 184 bytes .../__pycache__/__init__.cpython-311.pyc | Bin 0 -> 192 bytes .../data_processor.cpython-311.pyc | Bin 0 -> 75235 bytes pipelines/styling/data_processor.py | 1488 +++++++++++++++++ pipelines/styling/inference.py | 346 ++++ pipelines/styling/train.py | 446 +++++ scripts/styling/__init__.py | 45 + scripts/styling/data_processor.py | 302 ++++ scripts/styling/inference.py | 223 +++ scripts/styling/train.py | 168 ++ test.py | 251 +++ test.readme | 0 31 files changed, 3816 insertions(+), 46 deletions(-) create mode 100644 configs/QUICK_REFERENCE.md create mode 100644 configs/README.md create mode 100644 data/alpaca/test.jsonl create mode 100644 data/alpaca/train.jsonl create mode 100644 data/alpaca/validation.jsonl create mode 100644 data/hf_dataset/data-00000-of-00001.arrow create mode 100644 data/hf_dataset/dataset_info.json create mode 100644 data/hf_dataset/state.json create mode 100644 data/processed/styling/formal/alpaca/test.jsonl create mode 
100644 data/processed/styling/formal/alpaca/train.jsonl create mode 100644 data/processed/styling/formal/alpaca/validation.jsonl create mode 100644 data/processed/styling/formal/test.jsonl create mode 100644 data/processed/styling/formal/train.jsonl create mode 100644 data/processed/styling/formal/validation.jsonl create mode 100644 data/raw/styling/sample_formal.jsonl create mode 100644 data/raw/styling/test_formal.jsonl create mode 100644 data/raw/styling/test_missing_fields.jsonl create mode 100644 pipelines/__pycache__/__init__.cpython-311.pyc create mode 100644 pipelines/styling/__pycache__/__init__.cpython-311.pyc create mode 100644 pipelines/styling/__pycache__/data_processor.cpython-311.pyc create mode 100644 pipelines/styling/data_processor.py create mode 100644 pipelines/styling/inference.py create mode 100644 pipelines/styling/train.py create mode 100644 scripts/styling/__init__.py create mode 100644 scripts/styling/data_processor.py create mode 100644 scripts/styling/inference.py create mode 100644 scripts/styling/train.py create mode 100644 test.py create mode 100644 test.readme diff --git a/configs/QUICK_REFERENCE.md b/configs/QUICK_REFERENCE.md new file mode 100644 index 0000000..f198c42 --- /dev/null +++ b/configs/QUICK_REFERENCE.md @@ -0,0 +1,191 @@ +# Quick Reference Card + +## Essential Parameters (Most Common) + +### Data Source & Location +```yaml +data: + source: "huggingface|custom" # REQUIRED: Data source type + dataset_name: "dataset/name" # REQUIRED for huggingface + data_path: "./path/to/file" # REQUIRED for custom + data_format: "jsonl|csv|json" # REQUIRED for custom +``` + +### Field Mapping +```yaml +data: + input_field: "text" # REQUIRED: Input text field + label_field: "label" # REQUIRED for classification + output_field: "styled_text" # REQUIRED for styling + instruction: "Style instruction" # REQUIRED for styling +``` + +### Basic Processing +```yaml +data: + max_samples: 1000 # Limit total samples + train_split: 0.8 # Training 
ratio (0.0-1.0) + validation_split: 0.1 # Validation ratio (0.0-1.0) + test_split: 0.1 # Test ratio (0.0-1.0) + output_dir: "./output/path" # Output directory +``` + +### Text Preprocessing +```yaml +data: + clean_text: true # Clean/normalize text + lowercase: true # Convert to lowercase + min_length: 10 # Minimum text length + max_length: 512 # Maximum text length +``` + +### Model & Training +```yaml +model: + name: "bert-base-uncased" # Model name + max_length: 512 # Max sequence length + +training: + num_epochs: 3 # Training epochs + batch_size: 16 # Batch size + learning_rate: 2e-5 # Learning rate +``` + +## Common Configurations by Task + +### Classification +```yaml +task: + name: "classification" + type: "sequence_classification" + +data: + source: "huggingface" + dataset_name: "dair-ai/emotion" + input_field: "text" + label_field: "label" + output_format: "classification" +``` + +### Styling +```yaml +task: + name: "styling" + type: "style_transfer" + +data: + source: "custom" + data_path: "./data.jsonl" + input_field: "text" + output_field: "styled_text" + instruction: "Rewrite in formal style" + output_format: "alpaca" +``` + +### Text Generation +```yaml +task: + name: "completion" + type: "text_generation" + +data: + source: "custom" + data_path: "./prompts.jsonl" + input_field: "prompt" + output_field: "completion" + output_format: "instruction" +``` + +## Quick Start Templates + +### 1. HuggingFace Dataset +```yaml +task: + name: "classification" + type: "sequence_classification" + +data: + source: "huggingface" + dataset_name: "your/dataset" + input_field: "text" + label_field: "label" + max_samples: 1000 + output_dir: "./output" +``` + +### 2. Custom JSONL File +```yaml +task: + name: "styling" + type: "style_transfer" + +data: + source: "custom" + data_path: "./your_data.jsonl" + data_format: "jsonl" + input_field: "source" + output_field: "target" + instruction: "Your style instruction" + output_dir: "./output" +``` + +### 3. 
CSV File +```yaml +task: + name: "classification" + type: "sequence_classification" + +data: + source: "custom" + data_path: "./your_data.csv" + data_format: "csv" + input_field: "text" + label_field: "label" + delimiter: "," + output_dir: "./output" +``` + +## Parameter Ranges & Recommendations + +### Split Ratios +- **Total must be ≤ 1.0** +- **Common**: train=0.8, val=0.1, test=0.1 +- **Small datasets**: train=0.7, val=0.15, test=0.15 + +### Learning Rates +- **Fine-tuning**: 1e-5 to 5e-5 +- **Training from scratch**: 1e-4 to 1e-3 +- **Start with**: 2e-5 + +### Batch Sizes +- **GPU Memory**: 8, 16, 32, 64 +- **CPU**: 4, 8, 16 +- **Start with**: 16 + +### Text Lengths +- **BERT**: 512 (max) +- **GPT-2**: 1024 (max) +- **T5**: 512 (max) +- **Start with**: 256 + +## Common Issues & Fixes + +| Issue | Cause | Fix | +|-------|-------|-----| +| "File not found" | Wrong path | Check `data_path` and `output_dir` | +| "Memory error" | Batch too large | Reduce `batch_size` | +| "Split error" | Ratios > 1.0 | Ensure splits sum to ≤ 1.0 | +| "Poor performance" | Wrong learning rate | Try 1e-5 to 5e-5 range | +| "Slow processing" | Text too long | Reduce `max_length` | + +## Environment Variables +```bash +# Set cache directory +export HF_HOME="./cache" + +# Set output directory +export OUTPUT_DIR="./results" + +# Set log level +export LOG_LEVEL="INFO" +``` diff --git a/configs/README.md b/configs/README.md new file mode 100644 index 0000000..6bf0d43 --- /dev/null +++ b/configs/README.md @@ -0,0 +1,207 @@ +# Configuration Files Documentation + +This directory contains YAML configuration files for different machine learning tasks. Each configuration file is organized into logical sections and includes comprehensive documentation for all parameters. + +## Configuration Structure + +All configuration files follow a consistent structure organized into these main sections: + +### 1. 
Task Configuration +```yaml +task: + name: "task_type" # Task type: classification, completion, styling, matching + type: "specific_type" # Specific model/task type +``` + +**Available Task Types:** +- **classification**: Text classification tasks (emotion, sentiment, topic, etc.) +- **completion**: Text generation and completion tasks +- **styling**: Style transfer and text transformation tasks +- **matching**: Semantic matching and similarity tasks + +### 2. Data Processing Configuration +```yaml +data: + # Data Source + source: "huggingface|custom" # Where to get data from + + # Data Location + dataset_name: "dataset/name" # HuggingFace dataset name (for huggingface source) + data_path: "./path/to/file" # Path to custom data file (for custom source) + data_format: "jsonl|csv|json" # File format for custom data + + # Field Mapping + input_field: "text" # Field containing input text + output_field: "styled_text" # Field containing output (for styling) + label_field: "label" # Field containing labels (for classification) + id_field: "id" # Optional ID field for tracking + + # Processing Parameters + max_samples: 1000 # Maximum samples to process + train_split: 0.8 # Training split ratio + validation_split: 0.1 # Validation split ratio + test_split: 0.1 # Test split ratio + + # Text Preprocessing + clean_text: true # Clean and normalize text + remove_special_chars: false # Remove special characters + lowercase: true # Convert to lowercase + min_length: 10 # Minimum text length + max_length: 1000 # Maximum text length + + # Output Configuration + output_format: "format_type" # Output format + output_dir: "./output/path" # Output directory +``` + +**Data Source Types:** +- **huggingface**: Use datasets from HuggingFace Hub +- **custom**: Use local files (JSONL, CSV, JSON) + +**Output Formats:** +- **classification**: Raw classification format +- **instruction**: Instruction-following format +- **conversation**: Conversational format +- **qa**: Question-answer format 
+- **styling**: Raw styling format +- **alpaca**: Alpaca instruction format + +### 3. Model Configuration +```yaml +model: + name: "model_name" # Model from HuggingFace Hub + max_length: 512 # Maximum sequence length + num_labels: 6 # Number of labels (for classification) +``` + +**Recommended Models by Task:** +- **Classification**: `bert-base-uncased`, `distilbert-base-uncased` +- **Styling**: `t5-base`, `gpt2-medium` +- **Completion**: `gpt2-medium`, `gpt2-large` +- **Matching**: `sentence-transformers/all-MiniLM-L6-v2` + +### 4. Training Configuration +```yaml +training: + num_epochs: 3 # Number of training epochs + batch_size: 16 # Training batch size + learning_rate: 2e-5 # Learning rate + weight_decay: 0.01 # Weight decay + lr_scheduler_type: "linear" # Learning rate scheduler + warmup_ratio: 0.1 # Warmup ratio + data_dir: "./data/path" # Training data directory + output_dir: "./model/output" # Model output directory +``` + +**Learning Rate Guidelines:** +- **Fine-tuning**: 1e-5 to 5e-5 +- **Training from scratch**: 1e-4 to 1e-3 + +**Scheduler Types:** +- **linear**: Linear decay +- **cosine**: Cosine annealing +- **polynomial**: Polynomial decay + +### 5. 
Inference Configuration +```yaml +inference: + model_path: "./model/path" # Path to saved model + device: "auto" # Device to use + batch_size: 32 # Inference batch size + return_probabilities: true # Return probabilities + return_top_k: 3 # Return top K predictions + max_new_tokens: 128 # Max tokens to generate + temperature: 0.8 # Sampling temperature +``` + +**Device Options:** +- **auto**: Automatically detect best device +- **cuda**: Use GPU if available +- **cpu**: Force CPU usage + +**Temperature Guidelines:** +- **0.0**: Deterministic (always same output) +- **0.7-0.9**: Balanced creativity +- **1.0+**: More random/creative + +## Task-Specific Parameters + +### Classification Tasks +```yaml +data: + label_encoding: "auto|numeric|string" # How to encode labels + multilabel: false # Multi-label vs single-label + label_separator: "," # Separator for multi-label +``` + +### Styling Tasks +```yaml +data: + instruction: "Style instruction text" # The style instruction +``` + +### Completion Tasks +```yaml +data: + prompt_template: "template" # Prompt template + completion_length: 100 # Target completion length +``` + +## Advanced Configuration + +### HuggingFace Specific +```yaml +data: + hf_split: "train" # Dataset split to use + hf_cache_dir: "./cache" # Cache directory + test_split_from: "train" # Source for test split + val_split_from: "train" # Source for validation split +``` + +### Custom Data Specific +```yaml +data: + encoding: "utf-8" # File encoding + delimiter: "," # CSV delimiter +``` + +## Usage Examples + +### Basic Usage +```bash +# Use YAML configuration +python scripts/task_type/data_processor.py --config configs/task_type/config.yaml + +# Override specific parameters +python scripts/task_type/data_processor.py \ + --config configs/task_type/config.yaml \ + --max-samples 1000 \ + --learning-rate 3e-5 +``` + +### Creating Custom Configurations +1. Copy an existing config file +2. Modify parameters for your specific use case +3. 
Update paths and model names +4. Test with a small dataset first + +## Best Practices + +1. **Start with Defaults**: Use default values and adjust based on results +2. **Validate Paths**: Ensure all file paths are correct and accessible +3. **Monitor Resources**: Adjust batch sizes based on available GPU memory +4. **Test Incrementally**: Test with small datasets before full processing +5. **Version Control**: Keep configurations in version control for reproducibility + +## Troubleshooting + +### Common Issues: +- **File Not Found**: Check `data_path` and `output_dir` paths +- **Memory Errors**: Reduce `batch_size` or `max_length` +- **Poor Performance**: Adjust `learning_rate` or `num_epochs` +- **Split Errors**: Ensure split ratios sum to ≤ 1.0 + +### Getting Help: +- Check the script help: `python script.py --help` +- Review the pipeline logs for detailed error messages +- Verify YAML syntax and parameter values diff --git a/configs/classification/emotion.yaml b/configs/classification/emotion.yaml index dd6958e..2827292 100644 --- a/configs/classification/emotion.yaml +++ b/configs/classification/emotion.yaml @@ -1,6 +1,6 @@ # Comprehensive Classification Configuration # This file defines all parameters for emotion classification using the dair-ai/emotion dataset -# Organized by level: data processing, model, training, and inference +# Organized by level: task, data processing, model, training, and inference # Task Configuration task: @@ -15,9 +15,9 @@ data: data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data) # Field Mapping - input_field: "text" # Field name containing input text - label_field: "label" # Field name containing labels - id_field: null # Optional ID field name + input_field: "text" # Field name containing input text to be classified + label_field: "label" # Field name containing classification labels + id_field: null # Optional ID field name for tracking individual samples # Processing Parameters max_samples: 1000 # 
Maximum samples to process (null for all samples) @@ -26,54 +26,54 @@ data: test_split: 0.1 # Test split ratio (0.0 to 1.0) # Text Preprocessing - clean_text: true # Clean and normalize text - remove_special_chars: false # Remove special characters from text - lowercase: true # Convert text to lowercase + clean_text: true # Clean and normalize text (remove extra spaces, normalize quotes, etc.) + remove_special_chars: false # Remove special characters from text (keep for emotion analysis) + lowercase: true # Convert text to lowercase (standard for BERT models) min_length: 10 # Minimum text length (filter out shorter texts) max_length: 1000 # Maximum text length (truncate longer texts) # Label Processing label_encoding: "auto" # Label encoding: "auto", "numeric", "string" - multilabel: false # Enable multilabel classification - label_separator: "," # Separator for multilabel datasets + multilabel: false # Enable multilabel classification (false for single emotion per text) + label_separator: "," # Separator for multilabel datasets (comma-separated labels) # Output Configuration output_format: "classification" # Output format: "classification", "instruction", "conversation", "qa" - output_dir: "./data/processed/classification/emotion" # Specific output directory for this dataset + output_dir: "./data/processed/classification/emotion" # Output directory for processed data and splits # HuggingFace Specific - hf_split: "train" # HuggingFace dataset split to use - hf_cache_dir: null # HuggingFace cache directory (null for default) + hf_split: "train" # HuggingFace dataset split to use as base + hf_cache_dir: null # HuggingFace cache directory (null for default ~/.cache/huggingface) # Split Configuration (Advanced) test_split_from: "train" # Source for test split: "train", "use_test_if_available", "use_val_if_available" val_split_from: "train" # Source for validation split: "train", "use_val_if_available" # Custom Data Specific - encoding: "utf-8" # File encoding for 
custom data - delimiter: "," # Delimiter for CSV files + encoding: "utf-8" # File encoding for custom data files + delimiter: "," # Delimiter for CSV files (comma for standard CSV) # Model Configuration model: - name: "bert-base-uncased" # Model name from HuggingFace Hub - max_length: 512 # Maximum sequence length for tokenization - num_labels: 6 # Number of classification labels + name: "bert-base-uncased" # Model name from HuggingFace Hub (good for text classification) + max_length: 512 # Maximum sequence length for tokenization (BERT limit) + num_labels: 6 # Number of classification labels (emotion categories) # Training Configuration training: - num_epochs: 3 # Number of training epochs - batch_size: 16 # Training batch size - learning_rate: 2e-5 # Learning rate (typical range: 1e-5 to 5e-5) - weight_decay: 0.01 # Weight decay for optimizer (typical range: 0.01 to 0.1) + num_epochs: 3 # Number of training epochs (adjust based on dataset size) + batch_size: 16 # Training batch size (adjust based on GPU memory) + learning_rate: 2e-5 # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning) + weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting) lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial" warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0) data_dir: "./data/processed/classification/emotion" # Directory containing train/validation/test JSONL files - output_dir: "./results/classification/emotion_model" # Output directory for saved model + output_dir: "./results/classification/emotion_model" # Output directory for saved model and checkpoints # Inference Configuration inference: model_path: "./results/classification/emotion_model" # Path to saved model directory - device: "auto" # Device: "auto", "cuda", "cpu" - batch_size: 32 # Batch size for inference - return_probabilities: true # Return all class probabilities - return_top_k: 3 # Return top K predictions + device: "auto" # Device: "auto", "cuda", 
"cpu" (auto detects best available) + batch_size: 32 # Batch size for inference (can be larger than training) + return_probabilities: true # Return all class probabilities (not just top prediction) + return_top_k: 3 # Return top K predictions (useful for confidence analysis) diff --git a/configs/styling/formal.yaml b/configs/styling/formal.yaml index fb79712..d13d2be 100644 --- a/configs/styling/formal.yaml +++ b/configs/styling/formal.yaml @@ -1,29 +1,69 @@ +# Comprehensive Styling Configuration +# This file defines all parameters for formal style transfer tasks +# Organized by level: task, data processing, model, training, and inference + +# Task Configuration task: - name: "styling" - type: "style_transfer" + name: "styling" # Task type: classification, completion, styling, matching + type: "style_transfer" # Model type: style_transfer, text_generation, etc. +# Data Processing Configuration data: - source: "custom" - input_field: "text" - style_field: "style" - max_length: 256 - train_split: 0.8 - validation_split: 0.1 - test_split: 0.1 + source: "custom" # Data source: "huggingface" or "custom" + data_path: "./data/raw/styling/sample_formal.jsonl" # Path to custom data file (required for custom source) + dataset_name: null # HuggingFace dataset name (required for huggingface source) + + # Field Mapping + input_field: "text" # Field name containing source text to be styled + output_field: "styled_text" # Field name containing the styled/transformed text + + # Style Instruction + instruction: "Rewrite the following text in a formal style" # The style instruction that guides the transformation + + # Data Format & Processing + data_format: "jsonl" # Data format: "jsonl", "csv", "json" (for custom data) + max_length: 256 # Maximum text length (truncate longer texts) + min_length: 10 # Minimum text length (filter out shorter texts) + + # Text Preprocessing + clean_text: true # Clean and normalize text (remove extra spaces, normalize quotes, etc.) 
+ lowercase: false # Convert text to lowercase (false for formal style to preserve case) + + # Data Splitting + train_split: 0.8 # Training split ratio (0.0 to 1.0) + validation_split: 0.1 # Validation split ratio (0.0 to 1.0) + test_split: 0.1 # Test split ratio (0.0 to 1.0) + + # Output Configuration + output_format: "alpaca" # Output format: "styling" (raw), "alpaca" (instruction format) + output_dir: "./data/processed/styling/formal" # Output directory for processed data and HuggingFace datasets +# Model Configuration model: - name: "t5-base" - max_length: 256 + name: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model name from HuggingFace Hub + max_length: 2048 # Maximum sequence length for tokenization + max_seq_length: 2048 # Maximum sequence length for training (RoPE scaling supported) + dtype: null # Data type: null for auto detection, float16 for Tesla T4/V100, bfloat16 for Ampere+ + load_in_4bit: true # Use 4bit quantization to reduce memory usage + token: null # HuggingFace token for gated models (e.g., "hf_...") + + # Training Model Parameters + training_model: "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit" # Model to use for training + training_max_seq_length: 2048 # Max sequence length for training + training_dtype: null # Data type for training + training_load_in_4bit: true # 4bit quantization for training +# Training Configuration training: - num_epochs: 3 - batch_size: 16 - learning_rate: 3e-5 - weight_decay: 0.01 - warmup_ratio: 0.1 - lr_scheduler_type: "linear" + num_epochs: 3 # Number of training epochs + batch_size: 16 # Training batch size (adjust based on GPU memory) + learning_rate: 3e-5 # Learning rate (typical range: 1e-5 to 5e-5 for fine-tuning) + weight_decay: 0.01 # Weight decay for optimizer (prevents overfitting) + warmup_ratio: 0.1 # Warmup ratio for scheduler (0.0 to 1.0) + lr_scheduler_type: "linear" # Scheduler type: "linear", "cosine", "polynomial" +# Inference Configuration inference: - batch_size: 32 - max_new_tokens: 128 - 
temperature: 0.8 + batch_size: 32 # Batch size for inference (can be larger than training) + max_new_tokens: 128 # Maximum new tokens to generate during inference + temperature: 0.8 # Sampling temperature (0.0 = deterministic/greedy; higher values = more random output) diff --git a/data/alpaca/test.jsonl b/data/alpaca/test.jsonl new file mode 100644 index 0000000..659cab5 --- /dev/null +++ b/data/alpaca/test.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."} diff --git a/data/alpaca/train.jsonl b/data/alpaca/train.jsonl new file mode 100644 index 0000000..2af6ff3 --- /dev/null +++ b/data/alpaca/train.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to the store later.", "output": "I will go to the store later."} diff --git a/data/alpaca/validation.jsonl b/data/alpaca/validation.jsonl new file mode 100644 index 0000000..4be4e50 --- /dev/null +++ b/data/alpaca/validation.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? 
How are you doing today?", "output": "Hello, how are you doing today?"} diff --git a/data/hf_dataset/data-00000-of-00001.arrow b/data/hf_dataset/data-00000-of-00001.arrow new file mode 100644 index 0000000000000000000000000000000000000000..52b04b72bebabeab4bba7fd0154605e1b2d258e4 GIT binary patch literal 2208 zcmds2v2GJV5Zw^lSjIvwB9sPcI2UP*p|&C-QQ2*TAO!+#>^*zM`S!Toi?Je06cm&x zDfj?NK7x;+KzsrJVBYNA1{Kr8O-w6=eSe3?a6{Q%$DMt3jQ>E{_Zxa1=r}Y%kIsWoNtdO{`iYLMOS7TVCie4@ zn-60_PdR^9L|3v4?up z#9d#ZR#C@14g!W_W8oZD>H$0dguoWd6BKxlL6D|+P;^Gq?6j#J=~PB!-8@qN3yUzk z5bk{_b2>Pf(%8hY#GM>2#AlsJ6-CmiwA)wn4$UUg9azeeBkG$ONqFZbqml7Bn30?x zy_jfAxKbsfXih_8IGHRAAK{Qf9^!cj<=PBrrlV-%p&+P3lWWd;pE09)n&87@^_ik6 zhpc7P%@2Hx+K%w;J|SP}Xr$~g)u#$xa?-x*P$G4zMu^c(@)h{h z7kC#|op?8)0s+vfLsov0GI`R!z``2UCB(!Y48 BL1q8| literal 0 HcmV?d00001 diff --git a/data/hf_dataset/dataset_info.json b/data/hf_dataset/dataset_info.json new file mode 100644 index 0000000..6bff0b3 --- /dev/null +++ b/data/hf_dataset/dataset_info.json @@ -0,0 +1,24 @@ +{ + "citation": "", + "description": "", + "features": { + "instruction": { + "dtype": "string", + "_type": "Value" + }, + "input": { + "dtype": "string", + "_type": "Value" + }, + "output": { + "dtype": "string", + "_type": "Value" + }, + "text": { + "dtype": "string", + "_type": "Value" + } + }, + "homepage": "", + "license": "" +} \ No newline at end of file diff --git a/data/hf_dataset/state.json b/data/hf_dataset/state.json new file mode 100644 index 0000000..711aac0 --- /dev/null +++ b/data/hf_dataset/state.json @@ -0,0 +1,13 @@ +{ + "_data_files": [ + { + "filename": "data-00000-of-00001.arrow" + } + ], + "_fingerprint": "4e028847697e7b16", + "_format_columns": null, + "_format_kwargs": {}, + "_format_type": null, + "_output_all_columns": false, + "_split": null +} \ No newline at end of file diff --git a/data/processed/styling/formal/alpaca/test.jsonl b/data/processed/styling/formal/alpaca/test.jsonl new file mode 100644 index 0000000..ee6ece1 --- /dev/null +++ 
b/data/processed/styling/formal/alpaca/test.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "That's totally awesome!", "output": "That is quite remarkable!"} diff --git a/data/processed/styling/formal/alpaca/train.jsonl b/data/processed/styling/formal/alpaca/train.jsonl new file mode 100644 index 0000000..93f0ddf --- /dev/null +++ b/data/processed/styling/formal/alpaca/train.jsonl @@ -0,0 +1,3 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to the store later.", "output": "I will go to the store later."} +{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"} +{"instruction": "Rewrite the following text in a formal style", "input": "What's the deal with this?", "output": "What is the situation regarding this matter?"} diff --git a/data/processed/styling/formal/alpaca/validation.jsonl b/data/processed/styling/formal/alpaca/validation.jsonl new file mode 100644 index 0000000..659cab5 --- /dev/null +++ b/data/processed/styling/formal/alpaca/validation.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."} diff --git a/data/processed/styling/formal/test.jsonl b/data/processed/styling/formal/test.jsonl new file mode 100644 index 0000000..ee6ece1 --- /dev/null +++ b/data/processed/styling/formal/test.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "That's totally awesome!", "output": "That is quite remarkable!"} diff --git a/data/processed/styling/formal/train.jsonl b/data/processed/styling/formal/train.jsonl new file mode 100644 index 0000000..93f0ddf --- /dev/null +++ b/data/processed/styling/formal/train.jsonl @@ -0,0 +1,3 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "I'm gonna go to 
the store later.", "output": "I will go to the store later."} +{"instruction": "Rewrite the following text in a formal style", "input": "Hey, what's up? How are you doing today?", "output": "Hello, how are you doing today?"} +{"instruction": "Rewrite the following text in a formal style", "input": "What's the deal with this?", "output": "What is the situation regarding this matter?"} diff --git a/data/processed/styling/formal/validation.jsonl b/data/processed/styling/formal/validation.jsonl new file mode 100644 index 0000000..659cab5 --- /dev/null +++ b/data/processed/styling/formal/validation.jsonl @@ -0,0 +1 @@ +{"instruction": "Rewrite the following text in a formal style", "input": "This is really cool stuff!", "output": "This is quite impressive material."} diff --git a/data/raw/styling/sample_formal.jsonl b/data/raw/styling/sample_formal.jsonl new file mode 100644 index 0000000..0a2d5a2 --- /dev/null +++ b/data/raw/styling/sample_formal.jsonl @@ -0,0 +1,5 @@ +{"text": "Hey, what's up? How are you doing today?", "styled_text": "Hello, how are you doing today?"} +{"text": "This is really cool stuff!", "styled_text": "This is quite impressive material."} +{"text": "I'm gonna go to the store later.", "styled_text": "I will go to the store later."} +{"text": "What's the deal with this?", "styled_text": "What is the situation regarding this matter?"} +{"text": "That's totally awesome!", "styled_text": "That is quite remarkable!"} diff --git a/data/raw/styling/test_formal.jsonl b/data/raw/styling/test_formal.jsonl new file mode 100644 index 0000000..7d6d9fb --- /dev/null +++ b/data/raw/styling/test_formal.jsonl @@ -0,0 +1,3 @@ +{"input": "Hey, what's up? 
How are you doing today?", "output": "Hello, how are you doing today?"} +{"input": "This is really cool stuff!", "output": "This is quite impressive material."} +{"input": "I'm gonna go to the store later.", "output": "I will go to the store later."} diff --git a/data/raw/styling/test_missing_fields.jsonl b/data/raw/styling/test_missing_fields.jsonl new file mode 100644 index 0000000..2005649 --- /dev/null +++ b/data/raw/styling/test_missing_fields.jsonl @@ -0,0 +1,5 @@ +{"text": "Hello world", "styled_text": "Greetings, world."} +{"styled_text": "This is a formal greeting."} +{"text": "How are you?", "styled_text": "How are you doing?"} +{"text": null, "styled_text": "Empty input example."} +{"styled_text": "Another example with no input."} diff --git a/pipelines/__pycache__/__init__.cpython-311.pyc b/pipelines/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..11c84889e3fe7ca847d5d00ec9fa53b86d27b1be GIT binary patch literal 184 zcmZ3^%ge<81fTa#&H&MmK?DpiLK&agfQ;!3DGb33nv8xc8H$*I{LdiCU*7tm#i>Qb z`nie8N%{HN`Yx%(*(Lb}`bjyd#l@L<>G6q~`nlOD`e~VYsk$YlAR@6iTfZQ)ATAn}2jk&*EO1B@tQ28sayB&9Di literal 0 HcmV?d00001 diff --git a/pipelines/styling/__pycache__/__init__.cpython-311.pyc b/pipelines/styling/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd21bb234da2fa28b3f421778cc872160c878f0a GIT binary patch literal 192 zcmZ3^%ge<81fTa#&H&MmK?DpiLK&agfQ;!3DGb33nv8xc8H$*I{LdiCU%~pJ#i>Qb z`nie8N%{HN`Yx%(*(Lb}`bjyd#l@L<>G6q~`nlOD`e~VYsk$YlAR@6iTfZQ)AT&M4u=4F<|$LkeT{^GF7%}*)KNwq6t1=;~}MKM2+_`uA_$oPQ)Miemv F#Q^?{GIRg{ literal 0 HcmV?d00001 diff --git a/pipelines/styling/__pycache__/data_processor.cpython-311.pyc b/pipelines/styling/__pycache__/data_processor.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5e118b1ffbf3eefe0e2008e32b063710c803d75b GIT binary patch literal 75235 
zcmeFa33OX`dM}2JBtQTJNP;A|0^A8oBDL?7#6{FXky@ygwnRa^phStAEd|9#*8J9c}%4!{5W&)&Py@DDoO-;g2svnMp4 zUoq%(@8~!kryti1$zT1Dftig%CVUO!ITN`|Pjp=yIJPS&)F|+j^buyZD!XL*9TlTAMVf zzX(;AG;71?b%A1o?ud>vzpmpf-$spm7yt6_+xUDpY0wSTGk^K;XVv-}YG5ulxYz?F z-!>u?{^j3LBXcW&nMojjM%^4O|@yv2oPI)xT~U+Qc;ke4_@g5&AZ+iMg2rzJP7i zP@XJ?4t|^A-p;kWt{d7M@Nm9cte$uq+GMk~rqGW1LH6(4_$kFO;M2!Vr~IKCzTCK} zXKW-CH=P&@hT`Uv)1k4cN&k2}=iDSTabxG?y?D;qnd$L>&k)b+@rV4uKqzh-pYn6V zoNPi|7SfK4`-4IFGIn-##|!+|f+5~N5}F8vZcK6TMMOhb7Ab08;G7jVf*xL7h=a+yos zP(F*n%uJRcD>LOYlXb|(Og3i1_+h33W-<*GAV!BKEeCVKxC#^^(fdR%Y5TRL4vknQ7Bd zJu|g2lW9=rYmeIpL-)qVCa-r-O^%LT4}0YA4DYA@Zafosi609EyrCNbWNLhT>LujCOFq40lU_e% zbizN*@``MYO`;Faj8GBc)(QXJ;h=v4BO&%AK0^=Y;K#4#u5#{NrON z1;WW)J)WQH_GjbyBjW)-jfA_Q-nfeoOibMgz+Yfw%s)Ona>LID;}+yBz>i?CeM;*_ zJbwa1XFM=@J#^#K|A4Uq3uH&Oi037$Hf~QAmetME+_rYA%6NXlnj7QeIV{6*^NrC& z&TP<)_(yI8hRNBH%IEMXKQ$3AKv86$%=|QGCN#QfcibG99GT)!cTe>j(L{mqu?bWw zAO3H;0?4^{Ea>%5qDGbKLxuZ8UM>(E;m588;L5teM(?zLj1O?$m&QUj$BY~;V@E8|45Vr)Tg2SPyTY*VmL)<*f zdg<_RJb!q2Vv3s?4?u4l9=<)}AE*8cQ{M0}H#IUm%okHj8c`>4$MCR!a&jufx=0XC zE;#YI1WE{$5uiDTuOv`SfL7VKacnZgdr6lwia9vM(<;i>6KEh?)3vFoalVnvO$1s9 zv=Z1zppC$00$T`dC9sXa4g$Lf>?W`W;3M7VfV!ub+Rp_8e6W4OKZ3|_wf6*qw?b3X z?bk36g6ILm{;~FnTU`6-*koW+Xodhrc>DC&GO?9!-8resxv?W!q9i1CV#hInv5KmS8YbWOiQ}I;PzBVx5trP}6(&H&{ zBdzbMid@Res?L&=%fM$$SE?(mlYJ&CMvj#4NjCZjG|Ere1OGX!QtLmh{s9x0 z&lzdO3FN6)ef4gLGX>0CPQb$D2J!(Ob=+B9Z!L%FH2QBz8v(sAZUF3M@iEN8^P z8@T+*+=%{WPSXFIPG)Omwo2S@79zKKNk8hZHg`R`dE@0RFQQztd9&t-Ip|qO{%nyv z#0@VT+&Ct}OLfA;w` z#qw1eu^?X=J${{`%Od&NLZcS#&#A@tleHM4l|NbAwx~6!SAXAda&GP6GvrRWYh8kC zw?{c^kBVouhdoh)_I2f}@Yj;BO4hRL%-8+jTW8x;MeNzyMm_7yIY;$e_3Idy&zV&U zG@%L+s%9Ob9FYRfd5g_KeASwx#1kn%zxJ|Lv1g9Th+|!GI1oqeT5+ti#_A|1zw%h9 zk01}3p(AGg51yYs*TC}Lm>E7qTj9hnL_6Zdk866?45u9v+F1v4*yyY^Vvbl-n}u21 zI`Y^|a~RhWS@$UAd|d0lP$6#+dc&HbGsaSu&>P>+)0RcAyMKM&cz>IwF4AT*^dGja z%B!w#Q`nQ-i)gkYK=63kfj0P&k#9#W$MbOrOq^%wz#R;QE&XF`^Y3jA=QYc|nb&aT zS;F$jHr^3-$njnacmp^^-Rm7uFoTGdh69~FKHm<|r_*EC7tviVp4Ep`z9M>UG0*GX 
z(W9s?PB*CgNI&qAK5oPz7)U2W+va#a%e9P_qJnz8{9$~aTJ~QLOa|^w^9RB^HJh=v zM7ePKmXQ-IY}@|vsS*Ep@Iadq&W~tUrxk(_TDsl63r3O%{H|WMLKh&~Ft z%-HpWzHKfe3WDxA9@9=s?W zy!hzgm6d~6#DiC*gID2W;Wj{;VqNt=j@sm}p-B7)K%(;aQL>4@`e*{<-;Km2lybcxXsEH1z1u)s;h6#Y5MmL)XZM9w1H0!#uZlY6hpau$K*> zpp4C8GakA!Lue)Ni{aW#ABo|SL_vlN(a{gnY$=CDPY)Lx0S=A2 z8L(wQSb3l7_kK!!Eu80NgVh_h?3JVIXdd{Kus-1e%ni*Oy>f&d%}-5a_BrCY{^{w! zBnLFr%tSmFv(8K)7|$J_x_&*tqbpBh@=@?on5`&Heh?7?EjBqb5dgk75YHb2UMLvy zPmTn54sQGb@{q&(C$9(MMqo01Mf_2+KStmhoXN!Gyg0#TT#XfO08v&E?`#xmT#z12yM} zl{BpCOqK%-%pHz7Tnjyd!wVI2mcC2 zJZFv7w+m%Q=kLznT{cYGl~R$RTR zGuX=?qo~%mtU}evU(1`<&u@LCGA*AX9SMAt3JbxUyFin&@AJ4F{NwdIRM$rkn|3Uc_OtWk>dSS~twc#%{u zN>3Wmc3HAr7HpSM-pZ=EzQ@{vHc~+w<$^vgtBp2%r{!BMi~E)hV%6ptCMe!Tv`%=;FUoH~s_Dgm9e>x(T zbxCDi+GuYtp-OI(IyaoqOd=Js5na6G;sqC16;Y(RNs}9Tsede+k^C zN4?Z74#6bW_$9zfzf|d;JMkDkRCD^`O_)T_hy++QB2|sd^*?b`F8Eg*9>L*J2F>+o zM5wt=3K^9(HP=Pw4as>!aNdYHsupgqXvgMQbXKStBXt3*FVobFiO!pn^QPduscPI_ z!QH!T1Ar23MfKf>zMC( zj2_?jR^O^lS9{#BSSx>JxWxVOjem1v`Q?XQ;?9%O&XdBaLE-cuptSR>+MmGPc*Kpb zaO1+nJ>lh9k$XkrUV(>0hHj*%>y$ra`UjmZpzk3|B&LR+>2xK189o-l(qWCwwXW)$Q8D@kUh32tkuRzB zur;S_IZH|}`?$X$q{1~u($0XX_&MNuK|laZ@2FwafOGB$%ZcSp+nfzJi+tcL^3pLK zv-y#HejsE~DnJ*>*B-t?Dx6e?TM~hf;5U^#KGPSq3j3w|;;iqPB4*B}D!qlSn(uQF z6O{<%X?fG8#;dVX*|U{OTLPT+ao*<&QZc{5F8%NOvbRWDdpPbI?y64l&uR<8WsVp} z43zI@`AbV7qs8a-_ZuTdTqHQdTRK6QaW5S3(yag;(&?%g6czyO)3)Q+4ELK*Cwg}H zLIG(9%C?#~kI|t^-r?o2bFSv(@HBt_IBIJIx4H+yTeMp_FQKjp{&K`K$x2wvLuS$dx#%RuTEnM+W(nidtiv@;4B( zrf%dD>E`uTN)o5?-%&htrbwPe>(2`HXBQhD^!(}3pC5gA?q?@|aS|5MPJlGUs)he5 z!m=Lc%TH~!`P*d6+LbDerxS17FgeZB@s0m&0uul}d-_Sj%$hMc7F5L*ChzQ=P#_P2 zq@Q>`>mo9xiA~ktaRx|sb#|8E-$WEan&Z^F|HRJUk=Ap<)z7TMC`Dz!N<^D|v#zZC zNo`%sQ@fgDbQi4Z2;}6t^06~><P+N)VvLQWE8lt^b=2Abie4?EO|CRR%yL;ALK3CMDGsCyW??P!w0^_6Jp&S zscz4zCFh79Nf4YL5bAeIu3gLdqO1FhRj0192|Km&?by}WkLqPOpYzyTukM9QtuTr8 zXC*-IS;>2L-X7a~WWGS~v`fy-OC_Rnm*Cv>WYfO6qw^OfTg^gPw6(@;uK7!n%@cJ9 zw#`tIZSxPim-D}W@&_kHTeoZ{+BQG#IP{?6N5elEMp)+EF4@`^5y4)lCyuf?2O9t& z#_RPJ@O#lM_|IDb%EqeXMM4BLd 
z%`I9r?8tGi>HrqF=(a`E;@OY$qX(rL&=$aM`7-+mLs#VQttnwTD9qdyrteC(?lHR* zP-0KU$eiQ~p}H8szw*mZ=sy2_XflWo41pYw9}w|@c}=nc6K8teq(v1DSwL1G6D_P6 zj$2RMxm<2W-1%G{)<1K+pbI+znr%*iAQSXg4V?+HOYb!}co|71Ll|>JV?b1!Amx$q zE`2EL2xcMWM|)f&{y)zJ02wmY44)yqmttmH4n~?FQyq9Y(2Rn*&ClPHv0aR+tR1_8 zcDf!&AhZWqC&XUE6by`yvQ&eVU;-~6H&K4V#o8jYk>&qD*`Qv{&>&vbLaH*_&wmp0txgOVU>fSz%&?b4<$W|smqEL^W&VF9rA3%h*S**Ggk zhb>-mR3>BX1sSVq7}hI;2K)2`Qa0h!OJ2Q4XRfU=3T>zsHVU7XQ2l@c1s3uz=uqE2 zb7@y&U(tt5NuN4hl;X(vfwD&xA8u(4+C8IQXVGa+fGXc=T9}WG+N+aULx}c9iC@I9 z9*P)k%Cp9GmO*160}{%xq^r>QID$AGN`Ja-^U!9dQ7AASw}slg+M(rQvAukT_dMXC8y5R*j-mn z3x7j7{hh2iy`rf*6)q*!6}X~j_JizWdaZu&{FW+4K54#TKhXPNTZ#55N$(?i-DHy{ z@7eQFnl-kv%-G;h88@6e(c}8Ka@LMVFmlUP?U=WYlq)ptn1&TwM~_PjfgZ=$G3ND? zdo+1{-so6QUMqhyd1a#`Rja9-W*;5v$!XPZD5o?!QaMew|9VD8^*{UQpbeXLbkt<_ z=XH$^4@#XGTei`G`7YJ}c?SG5~pQ=M{<9cg?+mu=Uw7shBHx{#99dpFC-aM;o=DE4IYSQ>J zJxyu7=ce&Yn(>G=jQwJyfFRu&aT^o;rzc{Cp6xn#+_P{S!)5ScB_E8kJAcv%I@qL- ze9|)?1t}lv=vQq-X^q}Q6qp+rooZ&!X-xGTqV7aqVZpV3yjjcc%kIP+v(Av(zf@4tyWS5#ff@jpNx&B`MC3}pz ze5j8#a_^We%$EDfOO$(6o=^YEdHymzP@Ct1HS?^laiyNu(T}!j=i0OrSN1tgOI@p3 zg|)GvZ)Q6!K7U8PT{BZ_`pT~m+08+E9}@XuI>J-?PZEQ^$_ zs}2t*>aZ+Z9fEe^$LnZ^Uer{ws)7H=#EC{A;-Fp^YEwMv8g%@IS+^$c?0CH++PJxW z5qGwnenrRaoOQqAeo6n5?ym8Y?j^nNX!xKSi7XTLrsI(1%a_sdz}+#hV<(sZZh(c9 zL@|W75=NQ`MTkz8c=Gwc?HR^z4=P@upqT&$86Qp8Lxidh*HEH#U6Rdtua{(fDAEGL z)tz_zW8;1$=^&BrC!*Tx<@-@#GFM3XIpP%eHnVG@1iy%!_owD0vqHFTU@FbWD5I-R z#aSCJN>34_(B@<@!!=%R3VgL-toPpV-wAmA+W6wGnP6ZTmsG=Jqr?7W*7*TsGhCC% zVSO=^x3fvMlH2UHtR0B-LgQf~jQhM%WBjE;5 zTfC-Jxf5(aNq0YP)ab&GF`k4yDSz2LbfBJ67lbGf(aopY_jxts&Q>E7U${kG7wQz! 
zYk?m_QJ)qjyUYy~05V1Hd7m~V`(c|`E(I~>DAmp?5?#3MI)dZ3dNABFNc2?RX^80v zOosf|(eXI+8eD$PpyH>zTiP~zTdt!kh6{ad-T@|9WGqZ^fGejNas2yj?DsbZ6M{qj z?BLVggOENVCrjg&3xpUke4DqEy9pi{Q28_Z6)zL3LYJ9)w;*+EP{*H!fAG8IFv&0pI=6V-xHRCpTZ*C_T`Gq3VII#zW}9yPS+fyGAeNK^k39Js~m6(>Tf5jAIVpIE zWN-v$#zXPkke?^Uzg%J?4RHLQQW^h@%E-uF{t5k)6)e?1A>$3XRp8?7zvzBW?q~F4 z$!b84q}o7_qz_E}vWY%&tc!)}*k!%kKdVyxQ|5|F%&lxs0%o+C5t8aNIx~)?ln8jx zk7-V)MJ!x;m{>NMHEjZu_C7b0dKAyc(lHZYe)uV>xM>1(Fl0iCKqW+y=}egD5}al^ zjAx6|NGVgrl zG8NDZw`DgAbPe*o^67y zD#1o0xQ`3F!9Q~xO0@M!wm!ku2U4gCNOJPV+!df{tt6V(%7VE*(6lB{;h%HfGf$Z0!=Nd!R(;Vaa(|a2|eK>J@5-7kj0R2l17a zSb9||y(&1b65~&)cO=EN%l#}yTC9@ZPI{2Vc?C6&| z`h}_svJ#yaCFe!KdGYba%}bTy#tvy?2Qqs?bovCRFIHG46*dX2hgJ#?35ACqoD&AF z2!}~g^nv##Rk5G?;mGm^ac8fzvsb7(B`eW+T5_HioTt|lJLR+BEfC&vqB9^l1A;T4 zWW8J1(kWDRLy68F$=M?~dla|8Vy@J3@Ik%Ua#*Me$VzmMO3qP%DXMYCHaPk4ywuML z*KdmbH-)N6S&7am$vGuBrxKOfDEJPoI1UMpLl4diCx?Z@q+*_?=$-doowq58p7~*L zdB3>psI&`AR%c`-ItL}^px_*Q+|;_bUu@bfHSI=S_KMDWC{~xv!iLV3!cL*E^TD`q zepKisMX^UfcBhK|>c1U)SS21lFC9KFR9%&o==4iYzu@#|MlWPWAE3Da2_tQ#sjfY^BOSRYyl_Q4az&`RCM(f7A~{C{=Lnk0Sy=6Wh<69`(t;_HwxY@5Sm{W&fFl#l<1B~?ug)yJlV7j zaR^-(#Z4C%awT_bZ0l}m>oMWPHF4`TvS^QO+=|gC96B#uu2Q-EGu>f* z$*$!W9+pZSr)0w?jV<3RUL5+!^RZ`nr?jtM^!1C415)F_LjL3WrVoRQd%yYW2d^%d zN_$R;&8Nir(^CEE1vBH4TnK;t;Clxb&#Y9o3zh9pwrpSMm8#leJNM9cTWs?-`fhq$ zRrig)cZnC!ciSM0j7cLivR;9&fO`+Si&ymL*nWa-jNupVR}37vTro^(T(AeJW?N)H z_yRz8hRzx@a854y#zN5rpIp%F$^ncS4&5!2CYFigi zf2%rL9rOAYPk*x@YEc{$l_|8$JO~4jy5vu2ZU$wISanCLx+7HGL8WXyq>oiK#hTid z%EhL=;NieBwYlOm9l$>j5ZxOWEYUr`EYNN0h*dSl`p*d${bIi#jL9n}Muiij(Xpl7 z%VXlUBVyB00cw0gj${)>vWX(ugh+e~c~SSOO}BZ^f;D;sk;b;~gAUBzTVicHq3adh z8wB@;Cm3W^QuWSNovUOQ)%NZ0)qihfsr93&kEbve4_y%3E{LrcrPhmL{UxdX5^9_B zMFo(PZKGt{DA_h7+lsvJ0vk$}WCiyYN~~0>24}VfiJ{f>-`n=R;L_fYUj6vh2c=Ty z8L|D0xM5J*Feo;hl^V{X)!GiQ#M-zU}Y6WoVoCE9u=Td!d2MQ3wXMfVAgHmKOfw#4cj zbCfK+AUIl}z|NCcX2>r#1mM%nv3D6_b(^q{DI~rOg7e2?Wnk}-%D~-jEa@`PFzQ>* zmo{QL?vYx07VNRD9m_Xq4my6BhRkJrrL9-c>c>TQJ56WScdZzvjh%+Yu>6(bgITrd 
zuZAuQSH^^!A?eDDICMuEx}$cL^%(f&T{I&A!UmA!4{X@^sCC~;>proyLuv&zd!c<4 zj7$VYXA{_mjzbc_XF4aOZYB7V1@HOg3IHe!qcelQn3WBp>w@IEAh<3l{;n^D1@CpJ zzt}FE8p2UfI(7NcscS2zu8F5W(awo|0jVz_dl6kXB-ahWbpu&)*GOf)rHbXwA5;rv z2cQzOIx^jI!LW2k+I39U%1{`hLnT~f_*Y?>)0wd|_Zd!5i4Ph24gV*7&KUk){-81r z8P3XXXX#w_GA2;?q>nl_sG<;PAM`)k-n+8BSKNM7+I|#%a8^brK!gHBC$9v9 zlfE2%NM6r`7LKN1j?}qpvPP^5NNg+lkIIa2g4!Bqh7(YttzWYB3$}j9h7fEuUwq;K z(~`jge!3)}@qOaW6GHikr6y_XVSJ$;7RujLa9iOQ_^sw37`{P5hyD0w{;eF{-{s`r z+H3qfN7Jnx#=qN<13lwT(XhVI37IYyKmYqY;4uxv)D9wgSR{YhS!>hT1|WJ-A62sf zC4;Al*JJ`L|7&)AZ2~#H;tQ+>K)z)Dz-nN^wWCqP{G3K24|h`*L3cU4q+vqH;Z#f+ z(VWb*KsX1>?i5hif$z540T!_#u zi62+;tns5IY}EF82ub*3kkjTwR3ZgHP@0Sw-!;Z96NJi@vAT3;_S#7P`euvEFR{4- z;$hW-8$lFVmCAxP|9pq0tTt_Vxbg%!Uh6u_P{GIpD2%+|?Gs|@U!n)(k zi`cTw5cluiUxuhZ|G9w#%e;)NZK4oLMT~HSwKvwix#G8C#8$zzhThU z)vPn({H@A}JzA9VmC{9=+PcFWU%=HcBFuvOyCFza);pjpxy42jU%!qT@GxR)xo6}t zfwvhm&i(5Vd&C$qF*2-FhSun7tdDt_*wb1P8VUmQj2I(!{$H*mZ}saQeGR|z=(B5U zRofz3H?-=8S<5=|-KeQkyLz;upJmj>{f!Xv`EQv(kfx_w*U9GU4F;wmDI^qt^pJ{< zjt=jsGlx%|IoW-9aPZi`5pPfD*-kI4pIT4J!ubhH&tF38;f|L{c%vI@+9L2jBCre) zt~i0W0LUX5uaufauiW$K&7 zg=vaHC!)Wj4_08)l647C`_wkcQllad140~ep*oLbAUlL++YH$uGP5#mue9`vxChPtCW6j_=Gw%XZl^nR845~#lXsro|HwM*bW=K-mnoP zV}QR!b}s-twKLJK3Av5lrzGF>l)|qL>!(FYf|QNkAW4CR9seQ4=27J`F^muQsPk%~ z(HwRpMsqR=U$N?X?Kb6O5CDQ610jBejBFsV7X3>yPLONP3=-yfn#h>&8a_xmCeP6} z%muE^T$iOP@;v!=;(dZ7%l$A_20spKUtt=5ecY)?txVjiv70IRDoSEb(Aw>RNOvy7O6O?F!V3A9L?GmAWAOy~AKW+fJHyU25{b2S{^PZLF zJ!11dsd=ARzhA1~FF3slqJD4*BH{)~b)J?rqVufeJS#ZQDt=}NU)(2D&5#-nNW*+W zUo|5-??}!&g7eOk;&Q3DQP|WW6q6#9%Zay6V3^e%F)WozTj+)wS{VvwA-p4HgnmYU zIN>HcVEC3CrO^ur8^9PP&?K}S5Q<5u5;$gH$7>3Bj74(HAe_IdHbVcBEQ>_(K?xjZ z2{5z_^uVuHdod*|iUa|^Dtca(0IOb=s$QMze_ZWFT*^G8)-z-nTo5rJaxUv@m>zI$ zAb~|7>5ONg9$c2cqV1eyJ15xAB^sLC&qFI*96};+-)jBl#t(4kVk$%| z0AD*k-@Q;m0v;Q`S1guolFEQ^G?wfE?mqDS)*rMk2Ul>RxwR{SIushZ7K;I(mP#O^ z!nhRrL8WX46y4pDyIWxD6B*YiH0+`PyP%eGsj0QsCMT{?M`Imyja^K)%MI6#ENT$;!QDE z-6NN8#pPRU6J2{G*B-&OC+4b;LJVcg{K@%~Ph7Rp)1s?szCTvhu&7@t^9f}>rHrSS 
zI)#SQP!Lkez7My{U(tO=a-R|0XTEr{VGr)37a)wZMb+}R1^4ZxR{=;3T~BD-w?!K- z*?7Um(-5yg1>3Lal}0DC=64l@6l6259ZK<) z5z3@FN{+`OzU zQN50qa5JQ9hBToFNhvFT>Q0?(ynZjSf`D0toe!>{!|-1aphJ;-lyW8xE`(iRCn|Q5 z3Ril)Yv3G!axgZNOnj{mHK8KTUg2u5w{sNIQt2g0$UF^##{-8hcBsJFj3*33*r67S zPqx&?lyo8_J={t8N`(tsD2R8{0ptQ1$nYoUg3uxrUa#_;M5Sl#^u(HjssNIQ2%UEQPa$l4=F;ndRdgt@aE}0(t-J^;M!Fj9cYA%8#!W zD5Qq+(aoWlWna9J4oNgeCy!b33F{wW6K=|mYDqt0{$CWuhuCE>QE{ae?_7NA;=-s@ zvw5jbEZHHI?3l~P^NH`=c>ZPLCUxosStHu6NVY42?Fx2(wo<`b9V@S#J07z_ zTNA5*M0XUUw0xoBYr}63&*eXcc7s^jB$YPJS=Y$z)N;jx?gwWcbpOO7luwbm#0h~L z!srb^>C#xjvV2OkO-r_E!8T1v7op%dI#`1GSamb)w|&@emsdY3_pOxs7F(A_#qtiR zyaQRQsCiV;x>C^!G4i=$#X+g!;9OtKQ7k#U(ase|qu^+y-76z_oLoNeu>RrJhxO9F z0ik-5)TI%DyDkK-14@@}BrK~Z6*4^~?Cs@~54#_pdDt!OKP6P(CiTK7i2H;akfnc9 zdf`^Wvii2@XmxVt!zCfY(1AXyEyXI z^ca3xzkKsY?LTRkO)?aQu1Z66y%$!tT@bciSTlRQ22eA-Ebi-*_R+wHp{B99{StD(>lt26@hAJB3^KW$fRw=_at1*oTk4IKlAPKKNT&suBhj%o zXSN~Hvos)-_dqG~_@6j{1F4n|3Utw}IVjMbE;}e7Q#dGSUw{fJ+g052LZ3Pu1hGZM za;>@%zDcpL9|zIycs&Z{)Z6b-WjmkrMc;4Z|GSC>_;U=YM<00?jvrY!_E|zH_q}}S z8hJ8t;QrC(iTHsaoH^&IyPFmlCS-Fm!7PTSjWMS(A=hE!i@^;~_1BZpX zkIX7uoPo!~DnZB;44@qH%fEj@x<&x73cW{qs&qA4TNZE0=VSg1*RC@pE$}F8a_0Njt0;kBuC35N85^{O>}IQ z9GmA%iD{&gf_TyA-`V@t-mmR{d;eTd!i7jvESnz}m%P1mp*4DKu|X_umx|lxdY(9n z=d7&j3}89iO9k0SU_XHl0)+&qbyf0li6otm5}*d;X;ERNO&78GEpj06zu-Sen~~R) zznBTH3~yRBoiyeYk$_G5W=Pm1OW8>SujO{?bG!@bvNg$)glp1w4e6TXBzw@nUc>&J znkZgZz-I}W!RNunhuklKQiC=fMD)%aSJb$w!)Ng_`z#09XH6iFPO6F#{42lYaq06R z^p$^-$WG1sz3US9nG(^NpYX4VLlM!L7g)>P!o+p1?M~u6ac}!;5hrRzTm_s>5hvh!b_du8?zLqg)twc7w}vYN7`iYvRL(K;K5j1U|y7ldLldfuih{I^3|~ z9qu(`iBi93)9QVU-Y+RjR5hupdDyjgYO@B&`c$S|rNcm@ogRZlYJwh+1{r{JNINY+ ze(Ce^AULCEh9nZE6tRu+K5-dN@5=dKowo?2plTPAS>xmjnmBS2QVA)JWcOD`q01Tv zNefgK0WMb+hpJ?$XtKsprjg&7r;ahR}H5nuCpw}c2La|X}PJpFRH)U>1J?hGVD4x8N~h9 z6i;lKxaXemGnVE&HZ;M}T#&q>lk*K3?u0nt1p+iE@T%Ee@U8{pg}uncz*MN0_}$Yv z-H7rB=dE$%YM7E8k6UB|Wh6gkFY~XlJG+8mxl$6hXTBU2>zaiMJWLe?Mn(Tij2DcL zA&nr7Xm-w0acq^OPdJP@_%x=A;AKqq31%(G=GR+}%gkCw#iCmYT$}JyO{o^709BpB-Id7b(?8 
zp;gKzXM&2oBUa`SyxXFsQtdW;rLye^mt<4xeSaj{^7V=L!1~b(wT!C*`ikZwlKF^W zK7x%6@?Yv(DfKP3EcJ?|`=ru+IGH&r1apPF+8#x#X!gQ1WwA-yjOiPDVHlIYD4Hbm z$LZC(O$<2rN{JH2pfzMcIcuhOZ3xatNDuVkEt<5mPV=1UIceu)rXA5Uam{!LNxOGX z&p9S6)zr)7X`Ua#ME6`VcBX$~_#8~9mPB0~p{Z|}7F71*tT3gfdNL_Xji%PJx0nrQ z3?(kKkR4_vHkb=wR^r37pFsztE?LB+tr>M(&#j|eO-wl%F)6utZcKTY>~ooDz&s`z z5RY7Od5PNV0RlQNf&R3To(cZTB_E6`%?#0id5VvG(lZ}oRR_^-qAo$hiy9;< z(9C*EHXy!hj3y_G1t{~iV?hau5{tVXc}?oESSNe-tYz{gm9Hu_#cM>5FvTCcwC6xZ z1DH?Z5+Pd6%AN_cOrBH4o*qZ;nsL}7`4JlgqSI4jmdR~tY3D}rh!1$p(Ao#55lWx+v=(VL#yNQmN5zqlc9M6m3$qe8fZ1);r;yn9MG525}Ts-6FR z(tVdeKLLh=uO(eGfsYA%kHCLH;138a0ff7=G>6}(Wd98%>yT?8Ap|G5m;WI-wG;S|z#kDH zX-@nf6F3QgC#49%Ez1<3y%R4HxKH2^0WX0%0`&wO0NBSdgvn8|I7Z+efhPq1Cjx&> z;13D>I|8)1;{Qj0gb*D@Z^ZcbNyq;UT!IAvNgTKQl{aB^SRcZkj7^w!e>Y9rvA@Uq z3*1X`=h5}O?j_Sl$38wLxVmH|n!6=)w_xs;(VU?-_T1kyfBVgjxemfZ54<@bR9*$! z-1d*|eS8mES&25kWb+F)|C91+h5y|}{O=6d3&2rWUbWEkb<2B}IXhx@S1xS9rf1Vq zy;!$Htl25mU{|(FD%&+@dknWuJTTd@v`wtvDSCEEo?SA%#+;onJmm{pSFF`QVLz#D zm;-lPO{~dB-};!hp1#!yVhG?I%l!sn@RBr$mqtjRu$hoa{21C)v4z`!%sn+=7*6<< z{R59_zeb2wK-d5x|KJFYd^osREi=OUPKk}DrN+}@@foT3%$)hLt8BpzUfB*}p*g%Buk19Dq%k>VLvNjzd`9A!;7tgYZ&UmZuu*m1<3|kh~}%3`Kn+h@d0?piuI6% zOW4$6>YV1#}_P5 z%IjI!TD1%XIV{!AmANaQ)Nfnrd2m#C;j&PFMXbLf0ouF>Yi41a(&R7TcEhP_+!1rS zVik4KOJYS^tlERf8zJxk01Tat1&OrLPwTg#eq$B&vY&eLQ%`>C;m0#~bbbu9LKO|M z8lybC2S+ zA04Z5=|#!cB{p_TjqukmE73e4nFj>(fZ~Rn}O-y2EWooUx|2v+Ta&_xx>BDF(?_{COm5jj(7WJl8A z8Ppu?3v$?AFav3E2|J|OyjSw>-;uv5;fwcQwZM)3w6%hVFs${uExMrTC4(%cQLnqC z)8mg(KbmlWgJkk{7l(j!g!dIh^8Vh=dFb z%g|Ash|Hl2iJayNVc%KkWJ7 z_+m(G*e*3}7u`D~_YQ%nFCJHVq)J4mhhH^J@TLmQ1Iu>-pdN0KzoPr3% zk=Ehdp(6c)oK z7xd-C4bvQn&H|PTh%L33(9|$JCx_=z*|tmRym13J8l?GzHcaw%$xFSnVvXL3X#bk} zCT$3@Q?fhmzpzlgc>d$zhq=6vwzZqWe$yZo{xq+c#G&M@6 zb5^7OH#FKBUB~QdRL`!ddQL~E;<+j#{jGPF(VW#y>bX%H0}tL?;8s-F6W|CGGNkld zRNh9_H5g=}24V*wzDhgbwf~cvXv&}!aV#M@QT=w=J z?6~AhMkDM9qbGyUNcRtlG3Ah^VS;=k_u`HI&*?%bv;ayc)_5TW0#RSQ{N`rK>~Df&e(wD zn|9i^_-yGX>8Iq0&`*rnhsVwdL?{Fd1Qb>tnJ|oIEU)S(uK`imW|FR_0H 
zvASx&XRNtei4eCdBsZ5K*rj8c)muX9Lmlko>PfyyvX> z|7r5pq{-u*)a{B1lv3ZL$qju^?s$xBF3UMNMKQMx{`oQr-GhJwzM?NjxJH;^Mn%BW zk(~Dus3j0ZY|o)+D<^*h252e#AvCWke=$?LU}|4ARp(R^K^s09^lY$W4k~cKug|HB zZeKKguy<8Q`lVU<`_T1pL>RmPhHPo@^1~_V7>LoxJqzqL=7S&&7p9k9?ntp-gVUrH>^K&YqJrqU75_k~tdC>FF^^4<7n-1L7 zS*Y@0M?`t}tU;3p7v`)22kK;UJbb7;_$0~kpk-ziIQpxS<54?~pv{$EVM6!$Q@Es) zT*xS~4e_l<9M(w^WC7QL@TD9`Cwv>@#evd|?hvN!Me@Xs`{zUei4;Z~D6DJ@vIWl5)uIby! z6=wJ@TGMw?7T@3sT_Y#OoGT;lk~MvoaHSc(%hvQ=mc_SwP2X;=JR|OkHGNlbl^MRP z*7RM)RcH9FS<`n7=gIKxUDLOhtIhCTx2Eqp?f675sbAA?eU_LT*7V(w#dqVHz8krw zj9fRb>ARV0N%#HR;FJ&1*);Ksnvy#8m&2RLnKaxWgL;0*)`&61l=Lgcu>o-?DP@a; zaEQ2aC0dol4;QUXxGGjcfoM)a10?;>m)l4s<2DhlGE4MyoBaxNb655~v`+)?T(e*O1Xmz(b&pKrMlK z0u2CX8G}@>%rM1}5Y6P&O29{8BY_PBXwSfJBCweN-7&G}N%*a#+eU!herzWlU48L8 z0m9yH^HStg87NC*M7}*uT76tm=`fdW5Q;RU=#>Ec_e8?+!0s5v%u0)%%6&{i`_?C|6h6 z^o{m++ZQ*86+5MhokGRV)jV=A>kvYYYL{5mEwNW7RxRX|pK{tMR&_~LT|!mYs+F8< zx*DIH&hllkx=*U^6RP`G?c`LTgcA0RiB&hHD!evvbJam^PQ^{E+ACE-0Mp*pLUJfl z9G16fz@(ys?arU`OfgShJ_t{56_8>gHq$5&^WkS zNp4k2JuS_M6&+GVhfvY6T1`$hsThmI%KcL1exY*zs)wAsDJNB8wd7Q%^R#|r>fNcu zspWfO%`vIwm{4pWtUXhB~*5;Hj-15uA=c9o_9Tq6-yUn&SIgwW3`!_ zT9m3!HL;HzT6MJ>zvK9pL)hH;V3+9ale~R`w{LX=xoy;WH+rC#N}qY)6Uxs( zJ-jV{Mcbfc8x(AVu`Cj$zjEAjyiWJ$b@_co#y{U!+h;faxAq+9*>Kiurs;S_8|2SV zVT-M07vuB^dgq-G5kbc;_RGGF%An&nXu?nm8NaNoP#3|zBwuD$M$G zP^P|=B!1#lelizOFk|Ub3+x(~j);RZeVn7E$4XwjMohuJbx<{>(M{pv5)zj$dm!if z`4myIgzGda4$xY8H6=A`H)d%iql0Y54~N$*C4)#2VD$QMySje zAv*pT`c=4G&2vTcv)phE8|8e*Wml3){{oLv(9-$3zd-0|uiGxOvSrQtR=z(P@K}S*(y|cEf;oMPva6xMO&)=27{^ zmGX^?=f(2vQu%g>Ver(y_sS#B?iJ5&$a^?1dQM256OcLvmEfJ)y<*=W*f$`wRKDR+ z`R0}K&0wD@-z}BzX2B)TmPelbE1vz!Lk~+u&nd}sN)FGsrut>cfYtp%b-!3NAQcS= zwgIfNRo?dwJgPjnQh87~bV@jLQMi6fs5~fEj!Tu}qS-xXo^Ogd%0TaB*+p|C6MV5% zv~81Yc%y4utejZQ>_FiH%;zM8JuF;_9$o--s_5J(IX4PSv4_KUh($Z4qMd?mCsKB~ z-#+lDc*{!h7KogC&?{WJEEI1Mi?2w82Fr!2SK~&fgT?+0EqH;NjT>&W{ZnP~&%wPy0vMv%%n8l%MDwOKmJu*est>-Zfu9 zX8DT%uweK4bcfHJl4IZ~S+Id%_p`{8V?nvnk6RNClnEx5Gs{SDW2*=WJfUHbekHui 
z?3Lk-*@nYfz87gsd=**3MZddZ&aS!CH(PLd0Fb!SXR=aa?m8ksEM{Qd6mwQSa@MUl z>tgjjseVVSx;EPU-sM%9*Q^)*0M4F^P(ezA<1@y51$({-Ml_y!x%%zDB06mjlUfJ-d zZ0kzd)|jh8ay7-kY+LqrKYiS94}6wqau=aXx{I(FlvGNthR-Y}cjLSXXO=?mBWKf! zvnd)<&zG?cdy_IJ1zU@XkR$G(kqRNe!-=NSbYSWfY*wVHfF2nkyy&=Y0{9NS&H%D! z)m)awcsgsEGvGl8A^L?SrHb|E)>JUOo=GN|+=Ol>N8 zB}u6Wi5?`+bJW5Kix66l5jbK0Ksy)BO$cE&it+z>4B&V17<2?BOwGs;&I0yXl^I)E z&ZsgQG2@%GIA_A+YX=urh`leO{IejYpUr#2^4twH?f@cr+Bi50Z1!2_)G`%H0Tcx;-damJsgJrFfDh|$e~yX-sLaAH^lMy^3ih)UcHReEx8d0&%=@7pts~_CpZv+=*C8^}=?_ z+s>uPgYyqBh}+Id+s@&NnbB|B@pxl9-q#vhEPbLHl}q~`2~0h4RV*ByKPcGh*z$l~Y&`FpKQwY9z@h5RiNXX~*Nj^j@`OEj zYn<>;gRb^Hs!Ni)B4b~uF5~&BO5mwC#|slRfN3uoA6+~pQ{l7YXtH|! zGpbI%T%AwM4WODtWBr zfLL`}syZ#0%bz&Bg4xR|DsBn-@5q7-G&7QNNd51T6}s_RYnKKBE!53-@h|^?XI17H z;4xMDqz~XRaUYaV3<)A9u%0-7T|nUu|$;xG!-9W1Sx&C7{PpEnl-*+r~j$+^}~Lh0i+6Ve;d~g14Tk#p5YcAXOP&5SP7xzXYeIr>!#f5Kgr8W}7$@{`YzZ zdO@deWA)C6@`JZjsKOM2FhQt)169itWvt9gLdBJ4KK{O=XWIk7=H*J=PHLLNLRdH*}7BrNBNyC#-Hh`JO7{7zC5;#E6tOX zNU?Z`5-CceWF6FL9hPiamn|Q%BwMj{`H*ZU>9!SFq;17IoT8N2w48Xd8H~E&-O+fm z6LrQxIF0Te^|rIC9du(fgE^cuFk|%0?o?@OVJ@)&j2>ix`C|bYodp{CV|Rbwt75T= zWI5JiOQnxh?|tv6SMR-g?>lRxxe& z8KFT;vV)>KX}VD64<^wdGIghyNU5`ikSaq6;(Z274KI-^1essVm{6DsF0_Plu%Tno zHs>0T zDZ(N9FZfEak=GOW#&uj?G6hJ*`5P{=F!{!t2)`PdT7)8W7`QEZ;(ey^r^$K@JFO56 zDOyu#2H!YZC(b3Z<&$Gk(W!V#O~XcICL(ucN;$`wM${|DNnG z_E>Q(^PCvRgw{#Zn!$?Etqk`6j4y&-wj=dB1QBCP6zS6sOb}+MY9fc^Y8b@5mvzr) zzSO|BjHI#&r6Pq~8X}nz3~$RPSrJH|Ky6Yp*swu_gu$bfwICoKfycwnfhMw-QQUa52A+oN)6kU8hX?kCgk?e8gEj6VMnSGES!MBb8sS7 z`-^wQJ@Tfu)?!f-C#soblcB)^oddQM;xT%LFSB~=s`-9X(23ncVWei9RwGHv!nDXt zn3TZcftjG2Xv~V+;ARTD$T004VEAfs;RYKsk(^f9u$B28p7BRXb8HX`VV7(#Vsa`x zJu~x-*)hiSTq9P7{x7t)f`6uaK&DMckoKp$a1*h#Le&q+trU1Gl0&tFIWCuKjtg~y z5eNGC#;tQVz*&_^3C)y~^&K8Yr4@zR;L_a9FtpR~!?#ncoo4Vserhfh&%ug?G!>J~ zQksN8MrKQLMhR2r0@ds>tpbwzEov~ekgV-?6?yPoC@M^%V6yo%Gf$P$dAoA>^a5Y0P zL4<%i;o>ccg`?q(i$dWMvG9mg2x-?Wva2M(=4 zt1TR5nz*CpHA~E2@~Q1(+m9_ja;$f3!lUkSF>qW89DlqBj-2Qpk(dWnEY!B4c#CA& 
zl#iU744<={zh+PyZgh=V+b{~760d))Ys*t5c&e1=IwtI)Q>-|a`MJ!VVLaEDpeCPM zdvm>Nv+jvm@C}K+A;~v{3sf5jcxDwskr3C=BveDRvFX>W!kLkO`^;0k*7=}vwNj~U zuds(sv7(nMTm0d7qfIbBAr-c)X2tToAK4$+*P7SQuiC}@b}7GI$Z!7&GJF;C#9blVvgK#Z+- zfWLH=476E)*=8B&wEyye8IVasObjx~h`5t=N0}WG_b8%&$rVOCkHu>PlGm~XPDKlQa`#=he#G~^?ruKh6!YPF)`7Ywzexx^jFfOMnfXhU=3erC z{ry%v8%=3x2OL7Cwx9DQ-mO(6Y_l{wZ4$rTDb6AO)qHayK)XQa=k{i;r ztC(HdKaxQvHo#>V0zhJr0-R zqYB+6GeNoojYAxxWz(fXHfYT-wrkdrnp0l?A7Mq z{NVJ%)A-D5;~9Yu#LZJwJ>r)09}w6>fUT+6>WUGR|9~31L8+dq&7i(^*aBOco(_e> zSC?jIZ&x+uYnE$g=w(e1pm|-+_+QiQQMzRePF4ibyLqC?%X-Way1Gd~LIk`fvh^OT zDy!hXp{U;{uu6a`$V36;2O+& zhlw|lSRrbD<;9{)`-VU=%%Jz6#mJ}k^*&oNLK;6|@@T)vwM$&Pz_nwwRNwTp?#Nc% zkQ6S z;P7YJkA7KF&U_ij$`u<&#FAqWEpX+_*2tPa8{a7X>ExqHRq_cbi%Q|Iqp}9v)R?pE zxv9i{*8GgzT*R7so;DrZYC0x1^-E3t#K8r#3_#4UUv;bv#mZnk-r_+R!0Pd5@XU5x zDBg!N*4iPp9$T}lT@wAxF<<3VU;UPkxMRLOl5dZ|PI4vJ0--+mt|)2WDrw&sehg8F zlTyh^4fW#uUk^NX{Ke_Nf+d;r@+rD5NG?441(l4Ekwk&V%7o@22nPYp7t>y^D4Pt(yYL4uv$n{EGufX-j@(NdSg{8 zel|}$u6-PSa$Goj{vU@PzbExyB$=7ZQ^NFB;qq1KwQEBEHR@5-Y81=_b zqDI$*>o=t7S>f`m^x7PYNd?t@Yt_cYX6?_RPKwJXC<#BQpo6oPY>q=P#0pk`9{8MA z|3*95RaVds^`O*erHvB7wP3Uz^__U+i1tbqK&QaR1@iBcY56w;+;6Jb9V-jOs&~f% zb(r9`^G40qqHPnwXV8p7F1RK8irY4J32!T<+t{BW)B&lgZ6~{=DQY{HUBZ}Mxspr1 z$>Cm^UTt5we!pTH`{_i+a`x$v4L`Y86Ty?o{KzV8KDs6TcB&$q1!0MUh8HdMe^U>j zCw9V`5m@H3GFvd~AsEn#8k;53c``H$U%~-m^O9gYEsf+QR$xlEJOcMIso%y#12G@a zhfvl5NU5DXR53Pzl&SV75S1#gUXs|VE?v8Z1>)7I>5!t-#{V(U-@wC$yHOx$D^7F< zn=2>gtBMs?;p-v8Z70H51O}+nvz@Ennrzk_xp`?KlT{Lkb;K6~Osm^N+|bsFwPFJ< zj9U@*6|x>esO;t1QC5~NsrF znwXGHzxj6nj1_E(;wt?}-Ts3WOzD&ZJvJjxO=0??`iTRatMU{jLIZ~sjJ=!nZkEE{ z#g&Ffl@sYWzc@2Hvv^xK7`|BI8QT9^b3{pe(Z;*uvuR@#P$z2tnv=-8-c#zw_-qu(Fm1!;yE7^bF7J7pb?-dk+Sw#tAtsx2c& zD?XR0v}IX-KtoMoT}fvq4(v`NNdb;Y3drnE$P8$@l0DLusVfohPB*6#$rv(TNiHU1 zf%2q5%`Xvnn?Q1pdmFc7m@bmM$WIA@DBwKXAh8Z-VXsT(RVMk4G+@x{ly1kCR|Gzw zZuf=K?d(=xjB6J3XRNSh^)gudg{@LyE1cP7?t?72>yzN3Q2>1zkv zTtnPU3eHA#NFiV$U%31=sBd0c9r6uvQ^>H$*JFt5Nz2!3i0e(mEil9_NW=9R;`-8X z3k`8odL({Ohg;_gbtHN-5_V;bU> 
z>+ymW53_!pB8h}iSgFnom6|aK!Gg?V5YpnAM||qcV1$>_e!)P7nW2i+>5nikRB7sj z&!$10YzB3z*0;d25njzp;?=$+UfoOL)xRX(ZarSG;bBT^<2^RAHhUdy)~IPSxQ{nz zGXrstU}{?!;Wg>&WF$TkY}Vri_oOSkXzkl(W}89bJoDmCv(Pd~*{WsmWRQm2o9;_Q zkz~{8*kKV@f7h<7C+&^Xj^IA3XRwV-gJx8Zx7@YAYu7C6_D4>P&=L+hVDNRuheH9T z6|dXTF|iAoIRVKG+F8pwy2Bm47?Gy%#(nFW%2gS5TrKj z6gJ6%X^$)G)hCJkofJmfY`zVv z2S{qbp!qE7<6swZh4SU=xv5)|ve)Z4$DE7c+2RfuGoK9IT$sKdjysr0^(1){iRZ%O z2v1(RCt)Hd6nDK7nz?p;aWWX1p1O^@DSmG0CPmFG#67e8WSB&;mmr@p$=V}_{|vR^ z|2={KKwyKweDH&tss^bTZOZ$OeI?pzGbq2-w9Ut@OMr(#Yf zYsdebaw3y#kh;^bF67r7%=n%mc?gb@;{;s6PEC_9x=4wBb5Du!Mj_S`j#TR94K@8n zUi>pF0dR}LgkdbCr%#gBS3}ISQaoqrCbkM8*=VC)@RThz(c!x|}J?>666wBvdkZ+iVFzq7bL*Bv;QKITcW>F@;9!Wg7}8LiA6C$MxdL> z2(%_F8JoPKt!pMM5Y(I|FzQm4m#m36SzZ9!N_{wj%lZu89tU7!OWJB3n=y1MXOU`Pv=Q zY+t`6RvwTl4{T>6^jUsI)JeMV8*VYsD+PMDtq4UuJ!Na(VRj3?BbFbO$`5YaD4a8S zDwJIAij_yCN|2!`9Lhwg;Z6#7nLL##`R7u&I~_ld!t>L^JrwRu4=JpH$ViT}9#5>G(AiUYjuIFeH|BOJ&_L#VSRj)prvTy*>t> zn|$=%0dNb@N{;Dg1ZO4ASV?uX8w$F_H%oMHQ6>lh1>Cpy&EpE;)qmnBvz3cP}TahYX4T% z{*7L-YCx(Qhy_}n2HLg)Z5yp(pic_)r3cjQe_GeIRo5kS4~ccdQXPzB*yynVo||m+ z*Z{Z%@PTc()+jh@aAJ%^??~nCFs-aUc0JFjv zgjhW&RS(80Tb@?7ZB@2yw1}0*q{?GZ;-ta=pu|aq0pK<_Pvi6D?Bij6YY;Tm)02tWB_B+{7PjovL1{i+%EBF_EYhVc-a$(LB5^;JBUrM7l%?8rZE)8PQkLmbmhT{Cg)U{~ z4pPGKSg`t0%}e(-7zhj2?Et?X_`7$2-=J&v#vMFKlP+cR4pQz3wgg)->h|uSH}BK+ zthODLwjX1s9b=~>*!id{Rd60N196Dbj-MdpXKE;Dn;H?dq!n|m$4ad^GKiL`J3Bp{ zGuZv;fIh?w*(s8kN;<>lFN5Nm+ysM5nT*r#M(<|LpM)W_QnjKc?DP&nKdUSm64lf^ z85)ENKqdyjVpS67P0Y{o5$w^IOWve{DCgbmc~1UK?g{ZU7S!0ANXtYv)jMzr4K!op z6bz!GCo$AJVDu{(>f2(asCU2^P%zYY#4yx5U>sL4$TVEC6!i`mClrh_9fo=bj6ns1 zkhN0d8V?@EA?KL|X zhI$8#Q3Zqc-boDg4j5w$!xE*9dD@{Lo2353>)$!p$yo=hSwhpuU_l$X1 z8#||OjdTSLZM^!G^6T=L2TnnvhMAsnUf+fbz*pCJnecVF--HV9Tk1l~6lB6*a%sVE zGW=WfS$si0OFPKtZPZ>}!F+3}?*RLSJs^098S+rqi6&avw_#%kxxg=~V*P>RAf^Y-@C&5v0Om#@;qU>9u+Cs8Jk8JC zB(cf}*U}>Mi6WI}pp8mmmKr92;x>RWkDy%pW6*-)mg|trjO4bo5cR!9;r7thLN>|h zWF^bNq~tq&ePITb;J-lHh+9qt*`gNWAVvI{^kP_yQ^}viVj})T)wBYOq$R6XLpEsx znKmXzxeH~<%E5AN7mNh6W57pdq 
z#6xNnp@7EZ5tRZ?PfJ)fhZHFtow_wMw=}0!H&%3`36=3ST9{H~#5cji83(;Bb{}9| zr--+uMR9k5_oZMKT_7Wp#FT>Xj+>ATRB&C$Fce-?u<|Exof^+v4gu@dV%!mi1?dnB zmMw*N51I_GD+eqiWGcU{qLvmuG`H|}2>FGkXQpObrmus&7%4f&LXk5I4Nxetnh=H8 zHoNdnh@Xc1Sfpe?F*V5SYoTW)qfpVgnfVr(OBX2^#bZ+&F+?ShP|%Jliqec)l)U8o zV)o-?R3bsnJI4;`rVOyB~rDbE!X=R3VRz&MN8az{6zoxSK^jaV}qyT zwu^n|#!ijB61Sc>ckc8#{_k1-BMVF#I3Op<=6{LMxGfmEI<+*rD7RlnT?2I07iZ=| zjbYdtt!to8rd+aQn)^A7)2VBS#Y?cVNM>FuvdfS`*Arka7{@fi&b0MrqQj{|xEa{3 z6EeS*_GN}l*NFc#3@`FCLF$v5X(6yX;@vGXlLtmlk#W@}m|Tq<(9H>fnQ(v)y|n~O z<-tHh2oo2W&hRy(%?D=wkU?XlT|Y1Al49^?5>5<9nl)2IS_;iff!QW3&kHo!%R<&= zbj1w4SK(@q-%a<81a@w77Fzkc|;>SS)D*oG424}e@Ga&^i9XbRlqDzP+!;?@%gOe`!;%_e3jbS$Q2 z_q)^_w*e@-aW=~k#~fyA_i@??PN8pH!I&}EH6#wPx+l+yx%|3cYihi}8s1I$t^ z{y&bcml#~KPT@@|Gf05we6p4W6P_i_BG}c-VGqW~E%a!NXU^r(0hB7HxCI!o6 z%;J!f3;u@B!otCE7C9kVkkN!1xt9CcfY5b@#hsNbX9dgIWL%!mc1k(VTw*AfB?}6< ztSkPrY2m;Siyf9M!-8d4%|I!6ghh@@mQlensz$EmQ9|J3sYWCVkVew7)jhK|Wk~4B zTxY?*=NFwq??smFtCHna!SZS{q5HG*!a>6#-RsLj|94r+OOgc-d`V3y`1fs=2_x57 zy9Q5xs^;0|;=p^ueivHQs< zM8~5;VrjQj+PyNoGW;T&Pdytjjo*V)bRCjhhXmK5FI{=}Pp+Q*-tc`SP5votSi2t= zkas97zBd}PIVBquSk2Z0vE9wm?(UV`)j5%?W#Lc+ z#OfMPQ7mefikhj@K+g~0dybXtmGd#Dd$k!ZTXCvtxo0=p1^-!` zPwM1f(REI8ofBN=V!Hw>&eZ{!CWoRk@^%&fe((2t*N%wJM#Q_uhYR^{!~ElWcYCnb>NdTx&eRTT4>F6lb zt9^ZvuTSvx!4rx*=UHLkqsa%8(aU0Chg8@h6n4N93R32n3MxL@`(SUhS@gC_-d4fe z3Qs7AfFActSyhSB=xLDFBmGlWEeei^WxFQqUq4z;=^zcT3SkNgIbP5HX+r<>+Pe%E~ zf-b3`ODO1qClo}LD!xR7mVw9TlYU%wQCykYPU}mew^QimN|bc(4#%*tjhg^+`p2LQx+)p&+U@naVE~^hgCgLO~Ba zp&+W>6AR3f~OOnP!QFiHhuCHH&R5CsigMPypQvQhJ%~kqJKd0 z4+#DNctSzk9+SW3qxT-XC+zOtEEbFVq~boIxDTFC5Z9Wqti2SqFWDMl(c3P0+XZhs zJfXOA?hXF01E%~MF|St2tG#z}+fropF@>7(jT~to)W^ox?-)l2di1z$(;9cz8T|-=eEa za*=}poL^CGJ0RRwy&u0gMOUZf>J(g^EcYBCaG%2|a^n&= zE^y-vYb=@*ji4GE2c^J~%{eJBBDlwJiri_5J1uah8PH!xVA4j=Y4L~^F8N%=Zfx6`5Ic8l2aSj zoxgJVhN?`zs&Wk1XaB0XZMZ7?AF8al{_jUmxUaZPzjiyO?b*Nf*H2rr|Jh>2H96v> zb?D?INH=VE_A0GmX|n+D;Ea}|-?KHg$+pDj4p7>jEcg=SA( 
zndZqD5Kk6*cmWINcyag!lOArB<&MLlSvch)@(FPuc^^>teXeBV;;bCigw|7c+GT`csoz-aYmgGdb?DKlK@(*-KZ7C41@G z0nuJ7*=z6R#7sHAm-l|&s_nyE(NrOsDg;x-GqYnmtIG@r5b5AKg?&xndA2Fb`9X<{ zdDDC<%ZvqIX84?if31zZU~!srVLTuWtd2jY(65tGWg)8>12r>T8+*=TzShQ{aV}+3 zlFV@R-RCUkYi;~Z+eM2RMg}s&=PdkdZR{dvH)Cx9us*Te(i8k#GGD$^syYEby~|Ebt?oJ1_svzr}u$#o4Thti<(1_7t*9B-LD%a^^xZ z%YdKF4)&7@ikxQOIxH~YCl!>9na%EK&9;et*K60q8w2aN9<|E%8Q_(yt67wb2A;E; kEUuNvR(6SyT@tf!_m2P0;0J^E*xz=x$x@17rttrN0qT_D)Bpeg literal 0 HcmV?d00001 diff --git a/pipelines/styling/data_processor.py b/pipelines/styling/data_processor.py new file mode 100644 index 0000000..cacb5b6 --- /dev/null +++ b/pipelines/styling/data_processor.py @@ -0,0 +1,1488 @@ +import json +import pandas as pd +import numpy as np +from pathlib import Path +from typing import Dict, List, Optional, Union, Any, Tuple +from datasets import Dataset, load_dataset +import os +from dataclasses import dataclass +from abc import ABC, abstractmethod +import logging +from sklearn.model_selection import train_test_split +import re +import argparse +import sys +import yaml + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +@dataclass +class StylingConfig: + """Configuration for styling tasks""" + # Data source configuration + data_source: str = "huggingface" # "huggingface" or "custom" + dataset_name: Optional[str] = None # For Hugging Face datasets + data_path: Optional[str] = None # For custom datasets + data_format: str = "jsonl" # jsonl, csv, json + + # Field mapping - User configures which fields map to input/output + input_field: str = "text" # Field in dataset containing source text (e.g., "text", "source", etc.) + output_field: str = "styled_text" # Field in dataset containing styled text (e.g., "styled_text", "target", etc.) 
+ instruction: str = "Rewrite the following text in a formal style" # Style instruction from YAML + + # Data processing + max_samples: Optional[int] = None + train_split: float = 0.8 + validation_split: float = 0.1 + test_split: float = 0.1 + + # Text preprocessing + clean_text: bool = True + remove_special_chars: bool = False + lowercase: bool = False # Keep original case for styling + min_length: int = 10 + max_length: int = 1000 + + # Output configuration + output_format: str = "styling" # instruction, conversation, qa + output_dir: str = "./data" + + # Hugging Face specific + hf_split: str = "train" + hf_cache_dir: Optional[str] = None + + # Split configuration + test_split_from: str = "train" + val_split_from: str = "train" + + # Custom data specific + encoding: str = "utf-8" + delimiter: str = "," # For CSV files + + # Alpaca prompt configuration + alpaca_prompt: str = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction + +### Instruction: +{} + +### Input: +{} + +### Response: +{}""" + + eos_token: str = "<|eot_id|>" # Use <|eot_id|> as EOS token + +class DataValidator: + """Validates styling data quality and format""" + + @staticmethod + def validate_styling_data(data: Dict[str, List[Dict]], config: StylingConfig, is_processed: bool = False) -> Tuple[bool, List[str]]: + """Validate styling dataset splits""" + errors = [] + + # Check if we have the expected splits + expected_splits = ["train", "validation", "test"] + for split in expected_splits: + if split not in data: + errors.append(f"Missing '{split}' split") + elif split == "train" and not data[split]: + errors.append(f"Train split cannot be empty") + # Allow validation and test splits to be empty for small datasets + + if errors: + return False, errors + + total_samples = sum(len(split_data) for split_data in data.values()) + logger.info(f"Validating {total_samples} total samples across all splits...") + + # 
Determine field names based on whether data is processed or not + input_field = "input" if is_processed else config.input_field + output_field = "output" if is_processed else config.output_field + + # Validate each split + for split_name, split_data in data.items(): + if not split_data: + logger.info(f"Skipping validation for empty {split_name} split") + continue + + logger.info(f"Validating {split_name} split with {len(split_data)} samples...") + + # Check required fields + missing_input_count = 0 + missing_output_count = 0 + + for i, item in enumerate(split_data): + if input_field not in item: + errors.append(f"Missing input field '{input_field}' in {split_name} split, item {i}") + missing_input_count += 1 + if output_field not in item: + errors.append(f"Missing output field '{output_field}' in {split_name} split, item {i}") + missing_output_count += 1 + + logger.info(f"{split_name} - Items missing input field: {missing_input_count}") + logger.info(f"{split_name} - Items missing output field: {missing_output_count}") + + # Check data types + type_errors = 0 + for i, item in enumerate(split_data): + if not isinstance(item.get(input_field, ""), str): + errors.append(f"Input field '{input_field}' must be string in {split_name} split, item {i}") + type_errors += 1 + if not isinstance(item.get(output_field, ""), str): + errors.append(f"Output field '{output_field}' must be string in {split_name} split, item {i}") + type_errors += 1 + + logger.info(f"{split_name} - Type errors: {type_errors}") + + # Check for empty inputs/outputs + empty_inputs = sum(1 for item in split_data if not item.get(input_field, "").strip()) + empty_outputs = sum(1 for item in split_data if not item.get(output_field, "").strip()) + + if empty_inputs > 0: + errors.append(f"Found {empty_inputs} items with empty input text in {split_name} split") + if empty_outputs > 0: + errors.append(f"Found {empty_outputs} items with empty output text in {split_name} split") + + logger.info(f"{split_name} - 
Empty inputs: {empty_inputs}") + logger.info(f"{split_name} - Empty outputs: {empty_outputs}") + + # Show sample of processed data for debugging + if split_data: + logger.info(f"Sample processed items from {split_name}:") + for i in range(min(3, len(split_data))): + item = split_data[i] + logger.info(f" Item {i}: input='{item.get(input_field, '')[:50]}...', output='{item.get(output_field, '')[:50]}...'") + + return len(errors) == 0, errors + + @staticmethod + def analyze_dataset(data: Dict[str, List[Dict]], config: StylingConfig, is_processed: bool = False) -> Dict[str, Any]: + """Analyze dataset characteristics across all splits""" + analysis = { + "splits": {}, + "overall": { + "total_samples": 0, + "split_sizes": {} + } + } + + # Determine field names based on whether data is processed or not + input_field = "input" if is_processed else config.input_field + output_field = "output" if is_processed else config.output_field + + # Analyze each split + for split_name, split_data in data.items(): + if not split_data: + # Handle empty splits + split_analysis = { + "total_samples": 0, + "text_length_stats": {}, + "missing_values": {} + } + analysis["splits"][split_name] = split_analysis + analysis["overall"]["split_sizes"][split_name] = 0 + continue + + split_analysis = { + "total_samples": len(split_data), + "text_length_stats": {}, + "missing_values": {} + } + + # Text length statistics for both input and output + for field_name, field in [("input", input_field), ("output", output_field)]: + text_lengths = [len(item.get(field, "")) for item in split_data] + if text_lengths: + split_analysis["text_length_stats"][field_name] = { + "min": min(text_lengths), + "max": max(text_lengths), + "mean": np.mean(text_lengths), + "median": np.median(text_lengths) + } + + # Missing values + for field in [input_field, output_field]: + missing_count = sum(1 for item in split_data if not item.get(field)) + split_analysis["missing_values"][field] = missing_count + + 
class BaseDataLoader(ABC):
    """Abstract base class for data loaders.

    Subclasses implement :meth:`load` and :meth:`preprocess`. The item-level
    cleaning/filtering and the split-size planning that every loader needs
    are implemented once here so the concrete loaders cannot drift apart.
    """

    # Seed passed to every train_test_split call.
    _SPLIT_SEED = 42
    # Datasets with fewer samples than this are used for training only.
    _MIN_SPLITTABLE = 3
    # Datasets with fewer samples than this get fixed 20%/20% hold-out ratios.
    _SMALL_DATASET = 10
    _SMALL_DATASET_HOLDOUT = 0.2
    # Each hold-out split gets at least this fraction of the data (min. 1 item).
    _MIN_HOLDOUT_FRACTION = 0.1

    @abstractmethod
    def load(self, config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Load data and return a dictionary with train/validation/test splits."""

    @abstractmethod
    def preprocess(self, data: Dict[str, List[Dict]], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Apply preprocessing steps to all splits."""

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as text, mapping ``None`` and float NaN to ``""``."""
        if value is None:
            return ""
        # NaN is the only float that is not equal to itself; without this
        # check a missing cell would become the literal text "nan".
        if isinstance(value, float) and value != value:
            return ""
        return str(value)

    def _clean_text(self, text: str, config: "StylingConfig") -> str:
        """Clean and normalize text.

        Applies optional lowercasing and special-character removal
        (``config.lowercase`` / ``config.remove_special_chars``) and then
        collapses whitespace. Non-string input yields ``""``.
        """
        if not isinstance(text, str):
            return ""

        if config.lowercase:
            text = text.lower()

        if config.remove_special_chars:
            text = re.sub(r'[^\w\s]', '', text)

        # Collapse whitespace last: removing characters can leave double or
        # trailing spaces behind ("a - b" -> "a  b").
        return re.sub(r'\s+', ' ', text).strip()

    def _preprocess_item(self, item: Dict, config: "StylingConfig") -> Optional[Dict]:
        """Preprocess a single item.

        Reads ``config.input_field`` / ``config.output_field`` from *item*,
        optionally cleans both texts and returns ``{"input", "output"}``.
        Returns ``None`` when either text is shorter than ``config.min_length``
        or longer than ``config.max_length``.
        """
        input_text = self._as_text(item.get(config.input_field, ""))
        output_text = self._as_text(item.get(config.output_field, ""))

        if config.clean_text:
            input_text = self._clean_text(input_text, config)
            output_text = self._clean_text(output_text, config)

        for text in (input_text, output_text):
            if not config.min_length <= len(text) <= config.max_length:
                return None

        # Always use "input" and "output" for internal processing.
        return {"input": input_text, "output": output_text}

    def _process_split(self, split_name: str, split_data: List[Dict], config: "StylingConfig") -> List[Dict]:
        """Preprocess every item of one split, logging the first few skips."""
        # Reset the per-split debug counter used by subclasses.
        self._debug_count = 0

        kept = []
        skipped_count = 0
        for index, item in enumerate(split_data):
            processed_item = self._preprocess_item(item, config)
            if processed_item is not None:
                kept.append(processed_item)
                continue
            skipped_count += 1
            if skipped_count <= 3:  # Log first few skipped items
                logger.info(f"Skipped item {index} from {split_name}: {item}")

        logger.info(f"{split_name} - Preprocessed {len(kept)} samples, skipped {skipped_count} samples")
        return kept

    @classmethod
    def _plan_split_sizes(cls, total: int, val_ratio: float, test_ratio: float) -> Tuple[int, int, int]:
        """Return ``(train, validation, test)`` sizes for *total* samples.

        Validation and test each get at least one sample and at least 10% of
        the data; datasets below ``_SMALL_DATASET`` use fixed 20% ratios. The
        hold-out sizes are then shrunk until train keeps at least one sample.
        """
        if total < cls._SMALL_DATASET:
            val_ratio = test_ratio = cls._SMALL_DATASET_HOLDOUT

        floor = max(1, int(total * cls._MIN_HOLDOUT_FRACTION))
        val_size = max(floor, int(total * val_ratio))
        test_size = max(floor, int(total * test_ratio))
        train_size = total - val_size - test_size

        # Loop (not a single step) so oversized ratios cannot leave train empty.
        while train_size < 1 and (val_size > 1 or test_size > 1):
            if val_size > 1:
                val_size -= 1
            else:
                test_size -= 1
            train_size += 1

        return train_size, val_size, test_size

    def _three_way_split(self, data: List[Dict], val_size: int, test_size: int) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Split *data* into train/validation/test with exact hold-out sizes."""
        # First split: train + (val+test)
        train_part, held_out = train_test_split(
            data,
            test_size=val_size + test_size,
            random_state=self._SPLIT_SEED
        )
        # Second split: val + test. An absolute count keeps the sizes exact.
        val_part, test_part = train_test_split(
            held_out,
            test_size=test_size,
            random_state=self._SPLIT_SEED
        )
        return train_part, val_part, test_part


class HuggingFaceDataLoader(BaseDataLoader):
    """Load datasets from Hugging Face Hub"""

    _INPUT_HINTS = ('text', 'sentence', 'content', 'input', 'comment', 'message')
    _OUTPUT_HINTS = ('output', 'response', 'result', 'target', 'styled')

    def load(self, config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Load dataset from Hugging Face Hub with flexible split handling.

        Native validation/test splits are used when the config asks for them
        and they exist; missing ones are carved out of the train split. The
        config object is not modified.

        Raises:
            ValueError: if ``config.dataset_name`` is empty, the dataset has
                no ``train`` split, or the split ratios sum to zero or less.
        """
        if not config.dataset_name:
            raise ValueError("Dataset name is required for Hugging Face datasets")

        logger.info(f"Loading Hugging Face dataset: {config.dataset_name}")

        try:
            dataset = load_dataset(
                config.dataset_name,
                cache_dir=config.hf_cache_dir
            )

            available_splits = list(dataset.keys())
            logger.info(f"Available splits in dataset: {available_splits}")

            splits_data = self._pick_native_splits(dataset, available_splits, config)

            if not splits_data["validation"] or not splits_data["test"]:
                self._fill_missing_splits(splits_data, config)

            logger.info("Final split sizes:")
            for label, key in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
                logger.info(f"  {label}: {len(splits_data[key])} samples")

            # Apply max_samples limit to each split if specified
            if config.max_samples:
                for split_name, split_data in splits_data.items():
                    if split_data:
                        original_size = len(split_data)
                        splits_data[split_name] = split_data[:config.max_samples]
                        logger.info(f"Limited {split_name} split from {original_size} to {len(splits_data[split_name])} samples")

            self._log_field_check(splits_data, config)

            logger.info(f"Successfully loaded dataset {config.dataset_name}")
            return splits_data

        except Exception as e:
            # Boundary handler: log with the dataset name, then propagate.
            logger.error(f"Error loading dataset {config.dataset_name}: {e}")
            raise

    def _pick_native_splits(self, dataset, available_splits: List[str], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Materialize the splits the dataset already provides."""
        if "train" not in available_splits:
            logger.error("No 'train' split found in dataset!")
            logger.error(f"Available splits: {available_splits}")
            raise ValueError(f"Dataset {config.dataset_name} does not have a 'train' split")

        logger.info(f"Using 'train' split with {len(dataset['train'])} samples")
        splits_data = {"train": list(dataset["train"]), "validation": [], "test": []}

        # The validation split may be published under either name.
        val_key = next((name for name in ("validation", "val") if name in available_splits), None)

        # Handle validation split
        if config.val_split_from != "use_val_if_available":
            logger.info(f"Will create validation split from train data ({config.validation_split * 100}%)")
        elif val_key is not None:
            logger.info(f"Using '{val_key}' split with {len(dataset[val_key])} samples")
            splits_data["validation"] = list(dataset[val_key])
        else:
            logger.warning("No validation split found in dataset. Will create from train split.")
            logger.info(f"Available splits: {available_splits}")
            logger.info(f"Will use {config.validation_split * 100}% of train data for validation")

        # Handle test split
        if config.test_split_from == "use_test_if_available" and "test" in available_splits:
            logger.info(f"Using 'test' split with {len(dataset['test'])} samples")
            splits_data["test"] = list(dataset["test"])
        elif config.test_split_from == "use_val_if_available" and val_key is not None:
            logger.info(f"Using '{val_key}' split as test with {len(dataset[val_key])} samples")
            splits_data["test"] = list(dataset[val_key])
        elif config.test_split_from == "use_test_if_available":
            logger.warning("No test split found in dataset. Will create from train split.")
            logger.info(f"Available splits: {available_splits}")
            logger.info(f"Will use {config.test_split * 100}% of train data for test")
        else:
            logger.info(f"Will create test split from train data ({config.test_split * 100}%)")

        return splits_data

    def _normalized_ratios(self, config: "StylingConfig") -> Tuple[float, float, float]:
        """Return (train, val, test) ratios scaled to sum to 1 (config untouched)."""
        ratios = (config.train_split, config.validation_split, config.test_split)
        total = sum(ratios)
        if total <= 0:
            raise ValueError(f"Split percentages must sum to a positive value (got {total})")
        # Tolerance instead of `!= 1.0`: float sums are rarely exact.
        if abs(total - 1.0) > 1e-9:
            logger.warning(f"Split percentages don't sum to 1.0 (got {total}). Normalizing...")
            ratios = tuple(ratio / total for ratio in ratios)
        return ratios

    def _hold_out(self, data: List[Dict], ratio: float) -> Tuple[List[Dict], List[Dict]]:
        """Carve one hold-out split of roughly ``ratio`` out of *data*."""
        # At least one sample held out, at least one left for training.
        size = min(max(1, int(len(data) * ratio)), len(data) - 1)
        return train_test_split(data, test_size=size, random_state=self._SPLIT_SEED)

    def _fill_missing_splits(self, splits_data: Dict[str, List[Dict]], config: "StylingConfig") -> None:
        """Create whichever of validation/test is empty from the train split."""
        train_data = splits_data["train"]
        total = len(train_data)

        # Handle very small datasets: keep any native split, create nothing.
        if total < self._MIN_SPLITTABLE:
            logger.warning(f"Dataset has only {total} samples. Using all data for training.")
            return

        _, val_ratio, test_ratio = self._normalized_ratios(config)
        need_val = not splits_data["validation"]
        need_test = not splits_data["test"]

        if need_val and need_test:
            if total < self._SMALL_DATASET:
                logger.info("Small dataset detected. Using split ratios: train=0.6, val=0.2, test=0.2")
            train_size, val_size, test_size = self._plan_split_sizes(total, val_ratio, test_ratio)
            logger.info(f"Adjusted split sizes: train={train_size}, val={val_size}, test={test_size}")
            new_train, new_val, new_test = self._three_way_split(train_data, val_size, test_size)
            splits_data["train"] = new_train
            splits_data["validation"] = new_val
            splits_data["test"] = new_test
        elif need_val:
            # Only need to create val from train
            splits_data["train"], splits_data["validation"] = self._hold_out(train_data, val_ratio)
        else:
            # Only need to create test from train
            splits_data["train"], splits_data["test"] = self._hold_out(train_data, test_ratio)

    def _log_field_check(self, splits_data: Dict[str, List[Dict]], config: "StylingConfig") -> None:
        """Log a sample item per split and warn about missing configured fields."""
        checks = (
            ("Input", "text", config.input_field, self._INPUT_HINTS),
            ("Output", "output", config.output_field, self._OUTPUT_HINTS),
        )
        for split_name, split_data in splits_data.items():
            if not split_data:
                continue
            sample = split_data[0]
            field_names = list(sample.keys())
            logger.info(f"Sample data item from {split_name}: {sample}")
            logger.info(f"Available fields in {split_name} split: {field_names}")

            for role, kind, field, hints in checks:
                if field in sample:
                    continue
                logger.warning(f"{role} field '{field}' not found in {split_name}. Available fields: {field_names}")
                # Suggest alternative fields
                candidates = [name for name in field_names if any(hint in name.lower() for hint in hints)]
                if candidates:
                    logger.info(f"Suggested {kind} fields for {split_name}: {candidates}")

    def preprocess(self, data: Dict[str, List[Dict]], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Apply preprocessing steps to all splits separately"""
        processed_splits = {}

        logger.info("=== PREPROCESSING DATA ===")

        for split_name, split_data in data.items():
            logger.info(f"Processing {split_name} split with {len(split_data)} items...")

            if split_data:
                self._log_raw_split(split_name, split_data, config)

            processed_data = self._process_split(split_name, split_data, config)
            processed_splits[split_name] = processed_data

            # Show sample of processed data
            if processed_data:
                logger.info(f"=== SAMPLE PROCESSED DATA FROM {split_name.upper()} ===")
                for index, processed_item in enumerate(processed_data[:3]):
                    logger.info(f"Processed item {index} from {split_name}: {processed_item}")

        return processed_splits

    def _log_raw_split(self, split_name: str, split_data: List[Dict], config: "StylingConfig") -> None:
        """Log field availability and a few raw items of a non-empty split."""
        available_fields = set(split_data[0].keys())
        logger.info(f"Available fields in {split_name}: {available_fields}")
        logger.info(f"Looking for input field: '{config.input_field}', output field: '{config.output_field}'")

        for role, field in (("Input", config.input_field), ("Output", config.output_field)):
            if field not in available_fields:
                logger.error(f"{role} field '{field}' not found in {split_name}. Available fields: {available_fields}")

        # Count items whose field is absent or empty
        missing_input = sum(1 for item in split_data if not item.get(config.input_field))
        missing_output = sum(1 for item in split_data if not item.get(config.output_field))
        logger.info(f"{split_name} - Items missing input field: {missing_input}")
        logger.info(f"{split_name} - Items missing output field: {missing_output}")

        logger.info(f"=== SAMPLE RAW DATA FROM {split_name.upper()} BEFORE PREPROCESSING ===")
        for index, item in enumerate(split_data[:3]):
            logger.info(f"Raw item {index} from {split_name}:")
            for key, value in item.items():
                if isinstance(value, str) and len(value) > 100:
                    logger.info(f"  {key}: '{value[:100]}...'")
                else:
                    logger.info(f"  {key}: {value}")

    def _preprocess_item(self, item: Dict, config: "StylingConfig") -> Optional[Dict]:
        """Preprocess a single item, tracing the first three at DEBUG level."""
        processed_item = super()._preprocess_item(item, config)

        self._debug_count = getattr(self, "_debug_count", 0) + 1
        if self._debug_count <= 3:
            logger.debug(f"Processing item {self._debug_count}:")
            logger.debug(f"  Looking for input field '{config.input_field}': {item.get(config.input_field, '')}")
            logger.debug(f"  Looking for output field '{config.output_field}': {item.get(config.output_field, '')}")
            if processed_item is None:
                logger.debug(f"  Skipping - text length not in range [{config.min_length}, {config.max_length}]")
            else:
                logger.debug(f"  Final processed item: {processed_item}")

        return processed_item


class CustomDataLoader(BaseDataLoader):
    """Load custom datasets from local files"""

    def load(self, config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Load custom dataset from local file and create splits.

        Raises:
            ValueError: if ``config.data_path`` is empty or
                ``config.data_format`` is not jsonl/csv/json.
            FileNotFoundError: if the data file does not exist.
        """
        if not config.data_path:
            raise ValueError("Data path is required for custom datasets")

        file_path = Path(config.data_path)
        if not file_path.exists():
            raise FileNotFoundError(f"Data file not found: {file_path}")

        logger.info(f"Loading custom dataset: {file_path}")

        readers = {
            "jsonl": self._load_jsonl,
            "csv": self._load_csv,
            "json": self._load_json,
        }
        reader = readers.get(config.data_format)
        if reader is None:
            raise ValueError(f"Unsupported format: {config.data_format}")
        raw_data = reader(file_path, config)

        if config.max_samples:
            raw_data = raw_data[:config.max_samples]

        logger.info(f"Loaded {len(raw_data)} samples from {file_path}")

        return self._create_splits(raw_data, config)

    def _create_splits(self, data: List[Dict], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Create train/validation/test splits from raw data.

        Fewer than three samples all go to train. Otherwise validation and
        test each receive at least one sample; the config is not modified.
        """
        total_samples = len(data)
        logger.info(f"Creating splits from {total_samples} samples...")

        # Handle very small datasets
        if total_samples < self._MIN_SPLITTABLE:
            logger.warning(f"Dataset has only {total_samples} samples. Using all data for training.")
            return {"train": data, "validation": [], "test": []}

        if total_samples < self._SMALL_DATASET:
            logger.info("Small dataset detected. Using split ratios: train=0.6, val=0.2, test=0.2")

        train_size, val_size, test_size = self._plan_split_sizes(
            total_samples, config.validation_split, config.test_split
        )
        logger.info(f"Split sizes: train={train_size}, validation={val_size}, test={test_size}")

        train_data, val_data, test_data = self._three_way_split(data, val_size, test_size)
        splits_data = {
            "train": train_data,
            "validation": val_data,
            "test": test_data
        }

        logger.info("Created splits:")
        logger.info(f"  Train: {len(splits_data['train'])} samples")
        logger.info(f"  Validation: {len(splits_data['validation'])} samples")
        logger.info(f"  Test: {len(splits_data['test'])} samples")

        return splits_data

    def _load_jsonl(self, file_path: Path, config: "StylingConfig") -> List[Dict]:
        """Load JSONL file, skipping blank lines and warning on invalid ones."""
        records = []
        with open(file_path, 'r', encoding=config.encoding) as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logger.warning(f"Invalid JSON at line {line_num}: {e}")
        return records

    def _load_csv(self, file_path: Path, config: "StylingConfig") -> List[Dict]:
        """Load CSV file as a list of row dictionaries."""
        frame = pd.read_csv(file_path, encoding=config.encoding, delimiter=config.delimiter)
        return frame.to_dict('records')

    def _load_json(self, file_path: Path, config: "StylingConfig") -> List[Dict]:
        """Load JSON file.

        A top-level list is returned as is, a dict with a ``"data"`` key
        yields that value, and anything else is wrapped in a one-item list.
        """
        with open(file_path, 'r', encoding=config.encoding) as f:
            payload = json.load(f)

        if isinstance(payload, list):
            return payload
        if isinstance(payload, dict) and "data" in payload:
            return payload["data"]
        return [payload]

    def preprocess(self, data: Dict[str, List[Dict]], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Apply preprocessing steps to all splits separately"""
        processed_splits = {}

        logger.info("=== PREPROCESSING CUSTOM DATA ===")

        for split_name, split_data in data.items():
            logger.info(f"Processing {split_name} split with {len(split_data)} items...")
            processed_splits[split_name] = self._process_split(split_name, split_data, config)

        return processed_splits
class StylingDataPipeline:
    """Main styling pipeline: load, preprocess, validate, format and save data."""

    _SUPPORTED_SAVE_FORMATS = ("jsonl", "json", "csv")

    # Fallbacks used by load_config_from_yaml when the YAML omits a key
    # (or sets it to null).
    _YAML_DEFAULTS = {
        'data_source': 'custom',
        'dataset_name': None,
        'data_path': None,
        'data_format': 'jsonl',
        'input_field': 'text',
        'output_field': 'styled_text',
        'instruction': 'Rewrite the following text in a formal style',
        'max_samples': None,
        'train_split': 0.8,
        'validation_split': 0.1,
        'test_split': 0.1,
        'clean_text': True,
        'remove_special_chars': False,
        'lowercase': False,
        'min_length': 10,
        'max_length': 1000,
        'output_format': 'styling',
        'output_dir': './data',
        'hf_split': 'train',
        'hf_cache_dir': None,
        'test_split_from': 'train',
        'val_split_from': 'train',
        'encoding': 'utf-8',
        'delimiter': ',',
    }

    def __init__(self):
        self.validator = DataValidator()
        self.hf_loader = HuggingFaceDataLoader()
        self.custom_loader = CustomDataLoader()

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as a string, mapping ``None`` to ``""``."""
        return "" if value is None else str(value)

    def create_config(
        self,
        data_source: str,
        dataset_name: Optional[str] = None,
        data_path: Optional[str] = None,
        input_field: str = "input",
        output_field: str = "output",
        instruction: str = "Rewrite the following text in a formal style",
        **kwargs
    ) -> "StylingConfig":
        """Create styling configuration; extra keyword arguments are forwarded."""
        return StylingConfig(
            data_source=data_source,
            dataset_name=dataset_name,
            data_path=data_path,
            input_field=input_field,
            output_field=output_field,
            instruction=instruction,
            **kwargs
        )

    def load_config_from_yaml(self, yaml_path: str) -> "StylingConfig":
        """Load configuration from YAML file.

        Keys that are missing from the YAML, or present with a null value,
        fall back to ``_YAML_DEFAULTS``. Any error is logged and re-raised.
        """
        try:
            config_dict = load_yaml_config(yaml_path)

            # load_yaml_config stores an explicit None for every key the YAML
            # omits, so `.get(key, default)` would never reach the default.
            settings = dict(self._YAML_DEFAULTS)
            settings.update({
                key: value
                for key, value in config_dict.items()
                if key in self._YAML_DEFAULTS and value is not None
            })
            config = StylingConfig(**settings)

            logger.info(f"Configuration loaded from YAML: {yaml_path}")
            logger.info(f"Output directory: {config.output_dir}")
            logger.info(f"Instruction: {config.instruction}")

            return config

        except Exception as e:
            logger.error(f"Error loading configuration from YAML {yaml_path}: {e}")
            raise

    def load_and_preprocess(self, config: "StylingConfig") -> Tuple[Dict[str, List[Dict]], Dict[str, Any]]:
        """Load and preprocess data.

        Returns:
            ``(processed_splits, analysis)``.

        Raises:
            ValueError: for an unknown ``config.data_source`` or when the
                validator rejects the processed data.
        """
        loaders = {"huggingface": self.hf_loader, "custom": self.custom_loader}
        loader = loaders.get(config.data_source)
        if loader is None:
            raise ValueError(f"Unsupported data source: {config.data_source}")

        raw_splits = loader.load(config)
        processed_splits = loader.preprocess(raw_splits, config)

        # Validate processed data
        is_valid, errors = self.validator.validate_styling_data(processed_splits, config, is_processed=True)
        if not is_valid:
            logger.error("Data validation failed:")
            for error in errors:
                logger.error(f"  - {error}")
            raise ValueError("Data validation failed")

        # Analyze dataset
        analysis = self.validator.analyze_dataset(processed_splits, config, is_processed=True)

        return processed_splits, analysis

    def convert_to_alpaca_format(self, data: Dict[str, List[Dict]], config: "StylingConfig") -> Dict[str, List[Dict]]:
        """Convert styling data to Alpaca format with instruction.

        Missing or ``None`` input/output values become empty strings.
        """
        return {
            split_name: [
                {
                    "instruction": config.instruction,
                    "input": self._as_text(item.get("input", "")),
                    "output": self._as_text(item.get("output", "")),
                }
                for item in split_data
            ]
            for split_name, split_data in data.items()
        }

    def format_for_training(self, data: Dict[str, List[Dict]], config: "StylingConfig") -> Dict[str, List[str]]:
        """Format entries for training using Alpaca prompt format.

        Each text is ``config.alpaca_prompt`` filled with the instruction,
        input and output, followed by ``config.eos_token``.
        """
        return {
            split_name: [
                config.alpaca_prompt.format(
                    config.instruction,
                    self._as_text(item.get("input", "")),
                    self._as_text(item.get("output", "")),
                ) + config.eos_token
                for item in split_data
            ]
            for split_name, split_data in data.items()
        }

    def convert_to_hf_dataset(self, dataset_entries: List[Dict], config: "StylingConfig"):
        """Convert dataset entries to HuggingFace dataset format with text formatting.

        Entries must carry ``instruction``, ``input`` and ``output``; the
        returned dataset gains a ``text`` column with the formatted prompt.
        """
        from datasets import Dataset

        hf_dataset = Dataset.from_list(dataset_entries)

        def formatting_prompts_func(examples):
            # Batched map: every column arrives as a list of values.
            rows = zip(examples["instruction"], examples["input"], examples["output"])
            texts = [
                config.alpaca_prompt.format(
                    instruction, self._as_text(input_text), self._as_text(output)
                ) + config.eos_token
                for instruction, input_text, output in rows
            ]
            return {"text": texts}

        return hf_dataset.map(formatting_prompts_func, batched=True)

    def save_hf_dataset_to_disk(self, hf_dataset, save_path: str):
        """Save HuggingFace dataset to disk; return True on success, False on error."""
        try:
            hf_dataset.save_to_disk(save_path)
        except Exception as e:
            # Best effort by design: report the failure to the caller.
            logger.error(f"Error saving HuggingFace dataset to disk: {e}")
            return False
        logger.info(f"HuggingFace dataset saved to disk at: {save_path}")
        return True

    def load_hf_dataset_from_disk(self, load_path: str):
        """Load HuggingFace dataset from disk; return None when loading fails."""
        try:
            from datasets import load_from_disk
            hf_dataset = load_from_disk(load_path)
            logger.info(f"HuggingFace dataset loaded from disk: {load_path}")
            logger.info(f"Dataset has {len(hf_dataset)} entries")
            logger.info(f"Dataset features: {hf_dataset.features}")
            return hf_dataset
        except Exception as e:
            logger.error(f"Error loading HuggingFace dataset from disk: {e}")
            return None

    def save_data(self, data: Dict[str, List[Dict]], output_dir: str, format: str = "jsonl"):
        """Save processed data splits to files named ``<split>.<format>``.

        Raises:
            ValueError: if *format* is not one of jsonl, json or csv.
        """
        # Reject unknown formats up front instead of failing later on an
        # unbound output path.
        if format not in self._SUPPORTED_SAVE_FORMATS:
            raise ValueError(f"Unsupported output format: {format}")

        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        for split_name, split_data in data.items():
            output_file = output_path / f"{split_name}.{format}"
            if format == "jsonl":
                with open(output_file, 'w', encoding='utf-8') as f:
                    f.writelines(json.dumps(item, ensure_ascii=False) + '\n' for item in split_data)
            elif format == "json":
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(split_data, f, ensure_ascii=False, indent=2)
            else:
                pd.DataFrame(split_data).to_csv(output_file, index=False)

            logger.info(f"Saved {len(split_data)} samples to {output_file}")

    def run_pipeline(
        self,
        config: "StylingConfig",
        output_format: str = "styling",
        save_splits: bool = True,
        create_hf_dataset: bool = False,
        save_hf_dataset: bool = False,
        hf_dataset_path: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run complete styling pipeline.

        Returns a summary dict with the config, analysis, per-split sizes,
        the formatted data and, when requested, the HuggingFace dataset and
        the path it was saved to.
        """
        logger.info("Starting styling pipeline...")

        # Load and preprocess data
        processed_splits, analysis = self.load_and_preprocess(config)

        # Convert to desired output format
        if output_format == "alpaca":
            formatted_splits = self.convert_to_alpaca_format(processed_splits, config)
        else:
            formatted_splits = processed_splits

        # Save directly in the output directory, not in a subdirectory
        if save_splits:
            self.save_data(formatted_splits, str(Path(config.output_dir)))

        hf_dataset = None
        hf_dataset_save_path = None
        if create_hf_dataset:
            # Flatten all splits into one list. Items lacking an instruction
            # are copied, so the splits returned below are left untouched.
            all_entries = [
                item if "instruction" in item else {**item, "instruction": config.instruction}
                for split_data in formatted_splits.values()
                for item in split_data
            ]

            hf_dataset = self.convert_to_hf_dataset(all_entries, config)
            logger.info(f"HuggingFace dataset created with {len(hf_dataset)} entries")
            logger.info(f"Dataset features: {hf_dataset.features}")

            if save_hf_dataset:
                if hf_dataset_path is None:
                    # Generate default path using the configured output_dir
                    hf_dataset_path = str(Path(config.output_dir) / "hf_dataset")

                if self.save_hf_dataset_to_disk(hf_dataset, hf_dataset_path):
                    hf_dataset_save_path = hf_dataset_path
                    logger.info(f"HuggingFace dataset saved to: {hf_dataset_save_path}")
                else:
                    logger.warning("Failed to save HuggingFace dataset to disk")
        elif save_hf_dataset:
            logger.warning("save_hf_dataset was requested without create_hf_dataset; nothing to save")

        # Create result summary
        result = {
            "config": config,
            "analysis": analysis,
            "splits": {
                split_name: len(split_data) for split_name, split_data in formatted_splits.items()
            },
            "output_format": output_format,
            "output_dir": config.output_dir,
            "data": formatted_splits,  # Include the actual processed data
            "instruction": config.instruction
        }

        # Add HuggingFace dataset info to result if created
        if hf_dataset is not None:
            result["hf_dataset"] = hf_dataset
            if hf_dataset_save_path:
                result["hf_dataset_path"] = hf_dataset_save_path

        logger.info("Styling pipeline completed successfully!")
        return result


# Helper functions
def create_huggingface_config(dataset_name: str, input_field: str = "text", output_field: str = "output", instruction: str = "Rewrite the following text in a formal style", **kwargs) -> "StylingConfig":
    """Helper function to create a HuggingFace configuration"""
    return StylingConfig(
        data_source="huggingface",
        dataset_name=dataset_name,
        input_field=input_field,
        output_field=output_field,
        instruction=instruction,
        **kwargs
    )


def create_custom_config(data_path: str, data_format: str = "jsonl", input_field: str = "text", output_field: str = "styled_text", instruction: str = "Rewrite the following text in a formal style", **kwargs) -> "StylingConfig":
    """Helper function to create a custom data configuration"""
    return StylingConfig(
        data_source="custom",
        data_path=data_path,
        data_format=data_format,
        input_field=input_field,
        output_field=output_field,
        instruction=instruction,
        **kwargs
    )


def save_hf_dataset_to_disk(hf_dataset, save_path: str) -> bool:
    """Utility function to save HuggingFace dataset to disk.

    Prints the outcome and returns True on success, False on any error.
    """
    try:
        hf_dataset.save_to_disk(save_path)
    except Exception as e:
        print(f"Error saving HuggingFace dataset to disk: {e}")
        return False
    print(f"HuggingFace dataset saved to disk at: {save_path}")
    return True


def load_hf_dataset_from_disk(load_path: str):
    """Utility function to load HuggingFace dataset from disk.

    Prints a short summary and returns the dataset, or None on any error.
    """
    try:
        from datasets import load_from_disk
        hf_dataset = load_from_disk(load_path)
        print(f"HuggingFace dataset loaded from disk: {load_path}")
        print(f"Dataset has {len(hf_dataset)} entries")
        print(f"Dataset features: {hf_dataset.features}")
        return hf_dataset
    except Exception as e:
        print(f"Error loading HuggingFace dataset from disk: {e}")
        return None
def load_yaml_config(config_path: str) -> Dict[str, Any]:
    """Load and parse YAML configuration file with proper structure handling.

    The ``task``, ``data``, ``model``, ``training`` and ``inference`` sections
    are flattened into a single dictionary. For every section that is present
    all of its known keys are emitted, with ``None`` for keys the YAML omits.
    An empty file yields an empty dictionary and a section written without
    content is treated as empty.

    Raises:
        ValueError: if the document root or a known section is not a mapping.
        Exception: any I/O or YAML parsing error is logged and re-raised.
    """
    # section name -> {flattened key: key inside the section}
    data_keys = (
        'dataset_name', 'data_path', 'data_format', 'input_field',
        'output_field', 'instruction', 'max_samples', 'train_split',
        'validation_split', 'test_split', 'clean_text', 'lowercase',
        'min_length', 'max_length', 'output_format', 'output_dir',
        'encoding', 'delimiter',
    )
    sections = {
        'task': {'task_name': 'name', 'task_type': 'type'},
        'data': {'data_source': 'source', **{key: key for key in data_keys}},
        'model': {'model_name': 'name', 'model_max_length': 'max_length'},
        'training': {key: key for key in (
            'num_epochs', 'batch_size', 'learning_rate', 'weight_decay',
            'warmup_ratio', 'lr_scheduler_type',
        )},
        'inference': {
            'inference_batch_size': 'batch_size',
            'max_new_tokens': 'max_new_tokens',
            'temperature': 'temperature',
        },
    }
    # Read by StylingDataPipeline.load_config_from_yaml but previously never
    # extracted. Forwarded only when set, so existing consumers keep their
    # own defaults. NOTE(review): assumed to live in the `data` section.
    optional_data_keys = (
        'remove_special_chars', 'hf_split', 'hf_cache_dir',
        'test_split_from', 'val_split_from',
    )

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            yaml_data = yaml.safe_load(f)

        # safe_load returns None for an empty document.
        if yaml_data is None:
            yaml_data = {}
        if not isinstance(yaml_data, dict):
            raise ValueError(f"Top level of {config_path} must be a mapping, got {type(yaml_data).__name__}")

        config_dict = {}
        for section_name, key_map in sections.items():
            if section_name not in yaml_data:
                continue
            # `section:` with no content parses to None.
            section = yaml_data[section_name] or {}
            if not isinstance(section, dict):
                raise ValueError(f"Section '{section_name}' in {config_path} must be a mapping")

            config_dict.update({flat_key: section.get(source_key) for flat_key, source_key in key_map.items()})

            if section_name == 'data':
                config_dict.update({
                    key: section[key]
                    for key in optional_data_keys
                    if section.get(key) is not None
                })

        logger.info(f"Successfully parsed YAML configuration from: {config_path}")
        logger.info(f"Extracted {len(config_dict)} configuration parameters")

        return config_dict

    except Exception as e:
        logger.error(f"Error loading YAML config from {config_path}: {e}")
        raise
parser.add_argument("--validation-split", type=float, help="Validation split ratio") + parser.add_argument("--test-split", type=float, help="Test split ratio") + + # Text preprocessing + parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text") + parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters") + parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase") + parser.add_argument("--min-length", type=int, help="Minimum text length") + parser.add_argument("--max-length", type=int, help="Maximum text length") + + # Output configuration + parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format") + parser.add_argument("--output-dir", type=str, help="Output directory") + + # HuggingFace dataset options + parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset") + parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset") + + # Logging + parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level") + + args = parser.parse_args() + + # Set up logging + logging.basicConfig( + level=getattr(logging, args.log_level), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Load configuration + config_dict = {} + + # Load YAML config if provided + if args.config: + try: + config_dict = load_yaml_config(args.config) + except Exception as e: + logger.error(f"Error loading YAML config: {e}") + sys.exit(1) + + # Override YAML config with CLI arguments + cli_overrides = {} + if args.data_source: + cli_overrides['data_source'] = args.data_source + if args.dataset_name: + cli_overrides['dataset_name'] = args.dataset_name + if args.data_path: + cli_overrides['data_path'] = args.data_path + if args.data_format: + cli_overrides['data_format'] = args.data_format + if args.input_field: + 
cli_overrides['input_field'] = args.input_field + if args.output_field: + cli_overrides['output_field'] = args.output_field + if args.instruction: + cli_overrides['instruction'] = args.instruction + if args.max_samples: + cli_overrides['max_samples'] = args.max_samples + if args.train_split: + cli_overrides['train_split'] = args.train_split + if args.validation_split: + cli_overrides['validation_split'] = args.validation_split + if args.test_split: + cli_overrides['test_split'] = args.test_split + if args.clean_text: + cli_overrides['clean_text'] = True + if args.remove_special_chars: + cli_overrides['remove_special_chars'] = True + if args.lowercase: + cli_overrides['lowercase'] = True + if args.min_length: + cli_overrides['min_length'] = args.min_length + if args.max_length: + cli_overrides['max_length'] = args.max_length + if args.output_format: + cli_overrides['output_format'] = args.output_format + if args.output_dir: + cli_overrides['output_dir'] = args.output_dir + + # HuggingFace dataset options + if args.create_hf_dataset: + cli_overrides['create_hf_dataset'] = True + if args.hf_dataset_path: + cli_overrides['hf_dataset_path'] = args.hf_dataset_path + + # Logging + if args.log_level: + cli_overrides['log_level'] = args.log_level + + # Merge configurations + for key, value in cli_overrides.items(): + if key in config_dict: + logger.info(f"Overriding YAML config '{key}' with CLI value: {value}") + config_dict[key] = value + + # Validate required arguments + if not config_dict.get('data_source'): + parser.error("--data-source is required (either in YAML config or CLI)") + + if config_dict.get('data_source') == "huggingface" and not config_dict.get('dataset_name'): + parser.error("--dataset-name is required for HuggingFace datasets") + + if config_dict.get('data_source') == "custom" and not config_dict.get('data_path'): + parser.error("--data-path is required for custom datasets") + + # Create configuration object - properly handle YAML structure + config = 
StylingConfig( + data_source=config_dict.get('data_source', 'huggingface'), + dataset_name=config_dict.get('dataset_name'), + data_path=config_dict.get('data_path'), + data_format=config_dict.get('data_format', 'jsonl'), + input_field=config_dict.get('input_field', 'text'), + output_field=config_dict.get('output_field', 'styled_text'), + instruction=config_dict.get('instruction', 'Rewrite the following text in a formal style'), + max_samples=config_dict.get('max_samples'), + train_split=config_dict.get('train_split', 0.8), + validation_split=config_dict.get('validation_split', 0.1), + test_split=config_dict.get('test_split', 0.1), + clean_text=config_dict.get('clean_text', True), + remove_special_chars=config_dict.get('remove_special_chars', False), + lowercase=config_dict.get('lowercase', False), + min_length=config_dict.get('min_length', 10), + max_length=config_dict.get('max_length', 1000), + output_format=config_dict.get('output_format', 'styling'), + output_dir=config_dict.get('output_dir', './data'), + hf_split=config_dict.get('hf_split', 'train'), + hf_cache_dir=config_dict.get('hf_cache_dir'), + test_split_from=config_dict.get('test_split_from', 'train'), + val_split_from=config_dict.get('val_split_from', 'train'), + encoding=config_dict.get('encoding', 'utf-8'), + delimiter=config_dict.get('delimiter', ',') + ) + + # Initialize pipeline + pipeline = StylingDataPipeline() + + try: + print(f"Starting styling pipeline with {config.data_source} data source...") + if args.config: + print(f"Using YAML configuration: {args.config}") + print(f"Style instruction: {config.instruction}") + print() + + # Check if we should create HuggingFace dataset + create_hf_dataset = cli_overrides.get('create_hf_dataset', False) + hf_dataset_path = cli_overrides.get('hf_dataset_path') + + # If creating HF dataset, also save it by default + save_hf_dataset = create_hf_dataset + + result = pipeline.run_pipeline( + config, + config.output_format, + save_splits=True, + 
create_hf_dataset=create_hf_dataset, + save_hf_dataset=save_hf_dataset, + hf_dataset_path=hf_dataset_path + ) + + print(f"✅ Pipeline completed successfully!") + print(f" Data source: {config.data_source}") + if config.data_source == "huggingface": + print(f" Dataset: {config.dataset_name}") + else: + print(f" Data file: {config.data_path}") + print(f" Total samples: {result['analysis']['overall']['total_samples']}") + print(f" Split sizes: {result['analysis']['overall']['split_sizes']}") + print(f" Output directory: {config.output_dir}") + print(f" Style instruction: {config.instruction}") + + except Exception as e: + print(f"❌ Error running pipeline: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/pipelines/styling/inference.py b/pipelines/styling/inference.py new file mode 100644 index 0000000..9dccafb --- /dev/null +++ b/pipelines/styling/inference.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +""" +Styling Inference Pipeline using Trained Models +Supports style transfer inference with streaming and batch processing +""" + +import os +import sys +import json +import logging +import argparse +from pathlib import Path +from typing import Dict, Any, Optional, List, Union +import yaml + +# Add the project root to the path +sys.path.append(str(Path(__file__).parent.parent.parent)) + +from utils.config.config_manager import ConfigManager +from utils.logging.logging import setup_logging + +# Inference imports +import torch +from datasets import load_from_disk, Dataset +from unsloth import FastLanguageModel +from transformers import TextStreamer + +logger = logging.getLogger(__name__) + +class StylingInference: + """Styling task inference using trained models""" + + def __init__(self, config: Dict[str, Any]): + self.config = config + self.model = None + self.tokenizer = None + + # Set device + self.device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info(f"Using device: {self.device}") + + # Model parameters + self.model_path = 
config.get('model_path') + self.max_seq_length = config.get('max_seq_length', 2048) + self.dtype = config.get('dtype', None) + self.load_in_4bit = config.get('load_in_4bit', True) + self.hf_token = config.get('hf_token', None) + + # Inference parameters + self.batch_size = config.get('batch_size', 1) + self.max_new_tokens = config.get('max_new_tokens', 128) + self.temperature = config.get('temperature', 0.8) + self.top_p = config.get('top_p', 0.9) + self.do_sample = config.get('do_sample', True) + + # Alpaca prompt template + self.alpaca_prompt = config.get('alpaca_prompt', """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction + +### Instruction: +{} + +### Input: +{} + +### Response: +{}""") + + # Style instruction + self.style_instruction = config.get('style_instruction', 'Rewrite the following text in a formal style') + + def load_model_and_tokenizer(self): + """Load the trained model and tokenizer""" + logger.info("Loading model and tokenizer...") + + try: + if self.model_path and Path(self.model_path).exists(): + # Load local trained model + logger.info(f"Loading local model from: {self.model_path}") + self.model, self.tokenizer = FastLanguageModel.from_pretrained( + model_name=self.model_path, + max_seq_length=self.max_seq_length, + dtype=self.dtype, + load_in_4bit=self.load_in_4bit, + token=self.hf_token + ) + else: + # Load base model from HuggingFace Hub + logger.info(f"Loading base model: {self.config.get('base_model_name', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit')}") + self.model, self.tokenizer = FastLanguageModel.from_pretrained( + model_name=self.config.get('base_model_name', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'), + max_seq_length=self.max_seq_length, + dtype=self.dtype, + load_in_4bit=self.load_in_4bit, + token=self.hf_token + ) + + # Prepare for inference + FastLanguageModel.for_inference(self.model) + + logger.info(f"✅ Model loaded 
successfully") + logger.info(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}") + + except Exception as e: + logger.error(f"❌ Error loading model: {e}") + raise + + def format_prompt(self, instruction: str, input_text: str, output: str = "") -> str: + """Format the prompt using Alpaca template""" + return self.alpaca_prompt.format(instruction, input_text, output) + + def generate_text(self, prompt: str, max_new_tokens: Optional[int] = None) -> str: + """Generate text from a single prompt""" + try: + # Tokenize input + inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device) + + # Set generation parameters + gen_kwargs = { + "max_new_tokens": max_new_tokens or self.max_new_tokens, + "temperature": self.temperature, + "top_p": self.top_p, + "do_sample": self.do_sample, + "use_cache": True, + "pad_token_id": self.tokenizer.eos_token_id + } + + # Generate + with torch.no_grad(): + outputs = self.model.generate(**inputs, **gen_kwargs) + + # Decode + generated_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] + + # Extract only the generated part (remove input prompt) + if prompt in generated_text: + generated_text = generated_text[len(prompt):].strip() + + return generated_text + + except Exception as e: + logger.error(f"❌ Error generating text: {e}") + return "" + + def style_transfer(self, input_text: str, instruction: Optional[str] = None, streaming: bool = False) -> str: + """Perform style transfer on input text""" + if instruction is None: + instruction = self.style_instruction + + # Format prompt + prompt = self.format_prompt(instruction, input_text, "") + + logger.info(f"Style transfer prompt: {prompt}") + + if streaming: + logger.info("Generating with streaming...") + self.generate_text_streaming(prompt) + return "" + else: + logger.info("Generating text...") + result = self.generate_text(prompt) + logger.info(f"Generated result: {result}") + return result + + def generate_text_streaming(self, prompt: str, 
def load_inference_config(config_path: str) -> Dict[str, Any]:
    """Load the inference settings from a sectioned YAML configuration file.

    Reads the ``model``, ``inference`` and ``data`` sections and flattens the
    values used by ``StylingInference`` into one dictionary. A section that is
    absent contributes nothing; a section that is present but empty, or a key
    that is missing or ``null``, falls back to the documented default.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        Flat mapping of inference parameter name to value.

    Raises:
        ValueError: If the top level of the document is not a mapping.
        Exception: Any I/O or YAML parsing error is logged and re-raised.
    """

    def _pick(section: Dict[str, Any], key: str, default: Any = None) -> Any:
        # Treat an explicit null the same as a missing key.
        value = section.get(key)
        return default if value is None else value

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            # safe_load returns None for an empty document.
            config = yaml.safe_load(f) or {}

        if not isinstance(config, dict):
            raise ValueError(
                f"Top level of YAML config must be a mapping, got {type(config).__name__}"
            )

        # Extract inference configuration
        inference_config = {}

        # Model configuration
        if 'model' in config:
            model_data = config['model'] or {}
            inference_config.update({
                'base_model_name': _pick(model_data, 'training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'),
                'max_seq_length': _pick(model_data, 'training_max_seq_length', 2048),
                'dtype': _pick(model_data, 'training_dtype'),
                'load_in_4bit': _pick(model_data, 'training_load_in_4bit', True),
                'hf_token': _pick(model_data, 'training_token')
            })

        # Inference configuration
        if 'inference' in config:
            inference_data = config['inference'] or {}
            inference_config.update({
                'batch_size': _pick(inference_data, 'batch_size', 1),
                'max_new_tokens': _pick(inference_data, 'max_new_tokens', 128),
                'temperature': _pick(inference_data, 'temperature', 0.8)
            })

        # Style configuration
        if 'data' in config:
            data_config = config['data'] or {}
            inference_config.update({
                'style_instruction': _pick(data_config, 'instruction', 'Rewrite the following text in a formal style')
            })

        return inference_config

    except Exception as e:
        logger.error(f"Error loading inference config: {e}")
        raise
args.temperature + if args.instruction: + inference_config['style_instruction'] = args.instruction + + logger.info("Inference configuration:") + for key, value in inference_config.items(): + logger.info(f" {key}: {value}") + + # Initialize inference + inferencer = StylingInference(inference_config) + + # Load model + inferencer.load_model_and_tokenizer() + + # Run inference based on mode + if args.text: + # Single text inference + logger.info("Running single text inference...") + result = inferencer.style_transfer(args.text, args.instruction, args.streaming) + if not args.streaming: + print(f"\nGenerated text: {result}") + + elif args.input_file: + # Batch file inference + logger.info("Running batch file inference...") + with open(args.input_file, 'r', encoding='utf-8') as f: + input_texts = [line.strip() for line in f if line.strip()] + + results = inferencer.batch_style_transfer(input_texts, args.instruction) + + # Save results + output_file = args.output_file or f"{Path(args.input_file).stem}_styled.txt" + with open(output_file, 'w', encoding='utf-8') as f: + for input_text, result in zip(input_texts, results): + f.write(f"Input: {input_text}\n") + f.write(f"Output: {result}\n") + f.write("-" * 50 + "\n") + + logger.info(f"✅ Results saved to: {output_file}") + + else: + # Interactive mode + logger.info("Entering interactive mode. 
Type 'quit' to exit.") + while True: + try: + user_input = input("\nEnter text to style (or 'quit'): ").strip() + if user_input.lower() == 'quit': + break + + if user_input: + result = inferencer.style_transfer(user_input, args.instruction, args.streaming) + if not args.streaming: + print(f"\nStyled text: {result}") + + except KeyboardInterrupt: + break + except Exception as e: + logger.error(f"Error processing input: {e}") + + logger.info("🎉 Inference completed successfully!") + + except Exception as e: + logger.error(f"❌ Inference failed: {e}") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/pipelines/styling/train.py b/pipelines/styling/train.py new file mode 100644 index 0000000..2afaaf8 --- /dev/null +++ b/pipelines/styling/train.py @@ -0,0 +1,446 @@ +#!/usr/bin/env python3 +""" +Styling Training Pipeline using Unsloth and SFTTrainer +Supports style transfer tasks with LoRA fine-tuning +""" + +import os +import sys +import json +import logging +import argparse +from pathlib import Path +from typing import Dict, Any, Optional +import yaml + +# Add the project root to the path +sys.path.append(str(Path(__file__).parent.parent.parent)) + +from utils.config.config_manager import ConfigManager +#from utils.logging.logging import setup_logging + +# Training imports +import torch +from datasets import load_from_disk, Dataset +from unsloth import FastLanguageModel, is_bfloat16_supported +from trl import SFTTrainer +from transformers import TrainingArguments + +logger = logging.getLogger(__name__) + +class StylingTrainer: + """Styling task trainer using Unsloth and SFTTrainer""" + + def __init__(self, config: Dict[str, Any]): + self.config = config + self.model = None + self.tokenizer = None + self.trainer = None + + # Set device + self.device = "cuda" if torch.cuda.is_available() else "cpu" + logger.info(f"Using device: {self.device}") + + # Training parameters + self.max_seq_length = config.get('max_seq_length', 2048) + self.dtype = 
config.get('dtype', None) + self.load_in_4bit = config.get('load_in_4bit', True) + self.hf_token = config.get('hf_token', None) + + # LoRA parameters + self.lora_r = config.get('lora_r', 16) + self.lora_alpha = config.get('lora_alpha', 16) + self.lora_dropout = config.get('lora_dropout', 0) + self.target_modules = config.get('target_modules', [ + "q_proj", "k_proj", "v_proj", "o_proj", + "gate_proj", "up_proj", "down_proj" + ]) + + # Training arguments + self.batch_size = config.get('batch_size', 2) + self.gradient_accumulation_steps = config.get('gradient_accumulation_steps', 4) + self.learning_rate = config.get('learning_rate', 2e-4) + self.num_epochs = config.get('num_epochs', 1) + self.max_steps = config.get('max_steps', None) + self.warmup_steps = config.get('warmup_steps', 5) + self.weight_decay = config.get('weight_decay', 0.01) + self.seed = config.get('seed', 3407) + + # Output paths + self.output_dir = config.get('output_dir', './outputs') + self.model_output_dir = config.get('model_output_dir', './models/styling') + + def load_model_and_tokenizer(self): + """Load the pre-trained model and tokenizer""" + logger.info("Loading model and tokenizer...") + + try: + self.model, self.tokenizer = FastLanguageModel.from_pretrained( + model_name=self.config['model_name'], + max_seq_length=self.max_seq_length, + dtype=self.dtype, + load_in_4bit=self.load_in_4bit, + token=self.hf_token + ) + + logger.info(f"✅ Model loaded: {self.config['model_name']}") + logger.info(f"✅ Tokenizer loaded with vocab size: {self.tokenizer.vocab_size}") + + except Exception as e: + logger.error(f"❌ Error loading model: {e}") + raise + + def setup_lora(self): + """Setup LoRA for efficient fine-tuning""" + logger.info("Setting up LoRA configuration...") + + try: + self.model = FastLanguageModel.get_peft_model( + self.model, + r=self.lora_r, + target_modules=self.target_modules, + lora_alpha=self.lora_alpha, + lora_dropout=self.lora_dropout, + bias="none", + 
use_gradient_checkpointing="unsloth", + random_state=self.seed, + use_rslora=False, + loftq_config=None + ) + + logger.info(f"✅ LoRA configured with r={self.lora_r}, alpha={self.lora_alpha}") + + except Exception as e: + logger.error(f"❌ Error setting up LoRA: {e}") + raise + + def load_dataset(self, dataset_path: str) -> Dataset: + """Load the training dataset""" + logger.info(f"Loading dataset from: {dataset_path}") + + try: + if Path(dataset_path).exists(): + # Check if it's a HuggingFace dataset directory + if (Path(dataset_path) / "dataset_info.json").exists(): + # Load from HuggingFace dataset directory + dataset = load_from_disk(dataset_path) + logger.info(f"Loaded HuggingFace dataset from disk: {len(dataset)} samples") + else: + # Load from processed data files (JSONL format) + logger.info("Loading from processed data files...") + from datasets import Dataset + import json + + all_data = [] + data_dir = Path(dataset_path) + + # Look for train.jsonl, validation.jsonl, test.jsonl + for split_file in ["train.jsonl", "validation.jsonl", "test.jsonl"]: + file_path = data_dir / split_file + if file_path.exists(): + logger.info(f"Loading {split_file}...") + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + data = json.loads(line) + all_data.append(data) + + if not all_data: + raise ValueError(f"No data found in {dataset_path}") + + # Create HuggingFace dataset + dataset = Dataset.from_list(all_data) + logger.info(f"Created HuggingFace dataset from {len(all_data)} samples") + else: + # Try loading from HuggingFace Hub + logger.info(f"Attempting to load from HuggingFace Hub: {dataset_path}") + dataset = Dataset.load_dataset(dataset_path, split="train") + logger.info(f"Loaded from HuggingFace Hub: {len(dataset)} samples") + + logger.info(f"Dataset loaded: {len(dataset)} samples") + logger.info(f"Dataset features: {dataset.features}") + + # Verify required fields exist + required_fields = ["instruction", "input", "output"] + 
missing_fields = [field for field in required_fields if field not in dataset.features] + if missing_fields: + raise ValueError(f"Missing required fields in dataset: {missing_fields}") + + return dataset + + except Exception as e: + logger.error(f"Error loading dataset: {e}") + raise + + def setup_trainer(self, train_dataset: Dataset): + """Setup the SFTTrainer""" + logger.info("Setting up SFTTrainer...") + + try: + # First, map the dataset to create the text field with EOS token + def formatting_prompts_func(examples): + instructions = examples["instruction"] + inputs = examples["input"] + outputs = examples["output"] + texts = [] + + for instruction, input_text, output in zip(instructions, inputs, outputs): + # Must add EOS_TOKEN, otherwise your generation will go on forever! + alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that follows the instruction + +### Instruction: +{} + +### Input: +{} + +### Response: +{}""" + text = alpaca_prompt.format(instruction, input_text, output) + self.tokenizer.eos_token + texts.append(text) + + return {"text": texts} + + # Apply the formatting function to create the text field + logger.info("Mapping dataset to create text field with EOS token...") + formatted_dataset = train_dataset.map(formatting_prompts_func, batched=True, remove_columns=train_dataset.column_names) + + logger.info(f"Dataset mapped successfully. 
New features: {formatted_dataset.features}") + logger.info(f"Sample text field: {formatted_dataset[0]['text'][:100]}...") + + # Training arguments + training_args = TrainingArguments( + per_device_train_batch_size=self.batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + warmup_steps=self.warmup_steps, + num_train_epochs=self.num_epochs, + max_steps=self.max_steps, + learning_rate=self.learning_rate, + fp16=not is_bfloat16_supported(), + bf16=is_bfloat16_supported(), + logging_steps=1, + optim="adamw_8bit", + weight_decay=self.weight_decay, + lr_scheduler_type="linear", + seed=self.seed, + output_dir=self.output_dir, + report_to="none", # Disable wandb for now + save_strategy="epoch", + save_total_limit=2, + evaluation_strategy="no", # No validation for now + load_best_model_at_end=False, + remove_unused_columns=False, + dataloader_pin_memory=False, + ) + + # Create trainer with the formatted dataset + self.trainer = SFTTrainer( + model=self.model, + tokenizer=self.tokenizer, + train_dataset=formatted_dataset, # Use the formatted dataset + dataset_text_field="text", # The field we just created + max_seq_length=self.max_seq_length, + dataset_num_proc=2, + packing=False, # Can make training 5x faster for short sequences + args=training_args + ) + + logger.info("SFTTrainer configured successfully") + + except Exception as e: + logger.error(f"Error setting up trainer: {e}") + raise + + def train(self, dataset_path: str): + """Run the training process""" + logger.info("🚀 Starting training process...") + + try: + # Load model and tokenizer + self.load_model_and_tokenizer() + + # Setup LoRA + self.setup_lora() + + # Load dataset + train_dataset = self.load_dataset(dataset_path) + + # Setup trainer + self.setup_trainer(train_dataset) + + # Start training + logger.info("Starting training...") + trainer_stats = self.trainer.train() + + logger.info("✅ Training completed successfully!") + logger.info(f"Training stats: {trainer_stats}") + + # Save the 
model + self.save_model() + + return trainer_stats + + except Exception as e: + logger.error(f"❌ Training failed: {e}") + raise + + def save_model(self): + """Save the trained model""" + logger.info("Saving trained model...") + + try: + # Create output directory + Path(self.model_output_dir).mkdir(parents=True, exist_ok=True) + + # Save model and tokenizer + self.model.save_pretrained(self.model_output_dir) + self.tokenizer.save_pretrained(self.model_output_dir) + + # Save training config + config_path = Path(self.model_output_dir) / "training_config.json" + with open(config_path, 'w') as f: + json.dump(self.config, f, indent=2) + + logger.info(f"✅ Model saved to: {self.model_output_dir}") + + except Exception as e: + logger.error(f"❌ Error saving model: {e}") + raise + + def prepare_for_inference(self): + """Prepare model for inference""" + logger.info("Preparing model for inference...") + + try: + FastLanguageModel.for_inference(self.model) + logger.info("✅ Model prepared for inference") + + except Exception as e: + logger.error(f"❌ Error preparing for inference: {e}") + raise + +def load_training_config(config_path: str) -> Dict[str, Any]: + """Load training configuration from YAML file""" + try: + with open(config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + + # Extract training configuration + training_config = {} + + # Model configuration + if 'model' in config: + model_data = config['model'] + training_config.update({ + 'model_name': model_data.get('training_model', 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'), + 'max_seq_length': model_data.get('training_max_seq_length', 2048), + 'dtype': model_data.get('training_dtype'), + 'load_in_4bit': model_data.get('training_load_in_4bit', True), + 'hf_token': model_data.get('training_token') + }) + + # Training configuration + if 'training' in config: + training_data = config['training'] + training_config.update({ + 'num_epochs': training_data.get('num_epochs', 3), + 'batch_size': 
def main():
    """Run the styling training pipeline from the command line.

    Parses the CLI arguments, loads the YAML training configuration with
    ``load_training_config``, applies any CLI overrides, resolves the
    dataset path (``--dataset`` wins over the YAML-derived ``dataset_path``)
    and runs ``StylingTrainer.train``.

    Exits with status 1 when no dataset path can be resolved or when any
    step raises.
    """
    parser = argparse.ArgumentParser(description="Styling Training Pipeline")

    # Configuration
    parser.add_argument("--config", type=str, required=True, help="Path to YAML configuration file")
    parser.add_argument("--dataset", type=str, help="Path to training dataset (HF dataset path or local path)")
    parser.add_argument("--output-dir", type=str, help="Output directory for model")
    parser.add_argument("--epochs", type=int, help="Number of training epochs")
    parser.add_argument("--batch-size", type=int, help="Training batch size")
    parser.add_argument("--learning-rate", type=float, help="Learning rate")
    parser.add_argument("--max-steps", type=int, help="Maximum training steps")

    args = parser.parse_args()

    # NOTE(review): setup_logging() is deliberately not called here (it was
    # disabled upstream); confirm the module-level logger is configured
    # elsewhere, otherwise the INFO messages below will not be shown.

    try:
        # Load configuration
        logger.info(f"Loading configuration from: {args.config}")
        training_config = load_training_config(args.config)

        # Override with CLI arguments.  A falsy value (option omitted, or an
        # explicit 0) keeps the value that came from the YAML file.
        cli_overrides = {
            'model_output_dir': args.output_dir,
            'num_epochs': args.epochs,
            'batch_size': args.batch_size,
            'learning_rate': args.learning_rate,
            'max_steps': args.max_steps,
        }
        for key, value in cli_overrides.items():
            if value:
                training_config[key] = value

        # Determine dataset path: CLI argument takes precedence, then YAML config
        dataset_path = args.dataset or training_config.get('dataset_path')
        if not dataset_path:
            logger.error("No dataset path provided. Use --dataset or ensure output_dir is set in YAML config.")
            sys.exit(1)

        logger.info("Training configuration:")
        for key, value in training_config.items():
            # Never write the access token to the log; only show that one is set.
            shown = "***" if key == 'hf_token' and value else value
            logger.info(f" {key}: {shown}")
        logger.info(f" Dataset path: {dataset_path}")

        # Initialize trainer
        trainer = StylingTrainer(training_config)

        # Start training
        trainer.train(dataset_path)

        logger.info("Training completed successfully!")

    except Exception as e:
        # Top-level boundary: report and turn any failure into a non-zero exit.
        logger.error(f"Training failed: {e}")
        sys.exit(1)
def run_with_yaml_config(config_path: str, **cli_overrides):
    """Run the styling data-processor pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration file, passed as
            ``--config``.
        **cli_overrides: Extra pipeline options.  ``some_name=value`` becomes
            ``--some-name value``; ``None`` values are skipped.  Boolean
            values are treated as switches: ``True`` emits the bare flag and
            ``False`` emits nothing, matching the ``store_true`` options
            declared in ``handle_direct_args``.

    Returns:
        True when the pipeline process exits with status 0, False when it
        exits non-zero or cannot be started.
    """
    print(f"=== Running Styling Data Processor with YAML config: {config_path} ===")

    # Use the interpreter running this script so the child sees the same
    # environment (virtualenv, installed packages) instead of whatever
    # "python" happens to resolve to on PATH.
    cmd = [
        sys.executable or "python", "pipelines/styling/data_processor.py",
        "--config", config_path
    ]

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None:
            continue
        flag = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # "--flag True" would be rejected by a store_true option.
            if value:
                cmd.append(flag)
        else:
            cmd.extend([flag, str(value)])

    print(f"Running command: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running styling data processor: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Error running styling data processor: {e}")
        return False

    print("✅ Styling data processing completed successfully!")
    print(result.stdout)
    return True
run_with_yaml_config( + "configs/styling/formal.yaml", + max_samples=1000, # Override YAML value + output_format="alpaca" + ) + + if success: + print("✅ Formal style transfer completed!") + + # Example 2: Custom styling dataset (if available) + print("\n=== Example 2: Custom Styling Dataset ===") + if os.path.exists("data/raw/styling/custom_dataset.jsonl"): + success = run_with_yaml_config( + "configs/styling/formal.yaml", # Use formal config as base + data_source="custom", + data_path="data/raw/styling/custom_dataset.jsonl", + instruction="Rewrite the following text in a casual, friendly style", + output_dir="./data/processed/styling/casual" + ) + if success: + print("✅ Custom styling dataset processing completed!") + else: + print("⚠️ Custom styling dataset not found, skipping...") + print(" You can create one with the 'create-sample-data' option") + +def create_sample_styling_data(): + """Create sample styling dataset for testing""" + sample_data = [ + { + "text": "Hey, what's up? How are you doing today?", + "styled_text": "Hello, how are you doing today?" + }, + { + "text": "This is really cool stuff!", + "styled_text": "This is quite impressive material." + }, + { + "text": "I'm gonna go to the store later.", + "styled_text": "I will go to the store later." + }, + { + "text": "What's the deal with this?", + "styled_text": "What is the situation regarding this matter?" + }, + { + "text": "That's totally awesome!", + "styled_text": "That is quite remarkable!" 
def create_custom_styling_config():
    """Write a sample "professional business style" YAML config.

    The file is written to ``configs/styling/professional.yaml`` relative to
    the current working directory (parent directories are created) and
    overwrites any existing file at that path.

    The learning rate is written as ``3.0e-5`` rather than ``3e-5``: PyYAML
    follows YAML 1.1, where a float needs a decimal point, so ``3e-5`` would
    be loaded as the *string* ``"3e-5"`` by ``yaml.safe_load``.
    """
    custom_config = """task:
  name: "styling"
  type: "style_transfer"

data:
  source: "custom"
  input_field: "text"
  output_field: "styled_text"
  instruction: "Rewrite the following text in a professional business style"
  data_format: "jsonl"
  max_length: 512
  min_length: 10
  clean_text: true
  lowercase: false
  train_split: 0.8
  validation_split: 0.1
  test_split: 0.1
  output_format: "alpaca"
  output_dir: "./data/processed/styling/professional"

model:
  name: "t5-base"
  max_length: 512

training:
  num_epochs: 3
  batch_size: 16
  learning_rate: 3.0e-5
  weight_decay: 0.01
  warmup_ratio: 0.1
  lr_scheduler_type: "linear"

inference:
  batch_size: 32
  max_new_tokens: 128
  temperature: 0.8
"""

    config_path = "configs/styling/professional.yaml"
    os.makedirs(os.path.dirname(config_path), exist_ok=True)

    # Explicit encoding keeps the output independent of the platform locale.
    with open(config_path, 'w', encoding='utf-8') as f:
        f.write(custom_config)

    print(f"✅ Created custom styling config: {config_path}")
    print(" This config is set up for professional business style transfer")
configuration file") + parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source") + parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name") + parser.add_argument("--data-path", type=str, help="Path to custom data file") + parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format") + parser.add_argument("--input-field", type=str, help="Input field name") + parser.add_argument("--output-field", type=str, help="Output field name") + parser.add_argument("--instruction", type=str, help="Style instruction") + parser.add_argument("--max-samples", type=int, help="Maximum samples to process") + parser.add_argument("--train-split", type=float, help="Training split ratio") + parser.add_argument("--validation-split", type=float, help="Validation split ratio") + parser.add_argument("--test-split", type=float, help="Test split ratio") + parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text") + parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters") + parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase") + parser.add_argument("--min-length", type=int, help="Minimum text length") + parser.add_argument("--max-length", type=int, help="Maximum text length") + parser.add_argument("--output-format", choices=["styling", "alpaca"], help="Output format") + parser.add_argument("--output-dir", type=str, help="Output directory") + + # HuggingFace dataset options + parser.add_argument("--create-hf-dataset", action="store_true", help="Create HuggingFace dataset") + parser.add_argument("--hf-dataset-path", type=str, help="Path to save HuggingFace dataset") + + # Logging + parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level") + + args = parser.parse_args() + + # Build command to call the styling pipeline + cmd = ["python", 
"pipelines/styling/data_processor.py"] + + # Add all arguments that were provided + for arg_name, arg_value in vars(args).items(): + if arg_value is not None: + if isinstance(arg_value, bool): + if arg_value: # Only add flag if True + cmd.append(f"--{arg_name.replace('_', '-')}") + else: + cmd.extend([f"--{arg_name.replace('_', '-')}", str(arg_value)]) + + print(f"Running: {' '.join(cmd)}") + print() + + try: + result = subprocess.run(cmd, check=True, capture_output=True, text=True) + print("✅ Styling data processing completed successfully!") + print(result.stdout) + return True + except subprocess.CalledProcessError as e: + print(f"❌ Error running styling data processor: {e}") + print(f"Error output: {e.stderr}") + return False + +def show_styling_features(): + """Show the features of the styling data processor""" + print("=== Styling Data Processor Features ===") + print() + print("1. **Style Transfer Tasks**:") + print(" - Formal vs. Informal style") + print(" - Professional vs. Casual tone") + print(" - Academic vs. Conversational") + print(" - Any custom style instruction") + print() + print("2. **Data Formats Supported**:") + print(" - HuggingFace datasets") + print(" - Custom JSONL/CSV/JSON files") + print(" - Automatic train/validation/test splits") + print() + print("3. **Output Formats**:") + print(" - Raw styling format (input/output)") + print(" - Alpaca format (instruction/input/output)") + print(" - HuggingFace dataset format") + print() + print("4. **Advanced Features**:") + print(" - Configurable field mapping") + print(" - Text preprocessing options") + print(" - Automatic dataset saving/loading") + print(" - YAML configuration support") + print() + print("=== Usage Examples ===") + print() + print("1. Use YAML config only:") + print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml") + print() + print("2. 
Override YAML values:") + print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml --max-samples 500") + print() + print("3. Create sample data:") + print(" python scripts/styling/data_processor.py create-sample-data") + print() + print("4. Create custom config:") + print(" python scripts/styling/data_processor.py create-config") + +def main(): + """Main function""" + if len(sys.argv) > 1: + # Check if it's a subcommand + if sys.argv[1] in ["examples", "create-sample-data", "create-config", "features"]: + # Handle subcommands + if sys.argv[1] == "examples": + run_styling_examples() + elif sys.argv[1] == "create-sample-data": + create_sample_styling_data() + elif sys.argv[1] == "create-config": + create_custom_styling_config() + elif sys.argv[1] == "features": + show_styling_features() + else: + # Handle direct arguments (pass through to pipeline) + handle_direct_args() + else: + print("Styling Data Processor") + print("=====================") + print() + print("This script runs the styling data processor for style transfer tasks.") + print("It supports both YAML configurations and command-line overrides.") + print() + print("Usage:") + print(" python scripts/styling/data_processor.py examples # Run examples") + print(" python scripts/styling/data_processor.py create-sample-data # Create sample dataset") + print(" python scripts/styling/data_processor.py create-config # Create custom config") + print(" python scripts/styling/data_processor.py features # Show features") + print() + print("Direct pipeline usage:") + print(" python scripts/styling/data_processor.py --config configs/styling/formal.yaml") + print(" python scripts/styling/data_processor.py --data-source custom --data-path ./data.jsonl") + print() + print("Key Features:") + print(" ✅ Style transfer with custom instructions") + print(" ✅ Multiple data source support") + print(" ✅ YAML configuration files") + print(" ✅ CLI argument overrides") + print(" ✅ Automatic data 
def run_inference_with_config(config_path: str, **cli_overrides):
    """Run the styling inference pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration file, passed as
            ``--config``.
        **cli_overrides: Optional pipeline options.  Recognised keys are
            ``model_path``, ``text``, ``input_file``, ``max_tokens``,
            ``temperature``, ``instruction`` and ``output_file`` (each
            forwarded as ``--<key-with-dashes> <value>``) and ``streaming``
            (a switch: ``--streaming`` is added only when truthy).  ``None``
            values and unrecognised keys are ignored.

    Returns:
        True when the pipeline process exits with status 0, False when it
        exits non-zero or cannot be started.
    """
    print(f"🚀 Starting styling inference with config: {config_path}")
    print()

    # Build command with the interpreter running this script, so the child
    # uses the same environment rather than whatever "python" is on PATH.
    cmd = [sys.executable or "python", "pipelines/styling/inference.py", "--config", config_path]

    value_flags = {
        "model_path": "--model-path",
        "text": "--text",
        "input_file": "--input-file",
        "max_tokens": "--max-tokens",
        "temperature": "--temperature",
        "instruction": "--instruction",
        "output_file": "--output-file",
    }

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is None:
            continue
        if key == "streaming":
            # argparse's store_true default is False, not None, so the flag
            # must depend on the value itself.
            if value:
                cmd.append("--streaming")
        elif key in value_flags:
            cmd.extend([value_flags[key], str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Inference failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"❌ Inference failed: {e}")
        return False

    print("✅ Inference completed successfully!")
    print(result.stdout)
    return True
def create_inference_example():
    """Demonstrate single-text inference with the formal style config.

    Looks for ``configs/styling/formal.yaml``; when it is absent the
    function reports that and returns False without running anything.
    Otherwise it runs ``run_inference_with_config`` on one sample sentence
    and returns that call's result.
    """
    print("=== Inference Example: Formal Style Transfer ===")
    print()

    # The example only works once the formal-style configuration exists.
    config_path = "configs/styling/formal.yaml"
    config_present = Path(config_path).exists()

    if config_present:
        print("✅ Found configuration file!")
        print(f" Config: {config_path}")
        print()
    else:
        print(f"❌ Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    # Sample informal sentence to be rewritten.
    example_text = "Hey, what's up? I'm gonna go grab some food later."
    print(f"📝 Example text: {example_text}")
    print()

    request = {
        "config_path": config_path,
        "text": example_text,
        "instruction": "Rewrite the following text in a formal style",
    }
    success = run_inference_with_config(**request)

    if success:
        print("🎉 Inference example completed!")
    return success
def run_batch_inference_example():
    """Demonstrate file-based (batch) inference with the formal style config.

    First creates the sample input file via ``create_test_file``, then
    checks for ``configs/styling/formal.yaml``.  Returns False when the
    configuration is missing; otherwise returns the result of
    ``run_inference_with_config``, which is asked to write its results to
    ``styled_results.txt``.
    """
    print("=== Batch Inference Example ===")
    print()

    # The sample input is generated before the config check, so it exists
    # even when the example cannot proceed.
    input_path = create_test_file()

    config_path = "configs/styling/formal.yaml"
    if not Path(config_path).exists():
        print(f"❌ Configuration file not found: {config_path}")
        return False

    print("✅ Running batch inference...")
    print()

    request = {
        "config_path": config_path,
        "input_file": input_path,
        "output_file": "styled_results.txt",
        "instruction": "Rewrite the following text in a formal style",
    }
    success = run_inference_with_config(**request)

    if success:
        for line in ("🎉 Batch inference completed!",
                     " Results saved to: styled_results.txt"):
            print(line)

    return success
def run_training_with_config(config_path: str, dataset_path: str = None, **cli_overrides):
    """Run the styling training pipeline with a YAML configuration.

    Args:
        config_path: Path to the YAML configuration file, passed as
            ``--config``.
        dataset_path: Optional dataset location, passed as ``--dataset``.
            When omitted, the pipeline falls back to the ``output_dir`` from
            the YAML config.
        **cli_overrides: Optional pipeline options.  Recognised keys are
            ``output_dir``, ``epochs``, ``batch_size``, ``learning_rate``
            and ``max_steps``; ``None`` values and unrecognised keys are
            ignored.

    Returns:
        True when the pipeline process exits with status 0, False when it
        exits non-zero or cannot be started.
    """
    print(f"Starting styling training with config: {config_path}")
    if dataset_path:
        print(f"Training dataset: {dataset_path}")
    else:
        print("Training dataset: Will use output_dir from YAML config")
    print()

    # Build command with the interpreter running this script, so training
    # uses the same environment rather than whatever "python" is on PATH.
    cmd = [sys.executable or "python", "pipelines/styling/train.py", "--config", config_path]

    # Add dataset path if provided
    if dataset_path:
        cmd.extend(["--dataset", dataset_path])

    value_flags = {
        "output_dir": "--output-dir",
        "epochs": "--epochs",
        "batch_size": "--batch-size",
        "learning_rate": "--learning-rate",
        "max_steps": "--max-steps",
    }

    # Add CLI overrides
    for key, value in cli_overrides.items():
        if value is not None and key in value_flags:
            cmd.extend([value_flags[key], str(value)])

    print(f"Running: {' '.join(cmd)}")
    print()

    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Training failed: {e}")
        print(f"Error output: {e.stderr}")
        return False
    except OSError as e:
        # The interpreter itself could not be launched.
        print(f"Training failed: {e}")
        return False

    print("Training completed successfully!")
    print(result.stdout)
    return True
def create_training_example():
    """Demonstrate a short training run with the formal style config.

    Looks for ``configs/styling/formal.yaml``; when it is absent the
    function reports that and returns False.  Otherwise it starts a
    one-epoch run through ``run_training_with_config`` (no explicit dataset
    path, batch size 2, learning rate 2e-4) and returns that call's result.
    """
    print("=== Training Example: Formal Style Transfer ===")
    print()

    # The example only works once the formal-style configuration exists.
    config_path = "configs/styling/formal.yaml"
    if not Path(config_path).exists():
        print(f"Configuration file not found: {config_path}")
        print(" Please run the data processor first to create the configuration")
        return False

    overview = (
        "Found required files!",
        f" Config: {config_path}",
        " Dataset: Will use output_dir from YAML config",
        " The training pipeline will automatically:",
        " - Load data from the output_dir specified in YAML",
        " - Convert JSONL files to HuggingFace dataset format",
        " - Apply formatting with EOS tokens",
        " - Train the model using SFTTrainer",
    )
    for line in overview:
        print(line)
    print()

    # dataset_path=None makes the pipeline use output_dir from the YAML config.
    success = run_training_with_config(
        config_path=config_path,
        dataset_path=None,
        epochs=1,
        batch_size=2,
        learning_rate=2e-4,
    )

    if success:
        for line in ("Training example completed!",
                     " Model saved to: ./models/styling",
                     " Ready for inference!"):
            print(line)

    return success
type=int, help="Training batch size") + parser.add_argument("--learning-rate", type=float, help="Learning rate") + parser.add_argument("--max-steps", type=int, help="Maximum training steps") + + args = parser.parse_args() + + if args.command == "features": + show_training_features() + + elif args.command == "example": + create_training_example() + + elif args.command == "train": + if not args.config: + print("❌ --config is required for training") + print("Usage: python scripts/styling/train.py train --config config.yaml") + sys.exit(1) + + # If dataset is not provided, try to use output_dir from config + dataset_path = args.dataset if args.dataset else None + + success = run_training_with_config( + config_path=args.config, + dataset_path=dataset_path, + output_dir=args.output_dir, + epochs=args.epochs, + batch_size=args.batch_size, + learning_rate=args.learning_rate, + max_steps=args.max_steps + ) + + if not success: + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/test.py b/test.py new file mode 100644 index 0000000..4743b82 --- /dev/null +++ b/test.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +Test script for the styling data processor +""" + +import sys +import os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config + +def test_styling_pipeline(): + """Test the styling data processor with custom data""" + + print("Testing Styling Data Processor") + print("=" * 50) + + # Initialize the pipeline + pipeline = StylingDataPipeline() + + # Example 1: Load configuration from YAML + print("\n1. 
Loading configuration from YAML...") + try: + yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml") + print(f" ✅ YAML config loaded successfully!") + print(f" Output directory: {yaml_config.output_dir}") + print(f" Instruction: {yaml_config.instruction}") + print(f" Input field: {yaml_config.input_field}") + print(f" Output field: {yaml_config.output_field}") + except Exception as e: + print(f" ❌ Error loading YAML config: {e}") + yaml_config = None + + # Example 2: Create custom dataset configuration + print("\n2. Creating custom dataset configuration...") + custom_config = create_custom_config( + data_path="./data/raw/styling/formal_dataset.jsonl", + data_format="jsonl", + input_field="text", + output_field="styled_text", + instruction="Rewrite the following text in a formal style", + max_samples=1000, + min_length=10, + max_length=256, + clean_text=True, + lowercase=False, + output_format="alpaca" + ) + + print(f" Input field: {custom_config.input_field} (maps to 'input')") + print(f" Output field: {custom_config.output_field} (maps to 'output')") + print(f" Instruction: {custom_config.instruction}") + print(f" Max samples: {custom_config.max_samples}") + + # Example 3: Test with sample data (if available) + print("\n3. Testing pipeline with sample data...") + + # Create a sample dataset for testing + sample_data = [ + { + "input": "Hey, what's up? How are you doing today?", + "output": "Hello, how are you doing today?" + }, + { + "input": "This is really cool stuff!", + "output": "This is quite impressive material." + }, + { + "input": "I'm gonna go to the store later.", + "output": "I will go to the store later." 
+ } + ] + + # Save sample data to test file + import json + test_file = "./data/raw/styling/test_formal.jsonl" + os.makedirs(os.path.dirname(test_file), exist_ok=True) + + with open(test_file, 'w', encoding='utf-8') as f: + for item in sample_data: + f.write(json.dumps(item, ensure_ascii=False) + '\n') + + print(f" Created test file: {test_file}") + + # Test the pipeline with the sample data + try: + test_config = create_custom_config( + data_path=test_file, + data_format="jsonl", + input_field="input", + output_field="output", + instruction="Rewrite the following text in a formal style", + max_samples=10, + output_format="alpaca" + ) + + print(" Running pipeline...") + result = pipeline.run_pipeline(test_config, output_format="alpaca", save_splits=True, create_hf_dataset=True, save_hf_dataset=True) + + print(" ✅ Pipeline completed successfully!") + print(f" Total samples: {result['analysis']['overall']['total_samples']}") + print(f" Split sizes: {result['analysis']['overall']['split_sizes']}") + print(f" Output directory: {result['output_dir']}") + + # Show HuggingFace dataset info if created + if 'hf_dataset' in result: + hf_dataset = result['hf_dataset'] + print(f" HuggingFace dataset created with {len(hf_dataset)} entries") + print(f" Dataset features: {hf_dataset.features}") + + # Show save path if saved to disk + if 'hf_dataset_path' in result: + print(f" Dataset saved to: {result['hf_dataset_path']}") + + # Show formatted example + if len(hf_dataset) > 0: + print(f" Example formatted text:") + print(f" {hf_dataset[0]['text'][:200]}...") + + # Show sample processed data + print("\n Sample processed data:") + for split_name, split_data in result['data'].items(): + if split_data: + print(f" {split_name} split:") + for i, item in enumerate(split_data[:2]): # Show first 2 items + print(f" Item {i+1}:") + print(f" Instruction: {item['instruction']}") + print(f" Input: {item['input'][:50]}...") + print(f" Output: {item['output'][:50]}...") + break + + except 
def test_hf_dataset_save_load():
    """Round-trip a tiny styling dataset through the save/load helpers.

    Builds two instruction/input/output records, converts them with
    ``StylingDataPipeline.convert_to_hf_dataset``, saves the result under
    ``./data/processed/styling/test_hf_dataset`` and loads it back, printing
    progress along the way.

    Returns:
        The converted (in-memory) dataset, whether or not the save/load
        steps succeeded.
    """
    print("\nTesting HuggingFace Dataset Save/Load")
    print("=" * 50)

    from pipelines.styling.data_processor import save_hf_dataset_to_disk, load_hf_dataset_from_disk

    # Two minimal records sharing one instruction.
    records = [
        {
            "instruction": "Rewrite in formal style",
            "input": "Hey, what's up?",
            "output": "Hello, how are you?"
        },
        {
            "instruction": "Rewrite in formal style",
            "input": "This is really cool!",
            "output": "This is quite impressive."
        }
    ]

    # Test configuration
    config = create_custom_config(
        data_path="dummy",
        instruction="Rewrite in formal style"
    )

    # Convert to HuggingFace dataset
    hf_dataset = StylingDataPipeline().convert_to_hf_dataset(records, config)
    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")

    # Save step
    save_path = "./data/processed/styling/test_hf_dataset"
    print(f"\nSaving dataset to: {save_path}")
    if not save_hf_dataset_to_disk(hf_dataset, save_path):
        print("❌ Failed to save dataset")
        return hf_dataset
    print("✅ Dataset saved successfully!")

    # Load step
    print(f"\nLoading dataset from: {save_path}")
    reloaded = load_hf_dataset_from_disk(save_path)
    if reloaded is None:
        print("❌ Failed to load dataset")
        return hf_dataset

    print("✅ Dataset loaded successfully!")
    print(f"Loaded dataset has {len(reloaded)} entries")
    print(f"Features: {reloaded.features}")

    # Show sample data
    print("\nSample loaded data:")
    for position in range(len(reloaded)):
        print(f" Entry {position+1}: {reloaded[position]['text'][:100]}...")

    return hf_dataset
print("=" * 50) + + pipeline = StylingDataPipeline() + + # Sample data with instruction field + sample_data = [ + { + "instruction": "Rewrite in formal style", + "input": "Hey, what's up?", + "output": "Hello, how are you?" + }, + { + "instruction": "Rewrite in formal style", + "input": "This is really cool!", + "output": "This is quite impressive." + } + ] + + # Test configuration + config = create_custom_config( + data_path="dummy", + instruction="Rewrite in formal style" + ) + + # Convert to HuggingFace dataset + hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config) + + print(f"HuggingFace dataset created with {len(hf_dataset)} entries") + print(f"Dataset features: {hf_dataset.features}") + + # Show formatted examples + print("\nFormatted examples:") + for i in range(len(hf_dataset)): + print(f" Example {i+1}:") + print(f" {hf_dataset[i]['text'][:150]}...") + print() + + # Test the dataset can be used for training + print("Dataset ready for training!") + print(f"Number of training examples: {len(hf_dataset)}") + + return hf_dataset + + +if __name__ == "__main__": + test_styling_pipeline() + # test_hf_dataset_save_load() + # test_hf_dataset_conversion() diff --git a/test.readme b/test.readme new file mode 100644 index 0000000..e69de29