initial setup

2025-08-06 22:45:37 +01:00
commit fef3f5ae35
42 changed files with 7147 additions and 0 deletions
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Classification Data Processor Script
+Uses YAML configurations for flexible and maintainable data processing.
+"""
+
+import sys
+import os
+import subprocess
+import argparse
+from pathlib import Path
+
+def run_with_yaml_config(config_path: str, **cli_overrides):
+    """Run the classification data-processor pipeline with a YAML config.
+
+    The pipeline script is started as a child process with the interpreter
+    that is running this script, so it does not depend on a bare ``python``
+    executable being on PATH (this script's own shebang is ``python3``).
+
+    Args:
+        config_path: Path to the YAML configuration file, passed as ``--config``.
+        **cli_overrides: Extra pipeline options. Each key becomes a
+            ``--dashed-name`` flag followed by ``str(value)``; entries whose
+            value is ``None`` are skipped.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    print("=== Running Classification Data Processor ===")
+    print(f"Config: {config_path}")
+
+    cmd = [
+        # sys.executable can be empty (e.g. embedded interpreters); fall back
+        # to the previous behaviour of resolving "python" from PATH.
+        sys.executable or "python",
+        "pipelines/classification/data_processor.py",
+        "--config", config_path,
+    ]
+
+    # Add CLI overrides
+    for key, value in cli_overrides.items():
+        if value is not None:
+            cmd.extend([f"--{key.replace('_', '-')}", str(value)])
+
+    print(f"Command: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running data processor: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all (e.g. interpreter
+        # not found); report it instead of crashing with a traceback.
+        print(f"❌ Error running data processor: {e}")
+        return False
+
+    print("✅ Data processing completed successfully!")
+    print(result.stdout)
+    return True
+
+def run_emotion_example():
+    """Run the emotion classification data-processing example.
+
+    Uses ``configs/classification/emotion.yaml`` and overrides the sample
+    limit and output directory on the command line.
+
+    Returns:
+        The success flag reported by ``run_with_yaml_config`` so callers can
+        react to a failed run.
+    """
+    print("=== Emotion Classification Example ===")
+
+    success = run_with_yaml_config(
+        "configs/classification/emotion.yaml",
+        max_samples=500,  # Override YAML value
+        output_dir="./data/emotion_small",
+    )
+
+    if success:
+        print("✅ Emotion classification data processing completed!")
+    else:
+        print("❌ Emotion classification failed!")
+
+    return success
+
+def run_custom_example():
+    """Run the custom-dataset data-processing example.
+
+    The example only runs when ``data/classification/train.jsonl`` exists;
+    otherwise it is skipped with a warning.
+
+    Returns:
+        The success flag reported by ``run_with_yaml_config``, or ``None``
+        when the example was skipped because the dataset is missing.
+    """
+    print("\n=== Custom Dataset Example ===")
+
+    data_path = "data/classification/train.jsonl"
+    if not os.path.exists(data_path):
+        print("⚠️  Custom dataset not found, skipping...")
+        return None
+
+    success = run_with_yaml_config(
+        "configs/classification/custom.yaml",
+        data_source="custom",
+        data_path=data_path,
+        output_dir="./data/custom_processed",
+    )
+    if success:
+        print("✅ Custom dataset processing completed!")
+    else:
+        print("❌ Custom dataset processing failed!")
+
+    return success
+
+def create_custom_config():
+    """Write a template custom-dataset config to configs/classification/custom.yaml.
+
+    The parent directory is created if it does not exist yet, so the command
+    also works in a fresh checkout. An existing file is overwritten.
+    """
+    custom_config = """task:
+  name: "classification"
+  type: "sequence_classification"
+
+data:
+  source: "custom"
+  data_format: "jsonl"
+  input_field: "text"
+  label_field: "label"
+  max_samples: 1000
+  train_split: 0.8
+  validation_split: 0.1
+  test_split: 0.1
+
+processing:
+  clean_text: true
+  lowercase: true
+  min_length: 10
+  max_length: 1000
+
+output:
+  output_dir: "./data/custom_processed"
+  output_format: "classification"
+"""
+
+    config_path = "configs/classification/custom.yaml"
+    target = Path(config_path)
+    # Without this, open() raises FileNotFoundError when configs/ is missing.
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding keeps the file identical across platforms.
+    with open(target, 'w', encoding="utf-8") as f:
+        f.write(custom_config)
+
+    print(f"✅ Created custom config: {config_path}")
+
+def show_usage():
+    """Print numbered example invocations of the data processor script."""
+    usage_lines = (
+        "=== Classification Data Processor Usage ===",
+        "",
+        "1. Use YAML config only:",
+        "   python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
+        "",
+        "2. Override YAML values:",
+        "   python scripts/classification/data_processor.py --config configs/classification/emotion.yaml --max-samples 500",
+        "",
+        "3. Use CLI only (backward compatibility):",
+        "   python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
+        "",
+        "4. Run examples:",
+        "   python scripts/classification/data_processor.py examples",
+        "",
+        "5. Create custom config:",
+        "   python scripts/classification/data_processor.py create-config",
+    )
+    # One write instead of one print per line; the emitted text is the same.
+    print("\n".join(usage_lines))
+
+def handle_direct_args():
+    """Parse pipeline options from the command line and forward them.
+
+    The options are parsed from ``sys.argv`` and re-emitted as arguments to
+    ``pipelines/classification/data_processor.py``, which is run with the same
+    interpreter as this script. Options that were not given (value ``None``)
+    and boolean flags that were not set are omitted; ``--log-level`` has a
+    default of ``INFO`` and is therefore always forwarded.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Classification Data Processor")
+
+    # Add all the same arguments as the pipeline
+    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
+    parser.add_argument("--data-source", choices=["huggingface", "custom"], help="Data source")
+    parser.add_argument("--dataset-name", type=str, help="HuggingFace dataset name")
+    parser.add_argument("--data-path", type=str, help="Path to custom data file")
+    parser.add_argument("--data-format", choices=["jsonl", "csv", "json"], help="Data format")
+    parser.add_argument("--input-field", type=str, help="Input field name")
+    parser.add_argument("--label-field", type=str, help="Label field name")
+    parser.add_argument("--id-field", type=str, help="Optional ID field name")
+    parser.add_argument("--max-samples", type=int, help="Maximum samples to process")
+    parser.add_argument("--train-split", type=float, help="Training split ratio")
+    parser.add_argument("--validation-split", type=float, help="Validation split ratio")
+    parser.add_argument("--test-split", type=float, help="Test split ratio")
+    parser.add_argument("--clean-text", action="store_true", help="Clean and normalize text")
+    parser.add_argument("--remove-special-chars", action="store_true", help="Remove special characters")
+    parser.add_argument("--lowercase", action="store_true", help="Convert text to lowercase")
+    parser.add_argument("--min-length", type=int, help="Minimum text length")
+    parser.add_argument("--max-length", type=int, help="Maximum text length")
+    parser.add_argument("--output-format", choices=["classification", "instruction", "conversation", "qa"], help="Output format")
+    parser.add_argument("--output-dir", type=str, help="Output directory")
+    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Use the running interpreter rather than whatever "python" is on PATH;
+    # fall back to "python" only if sys.executable is unavailable.
+    cmd = [sys.executable or "python", "pipelines/classification/data_processor.py"]
+
+    # Add all arguments that were provided
+    for arg_name, arg_value in vars(args).items():
+        if arg_value is None or arg_value is False:
+            continue  # option not given / store_true flag not set
+        flag = f"--{arg_name.replace('_', '-')}"
+        if arg_value is True:
+            cmd.append(flag)
+        else:
+            cmd.extend([flag, str(arg_value)])
+
+    print(f"Running: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running data processor: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all.
+        print(f"❌ Error running data processor: {e}")
+        return False
+
+    print("✅ Data processing completed successfully!")
+    print(result.stdout)
+    return True
+
+def main():
+    """Entry point: dispatch a subcommand or forward flags to the pipeline.
+
+    With no arguments an overview is printed. If the first argument is one of
+    the known subcommands, the matching handler(s) run; anything else is
+    treated as direct pipeline options and handed to ``handle_direct_args``.
+
+    The process exits with status 1 when a handler explicitly reports failure
+    by returning ``False``; handlers that return ``None`` count as successful.
+    """
+    if len(sys.argv) <= 1:
+        overview = (
+            "Classification Data Processor",
+            "============================",
+            "",
+            "This script processes classification datasets using YAML configurations.",
+            "",
+            "Usage:",
+            "  python scripts/classification/data_processor.py examples     # Run examples",
+            "  python scripts/classification/data_processor.py emotion      # Run emotion example",
+            "  python scripts/classification/data_processor.py custom       # Run custom example",
+            "  python scripts/classification/data_processor.py create-config # Create custom config",
+            "  python scripts/classification/data_processor.py help         # Show usage",
+            "",
+            "Direct pipeline usage:",
+            "  python scripts/classification/data_processor.py --config configs/classification/emotion.yaml",
+            "  python scripts/classification/data_processor.py --data-source huggingface --dataset-name dair-ai/emotion",
+            "",
+            "Benefits of YAML configurations:",
+            "  ✅ Easier to manage complex configurations",
+            "  ✅ Version control friendly",
+            "  ✅ Self-documenting",
+            "  ✅ Can still override with CLI args",
+            "  ✅ Better for team collaboration",
+        )
+        print("\n".join(overview))
+        return
+
+    subcommands = {
+        "examples": (run_emotion_example, run_custom_example),
+        "emotion": (run_emotion_example,),
+        "custom": (run_custom_example,),
+        "create-config": (create_custom_config,),
+        "help": (show_usage,),
+    }
+    # Anything that is not a known subcommand is passed through to the pipeline.
+    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))
+
+    # Run every handler (no short-circuit) so "examples" still runs all steps.
+    results = [handler() for handler in handlers]
+    if any(result is False for result in results):
+        # Surface the failure to the shell / CI instead of always exiting 0.
+        sys.exit(1)
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main() 
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+"""
+Classification Inference Script
+Uses YAML configurations for flexible and maintainable model inference.
+"""
+
+import sys
+import os
+import subprocess
+import argparse
+from pathlib import Path
+
+def run_with_yaml_config(config_path: str, **cli_overrides):
+    """Run the classification inference pipeline with a YAML config.
+
+    The pipeline script is started as a child process with the interpreter
+    that is running this script, so it does not depend on a bare ``python``
+    executable being on PATH (this script's own shebang is ``python3``).
+    The child's output is captured and printed only after it has exited.
+
+    Args:
+        config_path: Path to the YAML configuration file, passed as ``--config``.
+        **cli_overrides: Extra pipeline options. Each key becomes a
+            ``--dashed-name`` flag followed by ``str(value)``; entries whose
+            value is ``None`` are skipped.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    print("=== Running Classification Inference ===")
+    print(f"Config: {config_path}")
+
+    cmd = [
+        # sys.executable can be empty (e.g. embedded interpreters); fall back
+        # to the previous behaviour of resolving "python" from PATH.
+        sys.executable or "python",
+        "pipelines/classification/inference.py",
+        "--config", config_path,
+    ]
+
+    # Add CLI overrides
+    for key, value in cli_overrides.items():
+        if value is not None:
+            cmd.extend([f"--{key.replace('_', '-')}", str(value)])
+
+    print(f"Command: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running inference: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all (e.g. interpreter
+        # not found); report it instead of crashing with a traceback.
+        print(f"❌ Error running inference: {e}")
+        return False
+
+    print("✅ Inference completed successfully!")
+    print(result.stdout)
+    return True
+
+def run_single_text_inference():
+    """Classify one hard-coded sentence with the emotion model.
+
+    Returns:
+        False when ``./results/emotion_model`` does not exist, otherwise the
+        result of ``run_with_yaml_config``.
+    """
+    print("=== Single Text Inference ===")
+
+    emotion_model = "./results/emotion_model"
+    if os.path.exists(emotion_model):
+        ok = run_with_yaml_config(
+            "configs/classification/emotion.yaml",
+            model_path=emotion_model,
+            input_text="I love this product! It's amazing.",
+            return_top_k=3,
+        )
+        print(
+            "✅ Single text inference completed!"
+            if ok
+            else "❌ Single text inference failed!"
+        )
+        return ok
+
+    # No trained model available: explain how to get one and report failure.
+    print(f"⚠️  Model not found: {emotion_model}")
+    print("Please train a model first using the trainer script.")
+    return False
+
+def run_file_inference():
+    """Write a small sample text file and run batch inference on it.
+
+    The sample sentences are written one per line to ``sample_texts.txt`` in
+    the current directory, and the pipeline is asked to write its results to
+    ``predictions.jsonl``.
+
+    Returns:
+        False when ``./results/emotion_model`` does not exist, otherwise the
+        result of ``run_with_yaml_config``.
+    """
+    print("\n=== File-Based Inference ===")
+
+    emotion_model = "./results/emotion_model"
+    model_available = os.path.exists(emotion_model)
+    if not model_available:
+        print(f"⚠️  Model not found: {emotion_model}")
+        print("Please train a model first using the trainer script.")
+        return False
+
+    # Build the sample input file, one sentence per line.
+    samples = (
+        "I love this product! It's amazing.",
+        "This is terrible, I hate it.",
+        "The weather is okay today.",
+        "Best purchase ever made!",
+    )
+    samples_path = "sample_texts.txt"
+    with open(samples_path, 'w') as handle:
+        handle.write("".join(sentence + '\n' for sentence in samples))
+
+    ok = run_with_yaml_config(
+        "configs/classification/emotion.yaml",
+        model_path=emotion_model,
+        input_file=samples_path,
+        output_file="predictions.jsonl",
+        batch_size=16,
+    )
+
+    if not ok:
+        print("❌ File-based inference failed!")
+        return ok
+
+    print("✅ File-based inference completed!")
+    print("Results saved to: predictions.jsonl")
+    return ok
+
+def run_interactive_inference():
+    """Launch the inference pipeline without any input text or input file.
+
+    Returns:
+        False when ``./results/emotion_model`` does not exist, otherwise the
+        result of ``run_with_yaml_config``.
+    """
+    print("\n=== Interactive Inference ===")
+
+    emotion_model = "./results/emotion_model"
+    if not os.path.exists(emotion_model):
+        print(f"⚠️  Model not found: {emotion_model}")
+        print("Please train a model first using the trainer script.")
+        return False
+
+    # NOTE(review): run_with_yaml_config captures the child's stdout/stderr
+    # and prints stdout only after the process exits, so any interactive
+    # prompts from the pipeline would not be visible while it runs —
+    # confirm this mode is usable as wired.
+    ok = run_with_yaml_config(
+        "configs/classification/emotion.yaml",
+        model_path=emotion_model,
+        return_top_k=3,
+    )
+
+    status_line = (
+        "✅ Interactive inference completed!"
+        if ok
+        else "❌ Interactive inference failed!"
+    )
+    print(status_line)
+    return ok
+
+def create_inference_config():
+    """Write a template inference config to configs/classification/inference.yaml.
+
+    The parent directory is created if it does not exist yet, so the command
+    also works in a fresh checkout. An existing file is overwritten.
+    """
+    inference_config = """model_path: "./results/emotion_model"
+device: "auto"
+batch_size: 32
+max_length: 512
+return_probabilities: true
+return_top_k: 3
+"""
+
+    config_path = "configs/classification/inference.yaml"
+    target = Path(config_path)
+    # Without this, open() raises FileNotFoundError when configs/ is missing.
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding keeps the file identical across platforms.
+    with open(target, 'w', encoding="utf-8") as f:
+        f.write(inference_config)
+
+    print(f"✅ Created inference config: {config_path}")
+
+def show_usage():
+    """Print numbered example invocations of the inference script."""
+    usage_lines = (
+        "=== Classification Inference Usage ===",
+        "",
+        "1. Use YAML config only:",
+        "   python scripts/classification/inference.py --config configs/classification/inference.yaml",
+        "",
+        "2. Override YAML values:",
+        "   python scripts/classification/inference.py --config configs/classification/inference.yaml --input-text 'Your text here'",
+        "",
+        "3. Use CLI only (backward compatibility):",
+        "   python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
+        "",
+        "4. Run examples:",
+        "   python scripts/classification/inference.py examples",
+        "",
+        "5. Create inference config:",
+        "   python scripts/classification/inference.py create-config",
+    )
+    # One write instead of one print per line; the emitted text is the same.
+    print("\n".join(usage_lines))
+
+def handle_direct_args():
+    """Parse pipeline options from the command line and forward them.
+
+    The options are parsed from ``sys.argv`` and re-emitted as arguments to
+    ``pipelines/classification/inference.py``, which is run with the same
+    interpreter as this script. Options that were not given (value ``None``)
+    and boolean flags that were not set are omitted; ``--log-level`` has a
+    default of ``INFO`` and is therefore always forwarded.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Classification Inference")
+
+    # Add all the same arguments as the pipeline
+    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
+    parser.add_argument("--model-path", type=str, help="Path to saved model directory")
+    parser.add_argument("--device", choices=["auto", "cuda", "cpu"], help="Device to run inference on")
+    parser.add_argument("--batch-size", type=int, help="Batch size for inference")
+    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
+    parser.add_argument("--return-probabilities", action="store_true", help="Return all class probabilities")
+    parser.add_argument("--return-top-k", type=int, help="Return top K predictions")
+    parser.add_argument("--input-text", type=str, help="Single text for prediction")
+    parser.add_argument("--input-file", type=str, help="Input file path (txt or jsonl)")
+    parser.add_argument("--output-file", type=str, help="Output file path for results")
+    parser.add_argument("--chunk-size", type=int, help="Chunk size for large file processing")
+    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Use the running interpreter rather than whatever "python" is on PATH;
+    # fall back to "python" only if sys.executable is unavailable.
+    cmd = [sys.executable or "python", "pipelines/classification/inference.py"]
+
+    # Add all arguments that were provided
+    for arg_name, arg_value in vars(args).items():
+        if arg_value is None or arg_value is False:
+            continue  # option not given / store_true flag not set
+        flag = f"--{arg_name.replace('_', '-')}"
+        if arg_value is True:
+            cmd.append(flag)
+        else:
+            cmd.extend([flag, str(arg_value)])
+
+    print(f"Running: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running inference: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all.
+        print(f"❌ Error running inference: {e}")
+        return False
+
+    print("✅ Inference completed successfully!")
+    print(result.stdout)
+    return True
+
+def main():
+    """Entry point: dispatch a subcommand or forward flags to the pipeline.
+
+    With no arguments an overview is printed. If the first argument is one of
+    the known subcommands, the matching handler(s) run; anything else is
+    treated as direct pipeline options and handed to ``handle_direct_args``.
+
+    The process exits with status 1 when a handler explicitly reports failure
+    by returning ``False``; handlers that return ``None`` count as successful.
+    """
+    if len(sys.argv) <= 1:
+        overview = (
+            "Classification Inference",
+            "=======================",
+            "",
+            "This script performs inference using trained classification models.",
+            "",
+            "Usage:",
+            "  python scripts/classification/inference.py examples     # Run examples",
+            "  python scripts/classification/inference.py single       # Single text inference",
+            "  python scripts/classification/inference.py file         # File-based inference",
+            "  python scripts/classification/inference.py interactive  # Interactive inference",
+            "  python scripts/classification/inference.py create-config # Create inference config",
+            "  python scripts/classification/inference.py help         # Show usage",
+            "",
+            "Direct pipeline usage:",
+            "  python scripts/classification/inference.py --config configs/classification/inference.yaml",
+            "  python scripts/classification/inference.py --model-path ./results/emotion_model --input-text 'Your text here'",
+            "",
+            "Benefits of YAML configurations:",
+            "  ✅ Easier to manage complex configurations",
+            "  ✅ Version control friendly",
+            "  ✅ Self-documenting",
+            "  ✅ Can still override with CLI args",
+            "  ✅ Better for team collaboration",
+        )
+        print("\n".join(overview))
+        return
+
+    subcommands = {
+        "examples": (
+            run_single_text_inference,
+            run_file_inference,
+            run_interactive_inference,
+        ),
+        "single": (run_single_text_inference,),
+        "file": (run_file_inference,),
+        "interactive": (run_interactive_inference,),
+        "create-config": (create_inference_config,),
+        "help": (show_usage,),
+    }
+    # Anything that is not a known subcommand is passed through to the pipeline.
+    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))
+
+    # Run every handler (no short-circuit) so "examples" still runs all steps.
+    results = [handler() for handler in handlers]
+    if any(result is False for result in results):
+        # Surface the failure to the shell / CI instead of always exiting 0.
+        sys.exit(1)
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main() 
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+Classification Trainer Script
+Uses YAML configurations for flexible and maintainable model training.
+"""
+
+import sys
+import os
+import subprocess
+import argparse
+from pathlib import Path
+
+def run_with_yaml_config(config_path: str, **cli_overrides):
+    """Run the classification training pipeline with a YAML config.
+
+    The pipeline script is started as a child process with the interpreter
+    that is running this script, so it does not depend on a bare ``python``
+    executable being on PATH (this script's own shebang is ``python3``).
+    The child's output is captured and printed only after it has exited.
+
+    Args:
+        config_path: Path to the YAML configuration file, passed as ``--config``.
+        **cli_overrides: Extra pipeline options. Each key becomes a
+            ``--dashed-name`` flag followed by ``str(value)``; entries whose
+            value is ``None`` are skipped.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    print("=== Running Classification Trainer ===")
+    print(f"Config: {config_path}")
+
+    cmd = [
+        # sys.executable can be empty (e.g. embedded interpreters); fall back
+        # to the previous behaviour of resolving "python" from PATH.
+        sys.executable or "python",
+        "pipelines/classification/train.py",
+        "--config", config_path,
+    ]
+
+    # Add CLI overrides
+    for key, value in cli_overrides.items():
+        if value is not None:
+            cmd.extend([f"--{key.replace('_', '-')}", str(value)])
+
+    print(f"Command: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running trainer: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all (e.g. interpreter
+        # not found); report it instead of crashing with a traceback.
+        print(f"❌ Error running trainer: {e}")
+        return False
+
+    print("✅ Training completed successfully!")
+    print(result.stdout)
+    return True
+
+def run_emotion_training():
+    """Run the emotion classification training example.
+
+    Uses ``configs/classification/emotion.yaml`` and overrides the epoch
+    count, batch size and output directory on the command line.
+
+    Returns:
+        The success flag reported by ``run_with_yaml_config`` so callers can
+        react to a failed run.
+    """
+    print("=== Emotion Classification Training ===")
+
+    success = run_with_yaml_config(
+        "configs/classification/emotion.yaml",
+        num_epochs=2,  # Override YAML value
+        batch_size=8,   # Smaller batch for testing
+        output_dir="./results/emotion_model",
+    )
+
+    if success:
+        print("✅ Emotion classification training completed!")
+    else:
+        print("❌ Emotion classification training failed!")
+
+    return success
+
+def run_custom_training():
+    """Run the custom-dataset training example.
+
+    The example only runs when ``data/custom_processed/train.jsonl`` exists;
+    otherwise it is skipped with a warning.
+
+    Returns:
+        The success flag reported by ``run_with_yaml_config``, or ``None``
+        when the example was skipped because the dataset is missing.
+    """
+    print("\n=== Custom Dataset Training ===")
+
+    if not os.path.exists("data/custom_processed/train.jsonl"):
+        print("⚠️  Custom dataset not found, skipping...")
+        return None
+
+    success = run_with_yaml_config(
+        "configs/classification/custom.yaml",
+        data_dir="data/custom_processed",
+        output_dir="./results/custom_model",
+    )
+    if success:
+        print("✅ Custom dataset training completed!")
+    else:
+        print("❌ Custom dataset training failed!")
+
+    return success
+
+def create_training_config():
+    """Write a template training config to configs/classification/training.yaml.
+
+    The parent directory is created if it does not exist yet, so the command
+    also works in a fresh checkout. An existing file is overwritten.
+    """
+    training_config = """model_name: "bert-base-uncased"
+max_length: 512
+num_epochs: 3
+batch_size: 16
+learning_rate: 2e-5
+weight_decay: 0.01
+lr_scheduler_type: "linear"
+warmup_ratio: 0.1
+data_dir: "./data/classification"
+output_dir: "./results/classification_model"
+"""
+
+    config_path = "configs/classification/training.yaml"
+    target = Path(config_path)
+    # Without this, open() raises FileNotFoundError when configs/ is missing.
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Explicit encoding keeps the file identical across platforms.
+    with open(target, 'w', encoding="utf-8") as f:
+        f.write(training_config)
+
+    print(f"✅ Created training config: {config_path}")
+
+def show_usage():
+    """Print numbered example invocations of the trainer script."""
+    usage_lines = (
+        "=== Classification Trainer Usage ===",
+        "",
+        "1. Use YAML config only:",
+        "   python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
+        "",
+        "2. Override YAML values:",
+        "   python scripts/classification/trainer.py --config configs/classification/emotion.yaml --num-epochs 5",
+        "",
+        "3. Use CLI only (backward compatibility):",
+        "   python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
+        "",
+        "4. Run examples:",
+        "   python scripts/classification/trainer.py examples",
+        "",
+        "5. Create training config:",
+        "   python scripts/classification/trainer.py create-config",
+    )
+    # One write instead of one print per line; the emitted text is the same.
+    print("\n".join(usage_lines))
+
+def handle_direct_args():
+    """Parse pipeline options from the command line and forward them.
+
+    The options are parsed from ``sys.argv`` and re-emitted as arguments to
+    ``pipelines/classification/train.py``, which is run with the same
+    interpreter as this script. Options that were not given (value ``None``)
+    are omitted; ``--log-level`` has a default of ``INFO`` and is therefore
+    always forwarded.
+
+    Returns:
+        True if the pipeline exited with status 0, False otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Classification Trainer")
+
+    # Add all the same arguments as the pipeline
+    parser.add_argument("--config", type=str, help="Path to YAML configuration file")
+    parser.add_argument("--model-name", type=str, help="Model name from HuggingFace Hub")
+    parser.add_argument("--max-length", type=int, help="Maximum sequence length for tokenization")
+    parser.add_argument("--num-epochs", type=int, help="Number of training epochs")
+    parser.add_argument("--batch-size", type=int, help="Training batch size")
+    parser.add_argument("--learning-rate", type=float, help="Learning rate")
+    parser.add_argument("--weight-decay", type=float, help="Weight decay for optimizer")
+    parser.add_argument("--lr-scheduler-type", choices=["linear", "cosine", "polynomial"], help="Learning rate scheduler type")
+    parser.add_argument("--warmup-ratio", type=float, help="Warmup ratio for scheduler")
+    parser.add_argument("--data-dir", type=str, help="Directory containing train/validation/test JSONL files")
+    parser.add_argument("--output-dir", type=str, help="Output directory for saved model")
+    parser.add_argument("--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Use the running interpreter rather than whatever "python" is on PATH;
+    # fall back to "python" only if sys.executable is unavailable.
+    cmd = [sys.executable or "python", "pipelines/classification/train.py"]
+
+    # Add all arguments that were provided
+    for arg_name, arg_value in vars(args).items():
+        if arg_value is None or arg_value is False:
+            continue  # option not given / boolean flag not set
+        flag = f"--{arg_name.replace('_', '-')}"
+        if arg_value is True:
+            cmd.append(flag)
+        else:
+            cmd.extend([flag, str(arg_value)])
+
+    print(f"Running: {' '.join(cmd)}")
+    print()
+
+    try:
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+    except subprocess.CalledProcessError as e:
+        print(f"❌ Error running trainer: {e}")
+        print(f"Error output: {e.stderr}")
+        return False
+    except OSError as e:
+        # The child process could not be started at all.
+        print(f"❌ Error running trainer: {e}")
+        return False
+
+    print("✅ Training completed successfully!")
+    print(result.stdout)
+    return True
+
+def main():
+    """Entry point: dispatch a subcommand or forward flags to the pipeline.
+
+    With no arguments an overview is printed. If the first argument is one of
+    the known subcommands, the matching handler(s) run; anything else is
+    treated as direct pipeline options and handed to ``handle_direct_args``.
+
+    The process exits with status 1 when a handler explicitly reports failure
+    by returning ``False``; handlers that return ``None`` count as successful.
+    """
+    if len(sys.argv) <= 1:
+        overview = (
+            "Classification Trainer",
+            "====================",
+            "",
+            "This script trains classification models using YAML configurations.",
+            "",
+            "Usage:",
+            "  python scripts/classification/trainer.py examples     # Run examples",
+            "  python scripts/classification/trainer.py emotion      # Run emotion training",
+            "  python scripts/classification/trainer.py custom       # Run custom training",
+            "  python scripts/classification/trainer.py create-config # Create training config",
+            "  python scripts/classification/trainer.py help         # Show usage",
+            "",
+            "Direct pipeline usage:",
+            "  python scripts/classification/trainer.py --config configs/classification/emotion.yaml",
+            "  python scripts/classification/trainer.py --model-name bert-base-uncased --num-epochs 3",
+            "",
+            "Benefits of YAML configurations:",
+            "  ✅ Easier to manage complex configurations",
+            "  ✅ Version control friendly",
+            "  ✅ Self-documenting",
+            "  ✅ Can still override with CLI args",
+            "  ✅ Better for team collaboration",
+        )
+        print("\n".join(overview))
+        return
+
+    subcommands = {
+        "examples": (run_emotion_training, run_custom_training),
+        "emotion": (run_emotion_training,),
+        "custom": (run_custom_training,),
+        "create-config": (create_training_config,),
+        "help": (show_usage,),
+    }
+    # Anything that is not a known subcommand is passed through to the pipeline.
+    handlers = subcommands.get(sys.argv[1], (handle_direct_args,))
+
+    # Run every handler (no short-circuit) so "examples" still runs all steps.
+    results = [handler() for handler in handlers]
+    if any(result is False for result in results):
+        # Surface the failure to the shell / CI instead of always exiting 0.
+        sys.exit(1)
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()