added style mimicking pipelines

2025-08-13 21:17:01 +01:00
parent fd54d4be39
commit 710d074b47
31 changed files with 3816 additions and 46 deletions
@@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Test script for the styling data processor
+"""
+
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from pipelines.styling.data_processor import StylingDataPipeline, create_custom_config, create_huggingface_config
+
+def test_styling_pipeline():
+    """Test the styling data processor with custom data"""
+    
+    print("Testing Styling Data Processor")
+    print("=" * 50)
+    
+    # Initialize the pipeline
+    pipeline = StylingDataPipeline()
+    
+    # Example 1: Load configuration from YAML
+    print("\n1. Loading configuration from YAML...")
+    try:
+        yaml_config = pipeline.load_config_from_yaml("./configs/styling/formal.yaml")
+        print(f"   ✅ YAML config loaded successfully!")
+        print(f"   Output directory: {yaml_config.output_dir}")
+        print(f"   Instruction: {yaml_config.instruction}")
+        print(f"   Input field: {yaml_config.input_field}")
+        print(f"   Output field: {yaml_config.output_field}")
+    except Exception as e:
+        print(f"   ❌ Error loading YAML config: {e}")
+        yaml_config = None
+    
+    # Example 2: Create custom dataset configuration
+    print("\n2. Creating custom dataset configuration...")
+    custom_config = create_custom_config(
+        data_path="./data/raw/styling/formal_dataset.jsonl",
+        data_format="jsonl",
+        input_field="text",
+        output_field="styled_text",
+        instruction="Rewrite the following text in a formal style",
+        max_samples=1000,
+        min_length=10,
+        max_length=256,
+        clean_text=True,
+        lowercase=False,
+        output_format="alpaca"
+    )
+    
+    print(f"   Input field: {custom_config.input_field} (maps to 'input')")
+    print(f"   Output field: {custom_config.output_field} (maps to 'output')")
+    print(f"   Instruction: {custom_config.instruction}")
+    print(f"   Max samples: {custom_config.max_samples}")
+    
+    # Example 3: Test with sample data (if available)
+    print("\n3. Testing pipeline with sample data...")
+    
+    # Create a sample dataset for testing
+    sample_data = [
+        {
+            "input": "Hey, what's up? How are you doing today?",
+            "output": "Hello, how are you doing today?"
+        },
+        {
+            "input": "This is really cool stuff!",
+            "output": "This is quite impressive material."
+        },
+        {
+            "input": "I'm gonna go to the store later.",
+            "output": "I will go to the store later."
+        }
+    ]
+    
+    # Save sample data to test file
+    import json
+    test_file = "./data/raw/styling/test_formal.jsonl"
+    os.makedirs(os.path.dirname(test_file), exist_ok=True)
+    
+    with open(test_file, 'w', encoding='utf-8') as f:
+        for item in sample_data:
+            f.write(json.dumps(item, ensure_ascii=False) + '\n')
+    
+    print(f"   Created test file: {test_file}")
+    
+    # Test the pipeline with the sample data
+    try:
+        test_config = create_custom_config(
+            data_path=test_file,
+            data_format="jsonl",
+            input_field="input",
+            output_field="output",
+            instruction="Rewrite the following text in a formal style",
+            max_samples=10,
+            output_format="alpaca"
+        )
+        
+        print("   Running pipeline...")
+        result = pipeline.run_pipeline(test_config, output_format="alpaca", save_splits=True, create_hf_dataset=True, save_hf_dataset=True)
+        
+        print("   ✅ Pipeline completed successfully!")
+        print(f"   Total samples: {result['analysis']['overall']['total_samples']}")
+        print(f"   Split sizes: {result['analysis']['overall']['split_sizes']}")
+        print(f"   Output directory: {result['output_dir']}")
+        
+        # Show HuggingFace dataset info if created
+        if 'hf_dataset' in result:
+            hf_dataset = result['hf_dataset']
+            print(f"   HuggingFace dataset created with {len(hf_dataset)} entries")
+            print(f"   Dataset features: {hf_dataset.features}")
+            
+            # Show save path if saved to disk
+            if 'hf_dataset_path' in result:
+                print(f"   Dataset saved to: {result['hf_dataset_path']}")
+            
+            # Show formatted example
+            if len(hf_dataset) > 0:
+                print(f"   Example formatted text:")
+                print(f"   {hf_dataset[0]['text'][:200]}...")
+        
+        # Show sample processed data
+        print("\n   Sample processed data:")
+        for split_name, split_data in result['data'].items():
+            if split_data:
+                print(f"   {split_name} split:")
+                for i, item in enumerate(split_data[:2]):  # Show first 2 items
+                    print(f"     Item {i+1}:")
+                    print(f"       Instruction: {item['instruction']}")
+                    print(f"       Input: {item['input'][:50]}...")
+                    print(f"       Output: {item['output'][:50]}...")
+                break
+        
+    except Exception as e:
+        print(f"   ❌ Error running pipeline: {e}")
+    
+    print("\n" + "=" * 50)
+    print("Test completed!")
+
+def test_hf_dataset_save_load():
+    """Test HuggingFace dataset save and load functionality"""
+    
+    print("\nTesting HuggingFace Dataset Save/Load")
+    print("=" * 50)
+    
+    from pipelines.styling.data_processor import save_hf_dataset_to_disk, load_hf_dataset_from_disk
+    
+    # Create a sample dataset for testing
+    sample_data = [
+        {
+            "instruction": "Rewrite in formal style",
+            "input": "Hey, what's up?",
+            "output": "Hello, how are you?"
+        },
+        {
+            "instruction": "Rewrite in formal style", 
+            "input": "This is really cool!",
+            "output": "This is quite impressive."
+        }
+    ]
+    
+    # Test configuration
+    config = create_custom_config(
+        data_path="dummy",
+        instruction="Rewrite in formal style"
+    )
+    
+    # Convert to HuggingFace dataset
+    pipeline = StylingDataPipeline()
+    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
+    
+    print(f"Created HuggingFace dataset with {len(hf_dataset)} entries")
+    
+    # Test saving to disk
+    save_path = "./data/processed/styling/test_hf_dataset"
+    print(f"\nSaving dataset to: {save_path}")
+    
+    success = save_hf_dataset_to_disk(hf_dataset, save_path)
+    if success:
+        print("✅ Dataset saved successfully!")
+        
+        # Test loading from disk
+        print(f"\nLoading dataset from: {save_path}")
+        loaded_dataset = load_hf_dataset_from_disk(save_path)
+        
+        if loaded_dataset is not None:
+            print("✅ Dataset loaded successfully!")
+            print(f"Loaded dataset has {len(loaded_dataset)} entries")
+            print(f"Features: {loaded_dataset.features}")
+            
+            # Show sample data
+            print("\nSample loaded data:")
+            for i in range(len(loaded_dataset)):
+                print(f"  Entry {i+1}: {loaded_dataset[i]['text'][:100]}...")
+        else:
+            print("❌ Failed to load dataset")
+    else:
+        print("❌ Failed to save dataset")
+    
+    return hf_dataset
+
+def test_hf_dataset_conversion():
+    """Test the HuggingFace dataset conversion"""
+    
+    print("\nTesting HuggingFace Dataset Conversion")
+    print("=" * 50)
+    
+    pipeline = StylingDataPipeline()
+    
+    # Sample data with instruction field
+    sample_data = [
+        {
+            "instruction": "Rewrite in formal style",
+            "input": "Hey, what's up?",
+            "output": "Hello, how are you?"
+        },
+        {
+            "instruction": "Rewrite in formal style", 
+            "input": "This is really cool!",
+            "output": "This is quite impressive."
+        }
+    ]
+    
+    # Test configuration
+    config = create_custom_config(
+        data_path="dummy",
+        instruction="Rewrite in formal style"
+    )
+    
+    # Convert to HuggingFace dataset
+    hf_dataset = pipeline.convert_to_hf_dataset(sample_data, config)
+    
+    print(f"HuggingFace dataset created with {len(hf_dataset)} entries")
+    print(f"Dataset features: {hf_dataset.features}")
+    
+    # Show formatted examples
+    print("\nFormatted examples:")
+    for i in range(len(hf_dataset)):
+        print(f"  Example {i+1}:")
+        print(f"    {hf_dataset[i]['text'][:150]}...")
+        print()
+    
+    # Test the dataset can be used for training
+    print("Dataset ready for training!")
+    print(f"Number of training examples: {len(hf_dataset)}")
+    
+    return hf_dataset
+
+
+# Script entry point. Only the end-to-end pipeline test runs by default; the
+# two HuggingFace dataset checks below are disabled and can be run by
+# uncommenting them.
+if __name__ == "__main__":
+    test_styling_pipeline()
+    # test_hf_dataset_save_load()
+    # test_hf_dataset_conversion()