import argparse
import json
import os
from dataclasses import dataclass

import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# LoRA target layers keyed by ``model.config.model_type``. The layer names
# differ per architecture: GPT-2 exposes ``c_attn``/``c_proj`` while OPT (the
# default model below) exposes ``q_proj``/``v_proj``. Architectures missing
# from this table fall back to PEFT's own defaults (``target_modules=None``).
_LORA_TARGET_MODULES = {
    "gpt2": ["c_attn", "c_proj"],
    "opt": ["q_proj", "v_proj"],
}


@dataclass
class Args:
    """Run configuration for the finetuning script."""

    dataset_path: str = "datasets/adriana_finetune_dataset.json"
    model_name: str = "facebook/opt-350m"
    output_dir: str = "finetuned_model"
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    learning_rate: float = 5e-5
    use_lora: bool = False


def load_dataset(json_path):
    """Load prompt/completion pairs from a JSON file into a ``Dataset``.

    The file must contain a JSON array of objects, each with a ``"prompt"``
    and a ``"completion"`` key. Every pair is rendered into a single
    ``"text"`` field.

    Args:
        json_path: Path to the UTF-8 encoded JSON file.

    Returns:
        A ``datasets.Dataset`` with one ``"text"`` column.

    Raises:
        KeyError: If an item lacks ``"prompt"`` or ``"completion"``.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    formatted_data = [
        {"text": f"Prompt: {item['prompt']}\nCompletion: {item['completion']}\n\n"}
        for item in data
    ]
    return Dataset.from_list(formatted_data)


def tokenize_function(examples, tokenizer, max_length=512):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Batch mapping that contains a ``"text"`` entry.
        tokenizer: Tokenizer callable applied to the text.
        max_length: Length every sequence is truncated and padded to.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )


def _apply_lora(model):
    """Wrap ``model`` with LoRA adapters suited to its architecture."""
    model_type = getattr(model.config, "model_type", None)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # None lets PEFT choose defaults for architectures it knows about.
        target_modules=_LORA_TARGET_MODULES.get(model_type),
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    # NOTE(review): the base model is not loaded quantized in this script;
    # this call is kept from the original flow - confirm it is still wanted.
    model = prepare_model_for_kbit_training(model)
    return get_peft_model(model, lora_config)


def main(args=None):
    """Finetune a causal language model on a prompt/completion dataset.

    Args:
        args: Optional ``Args`` instance; the defaults are used when omitted.
    """
    if args is None:
        args = Args()

    # Load dataset
    print(f"Loading dataset from {args.dataset_path}")
    dataset = load_dataset(args.dataset_path)

    # Load tokenizer and model
    print(f"Loading tokenizer and model: {args.model_name}")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    # Only fall back to EOS when the tokenizer has no pad token of its own.
    # The LM data collator masks every label equal to pad_token_id, so
    # aliasing pad to EOS would also drop genuine EOS tokens from the loss.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(args.model_name)

    # Apply LoRA if requested
    if args.use_lora:
        print("Applying LoRA for efficient finetuning")
        model = _apply_lora(model)

    # Tokenize dataset
    print("Tokenizing dataset")
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        fn_kwargs={"tokenizer": tokenizer},
    )

    # Set up training arguments
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_train_epochs,
        per_device_train_batch_size=args.per_device_train_batch_size,
        learning_rate=args.learning_rate,
        weight_decay=0.01,
        logging_dir=os.path.join(args.output_dir, "logs"),
        logging_steps=10,
        save_strategy="epoch",
        # Mixed precision is only enabled when a CUDA device is present.
        fp16=torch.cuda.is_available(),
    )

    # Set up data collator (mlm=False selects the causal LM objective)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Train the model
    print("Starting training")
    trainer.train()

    # Save the model and tokenizer side by side so the output directory can
    # be loaded directly with from_pretrained.
    print(f"Saving model to {args.output_dir}")
    trainer.save_model(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)

    print("Finetuning complete!")


if __name__ == "__main__":
    main()