unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-310.pyc

o
ö×°h¯¤ã@sòdZddlmZddlZddlmZddlmZddlmZm	Z	m
Z
mZmZm
Z
mZmZddlmZmZmZmZmZmZmZmZm
Z
mZmZmZmZmZmZmZmZmZm Z m!Z!mZm"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0mZm1Z1m2Z2m3Z3m4Z4m5Z5mZm6Z6m
Z
mZmZm Z m+Z+m1Z1mZddl1Z1ddlTddl7m8Z8m9Z9dd	l:m;Z;ddlZddl<Z=dd
l>m?Z?ddlmZddl@mAZAmBZCdd
dd
d
dœZDejEddeDd�dd„ƒZFe8Gdd„deƒƒZG	Gdd„de ƒZHGdd„deHƒZIdS)z9
2025.8.9
2025.8.10
4.55.4
0.21.0
__UNSLOTH_VERSIONING__
é)ÚTensorN)Ú
functional)ÚAnyÚListÚOptionalÚTupleÚUnionÚDictÚSetÚCallable)3rÚBaseImageProcessorrÚDataCollatorÚDatasetÚEvalPredictionÚFeatureExtractionMixinÚFrozenInstanceErrorrÚPartialStateÚPathÚ	PeftModelÚPreTrainedModelÚPreTrainedTokenizerBaseÚProcessorMixinÚRewardConfigÚRewardDataCollatorWithPaddingÚ
RewardTrainerÚTrainerÚTrainerCallbackrÚ	_tokenizeÚcompute_accuracyÚdecode_and_strip_paddingÚdefaultdictÚdisable_dropout_in_modelÚ
gather_objectÚgenerate_model_cardÚget_comet_experiment_urlÚinspectÚis_peft_availableÚis_rich_availableÚis_wandb_availableÚlog_table_to_comet_experimentÚmaybe_apply_chat_templateÚ
nested_detachÚnnÚosÚpdÚprepare_model_for_kbit_trainingÚprint_rich_tableÚreplaceÚtorchÚwarningsrrrrr&r-r2)Ú*)Ú	dataclassÚfield)ÚVersion)Únullcontext)ÚDataCollatorForSeq2SeqÚDataCollatorForLanguageModelingTF)Úepilogue_fusionÚmax_autotuneÚ
shape_paddingz
trace.enabledztriton.cudagraphs)ÚdynamicÚ	fullgraphÚoptionsc
Cs¾tj| d|jd¡ddd�}tj| d¡ddd�}g}t||ƒD](\}}| tj¡}tj|d| d¡d� 	d¡}tj
|dd�}||}	| |	¡q!	t |¡}| |jd|jdf¡}|S)Néÿÿÿÿér)ÚchunksÚdim)rDÚindex©rDé)
r2ÚchunkÚreshapeÚshapeÚzipÚtoÚfloat32ÚgatherÚ	unsqueezeÚsqueezeÚ	logsumexpÚappendÚconcat)
ÚlogitsrEÚchunked_logitsÚ
chunked_indexÚall_per_token_logpsÚchunk_logitsÚchunk_indexÚselected_logitsÚlogsumexp_valuesÚper_token_logps©r]úT/workspace/DS-LLM-TEMPLATE-FINETUNING/unsloth_compiled_cache/UnslothRewardTrainer.pyÚchunked_selective_log_softmax"s
r_cs†eZdZUdZedddid�Zeeed<edddid�Z	ee
ed	<eddd
id�Zee
ed<						
																														 									!	!					"	#								$														$						%	&				'												(									#				$				)	*														+						d.‡fd,d-„	Z‡Z
S)/ÚUnslothRewardConfigaI
    
    Configuration class for the [`RewardTrainer`].

    This class includes only the parameters that are specific to Reward training. For a full list of training
    arguments, please refer to the [`~transformers.TrainingArguments`] documentation. Note that default values in this
    class may differ from those in [`~transformers.TrainingArguments`].

    Using [`~transformers.HfArgumentParser`] we can turn this class into
    [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
    command line.

    Parameters:
        max_length (`int` or `None`, *optional*, defaults to `1024`):
            Maximum length of the sequences (prompt + completion) in the batch, filters out entries that exceed the
            limit. This argument is required if you want to use the default data collator.
        disable_dropout (`bool`, *optional*, defaults to `True`):
            Whether to disable dropout in the model.
        dataset_num_proc (`int`, *optional*, defaults to `None`):
            Number of processes to use for processing the dataset.
        center_rewards_coefficient (`float`, *optional*, defaults to `None`):
            Coefficient to incentivize the reward model to output mean-zero rewards (proposed by
            https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.
        remove_unused_columns (`bool`, *optional*, defaults to `False`):
            Whether to remove the columns that are not used by the model's forward pass. Can be `True` only if the
            dataset is pretokenized.
    
    NÚhelpzvLLM SamplingParams)ÚdefaultÚmetadataÚvllm_sampling_paramsrAz8Chunk size to reduce memory usage. -1 is most efficient.Úunsloth_num_chunksz'Maximum sequence length to truncate to.Úmax_seq_lengthFÚnorBéréúç-Cëâ6
?ç{®Gáz„?çÍÌÌÌÌÌì?ç+‡ÙÎ÷ï?ç:Œ0âŽyE>çð?ç@Úlinearçš™™™™™¹?ÚpassiveÚwarningTÚstepsrGéôéO
ÚO1ÚautoÚçÚ
adamw_8bitÚlengthÚ
every_saveÚlastéécˆŠs´|dkrtd|›d�ƒ‚|dkrtd|›d�ƒ‚|dur(|#dkr(|$dkr(d}d	}#|ƒdur:d
dlm}‰t|‰ƒdd
ƒ}ƒtƒjd’id|“d|“d|“d|“d|“d|“d|“d|“d|	“d|
“d|“d|“d|
“d|“d|“d|“d|“d|“d |“d!|“d"|“d#|“d$|“d%|“d&|“d'|“d(|“d)|“d*|“d+|“d,|“d-| “d.|!“d/|"“d0|#“d1|$“d2|%“d3|&“d4|'“d5|(“d6|)“d7|*“d8|+“d9|,“d:|-“d;|.“d<|/“d=|0“d>|1“d?|2“d@|3“dA|4“dB|5“dC|6“dD|7“dE|8“dF|9“dG|:“dH|;“dI|<“dJ|=“dK|>“dL|?“dM|@“dN|A“dO|B“dP|C“dQ|D“dR|E“dS|F“dT|G“dU|H“dV|I“dW|J“dX|K“dY|L“dZ|M“d[|N“d\|O“d]|P“d^|Q“d_|R“d`|S“da|T“db|U“dc|V“dd|W“de|X“df|Y“dg|Z“dh|[“di|\“dj|]“dk|^“dl|_“dm|`“dn|a“do|b“dp|c“dq|d“dr|e“ds|f“dt|g“du|h“dv|i“dw|j“dx|k“dy|l“dz|m“d{|n“d||o“d}|p“d~|q“d|r“d€|s“d�|t“d‚|u“dƒ|v“d„|w“d…|x“d†|y“d‡|z“dˆ|{“d‰||“dŠ|}“d‹|~“dŒ|“d�|€“dŽ|�“d�|‚“d�|ƒ“d‘|„“|ˆ¤Ž|…|_|†|_|‡|_	dS)“NgH¯¼šò×z>z Unsloth: Your learning rate of `zi` is too small and less than 1e-7! Consider increasing it, otherwise gradient updates will be close to 0!rGza` is way too larger > 1! Consider decreasing it to 1e-1, otherwise gradient updates will explode!rurvÚunsloth_training_checkpointsrgr)Ú	cpu_countrBrhÚ
output_dirÚoverwrite_output_dirÚdo_trainÚdo_evalÚ
do_predictÚ
eval_strategyÚprediction_loss_onlyÚper_device_train_batch_sizeÚper_device_eval_batch_sizeÚper_gpu_train_batch_sizeÚper_gpu_eval_batch_sizeÚgradient_accumulation_stepsÚeval_accumulation_stepsÚ
eval_delayÚtorch_empty_cache_stepsÚ
learning_rateÚweight_decayÚ
adam_beta1Ú
adam_beta2Úadam_epsilonÚ
max_grad_normÚnum_train_epochsÚ	max_stepsÚlr_scheduler_typeÚwarmup_ratioÚwarmup_stepsÚ	log_levelÚlog_level_replicaÚlog_on_each_nodeÚlogging_dirÚlogging_strategyÚlogging_first_stepÚ
logging_stepsÚlogging_nan_inf_filterÚ
save_strategyÚ
save_stepsÚsave_total_limitÚsave_safetensorsÚsave_on_each_nodeÚsave_only_modelÚ'restore_callback_states_from_checkpointÚno_cudaÚuse_cpuÚuse_mps_deviceÚseedÚ	data_seedÚ
jit_mode_evalÚuse_ipexÚbf16Úfp16Úfp16_opt_levelÚhalf_precision_backendÚbf16_full_evalÚfp16_full_evalÚtf32Ú
local_rankÚddp_backendÚ
tpu_num_coresÚtpu_metrics_debugÚdebugÚdataloader_drop_lastÚ
eval_stepsÚdataloader_num_workersÚdataloader_prefetch_factorÚ
past_indexÚrun_nameÚdisable_tqdmÚremove_unused_columnsÚlabel_namesÚload_best_model_at_endÚmetric_for_best_modelÚgreater_is_betterÚignore_data_skipÚfsdpÚfsdp_min_num_paramsÚfsdp_configÚ"fsdp_transformer_layer_cls_to_wrapÚaccelerator_configÚ	deepspeedÚlabel_smoothing_factorÚoptimÚ
optim_argsÚ	adafactorÚgroup_by_lengthÚlength_column_nameÚ	report_toÚddp_find_unused_parametersÚddp_bucket_cap_mbÚddp_broadcast_buffersÚdataloader_pin_memoryÚdataloader_persistent_workersÚskip_memory_metricsÚuse_legacy_prediction_loopÚpush_to_hubÚresume_from_checkpointÚhub_model_idÚhub_strategyÚ	hub_tokenÚhub_private_repoÚhub_always_pushÚhub_revisionÚgradient_checkpointingÚgradient_checkpointing_kwargsÚinclude_inputs_for_metricsÚeval_do_concat_batchesÚfp16_backendÚpush_to_hub_model_idÚpush_to_hub_organizationÚpush_to_hub_tokenÚ
mp_parametersÚauto_find_batch_sizeÚfull_determinismÚtorchdynamoÚ	ray_scopeÚddp_timeoutÚ
torch_compileÚtorch_compile_backendÚtorch_compile_modeÚinclude_tokens_per_secondÚinclude_num_input_tokens_seenÚneftune_noise_alphaÚoptim_target_modulesÚbatch_eval_metricsÚ
eval_on_startÚuse_liger_kernelÚliger_kernel_configÚeval_use_gather_objectÚaverage_tokens_across_devicesÚ
max_lengthÚdisable_dropoutÚdataset_num_procÚcenter_rewards_coefficientr])
ÚFloatingPointErrorÚ
OverflowErrorÚmultiprocessingrƒÚmaxÚsuperÚ__init__rdrerf)ŠÚselfr„r…r†r‡rˆr‰rŠr‹rŒr�rŽr�r�r‘r’r“r”r•r–r—r˜r™ršr›rœr�ržrŸr r¡r¢r£r¤r¥r¦r§r¨r©rªr«r¬rr®r¯r°r±r²r³r´rµr¶r·r¸r¹rºr»r¼r½r¾r¿rÀrÁrÂrÃrÄrÅrÆrÇrÈrÉrÊrËrÌrÍrÎrÏrÐrÑrÒrÓrÔrÕrÖr×rØrÙrÚrÛrÜrÝrÞrßràrárârãrärårærçrèrérêrërìrírîrïrðrñròrórôrõrör÷rørùrúrûrürýrþrÿrrrrrrrrrdrerfÚkwargsrƒ©Ú	__class__r]r^r
]s@ÿþýüûúùø	÷
öõô
óòñðïîíìëêéèçæåäãâá à!ß"Þ#Ý$Ü%Û&Ú'Ù(Ø)×*Ö+Õ,Ô-Ó.Ò/Ñ0Ð1Ï2Î3Í4Ì5Ë6Ê7É8È9Ç:Æ;Å<Ä=Ã>Â?Á@ÀA¿B¾C½D¼E»FºG¹H¸I·J¶KµL´M³N²O±P°Q¯R®ST¬U«VªW©X¨Y§Z¦[¥\¤]£^¢_¡` aŸbžc�dœe›fšg™h˜i—j–k•l”m“n’o‘p�q�rŽs�tŒu‹vŠw‰xˆy‡z†{…|„}ƒ~‚��ÿ�þ�ý�ü�û
zUnslothRewardConfig.__init__)‡NNFFFrgFrBrBNNrhrhrrirjrkrlrmrnrorprArqrrrrsrtTNruFrGFrurvNTFFFFFFrwrwFFFFrxryFFNrANNFrzFNrNrANNFNFNNFrzrNNNNr{r|NFFr}NNNNTFTFFNNr~NNFNFNFTryNNNrzTFNrr€FNNFFNNFFFNFTr�TNNNrAN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r6rdrrÚ__annotations__reÚintrfr
Ú
__classcell__r]r]rr^r`3s.
þþþ�÷r`cseZdZddgZ												d(deeeejfdee	dee
deed	eeeee
effd
eeeeeefdeegefdeeegefd
eeedeejjejjjfdeeejejgejfdeef‡fdd„
Z		d)deeejfdee
eejeffdeejeejee
ejffffdd„Z	d*deeejfdee
eejeffde deee
deeejeejeejff
dd„Z!‡fdd„Z"de#fdd „Z$‡fd!d"„Z%			d+d#ee
d$ee
d%ee
ee
dffd&d'„Z&‡Z'S),Ú_UnslothRewardTrainerÚtrlzreward-trainerN©NNÚmodelÚargsÚ
data_collatorÚ
train_datasetÚeval_datasetÚprocessing_classÚ
model_initÚcompute_metricsÚ	callbacksÚ
optimizersÚpreprocess_logits_for_metricsÚpeft_configc

sHtƒs|durtdƒ‚tƒrV|durVt|tƒsVt|ddƒs#t|ddƒrTdtt t¡j	ƒv}
d|j
i}|
s@|jdur@t 
dt¡n|
rL|jdurL|j|d<t|fi|¤Ž}|}|jr]t|ƒ|durct}|dur˜|durotd	ƒ‚|j‰t|ƒ}|jr”zd|_Wnty�t|dd
�}Ynwt 
dt¡d|_nd|_d|jd
<d|jv�rtƒ ¡�Nd|i}|jtd|id�}|jtd||jd�}|j ‡fdd„|jd�}|durò|jtd|id�}|jt|d|jd�}|j ‡fdd„|jd�}Wdƒn1süwYt!ƒj"|||||||||	|
|d�t#|j$dƒ�r"|j$ %|j&¡dSdS)aÂ	
        Initialize RewardTrainer.

        Args:
            model (`transformers.PreTrainedModel`):
                The model to train, preferably an `AutoModelForSequenceClassification`.
            args (`RewardConfig`):
                The arguments to use for training.
            data_collator (`transformers.DataCollator`):
                The data collator to use for training. If None is specified, the default data collator
                (`RewardDataCollatorWithPadding`) will be used which will pad the sequences to the maximum length of
                the sequences in the batch, given a dataset of paired sequences.
            train_dataset (`datasets.Dataset`):
                The dataset to use for training.
            eval_dataset (`datasets.Dataset`):
                The dataset to use for evaluation.
            processing_class ([`~transformers.PreTrainedTokenizerBase`], [`~transformers.BaseImageProcessor`], [`~transformers.FeatureExtractionMixin`] or [`~transformers.ProcessorMixin`], *optional*, defaults to `None`):
                Processing class used to process the data. If provided, will be used to automatically process the
                inputs for the model, and it will be saved along the model to make it easier to rerun an interrupted
                training or reuse the fine-tuned model.
            model_init (`Callable[[], transformers.PreTrainedModel]`):
                The model initializer to use for training. If None is specified, the default model initializer will be
                used.
            compute_metrics (`Callable[[transformers.EvalPrediction], dict]`, *optional* defaults to `compute_accuracy`):
                The metrics to use for evaluation. If no metrics are specified, the default metric (`compute_accuracy`)
                will be used.
            callbacks (`list[transformers.TrainerCallback]`):
                The callbacks to use for training.
            optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`):
                The optimizer and scheduler to use for training.
            preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`):
                The function to use to preprocess the logits before computing the metrics.
            peft_config (`dict`, defaults to `None`):
                The PEFT configuration to use for training. If you pass a PEFT configuration, the model will be wrapped
                in a PEFT model.
        NzvPEFT is not installed and you passed a `peft_config` in the trainer's kwargs, please install it to use the PEFT modelsÚis_loaded_in_8bitFÚis_quantizedrêÚuse_gradient_checkpointingzÂYou passed `gradient_checkpointing_kwargs` in the trainer's kwargs, but your peft version does not support it. please update to the latest version of peft to use `gradient_checkpointing_kwargs`.zYA processing_class must be specified when using the default RewardDataCollatorWithPadding)rÇz°When using RewardDataCollatorWithPadding, you should set `remove_unused_columns=False` in your RewardConfig we have set it for you, but you should do it yourself in the future.TÚestimate_tokensÚinput_ids_chosenÚ	tokenizer)Ú	fn_kwargs)Úbatchedr.Únum_proccó t|dƒˆkot|dƒˆkS©Nr,Úinput_ids_rejected©Úlen©Úx©rr]r^Ú<lambda>s z0_UnslothRewardTrainer.__init__.<locals>.<lambda>)r0)r.r/r0cr1r2r4r6r8r]r^r9s)rrrrr r!r"r#r$r%r&Úadd_model_tags)'r&Ú
ValueErrorÚ
isinstancerÚgetattrÚlistr%Ú	signaturer/Ú
parametersrérêr3ÚwarnÚUserWarningrr!rrrrÇrr1Úuse_reward_data_collatorÚwarnings_issuedÚcolumn_namesrÚmain_process_firstÚmapr*rrÚfilterrr
Úhasattrrr:Ú
_tag_names)rrrrrr r!r"r#r$r%r&r'Ú_supports_gc_kwargsÚprepare_model_kwargsr.rr8r^r
~s´8ÿ

ÿ
ý
ÿ
ÿý
	ü	
þÿü	
ý€ã#õÿz_UnslothRewardTrainer.__init__FÚinputsÚreturncCs²||d|ddd�d}||d|ddd�d}d|vr.tj |||d¡ ¡}ntj ||¡ ¡}|jjdurN||jjt ||d	¡7}|rW|||d
œfS|S)Nr,Úattention_mask_chosenT)Ú	input_idsÚattention_maskÚreturn_dictrTr3Úattention_mask_rejectedÚmarginrh)Úrewards_chosenÚrewards_rejected)r,rÚ
logsigmoidÚmeanrrr2)rrrMÚreturn_outputsÚnum_items_in_batchrUrVÚlossr]r]r^Úcompute_loss0s2ýüýü þz"_UnslothRewardTrainer.compute_lossrŠÚignore_keysc	sè| |¡}ˆdurt|jdƒrt|jjdgƒ‰ng‰t ¡�|j||dd�\}}Wdƒn1s3wY|r?|ddfS| ¡}t	‡fdd„| 
¡Dƒƒ}t|ƒ}t |¡j
dd�jd	d�j}t |jd	¡}| |¡}|||fS)
NÚconfigÚkeys_to_ignore_at_inferenceT)rYc3s �|]\}}|ˆvr|VqdS©Nr])Ú.0ÚkÚv©r]r]r^Ú	<genexpr>fs€z8_UnslothRewardTrainer.prediction_step.<locals>.<genexpr>rhrFr)Ú_prepare_inputsrIrr=r^r2Úno_gradr\ÚdetachÚtupleÚitemsr+ÚstackrXÚsoftmaxÚTÚzerosrJ)	rrrMrŠr]r[Úlogits_dictrTÚlabelsr]rdr^Úprediction_stepQs"

ÿ


z%_UnslothRewardTrainer.prediction_stepcs(| dd¡}| |¡tƒj|i|¤ŽS)NÚnum_print_samplesrB)ÚpopÚvisualize_samplesrÚevaluate)rrrrrrr]r^ruqs
z_UnslothRewardTrainer.evaluaterrcCs>| ¡}ttƒ}t|ƒD]P\}}|j|j|dd�\}}}t|d|jƒ}t|d|jƒ}|d t	|ƒ¡|d t	|ƒ¡|d t	dd	„| 
¡Dƒƒ¡|d
kr\t|dƒ|kr\nqt 
|¡}	|jjd
kr›tƒrst|	d|…ƒd|jjvr�d
dl}
|
jdur�|
 d