diff --git a/trl/trainer/grpo_trainer.py b/trl/trainer/grpo_trainer.py
index 4e5e78d303..6b6a0e186a 100644
--- a/trl/trainer/grpo_trainer.py
+++ b/trl/trainer/grpo_trainer.py
@@ -330,7 +330,9 @@ def reward_func(completions, **kwargs):
             Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
         processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
             Processing class used to process the data. The padding side must be set to "left". If `None`, the
-            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`].
+            processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`]. A
+            padding token, `processing_class.pad_token`, must be set. If the processing class has not set a padding
+            token, `processing_class.eos_token` will be used as the default.
         reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
             Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
@@ -436,6 +438,8 @@ def __init__(
         # Processing class
         if processing_class is None:
             processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left")
+        if processing_class.pad_token is None:
+            processing_class.pad_token = processing_class.eos_token
 
         # Reward functions
         if not isinstance(reward_funcs, list):
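
A minimal sketch of the fallback this patch adds, shown outside the trainer. It assumes a GPT-2 tokenizer purely as an example of a tokenizer that ships without a pad token; the trainer applies the same two-line fallback when it loads the tokenizer itself (i.e., when `processing_class=None`).

```python
from transformers import AutoTokenizer

# GPT-2's tokenizer has no pad token by default (example assumption: "gpt2").
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
assert tokenizer.pad_token is None

# Same fallback the trainer now performs in __init__:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(tokenizer.pad_token)  # "<|endoftext|>" for GPT-2
```

Note that the docstring change above still applies when a `processing_class` is passed explicitly: a padding token must be set on it, since generation and completion padding rely on `processing_class.pad_token`.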