Skip to content
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
7939166
Add tests to ensure behavior consistency of chat template application
qgallouedec Feb 16, 2026
985e201
parametrize
qgallouedec Feb 16, 2026
49c7cc6
Support for GPT-OSS
qgallouedec Feb 17, 2026
5c2b319
Update doc
qgallouedec Feb 17, 2026
8e551f5
Add docstyle-ignore directive for gpt_oss_chat_template
qgallouedec Feb 17, 2026
42a3a8d
Merge branch 'main' into more-test-get_training_chat_template
qgallouedec Feb 17, 2026
499a111
Merge branch 'more-test-get_training_chat_template' into support-gpt-oss
qgallouedec Feb 17, 2026
4d9e40d
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 18, 2026
39aed19
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 18, 2026
a3c51d2
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 18, 2026
02fbb6e
be more conservative
qgallouedec Feb 18, 2026
17361c2
not useful
qgallouedec Feb 18, 2026
7416e5f
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 20, 2026
3322788
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 23, 2026
01024d6
address tool analysis being dropped
qgallouedec Feb 24, 2026
7f58bc7
docstyle ignore
qgallouedec Feb 24, 2026
9709506
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 24, 2026
6dea19d
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 25, 2026
e9b5d4f
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 25, 2026
df6cba8
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 26, 2026
fd2c497
Merge branch 'main' into support-gpt-oss
qgallouedec Feb 27, 2026
92af268
Merge branch 'main' into support-gpt-oss
qgallouedec Mar 2, 2026
0d0edf8
Merge branch 'main' into support-gpt-oss
qgallouedec Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/source/grpo_trainer.md
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ trainer = GRPOTrainer(

Tested with:

- **GPT-OSS** — e.g., `openai/gpt-oss-20b`
- **Qwen3** — e.g., `Qwen/Qwen3-0.6B`

> [!TIP]
Expand All @@ -665,6 +666,8 @@ accelerate launch \
...
```

You can also use `openai/gpt-oss-20b` as the base model.

## Vision-Language Model (VLM) Training

GRPO supports training Vision-Language Models (VLMs) on multimodal datasets containing both text and images.
Expand Down
35 changes: 25 additions & 10 deletions tests/test_chat_template_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,25 @@ def test_non_prefix_preserving_template(self):
@pytest.mark.parametrize(
"tokenizer_name",
[
pytest.param("trl-internal-testing/tiny-GptOssForCausalLM", id="gpt-oss"),
pytest.param("trl-internal-testing/tiny-Qwen3MoeForSequenceClassification", id="qwen3"),
],
)
class TestGetTrainingChatTemplate:
@staticmethod
def _replace_end(text: str, old: str, new: str) -> str:
if text.endswith(old):
return text[: -len(old)] + new
return text

def _assert_equal(self, tokenizer_name: str, before: str, after: str) -> None:
# Same as `before == after` but with a special case for GPT-OSS.
# For GPT-OSS, the training template replaces the final <|return|> with <|end|> to ensure prefix preservation,
# so we expect a difference in the output.
if tokenizer_name == "trl-internal-testing/tiny-GptOssForCausalLM":
before = self._replace_end(before, "<|return|>", "<|end|>")
assert before == after

def test_new_chat_template_is_prefix_preserving(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
assert is_chat_template_prefix_preserving(tokenizer) is False
Expand All @@ -232,7 +247,7 @@ def test_behavior_unchanged_single_user_no_generation_prompt(self, tokenizer_nam
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_single_user_with_generation_prompt(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -245,7 +260,7 @@ def test_behavior_unchanged_single_user_with_generation_prompt(self, tokenizer_n
add_generation_prompt=True,
chat_template=new_chat_template,
)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_single_user_and_final_assistant_plain_content(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -256,7 +271,7 @@ def test_behavior_unchanged_single_user_and_final_assistant_plain_content(self,
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_final_assistant_with_reasoning_content(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -271,7 +286,7 @@ def test_behavior_unchanged_final_assistant_with_reasoning_content(self, tokeniz
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_final_assistant_with_existing_think_tags(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -285,7 +300,7 @@ def test_behavior_unchanged_final_assistant_with_existing_think_tags(self, token
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_assistant_with_tool_calls(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -300,7 +315,7 @@ def test_behavior_unchanged_assistant_with_tool_calls(self, tokenizer_name):
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_assistant_with_tool_calls_with_string_arguments(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -315,7 +330,7 @@ def test_behavior_unchanged_assistant_with_tool_calls_with_string_arguments(self
before = tokenizer.apply_chat_template(messages, tokenize=False)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_with_tools_with_and_without_system_message(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -340,7 +355,7 @@ def test_behavior_unchanged_with_tools_with_and_without_system_message(self, tok
before = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_with_tools_with_system_message(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -365,7 +380,7 @@ def test_behavior_unchanged_with_tools_with_system_message(self, tokenizer_name)
before = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools)
new_chat_template = get_training_chat_template(tokenizer)
after = tokenizer.apply_chat_template(messages, tokenize=False, tools=tools, chat_template=new_chat_template)
assert before == after
self._assert_equal(tokenizer_name, before, after)

def test_behavior_unchanged_generation_prompt_with_enable_thinking_false(self, tokenizer_name):
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
Expand All @@ -381,7 +396,7 @@ def test_behavior_unchanged_generation_prompt_with_enable_thinking_false(self, t
enable_thinking=False,
chat_template=new_chat_template,
)
assert before == after
self._assert_equal(tokenizer_name, before, after)


@pytest.mark.parametrize(
Expand Down
Loading
Loading