[algorithm][generator] change overlong filtering to use stop reasons over checking eos token (NovaSky-AI#1319)

erictang000 · web-flow · commit af42cf80b976 · 2026-03-12T16:07:58.000-07:00
Previously, overlong filtering did this:

```python
[
    [0] * len(mask) if not response or response[-1] != eos_token_id else mask
    for mask, response in zip(loss_masks, response_ids)
]
```

This was flaky, since a model can end its response with a token other than `tokenizer.eos_token_id`. This is the case for `moonlight_16b_a3b`, which ends with `<|im_end|>` even though its `tokenizer.eos_token_id` is separately set to `[EOS]`. It is more reliable to check `stop_reason != "stop"` using the stop reasons reported by the inference engine. This overlaps slightly with `zero_reward_on_non_stop`, but the behavior is different: overlong filtering zeroes out the loss mask, not the reward (which is the environment's responsibility).

---

<a href="https://app.devin.ai/review/novasky-ai/skyrl/pull/1319" target="_blank"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open with Devin"> </picture> </a>
diff --git a/skyrl/train/generators/skyrl_gym_generator.py b/skyrl/train/generators/skyrl_gym_generator.py
@@ -650,7 +650,8 @@ async def generate_batched(
         rollout_metrics = get_rollout_metrics(responses, rewards, env_metrics, env_classes)
 
         if self.generator_cfg.apply_overlong_filtering:
-            loss_masks = apply_overlong_filtering(loss_masks, responses, self.tokenizer.eos_token_id)
+            # set loss mask to 0 if the stop reason is not "stop"
+            loss_masks = apply_overlong_filtering(loss_masks, stop_reasons)
 
         generator_output: GeneratorOutput = {
             "prompt_token_ids": prompt_token_ids,
@@ -767,7 +768,8 @@ async def generate(self, input_batch: GeneratorInput, disable_tqdm: bool = False
             rewards = self._zero_reward_if_not_stop(rewards, stop_reasons)
 
         if self.generator_cfg.apply_overlong_filtering:
-            loss_masks = apply_overlong_filtering(loss_masks, responses, self.tokenizer.eos_token_id)
+            # set loss mask to 0 if the stop reason is not "stop"
+            loss_masks = apply_overlong_filtering(loss_masks, stop_reasons)
 
         generator_output: GeneratorOutput = {
             "prompt_token_ids": prompt_token_ids,
diff --git a/skyrl/train/generators/utils.py b/skyrl/train/generators/utils.py
@@ -276,20 +276,26 @@ def concatenate_generator_outputs(generator_outputs: List[GeneratorOutput]) -> G
 
 def apply_overlong_filtering(
     loss_masks: List[List[int]],
-    response_ids: List[List[int]],
-    eos_token_id: int,
+    stop_reasons: List[str],
 ) -> List[List[int]]:
     """
     Implements DAPO Overlong Filtering: zero-out every token's mask whenever
-    the response does not end with the eos token id (i.e. truncated).
+    the response was truncated (i.e. did not end with a stop token).
+
+    Uses stop_reasons from the inference engine rather than checking for a
+    specific eos token id, making this model/tokenizer agnostic.
+
+    Args:
+        loss_masks: Per-trajectory token loss masks.
+        stop_reasons: Per-trajectory stop reasons from the inference engine
+            (e.g. "stop" for normal completion, "length" for truncation).
 
     Returns:
-        - The loss masks with tokens zeroed out for truncated responses
+        The loss masks with tokens zeroed out for truncated responses.
     """
-    assert len(loss_masks) == len(response_ids), "loss_masks and response_ids must have the same length"
+    assert len(loss_masks) == len(stop_reasons), "loss_masks and stop_reasons must have the same length"
     return [
-        [0] * len(mask) if not response or response[-1] != eos_token_id else mask
-        for mask, response in zip(loss_masks, response_ids)
+        [0] * len(mask) if stop_reason != "stop" else mask[:] for mask, stop_reason in zip(loss_masks, stop_reasons)
     ]
 
 
diff --git a/tests/train/generators/test_utils.py b/tests/train/generators/test_utils.py
@@ -36,115 +36,95 @@ def qwen3_acc_thinking_template():
 
 
 @pytest.mark.parametrize(
-    "loss_masks,response_ids,eos_token_id,expected_masks",
+    "loss_masks,stop_reasons,expected_masks",
     [
-        # Test case 1: All responses end with eos token - masks should remain unchanged
+        # Test case 1: All responses completed normally - masks should remain unchanged
         (
             [[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1]],
-            [[1, 2, 3, 4], [5, 6, 7, 4], [8, 9, 4]],  # All end with eos_token_id=4
-            4,
+            ["stop", "stop", "stop"],
             [[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1]],
         ),
-        # Test case 2: No responses end with eos token - all masks should be zeroed
+        # Test case 2: All responses truncated - all masks should be zeroed
         (
             [[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1]],
-            [[1, 2, 3, 5], [5, 6, 7, 8], [8, 9, 10]],  # None end with eos_token_id=4
-            4,
+            ["length", "length", "length"],
             [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0]],
         ),
-        # Test case 3: Mixed responses - only non-eos ending masks should be zeroed
+        # Test case 3: Mixed - only truncated masks should be zeroed
         (
             [[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1, 0, 1]],
-            [[1, 2, 3, 4], [5, 6, 7, 8], [8, 9, 10, 11, 4]],  # First and third end with eos_token_id=4
-            4,
+            ["stop", "length", "stop"],
             [[1, 1, 0, 1], [0, 0, 0, 0], [1, 0, 1, 0, 1]],
         ),
-        # Test case 4: Empty responses should be zeroed
+        # Test case 4: Various non-"stop" reasons should all be zeroed
         (
             [[1, 1], [1, 0, 1], [0, 1, 1, 1]],
-            [[], [1, 2, 3], [4, 5, 6, 7]],  # Empty, no eos, no eos (eos_token_id=4)
-            4,
+            ["length", "abort", "cancelled"],
             [[0, 0], [0, 0, 0], [0, 0, 0, 0]],
         ),
         # Test case 5: Empty lists
-        ([], [], 4, []),
-        # Test case 6: Different eos token id
-        (
-            [[1, 1], [1, 0, 1], [0, 1, 1, 1]],
-            [[1, 2], [3, 4, 99], [5, 6, 7, 99]],  # Second and third end with eos_token_id=99
-            99,
-            [[0, 0], [1, 0, 1], [0, 1, 1, 1]],
-        ),
+        ([], [], []),
     ],
 )
-def test_apply_overlong_filtering(loss_masks, response_ids, eos_token_id, expected_masks):
+def test_apply_overlong_filtering(loss_masks, stop_reasons, expected_masks):
     """
     Test the apply_overlong_filtering function which implements DAPO Overlong Filtering.
 
-    This function should zero-out every token's mask whenever the response does not end
-    with the eos token id (i.e. truncated), while leaving other masks unchanged.
+    This function should zero-out every token's mask whenever the stop reason is not "stop"
+    (i.e. the response was truncated), while leaving other masks unchanged.
     """
-    result = apply_overlong_filtering(loss_masks, response_ids, eos_token_id)
+    result = apply_overlong_filtering(loss_masks, stop_reasons)
 
     assert result == expected_masks, f"Expected {expected_masks}, but got {result}"
 
-    # Verify that the original inputs are not modified (immutability check)
     assert len(result) == len(loss_masks), "Result should have same length as input"
 
-    # Check that each individual mask is processed correctly
-    for i, (original_mask, response, expected_mask) in enumerate(zip(loss_masks, response_ids, expected_masks)):
-        if len(response) == 0 or response[-1] != eos_token_id:
-            # Should be all zeros with same length as original
+    for i, (original_mask, stop_reason, expected_mask) in enumerate(zip(loss_masks, stop_reasons, expected_masks)):
+        if stop_reason != "stop":
             assert result[i] == [0] * len(original_mask), f"Mask {i} should be all zeros for truncated response"
         else:
-            # Should be unchanged
-            assert result[i] == original_mask, f"Mask {i} should be unchanged for response ending with eos token"
+            assert result[i] == original_mask, f"Mask {i} should be unchanged for completed response"
 
 
 def test_apply_overlong_filtering_immutability():
     """
     Test that apply_overlong_filtering doesn't modify the original input lists.
     """
     original_loss_masks = [[1, 1, 0, 1], [0, 1, 1]]
-    original_response_ids = [[1, 2, 3, 4], [5, 6, 7]]  # First ends with eos=4, second doesn't
-    eos_token_id = 4
+    original_stop_reasons = ["stop", "length"]
 
-    # Create copies to compare against later
-    loss_masks_copy = [mask[:] for mask in original_loss_masks]  # Deep copy of lists
-    response_ids_copy = [response[:] for response in original_response_ids]  # Deep copy of lists
+    loss_masks_copy = [mask[:] for mask in original_loss_masks]
+    stop_reasons_copy = original_stop_reasons[:]
 
-    result = apply_overlong_filtering(original_loss_masks, original_response_ids, eos_token_id)
+    result = apply_overlong_filtering(original_loss_masks, original_stop_reasons)
 
-    # Verify original inputs are unchanged
     assert original_loss_masks == loss_masks_copy, "Original loss_masks should not be modified"
-    assert original_response_ids == response_ids_copy, "Original response_ids should not be modified"
+    assert original_stop_reasons == stop_reasons_copy, "Original stop_reasons should not be modified"
 
-    # Verify result is correct
-    expected = [[1, 1, 0, 1], [0, 0, 0]]  # Second mask zeroed due to not ending with eos
+    expected = [[1, 1, 0, 1], [0, 0, 0]]  # Second mask zeroed due to truncation
     assert result == expected, f"Expected {expected}, got {result}"
 
 
 @pytest.mark.parametrize(
-    "loss_masks,response_ids",
+    "loss_masks,stop_reasons",
     [
-        # Test case 1: More loss_masks than response_ids
-        ([[1, 1], [0, 1]], [[1, 2]]),
-        # Test case 2: More response_ids than loss_masks
-        ([[1, 1]], [[1, 2], [3, 4]]),
-        # Test case 3: Empty loss_masks but non-empty response_ids
-        ([], [[1, 2]]),
-        # Test case 4: Non-empty loss_masks but empty response_ids
+        # Test case 1: More loss_masks than stop_reasons
+        ([[1, 1], [0, 1]], ["stop"]),
+        # Test case 2: More stop_reasons than loss_masks
+        ([[1, 1]], ["stop", "length"]),
+        # Test case 3: Empty loss_masks but non-empty stop_reasons
+        ([], ["stop"]),
+        # Test case 4: Non-empty loss_masks but empty stop_reasons
         ([[1, 0]], []),
     ],
 )
-def test_apply_overlong_filtering_length_mismatch_assertion(loss_masks, response_ids):
+def test_apply_overlong_filtering_length_mismatch_assertion(loss_masks, stop_reasons):
     """
-    Test that apply_overlong_filtering raises AssertionError when loss_masks and response_ids
+    Test that apply_overlong_filtering raises AssertionError when loss_masks and stop_reasons
     have different lengths.
     """
-    eos_token_id = 4
-    with pytest.raises(AssertionError, match="loss_masks and response_ids must have the same length"):
-        apply_overlong_filtering(loss_masks, response_ids, eos_token_id)
+    with pytest.raises(AssertionError, match="loss_masks and stop_reasons must have the same length"):
+        apply_overlong_filtering(loss_masks, stop_reasons)
 
 
 dummy_chat_template = (