Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,15 @@ runs:
${{ github.workspace }}/${{ github.run_id }}/${{steps.uuid.outputs.id }}/nemo-rl/tests/.coverage
include-hidden-files: true

- name: Upload nemo_gym actual test data
uses: actions/upload-artifact@v6
if: always()
with:
name: actual_test_nemo_gym_sanity-${{ github.run_id }}
path: |
${{ github.workspace }}/${{ github.run_id }}/${{steps.uuid.outputs.id }}/nemo-rl/tests/unit/environments/nemo_gym_test_data/actual_test_nemo_gym_sanity.json
if-no-files-found: ignore

- name: Container shutdown
if: always()
shell: bash
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ coverage.json
unit_results.json
unit_results/
test_assets/
actual_test_nemo_gym_sanity.json
.nrl_remote_map.json
.nrl_remote_state.json

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ markers = [
"automodel: marks tests that require the automodel extra",
"vllm: marks tests that require the vllm extra",
"sglang: marks tests that require the sglang extra",
"nemo_gym: marks tests that require the nemo_gym extra",
]

[tool.pyrefly]
Expand Down
8 changes: 8 additions & 0 deletions tests/unit/L0_Unit_Tests_Other.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ else
uv run --extra sglang bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --hf-gated --sglang-only
fi

# Check and run nemo_gym tests
exit_code=$(cd ${PROJECT_ROOT}/tests && uv run --extra nemo_gym pytest "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --collect-only --nemo-gym-only -q >/dev/null 2>&1; echo $?)
if [[ $exit_code -eq 5 ]]; then
echo "No nemo_gym tests to run"
else
uv run --extra nemo_gym bash -x ./tests/run_unit.sh "${TEST_PATHS[@]}" "${IGNORE[@]}" "${EXCLUDED_UNIT_TESTS[@]}" --cov=nemo_rl --cov-append --cov-report=term-missing --cov-report=json --nemo-gym-only -vv
fi

# Skip research tests in fast mode
if [[ "${FAST:-0}" != "1" ]]; then
for i in research/*/tests/unit; do
Expand Down
28 changes: 27 additions & 1 deletion tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ def pytest_addoption(parser):
default=False,
help="Run ONLY sglang tests",
)
parser.addoption(
"--nemo-gym-only",
action="store_true",
default=False,
help="Run ONLY nemo_gym tests",
)


def pytest_collection_modifyitems(config, items):
Expand All @@ -72,17 +78,19 @@ def pytest_collection_modifyitems(config, items):
run_automodel_only = config.getoption("--automodel-only")
run_vllm_only = config.getoption("--vllm-only")
run_sglang_only = config.getoption("--sglang-only")
run_nemo_gym_only = config.getoption("--nemo-gym-only")

# Check for mutually exclusive options
exclusive_options = [
run_mcore_only,
run_automodel_only,
run_vllm_only,
run_sglang_only,
run_nemo_gym_only,
]
if sum(exclusive_options) > 1:
raise ValueError(
"--mcore-only, --automodel-only, --vllm-only, and --sglang-only are mutually exclusive"
"--mcore-only, --automodel-only, --vllm-only, --sglang-only, and --nemo-gym-only are mutually exclusive"
)

marker_expr = config.getoption("-m", default="")
Expand Down Expand Up @@ -170,6 +178,24 @@ def pytest_collection_modifyitems(config, items):
item for item in new_items if not item.get_closest_marker("sglang")
]

# Filter by nemo_gym marker
if run_nemo_gym_only:
# Validate that nemo_gym is available
try:
from nemo_gym import config_types # noqa: F401
except ImportError:
raise ImportError(
"Cannot run nemo_gym tests: nemo_gym is not available.\n"
"Please run tests with: uv run --extra nemo_gym --group test pytest ..."
)
# Include only nemo_gym tests
new_items = [item for item in new_items if item.get_closest_marker("nemo_gym")]
else:
# Exclude nemo_gym tests by default
new_items = [
item for item in new_items if not item.get_closest_marker("nemo_gym")
]

# Ensure run_first tests are prioritized
new_items.sort(key=lambda item: 0 if item.get_closest_marker("run_first") else 1)

Expand Down

Large diffs are not rendered by default.

64 changes: 48 additions & 16 deletions tests/unit/environments/test_nemo_gym.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import pytest
import ray
import torch
from yaml import safe_load

from nemo_rl.distributed.ray_actor_environment_registry import (
Expand All @@ -35,20 +36,11 @@
tokenizer as nemo_gym_tokenizer, # noqa: F401
)

try:
from nemo_gym import config_types # noqa: F401

NEMO_GYM_INSTALLED = True
except ImportError:
nemo_gym = None
NEMO_GYM_INSTALLED = False


@pytest.mark.skipif(
not NEMO_GYM_INSTALLED,
reason="Skipping NeMo-Gym test since NeMo-Gym is not installed!",
)
@pytest.mark.nemo_gym
def test_nemo_gym_stub_module():
from nemo_gym import config_types

print(
f"NeMo-Gym test successfully run! NeMo-Gym config_types module: {config_types}"
)
Expand Down Expand Up @@ -141,10 +133,43 @@ def nemo_gym_sanity_test_data():
return data


@pytest.mark.skipif(
not NEMO_GYM_INSTALLED,
reason="Skipping NeMo-Gym test since NeMo-Gym is not installed!",
)
def _write_actual_test_data(original_input: list, actual_result: list):
    """Write actual rollout results to actual_test_nemo_gym_sanity.json.

    This makes it easy to update the expected output after a Gym commit bump:
        cp nemo_gym_test_data/actual_test_nemo_gym_sanity.json nemo_gym_test_data/test_nemo_gym_sanity.json

    Args:
        original_input: the test inputs as they were before the rollout mutated them.
        actual_result: rollout result dicts; entries may contain torch tensors and
            a non-serializable "full_result" entry, both handled below.
    """

    def _convert(obj):
        """Recursively convert torch tensors to Python lists for JSON serialization."""
        if isinstance(obj, torch.Tensor):
            return obj.tolist()
        if isinstance(obj, dict):
            return {k: _convert(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [_convert(v) for v in obj]
        return obj

    # Work on a copy so the caller's result objects are left untouched, and strip
    # data that is not reproducible across runs: the raw rollout payload and the
    # per-message token ids / logprobs of every message after the first.
    cleaned = deepcopy(actual_result)
    for r in cleaned:
        r.pop("full_result", None)
        for msg in r.get("message_log", [])[1:]:
            if "token_ids" in msg:
                msg["token_ids"] = []
            if "generation_logprobs" in msg:
                msg["generation_logprobs"] = []

    output_path = (
        Path(__file__).parent / "nemo_gym_test_data/actual_test_nemo_gym_sanity.json"
    )
    # The output file is gitignored; make sure its directory exists so the dump
    # does not fail on a checkout where the data directory is absent.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    data = _convert({"input": original_input, "expected_output": cleaned})
    with open(output_path, "w") as f:
        json.dump(data, f)
        f.write("\n")
    print(f"Wrote updated test data to {output_path}")


@pytest.mark.nemo_gym
def test_nemo_gym_sanity(
nemo_gym,
nemo_gym_sanity_test_data,
Expand All @@ -153,6 +178,9 @@ def test_nemo_gym_sanity(
):
"""Test basic functionality of MathEnvironment step with simple messages."""

# Save original input before mutation for writing the actual test data file
original_input = deepcopy(nemo_gym_sanity_test_data["input"])

# We need to match NeMo RL generation config params before sending to NeMo-Gym
generation_config = nemo_gym_vllm_generation.cfg
examples = nemo_gym_sanity_test_data["input"]
Expand All @@ -178,6 +206,10 @@ def test_nemo_gym_sanity(
# for message in d["message_log"][:1]:
# message["token_ids"] = message["token_ids"].tolist()

# Write the actual result to a file so it can be used to update the expected output.
# To update: cp actual_test_nemo_gym_sanity.json test_nemo_gym_sanity.json
_write_actual_test_data(original_input, actual_result)

def _standardize_single_result(d: dict):
d = deepcopy(d)
d.pop("full_result", None)
Expand Down
20 changes: 14 additions & 6 deletions tests/unit/experience/test_rollouts.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@

# These are all fixtures
from tests.unit.environments.test_nemo_gym import (
NEMO_GYM_INSTALLED,
cluster, # noqa: F401
nemo_gym, # noqa: F401
nemo_gym_sanity_test_data, # noqa: F401
Expand Down Expand Up @@ -787,10 +786,7 @@ def test_run_sliding_puzzle_vllm(sliding_puzzle_setup_vllm):
print("\nSliding Puzzle VLLM Test assertions passed.")


@pytest.mark.skipif(
not NEMO_GYM_INSTALLED,
reason="Skipping NeMo-Gym test since NeMo-Gym is not installed!",
)
@pytest.mark.nemo_gym
def test_run_async_nemo_gym_rollout(
nemo_gym, # noqa: F811
nemo_gym_vllm_generation, # noqa: F811
Expand Down Expand Up @@ -827,9 +823,20 @@ def test_run_async_nemo_gym_rollout(

expected_result = {
"final_batch": {
"length": torch.tensor([3088, 3056]),
"agent_ref": [
{
"name": "example_multi_step_simple_agent",
"type": "responses_api_agents",
},
{
"name": "example_multi_step_simple_agent",
"type": "responses_api_agents",
},
],
"length": torch.tensor([3080, 3048]),
"loss_multiplier": torch.tensor([1.0, 1.0]),
"total_reward": torch.tensor([0.0, 0.0]),
"truncated": torch.tensor([False, False]),
},
"rollout_metrics": {
# core metrics
Expand Down Expand Up @@ -909,6 +916,7 @@ def _standardize(d: dict) -> dict:
final_batch["total_reward"] = final_batch["total_reward"].tolist()
final_batch["loss_multiplier"] = final_batch["loss_multiplier"].tolist()
final_batch["length"] = final_batch["length"].tolist()
final_batch["truncated"] = final_batch["truncated"].tolist()

for key in d["rollout_metrics"]:
# We remove these fields from comparison since we cannot guarantee exact generation reproducibility
Expand Down
Loading