fix(pipeline): honor keep_mounts=False to prevent sandbox mount leak (#1394)

gwarmstrong · Kipok · web-flow · commit 645cf567ff08 · 2026-04-28T05:13:33.000Z
Signed-off-by: gwarmstrong <gwarmstrong@users.noreply.github.com>
Co-authored-by: gwarmstrong <gwarmstrong@users.noreply.github.com>
Co-authored-by: Igor Gitman <igitman@nvidia.com>
diff --git a/nemo_skills/pipeline/utils/declarative.py b/nemo_skills/pipeline/utils/declarative.py
@@ -298,10 +298,13 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[run.Script, Dict]
         # For SandboxScript, keep_mounts=False (the safe default) maps to mounts=[]
         # so the sandbox container has no access to cluster filesystems.
         # keep_mounts=True maps to mounts=None, which inherits cluster mounts.
+        # keep_mounts is propagated separately so Stage B (_create_executor) can
+        # honor the isolation request even when Command.mounts is an explicit list
+        # (in which case Stage A's resolved_mounts alone loses that signal).
+        keep_mounts = getattr(self.script, "keep_mounts", True)
         if self.mounts is not None:
             resolved_mounts = self.mounts
         else:
-            keep_mounts = getattr(self.script, "keep_mounts", True)
             resolved_mounts = None if keep_mounts else []
 
         merged_env = dict(runtime_metadata.get("environment", {}))
@@ -311,6 +314,7 @@ def prepare_for_execution(self, cluster_config: Dict) -> Tuple[run.Script, Dict]
             "log_prefix": getattr(self.script, "log_prefix", "main"),
             "environment": merged_env,
             "mounts": resolved_mounts,
+            "keep_mounts": keep_mounts,
             "container": self.container,
         }
 
@@ -647,13 +651,28 @@ def _create_executor(
             else (hardware.num_tasks if hardware and hardware.num_tasks is not None else 1)
         )
 
-        # Allow per-command extra mounts without requiring editing the cluster YAML.
-        # We treat exec_config["mounts"] as additive and merge it with mounts from cluster_config.
-        mounts = None
-        extra_mounts = exec_config["mounts"] or None
-        if extra_mounts:
+        # Resolve mounts based on Stage A output and the script's keep_mounts flag:
+        # - mounts=None: inherit cluster mounts (Stage C default).
+        # - keep_mounts=False: the script asked for filesystem isolation. Pass its
+        #   mounts list verbatim (even empty) so cluster mounts are NOT merged in.
+        # - keep_mounts=True + non-empty extras: additive merge with cluster mounts.
+        # - keep_mounts=True + empty extras: inherit cluster mounts.
+        # Stage A invariant: mounts=None is only produced when keep_mounts=True
+        # (keep_mounts=False with no explicit Command.mounts is normalized to []),
+        # so the `extra_mounts is None` branch below is safe to take before
+        # consulting keep_mounts. `.get(..., True)` defends against exec_configs
+        # built by callers that bypass Stage A.
+        extra_mounts = exec_config["mounts"]
+        keep_mounts = exec_config.get("keep_mounts", True)
+        if extra_mounts is None:
+            mounts = None
+        elif not keep_mounts:
+            mounts = list(extra_mounts)
+        elif extra_mounts:
             base_mounts = get_mounts_from_config(cluster_config)
             mounts = base_mounts + [m for m in extra_mounts if m not in base_mounts]
+        else:
+            mounts = None
 
         # Sandbox-specific srun overrides: allow the sandbox to survive individual
         # worker crashes (e.g. SIGILL from libraries compiled for a different CPU).
diff --git a/tests/test_declarative_pipeline.py b/tests/test_declarative_pipeline.py
@@ -922,5 +922,157 @@ def capture_env_update(cluster_config, updates):
                     )
 
 
+class TestMountsResolution:
+    """Regression tests for the Command/Pipeline mounts resolution flow.
+
+    Covers the full (Command.mounts x script.keep_mounts) matrix described in
+    the sandbox-mount-leak bug analysis. The three bug rows share keep_mounts=False
+    and must NOT receive cluster mounts back via the Stage B additive merge.
+    """
+
+    CLUSTER_MOUNTS = ["/cluster/a:/cluster/a", "/cluster/b:/cluster/b"]
+
+    def _make_script(self, *, keep_mounts=None):
+        """Return a DummyScript with an optional keep_mounts attribute."""
+        script = DummyScript(inline="echo test")
+        if keep_mounts is not None:
+            script.keep_mounts = keep_mounts
+        return script
+
+    # -------------------- Stage A: Command.prepare_for_execution --------------------
+
+    @pytest.mark.parametrize(
+        "command_mounts, keep_mounts_attr, expected_mounts, expected_keep_mounts",
+        [
+            # Command.mounts=None
+            (None, None, None, True),  # non-sandbox (keep_mounts attr absent -> defaults True)
+            (None, True, None, True),  # sandbox opt-in: inherit cluster mounts
+            (None, False, [], False),  # sandbox default: empty list, flag propagated
+            # Command.mounts=[]
+            ([], None, [], True),
+            ([], True, [], True),
+            ([], False, [], False),
+            # Command.mounts=[/a:/b]
+            (["/a:/b"], None, ["/a:/b"], True),
+            (["/a:/b"], True, ["/a:/b"], True),
+            (["/a:/b"], False, ["/a:/b"], False),
+        ],
+    )
+    def test_stage_a_resolved_mounts_and_keep_mounts(
+        self, command_mounts, keep_mounts_attr, expected_mounts, expected_keep_mounts
+    ):
+        """Stage A must store mounts and the keep_mounts flag in execution_config."""
+        script = self._make_script(keep_mounts=keep_mounts_attr)
+        cmd = Command(script=script, name="c", mounts=command_mounts)
+        cluster_config = {"executor": "local", "containers": {}}
+
+        _, exec_config = cmd.prepare_for_execution(cluster_config)
+
+        assert exec_config["mounts"] == expected_mounts
+        assert exec_config["keep_mounts"] is expected_keep_mounts
+
+    # -------------------- Stage B/C: end-to-end mounts passed to get_executor --------------------
+
+    def _run_pipeline_and_capture_mounts(self, command_mounts, keep_mounts_attr):
+        """Run a one-command Pipeline with mocks and return the mounts kwarg passed to get_executor."""
+        captured = {}
+
+        def mock_get_executor(**kwargs):
+            captured["mounts"] = kwargs.get("mounts")
+            executor = MagicMock()
+            executor.packager = MagicMock()
+            return executor
+
+        cluster_config = {
+            "executor": "slurm",
+            "containers": {"nemo-skills": "test/container"},
+            "account": "test",
+            "env_vars": {"HF_HOME": "/hf"},
+            "mounts": self.CLUSTER_MOUNTS,
+        }
+
+        script = self._make_script(keep_mounts=keep_mounts_attr)
+        cmd = Command(script=script, name="c", mounts=command_mounts)
+        group = CommandGroup(commands=[cmd], name="g", log_dir="/logs")
+
+        with (
+            patch("nemo_skills.pipeline.utils.declarative.get_executor", side_effect=mock_get_executor),
+            patch(
+                "nemo_skills.pipeline.utils.declarative.get_mounts_from_config",
+                return_value=list(self.CLUSTER_MOUNTS),
+            ),
+            patch("nemo_skills.pipeline.utils.declarative.get_env_variables", return_value={"HF_HOME": "/hf"}),
+            patch("nemo_skills.pipeline.utils.declarative.get_exp") as mock_get_exp,
+            patch("nemo_skills.pipeline.utils.declarative.run_exp"),
+        ):
+            mock_exp = MagicMock()
+            mock_exp.__enter__ = MagicMock(return_value=mock_exp)
+            mock_exp.__exit__ = MagicMock(return_value=False)
+            mock_exp.add = MagicMock(return_value="handle")
+            mock_get_exp.return_value = mock_exp
+
+            Pipeline(
+                name="test",
+                cluster_config=cluster_config,
+                jobs=[{"name": "j", "group": group}],
+                skip_hf_home_check=True,
+                reuse_code=False,
+            ).run(dry_run=True)
+
+        assert "mounts" in captured, "get_executor was not called"
+        return captured["mounts"]
+
+    # ---- Non-bug rows: expected pre-fix behavior is preserved ----
+
+    def test_mounts_none_no_keep_mounts_attr_inherits_cluster(self):
+        """Non-sandbox script with no explicit mounts inherits cluster mounts."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=None, keep_mounts_attr=None)
+        # Stage C falls back to cluster mounts when mounts kwarg is None
+        assert mounts is None
+
+    def test_mounts_none_keep_mounts_true_inherits_cluster(self):
+        """keep_mounts=True with no explicit list inherits cluster mounts."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=None, keep_mounts_attr=True)
+        assert mounts is None
+
+    def test_mounts_empty_no_keep_mounts_attr_inherits_cluster(self):
+        """Empty Command.mounts on a non-sandbox script is treated as 'no extras' -> inherit."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=[], keep_mounts_attr=None)
+        assert mounts is None
+
+    def test_mounts_empty_keep_mounts_true_inherits_cluster(self):
+        """Empty Command.mounts with keep_mounts=True also inherits cluster mounts."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=[], keep_mounts_attr=True)
+        assert mounts is None
+
+    def test_mounts_extra_no_keep_mounts_attr_additive_merge(self):
+        """Non-sandbox extras are additively merged with cluster mounts."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=["/a:/b"], keep_mounts_attr=None)
+        assert mounts == self.CLUSTER_MOUNTS + ["/a:/b"]
+
+    def test_mounts_extra_keep_mounts_true_additive_merge(self):
+        """keep_mounts=True with extras: additive merge (opt-in inherit + extras)."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=["/a:/b"], keep_mounts_attr=True)
+        assert mounts == self.CLUSTER_MOUNTS + ["/a:/b"]
+
+    # ---- Bug rows: keep_mounts=False must isolate from cluster mounts ----
+
+    def test_bug_row_1_mounts_none_keep_mounts_false_no_cluster_leak(self):
+        """Sandbox default (Command.mounts=None, keep_mounts=False): no cluster mounts leak through."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=None, keep_mounts_attr=False)
+        # Must be an empty list passed to get_executor so Stage C does NOT fall back to cluster mounts
+        assert mounts == [], f"keep_mounts=False leaked cluster mounts: {mounts}"
+
+    def test_bug_row_2_mounts_empty_keep_mounts_false_no_cluster_leak(self):
+        """Sandbox with explicit empty list (Command.mounts=[], keep_mounts=False): no cluster mounts leak."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=[], keep_mounts_attr=False)
+        assert mounts == [], f"keep_mounts=False leaked cluster mounts: {mounts}"
+
+    def test_bug_row_3_mounts_extra_keep_mounts_false_no_cluster_merge(self):
+        """Sandbox with explicit extras (Command.mounts=[/a:/b], keep_mounts=False): extras verbatim, no cluster merge."""
+        mounts = self._run_pipeline_and_capture_mounts(command_mounts=["/a:/b"], keep_mounts_attr=False)
+        assert mounts == ["/a:/b"], f"keep_mounts=False merged cluster mounts into sandbox: {mounts}"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])