Skip to content

Commit 3bfb113

Browse files
authored
Merge branch 'main' into abstract-distributed-apis-checkpoint-loader
2 parents bd621e7 + 7075ce4 commit 3bfb113

File tree

5 files changed

+57
-2
lines changed

5 files changed

+57
-2
lines changed

.github/workflows/build-and-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ jobs:
155155
echo ${{ github.event.number }} > pr_number.txt
156156
157157
- name: Archive coverage reports
158-
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # ratchet:actions/upload-artifact@v4
158+
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
159159
if: always()
160160
with:
161161
name: coverage-reports

.github/workflows/post-coverage-comment.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
# This workflow is kept separate because it requires write permissions, which GitHub blocks
16+
# on forks for security reasons. Thus, in order to be able to post comments with
17+
# code coverage details on PRs, we run a separate workflow in the context of the
18+
# base repository after any PR build workflow completes, relying on files uploaded
19+
# by the PR's build.
1520
name: Post Coverage Comment
1621

1722
on:

src/ml_flashpoint/adapter/nemo/wrapper_util.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,14 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
197197
+ f"'{checkpoint_io.__class__.__name__}'."
198198
)
199199

200+
# Use 'spawn' instead of 'fork' for the multiprocessing context.
201+
# By default, 'fork' causes the background SyncManager process to inherit
202+
# the parent's CUDA context. If the main training process is forcefully
203+
# killed (e.g., via SIGKILL during NVRX in-job restarts), the orphaned
204+
# manager process keeps the GPU memory locked, leading to CUDA Out-Of-Memory
205+
# (OOM) errors upon restart. 'spawn' launches a clean interpreter without
206+
# the inherited CUDA state, allowing the GPU memory to be freed instantly.
207+
ctx = torch_mp.get_context("spawn")
200208
save_strategy = MLFlashpointMegatronAsyncSaveStrategy(
201209
storage_writer=MemoryStorageWriter(
202210
checkpoint_saver=DefaultMLFlashpointCheckpointSaver(
@@ -208,7 +216,7 @@ def wrap_trainer_checkpoint_io_with_mlflashpoint(
208216
initial_buffer_size_bytes=initial_write_buffer_size_bytes,
209217
use_optimized_save=use_optimized_save,
210218
),
211-
mp_manager=torch_mp.Manager(),
219+
mp_manager=ctx.Manager(),
212220
thread_count=write_thread_count,
213221
)
214222
)

src/ml_flashpoint/adapter/pytorch/memory_storage_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ def __init__(
9797
handling the actual checkpoint saving logic.
9898
mp_manager: A `torch.multiprocessing.Manager` instance for managing
9999
shared state across processes, particularly for write results and events.
100+
It is highly recommended to create this manager using a 'spawn'
101+
multiprocessing context to avoid inheriting the parent's CUDA context,
102+
which prevents CUDA OOM errors during failure recovery.
100103
thread_count: Optional. The number of threads to use for writing checkpoint data.
101104
Defaults to 1. If a value less than 1 is provided, it will be reset to 1,
102105
and a warning will be logged.

tests/adapter/nemo/test_wrapper_util.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -794,6 +794,45 @@ def test_write_thread_count_forwarding(
794794
_, kwargs = spy_memory_storage_writer_init.call_args
795795
assert kwargs["thread_count"] == expected_thread_count
796796

797+
def test_spawn_context_used_for_mp_manager(self, mocker, mock_ckpt_obj_manager, mock_replication_manager):
798+
"""Tests that torch_mp.get_context('spawn').Manager() is correctly instantiated and passed."""
799+
# Given
800+
trainer = mocker.MagicMock(spec=nl_trainer.Trainer)
801+
trainer.callbacks = [mocker.MagicMock(spec=MLFlashpointCheckpointCallback)]
802+
trainer.strategy = mocker.MagicMock(spec=nl_strategies.MegatronStrategy)
803+
original_checkpoint_io = mocker.MagicMock(spec=MegatronCheckpointIO)
804+
trainer.strategy.checkpoint_io = original_checkpoint_io
805+
base_container = "/test_base_container"
806+
807+
mock_get_context = mocker.patch("ml_flashpoint.adapter.nemo.wrapper_util.torch_mp.get_context")
808+
809+
mock_ctx = mock_get_context.return_value # The mocked context object
810+
mock_manager_instance = mock_ctx.Manager.return_value # The mocked manager instance
811+
812+
spy_memory_storage_writer_init = mocker.spy(MemoryStorageWriter, "__init__")
813+
814+
# When
815+
wrap_trainer_checkpoint_io_with_mlflashpoint(
816+
trainer,
817+
base_container,
818+
mock_ckpt_obj_manager,
819+
mock_replication_manager,
820+
async_save=True,
821+
checkpoint_loader=mocker.MagicMock(spec=DefaultMLFlashpointCheckpointLoader),
822+
)
823+
824+
# Then
825+
# Verify get_context was called explicitly with 'spawn'
826+
mock_get_context.assert_called_once_with("spawn")
827+
828+
# Verify Manager() was called on the correct spawn context
829+
mock_ctx.Manager.assert_called_once()
830+
831+
# Verify the exact Manager instance was passed to MemoryStorageWriter
832+
spy_memory_storage_writer_init.assert_called_once()
833+
_, kwargs = spy_memory_storage_writer_init.call_args
834+
assert kwargs["mp_manager"] is mock_manager_instance
835+
797836
@pytest.mark.parametrize("always_save_context, expected_value", [(True, True), (False, False)])
798837
def test_always_save_context_forwarding(
799838
self, mocker, mock_ckpt_obj_manager, mock_replication_manager, always_save_context, expected_value

0 commit comments

Comments
 (0)