fix: Disable double DDP construction inside build_model() via runtime patch (#264)

Xiaoming-AMD · Xiaoming-AMD · web-flow · commit 1baa0421a257 · 2025-11-03T20:27:24.000+08:00
Co-authored-by: Xiaoming-AMD <xiaompen@amd.com>
diff --git a/primus/pretrain.py b/primus/pretrain.py
@@ -16,6 +16,45 @@
 # Lazy backend loader
 def load_backend_trainer(framework: str):
     if framework == "megatron":
+        import megatron.training.training as training
+        import torch
+
+        _original_build_model = training.get_model
+
+        def _patched_get_model(*args, **kwargs):
+            """Run the original ``get_model`` with ``torch.cuda.stream`` neutralized.
+
+            While the wrapped call runs, ``torch.cuda.stream`` is swapped for a
+            factory that returns a no-op context manager, so every
+            ``with torch.cuda.stream(...)`` block entered during the call executes
+            without switching streams. The real ``torch.cuda.stream`` is restored
+            afterwards, even if the call raises.
+
+            NOTE(review): the intent is to avoid a second DDP construction inside
+            that block; confirm against the installed Megatron version.
+
+            All arguments are forwarded unchanged and the original return value
+            is passed through.
+            """
+            from contextlib import nullcontext
+
+            print("[PrimusPatch] Overriding build_model to disable second DDP construction...")
+            _orig_stream_ctx = torch.cuda.stream
+
+            def _noop_stream(*args, **kwargs):
+                # Accept and ignore whatever stream the caller passes in.
+                return nullcontext()
+
+            # Process-wide swap (not thread-safe); always undone in ``finally``.
+            torch.cuda.stream = _noop_stream
+            try:
+                return _original_build_model(*args, **kwargs)
+            finally:
+                torch.cuda.stream = _orig_stream_ctx
+
+        training.get_model = _patched_get_model
+        print("[PrimusPatch] Applied Megatron build_model monkey-patch to disable second DDP.")
+
         from primus.modules.trainer.megatron.pre_trainer import MegatronPretrainTrainer
 
         return MegatronPretrainTrainer