Skip to content

Commit a133f93

Browse files
authored
Tiny extract and enhance oom dumper (#568)
1 parent 8e2a8a3 commit a133f93

File tree

5 files changed

+45
-20
lines changed

5 files changed

+45
-20
lines changed

slime/backends/fsdp_utils/actor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import wandb
2323

2424
from slime.ray.train_actor import TrainRayActor
25+
from slime.utils import profile_utils
2526
from slime.utils.data import get_minimum_num_micro_batch_size, process_rollout_data
2627
from slime.utils.distributed_utils import get_gloo_group
2728
from slime.utils.ppo_utils import compute_approx_kl, compute_policy_loss
@@ -60,6 +61,12 @@ def init(self, args: Namespace, role: str, wandb_run_id: str, with_ref: bool = F
6061
self.args = args
6162
torch.manual_seed(args.seed)
6263

64+
if args.record_memory_history:
65+
profile_utils.attach_oom_dump_memory_history(
66+
memory_snapshot_dir=args.memory_snapshot_dir,
67+
memory_snapshot_path=args.memory_snapshot_path,
68+
)
69+
6370
for i in range(dist.get_world_size()):
6471
if i == dist.get_rank():
6572
self.hf_config = AutoConfig.from_pretrained(self.args.hf_checkpoint, trust_remote_code=True)

slime/backends/fsdp_utils/arguments.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@ class FSDPArgs:
3232
# FSDP configuration
3333
fsdp_full_params: bool = False # If True, use full_tensor; if False, use shard_tensor
3434

35+
# Profile
36+
record_memory_history: bool = False
37+
memory_snapshot_path: str = "snapshot.pickle"
38+
3539
# YAML bookkeeping
3640
config: str | None = None
3741

slime/backends/megatron_utils/model_provider.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from megatron.core.transformer.spec_utils import import_module
1616
from megatron.core.transformer.transformer_config import TransformerConfig
1717
from megatron.training.arguments import core_transformer_config_from_args
18+
from slime.utils import profile_utils
1819

1920

2021
# Adapt from https://github.com/volcengine/verl/blob/c3b20575d2bc815fcccd84bddb4c0401fc4b632b/verl/models/llama/megatron/layers/parallel_linear.py#L82
@@ -70,29 +71,13 @@ def model_provider(
7071
"""
7172
use_te = args.transformer_impl == "transformer_engine"
7273

74+
# TODO maybe move this to other parts
7375
if args.record_memory_history:
74-
torch.cuda.memory._record_memory_history(
75-
# True,
76-
# keep 100,000 alloc/free events from before the snapshot
77-
max_entries=100000,
78-
# record stack information for the trace events
79-
# trace_alloc_record_context=True,
80-
stacks="all",
76+
profile_utils.attach_oom_dump_memory_history(
77+
memory_snapshot_dir=args.memory_snapshot_dir,
78+
memory_snapshot_path=args.memory_snapshot_path,
8179
)
8280

83-
def oom_observer(device, alloc, device_alloc, device_free):
84-
# snapshot right after an OOM happened
85-
print("saving allocated state during OOM")
86-
snapshot = torch.cuda.memory._snapshot()
87-
from pickle import dump
88-
89-
dump(
90-
snapshot,
91-
open(f"oom_rank-{torch.distributed.get_rank()}_{args.memory_snapshot_path}", "wb"),
92-
)
93-
94-
torch._C._cuda_attach_out_of_memory_observer(oom_observer)
95-
9681
# Experimental loading arguments from yaml
9782
config: TransformerConfig = core_transformer_config_from_args(args)
9883

slime/utils/arguments.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -863,6 +863,12 @@ def add_debug_arguments(parser):
863863
default=None,
864864
help=("Dump all details of training for post-hoc analysis and visualization."),
865865
)
866+
# use together with --record-memory-history and --memory-snapshot-path (defined in Megatron)
867+
parser.add_argument(
868+
"--memory-snapshot-dir",
869+
type=str,
870+
default=".",
871+
)
866872
return parser
867873

868874
def add_network_arguments(parser):

slime/utils/profile_utils.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
from pathlib import Path
from pickle import dump

import torch
4+
5+
6+
# The memory_snapshot_path is not a full path, but we name like this to be compatible with megatron
7+
def attach_oom_dump_memory_history(memory_snapshot_dir, memory_snapshot_path):
8+
torch.cuda.memory._record_memory_history(
9+
max_entries=100000,
10+
# record stack information for the trace events
11+
# trace_alloc_record_context=True,
12+
stacks="all",
13+
)
14+
15+
def oom_observer(device, alloc, device_alloc, device_free):
16+
path_dump = memory_snapshot_dir / f"oom_rank-{torch.distributed.get_rank()}_{memory_snapshot_path}"
17+
print(f"Observe OOM, will dump snapshot to {path_dump}. ({device=} {alloc=} {device_alloc=} {device_free=})")
18+
19+
# TODO use `_dump_snapshot` instead?
20+
snapshot = torch.cuda.memory._snapshot()
21+
dump(snapshot, open(path_dump, "wb"))
22+
23+
torch._C._cuda_attach_out_of_memory_observer(oom_observer)

0 commit comments

Comments
 (0)