volcengine · Kite0011 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
@@ -97,6 +97,9 @@ actor_rollout_ref:
       - extra
       load_contents: ${.save_contents}
       async_save: false
+      mbridge_config:
+        memory_efficient: true
+        distributed_filesystem: true
     use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
     profiler:
       _target_: verl.utils.profiler.ProfilerConfig
@@ -491,6 +494,9 @@ critic:
     - extra
     load_contents: ${.save_contents}
     async_save: false
+    mbridge_config:
+      memory_efficient: true
+      distributed_filesystem: true
   profiler:
     _target_: verl.utils.profiler.ProfilerConfig
     tool: ${oc.select:global_profiler.tool,null}

@@ -84,6 +84,9 @@ actor_rollout_ref:
       - extra
       load_contents: ${.save_contents}
       async_save: false
+      mbridge_config:
+        memory_efficient: true
+        distributed_filesystem: true
     use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false}
     profiler:
       _target_: verl.utils.profiler.ProfilerConfig
@@ -425,6 +428,9 @@ critic:
     - extra
     load_contents: ${.save_contents}
     async_save: false
+    mbridge_config:
+      memory_efficient: true
+      distributed_filesystem: true
   profiler:
     _target_: verl.utils.profiler.ProfilerConfig
     tool: ${oc.select:global_profiler.tool,null}

@@ -122,6 +122,11 @@ checkpoint:
 
   # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
   async_save: False
+
+  # Keyword arguments forwarded to the bridge's `save_weights` call when saving HF-format weights.
+  mbridge_config:
+    memory_efficient: true
+    distributed_filesystem: true
 
 # optimizer configs
 optim:

@@ -36,6 +36,9 @@ class CheckpointConfig(BaseConfig):
     save_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"])
     load_contents: list[str] = field(default_factory=lambda: ["model", "optimizer", "extra"])
     async_save: bool = False
+    mbridge_config: dict[str, Any] = field(
+        default_factory=lambda: {"memory_efficient": True, "distributed_filesystem": True}
+    )
 
 
 @dataclass

@@ -98,6 +98,11 @@ checkpoint:
   # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
   async_save: False
 
+  # Keyword arguments forwarded to the bridge's `save_weights` call when saving HF-format weights.
+  mbridge_config:
+    memory_efficient: true
+    distributed_filesystem: true
+
 # profile the critic model in `update_critic`
 profiler:
 

@@ -52,6 +52,10 @@ checkpoint:
 
   # For more flexibility, you can specify the contents to load from the checkpoint.
   load_contents: ${checkpoint.save_contents}
+  # Keyword arguments forwarded to the bridge's `save_weights` call when saving HF-format weights.
+  mbridge_config:
+    memory_efficient: true
+    distributed_filesystem: true
 
 trainer:
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

diff --git a/verl/utils/checkpoint/megatron_checkpoint_manager.py b/verl/utils/checkpoint/megatron_checkpoint_manager.py
@@ -500,7 +500,7 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
                 hf_ckpt_path = get_hf_model_checkpoint_path(local_path)
                 if self.vanilla_bridge:
                     self.bridge.save_weights(
-                        self.model, hf_ckpt_path, distributed_filesystem=True, memory_efficient=True
+                        self.model, hf_ckpt_path, **(getattr(self.checkpoint_config, "mbridge_config", None) or {})
                     )
                 else:
                     self.bridge.save_hf_weights(self.model, hf_ckpt_path)
@@ -572,7 +572,9 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
                 hf_model_ckpt_path = get_hf_model_checkpoint_path(local_path)
                 if self.vanilla_bridge:
                     self.bridge.save_weights(
-                        self.model, hf_model_ckpt_path, distributed_filesystem=True, memory_efficient=True
+                        self.model,
+                        hf_model_ckpt_path,
+                        **(getattr(self.checkpoint_config, "mbridge_config", None) or {}),
                     )
                 else:
                     self.bridge.save_hf_weights(self.model, hf_model_ckpt_path)