Skip to content

Commit a6fd021

Browse files
committed
Bugfix: skip IPC gather on placeholder ranks with no gather group, and restore the missing line continuation in run-kimi-k2-Thinking.sh
1 parent 52d02f1 commit a6fd021

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

scripts/run-kimi-k2-Thinking.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ ray job submit --address="http://127.0.0.1:8265" \
170170
--actor-num-nodes 32 \
171171
--actor-num-gpus-per-node 8 \
172172
--colocate \
173-
--update-weight-buffer-size $(( 4 * 512 * 1024 * 1024))
173+
--update-weight-buffer-size $(( 4 * 512 * 1024 * 1024)) \
174174
${MODEL_ARGS[@]} \
175175
${CKPT_ARGS[@]} \
176176
${ROLLOUT_ARGS[@]} \

slime/backends/fsdp_utils/update_weight_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ def connect_rollout_engines(
130130
self.tp_rank = dist.get_rank() - start_rank
131131

132132
def update_bucket_weights(self, named_tensors, weight_version=None) -> None:
133+
# Placeholder ranks (GPU slots reserved but no engine) have no gather group.
134+
# gather_object is only collective among group members, so we skip entirely.
135+
if self._ipc_gather_group is None:
136+
return
137+
133138
monkey_patch_torch_reductions()
134139
# Use flattened bucket approach similar to Megatron
135140
logger.info("Using flattened tensor bucket")

slime/backends/megatron_utils/update_weight/update_weight_from_tensor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ def _send_to_colocated_engine(
207207
ipc_gather_group,
208208
weight_version,
209209
) -> tuple[list[ObjectRef], Any]:
210+
# Placeholder ranks (GPU slots reserved but no engine) have no gather group.
211+
# gather_object is only collective among group members, so we skip entirely.
212+
if ipc_gather_group is None:
213+
return [], None
214+
210215
# TODO improve
211216
long_live_tensors = []
212217

0 commit comments

Comments
 (0)