Skip to content

Commit a6fd021

Browse files
committed
Bugfix: skip IPC gather on placeholder ranks with no gather group, and restore the missing line continuation in run-kimi-k2-Thinking.sh
1 parent 52d02f1 commit a6fd021

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

scripts/run-kimi-k2-Thinking.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ ray job submit --address="http://127.0.0.1:8265" \
170170
--actor-num-nodes 32 \
171171
--actor-num-gpus-per-node 8 \
172172
--colocate \
173-
--update-weight-buffer-size $(( 4 * 512 * 1024 * 1024))
173+
--update-weight-buffer-size $(( 4 * 512 * 1024 * 1024)) \
174174
${MODEL_ARGS[@]} \
175175
${CKPT_ARGS[@]} \
176176
${ROLLOUT_ARGS[@]} \

slime/backends/fsdp_utils/update_weight_utils.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,11 @@ def connect_rollout_engines(
130130
self.tp_rank = dist.get_rank() - start_rank
131131

132132
def update_bucket_weights(self, named_tensors, weight_version=None) -> None:
133+
# Placeholder ranks (GPU slots reserved but no engine) have no gather group.
134+
# gather_object is only collective among group members, so we skip entirely.
135+
if self._ipc_gather_group is None:
136+
return
137+
133138
monkey_patch_torch_reductions()
134139
# Use flattened bucket approach similar to Megatron
135140
logger.info("Using flattened tensor bucket")

slime/backends/megatron_utils/update_weight/update_weight_from_tensor.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ def _send_to_colocated_engine(
207207
ipc_gather_group,
208208
weight_version,
209209
) -> tuple[list[ObjectRef], Any]:
210+
# Placeholder ranks (GPU slots reserved but no engine) have no gather group.
211+
# gather_object is only collective among group members, so we skip entirely.
212+
if ipc_gather_group is None:
213+
return [], None
214+
210215
# TODO improve
211216
long_live_tensors = []
212217

0 commit comments

Comments
 (0)