Skip to content

Commit 52d02f1

Browse files
committed
fix update weights with placeholder
1 parent ead3923 commit 52d02f1

File tree

8 files changed

+60
-26
lines changed

8 files changed

+60
-26
lines changed

.github/workflows/pr-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ jobs:
4848
strategy:
4949
fail-fast: false
5050
matrix:
51-
info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}]
51+
info: [{"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_async_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_gsm8k_short.py"}, {"num_gpus": 4, "test_file": "test_qwen2.5_0.5B_sglang_config.py"}]
5252
defaults:
5353
run:
5454
working-directory: ${{ github.workspace }}

.github/workflows/pr-test.yml.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
'tests': [
55
{'test_file': 'test_qwen2.5_0.5B_gsm8k_async_short.py', 'num_gpus': 4},
66
{'test_file': 'test_qwen2.5_0.5B_gsm8k_short.py', 'num_gpus': 4},
7+
{'test_file': 'test_qwen2.5_0.5B_sglang_config.py', 'num_gpus': 4},
78
],
89
},
910
'e2e-test-fsdp': {

slime/backends/fsdp_utils/actor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -731,14 +731,15 @@ def update_weights(self) -> None: # type: ignore[override]
731731
if self.args.debug_train_only or self.args.debug_rollout_only:
732732
return
733733

734-
rollout_engines, rollout_engine_lock, num_new_engines, engine_gpu_counts = ray.get(
734+
rollout_engines, rollout_engine_lock, num_new_engines, engine_gpu_counts, engine_gpu_offsets = ray.get(
735735
self.rollout_manager.get_rollout_engines_and_lock.remote()
736736
)
737737
if num_new_engines > 0:
738738
self.weight_updater.connect_rollout_engines(
739739
rollout_engines,
740740
rollout_engine_lock,
741741
engine_gpu_counts=engine_gpu_counts,
742+
engine_gpu_offsets=engine_gpu_offsets,
742743
)
743744
dist.barrier(group=get_gloo_group())
744745
if dist.get_rank() == 0:

slime/backends/fsdp_utils/update_weight_utils.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ def connect_rollout_engines(
4141
rollout_engines: Sequence[ActorHandle],
4242
rollout_engine_lock: ActorHandle | None,
4343
engine_gpu_counts: Sequence[int] | None = None,
44+
engine_gpu_offsets: Sequence[int] | None = None,
4445
) -> None:
4546
pass
4647

@@ -94,6 +95,7 @@ def connect_rollout_engines(
9495
rollout_engines: Sequence[ActorHandle],
9596
rollout_engine_lock: ActorHandle | None,
9697
engine_gpu_counts: Sequence[int] | None = None,
98+
engine_gpu_offsets: Sequence[int] | None = None,
9799
) -> None:
98100
"""Attach rollout engines and create per-engine IPC (Gloo) groups.
99101
@@ -104,15 +106,17 @@ def connect_rollout_engines(
104106

105107
if engine_gpu_counts is None:
106108
engine_gpu_counts = [self.args.rollout_num_gpus_per_engine] * len(rollout_engines)
107-
108-
# Cumulative rank offsets for (potentially) non-uniform engine groups.
109-
cumulative = [0]
110-
for c in engine_gpu_counts:
111-
cumulative.append(cumulative[-1] + c)
109+
if engine_gpu_offsets is None:
110+
# Fallback: assume engines are densely packed (no placeholder gaps).
111+
engine_gpu_offsets = []
112+
offset = 0
113+
for c in engine_gpu_counts:
114+
engine_gpu_offsets.append(offset)
115+
offset += c
112116

113117
for i, engine in enumerate(self.rollout_engines):
114-
start_rank = cumulative[i]
115-
end_rank = cumulative[i + 1]
118+
start_rank = engine_gpu_offsets[i]
119+
end_rank = start_rank + engine_gpu_counts[i]
116120
group_ranks = list(range(start_rank, end_rank))
117121
new_group = dist.new_group(
118122
ranks=group_ranks,
@@ -191,6 +195,7 @@ def connect_rollout_engines(
191195
rollout_engines: Sequence[ActorHandle],
192196
rollout_engine_lock: ActorHandle | None,
193197
engine_gpu_counts: Sequence[int] | None = None,
198+
engine_gpu_offsets: Sequence[int] | None = None,
194199
) -> None:
195200
"""On rank 0, initialize a temporary NCCL group for parameter broadcast."""
196201
self.rollout_engines = rollout_engines

slime/backends/megatron_utils/actor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def update_weights(self) -> None:
539539
ray.get(self.rollout_manager.recover_rollout_engines.remote())
540540
dist.barrier(group=get_gloo_group())
541541

542-
rollout_engines, rollout_engine_lock, num_new_engines, engine_gpu_counts = ray.get(
542+
rollout_engines, rollout_engine_lock, num_new_engines, engine_gpu_counts, engine_gpu_offsets = ray.get(
543543
self.rollout_manager.get_rollout_engines_and_lock.remote()
544544
)
545545

@@ -551,6 +551,7 @@ def update_weights(self) -> None:
551551
rollout_engines,
552552
rollout_engine_lock,
553553
engine_gpu_counts=engine_gpu_counts,
554+
engine_gpu_offsets=engine_gpu_offsets,
554555
)
555556
dist.barrier(group=get_gloo_group())
556557
if dist.get_rank() == 0:

slime/backends/megatron_utils/update_weight/update_weight_from_distributed.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ def connect_rollout_engines(
4747
rollout_engines: Sequence[ActorHandle],
4848
rollout_engine_lock: ActorHandle,
4949
engine_gpu_counts: Sequence[int] | None = None,
50+
engine_gpu_offsets: Sequence[int] | None = None,
5051
) -> None:
5152
"""
5253
Create NCCL "slime-pp_{pp_rank}" if PP source (DP=TP=0). Lock prevents concurrent broadcasts.

slime/backends/megatron_utils/update_weight/update_weight_from_tensor.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def connect_rollout_engines(
6363
rollout_engines: Sequence[ActorHandle],
6464
rollout_engine_lock: ActorHandle,
6565
engine_gpu_counts: Sequence[int] | None = None,
66+
engine_gpu_offsets: Sequence[int] | None = None,
6667
) -> None:
6768
"""
6869
Split colocated/distributed engines. Global source rank (DP=TP=PP=0) creates NCCL
@@ -72,15 +73,20 @@ def connect_rollout_engines(
7273

7374
if engine_gpu_counts is None:
7475
engine_gpu_counts = [self.args.rollout_num_gpus_per_engine] * len(rollout_engines)
75-
76-
# Compute colocated engine count from cumulative GPU budget.
76+
if engine_gpu_offsets is None:
77+
# Fallback: assume engines are densely packed (no placeholder gaps).
78+
engine_gpu_offsets = []
79+
offset = 0
80+
for c in engine_gpu_counts:
81+
engine_gpu_offsets.append(offset)
82+
offset += c
83+
84+
# Compute colocated engine count: engines whose GPUs fall within actor GPU range.
7785
total_actor_gpus = self.args.actor_num_nodes * self.args.actor_num_gpus_per_node
7886
colocate_engine_nums = 0
79-
gpu_sum = 0
80-
for c in engine_gpu_counts:
81-
if gpu_sum + c > total_actor_gpus:
87+
for gpu_offset, gpu_count in zip(engine_gpu_offsets, engine_gpu_counts, strict=True):
88+
if gpu_offset + gpu_count > total_actor_gpus:
8289
break
83-
gpu_sum += c
8490
colocate_engine_nums += 1
8591

8692
self.use_distribute = len(rollout_engines) > colocate_engine_nums
@@ -108,25 +114,24 @@ def connect_rollout_engines(
108114
engine_gpu_counts=distributed_gpu_counts,
109115
)
110116

111-
# Cumulative rank offsets for (potentially) non-uniform colocated groups.
117+
colocate_gpu_offsets = engine_gpu_offsets[:colocate_engine_nums]
112118
colocate_gpu_counts = engine_gpu_counts[:colocate_engine_nums]
113-
cumulative = [0]
114-
for c in colocate_gpu_counts:
115-
cumulative.append(cumulative[-1] + c)
116119

117120
# Create IPC Gloo gather groups (only on first call; partitioning is
118121
# fixed across reconnects).
119122
if self._ipc_gather_group is None:
120123
for i in range(colocate_engine_nums):
121-
group_ranks = list(range(cumulative[i], cumulative[i + 1]))
124+
group_ranks = list(range(colocate_gpu_offsets[i], colocate_gpu_offsets[i] + colocate_gpu_counts[i]))
122125
new_group = dist.new_group(ranks=group_ranks, backend="gloo")
123126
if dist.get_rank() in group_ranks:
124127
self._ipc_gather_group = new_group
125-
self._ipc_gather_src = cumulative[i]
128+
self._ipc_gather_src = colocate_gpu_offsets[i]
126129

127130
# Map training ranks to colocated engine actors.
128131
for i, engine in enumerate(self.rollout_engines):
129-
if cumulative[i] <= dist.get_rank() < cumulative[i + 1]:
132+
start = colocate_gpu_offsets[i]
133+
end = start + colocate_gpu_counts[i]
134+
if start <= dist.get_rank() < end:
130135
self._ipc_engine = engine
131136

132137
@torch.no_grad()

slime/ray/rollout.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,18 @@ def engine_gpu_counts(self) -> list[int]:
362362
"""Per-engine GPU count for all node-0 engines, parallel to ``engines``."""
363363
return [g.num_gpus_per_engine for g in self.engine_groups for _ in g.engines]
364364

365+
@property
366+
def engine_gpu_offsets(self) -> list[int]:
367+
"""Per-engine GPU offset for all node-0 engines, parallel to ``engines``.
368+
369+
Accounts for placeholder groups that occupy GPU slots without creating engines.
370+
"""
371+
offsets = []
372+
for g in self.engine_groups:
373+
for j in range(len(g.engines)):
374+
offsets.append(g.gpu_offset + j * g.num_gpus_per_engine)
375+
return offsets
376+
365377
@property
366378
def nodes_per_engine(self):
367379
"""Nodes per engine. Only valid when all active groups share the same value."""
@@ -505,8 +517,9 @@ def get_rollout_engines_and_lock(self, model_name: str | None = None):
505517
srv = self._get_server(model_name)
506518
engines = srv.engines if srv else []
507519
gpu_counts = srv.engine_gpu_counts if srv else []
520+
gpu_offsets = srv.engine_gpu_offsets if srv else []
508521
num_new = srv.num_new_engines if srv else 0
509-
return engines, self.rollout_engine_lock, num_new, gpu_counts
522+
return engines, self.rollout_engine_lock, num_new, gpu_counts, gpu_offsets
510523

511524
def get_num_rollout_per_epoch(self):
512525
assert self.args.rollout_global_dataset
@@ -566,10 +579,17 @@ def recover_rollout_engines(self, model_name: str | None = None):
566579
if self.rollout_id == -1 or srv is None:
567580
engines = srv.engines if srv else []
568581
gpu_counts = srv.engine_gpu_counts if srv else []
569-
return engines, self.rollout_engine_lock, (srv.num_new_engines if srv else 0), gpu_counts
582+
gpu_offsets = srv.engine_gpu_offsets if srv else []
583+
return engines, self.rollout_engine_lock, (srv.num_new_engines if srv else 0), gpu_counts, gpu_offsets
570584

571585
srv.recover()
572-
return srv.engines, self.rollout_engine_lock, srv.num_new_engines, srv.engine_gpu_counts
586+
return (
587+
srv.engines,
588+
self.rollout_engine_lock,
589+
srv.num_new_engines,
590+
srv.engine_gpu_counts,
591+
srv.engine_gpu_offsets,
592+
)
573593

574594
def clear_num_new_engines(self, model_name: str | None = None):
575595
# when fault tolerance is not enabled, we need to manually clear num_new_engines after update_weights

0 commit comments

Comments (0)