Add colocated mode to agentic cli.

wang2yn84 · wang2yn84 · commit 180f4601bee8 · 2026-04-29T22:43:58.000-07:00
diff --git a/examples/deepscaler/run_deepscaler_disagg_v5p16.sh b/examples/deepscaler/run_deepscaler_disagg_v5p16.sh
@@ -64,8 +64,6 @@ python -m tunix.cli.grpo_main \
   model_config.remat_config=3 \
   actor_model_config.mesh.shape="$trainer_mesh" \
   actor_model_config.mesh.axis_names="('fsdp','tp')" \
-  reference_model_config.mesh=null \
-  reference_model_config.same_mesh_as="actor" \
   rollout_model_config.mesh.shape="$rollout_mesh" \
   rollout_model_config.mesh.axis_names="('fsdp','tp')" \
   \
diff --git a/examples/deepswe/run_deepswe_disagg_v5p_32.sh b/examples/deepswe/run_deepswe_disagg_v5p_32.sh
@@ -81,8 +81,6 @@ python -m tunix.cli.grpo_main \
   model_config.remat_config=3 \
   actor_model_config.mesh.shape="$trainer_mesh" \
   actor_model_config.mesh.axis_names="('fsdp','tp')" \
-  reference_model_config.mesh=null \
-  reference_model_config.same_mesh_as="actor" \
   rollout_model_config.mesh.shape="$rollout_mesh" \
   rollout_model_config.mesh.axis_names="('fsdp','tp')" \
   \
diff --git a/examples/rl/grpo/gsm8k/run_qwen3_8b.sh b/examples/rl/grpo/gsm8k/run_qwen3_8b.sh
@@ -45,6 +45,10 @@ num_generations="${num_generations:-4}"
 train_mesh="${train_mesh:-(8,1)}"
 rollout_mesh="${rollout_mesh:-(1,8)}"
 
+# Set rollout_colocate to a role name (e.g. "actor") to place the rollout model
+# on that role's devices; rollout_mesh still sets the rollout mesh shape.
+rollout_colocate="${rollout_colocate:-null}"
+
 checkpoint_dir="${checkpoint_dir:-gs://tunix/rl/checkpoints/gsm8k/qwen3/01}"
 checkpoint_suffix="${checkpoint_suffix:-$(printf '%04d' "$((RANDOM % 10000))")}"
 if [[ -n "$checkpoint_dir" && "$checkpoint_dir" != "null" ]]; then
@@ -79,8 +83,7 @@ python -m tunix.cli.grpo_main \
   model_config.remat_config=3 \
   actor_model_config.mesh.shape="$train_mesh" \
   actor_model_config.mesh.axis_names="('fsdp','tp')" \
-  reference_model_config.mesh=null \
-  reference_model_config.same_mesh_as="actor" \
+  rollout_model_config.colocate_with="$rollout_colocate" \
   rollout_model_config.mesh.shape="$rollout_mesh" \
   rollout_model_config.mesh.axis_names="('fsdp','tp')" \
   \
diff --git a/tests/cli/grpo_main_test.py b/tests/cli/grpo_main_test.py
@@ -644,7 +644,6 @@ def test_cli_empty_system_prompt_stays_empty_string(self):
     )
     self.assertEqual(p.config["agentic_grpo_config"]["system_prompt"], "")
 
-
 class SplitMeshConfigTest(absltest.TestCase):
 
   def test_split_mesh_uses_explicit_role_meshes(self):
@@ -688,7 +687,6 @@ def test_split_mesh_uses_explicit_role_meshes(self):
           "shape": "(2,1)",
           "axis_names": "('fsdp','tp')",
       }
-    pipeline.config["reference_model_config"] = {"same_mesh_as": "actor"}
     rollout_model_config = pipeline.config["rollout_model_config"]
     if isinstance(rollout_model_config, omegaconf.dictconfig.DictConfig):
       rollout_model_config["mesh"] = {
@@ -732,6 +730,87 @@ def __init__(self, devices, axis_names, axis_types=None):
         role_to_mesh[rl_cluster_lib.Role.ACTOR],
     )
 
+  def test_colocate_with_reuses_device_slice_with_different_mesh(self):
+    extra = """
+training_mode: "agentic_grpo"
+data_module: "tunix.cli.recipes.deepscaler_data"
+apply_chat_template_to_dataset: false
+data_config:
+  train_data_path: "gs://fake/train.json"
+  eval_data_path: "gs://fake/eval.parquet"
+prompt_key: "prompts"
+reward_functions: []
+verl_compatible: false
+chat_parser_config:
+  type: "default"
+agent_class_path: null
+agent_kwargs: {}
+env_class_path: null
+env_kwargs: {}
+kubernetes_config: null
+agentic_grpo_config:
+  num_generations: 2
+  num_iterations: 1
+  beta: 0.0
+  epsilon: 0.2
+  epsilon_high: 0.28
+  system_prompt: ""
+  max_concurrency: 1
+  off_policy_steps: 0
+  max_turns: 1
+  context_ratio: 1
+sglang_jax_config:
+  mem_fraction_static: 0.8
+vllm_config:
+  hbm_utilization: 0.4
+"""
+    pipeline = _make_pipeline(extra)
+    actor_model_config = pipeline.config["actor_model_config"]
+    if isinstance(actor_model_config, omegaconf.dictconfig.DictConfig):
+      actor_model_config["mesh"] = {
+          "shape": "(2,1)",
+          "axis_names": "('fsdp','tp')",
+      }
+    rollout_model_config = pipeline.config["rollout_model_config"]
+    if isinstance(rollout_model_config, omegaconf.dictconfig.DictConfig):
+      rollout_model_config["colocate_with"] = "actor"
+      rollout_model_config["mesh"] = {
+          "shape": "(1,2)",
+          "axis_names": "('fsdp','tp')",
+      }
+
+    fake_devices = list(range(4))
+
+    class FakeMesh:
+
+      def __init__(self, devices, axis_names, axis_types=None):
+        self.devices = devices
+        self.axis_names = axis_names
+        self.axis_types = axis_types
+
+    with mock.patch.object(grpo_main.jax, "devices", return_value=fake_devices):
+      with mock.patch.object(
+          grpo_main.jax.sharding, "Mesh", side_effect=FakeMesh
+      ):
+        role_to_mesh = pipeline.create_role_to_mesh()
+
+    self.assertSequenceEqual(
+        role_to_mesh[rl_cluster_lib.Role.ACTOR].devices.flatten().tolist(),
+        [0, 1],
+    )
+    self.assertSequenceEqual(
+        role_to_mesh[rl_cluster_lib.Role.ROLLOUT].devices.flatten().tolist(),
+        [0, 1],
+    )
+    self.assertEqual(
+        role_to_mesh[rl_cluster_lib.Role.ACTOR].devices.shape,
+        (2, 1),
+    )
+    self.assertEqual(
+        role_to_mesh[rl_cluster_lib.Role.ROLLOUT].devices.shape,
+        (1, 2),
+    )
+
 
 if __name__ == "__main__":
   absltest.main()
diff --git a/tests/rl/agentic/agentic_grpo_learner_test.py b/tests/rl/agentic/agentic_grpo_learner_test.py
@@ -224,6 +224,7 @@ def __init__(self, algo_config):
         self.algo_config = algo_config
         self.rl_cluster = mock.Mock()
         self.metric_fns = []
+        self._train_micro_batch_size = 1
 
       def _create_micro_batch_iterator(self, iterator, batch_size):
         # The dataset batch size is 2, and we want to test micro-batching
@@ -296,11 +297,86 @@ async def _orchestrator_producer(
       item = train_data_queue.get(block=True)
       if item is None:
         break
-      results.append(item)
+      results.extend(item)
 
     prompt_ids = [r.prompt_ids[0] for r in results]
     self.assertEqual(prompt_ids, [0, 0, 0, 0, 1, 1, 1, 1])
 
+  def test_iterator_colocated_batches_full_rollout_batch(self):
+    class _MockTrainer(agentic_grpo_learner.GRPOLearner):
+      # Stub learner: skips GRPOLearner.__init__ and sets attributes by hand.
+      def __init__(self, algo_config):
+        self.algo_config = algo_config
+        self.rl_cluster = mock.Mock()
+        self.metric_fns = []
+        self.can_enable_async_rollout = False
+        self._share_actor_rollout_devices = True
+        self._full_batch_size = 2
+        self._train_micro_batch_size = 2
+
+      def _create_micro_batch_iterator(self, iterator, batch_size):
+        del batch_size
+        for batch in iterator:
+          for i in range(len(batch["prompts"])):
+            yield jax.tree.map(lambda x, index=i: x[index : index + 1], batch)
+
+      @override
+      def _batch_to_train_example(self, batch_results, mode):
+        del mode
+        examples = []
+        for _ in range(self.algo_config.num_generations):
+          examples.append(
+              types.SimpleNamespace(
+                  prompt_ids=batch_results[1][0]["prompts"],
+              )
+          )
+        return examples
+
+      @override
+      async def _orchestrator_producer(
+          self,
+          orchestrator,
+          prompt_iterator: Iterable[TrainingInputT] | AsyncIterable[TrainingInputT],
+          num_generations: int = 1,
+          collect_mode: str = "Token",
+      ):
+        del orchestrator, num_generations, collect_mode
+        i = 0
+        async for example in prompt_iterator:
+          group = [
+              types.SimpleNamespace(pair_index=i * 2 + j) for j in range(2)
+          ]
+          yield group, [example]
+          i += 1
+
+    algo_config = agentic_grpo_learner.GRPOConfig(
+        num_generations=2,
+        num_iterations=2,
+    )
+    trainer = _MockTrainer(algo_config)
+
+    train_data_queue = queue_lib.SimpleDataQueue(maxsize=0)
+    dataset = _dummy_dataset(MySource(data=[i for i in range(2)]), batch_size=2)
+    prompt_queue = queue.Queue()
+    for item in iter(dataset):
+      prompt_queue.put(item)
+    prompt_queue.put(None)
+
+    asyncio.run(trainer._producer(mock.Mock(), prompt_queue, train_data_queue))
+
+    queue_items = []
+    while True:
+      item = train_data_queue.get(block=True)
+      if item is None:
+        break
+      queue_items.append(item)
+
+    self.assertLen(queue_items, 4)
+    for batch in queue_items:
+      self.assertLen(batch, 2)
+    prompt_ids = [r.prompt_ids[0] for batch in queue_items for r in batch]
+    self.assertEqual(prompt_ids, [0, 0, 0, 0, 1, 1, 1, 1])
+
   def test_grpo_config_validation(self):
     with self.assertRaisesRegex(
         ValueError, "num_generations must be greater than 1"
@@ -636,7 +712,7 @@ def mock_compute_rewards(prompts, completions, **kwargs):
         algo_config=grpo_config,
         chat_parser=MockChatParser(),
     )
-    
+
     with mock.patch.object(learner, "_compute_rewards", side_effect=mock_compute_rewards):
       with mock.patch.object(
           learner.rl_cluster,
@@ -645,7 +721,7 @@ def mock_compute_rewards(prompts, completions, **kwargs):
           autospec=True,
       ):
         learner._process_results(trajectories)
-    
+
     self.assertEqual(extracted_completions, ["msg 0", "msg 1"])
 
   @parameterized.named_parameters(
diff --git a/tunix/cli/grpo_main.py b/tunix/cli/grpo_main.py
@@ -73,8 +73,8 @@ class GrpoPipeline(config.HyperParameters):
     plus ``max_turns``, ``context_ratio``, ``per_turn_timeout_secs``.
   * role-specific ``*_model_config.mesh``: any role with an explicit mesh gets
     its own device slice; omitted meshes share the actor mesh by default.
-  * role-specific ``same_mesh_as``: optional mesh sharing like
-    ``reference_model_config.same_mesh_as: actor``.
+  * role-specific ``colocate_with``: share another role's device set while
+    still allowing a different mesh shape on that same device set.
   * ``sglang_jax_config`` / ``vllm_config``: engine-specific rollout params.
   * ``chat_parser_config.type``: ``"default"`` or ``"qwen"``.
   * ``agent_class_path`` / ``env_class_path``: dotted Python paths to load
@@ -116,21 +116,19 @@ def _resolve_split_role(self, role_name: str) -> rl_cluster_lib.Role:
       )
     return self._SPLIT_ROLE_ALIASES[normalized]
 
-  def _get_same_mesh_as_map(
+  def _get_colocate_with_map(
       self,
   ) -> dict[rl_cluster_lib.Role, rl_cluster_lib.Role]:
-    same_mesh_as = {}
+    colocate_with = {}
     for role, model_key in self._ROLE_TO_MODEL_KEY.items():
       model_cfg = self.config.get(model_key, {}) or {}
-      target_name = model_cfg.get("same_mesh_as")
+      target_name = model_cfg.get("colocate_with")
       if target_name is None:
         continue
-      target_role = self._resolve_split_role(str(target_name))
       if role == rl_cluster_lib.Role.ACTOR:
-        raise ValueError("Actor must own its mesh.")
-      same_mesh_as[role] = target_role
-
-    return same_mesh_as
+        raise ValueError("Actor must own its device set.")
+      colocate_with[role] = self._resolve_split_role(str(target_name))
+    return colocate_with
 
   def _is_role_active(self, role: rl_cluster_lib.Role) -> bool:
     if role in (
@@ -145,10 +143,10 @@ def _is_role_active(self, role: rl_cluster_lib.Role) -> bool:
   def _resolve_mesh_owners(
       self,
   ) -> dict[rl_cluster_lib.Role, rl_cluster_lib.Role]:
-    same_mesh_as = self._get_same_mesh_as_map()
+    colocate_with = self._get_colocate_with_map()
     base_owners = {}
     for role, model_key in self._ROLE_TO_MODEL_KEY.items():
-      if not self._is_role_active(role) and role not in same_mesh_as:
+      if not self._is_role_active(role):
         continue
       has_mesh = bool(self.config.get(model_key, {}).get("mesh"))
       base_owners[role] = (
@@ -162,35 +160,28 @@ def resolve_owner(
         seen: set[rl_cluster_lib.Role],
     ) -> rl_cluster_lib.Role:
       if role in seen:
-        raise ValueError("same_mesh_as contains a cycle.")
-      if role not in same_mesh_as:
+        raise ValueError("colocate_with contains a cycle.")
+      if role not in colocate_with:
         return base_owners[role]
       seen.add(role)
-      target_role = same_mesh_as[role]
+      target_role = colocate_with[role]
       if target_role not in base_owners:
         raise ValueError(
             f"Role {target_role.value!r} is not active in this config."
         )
       return resolve_owner(target_role, seen)
 
     role_to_owner = {}
-    for role, model_key in self._ROLE_TO_MODEL_KEY.items():
-      if role not in base_owners:
-        continue
-      has_mesh = bool(self.config.get(model_key, {}).get("mesh"))
-      if role in same_mesh_as:
-        if has_mesh:
-          raise ValueError(
-              f"{model_key}.mesh is specified, so it must own a separate mesh "
-              "and cannot also use same_mesh_as."
-          )
-      else:
-        role_to_owner[role] = resolve_owner(role, set())
-        continue
+    for role in base_owners:
       role_to_owner[role] = resolve_owner(role, set())
     return role_to_owner
 
-  def _create_role_to_mesh(self):
+  def create_role_to_mesh(self):
+    """Build role→mesh mapping.
+
+    Roles with an explicit ``*.mesh`` get a dedicated device slice unless they
+    set ``colocate_with``; roles without a mesh share the actor mesh by default.
+    """
     devices = list(jax.devices())
     role_to_owner = self._resolve_mesh_owners()
     owner_order = []
@@ -235,16 +226,18 @@ def _create_role_to_mesh(self):
             for owner in owner_order
         },
     )
-    return {role: owner_to_mesh[owner] for role, owner in role_to_owner.items()}
-
-  def create_role_to_mesh(self):
-    """Build role→mesh mapping.
+    role_to_mesh = {}
+    for role, owner in role_to_owner.items():
+      model_key = self._ROLE_TO_MODEL_KEY[role]
+      has_mesh = bool(self.config.get(model_key, {}).get("mesh"))
+      if role == owner or not has_mesh:
+        role_to_mesh[role] = owner_to_mesh[owner]
+      else:
+        role_to_mesh[role] = self.create_mesh(
+            model_key, devices=owner_to_device_slice[owner]
+        )
+    return role_to_mesh
 
-    Any role with an explicit ``*.mesh`` config gets a dedicated device slice.
-    Roles without a mesh share the actor mesh by default, or can point at
-    another role via ``same_mesh_as``.
-    """
-    return self._create_role_to_mesh()
 
   # ------------------------------------------------------------------
   # Rollout config
diff --git a/tunix/rl/agentic/agentic_rl_learner.py b/tunix/rl/agentic/agentic_rl_learner.py
diff --git a/tunix/rl/rl_learner.py b/tunix/rl/rl_learner.py

Original file line number	Diff line number	Diff line change
`@@ -64,8 +64,6 @@ python -m tunix.cli.grpo_main \`
`64`	`64`	`model_config.remat_config=3 \`
`65`	`65`	`actor_model_config.mesh.shape="$trainer_mesh" \`
`66`	`66`	`actor_model_config.mesh.axis_names="('fsdp','tp')" \`
`67`		`- reference_model_config.mesh=null \`
`68`		`- reference_model_config.same_mesh_as="actor" \`
`69`	`67`	`rollout_model_config.mesh.shape="$rollout_mesh" \`
`70`	`68`	`rollout_model_config.mesh.axis_names="('fsdp','tp')" \`
`71`	`69`	`\`
Original file line number	Diff line number	Diff line change
`@@ -81,8 +81,6 @@ python -m tunix.cli.grpo_main \`
`81`	`81`	`model_config.remat_config=3 \`
`82`	`82`	`actor_model_config.mesh.shape="$trainer_mesh" \`
`83`	`83`	`actor_model_config.mesh.axis_names="('fsdp','tp')" \`
`84`		`- reference_model_config.mesh=null \`
`85`		`- reference_model_config.same_mesh_as="actor" \`
`86`	`84`	`rollout_model_config.mesh.shape="$rollout_mesh" \`
`87`	`85`	`rollout_model_config.mesh.axis_names="('fsdp','tp')" \`
`88`	`86`	`\`