@@ -183,7 +183,7 @@ class BroadcastOperation:
183183
184184 Args:
185185 rank (int): The rank of the current process.
186- group_name (dist.ProcessGroup): The NCCL process group.
186+ ranks_group (int): The value identifying the process group.
187187 bucket (torch.Tensor): The tensor to broadcast.
188188 metadata (list[ParameterMeta]): The metadata of the tensor.
189189 """
@@ -224,7 +224,6 @@ class KIMICheckpointEngine(CheckpointEngine):
224224 Args:
225225 bucket_size (int): Bucket size in bytes to transfer multiple weights at one time. Note that we use
226226 two buffer to send and recv weights at same time, so the device memory overhead is 2 * bucket_size.
227- group_name (str): The name of the NCCL process group. Defaults to "default".
228227 rebuild_group (bool): Whether to rebuild the NCCL process group in each update. Defaults to False.
229228 is_master (bool): Whether the current process is the master process. Defaults to False.
230229 rollout_dtype (torch.dtype): The dtype of the weights received from rollout workers. Defaults to torch.bfloat16.
@@ -273,7 +272,8 @@ def init_process_group(self, rank: int, world_size: int, master_metadata: Master
273272 world_size (int): The total number of processes.
274273 """
275274 self.rank = rank
276- # unregister_memory in transfer engine is not supported on NPU, so we have to initialize ParameterServer each time
275+ # unregister_memory in transfer engine is not supported on NPU,
276+ # so we have to initialize ParameterServer each time
277277 if get_device_name() == "npu" or not self.initialized:
278278 self.parameter_server = ParameterServer(rank=rank, world_size=world_size, auto_pg=False, custom_dist=True)
279279 self.parameter_server.receive_tensor = types.MethodType(receive_tensor, self.parameter_server)
0 commit comments