misc

specture724 · specture724 · commit 1bfca4ab30e3 · 2025-12-11T06:25:51.000Z
diff --git a/checkpoint_engine/ps.py b/checkpoint_engine/ps.py
@@ -1097,7 +1097,7 @@ def update(
                     timeout=timeout,
                     is_master=self._rank == 0,
                 )
-            # if both ranks is None or [], it will use fully broadcast to update to all ranks
+            # if ranks is None or [], a full broadcast is used to update all ranks
             ranks_group = dist.new_group(ranks if ranks else None)
             self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)
             self.store_based_barrier(manager_store)
diff --git a/tests/test_update.py b/tests/test_update.py
@@ -82,7 +82,7 @@ def error_run(weights: list[tuple[str, torch.Tensor]]):
         try:
             trigger_error(socket_paths)
         except RuntimeError as e:
-            assert str(e) == "Failed to update weights due to remote errors"
+            assert str(e) == "Some workers failed to update weights"
 
 
 def checker_proc(rank: int, device_uuid: str, named_tensors: dict[str, torch.Tensor], queue: Queue):
@@ -177,7 +177,7 @@ def run(
             ],
         ),
         ("test_with_remote_error", [[]]),
-        # ("long_test_no_error", [list(random.sample(range(get_world_size()), k=num_ranks)) for num_ranks in range(get_world_size() + 1)]),
+        ("test_no_error", [list(random.sample(range(get_world_size()), k=num_ranks)) for num_ranks in range(get_world_size() + 1)]),
     ],
 )
 def test_update(test_name: str, rank_list: list[list[int]] | None):

Original file line number	Diff line number	Diff line change
`@@ -1097,7 +1097,7 @@ def update(`
`1097`	`1097`	`timeout=timeout,`
`1098`	`1098`	`is_master=self._rank == 0,`
`1099`	`1099`	`)`
`1100`		`- # if both ranks is None or [], it will use fully broadcast to update to all ranks`
	`1100`	`+ # if ranks is None or [], a full broadcast is used to update all ranks`
`1101`	`1101`	`ranks_group = dist.new_group(ranks if ranks else None)`
`1102`	`1102`	`self._update_per_bucket(checkpoint_name, req_func, ranks_group, ranks)`
`1103`	`1103`	`self.store_based_barrier(manager_store)`