@@ -82,7 +82,7 @@ def error_run(weights: list[tuple[str, torch.Tensor]]):
8282 try :
8383 trigger_error (socket_paths )
8484 except RuntimeError as e :
85- assert str (e ) == "Failed to update weights due to remote errors "
85+ assert str (e ) == "Some workers failed to update weights "
8686
8787
8888def checker_proc (rank : int , device_uuid : str , named_tensors : dict [str , torch .Tensor ], queue : Queue ):
@@ -96,7 +96,7 @@ def check(names_to_check: dict[str, bool], weights: list[tuple[str, torch.Tensor
9696 for name , weight in weights :
9797 if name not in named_tensors :
9898 continue
99- assert (weight == named_tensors [name ]).all ()
99+ assert (weight == named_tensors [name ]).all (), f"Tensor { name } does not match!"
100100 names_to_check [name ] = True
101101
102102 def check_weights (names_to_check : dict [str , bool ], socket_paths : list [tuple [str , str ]]):
@@ -163,6 +163,61 @@ def run(
163163 assert proc .exitcode == 0
164164
165165
def run_with_files(
    checker_func: callable,
):
    """Exercise ParameterServer.update with a checkpoint sourced from three tiers.

    One third of the generated test tensors is saved to /dev/shm (tmpfs) as a
    .safetensors file, one third to /tmp (disk) as a .safetensors file, and the
    remaining third is registered directly from memory. A checker subprocess
    (``checker_func``) receives the full expected tensor dict and validates the
    broadcast weights it pulls from ``queue``.

    Args:
        checker_func: target for the spawned checker process; called as
            ``checker_func(rank, device_uuid, named_tensors, queue)``.
    """
    rank = int(os.getenv("RANK"))
    ctx = get_context("spawn")
    queue = ctx.Queue()
    ps = ParameterServer(auto_pg=True)
    _device_uuid = _get_physical_gpu_id(ps.device_manager, rank)
    named_tensors = dict(gen_test_tensors(rank))

    # Split the tensors into three non-overlapping groups:
    #   1/3 saved to /dev/shm/ as a .safetensors file
    #   1/3 saved to /tmp (disk) as a .safetensors file
    #   1/3 kept in memory and passed via named_tensors
    # (The previous slices overlapped, so some tensors were registered
    # from two sources at once.)
    import safetensors.torch

    files = []
    dev_shm_dir = "/dev/shm/checkpoint_engine_tests"  # noqa: S108
    disk_dir = "/tmp/checkpoint_engine_tests"  # noqa: S108
    os.makedirs(dev_shm_dir, exist_ok=True)
    os.makedirs(disk_dir, exist_ok=True)
    tensors_items = list(named_tensors.items())
    third = len(tensors_items) // 3
    tensors_in_dev_shm = dict(tensors_items[:third])
    tensors_in_disk = dict(tensors_items[third : 2 * third])
    tensors_in_memory = dict(tensors_items[2 * third :])

    disk_files = [
        os.path.join(disk_dir, f"rank{_rank}_checkpoint.safetensors")
        for _rank in range(get_world_size())
    ]
    safetensors.torch.save_file(tensors_in_disk, disk_files[rank])
    time.sleep(1)
    files.append(disk_files[rank])

    # NOTE(review): the original built this list with the *current* rank in
    # every entry; use the loop variable so each entry is distinct, matching
    # disk_files above. Only index [rank] is consumed, so behavior is the same.
    dev_shm_files = [
        os.path.join(dev_shm_dir, f"rank{_rank}_checkpoint.safetensors")
        for _rank in range(get_world_size())
    ]
    safetensors.torch.save_file(tensors_in_dev_shm, dev_shm_files[rank])
    time.sleep(1)
    files.append(dev_shm_files[rank])

    checkpoint_name = "test_with_files"
    proc = ctx.Process(target=checker_func, args=(rank, _device_uuid, named_tensors, queue))
    proc.start()
    ps.register_checkpoint(checkpoint_name, named_tensors=tensors_in_memory, files=files)
    ps.gather_metas(checkpoint_name)
    ps.update(checkpoint_name, queue.put, ranks=[])
    # sleep 3s to wait until the process group is destroyed
    time.sleep(3)
    ps.unregister_checkpoint(checkpoint_name)
    queue.put(None)
    proc.join()
    assert proc.exitcode == 0
220+
166221@pytest .mark .gpu
167222@pytest .mark .parametrize (
168223 "test_name,rank_list" ,
@@ -211,6 +266,37 @@ def test_update(test_name: str, rank_list: list[list[int]] | None):
211266 assert result .returncode == 0
212267
213268
@pytest.mark.gpu
def test_update_with_files(test_name: str = "test_with_files"):
    """Launch the file-backed update test under torchrun across all local GPUs.

    Spawns one worker per visible device and asserts the torchrun job exits
    cleanly. Requires at least 2 GPUs.
    """
    world_size = device_manager.device_module.device_count()
    assert world_size >= 2, "This test requires at least 2 GPUs."

    master_addr = "localhost"
    master_port = 25400
    launch_cmd = [
        "torchrun",
        "--nproc_per_node",
        str(world_size),
        "--master_addr",
        master_addr,
        "--master_port",
        str(master_port),
        __file__,
        test_name,
        "[]",
    ]

    # Run from the repository root (two levels up from this test file).
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    completed = subprocess.run(  # noqa: S603
        launch_cmd,
        capture_output=False,
        text=True,
        cwd=repo_root,
        shell=False,
        check=False,
    )

    assert completed.returncode == 0
299+
214300if __name__ == "__main__" :
215301 run_with_pytest = "PYTEST_CURRENT_TEST" in os .environ
216302 if not run_with_pytest :
@@ -230,5 +316,7 @@ def test_update(test_name: str, rank_list: list[list[int]] | None):
230316 expected_exception = RuntimeError ,
231317 exception_msg = "Failed to update weights due to remote errors" ,
232318 )
319+ elif test_type == "test_with_files" :
320+ run_with_files (checker_proc )
233321 else :
234322 raise ValueError (f"Unknown TEST_TYPE: { test_type } " )
0 commit comments