Commit f6dbb49

fix distributed integration tests
1 parent 31ec560 commit f6dbb49

File tree

2 files changed: +95 -35 lines changed


tests/tests_fabric/plugins/io/test_distributed_async_io.py

Lines changed: 14 additions & 0 deletions
@@ -140,6 +140,16 @@ def test_async_checkpointio_storage_options_not_supported(tmp_path):
 
 
 # --- integration test to verify the checkpoint is actually saved and loaded asynchronously ---
+
+
+def _broadcast_from_rank0(fabric: Fabric, obj):
+    """Broadcast an object from rank0 once Fabric has launched."""
+    fabric.barrier("pre_broadcast")
+    obj = fabric.broadcast(obj)
+    fabric.barrier("post_broadcast")
+    return obj
+
+
 class SimpleModel(nn.Module):
     def __init__(self):
         super().__init__()
@@ -204,7 +214,11 @@ def run_async_checkpoint_state_restoration(tmp_path, expected_strategy_name, acc
     # snapshot weights BEFORE save
     before = {k: v.detach().clone() for k, v in model.state_dict().items()}
     state = AttributeDict(model=model, optimizer=optimizer, step=1)
+
+    # rank0 decides canonical checkpoint path
     ckpt_path = tmp_path / "ckpt"
+    ckpt_path = _broadcast_from_rank0(fabric, ckpt_path)
+
     fabric.save(ckpt_path, state)
 
     # Wait for DistributedAsyncCheckpointIO to finish writing checkpoint metadata.
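Aside: the `_broadcast_from_rank0` helper added above only works once `fabric.launch()` has spawned the processes and collectives are available. A minimal standalone sketch of the same barrier/broadcast pattern (illustrative only, not part of this commit; the script, the `/tmp` paths, and the two-process CPU `ddp_spawn` setup are assumptions):

# sketch_broadcast_path.py -- illustrative, not part of this commit
from pathlib import Path

from lightning.fabric import Fabric


def run(fabric: Fabric) -> None:
    # Each rank starts with its own candidate path (like per-worker tmp_path under pytest).
    local_path = Path("/tmp") / f"ckpt_rank_{fabric.global_rank}"

    # Barrier, broadcast from rank 0 (the default src), barrier again.
    fabric.barrier()
    shared_path = fabric.broadcast(local_path)
    fabric.barrier()

    # Every rank now holds rank 0's path.
    print(f"rank {fabric.global_rank}: {shared_path}")


if __name__ == "__main__":
    Fabric(accelerator="cpu", devices=2, strategy="ddp_spawn").launch(run)

Run as a plain Python script; both ranks should print the rank-0 path, which is the agreement the test relies on before saving the checkpoint.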
Lines changed: 81 additions & 35 deletions
@@ -1,16 +1,6 @@
 # Copyright The Lightning AI team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Licensed under the Apache License, Version 2.0
+
 import os
 import time
 from pathlib import Path
@@ -23,62 +13,96 @@
 from lightning.pytorch.demos import BoringModel
 from tests_pytorch.helpers.runif import RunIf
 
-# --- integration test to verify the checkpoint is actually saved and loaded asynchronously ---
+# -------------------------------------------------------------------------
+# Helpers
+# -------------------------------------------------------------------------
+
 
+def _sync_across_ranks(trainer, obj):
+    """Broadcast an object from rank 0 once the strategy/process group exists."""
+    trainer.strategy.barrier()
+    obj = trainer.strategy.broadcast(obj, src=0)
+    trainer.strategy.barrier()
+    return obj
 
-def _wait_for_dcp_metadata(path: Path, timeout=10):
-    # writing files in CI can be slow,
-    # and DCP writes a metadata file last,
-    # so we can wait for that to appear to ensure the checkpoint is ready
+
+def _wait_for_dcp_metadata(path: Path, timeout: int = 10):
+    """DCP writes metadata last; wait until it appears."""
     start = time.time()
     while True:
-        # DCP metadata file pattern
         if any(p.name.startswith(".metadata") for p in path.iterdir()):
             return
         if time.time() - start > timeout:
             raise RuntimeError("Checkpoint metadata not visible yet")
         time.sleep(0.1)
 
 
+def _find_checkpoint(tmp_path: Path):
+    """Poll until a checkpoint file exists (async IO may delay visibility)."""
+    ckpt_dir = tmp_path / "lightning_logs" / "version_0" / "checkpoints"
+
+    for _ in range(100):
+        files = list(ckpt_dir.glob("*.ckpt"))
+        if files:
+            return max(files, key=os.path.getctime)
+        time.sleep(0.1)
+
+    raise RuntimeError(f"Checkpoint file not found in {ckpt_dir}")
+
+
+# -------------------------------------------------------------------------
+# Core logic
+# -------------------------------------------------------------------------
+
+
 def save_model_checkpoint(tmp_path, expected_strategy_name, accelerator, devices):
     model = BoringModel()
+
     trainer = Trainer(
         default_root_dir=tmp_path,
         max_epochs=10,
         devices=devices,
-        plugins=[DistributedAsyncCheckpointIO()],
         accelerator=accelerator,
+        plugins=[DistributedAsyncCheckpointIO()],
     )
+
     assert trainer.strategy.__class__.__name__ == expected_strategy_name, (
         f"Expected strategy {expected_strategy_name}, but got {trainer.strategy.__class__.__name__}"
     )
+
     trainer.fit(model)
 
+    # Important:
+    # pytest standalone gives each worker a different tmp_path.
+    # After DDP init (fit), broadcast rank0's path so all ranks agree.
+    tmp_path = _sync_across_ranks(trainer, tmp_path)
 
-def get_checkpoint_path(tmp_path):
-    tmp_path = Path(tmp_path)
-    ckpt_path = tmp_path / "lightning_logs" / "version_0" / "checkpoints"
-    ckpt_files = list(ckpt_path.glob("*.ckpt"))
-    assert len(ckpt_files) > 0, "No checkpoint files found"
-    return max(ckpt_files, key=os.path.getctime)
+    return tmp_path  # noqa: RET504
 
 
 def load_model_checkpoint(tmp_path, expected_strategy_name, accelerator, devices):
-    last_ckpt = get_checkpoint_path(tmp_path)
-
     model = BoringModel()
+
     trainer = Trainer(
         default_root_dir=tmp_path,
         max_epochs=20,
         devices=devices,
-        plugins=[DistributedAsyncCheckpointIO()],
         accelerator=accelerator,
+        plugins=[DistributedAsyncCheckpointIO()],
     )
+
     assert trainer.strategy.__class__.__name__ == expected_strategy_name, (
         f"Expected strategy {expected_strategy_name}, but got {trainer.strategy.__class__.__name__}"
     )
 
-    trainer.fit(model, ckpt_path=last_ckpt)  # if loading works, it will restore to epoch 10 and continue to 20
+    last_ckpt = _find_checkpoint(Path(tmp_path))
+
+    trainer.fit(model, ckpt_path=last_ckpt)
+
+
+# -------------------------------------------------------------------------
+# Tests
+# -------------------------------------------------------------------------
 
 
 @RunIf(min_torch="2.4", min_cuda_gpus=2, standalone=True)
@@ -91,20 +115,42 @@ def load_model_checkpoint(tmp_path, expected_strategy_name, accelerator, devices
 )
 def test_trainer_distributed_async_checkpointio_integration_cuda(tmp_path, expected_strategy_name, devices):
     torch.manual_seed(1234)
-    save_model_checkpoint(tmp_path, expected_strategy_name, accelerator="cuda", devices=devices)
 
-    ckpt_path = get_checkpoint_path(tmp_path)
+    tmp_path = save_model_checkpoint(
+        tmp_path,
+        expected_strategy_name,
+        accelerator="cuda",
+        devices=devices,
+    )
+
+    ckpt_path = _find_checkpoint(Path(tmp_path))
     _wait_for_dcp_metadata(ckpt_path)
 
-    load_model_checkpoint(tmp_path, expected_strategy_name, accelerator="cuda", devices=devices)
+    load_model_checkpoint(
+        tmp_path,
+        expected_strategy_name,
+        accelerator="cuda",
+        devices=devices,
+    )
 
 
 @RunIf(min_torch="2.4", standalone=True)
 def test_trainer_distributed_async_checkpointio_integration_cpu(tmp_path):
     torch.manual_seed(1234)
-    save_model_checkpoint(tmp_path, "SingleDeviceStrategy", accelerator="cpu", devices=1)
 
-    ckpt_path = get_checkpoint_path(tmp_path)
+    save_model_checkpoint(
+        tmp_path,
+        "SingleDeviceStrategy",
+        accelerator="cpu",
+        devices=1,
+    )
+
+    ckpt_path = _find_checkpoint(Path(tmp_path))
     _wait_for_dcp_metadata(ckpt_path)
 
-    load_model_checkpoint(tmp_path, "SingleDeviceStrategy", accelerator="cpu", devices=1)
+    load_model_checkpoint(
+        tmp_path,
+        "SingleDeviceStrategy",
+        accelerator="cpu",
+        devices=1,
+    )
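Aside: `_wait_for_dcp_metadata` (kept as context above) relies on torch.distributed.checkpoint (DCP) writing its `.metadata` file only after the shard data, so its appearance signals that the checkpoint is complete. A rough single-process sketch of what a DCP checkpoint directory ends up containing (illustrative only, not part of this commit; the gloo process group, env settings, and temp-dir handling are assumptions):

# sketch_dcp_metadata.py -- illustrative, not part of this commit
import os
import tempfile

import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dcp


def main() -> None:
    # Single-process "gloo" group so DCP's collectives have something to run on.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    ckpt_dir = tempfile.mkdtemp(prefix="dcp_ckpt_")
    dcp.save({"weight": torch.randn(4, 4)}, checkpoint_id=ckpt_dir)

    # Expect per-rank shard files plus ".metadata" -- the file the test polls for.
    print(sorted(os.listdir(ckpt_dir)))

    dist.destroy_process_group()


if __name__ == "__main__":
    main()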
