Improve test coverage for broadcast_sigterm_every_n_steps

c-pozzi · c-pozzi · commit bac8431f08ab · 2026-04-06T13:26:14.000Z
- Fix epoch boundary test: mock world_size property instead of
  _devices_flag, which didn't affect trainer.world_size
- Rewrite interval test to call real advance() with mocked distributed
  instead of reimplementing the logic in the test
- Add ddp_spawn integration test exercising real NCCL broadcasts on
  2 GPUs with non-aligned step count to trigger epoch-end flush
diff --git a/tests/tests_pytorch/loops/test_training_epoch_loop.py b/tests/tests_pytorch/loops/test_training_epoch_loop.py
@@ -22,6 +22,7 @@
 from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint
 from lightning.pytorch.demos.boring_classes import BoringModel
 from lightning.pytorch.trainer.trainer import Trainer
+from tests_pytorch.helpers.runif import RunIf
 
 
 def test_no_val_on_train_epoch_loop_restart(tmp_path):
@@ -246,20 +247,29 @@ def test_broadcast_sigterm_every_n_steps_default():
 
 @pytest.mark.parametrize("n_steps", [1, 5, 10])
 def test_broadcast_sigterm_interval(n_steps):
-    """Test that _broadcast_sigterm_tensor is called at the correct interval."""
+    """Test that _broadcast_sigterm_tensor is called at the correct interval during advance()."""
     trainer = Trainer(broadcast_sigterm_every_n_steps=n_steps)
     epoch_loop = trainer.fit_loop.epoch_loop
 
     total_steps = 20
-    broadcast_call_count = 0
 
-    for _ in range(total_steps):
-        epoch_loop._sigterm_broadcast_step += 1
-        if epoch_loop._sigterm_broadcast_step >= trainer.broadcast_sigterm_every_n_steps:
-            epoch_loop._sigterm_broadcast_step = 0
-            broadcast_call_count += 1
-
-    assert broadcast_call_count == total_steps // n_steps
+    with (
+        patch.object(epoch_loop, "_broadcast_sigterm_tensor") as mock_broadcast,
+        patch("torch.distributed.is_available", return_value=True),
+        patch("torch.distributed.is_initialized", return_value=True),
+        patch.object(type(trainer), "world_size", new_callable=lambda: property(lambda self: 2)),
+    ):
+        for _ in range(total_steps):
+            # Make the fetcher raise StopIteration so advance() exits when it tries to fetch a batch,
+            # i.e. right after the broadcast check and before any training step runs.
+            mock_fetcher = Mock()
+            mock_fetcher.__next__ = Mock(side_effect=StopIteration)
+            try:
+                epoch_loop.advance(mock_fetcher)
+            except (StopIteration, TypeError, AttributeError):
+                pass
+
+    assert mock_broadcast.call_count == total_steps // n_steps
     assert epoch_loop._sigterm_broadcast_step == total_steps % n_steps
 
 
@@ -292,3 +302,32 @@ def test_broadcast_sigterm_forced_at_epoch_boundary():
 
     mock_broadcast.assert_called_once()
     assert epoch_loop._sigterm_broadcast_step == 0
+
+
+@RunIf(min_cuda_gpus=2)
+def test_broadcast_sigterm_interval_ddp(tmp_path):
+    """Test that broadcast_sigterm_every_n_steps controls broadcast frequency in real DDP training.
+
+    Uses ddp_spawn to exercise the real torch.distributed broadcast paths (periodic and epoch-end forced broadcast).
+    The epoch-end broadcast should reset _sigterm_broadcast_step to 0; this is not asserted, only that fit() completes.
+    """
+    n_steps = 5
+    limit_train_batches = 7  # 7 % 5 = 2 remaining steps, triggers epoch-end forced broadcast
+
+    model = BoringModel()
+    trainer = Trainer(
+        default_root_dir=tmp_path,
+        max_epochs=1,
+        limit_train_batches=limit_train_batches,
+        accelerator="gpu",
+        devices=2,
+        strategy="ddp_spawn",
+        broadcast_sigterm_every_n_steps=n_steps,
+        enable_progress_bar=False,
+        enable_model_summary=False,
+        enable_checkpointing=False,
+        logger=False,
+    )
+    # Training should complete without hanging — the epoch-end forced broadcast
+    # ensures all ranks stay in sync even when limit_train_batches is not a multiple of n_steps.
+    trainer.fit(model)