@@ -195,58 +195,66 @@ def test_grad_sync(
 
     params = list(model.parameters())
     map_bucket_to_last_param_idx = {}
-    for i, param in enumerate(params):
-        if not (param in param_to_bucket_group):
-            # it means this parameter is not on this device, skip
-            continue
-        bucket_group = param_to_bucket_group[param]
-        if bucket_group in map_bucket_to_last_param_idx:
-            param_idx = map_bucket_to_last_param_idx[bucket_group] + 1
-        else:
-            param_idx = 0
-        map_bucket_to_last_param_idx[bucket_group] = param_idx
-
-        register_grad_sync_context = (
-            contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
-        )
-        finish_grad_sync_context = contextlib.nullcontext()
-        if (
-            param_idx < (len(bucket_group.params) - 1)
-            and overlap_grad_reduce
-            and num_distributed_optimizer_instances == 1
-        ):
-            # Can't finish grad sync until all params have been registered ready.
-            finish_grad_sync_context = pytest.raises(AssertionError)
-
-        with register_grad_sync_context:
-            bucket_group.register_grad_ready(param)
-        with finish_grad_sync_context:
-            # When overlap_grad_reduce is True, this should throw an assertion error until all
-            # params in the model have registered their grad above.
-            # When overlap_grad_reduce is False, the collective is forced through.
-            bucket_group.finish_grad_sync()
-
-        if bucket_group in non_ep_bucket_groups:
-            expected_grad_data_value = non_ep_expected_grad_data_value_after_collective
-        else:
-            expected_grad_data_value = ep_expected_grad_data_value_after_collective
-        # Before gradient sync, the gradient value should keep original.
-        if overlap_grad_reduce and param_idx < (len(bucket_group.params) - 1):
-            if bucket_group in non_ep_bucket_groups:
-                expected_grad_data_value = 1
+    for iteration in range(2):
+        for i, param in enumerate(params):
+            if not (param in param_to_bucket_group):
+                # it means this parameter is not on this device, skip
+                continue
+            bucket_group = param_to_bucket_group[param]
+            if bucket_group in map_bucket_to_last_param_idx:
+                param_idx = map_bucket_to_last_param_idx[bucket_group] + 1
             else:
-                expected_grad_data_value = ep_size * etp_size
+                param_idx = 0
+            map_bucket_to_last_param_idx[bucket_group] = param_idx
+
+            register_grad_sync_context = (
+                contextlib.nullcontext() if overlap_grad_reduce else pytest.raises(AssertionError)
+            )
+            finish_grad_sync_context = contextlib.nullcontext()
+            if (
+                param_idx < (len(bucket_group.params) - 1)
+                and overlap_grad_reduce
+                and num_distributed_optimizer_instances == 1
+            ):
+                # Can't finish grad sync until all params have been registered ready.
+                finish_grad_sync_context = pytest.raises(AssertionError)
 
-        if bucket_group in non_ep_bucket_groups:
-            assert non_ep_param_and_grad_buffer.grad_data[0] == expected_grad_data_value
-        else:
-            assert ep_param_and_grad_buffer.grad_data[0] == expected_grad_data_value
+            with register_grad_sync_context:
+                bucket_group.register_grad_ready(param)
+            # Don't call finish_grad_sync() multiple times in the first iteration when
+            # golden_per_param_grad_ready_counts is being populated.
+            if iteration == 0 and i < (len(params) - 1):
+                continue
+            with finish_grad_sync_context:
+                # When overlap_grad_reduce is True, this should throw an assertion error until all
+                # params in the model have registered their grad above.
+                # When overlap_grad_reduce is False, the collective is forced through.
+                bucket_group.finish_grad_sync()
 
-        if not overlap_grad_reduce:
-            # Reset grad_data for subsequent collectives.
             if bucket_group in non_ep_bucket_groups:
-                non_ep_param_and_grad_buffer.grad_data.data.fill_(1.0)
+                expected_grad_data_value = non_ep_expected_grad_data_value_after_collective
             else:
-                ep_param_and_grad_buffer.grad_data.data.fill_(float(ep_size * etp_size))
+                expected_grad_data_value = ep_expected_grad_data_value_after_collective
+            # Before gradient sync, the gradient value should keep original.
+            if overlap_grad_reduce and param_idx < (len(bucket_group.params) - 1):
+                if bucket_group in non_ep_bucket_groups:
+                    expected_grad_data_value = 1
+                else:
+                    expected_grad_data_value = ep_size * etp_size
+
+            if bucket_group in non_ep_bucket_groups:
+                assert non_ep_param_and_grad_buffer.grad_data[0] == expected_grad_data_value
+            else:
+                assert ep_param_and_grad_buffer.grad_data[0] == expected_grad_data_value
+
+            if not overlap_grad_reduce:
+                # Reset grad_data for subsequent collectives.
+                if bucket_group in non_ep_bucket_groups:
+                    non_ep_param_and_grad_buffer.grad_data.data.fill_(1.0)
+                else:
+                    ep_param_and_grad_buffer.grad_data.data.fill_(float(ep_size * etp_size))
+
+        # Call reset to set .is_first_batch to False.
+        bucket_group.reset()
 
     Utils.destroy_model_parallel()
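The change wraps the per-parameter loop in `for iteration in range(2)`, skips the repeated `finish_grad_sync()` calls while `golden_per_param_grad_ready_counts` is being populated in the first pass, and calls `bucket_group.reset()` so that the second pass runs with `.is_first_batch` set to False. As background, here is a minimal, self-contained sketch of that kind of first-batch bookkeeping. The `ToyBucketGroup` class and its internals are illustrative assumptions only: the method and attribute names (`register_grad_ready`, `finish_grad_sync`, `reset`, `golden_per_param_grad_ready_counts`, `is_first_batch`) mirror identifiers visible in the diff, but the logic is not Megatron-LM's actual implementation.

```python
# Illustrative sketch (assumed behavior, not Megatron-LM's implementation):
# a bucket group that learns per-parameter grad-ready counts on its first
# batch and validates later batches against those counts.


class ToyBucketGroup:
    def __init__(self, params):
        self.params = list(params)
        self.is_first_batch = True
        self.golden_per_param_grad_ready_counts = {}  # frozen after the first batch
        self._current_counts = {p: 0 for p in self.params}

    def register_grad_ready(self, param):
        # Count every grad-ready notification; a param may report more than
        # once per batch (e.g. a shared or re-used parameter).
        self._current_counts[param] += 1

    def finish_grad_sync(self):
        if self.is_first_batch:
            # First batch: snapshot the observed counts as the reference.
            # Calling this repeatedly mid-batch would snapshot partial counts,
            # which is why the test only calls it once in iteration 0.
            self.golden_per_param_grad_ready_counts = dict(self._current_counts)
        else:
            # Later batches must reproduce the counts seen in the first batch.
            assert self._current_counts == self.golden_per_param_grad_ready_counts
        # A real bucket group would launch / wait on the reduce collective here.

    def reset(self):
        # Batch boundary: after the first batch the golden counts are final.
        self.is_first_batch = False
        self._current_counts = {p: 0 for p in self.params}


# Two batches, mirroring the test's `for iteration in range(2)` loop.
group = ToyBucketGroup(params=["weight", "bias"])
for _ in range(2):
    for p in group.params:
        group.register_grad_ready(p)
    group.finish_grad_sync()
    # Same role as the reset() call added in the diff.
    group.reset()
print(group.golden_per_param_grad_ready_counts)  # {'weight': 1, 'bias': 1}
```

Under this sketch, `reset()` marks the batch boundary: after the first batch the recorded counts become the reference and `is_first_batch` flips to False, which is the state the test's second iteration is meant to exercise.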