@@ -83,6 +83,7 @@ def test(self, zero_stage):
         data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device)
 
         run_unbalanced_gradients(model, data_loader)
+        model.destroy()
 
 
 # testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227
@@ -143,6 +144,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 # testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227
 # also reproduces the https://github.com/deepspeedai/DeepSpeed/pull/1372
@@ -243,6 +246,8 @@ def forward(self, x, y):
                 # float() workaround for torch<1.6
                 assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float())
 
+        model.destroy()
+
     def test_2_param_groups(self, tmpdir, zero_stage, freeze_params):
         # TODO:
         # - need to test with multiple param groups
@@ -348,6 +353,8 @@ def forward(self, x, y):
                 # float() workaround for torch<1.6
                 assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float())
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("allgather_bucket_size", [1000, 1001])
 class TestIncorectAllgatherBucketSize(DistributedTest):
@@ -821,6 +828,8 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
         _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
         assert not math.isclose(ds_engine.optimizer._global_grad_norm, 0.0)
 
+        ds_engine.destroy()
+
 
 @pytest.mark.parametrize("init_context_manager", [True, False])
 @pytest.mark.parametrize("reduce_scatter", [True, False])
@@ -893,6 +902,8 @@ def forward(self, x: Tensor) -> Tensor:
 
             assert torch.allclose(weight_gradient, expected_weight_gradient)
 
+        ds_engine.destroy()
+
 
 @pytest.mark.parametrize("init_context_manager", [True, False])
 class TestZero3ParamPartitioningManyParams(DistributedTest):
@@ -977,6 +988,8 @@ def forward(self, x: Tensor) -> Tensor:
         for layer_num, activation in enumerate(weight_gradients):
             pass
 
+        ds_engine.destroy()
+
 
 class TestZero3InitForParentWeightInitialization(DistributedTest):
     world_size = 4
@@ -1197,6 +1210,8 @@ def create_tensor(vals):
         ds_engine.optimizer.step()
         _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
 
+        ds_engine.destroy()
+
 
 class TestParamPartitioningSkipInit(DistributedTest):
     world_size = 2
@@ -1274,6 +1289,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 class TestZeroOffloadStage1(DistributedTest):
     world_size = 2
@@ -1311,6 +1328,8 @@ def test(self):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("return_type", [tuple, list, dict])
 class TestZero3DictFwd(DistributedTest):
@@ -1373,6 +1392,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 class TestZeroAdamOptimizerStepCount(DistributedTest):
@@ -1439,6 +1460,8 @@ def test(self, zero_stage):
         assert all(step == step_counts[0] for step in step_counts)
         assert model.global_steps == step_counts[0]
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 class TestZeroFrozenWeights(DistributedTest):
@@ -1497,6 +1520,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("force_ds_optim", [True, False])
 class TestZeroOffloadOptim(DistributedTest):
@@ -1577,6 +1602,8 @@ def test_training_partition_cache(self, training):
         model.empty_partition_cache()
         assert sum([p.numel() for p in model.parameters()]) == 0
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("use_client_optimizer", [True, False])
 @pytest.mark.parametrize("empty_weight_group", [True, False])
@@ -1629,6 +1656,8 @@ def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_grou
             config=config_dict,
         )
 
+        model.destroy()
+
 
 class TestZero3SwitchModes(DistributedTest):
     world_size = 2
@@ -1674,6 +1703,8 @@ def test(self, prefetch_ratio, zero_stage=3):
             for batch in data_loader:
                 loss = model(batch[0], batch[1])
 
+        model.destroy()
+
 
 # Avoid overwriting client module id
 # https://github.com/deepspeedai/DeepSpeed/issues/6772
@@ -1707,3 +1738,4 @@ def forward(self, x):
         model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
         post_init_m_id = model.id
         assert pre_init_m_id == post_init_m_id
+        model.destroy()
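Each hunk above ends its test by calling `.destroy()` on the DeepSpeed engine, which releases the engine's module hooks and ZeRO optimizer/parameter state before the next test builds a fresh engine. Below is a minimal sketch of that initialize -> train-step -> destroy lifecycle; the toy model, config values, and single-process launch are assumptions made for illustration and are not taken from the test suite.

```python
# Minimal sketch of the engine lifecycle, assuming a single-process launch
# (e.g. `deepspeed --num_gpus 1 sketch.py`). Toy model and config values
# below are illustrative only.
import torch
import deepspeed

config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 3},
}

model = torch.nn.Linear(16, 16)
engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=config)

# One training step through the engine.
x = torch.randn(8, 16, device=engine.device)
loss = engine(x).sum()
engine.backward(loss)
engine.step()

# Release hooks and ZeRO optimizer/parameter state so a subsequent
# deepspeed.initialize() starts from a clean engine.
engine.destroy()
```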