@@ -83,6 +83,7 @@ def test(self, zero_stage):
         data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device)
 
         run_unbalanced_gradients(model, data_loader)
+        model.destroy()
 
 
 # testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227
@@ -143,6 +144,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 # testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227
 # also reproduces the https://github.com/deepspeedai/DeepSpeed/pull/1372
@@ -243,6 +246,8 @@ def forward(self, x, y):
                 # float() workaround for torch<1.6
                 assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float())
 
+        model.destroy()
+
     def test_2_param_groups(self, tmpdir, zero_stage, freeze_params):
         # TODO:
         # - need to test with multiple param groups
@@ -348,6 +353,8 @@ def forward(self, x, y):
                 # float() workaround for torch<1.6
                 assert torch.allclose(orig_state_dict[name].float(), fp32_state_dict[name].float())
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("allgather_bucket_size", [1000, 1001])
 class TestIncorectAllgatherBucketSize(DistributedTest):
@@ -821,6 +828,8 @@ def create_tensor(vals, dtype: torch.dtype = None) -> Tensor:
         _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
         assert not math.isclose(ds_engine.optimizer._global_grad_norm, 0.0)
 
+        ds_engine.destroy()
+
 
 @pytest.mark.parametrize("init_context_manager", [True, False])
 @pytest.mark.parametrize("reduce_scatter", [True, False])
@@ -893,6 +902,8 @@ def forward(self, x: Tensor) -> Tensor:
 
             assert torch.allclose(weight_gradient, expected_weight_gradient)
 
+        ds_engine.destroy()
+
 
 @pytest.mark.parametrize("init_context_manager", [True, False])
 class TestZero3ParamPartitioningManyParams(DistributedTest):
@@ -977,6 +988,8 @@ def forward(self, x: Tensor) -> Tensor:
         for layer_num, activation in enumerate(weight_gradients):
             pass
 
+        ds_engine.destroy()
+
 
 class TestZero3InitForParentWeightInitialization(DistributedTest):
     world_size = 4
@@ -1197,6 +1210,8 @@ def create_tensor(vals):
         ds_engine.optimizer.step()
         _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE})
 
+        ds_engine.destroy()
+
 
 class TestParamPartitioningSkipInit(DistributedTest):
     world_size = 2
@@ -1274,6 +1289,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 class TestZeroOffloadStage1(DistributedTest):
     world_size = 2
@@ -1311,6 +1328,8 @@ def test(self):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("return_type", [tuple, list, dict])
 class TestZero3DictFwd(DistributedTest):
@@ -1373,6 +1392,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 class TestZeroAdamOptimizerStepCount(DistributedTest):
@@ -1439,6 +1460,8 @@ def test(self, zero_stage):
         assert all(step == step_counts[0] for step in step_counts)
         assert model.global_steps == step_counts[0]
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 class TestZeroFrozenWeights(DistributedTest):
@@ -1497,6 +1520,8 @@ def forward(self, x, y):
             model.backward(loss)
             model.step()
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("force_ds_optim", [True, False])
 class TestZeroOffloadOptim(DistributedTest):
@@ -1577,6 +1602,8 @@ def test_training_partition_cache(self, training):
         model.empty_partition_cache()
         assert sum([p.numel() for p in model.parameters()]) == 0
 
+        model.destroy()
+
 
 @pytest.mark.parametrize("use_client_optimizer", [True, False])
 @pytest.mark.parametrize("empty_weight_group", [True, False])
@@ -1629,6 +1656,8 @@ def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_grou
             config=config_dict,
         )
 
+        model.destroy()
+
 
 class TestZero3SwitchModes(DistributedTest):
     world_size = 2
@@ -1674,6 +1703,8 @@ def test(self, prefetch_ratio, zero_stage=3):
             for batch in data_loader:
                 loss = model(batch[0], batch[1])
 
+        model.destroy()
+
 
 # Avoid overwriting client module id
 # https://github.com/deepspeedai/DeepSpeed/issues/6772
@@ -1707,3 +1738,4 @@ def forward(self, x):
         model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
         post_init_m_id = model.id
         assert pre_init_m_id == post_init_m_id
+        model.destroy()
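Each hunk above ends its test by calling `.destroy()` on the DeepSpeed engine, which releases the engine's module hooks and ZeRO optimizer/parameter state before the next test builds a fresh engine. Below is a minimal sketch of that initialize -> train-step -> destroy lifecycle; the toy model, config values, and single-process launch are assumptions made for illustration and are not taken from the test suite.

```python
# Minimal sketch of the engine lifecycle, assuming a single-process launch
# (e.g. `deepspeed --num_gpus 1 sketch.py`). Toy model and config values
# below are illustrative only.
import torch
import deepspeed

config = {
    "train_batch_size": 8,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 3},
}

model = torch.nn.Linear(16, 16)
engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=config)

# One training step through the engine.
x = torch.randn(8, 16, device=engine.device)
loss = engine(x).sum()
engine.backward(loss)
engine.step()

# Release hooks and ZeRO optimizer/parameter state so a subsequent
# deepspeed.initialize() starts from a clean engine.
engine.destroy()
```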