Commit 9dde296

Merge branch 'master' into async_tp
2 parents: d5247b9 + 2e7f8e5

File tree

6 files changed: +27 -9 lines

.github/workflows/nv-a6000.yml
.github/workflows/nv-flash-attn.yml
.github/workflows/nv-human-eval.yml
deepspeed/runtime/zero/stage3.py
tests/unit/runtime/half_precision/test_fp16.py
tests/unit/runtime/test_multiple_models.py

.github/workflows/nv-a6000.yml (+3 -3)

@@ -23,7 +23,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -58,8 +58,8 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12"
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.6" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.6" --cuda_ver="12"
       - name: MII unit tests
         run: |
           BRANCH="main"

.github/workflows/nv-flash-attn.yml (+2 -2)

@@ -18,7 +18,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -53,7 +53,7 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.6" --cuda_ver="12"
       - name: Open GitHub issue if nightly CI fails
         if: ${{ failure() && (github.event_name == 'schedule') }}
         uses: JasonEtco/create-an-issue@v2

.github/workflows/nv-human-eval.yml (+2 -2)

@@ -11,7 +11,7 @@ jobs:
   unit-tests:
     runs-on: [self-hosted, nvidia, a6000]
     container:
-      image: nvcr.io/nvidia/pytorch:24.09-py3
+      image: nvcr.io/nvidia/pytorch:24.12-py3
       ports:
         - 80
       options: --gpus all --shm-size "8G"
@@ -50,4 +50,4 @@ jobs:
         run: |
           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
           cd tests
-          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.6" --cuda_ver="12"

deepspeed/runtime/zero/stage3.py (+2 -2)

@@ -448,7 +448,7 @@ def destroy(self):
         for hook in self._leaf_module_hooks:
             hook.remove()
         print_rank_0("Removed grad acc hooks", force=False)
-        del self.__ipg_bucket_flat_buffer
+        self._release_ipg_buffers()

     def initialize_ds_offload(
         self,
@@ -967,7 +967,7 @@ def _create_fp16_sub_groups(self, params_group):

     def _release_ipg_buffers(self):
         if self.contiguous_gradients:
-            self.ipg_buffer = None
+            self.__ipg_bucket_flat_buffer = None

     def _optimizer_step(self, sub_group_id):
         param_group_id = self.sub_group_to_group_id[sub_group_id]
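Two things change here: destroy() now routes cleanup through the existing _release_ipg_buffers() helper instead of deleting the name-mangled attribute directly, and the helper now clears __ipg_bucket_flat_buffer (the attribute that actually holds the bucket) rather than the stale ipg_buffer name. A minimal sketch of why the guarded helper is safer, using a hypothetical class standing in for the real stage-3 optimizer:

# Sketch only: Stage3Like is a stand-in, not DeepSpeed's actual class.
class Stage3Like:
    def __init__(self, contiguous_gradients: bool):
        self.contiguous_gradients = contiguous_gradients
        if contiguous_gradients:
            # Double underscore triggers name mangling: the attribute is
            # stored as _Stage3Like__ipg_bucket_flat_buffer.
            self.__ipg_bucket_flat_buffer = bytearray(1024)

    def _release_ipg_buffers(self):
        # Guarded release: only touches the buffer when it can exist, and
        # assigning None drops the reference so the memory can be freed.
        if self.contiguous_gradients:
            self.__ipg_bucket_flat_buffer = None

    def destroy(self):
        # A bare `del self.__ipg_bucket_flat_buffer` would raise
        # AttributeError when contiguous_gradients is False; the helper
        # is safe in both configurations.
        self._release_ipg_buffers()

Stage3Like(contiguous_gradients=False).destroy()  # no AttributeError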

tests/unit/runtime/half_precision/test_fp16.py (+12)

@@ -357,6 +357,8 @@ def test(self, zero_stage, use_cpu_offload):
         model.backward(loss)
         model.step()

+        model.destroy()
+

 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 @pytest.mark.parametrize("use_cpu_offload", [True, False])
@@ -402,6 +404,8 @@ def test(self, zero_stage, use_cpu_offload, hidden_dim=4):
         model.backward(loss)
         model.step()

+        model.destroy()
+

 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
 @pytest.mark.parametrize("use_cpu_offload", [True, False])
@@ -436,6 +440,7 @@ def test(self, zero_stage, use_cpu_offload):
                                              model=model,
                                              optimizer=optimizer,
                                              model_parameters=model.parameters())
+        model.destroy()


 @pytest.mark.parametrize("zero_stage", [1, 2, 3])
@@ -486,6 +491,8 @@ def test(self, zero_stage, use_cpu_offload):
         model.backward(loss)
         model.step()

+        model.destroy()
+

 @amp_available
 class TestAmp(DistributedTest):
@@ -615,6 +622,7 @@ def test(self, zero_stage, optimizer_constructor):
         model = SimpleModel(hidden_dim)
         client_optimizer = optimizer_constructor(params=model.parameters())
         model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=client_optimizer)
+        model.destroy()


 class TestZero2ReduceScatterOff(DistributedTest):
@@ -727,6 +735,8 @@ def test(self):
         model.backward(loss)
         model.step()

+        model.destroy()
+

 @pytest.mark.parametrize('stage', [1, 2, 3])
 class TestZeroEmptyGrad(DistributedTest):
@@ -755,3 +765,5 @@ def test(self, stage):
         loss = model(batch[0], batch[1])
         model.backward(loss)
         model.step()
+
+        model.destroy()
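The pattern added throughout this file: every test that builds an engine now ends with model.destroy(), so ZeRO hooks and gradient buffers from one parameterized case cannot leak into the next. A minimal sketch of the shape of these tests; the config below is illustrative rather than copied from the suite, and the script is meant to run under the deepspeed launcher:

# Sketch only: run via `deepspeed this_script.py`; config values are assumed.
import torch
import deepspeed

config = {
    "train_batch_size": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 3},
}

model = torch.nn.Linear(8, 8)
engine, _, _, _ = deepspeed.initialize(config=config,
                                       model=model,
                                       model_parameters=model.parameters())

x = torch.randn(1, 8, dtype=torch.half, device=engine.device)
loss = engine(x).sum()
engine.backward(loss)
engine.step()

# The addition in this commit's tests: explicit teardown after the last
# step, which removes hooks and releases the IPG bucket via the fixed
# _release_ipg_buffers() path shown above.
engine.destroy()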

tests/unit/runtime/test_multiple_models.py (+6)

@@ -42,6 +42,9 @@ def train_shared_loss(num_models, config_dict, dtype):
         for m in models:
             m.optimizer.zero_grad()

+    for m in models:
+        m.destroy()
+

 def train_independent_loss(num_models, config_dict, dtype):
     hidden_dim = 64
@@ -59,6 +62,9 @@ def train_independent_loss(num_models, config_dict, dtype):
             m.backward(loss)
             m.step()

+    for m in models:
+        m.destroy()
+

 @pytest.mark.parametrize('num_models', [1, 2, 3])
 class TestMultipleModels(DistributedTest):
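test_multiple_models.py gets the same treatment, once per engine: when several models train in one process, each engine owns its own hooks and buffers, so teardown has to loop over all of them. A hedged sketch of a reusable helper one could build around this pattern; deepspeed_engines is hypothetical and not part of the test suite:

import contextlib
import deepspeed

@contextlib.contextmanager
def deepspeed_engines(models, config):
    # Hypothetical convenience wrapper: initialize one engine per model,
    # then guarantee every engine is destroyed even if the body raises.
    engines = [
        deepspeed.initialize(config=config, model=m,
                             model_parameters=m.parameters())[0]
        for m in models
    ]
    try:
        yield engines
    finally:
        for engine in engines:
            engine.destroy()  # mirrors the loops added in this commit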
