diff --git a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md index bf997775fe32..f27b1c6303eb 100644 --- a/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md +++ b/.github/ISSUE_TEMPLATE/deepspeed_chat_bug_report.md @@ -32,7 +32,7 @@ If applicable, add screenshots to help explain your problem. **System info (please complete the following information):** - OS: [e.g. Ubuntu 18.04] - GPU count and types [e.g. two machines with x8 A100s each] - - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using + - (if applicable) what [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) version are you using - (if applicable) Hugging Face Transformers/Accelerate/etc. versions - Python version - Any other relevant info about your setup diff --git a/.github/ISSUE_TEMPLATE/inference_bug_report.md b/.github/ISSUE_TEMPLATE/inference_bug_report.md index bc5df17258b0..8a4144ce049a 100644 --- a/.github/ISSUE_TEMPLATE/inference_bug_report.md +++ b/.github/ISSUE_TEMPLATE/inference_bug_report.md @@ -29,7 +29,7 @@ If applicable, add screenshots to help explain your problem. **System info (please complete the following information):** - OS: [e.g. Ubuntu 18.04] - GPU count and types [e.g. two machines with x8 A100s each] - - (if applicable) what [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) version are you using + - (if applicable) what [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) version are you using - (if applicable) Hugging Face Transformers/Accelerate/etc. versions - Python version - Any other relevant info about your setup diff --git a/.github/workflows/amd-mi100.yml b/.github/workflows/amd-mi100.yml deleted file mode 100644 index 7ad0f4178db4..000000000000 --- a/.github/workflows/amd-mi100.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: amd-mi100 - -on: - schedule: - - cron: "0 0 * * *" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - amd-tests: - # The type of runner that the job will run on - runs-on: [self-hosted, amd, mi100] - - # Steps represent a sequence of tasks that will be executed as part of the job - steps: - # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 - - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Install pytorch - run: | - pip install --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 - python -c "import torch; print('torch:', torch.__version__, torch)" - python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - - name: Install transformers - run: | - git clone https://github.com/huggingface/transformers - cd transformers - # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 - git rev-parse --short HEAD - pip install . 
- - # Runs a set of commands using the runners shell - - name: Install deepspeed - run: | - pip install .[dev,1bit,autotuning] - #python -c "from deepspeed.env_report import cli_main; cli_main()" - ds_report - - - name: Python environment - run: | - pip list - - # Runs a set of commands using the runners shell - - name: Unit tests - run: | - unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - pytest $PYTEST_OPTS -n 4 --verbose unit/ - pytest $PYTEST_OPTS -m 'sequential' unit/ diff --git a/.github/workflows/amd-mi200.yml b/.github/workflows/amd-mi200.yml index 8c4292d4675c..6e8d5847835d 100644 --- a/.github/workflows/amd-mi200.yml +++ b/.github/workflows/amd-mi200.yml @@ -1,9 +1,13 @@ name: amd-mi200 on: + workflow_dispatch: + pull_request: + paths: + - '.github/workflows/amd-mi200.yml' + - 'requirements/**' schedule: - cron: "0 0 * * *" - workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -21,14 +25,14 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm5.6 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/rocm6.0 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -44,8 +48,6 @@ jobs: - name: Install (ROCm) apex run: | git clone https://github.com/ROCmSoftwarePlatform/apex.git - cd apex - git checkout torch_2.1_higher CURRENT_VER=$(git rev-parse HEAD) INSTALLED_VER=$(cat /blob/amd-apex/.venv_installed_version) if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then diff --git a/.github/workflows/auto-sync.yml b/.github/workflows/auto-sync.yml deleted file mode 100644 index bfbf5a2ae37a..000000000000 --- a/.github/workflows/auto-sync.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: AutoSync - -on: - push: - branches: - - 'master' - -jobs: - - Create-PR: - runs-on: ubuntu-20.04 - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.GHP_TOKEN }} - repository: ${{ secrets.DST_REPO }} - ref: ${{ secrets.DST_REPO_BRANCH }} - path: dst-repo - - - name: Get PR data - run: | - echo "REPO=${{ github.repository }}" >> $GITHUB_ENV - echo "COMMIT_SHA=${{ github.event.after }}" >> $GITHUB_ENV - echo "SHORT_SHA=$(echo ${{ github.event.after }} | cut -c1-8)" >> $GITHUB_ENV - echo "USERNAME=${{ github.event.head_commit.author.username }}" >> $GITHUB_ENV - echo "USER_EMAIL=${{ github.event.head_commit.author.username }}@users.noreply.github.com" >> $GITHUB_ENV - echo "PR_NAME=$(echo '${{ github.event.head_commit.message }}' | head -1 | sed 's|#|${{ github.repository }}#|g')" >> $GITHUB_ENV - - - name: Cherry pick commit - continue-on-error: true - run: | - cd dst-repo - git config --global user.name ${{ env.USERNAME }} - git config --global user.email ${{ env.USER_EMAIL }} - git fetch https://github.com/${{ env.REPO }}.git master - git cherry-pick FETCH_HEAD --strategy-option octopus - - - name: Add modified files - run: | - cd dst-repo - git add . 
- - - name: Create Pull Request - uses: peter-evans/create-pull-request@v4 - with: - path: dst-repo - token: ${{ secrets.GHP_TOKEN }} - body: | - **Auto-generated PR** - Repo - [${{ env.REPO }}](https://github.com/${{ env.REPO }}) - PR name - ${{ env.PR_NAME }} - Commit - ${{ env.REPO }}@${{ env.COMMIT_SHA }} - Author - @${{ env.USERNAME }} - branch: AutoPR/${{ env.SHORT_SHA }} - assignees: ${{ env.USERNAME }} - title: ${{ env.PR_NAME }} - labels: AutoPR - author: ${{ env.USERNAME }} <${{ env.USER_EMAIL }}> diff --git a/.github/workflows/cpu-inference.yml b/.github/workflows/cpu-inference.yml index 521fe2b5bea4..007313964f4a 100644 --- a/.github/workflows/cpu-inference.yml +++ b/.github/workflows/cpu-inference.yml @@ -2,6 +2,19 @@ name: cpu-inference on: workflow_dispatch: + pull_request: + paths: + - '.github/workflows/cpu-inference.yml' + - 'requirements/**' + - 'deepspeed/__init__.py' + - 'deepspeed/inference/**' + - '!deepspeed/inference/v2/**' # exclude v2 dir + - 'tests/unit/inference/**' + - '!tests/unit/inference/v2/**' # exclude v2 tests dir + merge_group: + branches: [ master ] + schedule: + - cron: "0 0 * * 0" concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -9,45 +22,53 @@ concurrency: jobs: unit-tests: - runs-on: ubuntu-20.04 + runs-on: [self-hosted, cpu] + + env: {ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true} # Allow using Node16 actions steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv + - name: Install gcc-9 + run: | + sudo add-apt-repository -u ppa:ubuntu-toolchain-r/test + sudo apt install -y gcc-9 g++-9 + # set gcc-9 and g++9 to default + sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 99 + sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 99 + + - name: Check gcc version + run: | + # Get gcc version + gcc --version + g++ --version + - name: Detect instruction sets on instance run: | lscpu - pip install cmake - git clone https://github.com/intel/intel-extension-for-pytorch - cd intel-extension-for-pytorch/tests/cpu/isa - cmake . - make - ./cpu_features - name: Install numactl run: | sudo apt-get install -y numactl - - name: Install oneCCL Bindings for PyTorch + - name: Install dependencies run: | - python -m pip install intel_extension_for_pytorch - python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu + pip install torch + # check installed version + pip list |grep \\\ - name: Install oneCCL run: | + pip install cmake git clone https://github.com/oneapi-src/oneCCL cd oneCCL mkdir build cd build cmake .. 
- make - make install - #source ./_install/env/setvars.sh - # test whether oneCCL is correctly installed - #mpirun -n 2 ./examples/benchmark/benchmark + make -j install - name: Install transformers run: | @@ -62,14 +83,21 @@ jobs: pip install .[dev,1bit,autotuning,inf] ds_report - - name: Python environment + - name: Python environment check run: | pip list + source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 + # check whether the environment is properly setup + python -c "import deepspeed;from deepspeed.accelerator import get_accelerator;print(get_accelerator().device_name());print(get_accelerator().is_available())" - name: Unit tests run: | + # prep oneCCL for CCLBackend comm ops building source oneCCL/build/_install/env/setvars.sh + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ - TRANSFORMERS_CACHE=~/tmp/transformers_cache/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ + cd tests + # LOCAL_SIZE=2 enforce CPU to report 2 devices, this helps run the test on github default runner + LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'seq_inference' unit/ + LOCAL_SIZE=2 COLUMNS=240 HF_HOME=~/tmp/hf_home/ TORCH_EXTENSIONS_DIR=./torch-extensions pytest -m 'inference_ops' -m 'inference' unit/ diff --git a/.github/workflows/nv-torch110-p40.yml b/.github/workflows/cpu-torch-latest.yml similarity index 55% rename from .github/workflows/nv-torch110-p40.yml rename to .github/workflows/cpu-torch-latest.yml index 45f3e0438233..6496d7e35065 100644 --- a/.github/workflows/nv-torch110-p40.yml +++ b/.github/workflows/cpu-torch-latest.yml @@ -1,31 +1,39 @@ -name: nv-torch110-p40 +name: cpu-torch-latest on: + workflow_dispatch: + pull_request: + paths-ignore: + - 'docs/**' + - 'blogs/**' + - 'deepspeed/inference/v2/**' + - 'tests/unit/inference/v2/**' + merge_group: + branches: [ master ] schedule: - cron: "0 0 * * *" - workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -permissions: - contents: read - issues: write - jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, p40] + runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv + - name: Install system packages + run: | + sudo apt-get install -y numactl pdsh + - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -34,13 +42,13 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout 981c276 git rev-parse --short HEAD pip install . 
- name: Install deepspeed run: | - pip install .[dev,1bit,autotuning] --no-build-isolation + pip install .[dev,autotuning] ds_report - name: Python environment @@ -51,13 +59,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11.1" - - - name: Open GitHub issue if nightly CI fails - if: ${{ failure() && (github.event_name == 'schedule') }} - uses: JasonEtco/create-an-issue@v2 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - filename: .github/ISSUE_TEMPLATE/ci_failure_report.md - update_existing: true + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="2.6" + HF_HOME=/tmp/hf_home/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="2.6" diff --git a/.github/workflows/formatting.yml b/.github/workflows/formatting.yml index a168af277fb8..e33da160aaf3 100644 --- a/.github/workflows/formatting.yml +++ b/.github/workflows/formatting.yml @@ -1,6 +1,7 @@ name: Formatting on: + workflow_dispatch: pull_request: branches: '**' @@ -16,11 +17,11 @@ concurrency: jobs: # formatting and basic install on cpu-only machine - formatting: - runs-on: ubuntu-20.04 + unit-tests: + runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/hpu-gaudi2-nightly.yml b/.github/workflows/hpu-gaudi2-nightly.yml new file mode 100644 index 000000000000..c0576360cd61 --- /dev/null +++ b/.github/workflows/hpu-gaudi2-nightly.yml @@ -0,0 +1,85 @@ +name: hpu-gaudi2-nightly + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/hpu-gaudi2-nightly.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + + env: + PT_HPU_LAZY_MODE: 0 + TORCHINDUCTOR_COMPILE_THREADS: 1 + TEST_LIST: | + test_adamw.py + test_bf16.py + test_ds_config_dict.py + test_dynamic_loss_scale.py + test_latest_checkpoint.py + test_moe_checkpoint.py + test_multi_output_model.py + test_other_optimizer.py + test_pipe.py + test_pipeline.py + test_universal_checkpoint.py + test_zero_context_return.py + test_zero_leaf_module.py + test_zero_offloadpp.py + test_zero_tiled.py + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + + - name: Check container state + run: | + ldd --version + hl-smi -L + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install transformers + run: | + git clone https://github.com/huggingface/transformers + cd transformers + git rev-parse --short HEAD + pip install . 
+ + - name: Install deepspeed + run: | + pip install .[dev,autotuning] + ds_report + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd tests + export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE} + export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS} + TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}') + echo "TEST_LIST ${TEST_LIST}" + pytest --verbose unit/ -k "${TEST_LIST}" diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml new file mode 100644 index 000000000000..48730442686c --- /dev/null +++ b/.github/workflows/hpu-gaudi2.yml @@ -0,0 +1,136 @@ +name: hpu-gaudi2 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/hpu-gaudi2.yml" + - "accelerator/hpu_accelerator.py" + - "op_builder/hpu/**" + - "deepspeed/runtime/engine.py" + - "deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "deepspeed/inference/engine.py" + - "deepspeed/module_inject/auto_tp.py" + - "deepspeed/module_inject/replace_module.py" + - "deepspeed/module_inject/load_checkpoint.py" + - "deepspeed/module_inject/inject.py" + - "deepspeed/ops/transformer/**" + - "deepspeed/ops/adam/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + # The type of runner that the job will run on + runs-on: [self-hosted, intel, gaudi2] + container: + image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest + ports: + - 80 + options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice + + env: + PT_HPU_LAZY_MODE: 0 + TORCHINDUCTOR_COMPILE_THREADS: 1 + TEST_LIST: | + test_accelerator.py + test_autotuning.py + test_compression.py + test_dist.py + test_elastic.py + test_ds_arguments.py + test_run.py + test_multinode_runner.py + test_moe_tp.py + test_monitor.py + (test_zero_optimizer.py and (TestSaveTensorClone or TestZeRONonDistributed)) + (test_latest_checkpoint.py and test_missing_latest) + test_reshape_checkpoint.py + test_shared_weights.py + test_sparse.py + test_tag_validation.py + test_pipe_module.py + (test_flops_profiler.py and test_flops_profiler_in_inference) + test_get_optim_files.py + test_groups.py + test_partition_balanced.py + (test_adamw.py and TestAdamConfigs) + test_coalesced_collectives.py + test_activation_checkpointing_non_reentrant.py + test_activation_checkpointing.py + test_data.py + (test_ds_config_dict.py and (TestBasicConfig or TestBatchConfig)) + test_ds_config_model.py + test_mup_optimizers.py + (test_pld.py and test_pld_schedule) + test_runtime_utils.py + test_pipe_schedule.py + test_topology.py + (test_ds_initialize.py and (TestClientOptimizer or TestClientLrScheduler)) + test_csr.py + (test_fp16.py and (TestZeroEmptyGrad or TestZeroAllowUntestedOptimizer)) + (test_bf16.py and TestZeroDtypeCocktail) + test_partition.py + test_ignore_unused_parameters.py + test_zero_config.py + test_zero_context_ancestry.py + (test_zero_context.py and not 
TestSerialContext) + test_zero_dynamic_class.py + test_zero_nesting_init.py + test_zeropp.py + (test_zero.py and (TestZero3ParamPartitioningLargeParam or TestZero3ParamPartitioningLargeParam)) + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + + - name: Check container state + run: | + ldd --version + hl-smi -L + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + + - name: Install transformers + run: | + git clone https://github.com/huggingface/transformers + cd transformers + # if needed switch to the last known good SHA until transformers@master is fixed + git checkout 981c276 + git rev-parse --short HEAD + pip install . + + - name: Install deepspeed + run: | + pip install .[dev,autotuning] + ds_report + + - name: Python environment + run: | + pip list + + - name: Unit tests + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd tests + export PT_HPU_LAZY_MODE=${PT_HPU_LAZY_MODE} + export TORCHINDUCTOR_COMPILE_THREADS=${TORCHINDUCTOR_COMPILE_THREADS} + TEST_LIST=$(echo "$TEST_LIST" | awk 'NF{printf "%s%s", (NR>1 ? " or " : ""), $0} END{if (NR>1) print ""}') + echo "TEST_LIST ${TEST_LIST}" + pytest --verbose unit/ -k "${TEST_LIST}" diff --git a/.github/workflows/no-torch.yml b/.github/workflows/no-torch.yml new file mode 100644 index 000000000000..75c4ecb850c9 --- /dev/null +++ b/.github/workflows/no-torch.yml @@ -0,0 +1,49 @@ +name: no-torch + +on: + workflow_dispatch: + pull_request: + paths: + - 'accelerator/**' + - '.github/workflows/no-torch.yml' + - 'op_builder/**' + schedule: + - cron: "0 0 * * *" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + unit-tests: + runs-on: ubuntu-24.04 + + steps: + - uses: actions/checkout@v4 + + - id: setup-venv + uses: ./.github/workflows/setup-venv + + - name: Python environment + run: | + pip uninstall torch --yes + pip install setuptools + pip install build + pip list + + - name: Build deepspeed + run: | + DS_BUILD_STRING=" " python -m build --sdist + + - name: Open GitHub issue if nightly CI fails + if: ${{ failure() && (github.event_name == 'schedule') }} + uses: JasonEtco/create-an-issue@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + filename: .github/ISSUE_TEMPLATE/ci_failure_report.md + update_existing: true diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index a2b99de488d5..77a2661d08a6 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -3,29 +3,33 @@ name: nv-a6000 on: pull_request: paths: - - "deepspeed/inference/v2/**" - - "tests/unit/inference/v2/**" + - 'accelerator/cuda_accelerator.py' + - 'deepspeed/inference/v2/**' + - 'tests/unit/inference/v2/**' + - '.github/workflows/nv-a6000.yml' workflow_dispatch: + inputs: + mii_branch: + description: 'DeepSpeed-MII Branch' + required: false + default: 'main' + type: string concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -permissions: - contents: read - issues: write - jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.09-py3 ports: - 80 options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - 
uses: actions/checkout@v4 - name: Check container state run: | @@ -36,14 +40,16 @@ jobs: python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Install transformers run: | - git clone --depth=1 https://github.com/huggingface/transformers + git clone https://github.com/huggingface/transformers cd transformers + # if you need to use an older transformers version temporarily in case of breakage + git checkout 981c276 git rev-parse --short HEAD python -m pip install . - name: Install deepspeed run: | python -m pip install docutils==0.18.1 jinja2==3.0 urllib3==1.26.11 ninja - python -m pip install .[dev,1bit,autotuning] + python -m pip install .[dev,1bit,autotuning,inf] ds_report - name: Python environment run: | @@ -52,11 +58,16 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.0" --cuda_ver="12" - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12" - name: MII unit tests run: | - git clone --depth=1 https://github.com/microsoft/DeepSpeed-MII.git + BRANCH="main" + if [[ ! -z "${{ github.event.inputs.mii_branch }}" ]]; then + BRANCH="${{ github.event.inputs.mii_branch }}" + fi + echo "Cloning DeepSpeed-MII branch: $BRANCH" + git clone -b $BRANCH --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[dev] cd tests diff --git a/.github/workflows/nv-accelerate-v100.yml b/.github/workflows/nv-accelerate-v100.yml index 0f6491e08336..c9caf697b5b3 100644 --- a/.github/workflows/nv-accelerate-v100.yml +++ b/.github/workflows/nv-accelerate-v100.yml @@ -1,12 +1,13 @@ name: nv-accelerate-v100 on: + workflow_dispatch: pull_request: paths-ignore: - 'docs/**' - 'blogs/**' - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + - 'tests/unit/inference/v2/**' merge_group: branches: [ master ] schedule: @@ -18,17 +19,17 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -51,7 +52,5 @@ jobs: pip install .[testing] # force protobuf version due to issues pip install "protobuf<4.21.0" - # tmp fix: force newer datasets version - #pip install "datasets>=2.0.0" pip list pytest $PYTEST_OPTS --color=yes --durations=0 --verbose tests/deepspeed diff --git a/.github/workflows/nv-ds-chat.yml b/.github/workflows/nv-ds-chat.yml index b53fac36315b..faef9f180c77 100644 --- a/.github/workflows/nv-ds-chat.yml +++ b/.github/workflows/nv-ds-chat.yml @@ -10,29 +10,40 @@ on: required: false default: 'master' type: string + pull_request: + paths: + - ".github/workflows/nv-ds-chat.yml" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - 
"deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/hybrid_engine.py" concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + issues: write + jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Install deepspeed run: | + pip install transformers==4.48.3 pip install .[dev] ds_report @@ -43,7 +54,7 @@ jobs: BRANCH="${{ github.event.inputs.dse_branch }}" fi echo "DeepSpeedExamples Branch: $BRANCH" - git clone -b $BRANCH https://github.com/microsoft/DeepSpeedExamples.git + git clone -b $BRANCH https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat pip install -r requirements.txt pip install -e . @@ -56,6 +67,7 @@ jobs: run: | cd DeepSpeedExamples/applications/DeepSpeed-Chat unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + unset NCCL_DEBUG cd tests pytest $PYTEST_OPTS ./ diff --git a/.github/workflows/nv-torch110-v100.yml b/.github/workflows/nv-flash-attn.yml similarity index 54% rename from .github/workflows/nv-torch110-v100.yml rename to .github/workflows/nv-flash-attn.yml index 1fd8aaac0ffa..591969fbd986 100644 --- a/.github/workflows/nv-torch110-v100.yml +++ b/.github/workflows/nv-flash-attn.yml @@ -1,59 +1,59 @@ -name: nv-torch110-v100 +name: nv-flash-attn on: + workflow_dispatch: + pull_request: + paths: + - 'deepspeed/sequence/**' + - 'tests/unit/sequence_parallelism/**' + - '.github/workflows/nv-flash-attn.yml' schedule: - cron: "0 0 * * *" - workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true -permissions: - contents: read - issues: write - jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, a6000] + container: + image: nvcr.io/nvidia/pytorch:24.09-py3 + ports: + - 80 + options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Install pytorch + - name: Check container state run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html + ldd --version + nvcc --version + nvidia-smi python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Install transformers run: | - git clone https://github.com/huggingface/transformers + git clone --depth=1 https://github.com/huggingface/transformers cd transformers - # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 git rev-parse --short HEAD - pip install . - + python -m pip install . 
- name: Install deepspeed run: | - pip install .[dev,1bit,autotuning] --no-build-isolation + python -m pip install .[dev] ds_report - + - name: Install FlashAttention + run: | + python -m pip install flash-attn - name: Python environment run: | - pip list - + python -m pip list - name: Unit tests run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="1.10" --cuda_ver="11" - pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="1.10" --cuda_ver="11" - + python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} uses: JasonEtco/create-an-issue@v2 diff --git a/.github/workflows/nv-h100.yml b/.github/workflows/nv-h100.yml index a1b812b3eafd..5574ce8aa634 100644 --- a/.github/workflows/nv-h100.yml +++ b/.github/workflows/nv-h100.yml @@ -1,9 +1,9 @@ name: nv-h100 on: + workflow_dispatch: schedule: - cron: "0 0 * * *" - workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -23,7 +23,7 @@ jobs: options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml new file mode 100644 index 000000000000..3f59c42f697e --- /dev/null +++ b/.github/workflows/nv-human-eval.yml @@ -0,0 +1,53 @@ +name: nv-human-eval + +on: + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + unit-tests: + runs-on: [self-hosted, nvidia, a6000] + container: + image: nvcr.io/nvidia/pytorch:24.09-py3 + ports: + - 80 + options: --gpus all --shm-size "8G" + + steps: + - uses: actions/checkout@v4 + + - name: Check container state + run: | + ldd --version + nvcc --version + nvidia-smi + python -c "import torch; print('torch:', torch.__version__, torch)" + python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + - name: Install transformers + run: | + git clone --depth=1 https://github.com/huggingface/transformers + cd transformers + git rev-parse --short HEAD + python -m pip install . + - name: Clone Human Eval + run: | + git clone --depth=1 https://github.com/openai/human-eval.git + sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py + cd human-eval + git rev-parse --short HEAD + python -m pip install . 
+ - name: Install deepspeed + run: | + python -m pip install .[dev,1bit,autotuning] + ds_report + - name: Python environment + run: | + python -m pip list + - name: Unit tests + run: | + unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch + cd tests + python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12" diff --git a/.github/workflows/nv-inference.yml b/.github/workflows/nv-inference.yml index f20b4496b6df..8906130bc1af 100644 --- a/.github/workflows/nv-inference.yml +++ b/.github/workflows/nv-inference.yml @@ -1,12 +1,16 @@ name: nv-inference on: + workflow_dispatch: pull_request: - paths-ignore: - - 'docs/**' - - 'blogs/**' - - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + paths: + - '.github/workflows/nv-inference.yml' + - 'requirements/**' + - 'deepspeed/__init__.py' + - 'deepspeed/inference/**' + - '!deepspeed/inference/v2/**' # exclude v2 dir + - 'tests/unit/inference/**' + - '!tests/unit/inference/v2/**' # exclude v2 tests dir merge_group: branches: [ master ] schedule: @@ -18,17 +22,17 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -36,13 +40,14 @@ jobs: run: | git clone https://github.com/huggingface/transformers cd transformers - git checkout f370bebdc + #git checkout f370bebdc git rev-parse --short HEAD pip install . 
- name: Install deepspeed run: | - pip install .[dev,1bit,autotuning,inf,triton] + DS_ACCELERATOR=cpu pip install .[dev,1bit,autotuning,inf] + #pip install .[dev,1bit,autotuning,inf,triton] ds_report - name: Python environment @@ -53,12 +58,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="1.13" --cuda_ver="11.6" - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="1.13" --cuda_ver="11.6" - - - name: Coverage report - run: | - cd tests - coverage combine - coverage report -m + #pytest $PYTEST_OPTS -m 'seq_inference' unit/ --torch_ver="2.1" --cuda_ver="12.4" + pytest $PYTEST_OPTS -m 'inference_ops' unit/ --torch_ver="2.1" --cuda_ver="12.4" + pytest $PYTEST_OPTS --forked -n 4 -m 'inference' unit/ --torch_ver="2.1" --cuda_ver="12.4" + # run ds_report again to check updated op list + ds_report diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index d25d40aef967..d31ae5569848 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -1,12 +1,13 @@ name: nv-lightning-v100 on: + workflow_dispatch: pull_request: paths-ignore: - 'docs/**' - 'blogs/**' - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + - 'tests/unit/inference/v2/**' merge_group: branches: [ master ] schedule: @@ -18,17 +19,17 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu111, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/nv-megatron.yml b/.github/workflows/nv-megatron.yml deleted file mode 100644 index 3a3b70dcd17d..000000000000 --- a/.github/workflows/nv-megatron.yml +++ /dev/null @@ -1,63 +0,0 @@ -name: nv-megatron - -on: - pull_request: - paths-ignore: - - 'docs/**' - - 'blogs/**' - - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" - merge_group: - branches: [ master ] - schedule: - - cron: "0 0 * * *" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] - - steps: - - uses: actions/checkout@v3 - - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Install pytorch - run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 - python -c "import torch; print('torch:', torch.__version__, torch)" - python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - - name: Install deepspeed - run: | - pip install .[dev] - ds_report - - - name: Install apex - run: | - git clone https://github.com/NVIDIA/apex.git - cd apex - CURRENT_VER=$(git rev-parse HEAD) - INSTALLED_VER=$(cat 
/blob/apex/.venv_installed_version) - if [[ "$CURRENT_VER" != "$INSTALLED_VER" ]]; then - pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--global-option=--cpp_ext" --config-settings "--global-option=--cuda_ext" --target=/blob/apex/ --upgrade . - git rev-parse HEAD > /blob/apex/.venv_installed_version - fi - echo PYTHONPATH=$PYTHONPATH:/blob/apex/ >> $GITHUB_ENV - - - name: Python environment - run: | - pip list - - - name: Megatron unit tests - run: | - git clone https://github.com/microsoft/Megatron-DeepSpeed.git - cd Megatron-DeepSpeed - pip install . - unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - pytest $PYTEST_OPTS ./ diff --git a/.github/workflows/nv-mii.yml b/.github/workflows/nv-mii.yml index 86de2a3b0bcb..aff0c8a548c1 100644 --- a/.github/workflows/nv-mii.yml +++ b/.github/workflows/nv-mii.yml @@ -1,10 +1,21 @@ name: nv-mii on: + workflow_dispatch: + inputs: + mii_branch: + description: 'DeepSpeed-MII Branch' + required: false + default: 'main' + type: string pull_request: - paths-ignore: - - 'docs/**' - - 'blogs/**' + paths: + - '.github/workflows/nv-mii.yml' + - 'requirements/**' + - 'setup.py' + - 'deepspeed/__init__.py' + - 'deepspeed/inference/**' + - '!deepspeed/inference/v2/**' # exclude v2 dir merge_group: branches: [ master ] schedule: @@ -16,17 +27,17 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip3 install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118 + pip3 install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -35,7 +46,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout v4.42.4 git rev-parse --short HEAD pip install . @@ -50,7 +61,12 @@ jobs: - name: MII unit tests run: | - git clone https://github.com/microsoft/DeepSpeed-MII.git + BRANCH="main" + if [[ ! 
-z "${{ github.event.inputs.mii_branch }}" ]]; then + BRANCH="${{ github.event.inputs.mii_branch }}" + fi + echo "Cloning DeepSpeed-MII branch: $BRANCH" + git clone -b $BRANCH --depth=1 https://github.com/deepspeedai/DeepSpeed-MII.git cd DeepSpeed-MII pip install .[dev] unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch diff --git a/.github/workflows/nv-nightly.yml b/.github/workflows/nv-nightly.yml index 1ed7d34a6be4..7f81484c7646 100644 --- a/.github/workflows/nv-nightly.yml +++ b/.github/workflows/nv-nightly.yml @@ -1,6 +1,10 @@ name: nv-nightly on: + workflow_dispatch: + pull_request: + paths: + - '.github/workflows/nv-nightly.yml' schedule: - cron: "0 0 * * *" @@ -14,17 +18,17 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch==1.13.1 torchvision --extra-index-url https://download.pytorch.org/whl/cu116 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -33,10 +37,14 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout v4.42.4 git rev-parse --short HEAD pip install . + - name: Install datasets + run: | + pip install datasets + - name: Install deepspeed run: | pip install .[dev,1bit,autotuning,inf] @@ -50,7 +58,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="1.13" --cuda_ver="11.6" + pytest $PYTEST_OPTS --forked -m 'nightly' unit/ --torch_ver="2.6" --cuda_ver="12.4" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 839312190d22..fc810bc190d0 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -1,6 +1,7 @@ name: nv-pre-compile-ops on: + workflow_dispatch: pull_request: branches: '**' @@ -8,7 +9,7 @@ on: - 'docs/**' - 'blogs/**' - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + - 'tests/unit/inference/v2/**' merge_group: branches: [ master ] schedule: @@ -19,13 +20,13 @@ concurrency: cancel-in-progress: true jobs: - build-ops: - runs-on: ubuntu-20.04 + unit-tests: + runs-on: ubuntu-24.04 container: image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | @@ -35,7 +36,7 @@ jobs: #python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Compile DeepSpeed Ops run: | - TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . + DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 pip3 install . 
- name: DS Report run: | ds_report diff --git a/.github/workflows/nv-sd.yml b/.github/workflows/nv-sd.yml index 5ca159074a4d..af406075b868 100644 --- a/.github/workflows/nv-sd.yml +++ b/.github/workflows/nv-sd.yml @@ -1,15 +1,19 @@ name: nv-sd on: + workflow_dispatch: schedule: - cron: "0 0 * * 0" - workflow_dispatch: pull_request: paths: - "deepspeed/ops/transformer/inference/diffusers_**" - "tests/unit/inference/test_stable_diffusion.py" - "deepspeed/model_implementations/diffusers/unet.py" - "deepspeed/model_implementations/diffusers/vae.py" + - "deepspeed/module_inject/containers/vae.py" + - "deepspeed/module_inject/containers/unet.py" + - ".github/workflows/nv-sd.yml" + - "requirements/requirements-sd.txt" concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -23,13 +27,13 @@ jobs: sd-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:23.03-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 ports: - 80 options: --gpus all --shm-size "8G" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check container state run: | @@ -58,7 +62,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.0" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'stable_diffusion' -k "TestStableDiffusion" unit/ --torch_ver="2.3" --cuda_ver="12" - name: Open GitHub issue if weekly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} diff --git a/.github/workflows/nv-torch-latest-cpu.yml b/.github/workflows/nv-torch-latest-cpu.yml deleted file mode 100644 index 9ca1529d9018..000000000000 --- a/.github/workflows/nv-torch-latest-cpu.yml +++ /dev/null @@ -1,49 +0,0 @@ -name: nv-torch-latest-cpu - -on: - pull_request: - paths-ignore: - - 'docs/**' - - 'blogs/**' - - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" - merge_group: - branches: [ master ] - schedule: - - cron: "0 0 * * *" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - unit-tests: - runs-on: ubuntu-20.04 - - steps: - - uses: actions/checkout@v3 - - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Install pytorch - run: | - pip install torch==1.12.0+cpu torchvision==0.13.0+cpu torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cpu - python -c "import torch; print('torch:', torch.__version__, torch)" - python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - - name: Install deepspeed - run: | - pip install .[dev,autotuning] - ds_report - - - name: Python environment - run: | - pip list - - - name: Unit tests - run: | - unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - cd tests - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -n 4 unit/ --torch_ver="1.12" - TRANSFORMERS_CACHE=/tmp/transformers_cache/ pytest $PYTEST_OPTS -m 'sequential' unit/ --torch_ver="1.12" diff --git a/.github/workflows/nv-torch-latest-v100.yml b/.github/workflows/nv-torch-latest-v100.yml index 8813a4bb2c4f..eba35ba7210a 100644 --- a/.github/workflows/nv-torch-latest-v100.yml +++ b/.github/workflows/nv-torch-latest-v100.yml @@ -1,12 +1,13 @@ name: nv-torch-latest-v100 on: + workflow_dispatch: pull_request: paths-ignore: - 'docs/**' - 'blogs/**' - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + - 'tests/unit/inference/v2/**' merge_group: branches: [ 
master ] schedule: @@ -18,17 +19,17 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -37,7 +38,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout 981c276 git rev-parse --short HEAD pip install . @@ -54,11 +55,5 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -n 4 unit/ --torch_ver="2.1" --cuda_ver="11.8" - coverage run --concurrency=multiprocessing -m pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.1" --cuda_ver="11.8" - - - name: Coverage report - run: | - cd tests - coverage combine - coverage report -m + pytest $PYTEST_OPTS --forked -n 8 unit/ --torch_ver="2.6" --cuda_ver="12.4" + pytest $PYTEST_OPTS --forked -m 'sequential' unit/ --torch_ver="2.6" --cuda_ver="12.4" diff --git a/.github/workflows/nv-torch-nightly-v100.yml b/.github/workflows/nv-torch-nightly-v100.yml index d0df6e546982..0013ed3f276f 100644 --- a/.github/workflows/nv-torch-nightly-v100.yml +++ b/.github/workflows/nv-torch-nightly-v100.yml @@ -1,8 +1,12 @@ name: nv-torch-nightly-v100 on: + workflow_dispatch: schedule: - cron: "0 0 * * *" + pull_request: + paths: + - '.github/workflows/nv-torch-nightly-v100.yml' concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -14,17 +18,17 @@ permissions: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv - name: Install pytorch run: | - pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 + pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" @@ -33,7 +37,7 @@ jobs: git clone https://github.com/huggingface/transformers cd transformers # if needed switch to the last known good SHA until transformers@master is fixed - # git checkout 1cc453d33 + git checkout 981c276 git rev-parse --short HEAD pip install . 
@@ -50,7 +54,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - pytest $PYTEST_OPTS --forked -n 4 unit/ + pytest $PYTEST_OPTS --forked -n 8 unit/ pytest $PYTEST_OPTS --forked -m 'sequential' unit/ - name: Open GitHub issue if nightly CI fails diff --git a/.github/workflows/nv-transformers-v100.yml b/.github/workflows/nv-transformers-v100.yml index 7753133f2886..9d1253fd77ca 100644 --- a/.github/workflows/nv-transformers-v100.yml +++ b/.github/workflows/nv-transformers-v100.yml @@ -6,7 +6,7 @@ on: - 'docs/**' - 'blogs/**' - 'deepspeed/inference/v2/**' - - "tests/unit/inference/v2/**" + - 'tests/unit/inference/v2/**' merge_group: branches: [ master ] schedule: @@ -18,10 +18,10 @@ concurrency: jobs: unit-tests: - runs-on: [self-hosted, nvidia, cu116, v100] + runs-on: [self-hosted, nvidia, cu124, v100] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: setup-venv uses: ./.github/workflows/setup-venv @@ -29,7 +29,7 @@ jobs: - name: Install pytorch run: | # use the same pytorch version as transformers CI - pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu118 --index-url https://download.pytorch.org/whl/cu118 + pip install -U --cache-dir $TORCH_CACHE torch==2.0.1+cu124 --index-url https://download.pytorch.org/whl/cu124 python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 279bad471c01..35f9502ecbc9 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -1,6 +1,7 @@ name: python on: + workflow_dispatch: pull_request: branches: '**' @@ -17,18 +18,18 @@ concurrency: cancel-in-progress: true jobs: - version-check: + unit-tests: strategy: matrix: - pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"] + pyVersion: ["3.8", "3.9", "3.10"] fail-fast: false - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 container: image: deepspeed/gh-builder:py${{ matrix.pyVersion }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: environment run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8e016b4169cb..4bddbc26be4a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -7,11 +7,11 @@ on: jobs: deploy: - runs-on: ubuntu-20.04 + runs-on: ubuntu-24.04 environment: release-env steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: "master" - id: setup-venv @@ -25,7 +25,9 @@ jobs: python release/check_release_version.py --release_version ${{ env.RELEASE_VERSION }} - name: Build DeepSpeed run: | - DS_BUILD_STRING=" " python setup.py sdist + pip install setuptools + pip install build + DS_BUILD_STRING=" " python -m build --sdist - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: @@ -35,7 +37,7 @@ jobs: run: | python release/bump_patch_version.py --current_version ${{ env.RELEASE_VERSION }} - name: Create Pull Request - uses: peter-evans/create-pull-request@v4 + uses: peter-evans/create-pull-request@v6 with: token: ${{ secrets.GH_PAT }} add-paths: | diff --git a/.github/workflows/setup-venv/action.yml b/.github/workflows/setup-venv/action.yml index ce2c458b9e57..9a88e0651860 100644 --- a/.github/workflows/setup-venv/action.yml +++ b/.github/workflows/setup-venv/action.yml @@ -22,7 +22,7 @@ runs: - id: set-env-vars run: | echo TEST_DATA_DIR=/blob/ >> $GITHUB_ENV - echo TRANSFORMERS_CACHE=/blob/transformers_cache/ >> $GITHUB_ENV + echo 
HF_HOME=/blob/hf_home/ >> $GITHUB_ENV echo TORCH_EXTENSIONS_DIR=./torch-extensions/ >> $GITHUB_ENV echo TORCH_CACHE=/blob/torch_cache/ >> $GITHUB_ENV echo HF_DATASETS_CACHE=/blob/datasets_cache/ >> $GITHUB_ENV diff --git a/.github/workflows/xpu-compile.yml b/.github/workflows/xpu-compile.yml new file mode 100644 index 000000000000..9e8bd9d792fb --- /dev/null +++ b/.github/workflows/xpu-compile.yml @@ -0,0 +1,65 @@ +name: xpu-compile + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-compile.yml" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + compile-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v4 + - name: Install prerequisite + run: | + apt-get update + apt-get install clinfo libaio-dev python3-pip -y + pip install torch==2.3.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.3.110+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.3.100+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.18.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ + pip install https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v3.0.0b2/triton_xpu-3.0.0b2-cp310-cp310-linux_x86_64.whl + pip install py-cpuinfo numpy + pip install .[dev,autotuning] + + - name: Check container state + run: | + ldd --version + ds_report + python3 -c "import torch; print('torch:', torch.__version__, torch)" + python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + pip list + + - name: Compile Status + shell: bash + run: | + echo "# torch.compile graph breaks" >> $GITHUB_STEP_SUMMARY + export FI_HMEM=system + ulimit -n 1048575 + cd tests/torch_compile + export ZE_AFFINITY_MASK=0,1 + echo "## ZeRO stage 3" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z3.json 2>&1 | tee log_z3.txt + # for each line start with 'dynamo_output', extract the second field and following fields and append to GITHUB_STEP_SUMMARY using awk + cat log_z3.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY + echo "## ZeRO stage 2" >> $GITHUB_STEP_SUMMARY + deepspeed test_compile.py --deepspeed_config ds_config_z2.json 2>&1 | tee log_z2.txt + cat log_z2.txt | awk '/^dynamo_output/ {$1=""; print $0}' >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/xpu-max1100.yml b/.github/workflows/xpu-max1100.yml new file mode 100644 index 000000000000..2d84f8f60571 --- /dev/null +++ b/.github/workflows/xpu-max1100.yml @@ -0,0 +1,96 @@ +name: xpu-max1100 + +on: + workflow_dispatch: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - ".github/workflows/xpu-max1100.yml" + - "accelerator/xpu_accelerator.py" + - "accelerator/abstract_accelerator.py" + - "accelerator/cpu_accelerator.py" + - "accelerator/real_accelerator.py" + - "csrc/xpu/**" + - "deepspeed/runtime/engine.py" + - 
"deepspeed/runtime/bf16_optimizer.py" + - "deepspeed/runtime/zero/stage_1_and_2.py" + - "deepspeed/runtime/zero/stage3.py" + - "deepspeed/runtime/zero/partition_parameters.py" + - "deepspeed/runtime/zero/partitioned_param_coordinator.py" + - "deepspeed/runtime/zero/parameter_offload.py" + - "deepspeed/runtime/pipe/engine.py" + - "deepspeed/runtime/utils.py" + - "op_builder/xpu/**" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + + +jobs: + unit-tests: + runs-on: [self-hosted, intel, xpu] + container: + image: intel/oneapi-basekit:2025.0.1-0-devel-ubuntu24.04 + ports: + - 80 + options: --privileged -it --rm --device /dev/dri:/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --ipc=host --cap-add=ALL + + steps: + - uses: actions/checkout@v4 + - name: Install prerequisite + shell: bash + run: | + apt-get update + apt-get install clinfo libaio-dev python3-pip python3.12-venv -y + python3 -m venv ~/ds_env + source ~/ds_env/bin/activate + pip install torch==2.5.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torch/ + pip install intel-extension-for-pytorch==2.5.10+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/intel-extension-for-pytorch/ + pip install oneccl_bind_pt==2.5.0+xpu -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/oneccl-bind-pt/ + pip install torchvision==0.20.1 -f https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/torchvision/ + pip install py-cpuinfo numpy + pip install .[dev,autotuning] + + - name: Check container state + shell: bash + run: | + source ~/ds_env/bin/activate + ldd --version + ds_report + python3 -c "import torch; print('torch:', torch.__version__, torch)" + python3 -c "import torch; import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available())" + python3 -c "from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name)" + pip list + + - name: Unit tests + shell: bash + run: | + source ~/ds_env/bin/activate + cd tests/unit + pytest --verbose accelerator/* + pytest --verbose autotuning/* + pytest --verbose checkpoint/test_reshape_checkpoint.py + pytest --verbose checkpoint/test_moe_checkpoint.py + pytest --verbose checkpoint/test_shared_weights.py + pytest --verbose launcher/test_ds_arguments.py launcher/test_run.py + pytest --verbose model_parallelism/* + pytest --verbose moe/test_moe_tp.py + pytest --verbose monitor/* + pytest --verbose utils/* + pytest --verbose runtime/test_ds_config_model.py + pytest --verbose runtime/pipe/test_pipe_schedule.py + pytest --verbose runtime/zero/test_zero_config.py + pytest --verbose runtime/zero/test_zero_tiled.py + pytest --verbose runtime/zero/test_zeropp.py + pytest --verbose runtime/test_autocast.py + pytest --verbose runtime/test_data.py + pytest --verbose runtime/test_runtime_utils.py + pytest --verbose runtime/activation_checkpointing/* + pytest --verbose runtime/utils/* + pytest --verbose runtime/zero/test_zero_dynamic_class.py diff --git a/.gitignore b/.gitignore index 5b9cc7ac3156..db6790886cb4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,40 +1,62 @@ -# Ignore Python compiled files +## Ignore Python compiled files *.pyc -# Ignore IDE-specific files and directories -.idea/ # JetBrains IDE settings -.vscode/ # Visual Studio Code settings -.theia/ # Theia IDE settings - -# Ignore temporary and backup files -*~ # General backup files -*.swp # Vim swap files - -# Ignore log files +## Ignore IDE-specific files 
and directories +# JetBrains IDE settings +.idea/ +# Visual Studio Code settings +.vscode/ +# Theia IDE settings +.theia/ + +## Ignore temporary and backup files +# General backup files +*~ +# Vim swap files +*.swp + +## Ignore log files *.log -# Ignore a specific generated file +## Ignore a specific generated file deepspeed/git_version_info_installed.py -# Ignore Python bytecode cache +## Ignore Python bytecode cache __pycache__ -# Build + installation data -build/ # Build artifacts -dist/ # Distribution files -*.so # Compiled shared objects -deepspeed.egg-info/ # Deepspeed package info -build.txt # Build information - -# Website generated files -docs/_site/ # Jekyll generated site -docs/build # Generated documentation +## Build + installation data +# Build artifacts +build/ +# Distribution files +dist/ +# Compiled shared objects +*.so +# Deepspeed package info +deepspeed.egg-info/ +# Build information +build.txt + +## Website generated files +# Jekyll generated site +docs/_site/ +# Generated documentation +docs/build docs/code-docs/source/_build docs/code-docs/_build docs/code-docs/build -.sass-cache/ # SASS cache -.jekyll-cache/ # Jekyll cache +# SASS cache +.sass-cache/ +# Jekyll cache +.jekyll-cache/ .jekyll-metadata -# Testing data -tests/unit/saved_checkpoint/ # Saved checkpoints for testing +## Testing data +# Saved checkpoints for testing +tests/unit/saved_checkpoint/ + +# HIP files created during AMD compilation +*_hip.cpp +*_hip.h +*.hip +*.cuh +*hip_layers.h diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2432a7a24124..9a7bb1c9b371 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: check-useless-excludes - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v5.0.0 hooks: - id: check-case-conflict - id: check-json @@ -23,7 +23,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/google/yapf - rev: v0.32.0 + rev: v0.40.0 hooks: - id: yapf @@ -59,13 +59,13 @@ repos: # Do not check files that are automatically generated '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json', '--ignore-regex=\\n', # Do not count the 'n' in an escaped newline as part of a word - '--ignore-words-list=youn,unsupport,noe', # Word used in error messages that need rewording + '--ignore-words-list=youn,unsupport,noe,cann', # Word used in error messages that need rewording --check-filenames, --check-hidden ] - repo: https://github.com/pycqa/flake8 - rev: 4.0.1 + rev: 5.0.4 hooks: - id: flake8 args: ['--config=.flake8'] @@ -76,5 +76,14 @@ repos: name: check-torchcuda entry: ./scripts/check-torchcuda.py language: python - exclude: ^(.github/workflows/|scripts/check-torchcuda.py|docs/_tutorials/accelerator-abstraction-interface.md|accelerator/cuda_accelerator.py|deepspeed/inference/engine.py|deepspeed/model_implementations/transformers/clip_encoder.py|deepspeed/model_implementations/diffusers/vae.py|deepspeed/model_implementations/diffusers/unet.py|op_builder/spatial_inference.py|op_builder/transformer_inference.py|op_builder/builder.py|setup.py|tests/unit/ops/sparse_attention/test_sparse_attention.py) + exclude: 
^(.github/workflows/|scripts/check-torchcuda.py|docs/_tutorials/accelerator-abstraction-interface.md|docs/_tutorials/deepnvme.md|accelerator/cuda_accelerator.py|deepspeed/inference/engine.py|deepspeed/model_implementations/transformers/clip_encoder.py|deepspeed/model_implementations/diffusers/vae.py|deepspeed/model_implementations/diffusers/unet.py|op_builder/spatial_inference.py|op_builder/transformer_inference.py|op_builder/builder.py|setup.py|tests/unit/ops/sparse_attention/test_sparse_attention.py) # Specific deepspeed/ files are excluded for now until we wrap ProcessGroup in deepspeed.comm + +- repo: local + hooks: + - id: check-extraindexurl + name: check-extraindexurl + entry: ./scripts/check-extraindexurl.py + language: python + files: \.(yml|yaml|sh|py)$ + exclude: ^(scripts/check-extraindexurl.py) diff --git a/CODEOWNERS b/CODEOWNERS index 2410b3ebc09b..b0d3b8b0d77b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -7,50 +7,53 @@ # top-level repo folders -/.github/ @jeffra @mrwyattii @loadams -/azure/ @jeffra @awan-10 -/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith -/bin/ @jeffra -/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb -/deepspeed/ @jeffra -/docker/ @jeffra @awan-10 -/docs/ @jeffra @mrwyattii -/examples/ @jeffra @awan-10 @mrwyattii -/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2 -/release/ @jeffra @mrwyattii -/requirements/ @jeffra @mrwyattii -/scripts/ @jeffra @awan-10 -/tests/ @jeffra @mrwyattii @tjruwase +/.github/ @loadams +/azure/ @loadams +/benchmarks/ @guanhuawang @tjruwase +/bin/ @loadams +/csrc/ @tjruwase +/deepspeed/ @loadams @tjruwase +/docker/ @loadams @guanhuawang +/docs/ @loadams @tjruwase +/examples/ @jomayeri @tohtana +/op_builder/ @loadams @tjruwase @jomayeri +/release/ @loadams @jomayeri +/requirements/ @loadams +/scripts/ @loadams @tjruwase +/tests/ @tjruwase @loadams @tohtana # deepspeed -/deepspeed/autotuning/ @cli99 +/deepspeed/autotuning/ @loadams /deepspeed/checkpoint/ @tjruwase -/deepspeed/comm/ @awan-10 -/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli -/deepspeed/elasticity/ @jeffra @awan-10 -/deepspeed/launcher/ @jeffra @awan-10 -/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/moe/ @awan-10 -/deepspeed/monitor/ @awan-10 @jeffra -/deepspeed/nebula/ @tjruwase @jeffra -/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/pipe/ @ShadenSmith @duli2012 -/deepspeed/profiling/ @cli99 -/deepspeed/utils/ @jeffra @tjruwase @awan-10 +/deepspeed/comm/ @guanhuawang +/deepspeed/compression/ @tjruwase +/deepspeed/elasticity/ @tjruwase +/deepspeed/launcher/ @loadams +/deepspeed/module_inject/ @hwchen2017 @loadams +/deepspeed/moe/ @tohtana +/deepspeed/monitor/ @tjruwase +/deepspeed/nebula/ @tjruwase +/deepspeed/nvme/ @tjruwase @jomayeri +/deepspeed/ops/ @tohtana +/deepspeed/pipe/ @tohtana @loadams +/deepspeed/profiling/ @loadams +/deepspeed/sequence/ @tohtana +/deepspeed/utils/ @tjruwase @tohtana # inference -/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb -/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb +/deepspeed/inference/ @hwchen2017 @tohtana +/deepspeed/model_implementations/@tohtana @loadams # training -/deepspeed/runtime/ @jeffra @tjruwase -/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase -/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra -/deepspeed/runtime/comm/ @awan-10 
-/deepspeed/runtime/compression/ @awan-10 @conglongli -/deepspeed/runtime/data_pipeline/ @conglongli -/deepspeed/runtime/fp16/ @jeffra @tjruwase -/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10 -/deepspeed/runtime/pipe/ @ShadenSmith @duli2012 -/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii -/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii +/deepspeed/runtime/ @tjruwase @tohtana +/deepspeed/runtime/activation_checkpointing/ @tjruwase +/deepspeed/runtime/checkpoint_engine/ @tjruwase +/deepspeed/runtime/comm/ @guanhuawang +/deepspeed/runtime/compression/ @tjruwase +/deepspeed/runtime/data_pipeline/ @tjruwase +/deepspeed/runtime/domino/ @guanhuawang @hwchen2017 +/deepspeed/runtime/fp16/ @tjruwase @tohtana +/deepspeed/runtime/fp16/onebit/ @tjruwase +/deepspeed/runtime/pipe/ @loadams @tohtana +/deepspeed/runtime/swap_tensor/ @tjruwase @jomayeri +/deepspeed/runtime/zero/ @tjruwase @tohtana diff --git a/COMMITTERS.md b/COMMITTERS.md new file mode 100644 index 000000000000..8418bdf8629d --- /dev/null +++ b/COMMITTERS.md @@ -0,0 +1,11 @@ +# DeepSpeed TSC Committers # + +| Name | GitHub ID | Affiliation +|--- | ---- | --- | +| Olatunji Ruwase | [tjruwase](https://github.com/tjruwase) | Microsoft | +| Logan Adams | [loadams](https://github.com/loadams) | Microsoft | +| Masahiro Tanaka | [tohtana](https://github.com/tohtana) | Microsoft | +| Jeff Rasley | [jeffra](https://github.com/jeffra) | SnowFlake | +| Minjia Zhang | [minjiazhang](https://github.com/minjiazhang) | UIUC | +| Ashwin Aji | [ashwinma](https://github.com/ashwinma) | AMD | +| Sam Foreman | [saforem2](https://github.com/saforem2) | Argonne National Laboratory | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f6e5f39869eb..bfc22afb5359 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,7 +13,7 @@ pre-commit install Afterwards, our suite of formatting tests run automatically before each `git commit`. You can also run these manually: ```bash -pre-commit run --all-files +pre-commit run --files $(git diff --name-only master) ``` If a formatting test fails, it will fix the modified code in place and abort the `git commit`. After looking over the changes, you can `git add ` @@ -23,7 +23,7 @@ and then repeat the previous `git commit` command. ## Testing DeepSpeed tracks two types of tests: unit tests and more costly model convergence tests. The model convergence tests train -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) and measure +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) and measure end-to-end convergence and related metrics. Unit tests are found in `tests/unit/` and the model convergence tests are found in `tests/model/`. @@ -40,7 +40,7 @@ tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) an ### Model Tests To execute model tests, first [install DeepSpeed](#installation). The -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/) repository is cloned +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/) repository is cloned as part of this process. Next, execute the model test driver: ```bash cd tests/model/ @@ -85,8 +85,8 @@ Based on the issue we shall discuss the merit of the new feature and decide whet ### Step 2: implementation and verification Contributor will go ahead and implement the feature, and the DeepSpeed team will provide guidance/helps as needed. 
The required deliverables include: -* A PR to [microsoft/DeepSpeed](https://github.com/microsoft/DeepSpeed) including (1) the feature implementation (2) unit tests (3) documentation (4) tutorial -* A PR to [microsoft/DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) or [microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) including the examples of how to use the feature (this is related to the planned testing experiments in proposal) +* A PR to [deepspeedai/DeepSpeed](https://github.com/deepspeedai/DeepSpeed) including (1) the feature implementation (2) unit tests (3) documentation (4) tutorial +* A PR to [deepspeedai/DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) or [deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) including the examples of how to use the feature (this is related to the planned testing experiments in proposal) * In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance. After receiving the PRs, we will review them and merge them after necessary tests/fixes. diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 000000000000..d488ec55114e --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,101 @@ + +# DeepSpeed Project Charter and Governance + +This charter sets forth the responsibilities and procedures for technical contribution to, and oversight of, the DeepSpeed open source project. All contributors (including committers, maintainers, and other technical positions) and other participants in the Project (collectively, "Collaborators") must comply with the terms of this Charter. + +## Mission and Scope of the Project + +The mission of the Project is to DeepSpeed is a deep learning optimization library that makes distributed training and inference easy, efficient, and effective. + +The scope of the Project includes collaborative development under the Project License (as defined herein) supporting the mission, including documentation, testing, integration, and the creation of other artifacts that aid the development, deployment, operation, or adoption of the open source project. + +## Technical Steering Committee + +1. The Technical Steering Committee (the "TSC") will be responsible for all technical oversight of the open source Project. + +2. The TSC voting members are initially the Project's Committers. At the inception of the project, the Committers of the Project will be as set forth within the "CONTRIBUTING" file within the Project's code repository. The TSC may choose an alternative approach for determining the voting members of the TSC, and any such alternative approach will be documented in the CONTRIBUTING file. Any meetings of the Technical Steering Committee are intended to be open to the public, and can be conducted electronically, via teleconference, or in person. + +3. TSC projects generally will involve Contributors and Committers. The TSC may adopt or modify roles so long as the roles are documented in the CONTRIBUTING file. Unless otherwise documented: + + - **Contributors** include anyone in the technical community that contributes code, documentation, or other technical artifacts to the Project. + - **Committers** are Contributors who have earned the ability to modify ("commit") source code, documentation, or other technical artifacts in a project's repository. 
+ + - A Contributor may become a Committer by a majority approval of the existing Committers. A Committer may be removed by a majority approval of the other existing Committers. + +4. Participation in the Project through becoming a Contributor and Committer is open to anyone so long as they abide by the terms of this Charter. + +5. The TSC may: + - Establish workflow procedures for the submission, approval, and closure/archiving of projects. + - Set requirements for the promotion of Contributors to Committer status, as applicable. + - Amend, adjust, refine and/or eliminate the roles of Contributors and Committers, and create new roles, and publicly document any TSC roles, as it sees fit. + +6. The TSC may elect a TSC Chair, who will preside over meetings of the TSC and will serve until their resignation or replacement by the TSC. The TSC Chair, or any other TSC member so designated by the TSC, will serve as the primary communication contact between the Project and AI & Data, a directed fund of The Linux Foundation. + +7. Responsibilities: The TSC will be responsible for all aspects of oversight relating to the Project, which may include: + + - Coordinating the technical direction of the Project. + - Approving project or system proposals (including, but not limited to, incubation, deprecation, and changes to a sub-project's scope). + - Organizing sub-projects and removing sub-projects. + - Creating sub-committees or working groups to focus on cross-project technical issues and requirements. + - Appointing representatives to work with other open source or open standards communities. + - Establishing community norms, workflows, issuing releases, and security issue reporting policies. + - Approving and implementing policies and processes for contributing (to be published in the CONTRIBUTING file) and coordinating with the series manager of the Project (as provided for in the Series Agreement, the "Series Manager") to resolve matters or concerns that may arise as set forth in Section 7 of this Charter. + - Discussions, seeking consensus, and where necessary, voting on technical matters relating to the code base that affect multiple projects. + - Coordinating any marketing, events, or communications regarding the Project. + +## TSC Voting + +1. While the Project aims to operate as a consensus-based community, if any TSC decision requires a vote to move the Project forward, the voting members of the TSC will vote on a one vote per voting member basis. + +2. Quorum for TSC meetings requires at least fifty percent of all voting members of the TSC to be present. The TSC may continue to meet if quorum is not met but will be prevented from making any decisions at the meeting. + +3. Except as provided in Section 7.c. and 8.a, decisions by vote at a meeting require a majority vote of those in attendance, provided quorum is met. Decisions made by electronic vote without a meeting require a majority vote of all voting members of the TSC. + +4. In the event a vote cannot be resolved by the TSC, any voting member of the TSC may refer the matter to the Series Manager for assistance in reaching a resolution. + +## Compliance with Policies + +1. This Charter is subject to the Series Agreement for the Project and the Operating Agreement of LF Projects. Contributors will comply with the policies of LF Projects as may be adopted and amended by LF Projects, including, without limitation, the policies listed at https://lfprojects.org/policies/. + +2. 
The TSC may adopt a code of conduct ("CoC") for the Project, which is subject to approval by the Series Manager. In the event that a Project-specific CoC has not been approved, the LF Projects Code of Conduct listed at https://lfprojects.org/policies will apply for all Collaborators in the Project. + +3. When amending or adopting any policy applicable to the Project, LF Projects will publish such policy, as to be amended or adopted, on its website at least 30 days prior to such policy taking effect; provided, however, that in the case of any amendment of the Trademark Policy or Terms of Use of LF Projects, any such amendment is effective upon publication on LF Project's website. + +4. All Collaborators must allow open participation from any individual or organization meeting the requirements for contributing under this Charter and any policies adopted for all Collaborators by the TSC, regardless of competitive interests. Put another way, the Project community must not seek to exclude any participant based on any criteria, requirement, or reason other than those that are reasonable and applied on a non-discriminatory basis to all Collaborators in the Project community. + +5. The Project will operate in a transparent, open, collaborative, and ethical manner at all times. The output of all Project discussions, proposals, timelines, decisions, and status should be made open and easily visible to all. Any potential violations of this requirement should be reported immediately to the Series Manager. + +## Community Assets + +1. LF Projects will hold title to all trade or service marks used by the Project ("Project Trademarks"), whether based on common law or registered rights. Project Trademarks will be transferred and assigned to LF Projects to hold on behalf of the Project. Any use of any Project Trademarks by Collaborators in the Project will be in accordance with the license from LF Projects and inure to the benefit of LF Projects. + +2. The Project will, as permitted and in accordance with such license from LF Projects, develop and own all Project GitHub and social media accounts, and domain name registrations created by the Project community. + +3. Under no circumstances will LF Projects be expected or required to undertake any action on behalf of the Project that is inconsistent with the tax-exempt status or purpose, as applicable, of the Joint Development Foundation or LF Projects, LLC. + +## General Rules and Operations + +The Project will: + +1. Engage in the work of the Project in a professional manner consistent with maintaining a cohesive community, while also maintaining the goodwill and esteem of LF Projects, Joint Development Foundation, and other partner organizations in the open source community. +2. Respect the rights of all trademark owners, including any branding and trademark usage guidelines. + +## Intellectual Property Policy + +1. Collaborators acknowledge that the copyright in all new contributions will be retained by the copyright holder as independent works of authorship and that no contributor or copyright holder will be required to assign copyrights to the Project. + +2. Except as described in Section 7.c., all contributions to the Project are subject to the following: + + - All new inbound code contributions to the Project must be made using Apache License, Version 2.0 available at http://www.apache.org/licenses/LICENSE-2.0 (the "Project License"). 
+ - All new inbound code contributions must also be accompanied by a Developer Certificate of Origin (http://developercertificate.org) sign-off in the source code system that is submitted through a TSC-approved contribution process which will bind the authorized contributor and, if not self-employed, their employer to the applicable license. + - All outbound code will be made available under the Project License. + - Documentation will be received and made available by the Project under the Creative Commons Attribution 4.0 International License (available at http://creativecommons.org/licenses/by/4.0/). + - The Project may seek to integrate and contribute back to other open source projects ("Upstream Projects"). In such cases, the Project will conform to all license requirements of the Upstream Projects, including dependencies, leveraged by the Project. Upstream Project code contributions not stored within the Project's main code repository will comply with the contribution process and license terms for the applicable Upstream Project. + +3. The TSC may approve the use of an alternative license or licenses for inbound or outbound contributions on an exception basis. To request an exception, please describe the contribution, the alternative open source license(s), and the justification for using an alternative open source license for the Project. License exceptions must be approved by a two-thirds vote of the entire TSC. + +4. Contributed files should contain license information, such as SPDX short form identifiers, indicating the open source license or licenses pertaining to the file. + +## Amendments + +1. This charter may be amended by a two-thirds vote of the entire TSC and is subject to approval by LF Projects. diff --git a/MANIFEST.in b/MANIFEST.in index ab79573ef96c..8d84aee0faf4 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,8 +2,8 @@ include *.txt README.md include deepspeed/inference/v2/kernels/ragged_ops/libs/*.so include deepspeed/inference/v2/kernels/cutlass_ops/libs/*.so recursive-include requirements *.txt -recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json -recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc +recursive-include deepspeed *.cpp *.h *.hpp *.cu *.hip *.tr *.cuh *.cc *.json +recursive-include csrc *.cpp *.h *.hpp *.cu *.tr *.cuh *.cc recursive-include op_builder *.py recursive-include benchmarks *.py recursive-include accelerator *.py diff --git a/README.md b/README.md index 783687f96f85..233baa31f4ab 100755 --- a/README.md +++ b/README.md @@ -1,10 +1,4 @@ -[![License Apache 2.0](https://badgen.net/badge/license/apache2.0/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE) -[![PyPI version](https://badge.fury.io/py/deepspeed.svg)](https://pypi.org/project/deepspeed/) -[![Downloads](https://static.pepy.tech/badge/deepspeed)](https://pepy.tech/project/deepspeed) -[![Build](https://badgen.net/badge/build/check-status/blue)](#build-pipeline-status) -[![Twitter](https://img.shields.io/twitter/follow/MSFTDeepSpeed)](https://twitter.com/intent/follow?screen_name=MSFTDeepSpeed) -[![Japanese Twitter](https://img.shields.io/badge/%E6%97%A5%E6%9C%AC%E8%AA%9ETwitter-%40MSFTDeepSpeedJP-blue)](https://twitter.com/MSFTDeepSpeedJP) -[![Chinese Zhihu](https://img.shields.io/badge/%E7%9F%A5%E4%B9%8E-%E5%BE%AE%E8%BD%AFDeepSpeed-blue)](https://www.zhihu.com/people/deepspeed) +[![License MIT](https://badgen.net/badge/license/MIT/blue)](https://github.com/Microsoft/DeepSpeed/blob/master/LICENSE)
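As a side note on the MANIFEST.in hunk just above (which adds `*.hpp` headers to the packaged sources) together with the release workflow's switch to `python -m build --sdist`: the snippet below is a small, hypothetical verification helper, not part of this patch; the `dist/` location and archive name pattern are assumptions about a local build.

```python
# Hypothetical helper, not part of this patch: after running
# `python -m build --sdist`, confirm the newly included *.hpp headers
# actually land in the source distribution.
import glob
import tarfile

# Assumes the sdist was written to ./dist/ and follows the usual naming.
sdist_path = sorted(glob.glob("dist/deepspeed-*.tar.gz"))[-1]

with tarfile.open(sdist_path, "r:gz") as sdist:
    hpp_members = [m.name for m in sdist.getmembers() if m.name.endswith(".hpp")]

print(f"{sdist_path}: {len(hpp_members)} .hpp files packaged")
for name in hpp_members[:10]:
    print(" ", name)
```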
@@ -12,259 +6,11 @@
-## Latest News - DeepSpeed empowers ChatGPT-like model training with a single click, offering 15x speedup over SOTA RLHF systems with unprecedented cost reduction at all scales; [learn how](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat). +## DeeperSpeed -* [2023/11] [DeepSpeed ZeRO-Offload++: 6x Higher Training Throughput via Collaborative CPU/GPU Twin-Flow](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-offloadpp) -* [2023/11] [DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen)] [[中文](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md)] -* [2023/10] [DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md) [[English](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md)] -* [2023/09] Announcing the DeepSpeed4Science Initiative: Enabling large-scale scientific discovery through sophisticated AI system technologies [[DeepSpeed4Science website](https://deepspeed4science.ai/)] [[Tutorials](https://www.deepspeed.ai/deepspeed4science/)] [[White paper](https://arxiv.org/abs/2310.04610)] [[Blog](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/)] [[中文](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md)] [[日本語](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md)] -* [2023/08] [DeepSpeed ZeRO-Inference: 20x faster inference through weight quantization and KV cache offloading](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) +DeeperSpeed is a fork of the [Deepspeed](https://github.com/microsoft/DeepSpeed) library that is tailor-made for the [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) by [EleutherAI](https://www.eleuther.ai/). - -
- <summary>More news</summary>
- ---- - -# Extreme Speed and Scale for DL Training and Inference - -***[DeepSpeed](https://www.deepspeed.ai/) enables world's most powerful language models like [MT-530B](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) and [BLOOM](https://huggingface.co/blog/bloom-megatron-deepspeed)***. It is an easy-to-use deep learning optimization software suite that powers unprecedented scale and speed for both training and inference. With DeepSpeed you can: - -* Train/Inference dense or sparse models with billions or trillions of parameters -* Achieve excellent system throughput and efficiently scale to thousands of GPUs -* Train/Inference on resource constrained GPU systems -* Achieve unprecedented low latency and high throughput for inference -* Achieve extreme compression for an unparalleled inference latency and model size reduction with low costs - ---- - -# DeepSpeed's four innovation pillars - - - - -## DeepSpeed-Training - -DeepSpeed offers a confluence of system innovations, that has made large scale DL training effective, and efficient, greatly improved ease of use, and redefined the DL training landscape in terms of scale that is possible. These innovations such as ZeRO, 3D-Parallelism, DeepSpeed-MoE, ZeRO-Infinity, etc. fall under the training pillar. Learn more: [DeepSpeed-Training](https://www.deepspeed.ai/training/) - -## DeepSpeed-Inference - -DeepSpeed brings together innovations in parallelism technology such as tensor, pipeline, expert and ZeRO-parallelism, and combines them with high performance custom inference kernels, communication optimizations and heterogeneous memory technologies to enable inference at an unprecedented scale, while achieving unparalleled latency, throughput and cost reduction. This systematic composition of system technologies for inference falls under the inference pillar. Learn more: [DeepSpeed-Inference](https://www.deepspeed.ai/inference) - - -## DeepSpeed-Compression - -To further increase the inference efficiency, DeepSpeed offers easy-to-use and flexible-to-compose compression techniques for researchers and practitioners to compress their models while delivering faster speed, smaller model size, and significantly reduced compression cost. Moreover, SoTA innovations on compression like ZeroQuant and XTC are included under the compression pillar. Learn more: [DeepSpeed-Compression](https://www.deepspeed.ai/compression) - -## DeepSpeed4Science - -In line with Microsoft's mission to solve humanity's most pressing challenges, the DeepSpeed team at Microsoft is responding to this opportunity by launching a new initiative called *DeepSpeed4Science*, aiming to build unique capabilities through AI system technology innovations to help domain experts to unlock today's biggest science mysteries. Learn more: [DeepSpeed4Science website](https://deepspeed4science.ai/) and [tutorials](https://www.deepspeed.ai/deepspeed4science/) - ---- - -# DeepSpeed Software Suite - -## DeepSpeed Library - - The [DeepSpeed](https://github.com/microsoft/deepspeed) library (this repository) implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for easy composition of multitude of features within a single training, inference or compression pipeline. 
The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). - -## Model Implementations for Inference (MII) - - [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. - -## DeepSpeed on Azure - - DeepSpeed users are diverse and have access to different environments. We recommend to try DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). - ---- - -# DeepSpeed Adoption - -DeepSpeed is an important part of Microsoft’s new -[AI at Scale](https://www.microsoft.com/en-us/research/project/ai-at-scale/) -initiative to enable next-generation AI capabilities at scale, where you can find more -information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale). - -DeepSpeed has been used to train many different large-scale models, below is a list of several examples that we are aware of (if you'd like to include your model please submit a PR): - - * [Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/) - * [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf) - * [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed) - * [GLM (130B)](https://github.com/THUDM/GLM-130B) - * [xTrimoPGLM (100B)](https://www.biorxiv.org/content/10.1101/2023.07.05.547496v2) - * [YaLM (100B)](https://github.com/yandex/YaLM-100B) - * [GPT-NeoX (20B)](https://github.com/EleutherAI/gpt-neox) - * [AlexaTM (20B)](https://www.amazon.science/blog/20b-parameter-alexa-model-sets-new-marks-in-few-shot-learning) - * [Turing NLG (17B)](https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/) - * [METRO-LM (5.4B)](https://arxiv.org/pdf/2204.06644.pdf) - -DeepSpeed has been integrated with several different popular open-source DL frameworks such as: - -| | Documentation | -| ---------------------------------------------------------------------------------------------- | -------------------------------------------- | - | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | -| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | -| | [Lightning with DeepSpeed](https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed) | -| | [MosaicML with 
DeepSpeed](https://docs.mosaicml.com/projects/composer/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | -| | [Determined with DeepSpeed](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) | -| | [MMEngine with DeepSpeed](https://mmengine.readthedocs.io/en/latest/common_usage/large_model_training.html#deepspeed) | - ---- - -# Build Pipeline Status - -| Description | Status | -| ----------- | ------ | -| NVIDIA | [![nv-torch110-p40](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-p40.yml) [![nv-torch110-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch110-v100.yml) [![nv-torch-latest-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-v100.yml) [![nv-h100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-h100.yml) [![nv-inference](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-inference.yml) [![nv-nightly](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-nightly.yml) | -| AMD | [![amd-mi100](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi100.yml) [![amd-mi200](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/amd-mi200.yml) | -| CPU | [![nv-torch-latest-cpu](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-latest-cpu.yml) | -| PyTorch Nightly | [![nv-torch-nightly-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-torch-nightly-v100.yml) | -| Integrations | [![nv-transformers-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-transformers-v100.yml) [![nv-lightning-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-lightning-v100.yml) [![nv-accelerate-v100](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-accelerate-v100.yml) [![nv-megatron](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-megatron.yml) [![nv-mii](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-mii.yml) 
[![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) | -| Misc | [![Formatting](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/formatting.yml) [![pages-build-deployment](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/microsoft/DeepSpeed/actions/workflows/pages/pages-build-deployment) [![Documentation Status](https://readthedocs.org/projects/deepspeed/badge/?version=latest)](https://deepspeed.readthedocs.io/en/latest/?badge=latest)[![python](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/python.yml) | - -# Installation - -The quickest way to get started with DeepSpeed is via pip, this will install -the latest release of DeepSpeed which is not tied to specific PyTorch or CUDA -versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer -to as our 'ops'. By default, all of these extensions/ops will be built -just-in-time (JIT) using [torch's JIT C++ extension loader that relies on -ninja](https://pytorch.org/docs/stable/cpp_extension.html) to build and -dynamically link them at runtime. - -## Requirements -* [PyTorch](https://pytorch.org/) must be installed _before_ installing DeepSpeed. -* For full feature support we recommend a version of PyTorch that is >= 1.9 and ideally the latest PyTorch stable release. -* A CUDA or ROCm compiler such as [nvcc](https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#introduction) or [hipcc](https://github.com/ROCm-Developer-Tools/HIPCC) used to compile C++/CUDA/HIP extensions. -* Specific GPUs we develop and test against are listed below, this doesn't mean your GPU will not work if it doesn't fall into this category it's just DeepSpeed is most well tested on the following: - * NVIDIA: Pascal, Volta, Ampere, and Hopper architectures - * AMD: MI100 and MI200 - -## PyPI -We regularly push releases to [PyPI](https://pypi.org/project/deepspeed/) and encourage users to install from there in most cases. - -```bash -pip install deepspeed -``` - -After installation, you can validate your install and see which extensions/ops -your machine is compatible with via the DeepSpeed environment report. - -```bash -ds_report -``` - -If you would like to pre-install any of the DeepSpeed extensions/ops (instead -of JIT compiling) or install pre-compiled ops via PyPI please see our [advanced -installation instructions](https://www.deepspeed.ai/tutorials/advanced-install/). - -## Windows -Windows support is partially supported with DeepSpeed. On Windows you can build wheel with following steps, currently only inference mode is supported. -1. Install pytorch, such as pytorch 1.8 + cuda 11.1 -2. Install visual cpp build tools, such as VS2019 C++ x64/x86 build tools -3. Launch cmd console with Administrator privilege for creating required symlink folders -4. Run `python setup.py bdist_wheel` to build wheel in `dist` folder - -# Features - -Please checkout [DeepSpeed-Training](https://www.deepspeed.ai/training), [DeepSpeed-Inference](https://www.deepspeed.ai/inference) and [DeepSpeed-Compression](https://www.deepspeed.ai/compression) pages for full set of features offered along each of these three pillars. 
- -# Further Reading - -All DeepSpeed documentation, tutorials, and blogs can be found on our website: [deepspeed.ai](https://www.deepspeed.ai/) - - -| | Description | -| ---------------------------------------------------------------------------------------------- | -------------------------------------------- | -| [Getting Started](https://www.deepspeed.ai/getting-started/) | First steps with DeepSpeed | -| [DeepSpeed JSON Configuration](https://www.deepspeed.ai/docs/config-json/) | Configuring DeepSpeed | -| [API Documentation](https://deepspeed.readthedocs.io/en/latest/) | Generated DeepSpeed API documentation | -| [Tutorials](https://www.deepspeed.ai/tutorials/) | Tutorials | -| [Blogs](https://www.deepspeed.ai/posts/) | Blogs | - - -# Contributing -DeepSpeed welcomes your contributions! Please see our -[contributing](CONTRIBUTING.md) guide for more details on formatting, testing, -etc.
-Thanks so much to all of our amazing contributors! - - - - - -## Contributor License Agreement -This project welcomes contributions and suggestions. Most contributions require you to -agree to a Contributor License Agreement (CLA) declaring that you have the right to, and -actually do, grant us the rights to use your contribution. For details, visit -https://cla.opensource.microsoft.com. - -When you submit a pull request, a CLA bot will automatically determine whether you need -to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply -follow the instructions provided by the bot. You will only need to do this once across -all repos using our CLA. - -## Code of Conduct -This project has adopted the [Microsoft Open Source Code of -Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the -[Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact -[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. - -# Publications -1. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He. (2019) ZeRO: memory optimizations toward training trillion parameter models. [arXiv:1910.02054](https://arxiv.org/abs/1910.02054) and [In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (SC '20)](https://dl.acm.org/doi/10.5555/3433701.3433727). -2. Jeff Rasley, Samyam Rajbhandari, Olatunji Ruwase, and Yuxiong He. (2020) DeepSpeed: System Optimizations Enable Training Deep Learning Models with Over 100 Billion Parameters. [In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD '20, Tutorial)](https://dl.acm.org/doi/10.1145/3394486.3406703). -3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). -4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). [[paper]](https://arxiv.org/abs/2101.06840) [[slides]](https://www.usenix.org/system/files/atc21_slides_ren-jie.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) -5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html). -6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) -7. 
Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. (2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/). -8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh). -9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009). -10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) -11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990). -12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl). -13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) -14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/) -15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). -16. 
Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597) [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) -17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. (2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017) and [ICML2023](https://icml.cc/Conferences/2023). -18. Syed Zawad, Cheng Li, Zhewei Yao, Elton Zheng, Yuxiong He, Feng Yan. (2023) DySR: Adaptive Super-Resolution via Algorithm and System Co-design. [ICLR:2023](https://openreview.net/forum?id=Pgtn4l6eKjv). -19. Sheng Shen, Zhewei Yao, Chunyuan Li, Trevor Darrell, Kurt Keutzer, Yuxiong He. (2023) Scaling Vision-Language Models with Sparse Mixture of Experts. [arXiv:2303.07226](https://arxiv.org/abs/2303.07226) and [Finding at EMNLP2023](https://2023.emnlp.org/). -20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023. -21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023. -22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Xiaoxia Wu, Connor Holmes, Zhewei Yao, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. (2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209) and [ML for Sys Workshop at NeurIPS2023](http://mlforsystems.org/) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) -23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](docs/assets/files/zeroquant_series.pdf) -24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847) -25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320). -26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](docs/assets/files/zeroquant_series.pdf) -27. 
Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf) -28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/) -29. Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723) - - -# Videos -1. DeepSpeed KDD 2020 Tutorial - 1. [Overview](https://www.youtube.com/watch?v=CaseqC45DNc&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=29) - 2. [ZeRO + large model training](https://www.youtube.com/watch?v=y4_bCiAsIAk&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=28) - 3. [17B T-NLG demo](https://www.youtube.com/watch?v=9V-ZbP92drg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=27) - 4. [Fastest BERT training + RScan tuning](https://www.youtube.com/watch?v=o1K-ZG9F6u0&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=26) - 5. DeepSpeed hands on deep dive: [part 1](https://www.youtube.com/watch?v=_NOk-mBwDYg&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=92), [part 2](https://www.youtube.com/watch?v=sG6_c4VXLww&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=94), [part 3](https://www.youtube.com/watch?v=k9yPkBTayos&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=93) - 6. [FAQ](https://www.youtube.com/watch?v=nsHu6vEgPew&list=PLa85ZdUjfWS21mgibJ2vCvLziprjpKoW0&index=24) -2. Microsoft Research Webinar - * Registration is free and all videos are available on-demand. - * [ZeRO & Fastest BERT: Increasing the scale and speed of deep learning training in DeepSpeed](https://note.microsoft.com/MSR-Webinar-DeepSpeed-Registration-On-Demand.html). -3. [DeepSpeed on AzureML](https://youtu.be/yBVXR8G8Bg8) -4. [Large Model Training and Inference with DeepSpeed // Samyam Rajbhandari // LLMs in Prod Conference](https://www.youtube.com/watch?v=cntxC3g22oU) [[slides]](docs/assets/files/presentation-mlops.pdf) -5. Community Tutorials - * [DeepSpeed: All the tricks to scale to gigantic models (Mark Saroufim)](https://www.youtube.com/watch?v=pDGI668pNg0) - * [Turing-NLG, DeepSpeed and the ZeRO optimizer (Yannic Kilcher)](https://www.youtube.com/watch?v=tC01FRB0M7w) - * [Ultimate Guide To Scaling ML Models (The AI Epiphany)](https://www.youtube.com/watch?v=hc0u4avAkuM) +- Version 1.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v1.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v1.0) maintain snapshots of the old stable versions that [GPT-NeoX-20B](https://arxiv.org/abs/2204.06745) and the [Pythia Suite](https://github.com/EleutherAI/pythia) were trained on. 
+- Version 2.0 of [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/releases/tag/v2.0) and [DeeperSpeed](https://github.com/EleutherAI/DeeperSpeed/releases/tag/v2.0) are the latest versions built on the latest DeepSpeed, and will be maintained going forward. diff --git a/SECURITY.md b/SECURITY.md index 9e9391ee0bac..3061748e610b 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -39,3 +39,7 @@ We prefer all communications to be in English. Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). + +--- + +Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models. diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 0c7ee3212fad..2a0770ac681b 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -12,11 +12,24 @@ class DeepSpeedAccelerator(ABC): def __init__(self): self._name = None self._communication_backend_name = None + self._compile_backend = None @abc.abstractmethod def is_synchronized_device(self): ... + @abc.abstractmethod + def use_host_timers(self): + ... + + @abc.abstractmethod + def resolves_data_dependency(self): + ... + + @abc.abstractmethod + def handles_memory_backpressure(self): + ... + # Device APIs @abc.abstractmethod def device_name(self, device_index): @@ -68,7 +81,7 @@ def manual_seed_all(self, seed): ... @abc.abstractmethod - def initial_seed(self, seed): + def initial_seed(self): ... @abc.abstractmethod @@ -193,6 +206,19 @@ def communication_backend_name(self): def is_triton_supported(self): ... + # Graph operations + @abc.abstractmethod + def create_graph(self): + ... + + @abc.abstractmethod + def capture_to_graph(self, graph, pool=None, stream=None): + ... + + @abc.abstractmethod + def replay_graph(self, graph): + ... + # Tensor operations @property @abc.abstractmethod @@ -258,3 +284,23 @@ def get_op_builder(self, class_name): @abc.abstractmethod def build_extension(self): ... + + @abc.abstractmethod + def export_envs(self): + ... + + @abc.abstractmethod + def visible_devices_envs(self): + ... + + @abc.abstractmethod + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + ... + + @abc.abstractmethod + def get_compile_backend(self): + ... + + @abc.abstractmethod + def set_compile_backend(self, backend): + ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index fdbbd33c07a2..de711f73144e 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -3,10 +3,21 @@ # DeepSpeed Team -import torch -from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator -import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore -import psutil +from .abstract_accelerator import DeepSpeedAccelerator + +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. 
+try: + import torch +except ImportError as e: + pass + +try: + import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore + oneccl_imported_p = True +except ImportError as e: + oneccl_imported_p = False + import os @@ -15,12 +26,31 @@ class CPU_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cpu' - self._communication_backend_name = 'ccl' - self.max_mem = psutil.Process().memory_info().rss + self._compile_backend = "inductor" + if oneccl_imported_p: + self._communication_backend_name = 'ccl' + else: + # fallback to gloo if oneccl_binding_for_pytorch is not installed + self._communication_backend_name = 'gloo' + try: + import psutil + mem = psutil.Process().memory_info().rss + self.max_mem = mem + except ImportError as e: + self.max_mem = 0 def is_synchronized_device(self): return True + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): return 'cpu' @@ -47,6 +77,8 @@ def device_count(self): # In flat mode, HBM is in separate NUMA node with no cores on this node. # Ignore these NUMA nodes with no cores. numa_core_lists = get_numa_cores() + if not numa_core_lists: + return 1 numa_count = 0 prev_core_list = [] for core_list in numa_core_lists: @@ -63,7 +95,7 @@ def random(self): return torch.random def set_rng_state(self, new_state, device_index=None): - if device_index == None: + if device_index is None: return torch.set_rng_state(new_state) return torch.set_rng_state(new_state, device_index) @@ -76,8 +108,8 @@ def manual_seed(self, seed): def manual_seed_all(self, seed): return torch.manual_seed(seed) - def initial_seed(self, seed): - return torch.initial_seed(seed) + def initial_seed(self): + return torch.initial_seed() def default_generator(self, device_index): return torch.default_generator @@ -106,12 +138,14 @@ def empty_cache(self): return def get_rss(self): + import psutil mem = psutil.Process().memory_info().rss if mem > self.max_mem: self.max_mem = mem return mem def reset_rss(self): + import psutil mem = psutil.Process().memory_info().rss self.max_mem = mem return mem @@ -157,9 +191,11 @@ def max_memory_reserved(self, device_index=None): return self.max_mem def total_memory(self, device_index=None): + import psutil return psutil.virtual_memory().total def available_memory(self, device_index=None): + import psutil return psutil.virtual_memory().available # Misc @@ -198,8 +234,18 @@ def is_fp16_supported(self): def supported_dtypes(self): return [torch.float, torch.bfloat16] - # Tensor operations + # Graph operations + def create_graph(self): + return None + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + + # Tensor operations @property def BFloat16Tensor(self): return torch.BFloat16Tensor @@ -253,7 +299,7 @@ def on_accelerator(self, tensor): # create an instance of op builder and return, name specified by class_name def create_op_builder(self, op_name): builder_class = self.get_op_builder(op_name) - if builder_class != None: + if builder_class is not None: return builder_class() return None @@ -263,16 +309,20 @@ def get_op_builder(self, class_name): # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path from op_builder import __deepspeed__ # noqa: F401 # type: ignore - from op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder except ImportError: - from deepspeed.ops.op_builder.cpu import CCLCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder + from deepspeed.ops.op_builder.cpu import AsyncIOBuilder, CCLCommBuilder, ShareMemCommBuilder, FusedAdamBuilder, CPUAdamBuilder, NotImplementedBuilder if class_name == "CCLCommBuilder": return CCLCommBuilder + elif class_name == "ShareMemCommBuilder": + return ShareMemCommBuilder elif class_name == "FusedAdamBuilder": return FusedAdamBuilder elif class_name == "CPUAdamBuilder": return CPUAdamBuilder + elif class_name == "AsyncIOBuilder": + return AsyncIOBuilder else: # return a NotImplementedBuilder to avoid get NoneType[Name] in unit tests return NotImplementedBuilder @@ -280,3 +330,25 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def export_envs(self): + return [] + + # TODO: cpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends}") diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index 2786b425ca7f..06fd443f9829 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -3,9 +3,11 @@ # DeepSpeed Team +import functools import os import pkgutil import importlib +import sys from .abstract_accelerator import DeepSpeedAccelerator # During setup stage torch may not be installed, pass on no torch will @@ -23,7 +25,8 @@ class CUDA_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = 'cuda' - self._communication_backend_name = 'nccl' + self._communication_backend_name = 'nccl' if sys.platform != 'win32' else 'gloo' + self._compile_backend = "inductor" if pynvml is None: self._init_pynvml() @@ -42,9 +45,18 @@ def _init_pynvml(self): def is_synchronized_device(self): return False + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): - if device_index == None: + if device_index is None: return 'cuda' return 'cuda:{}'.format(device_index) @@ -88,8 +100,8 @@ def manual_seed(self, seed): def manual_seed_all(self, seed): return torch.cuda.manual_seed_all(seed) - def initial_seed(self, seed): - return torch.cuda.initial_seed(seed) + def initial_seed(self): + return torch.cuda.initial_seed() def default_generator(self, device_index): return torch.cuda.default_generators[device_index] @@ -180,17 +192,31 @@ def available_memory(self, device_index=None): # Data types def is_bf16_supported(self): + if not torch.cuda.is_available(): + return True return torch.cuda.is_bf16_supported() def is_fp16_supported(self): + if not torch.cuda.is_available(): + return True + # See https://docs.nvidia.com/deeplearning/tensorrt/support-matrix/index.html#hardware-precision-matrix + # FP16 on compute capability 6.x is deprecated + allow_deprecated_fp16 = os.environ.get('DS_ALLOW_DEPRECATED_FP16', '0') == '1' major, _ = torch.cuda.get_device_capability() if major >= 7: return True + elif major == 6 and allow_deprecated_fp16: + return True else: return False def supported_dtypes(self): - return [torch.float, torch.half, torch.bfloat16] + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes # Misc def amp(self): @@ -222,35 +248,46 @@ def is_triton_supported(self): else: return False + # Graph operations + def create_graph(self): + return torch.cuda.CUDAGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return torch.cuda.graph(graph, pool, stream) + + def replay_graph(self, graph): + graph.replay() + return + # Tensor operations @property def BFloat16Tensor(self): - return torch.cuda.BFloat16Tensor + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='cuda') @property def ByteTensor(self): - return torch.cuda.ByteTensor + return functools.partial(torch.tensor, dtype=torch.uint8, device='cuda') @property def DoubleTensor(self): - return torch.cuda.DoubleTensor + return functools.partial(torch.tensor, dtype=torch.double, device='cuda') @property def FloatTensor(self): - return torch.cuda.FloatTensor + return functools.partial(torch.tensor, dtype=torch.float, device='cuda') @property def HalfTensor(self): - return 
torch.cuda.HalfTensor + return functools.partial(torch.tensor, dtype=torch.half, device='cuda') @property def IntTensor(self): - return torch.cuda.IntTensor + return functools.partial(torch.tensor, dtype=torch.int, device='cuda') @property def LongTensor(self): - return torch.cuda.LongTensor + return functools.partial(torch.tensor, dtype=torch.long, device='cuda') def pin_memory(self, tensor, align_bytes=1): return tensor.pin_memory() @@ -280,7 +317,7 @@ def op_builder_dir(self): class_dict = None def _lazy_init_class_dict(self): - if self.class_dict != None: + if self.class_dict is not None: return else: self.class_dict = {} @@ -322,3 +359,24 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def export_envs(self): + return ['NCCL'] + + def visible_devices_envs(self): + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py new file mode 100644 index 000000000000..b46351f8ca43 --- /dev/null +++ b/accelerator/hpu_accelerator.py @@ -0,0 +1,329 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import functools +import os +import pkgutil +import importlib +import torch + +from .abstract_accelerator import DeepSpeedAccelerator + + +class HPU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'hpu' + self._communication_backend_name = 'hccl' + self._compile_backend = "hpu_backend" + self.apply_hpu_workarounds() + try: + import habana_frameworks.torch.hpu as hpu + self.hpu = hpu + torch.use_deterministic_algorithms(True) + except ImportError as e: + raise ValueError( + f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + + self.fp16_supported = None + + def apply_hpu_workarounds(self): + + def update_wa_env_var(key, value): + if key not in os.environ.keys(): + os.environ[key] = value + + update_wa_env_var("PT_HPU_LAZY_ACC_PAR_MODE", "0") + update_wa_env_var("PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES", "0") + + # Device APIs + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return False + + def resolves_data_dependency(self): + return True + + def handles_memory_backpressure(self): + return True + + def device_name(self, device_index=None): + # ignoring device_index. 
+ return 'hpu' + + def device(self, device_index=None): + return torch.device(self.device_name(device_index)) + + def set_device(self, device_index): + self.hpu.set_device(device_index) + + def current_device(self): + return (self.hpu.current_device()) + + def current_device_name(self): + return 'hpu:{}'.format(self.current_device()) + + def device_count(self): + return self.hpu.device_count() + + def synchronize(self, device_index=None): + return self.hpu.synchronize() + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + self.hpu.random.set_rng_state(new_state) + + def get_rng_state(self, device_index=None): + return self.hpu.random.get_rng_state() + + def manual_seed(self, seed): + return self.hpu.random.manual_seed(seed) + + def manual_seed_all(self, seed): + self.hpu.random.manual_seed_all(seed) + + def initial_seed(self): + return self.hpu.random.initial_seed() + + def default_generator(self, device_index): + return self.hpu.random.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return self.hpu.Stream + + def stream(self, stream): + return self.hpu.stream(stream) + + def current_stream(self, device_index=None): + return self.hpu.current_stream() + + def default_stream(self, device_index=None): + return self.hpu.default_stream() + + @property + def Event(self): + import habana_frameworks.torch.core as htcore + return htcore.hpu.Event + + # Memory management + def empty_cache(self): + return + + def memory_allocated(self, device_index=None): + return self.hpu.memory_allocated() + + def max_memory_allocated(self, device_index=None): + return self.hpu.max_memory_allocated() + + def reset_max_memory_allocated(self, device_index=None): + return self.hpu.reset_max_memory_allocated() + + def memory_cached(self, device_index=None): + return self.hpu.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return self.hpu.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return None + + def memory_stats(self, device_index=None): + return self.hpu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + self.hpu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + return self.hpu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + return self.hpu.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return self.memory_stats(device_index)['Limit'] + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return True + + def is_fp16_supported(self): + if self.fp16_supported is None: + import habana_frameworks.torch.utils.experimental as htexp + self.fp16_supported = htexp._is_fp16_supported() + return self.fp16_supported + + def supported_dtypes(self): + supported_dtypes = [torch.float, torch.bfloat16] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + return supported_dtypes + + # Misc + def amp(self): + return None + + def is_available(self): + return self.hpu.is_available() + + def range_push(self, msg): + return + + def range_pop(self): + return + + def lazy_call(self, callback): + callback() + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Graph operations + def 
create_graph(self): + return self.hpu.HPUGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return self.hpu.graph(graph, stream=stream) + + def replay_graph(self, graph): + graph.replay() + return + + # Tensor operations + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='hpu') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='hpu') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='hpu') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='hpu') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='hpu') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='hpu') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='hpu') + + def pin_memory(self, tensor, align_bytes=1): + return tensor.pin_memory(self.device()) + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('hpu:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.hpu" + except ImportError: + return "deepspeed.ops.op_builder.hpu" + + # dict that holds class name <--> class type mapping i.e. + # 'AsyncIOBuilder': + # this dict will be filled at init stage + class_dict = None + + def _lazy_init_class_dict(self): + if self.class_dict is not None: + return + else: + self.class_dict = {} + # begin initialize for create_op_builder() + # put all valid class name <--> class type mapping into class_dict + op_builder_dir = self.op_builder_dir() + op_builder_module = importlib.import_module(op_builder_dir) + op_builder_absolute_path = os.path.dirname(op_builder_module.__file__) + for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]): + # avoid self references, + # skip sub_directories which contains ops for other backend(cpu, npu, etc.). 
+ if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir( + os.path.join(op_builder_absolute_path, module_name)): + module = importlib.import_module("{}.{}".format(op_builder_dir, module_name)) + for member_name in module.__dir__(): + if member_name.endswith( + 'Builder' + ) and member_name != "OpBuilder" and member_name != "CPUOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes + if not member_name in self.class_dict: + self.class_dict[member_name] = getattr(module, member_name) + # end initialize for create_op_builder() + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name]() + else: + return None + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return [] + + def visible_devices_envs(self): + # Current way deepspeed set this env var is not applicable with all HPU instances + # User has to follow instructions in: + # https://docs.habana.ai/en/latest/PyTorch/Reference/PT_Multiple_Tenants_on_HPU/Multiple_Workloads_Single_Docker.html + # keeping CUDA_VISIBLE_DEVICES + return ['CUDA_VISIBLE_DEVICES'] #['HABANA_VISIBLE_MODULES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/mlu_accelerator.py b/accelerator/mlu_accelerator.py new file mode 100644 index 000000000000..bef716f0ee4e --- /dev/null +++ b/accelerator/mlu_accelerator.py @@ -0,0 +1,300 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +import importlib +import inspect +import functools + +from .abstract_accelerator import DeepSpeedAccelerator +import torch +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. 
+ + +class MLU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'mlu' + self._communication_backend_name = 'cncl' + self._compile_backend = "inductor" + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index == None: + return 'mlu' + return 'mlu:{}'.format(device_index) + + def device(self, device_index=None): + return torch.mlu.device(device_index) + + def set_device(self, device_index): + torch.mlu.set_device(device_index) + + def current_device(self): + return torch.mlu.current_device() + + def current_device_name(self): + return 'mlu:{}'.format(torch.mlu.current_device()) + + def device_count(self): + return torch.mlu.device_count() + + def synchronize(self, device_index=None): + return torch.mlu.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.mlu.set_rng_state(new_state) + + return torch.mlu.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.mlu.get_rng_state() + + return torch.mlu.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.mlu.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.mlu.manual_seed_all(seed) + + def initial_seed(self, seed): + return torch.mlu.initial_seed(seed) + + def default_generator(self, device_index): + return torch.mlu.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.mlu.Stream + + def stream(self, stream): + return torch.mlu.stream(stream) + + def current_stream(self, device_index=None): + return torch.mlu.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.mlu.default_stream(device_index) + + @property + def Event(self): + return torch.mlu.Event + + # Memory management + def empty_cache(self): + return torch.mlu.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.mlu.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.mlu.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.mlu.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.mlu.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.mlu.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.mlu.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'memory_stats'): + return torch.mlu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.mlu, 'reset_peak_memory_stats'): + return torch.mlu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'memory_reserved'): + return torch.mlu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.mlu, 'max_memory_reserved'): + return torch.mlu.max_memory_reserved(device_index) + + def 
total_memory(self, device_index=None): + return torch.mlu.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return torch.mlu.is_bf16_supported() + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.mlu, 'amp'): + return torch.mlu.amp + return None + + def is_available(self): + return torch.mlu.is_available() + + def range_push(self, msg): + if hasattr(torch.mlu.cnpx, 'range_push'): + return torch.mlu.cnpx.range_push(msg) + + def range_pop(self): + if hasattr(torch.mlu.cnpx, 'range_pop'): + return torch.mlu.cnpx.range_pop() + + def lazy_call(self, callback): + return torch.mlu._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return True + + # Graph operations + def create_graph(self): + torch.mlu.MLUGraph() + + def capture_to_graph(self, graph, pool=None, stream=None): + return torch.mlu.graph(graph, pool, stream) + + def replay_graph(self, graph): + graph.replay() + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='mlu') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='mlu') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='mlu') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='mlu') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='mlu') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='mlu') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='mlu') + + def pin_memory(self, tensor): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('mlu:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.mlu" + except ImportError: + return "deepspeed.ops.op_builder.mlu" + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/mlu/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NEUWARE_HOME', 'CNCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['MLU_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. 
Supported Backends are {supported_backends }") diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index 77595f6b636c..1083ba14d27e 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -20,13 +20,23 @@ class MPS_Accelerator(DeepSpeedAccelerator): def __init__(self): self._name = "mps" self._communication_backend_name = None + self._compile_backend = "inductor" def is_synchronized_device(self): return False + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): - if device_index == None: + if device_index is None: return "mps" return "mps:{}".format(device_index) @@ -67,7 +77,7 @@ def manual_seed_all(self, seed): def seed(self): return torch.mps.seed() - def initial_seed(self, seed): + def initial_seed(self): return def default_generator(self, device_index): @@ -166,6 +176,17 @@ def communication_backend_name(self): def is_triton_supported(self): return False + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + # Tensor operations @property def BFloat16Tensor(self): @@ -221,7 +242,7 @@ def op_builder_dir(self): # create an instance of op builder, specified by class_name def create_op_builder(self, op_name): builder_class = self.get_op_builder(op_name) - if builder_class != None: + if builder_class is not None: return builder_class() return None @@ -235,3 +256,26 @@ def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def export_envs(self): + return [] + + # TODO: mpu's visible envs is confirmed, keep as CUDA_VISIBLE_DEVICES + def visible_devices_envs(self): + # TODO: could not find visible devices env for mps + return ['CUDA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 7ebbd320bb15..9252990d0a94 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -20,6 +20,7 @@ def __init__(self): super().__init__() self._name = 'npu' self._communication_backend_name = 'hccl' + self._compile_backend = "inductor" # dict that holds class name <--> class type mapping i.e. 
# 'AsyncIOBuilder': # this dict will be filled at init stage @@ -28,9 +29,18 @@ def __init__(self): def is_synchronized_device(self): return False + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + # Device APIs def device_name(self, device_index=None): - if device_index == None: + if device_index is None: return 'npu' return 'npu:{}'.format(device_index) @@ -74,8 +84,8 @@ def manual_seed(self, seed): def manual_seed_all(self, seed): return torch.npu.manual_seed_all(seed) - def initial_seed(self, seed): - return torch.npu.initial_seed(seed) + def initial_seed(self): + return torch.npu.initial_seed() def default_generator(self, device_index): return torch.npu.default_generators[device_index] @@ -176,6 +186,17 @@ def communication_backend_name(self): def is_triton_supported(self): return False + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + # Tensor operations @property @@ -255,3 +276,24 @@ def get_op_builder(self, class_name): def build_extension(self): from torch.utils.cpp_extension import BuildExtension return BuildExtension + + def export_envs(self): + return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['ASCEND_RT_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends }") diff --git a/accelerator/real_accelerator.py b/accelerator/real_accelerator.py index 49133489b051..9c1713ca4c6c 100644 --- a/accelerator/real_accelerator.py +++ b/accelerator/real_accelerator.py @@ -20,7 +20,7 @@ except ImportError as e: dsa2 = None -SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'npu', 'mps'] +SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu', 'mlu', 'sdaa'] ds_accelerator = None @@ -45,7 +45,7 @@ def _validate_accelerator(accel_obj): def is_current_accelerator_supported(): - return get_accelerator() in SUPPORTED_ACCELERATOR_LIST + return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST def get_accelerator(): @@ -60,22 +60,32 @@ def get_accelerator(): accelerator_name = os.environ["DS_ACCELERATOR"] if accelerator_name == "xpu": try: - from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401 # type: ignore + import intel_extension_for_pytorch as ipex + assert ipex._C._has_xpu(), "XPU_Accelerator requires an intel_extension_for_pytorch that supports XPU." 
except ImportError as e: raise ValueError( - f"XPU_Accelerator requires intel_extension_for_deepspeed, which is not installed on this system.") - elif accelerator_name == "cpu": + f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.") + elif accelerator_name == "xpu.external": try: - import intel_extension_for_pytorch # noqa: F401 # type: ignore + import intel_extension_for_deepspeed # noqa: F401 # type: ignore except ImportError as e: raise ValueError( - f"CPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.") + f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system." + ) + elif accelerator_name == "cpu": + pass elif accelerator_name == "npu": try: import torch_npu # noqa: F401 # type: ignore except ImportError as e: raise ValueError(f"NPU_Accelerator requires torch_npu, which is not installed on this system.") pass + elif accelerator_name == "sdaa": + try: + import torch_sdaa # noqa: F401 # type: ignore + except ImportError as e: + raise ValueError(f"SDAA_Accelerator requires torch_sdaa, which is not installed on this system.") + pass elif accelerator_name == "mps": try: import torch.mps @@ -84,7 +94,18 @@ def get_accelerator(): torch.mps.current_allocated_memory() except (RuntimeError, ImportError) as e: raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.") - elif is_current_accelerator_supported(): + elif accelerator_name == "hpu": + try: + import habana_frameworks.torch.hpu # noqa: F401 + except ImportError as e: + raise ValueError( + f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") + elif accelerator_name == "mlu": + try: + import torch_mlu # noqa: F401 + except ImportError as e: + raise ValueError(f"MLU_Accelerator requires torch_mlu, which is not installed on this system.") + elif accelerator_name not in SUPPORTED_ACCELERATOR_LIST: raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. ' f'Value "{accelerator_name}" is not supported') ds_set_method = "override" @@ -104,15 +125,15 @@ def get_accelerator(): try: from intel_extension_for_deepspeed import XPU_Accelerator # noqa: F401,F811 # type: ignore - - accelerator_name = "xpu" + accelerator_name = "xpu.external" except ImportError as e: pass if accelerator_name is None: try: - import intel_extension_for_pytorch # noqa: F401,F811 # type: ignore + import intel_extension_for_pytorch as ipex - accelerator_name = "cpu" + if ipex._C._has_xpu(): + accelerator_name = "xpu" except ImportError as e: pass if accelerator_name is None: @@ -122,6 +143,13 @@ def get_accelerator(): accelerator_name = "npu" except ImportError as e: pass + if accelerator_name is None: + try: + import torch_sdaa # noqa: F401,F811 # type: ignore + + accelerator_name = "sdaa" + except ImportError as e: + pass if accelerator_name is None: try: import torch.mps @@ -132,7 +160,41 @@ def get_accelerator(): except (RuntimeError, ImportError) as e: pass if accelerator_name is None: - accelerator_name = "cuda" + try: + import habana_frameworks.torch.hpu # noqa: F401,F811 + + accelerator_name = "hpu" + except ImportError as e: + pass + if accelerator_name is None: + try: + import torch_mlu # noqa: F401,F811 + + accelerator_name = "mlu" + except ImportError as e: + pass + if accelerator_name is None: + try: + import torch + + # Determine if we are on a GPU or x86 CPU with torch. 
+ # "torch.cuda.is_available()" provides a stronger guarantee, #ignore-cuda + # ensuring that we are free from CUDA initialization errors. + # While "torch.cuda.device_count() > 0" check ensures that #ignore-cuda + # we won't try to do any CUDA calls when no device is available + # For reference: https://github.com/deepspeedai/DeepSpeed/pull/6810 + if torch.cuda.device_count() > 0 and torch.cuda.is_available(): #ignore-cuda + accelerator_name = "cuda" + except (RuntimeError, ImportError) as e: + # TODO need a more decent way to detect which accelerator to use, consider using nvidia-smi command for detection + pass + if accelerator_name is None: + # borrow this log from PR#5084 + if accel_logger is not None: + accel_logger.warning( + "Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.") + # cpu added as catch-all when accelerator detection fails + accelerator_name = "cpu" ds_set_method = "auto detect" @@ -145,17 +207,33 @@ def get_accelerator(): from .cpu_accelerator import CPU_Accelerator ds_accelerator = CPU_Accelerator() - elif accelerator_name == "xpu": + elif accelerator_name == "xpu.external": # XPU_Accelerator is already imported in detection stage + ds_accelerator = XPU_Accelerator() + elif accelerator_name == "xpu": + from .xpu_accelerator import XPU_Accelerator + ds_accelerator = XPU_Accelerator() elif accelerator_name == "npu": from .npu_accelerator import NPU_Accelerator ds_accelerator = NPU_Accelerator() + elif accelerator_name == "sdaa": + from .sdaa_accelerator import SDAA_Accelerator + + ds_accelerator = SDAA_Accelerator() elif accelerator_name == "mps": from .mps_accelerator import MPS_Accelerator ds_accelerator = MPS_Accelerator() + elif accelerator_name == 'hpu': + from .hpu_accelerator import HPU_Accelerator + + ds_accelerator = HPU_Accelerator() + elif accelerator_name == 'mlu': + from .mlu_accelerator import MLU_Accelerator + + ds_accelerator = MLU_Accelerator() _validate_accelerator(ds_accelerator) if accel_logger is not None: accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})") diff --git a/accelerator/sdaa_accelerator.py b/accelerator/sdaa_accelerator.py new file mode 100755 index 000000000000..26113d38dd15 --- /dev/null +++ b/accelerator/sdaa_accelerator.py @@ -0,0 +1,328 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +import importlib +import inspect +import functools + +from .abstract_accelerator import DeepSpeedAccelerator +# During setup stage torch may not be installed, pass on no torch will +# allow op builder related API to be executed. +try: + import torch.sdaa +except ImportError: + pass + + +class SDAA_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'sdaa' + self._communication_backend_name = 'tccl' + self._compile_backend = "inductor" + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index is None: + return 'sdaa' + return 'sdaa:{}'.format(device_index) + + def device(self, device_index=None): + return torch.sdaa.device(device_index) + + def set_device(self, device_index): + torch.sdaa.set_device(device_index) + + def current_device(self): + return torch.sdaa.current_device() + + def current_device_name(self): + return 'sdaa:{}'.format(torch.sdaa.current_device()) + + def device_count(self): + return torch.sdaa.device_count() + + def synchronize(self, device_index=None): + return torch.sdaa.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.random + + def set_rng_state(self, new_state, device_index=None): + if device_index is None: + return torch.sdaa.set_rng_state(new_state) + + return torch.sdaa.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index is None: + return torch.sdaa.get_rng_state() + + return torch.sdaa.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.sdaa.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.sdaa.manual_seed_all(seed) + + def initial_seed(self): + return torch.sdaa.initial_seed() + + def default_generator(self, device_index): + return torch.sdaa.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.sdaa.Stream + + def stream(self, stream): + return torch.sdaa.stream(stream) + + def current_stream(self, device_index=None): + return torch.sdaa.current_stream(device_index) + + def default_stream(self, device_index=None): + return torch.sdaa.default_stream(device_index) + + @property + def Event(self): + return torch.sdaa.Event + + # Memory management + def empty_cache(self): + return torch.sdaa.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.sdaa.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.sdaa.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.sdaa.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return 
torch.sdaa.memory_cached(device_index) + + def max_memory_cached(self, device_index=None): + return torch.sdaa.max_memory_cached(device_index) + + def reset_max_memory_cached(self, device_index=None): + return torch.sdaa.reset_max_memory_cached(device_index) + + def memory_stats(self, device_index=None): + if hasattr(torch.sdaa, 'memory_stats'): + return torch.sdaa.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + if hasattr(torch.sdaa, 'reset_peak_memory_stats'): + return torch.sdaa.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + if hasattr(torch.sdaa, 'memory_reserved'): + return torch.sdaa.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + if hasattr(torch.sdaa, 'max_memory_reserved'): + return torch.sdaa.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.sdaa.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Data types + def is_bf16_supported(self): + return torch.sdaa.is_bf16_supported() + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + supported_dtypes = [torch.float] + if self.is_fp16_supported(): + supported_dtypes.append(torch.half) + if self.is_bf16_supported(): + supported_dtypes.append(torch.bfloat16) + return supported_dtypes + + # Misc + def amp(self): + if hasattr(torch.sdaa, 'amp'): + return torch.sdaa.amp + return None + + def is_available(self): + return torch.sdaa.is_available() + + def range_push(self, msg): + return + + def range_pop(self): + return + + def lazy_call(self, callback): + return torch.sdaa._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device='sdaa') + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device='sdaa') + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device='sdaa') + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device='sdaa') + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device='sdaa') + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device='sdaa') + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device='sdaa') + + def pin_memory(self, tensor, align_bytes=1): + return tensor.pin_memory() + + def is_pinned(self, tensor): + return tensor.is_pinned() + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('sdaa:'): + return True + else: + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.sdaa" + except ImportError: + return "deepspeed.ops.op_builder.sdaa" + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/sdaa/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + from torch.utils.cpp_extension import BuildExtension + return BuildExtension + + def export_envs(self): + return ['NCCL', 'LD_LIBRARY', 'PATH'] + + def visible_devices_envs(self): + return ['SDAA_VISIBLE_DEVICES'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py new file mode 100644 index 000000000000..ad8a10710bf2 --- /dev/null +++ b/accelerator/xpu_accelerator.py @@ -0,0 +1,318 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator +import intel_extension_for_pytorch as ipex # noqa: F401 # type: ignore +import oneccl_bindings_for_pytorch # noqa: F401 # type: ignore +import functools + +import importlib +import inspect + + +class XPU_Accelerator(DeepSpeedAccelerator): + + def __init__(self): + self._name = 'xpu' + self._communication_backend_name = 'ccl' + self._compile_backend = "inductor" + self.aligned_tensors = [] + self.class_dict = None + + def is_synchronized_device(self): + return False + + def use_host_timers(self): + # WA XPU event will be consolidated in 2.6 + if ipex.__version__ < '2.6': + return True + else: + return self.is_synchronized_device() + + def resolves_data_dependency(self): + return self.is_synchronized_device() + + def handles_memory_backpressure(self): + return self.is_synchronized_device() + + # Device APIs + def device_name(self, device_index=None): + if device_index == None: + return 'xpu' + return 'xpu:{}'.format(device_index) + + def device(self, device_index=None): + return torch.xpu.device(device_index) + + def set_device(self, device_index): + torch.xpu.set_device(device_index) + + def current_device(self): + return torch.xpu.current_device() + + def current_device_name(self): + return 'xpu:{}'.format(torch.xpu.current_device()) + + def device_count(self): + return torch.xpu.device_count() + + def synchronize(self, device_index=None): + return torch.xpu.synchronize(device_index) + + # RNG APIs + def random(self): + return torch.xpu.random + + def set_rng_state(self, new_state, device_index=None): + if device_index == None: + return torch.xpu.set_rng_state(new_state) + return torch.xpu.set_rng_state(new_state, device_index) + + def get_rng_state(self, device_index=None): + if device_index == None: + return torch.xpu.get_rng_state() + return torch.xpu.get_rng_state(device_index) + + def manual_seed(self, seed): + return torch.xpu.manual_seed(seed) + + def manual_seed_all(self, seed): + return torch.xpu.manual_seed_all(seed) + + def initial_seed(self): + return torch.xpu.initial_seed() + + def default_generator(self, device_index): + return torch.xpu.default_generators[device_index] + + # Streams/Events + @property + def Stream(self): + return torch.xpu.Stream + + def stream(self, stream): + return torch.xpu.stream(stream) + + def current_stream(self, device_index=None): + return torch.xpu.current_stream(device_index) + + def default_stream(self, device_index=None): + # torch.xpu does not support the sync behavior of default stream as cuda + # use current_stream as workaround + # see https://pytorch.org/docs/stable/notes/cuda.html#cuda-streams + return torch.xpu.current_stream(device_index) + + @property + def Event(self): + return torch.xpu.Event + + # Memory management + def empty_cache(self): + return torch.xpu.empty_cache() + + def memory_allocated(self, device_index=None): + return torch.xpu.memory_allocated(device_index) + + def max_memory_allocated(self, device_index=None): + return torch.xpu.max_memory_allocated(device_index) + + def reset_max_memory_allocated(self, device_index=None): + return torch.xpu.reset_max_memory_allocated(device_index) + + def memory_cached(self, device_index=None): + return torch.xpu.memory_reserved(device_index) + + def max_memory_cached(self, device_index=None): + return torch.xpu.max_memory_reserved(device_index) + + def reset_max_memory_cached(self, device_index=None): + return 
torch.xpu.reset_max_memory_reserved(device_index) + + def memory_stats(self, device_index=None): + return torch.xpu.memory_stats(device_index) + + def reset_peak_memory_stats(self, device_index=None): + return torch.xpu.reset_peak_memory_stats(device_index) + + def memory_reserved(self, device_index=None): + return torch.xpu.memory_reserved(device_index) + + def max_memory_reserved(self, device_index=None): + return torch.xpu.max_memory_reserved(device_index) + + def total_memory(self, device_index=None): + return torch.xpu.get_device_properties(device_index).total_memory + + def available_memory(self, device_index=None): + return self.total_memory(device_index) - self.memory_allocated(device_index) + + # Misc + def amp(self): + return torch.xpu.amp + + def is_available(self): + return torch.xpu.is_available() + + def range_push(self, msg): + # TODO itt is currently not supported yet + # return torch.profiler.itt.range_push(msg) + return + + def range_pop(self): + # TODO itt is currently not supported yet + # return torch.profiler.itt.range_pop() + return + + def lazy_call(self, callback): + if hasattr(torch.xpu, "_lazy_call"): + return torch.xpu._lazy_call(callback) + else: + return torch.xpu.lazy_init._lazy_call(callback) + + def communication_backend_name(self): + return self._communication_backend_name + + def is_triton_supported(self): + return False + + # Graph operations + def create_graph(self): + return None + + def capture_to_graph(self, graph, pool=None, stream=None): + from deepspeed.runtime.utils import noop_context + return noop_context() + + def replay_graph(self, graph): + return + + # Data types + def is_bf16_supported(self): + return True + + def is_fp16_supported(self): + return True + + def supported_dtypes(self): + return [torch.float, torch.half, torch.bfloat16] + + # Tensor operations + + @property + def BFloat16Tensor(self): + return functools.partial(torch.tensor, dtype=torch.bfloat16, device=self._name) + + @property + def ByteTensor(self): + return functools.partial(torch.tensor, dtype=torch.uint8, device=self._name) + + @property + def DoubleTensor(self): + return functools.partial(torch.tensor, dtype=torch.double, device=self._name) + + @property + def FloatTensor(self): + return functools.partial(torch.tensor, dtype=torch.float, device=self._name) + + @property + def HalfTensor(self): + return functools.partial(torch.tensor, dtype=torch.half, device=self._name) + + @property + def IntTensor(self): + return functools.partial(torch.tensor, dtype=torch.int, device=self._name) + + @property + def LongTensor(self): + return functools.partial(torch.tensor, dtype=torch.long, device=self._name) + + def pin_memory(self, tensor, align_bytes=1): + if align_bytes == 1: + return tensor.pin_memory(device=self.current_device_name()) + elif align_bytes == 0: + from deepspeed.ops.op_builder.xpu import AsyncIOBuilder + self.aio_handle = AsyncIOBuilder().load().aio_handle(128 * 1024, 8, False, False, False) + aligned_t = self.aio_handle.new_cpu_locked_tensor(tensor.numel(), tensor) + aligned_t = aligned_t[:tensor.numel()].copy_(tensor) + self.aligned_tensors.append([aligned_t.data_ptr(), aligned_t[-1].data_ptr()]) + return aligned_t + + def is_pinned(self, tensor): + if tensor.is_pinned(device=self.current_device_name()): + return True + else: + for begin, end in self.aligned_tensors: + if begin <= tensor.data_ptr() and tensor.data_ptr() <= end: + return True + return False + + def op_builder_dir(self): + try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + return "op_builder.xpu" + except ImportError: + return "deepspeed.ops.op_builder.xpu" + + def on_accelerator(self, tensor): + device_str = str(tensor.device) + if device_str.startswith('xpu:'): + return True + else: + return False + + def _lazy_init_class_dict(self): + if self.class_dict: + return + + op_builder_module = importlib.import_module(self.op_builder_dir()) + + # get op builder class from op_builder/xpu/__init__.py + self.class_dict = {} + for class_name, class_obj in inspect.getmembers(op_builder_module, inspect.isclass): + self.class_dict[class_name] = class_obj + + # create an instance of op builder and return, name specified by class_name + def create_op_builder(self, class_name): + builder_class = self.get_op_builder(class_name) + return builder_class() + + # return an op builder class, name specified by class_name + def get_op_builder(self, class_name): + self._lazy_init_class_dict() + if class_name in self.class_dict: + return self.class_dict[class_name] + else: + return self.class_dict['NotImplementedBuilder'] + + def build_extension(self): + try: + from intel_extension_for_pytorch.xpu.cpp_extension import DpcppBuildExtension + except ImportError: + from intel_extension_for_pytorch.xpu.utils import DpcppBuildExtension + return DpcppBuildExtension + + def export_envs(self): + return [] + + def visible_devices_envs(self): + return ['ZE_AFFINITY_MASK'] + + def set_visible_devices_envs(self, current_env, local_accelerator_ids): + for env in self.visible_devices_envs(): + current_env[env] = ",".join(map(str, local_accelerator_ids)) + + def get_compile_backend(self): + return self._compile_backend + + def set_compile_backend(self, backend): + supported_backends = torch._dynamo.list_backends(exclude_tags=()) + if backend in supported_backends: + self._compile_backend = backend + else: + raise ValueError( + f"{backend} not supported by {self.device_name()}. Supported Backends are {supported_backends}") diff --git a/benchmarks/README.md b/benchmarks/README.md index 4c88b2dd091c..a2b332732042 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -2,5 +2,5 @@ If you are looking for DeepSpeed benchmarks, please see the following resources: -1. [Communication Benchmarking Suite](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/communication) -2. [Inference Benchmarks](https://github.com/microsoft/DeepSpeedExamples/tree/master/benchmarks/inference) +1. [Communication Benchmarking Suite](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/communication) +2. 
[Inference Benchmarks](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference) diff --git a/bin/deepspeed.bat b/bin/deepspeed.bat new file mode 100644 index 000000000000..8e488bde380c --- /dev/null +++ b/bin/deepspeed.bat @@ -0,0 +1,2 @@ +@echo off +python "%~dp0\ds" %* diff --git a/bin/ds_bench b/bin/ds_bench index bfacbc8e25c8..80bf4029604e 100755 --- a/bin/ds_bench +++ b/bin/ds_bench @@ -10,7 +10,10 @@ import sys required_env = ["RANK", "WORLD_SIZE", "MASTER_ADDR", "MASTER_PORT", "LOCAL_RANK"] if not all(map(lambda v: v in os.environ, required_env)): import subprocess - subprocess.run("deepspeed $(which ds_bench) " + " ".join(sys.argv[1:]), shell=True) + r = subprocess.check_output(["which", "ds_bench"]) + ds_bench_bin = r.decode('utf-8').strip() + safe_cmd = ["deepspeed", ds_bench_bin] + sys.argv[1:] + subprocess.run(safe_cmd) else: args = benchmark_parser().parse_args() rank = args.local_rank diff --git a/bin/ds_io b/bin/ds_io new file mode 100644 index 000000000000..681fd634764c --- /dev/null +++ b/bin/ds_io @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import ds_io_main + +if __name__ == '__main__': + ds_io_main() diff --git a/bin/ds_nvme_tune b/bin/ds_nvme_tune new file mode 100644 index 000000000000..117adfba22c0 --- /dev/null +++ b/bin/ds_nvme_tune @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +from deepspeed.nvme import sweep_main, generate_main, parse_sweep_arguments + +if __name__ == '__main__': + args = parse_sweep_arguments() + print(f"Running DeepNVMe performance tuning on {args.nvme_dir}") + sweep_main(args) + generate_main(args.log_dir) diff --git a/bin/ds_report.bat b/bin/ds_report.bat new file mode 100644 index 000000000000..78c7225f147c --- /dev/null +++ b/bin/ds_report.bat @@ -0,0 +1,2 @@ +@echo off +python "%~dp0\ds_report" %* diff --git a/blogs/comm-opt/README.md b/blogs/comm-opt/README.md index 4767c4342816..7049e3b9f162 100644 --- a/blogs/comm-opt/README.md +++ b/blogs/comm-opt/README.md @@ -63,7 +63,7 @@ For this part of the optimization, we add some fusion for the communication that These fusions are done at two levels: 1. Fuse the sequence AlltoAll for q,k, and v: we Scatter the heads using the mixed tensor rather than splitting them beforehand. For this part, we need to get some more information from the modeling side (such as the number of q and kv heads), to split the heads before calling AlltoAll. We have added some new changes on the Megatron-DeepSpeed repo that incorporate these changes for the sequence-parallelism. -2. Fuse the AlltoAll tensors and call the PyTorch's AlltoAll-sinlge API: we reshape the tensors for the scatter dimension and use a single tensor for AlltoAll which alleviates the overhead of using a list of tensors which requires a contiguous call for each element of the list. +2. Fuse the AlltoAll tensors and call the PyTorch's AlltoAll-single API: we reshape the tensors for the scatter dimension and use a single tensor for AlltoAll which alleviates the overhead of using a list of tensors which requires a contiguous call for each element of the list. By adding these optimizations, we see about 10 to 15% speedup compared to the previous design, and obtain good scalability across different SP-degree and context-lengths. In the following table, we show the improvement achieved by using SP, when doubling the GPU-count and increasing the SP-degree. We obtain over 80% of efficiency when increasing from 256 to 512 GPUs using SP-2. 
Furthermore, by increasing the sequence-length and SP, while keeping the processed tokens similar, we achieve over 75% of efficiency for 2x more resources. On the other hand, if we can double the number of tokens (shown on the last row of table 2), we can improve the performance to 1.81x. diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md index 43501652b98e..66fa9930b299 100644 --- a/blogs/deepspeed-chat/README.md +++ b/blogs/deepspeed-chat/README.md @@ -65,7 +65,7 @@ DeepSpeed-RLHF system is capable of unparalleled efficiency at scale, making com *Table 2. Multi-Node 64x A100-80GB: Training Time and Corresponding Approximate Cost on Azure.* -> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details. +> ***Very Important Details***: The numbers in both tables above are for Step 3 of the training and are based on actual measured training throughput on DeepSpeed-RLHF curated dataset and training recipe which trains for one epoch on a total of 135M tokens. We have in total 67.5M query tokens (131.9k queries with sequence length 256) and 67.5M generated tokens (131.9k answers with sequence length 256), and a maximum global batch size per step of 0.5M tokens (1024 query-answer pairs). We urge readers to pay attention to these specifications before making any cost and e2e time comparisons with DeepSpeed-RLHF. See our [benchmark settings](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for more details. ***Democratizing RLHF Training***: With just a single GPU, DeepSpeed-HE supports training models with over 13 billion parameters, enabling data scientists without access to multi-GPU systems to create not just toy RLHF models but large and powerful ones that can be used in real-world scenarios. @@ -95,7 +95,7 @@ We use an example of pretrained OPT-13B as the actor model and OPT-350M as the r ```python pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -285,7 +285,7 @@ This improvement in efficiency stems from DeepSpeed-HE’s ability to accelerate ## Effective Throughput and Scalability Analysis -***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of the RLHF training depends on the throughput that it achieves during the generation and RL training phases. 
In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation while the RL training phase comprises of remaining 80% (see [benchmark settings](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite having a small proportion, the former can take a large portion of the e2e time as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory bandwidth bound and difficult to achieve high throughput. In contrast, the RL training phase is compute bound running the reference actor model with just a couple of forward and backward passes with full 512 tokens from both prompt and generation per sample and can achieve good throughput.
+***(I) Effective Throughput Analysis.*** The effective throughput of DeepSpeed-HE during Stage 3 of the RLHF training depends on the throughput that it achieves during the generation and RL training phases. In our RLHF pipeline, the generation phase comprises approximately 20% of the total computation while the RL training phase comprises the remaining 80% (see [benchmark settings](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md) page for details). However, despite having a small proportion, the former can take a large portion of the e2e time as it requires running the actor model once for each of the 256 generated tokens with an initial prompt of 256 tokens, making it memory bandwidth bound and difficult to achieve high throughput. In contrast, the RL training phase is compute bound, running the reference actor model with just a couple of forward and backward passes with full 512 tokens from both prompt and generation per sample, and can achieve good throughput.
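+As a rough illustration of how these two phases combine, the blended throughput can be estimated from the per-phase throughputs and the time each phase takes. The sketch below is ours, not part of DeepSpeed or the benchmark scripts (the function name and the example numbers are made up), and it assumes for simplicity that both phases touch the same number of tokens per step:
+
+```python
+# Illustrative only: blended Step-3 throughput from per-phase throughputs.
+def effective_throughput(tokens_per_step, gen_tput, train_tput):
+    """gen_tput / train_tput are measured tokens/sec for the generation
+    and RL training phases; both phases are assumed to process the same
+    tokens_per_step tokens."""
+    gen_time = tokens_per_step / gen_tput      # memory-bandwidth-bound phase
+    train_time = tokens_per_step / train_tput  # compute-bound phase
+    return tokens_per_step / (gen_time + train_time)
+
+# Even though generation is only ~20% of the FLOPs, a 10x slower generation
+# phase drags the blended throughput down close to the generation throughput.
+print(effective_throughput(tokens_per_step=0.5e6, gen_tput=1e5, train_tput=1e6))
+```
+
+The same reasoning explains the focus on accelerating the generation phase: the slower, bandwidth-bound phase dominates the end-to-end time.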
@@ -311,7 +311,7 @@ Furthermore, we would like to point out that our effective performance is 19x hi ***(II) Scalability Analysis.*** The best effective throughput for different model sizes is achieved at different GPU count. This is in part because some of the larger model sizes require more memory to run. However, a large part of this behavior stems from DeepSpeed-HE’s scalability properties that we discuss next. -Figure 7 shows that DeepSeed-RLHF has achieved good scaling overall on up to 64 GPUs. However, if we look more closely, it shows that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and max global batch size. +Figure 7 shows that DeepSpeed-RLHF has achieved good scaling overall on up to 64 GPUs. However, if we look more closely, it shows that DeepSpeed-RLHF training achieves super-linear scaling at small scale, followed by near linear or sub-linear scaling at larger scales. This is due to the interaction between memory availability and max global batch size. As DeepSpeed-HE is powered by ZeRO-based technology for training, it allows model states to be partitioned across the available GPUs. As a result, the memory consumption per GPU reduces with the increase in the number of GPUs, allowing DeepSpeed-HE to support a larger batch per GPU resulting in super-linear scaling. However, at large scale, while the available memory continues to increase, the maximum global batch size (1024, in our case, with a sequence length of 512) limits the batch size per GPU, resulting in near-linear or sub-linear scaling. As a result, for a given max global batch size, DeepSpeed-HE achieves the best throughput and cost efficiency at the boundary of super-linear and sub-linear scalability, and the exact point is mostly determined by the largest batch size that can be run per GPU as the function of available memory and global batch size. @@ -320,13 +320,13 @@ As a result, for a given max global batch size, DeepSpeed-HE achieves the best t We are very excited to share that DeepSpeed-Chat is now open-sourced and available to the AI community. -* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* To get started, please visit our github page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future. +* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for the future. DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. 
-* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. +* You can also follow us on our [English Twitter](https://twitter.com/DeepSpeedAI), [Japanese Twitter](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md index 03bf9b69449f..64c7822e021d 100644 --- a/blogs/deepspeed-chat/chinese/README.md +++ b/blogs/deepspeed-chat/chinese/README.md @@ -63,7 +63,7 @@ DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复 *表 2. 多节点 64x A100-80GB:训练时长及预估的 Azure 费用。*
-> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/microsoft/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。 +> ***非常重要的细节***: 上述两个表格(即表一和表二)中的数据均针对 RLHF 训练的第 3 步,基于实际数据集和 DeepSpeed-RLHF 训练吞吐量的测试。该训练在总共 1.35 亿(135M)个字符(token)上进行一个时期(epoch)的训练。我们总共有 6750 万个查询(query)字符(131.9k 个 query,每个序列长度为 256)和 6750 万个生成/回答字符(131.9k 个答案,每个序列长度为 256),每步的最大全局字符批量大小约为 500 万个字符(1024 个查询-答案对)。在与 DeepSpeed-RLHF 进行任何成本和端到端时间比较之前,我们建议读者注意这些设定。想要了解更多详细信息,请参阅我们的页面 [benchmark setting](https://github.com/deepspeedai/DeepSpeedExamples-internal/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)。 ***实现 RLHF 训练的普及化***:仅凭单个 GPU,DeepSpeed-HE 就能支持训练超过 130 亿参数的模型。这使得那些无法使用多 GPU 系统的数据科学家和研究者不仅能够轻松创建轻量级的 RLHF 模型,还能创建大型且功能强大的模型,以应对不同的使用场景。 @@ -91,7 +91,7 @@ DeepSpeed-RLHF 系统在大规模训练中具有无与伦比的效率,使复 ``` pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -274,7 +274,7 @@ DeepSpeed-HE可以在训练和推理之间无缝更改模型分区,以支持 ## 有效吞吐量和可扩展性分析 -***(I) 有效吞吐量分析。*** 在 RLHF 训练的第 3 阶段,DeepSpeed-HE 的有效吞吐量取决于它在生成和 RL 训练阶段所实现的吞吐量。在我们的 RLHF (详见 [benchmarking setting](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md))中,生成阶段占总计算的约 20%,而 RL 训练阶段占剩余的 80%。然而,尽管比例较小,前者可能会占用大部分的端到端时间,因为它需要为每个生成的字符运行一次 actor 模型,使其受到内存带宽限制,难以实现高吞吐量。相比之下,RL 训练阶段是计算密集型的,仅需运行参考 actor 模型进行几次前向和后向传递,每个样本都有来自提示和生成的全部 512 个字符,可以实现良好的吞吐量。 +***(I) 有效吞吐量分析。*** 在 RLHF 训练的第 3 阶段,DeepSpeed-HE 的有效吞吐量取决于它在生成和 RL 训练阶段所实现的吞吐量。在我们的 RLHF (详见 [benchmarking setting](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md))中,生成阶段占总计算的约 20%,而 RL 训练阶段占剩余的 80%。然而,尽管比例较小,前者可能会占用大部分的端到端时间,因为它需要为每个生成的字符运行一次 actor 模型,使其受到内存带宽限制,难以实现高吞吐量。相比之下,RL 训练阶段是计算密集型的,仅需运行参考 actor 模型进行几次前向和后向传递,每个样本都有来自提示和生成的全部 512 个字符,可以实现良好的吞吐量。
@@ -300,7 +300,7 @@ DeepSpeed-HE可以在训练和推理之间无缝更改模型分区,以支持 ***(II) 可扩展性分析。*** 不同模型大小的最佳有效吞吐量取决于不同的 GPU 数量。部分原因是因为一些较大的模型大小需要更多的内存来运行。基于此,我们接下来讨论 DeepSpeed-HE 的可扩展性特性。 -图 7 显示 DeepSeed-RLHF 在多达 64 个 GPU的集群 上实现了良好的整体扩展。然而,如果我们仔细观察,可以发现 DeepSpeed-RLHF 训练在小规模时实现了超线性扩展,随后在较大规模时实现了接近线性或次线性扩展。这是由于内存可用性和最大全局批量大小之间的相互作用。 +图 7 显示 DeepSpeed-RLHF 在多达 64 个 GPU的集群 上实现了良好的整体扩展。然而,如果我们仔细观察,可以发现 DeepSpeed-RLHF 训练在小规模时实现了超线性扩展,随后在较大规模时实现了接近线性或次线性扩展。这是由于内存可用性和最大全局批量大小之间的相互作用。 DeepSpeed-HE 的核心技术基于 ZeRO,用于训练过程中将模型状态分割到每个GPU上。这意味着随着 GPU 数量的增加,每个 GPU 的内存消耗会减少,使得 DeepSpeed-HE 能够在每个 GPU 上支持更大的批量,从而实现超线性扩展。然而,在大规模情况下,尽管可用内存持续增加,但最大全局批量大小仍然限制了每个 GPU 的批量大小,导致接近线性或次线性扩展。因此,在给定的最大全局批量大小(例如,我们设置为 1024 个句子,每个句子长度为 512)下,DeepSpeed-HE 在超线性和次线性可扩展性之间实现了最佳的吞吐量和成本效益。具体的平衡点主要取决于每个 GPU 上可运行的最大批量大小,而这又受到可用内存和全局批量大小的函数所决定。 @@ -308,18 +308,18 @@ DeepSpeed-HE 的核心技术基于 ZeRO,用于训练过程中将模型状态 我们非常高兴地宣布,DeepSpeed-Chat现已开源并向 AI 社区开放。 -* 如果你发现我们的成果对你有用或者喜欢我们的开源成果,请在 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)上点⭐。 +* 如果你发现我们的成果对你有用或者喜欢我们的开源成果,请在 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 和 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)上点⭐。 -* 请访问我们的DeepSpeed-Chat GitHub页面以开始使用:[GitHub 登陆页面](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* 请访问我们的DeepSpeed-Chat GitHub页面以开始使用:[GitHub 登陆页面](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* 我们将继续根据你的反馈和支持改进 DeepSpeed-Chat。我们的[计划图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)显示了当前支持的功能以及计划在未来支持的功能。 +* 我们将继续根据你的反馈和支持改进 DeepSpeed-Chat。我们的[计划图](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)显示了当前支持的功能以及计划在未来支持的功能。 DeepSpeed-Chat 是更大的DeepSpeed生态系统的一部分,包括众多深度学习系统和建模技术。要了解更多信息, * 请访问我们的[网站](https://www.deepspeed.ai/),了解详细的博客文章、教程和有用的文档。 -* 我们会在[知乎](https://www.zhihu.com/people/deepspeed)上发布最新中文博客及动态。你还可以关注我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed) 和[日文 Twitter](https://twitter.com/MSFTDeepSpeedJP)。 +* 我们会在[知乎](https://www.zhihu.com/people/deepspeed)上发布最新中文博客及动态。你还可以关注我们的[英文 Twitter](https://twitter.com/DeepSpeedAI) 和[日文 Twitter](https://twitter.com/DeepSpeedAI_JP)。 -DeepSpeed 欢迎你的贡献!我们鼓励你在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面报告问题、贡献 PR 并参与讨论。请参阅我们的[贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)了解更多详情。我们愿意与大学、研究实验室、公司等进行合作,共同开展深度学习研究,将 DeepSpeed 应用于赋能现实世界的 AI 模型和应用等。对于此类需求(以及其他不适合在 GitHub 上提出的需求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +DeepSpeed 欢迎你的贡献!我们鼓励你在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面报告问题、贡献 PR 并参与讨论。请参阅我们的[贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)了解更多详情。我们愿意与大学、研究实验室、公司等进行合作,共同开展深度学习研究,将 DeepSpeed 应用于赋能现实世界的 AI 模型和应用等。对于此类需求(以及其他不适合在 GitHub 上提出的需求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 diff --git a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md index e1d4bf952bf1..2b1ffbe73ae3 100644 --- a/blogs/deepspeed-chat/ds-chat-release-8-31/README.md +++ b/blogs/deepspeed-chat/ds-chat-release-8-31/README.md @@ -24,7 +24,7 @@ # 1. 
Introduction -DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper. +DeepSpeed-Chat is a general system framework for RLHF training that enables easy, fast, affordable, and scalable training of ChatGPT-style models that we [publicly released on GitHub](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md). The detailed performance and capabilities of DeepSpeed-Chat have been published in our [blog post](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat) and [arXiv](https://arxiv.org/abs/2308.01320) paper. We are happy to share that today we are improving DeepSpeed-Chat along three areas: i) system support for Llama/Llama-2 family of models, ii) system features for improved efficiency and accessibility, and iii) stability and software enhancements. @@ -33,15 +33,15 @@ We are happy to share that today we are improving DeepSpeed-Chat along three are We ***introduce system support for training Llama and Llama-2 models*** in DeepSpeed-Chat enabling and leveraging various optimizations and features including the Hybrid Engine, ZeRO family of optimizations, Low-Rank Adaptation (LoRA) support, as well as full integration into the three-stage DeepSpeed-Chat RLHF pipeline. By leveraging the Hybrid-Engine, we speed up the experience generation phase for Llama-2-7B and Llama-2-13B models by **up to 7.1X**. - **New System Features for Improved Efficiency and Accessibility** - - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs. + - ***Mixed Precision ZeRO++ ([MixZ++](https://github.com/deepspeedai/DeepSpeed/pull/3954))***. It is an extended set of optimization strategies built upon [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by **up to 3.3x** for the Llama-2-70B model running on 128 V100 GPUs. - ***[ZeRO-Offload](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/)***. It is an optimization that offloads optimizer memory and computation from the GPU to the host CPU, enabling larger models to be trained with fewer GPU resources. 
After training stability fixes and testing, we have enabled this feature across all three stages of the DeepSpeed-Chat RLHF training pipeline. ZeRO-Offload reduces the minimum number of GPUs required to train large models by **up to 16x**. - **Stability and Software Enhancements** - - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. *To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/microsoft/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)*. + - DeepSpeed-Chat contains a rich set of features for training across many different platforms and scenarios. Composing these features in a systematic way and ensuring both system stability and decent training convergence is critical for the usability of the framework. Thus, in addition to new features in DeepSpeed-Chat, many system stability and training convergence issues have been fixed both in DeepSpeed-Chat (client code) and DeepSpeed (runtime). These improvements have been thoroughly tested using the OPT model family for end-to-end training. Furthermore, end-to-end testing, characterization scripts, and several instrumentation features like TensorBoard support are now also available. *To try out these latest features and software improvements, please use DeepSpeed release [v0.10.2](https://github.com/deepspeedai/DeepSpeed/tree/v0.10.2) and the latest DeepSpeed-Chat in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)*. - - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml). + - Finally, to ensure the long-term health of the DeepSpeed-Chat training framework, [PyTests](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) were added for testing Step 3 of the RLHF training pipeline and are run on a nightly basis through a newly developed [GitHub Actions workflow](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml). We now dive into the details of our new features, training stability, and software improvements. @@ -54,19 +54,19 @@ The DeepSpeed-Chat training framework now provides system support for the Llama The following key optimizations in DeepSpeed are now fully integrated for Llama and Llama-2 models: - **DeepSpeed-Chat Integration**: Fully integrated into the complete, end-to-end three-stage DeepSpeed-Chat RLHF training framework, based on the OpenAI InstructGPT training strategy. 
-- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models.
-- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations including offload support leveraging full memory capacity of a system thus enabling training of even larger models.
+- **Hybrid Engine**: DeepSpeed Hybrid Engine allows for superior generation phase [acceleration](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems), now supported for all Llama-1 model variants, Llama-2-7B, and Llama-2-13B models.
+- **ZeRO and ZeRO-Offload**: Fully supported by the [ZeRO](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md#throughput-and-model-size-scalability-comparisons-with-existing-rlhf-systems) family of optimizations including offload support leveraging full memory capacity of a system thus enabling training of even larger models.
 - **Mixed Precision ZeRO++ (MixZ++)**: Enhanced support for larger models like Llama-2-70B through the new MixZ++ feature, improving efficiency and reducing memory usage when there are frozen or non-trainable parameters.
-- **LoRA**: Fully supported by the [LoRA](https://github.com/microsoft/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing original weights and learning pairs of rank-decomposition matrices.
+- **LoRA**: Fully supported by the [LoRA](https://github.com/microsoft/LoRA) feature, which vastly reduces the storage requirements for large language models by freezing the original weights and learning pairs of rank-decomposition matrices.
 ## Getting Started
 Users looking to try the new Llama and Llama-2 model support can get started by using the newly added Llama scripts.
| Step Number | Scripts | | --- | --- | -| 1 | [Llama-2 Step 1 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) | -| 2 | [Llama-2 Step 2 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) | -| 3 | [Llama-2 Step 3 Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) | +| 1 | [Llama-2 Step 1 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/llama2) | +| 2 | [Llama-2 Step 2 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/llama2) | +| 3 | [Llama-2 Step 3 Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2) | *Note*: While all the system aspects of Llama and Llama-2 support have been extensively tested, there are no guarantees about training convergence and may require hyper-parameter tuning to achieve convergence. @@ -103,11 +103,11 @@ We now dive into the details of two new features we are introducing today: 1) Mi ## 3.3x Higher Throughput with MixZ++ for LoRA -Mixed Precision ZeRO++ ([MixZ++](https://github.com/microsoft/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. +Mixed Precision ZeRO++ ([MixZ++](https://github.com/deepspeedai/DeepSpeed/pull/3954)) is an extended set of optimization strategies built upon [ZeRO](https://www.deepspeed.ai/tutorials/zero/) and [ZeRO++](https://www.deepspeed.ai/tutorials/zeropp/) tailored to reduce memory usage and improve training/inference efficiency for RLHF training with LoRA. Similar to [ZeRO](https://www.deepspeed.ai/tutorials/zero/), MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them only when needed. In addition, similar to ZeRO++, MixZ++ allows for hierarchical partitioning and quantized communication. The hierarchical partitioning allows all the parameters to be stored within a node when possible so that the communication happens within a node, where communication bandwidth is significantly higher than communicating across nodes. The communication overhead is further reduced by quantizing the weights before gathering them. -Finally, unlike ZeRO++ where parameters are always stored in fp16/bf16, and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights in [Low-Rank Adaptation (LoRA)](https://github.com/microsoft/LoRA) in lower-precision, significantly reducing the communication overhead, eliminating quantization overhead, and supporting larger batch sizes that enable better efficiency. 
+Finally, unlike ZeRO++ where parameters are always stored in fp16/bf16, and quantized/dequantized before and after communication, MixZ++ can persistently store the frozen weights in [Low-Rank Adaptation (LoRA)](https://github.com/microsoft/LoRA) in lower precision, significantly reducing the communication overhead, eliminating quantization overhead, and supporting larger batch sizes that enable better efficiency.
 A comprehensive exploration of technical details can be accessed through our [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/), [MixZ++ tutorial](https://www.deepspeed.ai/tutorials/mixed_precision_zeropp/), and [paper](https://arxiv.org/pdf/2306.10209.pdf).
@@ -147,13 +147,13 @@ To try this feature, please refer to [MixZ++ tutorial](https://www.deepspeed.ai/
-ZeRO-Offload was [disabled](https://github.com/microsoft/DeepSpeedExamples/pull/553) +ZeRO-Offload was [disabled](https://github.com/deepspeedai/DeepSpeedExamples/pull/553) with the initial release of DeepSpeed-Chat due to training instability that was observed when it was used with Hybrid Engine and LoRA. After improvements to Hybrid Engine and LoRA as well as extensive testing of all feature configurations for ZeRO Stage2 and ZeRO Stage 3, this feature can now be enabled across all three steps of the DeepSpeed-Chat training framework. Please note that configuring ZeRO-Offload with ZeRO Stage 2 and Hybrid Engine with LoRA disabled is currently unsupported due to observed training instability.
- *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).* + *Figure 5: Reward scores for all supported DeepSpeed-Chat configurations with ZeRO-Offload enabled. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
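+For readers who want to try the ZeRO-Offload configurations discussed above outside of the provided launch scripts, the fragment below shows the general shape of a DeepSpeed config with optimizer offload enabled. It is a generic sketch with placeholder values, not the exact configuration emitted by the DeepSpeed-Chat scripts:
+
+```python
+# Generic DeepSpeed config sketch (placeholder values): ZeRO Stage 2
+# with optimizer states and the optimizer step offloaded to the host CPU.
+ds_config = {
+    "train_batch_size": 32,
+    "bf16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 2,  # Stage 3 also supports optimizer offload
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": True,
+        },
+    },
+}
+```
+
+The dict form can be passed to `deepspeed.initialize()` via its `config` argument; note the caveat above that ZeRO Stage 2 offload combined with Hybrid Engine and LoRA disabled remains unsupported.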
@@ -164,11 +164,11 @@ A wide range of issues have been addressed in the DeepSpeed runtime and the Deep
- *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).* + *Figure 6: Step 3 Reward Scores for all supported DeepSpeed-Chat configurations. Run with 16 V100 GPUs, [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) actor model, [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) critic model, DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403).*
-*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/microsoft/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/microsoft/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403). +*Figure 6* above shows the training convergence across all supported DeepSpeed-Chat configurations. This data was collected using 16 V100 NVIDIA GPUs, the [AdamG012/chat-opt-1.3b-sft-deepspeed](https://huggingface.co/AdamG012/chat-opt-1.3b-sft-deepspeed) OPT model as the actor, the [AdamG012/chat-opt-350m-reward-deepspeed](https://huggingface.co/AdamG012/chat-opt-350m-reward-deepspeed) OPT model as the critic, and the following DeepSpeed and DeepSpeedExamples repository commits: DS commit: [f036f00c](https://github.com/deepspeedai/DeepSpeed/tree/f036f00c3763694e539a9070a98130e2667e49bd), DSE commit: [81a8521f](https://github.com/deepspeedai/DeepSpeedExamples/tree/81a8521f05e2761eed34fcf65f19873df9f74403). We now dive into the details of all the fixes across different areas. @@ -178,13 +178,13 @@ In this section we discuss the functionality and training stability fixes in the - **Training Stability:** - - [PR #620 - Make training more stable](https://github.com/microsoft/DeepSpeedExamples/pull/620) + - [PR #620 - Make training more stable](https://github.com/deepspeedai/DeepSpeedExamples/pull/620) - To improve the training stability in Step 3, several different areas of training were tuned and changed. To start, the Kullback-Liebler (KL) divergence used in the Proximal Policy Optimization (PPO) trainer was slightly tuned to reduce divergence between the new and reference policies and improve the reward score. Next, the sequence generation function in the PPO trainer (`_generate_sequence()`) removed the specification of a `min_length` in the Actor model's `generate()` call, which means generated sequences won't be artificially enlarged, allowing for the possibility of sequence generation to collapse i.e. when training convergence is extremely poor. A minor off-by-one error was also fixed in the PPO trainer's reward computation function (`compute_rewards()`). Finally, the PPO trainer's RLHF training function was updated to zero out the reward and value after the end of a conversation to prevent incorrect `advantages` and `returns`. - - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/microsoft/DeepSpeedExamples/pull/633) + - [PR #633 - DS Chat Step 3 - Add separate Lora Adam optimizer group](https://github.com/deepspeedai/DeepSpeedExamples/pull/633) - - The [LoRA](https://github.com/microsoft/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate i.e. the LoRA learning rate was set to whatever the overall learning rate was. 
This led to instability in training convergence and can be seen in *Figure 7* below showing the reward score across training steps for various Step 3 configurations:
+ - The [LoRA](https://github.com/microsoft/LoRA) feature is supported across all three training steps of the DeepSpeed-Chat framework. Prior to this stability effort, there was no distinction between the overall learning rate and the LoRA learning rate, i.e., the LoRA learning rate was set to whatever the overall learning rate was. This led to instability in training convergence and can be seen in *Figure 7* below showing the reward score across training steps for various Step 3 configurations:
@@ -204,25 +204,25 @@ In this section we discuss the functionality and training stability fixes in the The next fix details the addition of separate LoRA learning rate arguments. - - [PR ##685 Add LoRA LR for DS Chat steps 1-3](https://github.com/microsoft/DeepSpeedExamples/pull/685) + - [PR ##685 Add LoRA LR for DS Chat steps 1-3](https://github.com/deepspeedai/DeepSpeedExamples/pull/685) - A *separate* LoRA learning rate argument can now be provided in each of the three training steps, with Step 3 having individual LoRA learning rates for the Actor and Critic models. - **Bug Fixes:** - - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/microsoft/DeepSpeedExamples/pull/636) + - [PR #636 - DS Chat Step 3 - Fix Zero Stage 3](https://github.com/deepspeedai/DeepSpeedExamples/pull/636) - During DeepSpeed-Chat Step 3 training, we observed hangs when ZeRO Stage 3 was enabled for the actor model and when the `world_size > 1`. When observing the state of each rank, one rank would still be in the sequence generation phase `self._generate_sequence()`, while the other rank had already progressed to the `self.actor_model()` call. This ZeRO Stage 3 desynchronization, due to misaligned token generation between the GPUs, can normally be automatically detected and accounted for in the HuggingFace Transformers library via `synced_gpus`. However, due to the nature of the DeepSpeed-Chat pipeline and the lifetime of the corresponding model configuration objects, this automatic detection code was not triggered. To resolve this, when invoking the `generate()` function, the `synced_gpus` argument is explicitly passed and set to `True` when ZeRO Stage 3 is being used. - - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/microsoft/DeepSpeedExamples/pull/658) + - [PR #658 - Fix only optimize lora and ack-ckpting compatible](https://github.com/deepspeedai/DeepSpeedExamples/pull/658) - This fix allows Step 3 training to run with the combination of gradient checkpointing and *LoRA-only* parameter optimization, a previously unsupported training case. With the addition of the [enable_input_require_grads](https://github.com/huggingface/transformers/blob/f26099e7b5cf579f99a42bab6ddd371bf2c8d548/src/transformers/modeling_utils.py#L1225) model utility function in the HuggingFace Transformers library, which enables the gradients for the input embeddings, gradient checkpointing and optimization of *only* the LoRA parameters is made possible. - - [PR #576 - Fix argparse](https://github.com/microsoft/DeepSpeedExamples/pull/576) + - [PR #576 - Fix argparse](https://github.com/deepspeedai/DeepSpeedExamples/pull/576) - An external contributor helped in resolving an argument parsing issue. - - [PR #584 - Fix unused parameter bug](https://github.com/microsoft/DeepSpeedExamples/pull/584) + - [PR #584 - Fix unused parameter bug](https://github.com/deepspeedai/DeepSpeedExamples/pull/584) - An external contributor fixed the passing of an uninitialized parameter that was hardcoded earlier. @@ -230,11 +230,11 @@ In this section we discuss the functionality and training stability fixes in the ## Hybrid Engine Fixes In this section we discuss several fixes in the Hybrid Engine. 
-- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/microsoft/DeepSpeed/pull/3563) +- [PR #3563 - Fix LoRA Fuse/Unfuse in Hybrid Engine](https://github.com/deepspeedai/DeepSpeed/pull/3563) - During Step 3 training for OPT with LoRA and Hybrid Engine enabled, an issue arose regarding a tensor size mismatch of the LoRA weights. Specifically, the LoRA QKV weights were not fused in the OPT container policy, yet they were expected to be fused by the Hybrid Engine. This challenge was effectively resolved by introducing both fused and unfused LoRA methods in the Hybrid Engine. We thank @sxjscience for providing this fix. -- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/microsoft/DeepSpeed/pull/3883) +- [PR #3883 - Extend HE-Lora test with Z3 support + Fix/add guard in HE for Z3](https://github.com/deepspeedai/DeepSpeed/pull/3883) - The Hybrid Engine was updated to properly check whether ZeRO Stage 3 was enabled when resetting the inference container parameters, along with expanding the corresponding unit tests. @@ -242,17 +242,17 @@ In this section we discuss several fixes in the Hybrid Engine. ## ZeRO Stage 3 Fixes In this section we discuss several fixes in support of the ZeRO Stage 3 feature. -- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/microsoft/DeepSpeed/pull/3819) +- [PR #3819 - Fix racing condition in GatheredParameters](https://github.com/deepspeedai/DeepSpeed/pull/3819) - A race condition in the the ZeRO `GatheredParameters` context, which resulted in various `'status': 'INFLIGHT'` issues, was fixed by removing duplicate input parameters that were being passed from the Hybrid Engine. -- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/microsoft/DeepSpeed/pull/3884) +- [PR #3884 - Separate ZeRO3 InflightParamRegistry for train and eval](https://github.com/deepspeedai/DeepSpeed/pull/3884) - The ZeRO Stage 3 `InflightParamRegistry` was updated to use a separate `InflightParamRegistry` for training and evaluation, fixing an issue where leftover parameters in flight were causing inflight parameter errors. These fixes, along with related fixes in the Hybrid Engine, enabled the use of the ZeRO-Offload feature in the DeepSpeed-Chat training pipeline. -- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/microsoft/DeepSpeed/pull/3928) +- [PR #3928 - Remove the param.ds_tensor from print](https://github.com/deepspeedai/DeepSpeed/pull/3928) - - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/microsoft/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point. + - A minor change that was necessary to address the DeepSpeed-Chat Step 3 hang issue ([PR #636](https://github.com/deepspeedai/DeepSpeedExamples/pull/636)) as it allowed us to progress further into execution and observe the desynchronization point. # 5. Software Improvements @@ -263,9 +263,9 @@ To improve the characterization, ease of debug, and maintainability of the DeepS The DeepSpeed-Chat training framework provides a rich set of features (Hybrid Engine, ZeRO, LoRA, etc.) that can be composed in many different combinations, depending on the scenario. The interactions between the features are often complex and composing them in a systematic way for characterization is useful for understanding their behavior. 
To support such use cases, characterization scripts have been added to run sweeps of Steps 1, 2, and 3 training for various combinations of features. The scripts default to OPT but can be modified to run with Llama. Please see the READMEs in the following folders for more details: -- [Step 1 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep) -- [Step 2 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep) -- [Step 3 Sweep Scripts](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep) +- [Step 1 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/training_scripts/opt/single_node/sweep) +- [Step 2 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step2_reward_model_finetuning/training_scripts/opt/single_node/sweep) +- [Step 3 Sweep Scripts](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/opt/single_node/sweep) For example, the Step 3 characterization script sweeps across various training features: | Feature | Values | @@ -286,13 +286,13 @@ The training log for each combination of features will be stored in a folder wit Related PRs: -- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/microsoft/DeepSpeedExamples/pull/638) -- [Add step 2 sweep script, clean up scripts](https://github.com/microsoft/DeepSpeedExamples/pull/664) -- [Update script location and docs for all 3 steps](https://github.com/microsoft/DeepSpeedExamples/pull/681) +- [DS Chat Characterization Scripts (Step 1 and 3)](https://github.com/deepspeedai/DeepSpeedExamples/pull/638) +- [Add step 2 sweep script, clean up scripts](https://github.com/deepspeedai/DeepSpeedExamples/pull/664) +- [Update script location and docs for all 3 steps](https://github.com/deepspeedai/DeepSpeedExamples/pull/681) ## Instrumentation -To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/microsoft/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`. +To gain better insight into DeepSpeed-Chat training, new [instrumentation features](https://github.com/deepspeedai/DeepSpeedExamples/pull/624) were added across all three steps of DeepSpeed-Chat and can be enabled via arguments to each step's `main.py`. | Argument | Description | Step(s) | | --- | --- | --- | @@ -318,11 +318,11 @@ TensorBoard logging can be enabled in each of the three training steps, with som ## Testing -As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml). 
+As part of the DeepSpeed team's commitment to maintaining the DeepSpeed-Chat training framework, continuous integration [PyTest](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/tests/test_training.py) testing has been added for Step 3 RLHF training in a new [GitHub Actions workflow](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml). | Description | Status | | ----------- | ------ | -| Integrations | [![nv-ds-chat](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/microsoft/DeepSpeed/actions/workflows/nv-ds-chat.yml) | +| Integrations | [![nv-ds-chat](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml/badge.svg?branch=master)](https://github.com/deepspeedai/DeepSpeed/actions/workflows/nv-ds-chat.yml) | The workflow is run on a **nightly** basis across a **16-case** test matrix (see table below), and uses the **facebook/opt-125m** model for both the actor and critic. @@ -338,15 +338,15 @@ Each configuration (16 total) runs through a limited number of Step 3 non-overfl # 6. Try Out DeepSpeed-Chat We are very excited to share this DeepSpeed-Chat feature and stability release. -* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) +* To get started, please visit our GitHub page for DeepSpeed-Chat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat) -* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future. +* We will continue to improve DeepSpeed-Chat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-) shows currently supported features as well as ones that are planned for future. DeepSpeed-Chat is part of the bigger DeepSpeed ecosystem comprising of a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. -* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. +* You can also follow us on our [English Twitter](https://twitter.com/DeepSpeedAI), [Japanese Twitter](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. 
For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. -* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md index e7aa62721417..daa9b387de28 100644 --- a/blogs/deepspeed-chat/japanese/README.md +++ b/blogs/deepspeed-chat/japanese/README.md @@ -62,7 +62,7 @@ DeepSpeed-RLHFシステムは、大規模モデルの学習において類を見 *表2. 複数ノード(64x A100-80GB)を用いた場合の訓練時間とAzureでの概算実行コスト*
-> ***注意事項***: 上記の2つの表の数値は、訓練のステージ3のものです。DeepSpeed-RLHFが用いるデータセットと訓練の設定において、合計1.35億トークンを1エポックで訓練した際のスループットの実測値に基づいています。合計6750万のクエリートークン(配列長256の13万件のクエリー)と6750万の生成トークン(配列長256の13万件の回答)があり、ステップごとの最大グローバルバッチサイズは 50万 トークン(クエリーと回答それぞれ1024件)です。DeepSpeedRLHFを用いた場合のコストおよび実行時間の比較にあたっては、これらの詳細をよくご確認ください。さらに詳細な情報は[ベンチマーク設定](https://github.com/microsoft/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照ください。 +> ***注意事項***: 上記の2つの表の数値は、訓練のステージ3のものです。DeepSpeed-RLHFが用いるデータセットと訓練の設定において、合計1.35億トークンを1エポックで訓練した際のスループットの実測値に基づいています。合計6750万のクエリートークン(配列長256の13万件のクエリー)と6750万の生成トークン(配列長256の13万件の回答)があり、ステップごとの最大グローバルバッチサイズは 50万 トークン(クエリーと回答それぞれ1024件)です。DeepSpeedRLHFを用いた場合のコストおよび実行時間の比較にあたっては、これらの詳細をよくご確認ください。さらに詳細な情報は[ベンチマーク設定](https://github.com/deepspeedai/DeepSpeedExamples/blob/staging-deepspeed-chat-v2/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照ください。 ***RLHFを誰もが利用できるように***: DeepSpeed-HEは、1台のGPUのみで130億以上のパラメーターを持つモデルの訓練を実行できます。複数のGPUを備えた高価な計算設備を持たないデータサイエンティストも、小規模なトイモデルではなく、実際のシナリオで使用できる大規模で強力なRLHFモデルを作成できます。 @@ -92,7 +92,7 @@ DeepSpeed-RLHFシステムは、大規模モデルの学習において類を見 ```python pip install deepspeed>=0.9.0 -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-Chat/ pip install -r requirements.txt @@ -279,7 +279,7 @@ DeepSpeed-RLHFは、Colossal-AIや、ネイティブのPyTorchを用いたHuggin ## 実効スループットとスケーラビリティ -***(I) 実効スループット分析*** RLHFのステージ3におけるDeepSpeed-HEの実効スループットは、生成フェーズと強化学習の訓練フェーズの両方のスループットで決まります。我々の作成したRLHFのパイプラインでは、生成フェーズが全計算量の約20%を占め、強化学習の訓練フェーズが残りの80%を占めています(詳細は[ベンチマークのページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照)。しかし、計算量で見た割合が少ないとはいえ、前者は生成された256個のトークンのそれぞれに対して、初期プロンプトの256個のトークンに対してアクターモデルによる推論をそれぞれ1回実行する必要があるため、end-to-endの時間で見ると、その大部分を占めることになり、メモリ帯域が制限されて高いスループットを得ることが難しくなります。一方、強化学習の訓練フェーズでは、1サンプルあたりプロンプトと生成の両方から512個のトークンをフルに使用して、参照アクターモデルについて、数回のフォワードパスとバックワードパスで実行できるため、高いスループットを達成できます。 +***(I) 実効スループット分析*** RLHFのステージ3におけるDeepSpeed-HEの実効スループットは、生成フェーズと強化学習の訓練フェーズの両方のスループットで決まります。我々の作成したRLHFのパイプラインでは、生成フェーズが全計算量の約20%を占め、強化学習の訓練フェーズが残りの80%を占めています(詳細は[ベンチマークのページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/BenckmarkSetting.md)を参照)。しかし、計算量で見た割合が少ないとはいえ、前者は生成された256個のトークンのそれぞれに対して、初期プロンプトの256個のトークンに対してアクターモデルによる推論をそれぞれ1回実行する必要があるため、end-to-endの時間で見ると、その大部分を占めることになり、メモリ帯域が制限されて高いスループットを得ることが難しくなります。一方、強化学習の訓練フェーズでは、1サンプルあたりプロンプトと生成の両方から512個のトークンをフルに使用して、参照アクターモデルについて、数回のフォワードパスとバックワードパスで実行できるため、高いスループットを達成できます。
@@ -305,7 +305,7 @@ DeepSpeed-RLHFは、Colossal-AIや、ネイティブのPyTorchを用いたHuggin ***(II) スケーラビリティ分析*** モデルサイズごとに、最良のスループットを得られるGPU数は異なります。これは、モデルサイズが大きくなると、実行に多くのメモリを必要とすることに加え、以下に説明する DeepSpeed-HE のスケーラビリティ特性にも起因しています。 -図7は、DeepSeed-RLHF が最大 64 GPU で全体的に良好なスケーラビリティを達成したことを示しています。しかし、より詳細に見ると、DeepSpeed-RLHFの訓練では、小規模な環境では超線形(super linear)なスケーリングを達成し、大規模では線形(linear)またはそれ以下のスケーラビリティになっていることが分かります。これは、メモリの可用性と最大グローバルバッチサイズとの間の相互作用によるものです。 +図7は、DeepSpeed-RLHF が最大 64 GPU で全体的に良好なスケーラビリティを達成したことを示しています。しかし、より詳細に見ると、DeepSpeed-RLHFの訓練では、小規模な環境では超線形(super linear)なスケーリングを達成し、大規模では線形(linear)またはそれ以下のスケーラビリティになっていることが分かります。これは、メモリの可用性と最大グローバルバッチサイズとの間の相互作用によるものです。 DeepSpeed-HEはトレーニングにZeROの技術を採用しているため、利用可能なGPU間でモデルを分割することが可能です。その結果、GPUあたりのメモリ消費量はGPU数の増加とともに減少し、DeepSpeed-HEはGPUあたりでより大きなバッチサイズをサポートできるようになり、超線形のスケーリングが実現できます。しかし、より大規模になると、利用可能なメモリが増加し続ける一方で、最大グローバルバッチサイズが制限されているため、GPUあたりのバッチサイズを小さくすることになり、線形またはそれ以下のスケーリングになります。その結果、与えられた最大グローバルバッチサイズに対して、DeepSpeed-HEは、スーパーリニアとサブリニアのスケーラビリティの境界で最高のスループットとコスト効率を達成し、正確なポイントは、利用可能なメモリとグローバルバッチサイズの関数としてGPUごとに実行できる最大バッチサイズによってほぼ決定されます。 @@ -314,8 +314,8 @@ DeepSpeed-HEはトレーニングにZeROの技術を採用しているため、 DeepSpeed-ChatをオープンソースソフトウェアとしてAIコミュニティに公開できることを嬉しく思います。 -* DeepSpeed-Chatの[GitHubページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)を見て、早速使い始めましょう。 -* ユーザのみなさまからのフィードバックと協力で、これからも継続的に DeepSpeed-Chat を改善していく予定です。現在サポートされている機能や、将来的にサポートされている機能については、[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)をご覧ください。 +* DeepSpeed-Chatの[GitHubページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat)を見て、早速使い始めましょう。 +* ユーザのみなさまからのフィードバックと協力で、これからも継続的に DeepSpeed-Chat を改善していく予定です。現在サポートされている機能や、将来的にサポートされている機能については、[ロードマップ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-Chat/README.md#-deepspeed-chats-roadmap-)をご覧ください。 # 7. 
DeepSpeedについて @@ -332,14 +332,14 @@ DeepSpeedは、以下のような機能を提供します。 DeepSpeedは、Microsoftの[AI at Scale initiative](https://www.microsoft.com/en-us/research/project/ai-at-scale/)の一部で、次世代AIの機能の大規模な実現を進めています。詳細は[こちら](https://innovation.microsoft.com/en-us/exploring-ai-at-scale)をご覧ください。DeepSpeedは、[Megatron-Turing NLG (530B)](https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/), [Jurassic-1 (178B)](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf), [BLOOM (176B)](https://huggingface.co/blog/bloom-megatron-deepspeed), [GLM (130B)](https://github.com/THUDM/GLM-130B), [YaLM (100B)](https://github.com/yandex/YaLM-100B) を含め、様々な大規模モデルを学習するのに使用されてきました。 -またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/main/main_classes/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。 +またDeepSpeedは、 [Hugging Face Transformers](https://huggingface.co/docs/transformers/deepspeed), [Hugging Face Accelerate](https://huggingface.co/docs/accelerate/usage_guides/deepspeed), [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html), [MosaicML Composer](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration), [Determined AI](https://docs.determined.ai/latest/training/apis-howto/deepspeed/overview.html) など、多くの著名なオープンソースの深層学習フレームワークのバックエンドとして利用されています。 DeepSpeedについてのより詳しい情報は、以下をご覧ください。 * [DeepSpeedのWebサイト](https://www.deepspeed.ai/) には、DeepSpeedの技術に関する詳細なブログ記事、チュートリアル、ドキュメントなどが掲載されています。 -* [DeepSpeedのTwitterアカウント (英語)](https://twitter.com/MSFTDeepSpeed) では、DeepSpeedの最新情報を発信していますので、ぜひフォローください。[日本語版のTwitterアカウント](https://twitter.com/MSFTDeepSpeedJP)もあり、最新の情報を日本語で発信しています。 +* [DeepSpeedのTwitterアカウント (英語)](https://twitter.com/DeepSpeedAI) では、DeepSpeedの最新情報を発信していますので、ぜひフォローください。[日本語版のTwitterアカウント](https://twitter.com/DeepSpeedAI_JP)もあり、最新の情報を日本語で発信しています。 DeepSpeedチームは、ユーザの方々からのフィードバックやご連絡を受け付けています。 -* ユーザのみなさまからのバグ報告、Pull request、さまざまな議論への参加は、[GitHub](https://github.com/microsoft/DeepSpeed/)で受け付けています。詳細については、[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)を確認してください。 +* ユーザのみなさまからのバグ報告、Pull request、さまざまな議論への参加は、[GitHub](https://github.com/deepspeedai/DeepSpeed/)で受け付けています。詳細については、[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)を確認してください。 * DeepSpeedチームでは、DeepSpeedを用いた深層学習の研究や実世界へのAIモデルやアプリケーションに関して、大学、研究所、企業との方々とのコラボレーションを行っています(日本語でコミュニケーション可能な研究員も在籍しています)。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については、deepspeed-info@microsoft.com まで直接メールをお送りください。 diff --git a/blogs/deepspeed-domino/README.md b/blogs/deepspeed-domino/README.md new file mode 100644 index 000000000000..ce190ed1e459 --- /dev/null +++ b/blogs/deepspeed-domino/README.md @@ -0,0 +1,199 @@ +

+ domino logo +

+ +
+ +# Domino: Communication-Free LLM Training Engine + +
+ +
+ + +
+ +*Figure 1: Project Domino is Microsoft DeepSpeed's Tensor Parallel (TP) Training Engine, which provides a uniform solution for both single-node and **multi-node** cases. Domino scales up traditional single-node-only TP solution to multi-node environments via **near-complete communication hiding** behind computation.* + +
+

+
+# Table of Contents
+1. [Introduction](#introduction)
+2. [Domino Highlights](#domino-highlights)
+3. [Design Motivation](#design-motivation)
+4. [Domino Design](#domino-design)
+5. [Implementation and Optimization](#implementation-and-optimization)
+6. [Getting Started: Try out DeepSpeed-Domino](#getting-started-try-out-deepspeed-domino)
+7. [Citation](#citation)
+8. [Acknowledgements](#acknowledgements)
+
+
+# Introduction
+
+Generative AI (GenAI) has enabled transformative applications in a wide variety of domains, including chatbots, text summarization, and high-quality image and video generation. These capabilities are built on top of large foundation models, particularly Large Language Models (LLMs). LLMs are typically based on the [Transformer](https://arxiv.org/abs/1706.03762) network architecture, and include popular model families such as GPT and Llama. LLMs have grown beyond the memory capacity of a single accelerator (e.g., GPU), and so inference and training require distributed processing using multiple GPUs or even multiple nodes.
+
+Tensor parallelism (TP) is a popular distributed technique for training LLMs. TP leverages the aggregate memory of multiple GPUs to fit LLMs by partitioning each model layer across the GPUs. However, TP incurs two communication collective operations for each partitioned layer, separately for the forward and backward passes. TP is appealing due to its excellent system efficiency in single-node cases, where GPUs are directly connected via high-bandwidth links like NVLink and NVSwitch. However, TP falls short in multi-node cases due to the lower bandwidth of cross-node interconnects. [Prior work](https://arxiv.org/abs/2406.06858) reports that communication can take up to 75\% of end-to-end training time. Figure 2 shows that even on the latest DGX-H100 nodes interconnected with high-end InfiniBand of 400GB/s bandwidth, communication overhead remains as high as 43\% of end-to-end training iteration time. Recent advances in GeMM+NCCL kernel fusion are unable to fully hide communication overheads due to their limited scope of computation-communication overlapping. The trend of faster compute in newer GPUs (e.g., DGX-B200) indicates that the communication overheads of TP will become even more pronounced in both single-node and multi-node scenarios.
+
+
+
+ + *Figure 2: TP communication overhead in GPT-3-13B training using 1,2,4 DGX-H100 nodes (i.e., 8, 16, 32 H100 GPUs).* + +
+
+# Domino Highlights
+
+
+* Domino is a TP optimization technique that achieves **near-complete** communication hiding behind computation by decomposing a single batch's training iteration into smaller, independent pieces, allowing efficient pipelining.
+
+Domino is the first work that provides a **uniform** Tensor Parallelism (TP) solution for both single-node and **multi-node** cases. Traditional TP solutions (e.g., Megatron-LM) fall short in multi-node cases due to limited cross-node communication bandwidth.
+
+### Performance
+
+We tested Domino on 1 to 4 DGX-H100 boxes (8xH100 per box). Each node has an intra-node NVLink bandwidth of 900GB/s and an inter-node IB bandwidth of 400GB/s. We observed the following performance results:
+1. For both the GPT and Llama model series, Domino outperforms Megatron-LM by up to **1.3x** and **1.2x**, respectively, in end-to-end training iteration throughput for different model sizes, sequence lengths, and batch sizes. These results are summarized in Figure 1.
+2. In several cases, Domino achieves **near-optimal** training throughput, where optimal throughput refers to the throughput achieved assuming the communication collectives of TP are disabled.
+
+For more detailed performance results, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241).
+
+# Design Motivation
+
+In this section, we briefly discuss three topics. First, we motivate why the time is right for a uniform TP solution for both single-node and multi-node cases. Next, we analyze the communication overhead on the latest Nvidia DGX-H100 boxes with high-bandwidth cross-node interconnects. Finally, we describe TP's sequential data dependency, which leaves communication exposed on the critical path.
+
+### It is time for a uniform TP for single and multi-node scenarios
+
+Nvidia is pushing hard to close the communication bandwidth gap between intra-node links (i.e., GPUs within a node connected with NVLink) and inter-node links (i.e., nodes connected with InfiniBand (IB)). For example, each DGX-H100 is equipped with eight ConnectX-7 network cards and achieves an aggregate cross-node bandwidth of 400GB/s, which is on the same level as intra-node NVLink (900GB/s). Therefore, it is time to propose a uniform solution for both single-node and multi-node TP training.
+
+### Communication Overhead in TP
+
+As described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053), for TP, every transformer block (i.e., 1 Self-Attention layer + 1 MLP layer) incurs 4 AllReduce calls, two in the forward pass and two in the backward pass (shown in Figure 3). Given an LLM consisting of $N$ stacked transformer blocks, the number of AllReduce calls required for TP training is $4 * N$. Even for small models like GPT-3 2.7B or 6.7B, which consist of 32 layers, the total number of AllReduce calls is 128 for every training iteration. For larger models, the number of AllReduce calls grows linearly with the number of layers; a short tally sketch follows Figure 3.
+
+
+
+ + *Figure 3: TP communication = 4 x AllReduce x num\_transformer\_block* + +
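+As a back-of-the-envelope illustration of the $4 * N$ count above, the short Python sketch below tallies the AllReduce calls per training iteration and the activation elements they reduce. The batch, sequence, and hidden sizes are illustrative assumptions, not measurements from our experiments.
+
+```python
+# Rough tally of TP AllReduce traffic per training iteration (illustrative numbers only).
+
+def tp_allreduce_stats(num_blocks: int, batch: int, seq_len: int, hidden: int):
+    calls = 4 * num_blocks                     # 2 forward + 2 backward AllReduce per transformer block
+    elems_per_call = batch * seq_len * hidden  # each AllReduce is assumed to cover a full [b, s, h] activation
+    return calls, calls * elems_per_call
+
+if __name__ == "__main__":
+    # A 32-block model (GPT-3 2.7B/6.7B scale) with an illustrative micro-batch setting.
+    calls, total_elems = tp_allreduce_stats(num_blocks=32, batch=4, seq_len=2048, hidden=4096)
+    print(calls)        # 128 AllReduce calls per iteration
+    print(total_elems)  # total activation elements reduced across those calls
+```
+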
+
+One big issue for TP is that the *communication resides on the critical path of every input batch's training execution* due to the sequential data dependency described in the [TP data dependency analysis](#tp-data-dependency-analysis) section below. Therefore, the communication overhead is fully exposed and is difficult to hide behind computation. In Figure 4, we provide our communication overhead measurements using Megatron-LM to train the GPT-3 and Llama-2 model series with different model sizes and batch sizes across 1 to 4 DGX-H100 nodes (i.e., 8 to 32 H100 GPUs). The communication overhead is up to **47\%** despite using the latest Nvidia DGX-H100 hardware with 400GB/s cross-node bandwidth.
+
+
+
+ + *Figure 4: TP communication and computation ratio per training iteration time over different models and batch sizes using 1 to 4 DGX-H100 nodes.* + +
+
+Since training the Llama-3 405B model takes 54 days on 16,000 H100 GPUs, the projected communication time can be up to around **25 days on 16,000 H100s**. This finding shows that, despite the latest high-bandwidth interconnects like NVLink and InfiniBand (IB), the communication overhead of TP remains a huge portion of end-to-end training time.
+
+### TP data dependency analysis
+
+In traditional TP, shown in Figure 5, the computation of a transformer layer (either an attention or an MLP layer) can be abstracted as $X\*A\*B=Y$, where $X$ is the input. For an attention layer, $A$ is the attention computation (e.g., multi-head attention) and $B$ is a linear layer. For an MLP layer, both $A$ and $B$ are linear layers. An AllReduce is conducted on $Y$ after the computation. Due to the **sequential data dependency on $Y$ between computation (i.e., $X\*A\*B=Y$) and communication (i.e., AllReduce($Y$)), AllReduce($Y$) is fully exposed**, making TP inefficient in limited communication bandwidth scenarios. A minimal sketch of this baseline pattern follows Figure 5.
+
+
+
+
+ + *Figure 5: TP Forward pass of single Self-Attention/MLP layer. (X is input, A is attention computation for Self-Attention layer and linear for MLP layer, B is linear for both Self-Attention and MLP layer. Y is X\*A\*B output)* + +
+
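+To make the data dependency concrete, here is a minimal PyTorch sketch of the baseline pattern above (illustrative only, not DeepSpeed/Domino code; shard shapes and names are assumptions): each rank computes its partial $X\*A\*B$ and must then block on AllReduce($Y$) before anything downstream can run.
+
+```python
+# Baseline TP forward for one MLP layer: local GeMMs followed by a blocking AllReduce(Y).
+# Assumes torch.distributed is initialized (e.g., via torchrun); a single-process demo is included below.
+import torch
+import torch.distributed as dist
+
+def tp_mlp_forward(x: torch.Tensor, a_shard: torch.Tensor, b_shard: torch.Tensor) -> torch.Tensor:
+    h = torch.nn.functional.gelu(x @ a_shard)  # local GeMM + activation on this rank's column shard of A
+    y = h @ b_shard                            # local GeMM producing this rank's partial sum of Y
+    # Sequential dependency: every downstream op needs the reduced Y,
+    # so this AllReduce sits on the critical path and is fully exposed.
+    dist.all_reduce(y, op=dist.ReduceOp.SUM)
+    return y
+
+if __name__ == "__main__":
+    # Single-process demo with the gloo backend; the AllReduce is then trivially a no-op.
+    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29501", rank=0, world_size=1)
+    out = tp_mlp_forward(torch.randn(4, 8), torch.randn(8, 16), torch.randn(16, 8))
+    print(out.shape)  # torch.Size([4, 8])
+```
+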
+
+
+# Domino Design
+
+Compared to Figure 5, Domino breaks the data dependency of $X\*A\*B$ via a [*Row-wise Split on Inputs X*](#row-wise-split-on-inputs-x), a [*Column-wise Split on Weights B*](#column-wise-split-on-weights-b), as well as a [hybrid solution combining these two](#2d-split-on-both-x-and-b). After breaking the computation into pieces, Domino pipelines computation and communication over different independent pieces, thus achieving near-complete communication hiding behind computation. Domino's unique benefits are listed as follows:
+
+1. Compared with GeMM+NCCL kernel fusion techniques, Domino breaks the data dependency and thus has a much wider range of computation kernel sequences to overlap with an NCCL call. For example, Domino can overlap an AllReduce not only with a single GeMM, but can also extend the overlapping scope to multiple GeMMs, LayerNorm, Dropout, and more.
+2. Domino achieves near-complete communication hiding behind computation, and thus also achieves near-optimal system throughput in certain cases. (Optimal throughput refers to the end-to-end throughput obtained when all communication in TP training is disabled.)
+3. Domino works at the kernel scheduler level, so any kernel optimizations or new kernels can be seamlessly integrated into the Domino framework.
+4. Domino's tensor partition scheme is simple and generic. This makes user-side end-to-end correctness debugging easy when facing issues like overflow or weight/gradient errors.
+
+For ease of illustration, we describe forward propagation only (the backward pass is simply the reverse order), and we describe splitting each tensor into only two chunks.
+
+## Row-wise split on Inputs X:
+
+Domino breaks the input X along the row dimension (i.e., the batch dimension).
+
+
+
+ + *Figure 6: Domino row-wise (batch-dim) split on inputs X.* + +
+
+**Data Dependency**: Splitting the inputs along the batch dimension introduces no data dependency in either the intra-layer or the inter-layer case. Therefore, we achieve both *intra-layer* (AllReduce($Y1$) and $X2\*A\*B$) and *inter-layer* (AllReduce($Y2$) and the next layer's $X1\*A\*B$) computation-communication overlapping. With this batch split on the inputs, Domino can hide up to **100\%** of the communication behind computation.
+
+## Column-wise split on Weights B:
+
+Domino breaks the weight matrix B along the column dimension (a short sketch follows Figure 7).
+
+
+
+
+ + *Figure 7: Domino column-wise (last-dim) split on weights B.* + +
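+A minimal sketch of this column-wise split (illustrative only, not the Domino implementation; shapes and names are assumptions) looks as follows: the AllReduce of the first column chunk $Y1$ is issued asynchronously and overlaps with the GeMM that produces the second chunk $Y2$.
+
+```python
+# Column-wise split of the weights B into B1 | B2: AllReduce(Y1) overlaps with computing Y2.
+# Assumes torch.distributed is initialized; illustrative shapes only.
+import torch
+import torch.distributed as dist
+
+def column_split_forward(x, a_shard, b1, b2):
+    h = x @ a_shard                        # shared left part of X*A*B on this rank
+    y1 = h @ b1                            # first column chunk of Y
+    work1 = dist.all_reduce(y1, async_op=True)
+    y2 = h @ b2                            # runs while AllReduce(Y1) is in flight
+    work2 = dist.all_reduce(y2, async_op=True)
+    work1.wait()
+    work2.wait()                           # only the tail of the communication remains exposed
+    return torch.cat([y1, y2], dim=-1)     # assemble Y = [Y1 | Y2]
+
+if __name__ == "__main__":
+    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29502", rank=0, world_size=1)
+    y = column_split_forward(torch.randn(4, 8), torch.randn(8, 32), torch.randn(32, 8), torch.randn(32, 8))
+    print(y.shape)  # torch.Size([4, 16])
+```
+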
+
+**Data Dependency**: Splitting the weights B column-wise introduces no data dependency in the intra-layer case, but it does introduce one in the inter-layer case. Therefore, we only achieve *intra-layer* (AllReduce($Y1$) and $X\*A\*B2$) computation-communication overlapping. This column-split scheme on the weights remains essential, since using the row-wise input split alone would lead to narrow tensor shapes that hinder kernel computational efficiency. In practice, Domino hides 50\% to 70\% of the communication behind computation with the column-wise split on weights B.
+
+## 2D Split on both X and B:
+
+For extremely large LLMs, Domino splits both the inputs X and the weights B, along the row and column dimensions respectively. This method is beneficial for model training that requires both a low memory footprint and minimal communication overhead. A small numeric sketch of this tiling is given after Figure 8.
+
+
+
+ + *Figure 8: Domino 2D split on both inputs X and weights B.* + +
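+The small numeric sketch below (illustrative only; arbitrary shapes) checks the 2D decomposition: splitting X by rows and B by columns yields four independent tiles ($X1\*A\*B1$, $X1\*A\*B2$, $X2\*A\*B1$, $X2\*A\*B2$) whose assembly equals the unsplit result, which is what gives Domino independent pieces to pipeline.
+
+```python
+# Numeric check: the 2D split of X (rows) and B (columns) produces four independent tiles
+# whose assembly equals the unsplit X*A*B. Shapes are arbitrary demo values.
+import torch
+
+torch.manual_seed(0)
+X = torch.randn(8, 16)
+A = torch.randn(16, 32)
+B = torch.randn(32, 16)
+
+X1, X2 = X.chunk(2, dim=0)   # row-wise (batch-dim) split of the inputs
+B1, B2 = B.chunk(2, dim=1)   # column-wise (last-dim) split of the weights
+
+full = X @ A @ B
+tiled = torch.cat([
+    torch.cat([X1 @ A @ B1, X1 @ A @ B2], dim=1),
+    torch.cat([X2 @ A @ B1, X2 @ A @ B2], dim=1),
+], dim=0)
+
+print(torch.allclose(full, tiled, atol=1e-5))  # True: the tiles are independent pieces of one result
+```
+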
+
+**Data Dependency**: This 2D split policy inherits a synchronization point at the end of each transformer layer due to the column-wise split on weights B. Therefore, the 2D approach only achieves *intra-layer* computation-communication overlapping.
+
+# Implementation and Optimization
+
+For brevity, we summarize the key implementation of the row-wise input split. For more implementation details, please refer to our [arxiv paper](https://arxiv.org/abs/2409.15241).
+
+**Forward:** Figure 9 shows how we position and trigger NCCL calls in order to overlap them with computation kernel sequences in forward propagation. We split the batch into two chunks, $\mu$-batch0 and $\mu$-batch1, and denote $\mu$-batch0's attention output as attn0 and its MLP output as MLP0, and $\mu$-batch1's attention output as attn1 and its MLP output as MLP1. AllReduce(attn0) is overlapped with the self-attention computation of $\mu$-batch1. For AllReduce(attn1), we group multiple $\mu$-batches' Dropout, Residual, and LayerNorm kernels to overlap with the communication. This small-kernel grouping not only enables complete hiding of AllReduce(attn1), but also provides proper overlapping space for AllReduce(MLP0) in the backward pass shown in Figure 10. We hide AllReduce(MLP0) behind $\mu$-batch1's MLP computation kernel sequence of GeMM + GeLU + GeMM, and we hide AllReduce(MLP1) behind the next layer's attention computation. A simplified sketch of this forward-pass pipelining follows Figure 9.
+
+
+
+
+ *Figure 9: Transformer block (i.e., 1 self-attn + 1 MLP) forward pass. The upper figure is the vanilla TP implementation; the bottom is the Domino implementation.*
+
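+A heavily simplified PyTorch-level sketch of this forward-pass pipelining is shown below. It is not Domino's actual implementation, which schedules kernels on CUDA streams; all names, shapes, and the single-process demo are illustrative assumptions.
+
+```python
+# Row-wise (micro-batch) pipelining: issue each AllReduce asynchronously and overlap it
+# with the other micro-batch's compute, waiting only when the reduced tensor is needed.
+import torch
+import torch.distributed as dist
+
+def domino_like_forward(x0, x1, attn, mlp):
+    attn0 = attn(x0)
+    work_a0 = dist.all_reduce(attn0, async_op=True)   # AllReduce(attn0) overlaps with attn(x1) below
+    attn1 = attn(x1)
+    work_a1 = dist.all_reduce(attn1, async_op=True)   # AllReduce(attn1) overlaps with mlp(attn0) below
+
+    work_a0.wait()                                    # attn0 is reduced before mu-batch0's MLP uses it
+    mlp0 = mlp(attn0)
+    work_m0 = dist.all_reduce(mlp0, async_op=True)    # AllReduce(MLP0) overlaps with mlp(attn1) below
+    work_a1.wait()
+    mlp1 = mlp(attn1)
+    work_m1 = dist.all_reduce(mlp1, async_op=True)
+
+    work_m0.wait()
+    work_m1.wait()   # in Domino this last wait is hidden behind the next layer's attention compute
+    return mlp0, mlp1
+
+if __name__ == "__main__":
+    dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29503", rank=0, world_size=1)
+    w_attn, w_mlp = torch.randn(16, 16), torch.randn(16, 16)   # stand-ins for the real blocks
+    out0, out1 = domino_like_forward(torch.randn(4, 16), torch.randn(4, 16),
+                                     lambda x: x @ w_attn, lambda h: h @ w_mlp)
+    print(out0.shape, out1.shape)
+```
+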
+
+**Backward:** Figure 10 shows a simple example where the batch is split into two $\mu$-batches, $\mu$-batch0 and $\mu$-batch1. Besides an overlapping strategy similar to the forward pass, we extend the scope of overlap to the weight-gradient computation inside the same $\mu$-batch (e.g., AllReduce(MLP1) partially overlaps with its own $\mu$-batch1 computation, shown as the 3rd orange block from the left). Each *grad matmul* includes two separate GeMM computations, one for the input gradient and one for the weight gradient. Therefore, we can extend the overlapping scope by overlapping AllReduce(MLP1) with $\mu$-batch1's weight-gradient computation.
+
+The backward pass is a bit more challenging because the backward computation graph is automatically generated by torch.autograd(). To precisely control when NCCL calls are triggered, we implement a *no\_operation* module, which obtains the communication handle during the forward pass and retains it for use during the backward pass. Our *no\_operation* module works seamlessly with torch.autograd(), and enables us to precisely control NCCL start/end times without rewriting a customized backward computation graph. A small autograd-level sketch of this idea follows Figure 10.
+
+
+
+
+ *Figure 10: Transformer block (i.e., 1 self-attn + 1 MLP) backward pass. The upper figure is the vanilla TP implementation; the bottom is the Domino implementation.*
+
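+The *no\_operation* idea can be illustrated with the small autograd sketch below. This is our own illustrative naming, not the actual Domino module: an identity node stores an asynchronous communication handle during the forward pass and waits on it at a precise point of the autograd-generated backward pass.
+
+```python
+# Illustrative identity autograd node that retains a communication handle from the forward
+# pass and waits on it during the backward pass (our own sketch, not Domino's module).
+import torch
+
+class NoOpRetainHandle(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, handle):
+        ctx.handle = handle       # e.g., the Work object from dist.all_reduce(..., async_op=True)
+        return x.view_as(x)       # identity: introduces no extra computation
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        if ctx.handle is not None:
+            ctx.handle.wait()     # the retained communication must complete exactly here
+        return grad_output, None  # gradient passes through unchanged; the handle gets no gradient
+
+if __name__ == "__main__":
+    x = torch.randn(4, requires_grad=True)
+    y = NoOpRetainHandle.apply(x, None)   # a real handle would come from an async NCCL call
+    y.sum().backward()
+    print(x.grad.shape)                   # torch.Size([4]); gradients flow through the no-op
+```
+
+Inserting such a node into the forward computation lets the NCCL start/end points be placed precisely in the backward graph without rewriting that graph by hand.
+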
+ +**General kernel optimizations:** We adopt general kernel-level optimization techniques. For example, we use cudaGraph to squeeze idle/bubble time between adjacent compute kernels to reduce end-to-end latency. We use CUDA multi-stream to increase parallel execution. We also leverage torch.compile() to further improve our system efficiency. + +# Getting Started: Try out DeepSpeed-Domino + +To try out DeepSpeed-Domino, please refer to [Domino tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in our DeepSpeedExample repo. + +## Citation + +``` +@article{wang2024-deepspeed-domino, + title={{Domino: Eliminating Communication in LLM Training via Generic Tensor Slicing and Overlapping}}, + author={Guanhua Wang and Chengming Zhang and Zheyu Shen and Ang Li and Olatunji Ruwase}, + journal={arXiv preprint arXiv:2409.15241}, + year={2024} +} +``` + +## Acknowledgements + +This work is the result of a deep collaboration between Microsoft DeepSpeed and our academia partners from University of Maryland, University of Houston. The contributors include [Guanhua Wang](https://www.microsoft.com/en-us/research/people/guanhuawang/), [Hongwei Chen](https://github.com/hwchen2017) and [Olatunji Ruwase](https://www.microsoft.com/en-us/research/people/olruwase/) from Microsoft DeepSpeed Team, [Chengming Zhang](https://chengmingzh8.github.io/) from University of Houston, [Zheyu Shen](https://www.linkedin.com/in/zheyushen/) and [Ang Li](https://www.ang-li.com/) from University of Maryland. diff --git a/blogs/deepspeed-domino/images/design-base.png b/blogs/deepspeed-domino/images/design-base.png new file mode 100644 index 000000000000..d347e9c2ba8b Binary files /dev/null and b/blogs/deepspeed-domino/images/design-base.png differ diff --git a/blogs/deepspeed-domino/images/design-column.png b/blogs/deepspeed-domino/images/design-column.png new file mode 100644 index 000000000000..a99ad3c6b461 Binary files /dev/null and b/blogs/deepspeed-domino/images/design-column.png differ diff --git a/blogs/deepspeed-domino/images/design-hybrid.png b/blogs/deepspeed-domino/images/design-hybrid.png new file mode 100644 index 000000000000..302e3f95e8fc Binary files /dev/null and b/blogs/deepspeed-domino/images/design-hybrid.png differ diff --git a/blogs/deepspeed-domino/images/design-row.png b/blogs/deepspeed-domino/images/design-row.png new file mode 100644 index 000000000000..551a54f4e651 Binary files /dev/null and b/blogs/deepspeed-domino/images/design-row.png differ diff --git a/blogs/deepspeed-domino/images/domino-hero.png b/blogs/deepspeed-domino/images/domino-hero.png new file mode 100644 index 000000000000..078b6472b42a Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-hero.png differ diff --git a/blogs/deepspeed-domino/images/domino-logo.png b/blogs/deepspeed-domino/images/domino-logo.png new file mode 100644 index 000000000000..58be0990b944 Binary files /dev/null and b/blogs/deepspeed-domino/images/domino-logo.png differ diff --git a/blogs/deepspeed-domino/images/gpt3-scale.png b/blogs/deepspeed-domino/images/gpt3-scale.png new file mode 100644 index 000000000000..611b2221a73c Binary files /dev/null and b/blogs/deepspeed-domino/images/gpt3-scale.png differ diff --git a/blogs/deepspeed-domino/images/implement-bwd.png b/blogs/deepspeed-domino/images/implement-bwd.png new file mode 100644 index 000000000000..4b115222f387 Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-bwd.png differ diff --git 
a/blogs/deepspeed-domino/images/implement-fwd.png b/blogs/deepspeed-domino/images/implement-fwd.png new file mode 100644 index 000000000000..51d3a73bae58 Binary files /dev/null and b/blogs/deepspeed-domino/images/implement-fwd.png differ diff --git a/blogs/deepspeed-domino/images/tp-ar.png b/blogs/deepspeed-domino/images/tp-ar.png new file mode 100644 index 000000000000..6dd01ccceed8 Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-ar.png differ diff --git a/blogs/deepspeed-domino/images/tp-comm-overhead.png b/blogs/deepspeed-domino/images/tp-comm-overhead.png new file mode 100644 index 000000000000..947473ff5261 Binary files /dev/null and b/blogs/deepspeed-domino/images/tp-comm-overhead.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/README.md b/blogs/deepspeed-fastgen/2024-01-19/README.md new file mode 100644 index 000000000000..06650d8473a1 --- /dev/null +++ b/blogs/deepspeed-fastgen/2024-01-19/README.md @@ -0,0 +1,187 @@ +
+ +# DeepSpeed-FastGen: Introducing Mixtral, Phi-2, and Falcon support with major performance and feature enhancements. + +
+ +
+ + +
+ +# Table of Contents +1. [Introduction](#introduction) +2. [New Model Families](#new-model-families) +3. [Performance Optimizations](#performance-optimizations) +4. [Feature Enhancements](#stability-and-software-enhancements) +5. [Community Engagement](#community-engagement) +6. [Try Out DeepSpeed-FastGen](#try-out-deepspeed-fastgen) + + +# 1. Introduction + +[DeepSpeed-FastGen](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen) is an inference system framework that enables easy, fast, and affordable inference for large language models (LLMs). From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. DeepSpeed-FastGen utilizes the Dynamic SplitFuse technique to tackle the unique challenges of serving these applications and offer higher effective throughput than other state-of-the-art systems like vLLM. + +Today, we are happy to share that we are improving DeepSpeed-FastGen along three areas: i) three new model families, ii) performance optimizations, and iii) feature enhancements: +- **New Model Families** + + We introduce support for Mixtral (MoE), Falcon, and Phi-2 model families in DeepSpeed-FastGen. Our inference optimizations for these models provide up to 2.5X improvement in effective throughput over other state-of-the-art frameworks like vLLM. + +- **Performance Optimizations** + + We drastically reduced the scheduling overhead of Dynamic SplitFuse and increased the efficiency of token sampling. As a result, we see higher throughput and lower latency, particularly when handling concurrent requests from many clients. We demonstrate the performance optimizations with benchmarks and evaluation of DeepSpeed-FastGen against vLLM for the newly added model families. The benchmark results can be seen in [Performance Evaluation](#performance-optimizations) and the benchmark code is available at [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/benchmarks/inference/mii). + +- **Feature Enhancements** + + DeepSpeed-FastGen contains a rich set of features for running inference with many different model families and over 20,000 HuggingFace hosted models. We extend this feature set for all models to include a RESTful API, more generation options, and support for models using the safetensor checkpoint format. Additionally, we improve on overall stability and address bugs in our original DeepSpeed-FastGen release. + +We now dive into the details of the new model families, performance optimizations, and software improvements. If you would like to get started right away please see [Try Out DeepSpeed-FastGen](#try-out-deepspeed-fastgen). This new release is available in [DeepSpeed versions >= 0.13.0](https://github.com/deepspeedai/DeepSpeed/tree/v0.13.0) and [DeepSpeed-MII versions >= 0.2.0](https://github.com/deepspeedai/DeepSpeed-MII/tree/v0.2.0). + +# 2. New Model Families + +Today we introduce support for three new model families: i) [Mixtral (MoE)](https://arxiv.org/pdf/2401.04088.pdf), ii) [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/), and iii) [Falcon](https://arxiv.org/pdf/2311.16867v1.pdf) + +## Mixtral + +Mixtral model, a language model based on sparse mixture of experts (MoE), has demonstrated promising performance across multiple benchmarks. 
The Mixtral model operates by applying a router network at each layer for every token, selecting two distinct experts for processing the current state and combine their outputs. This process is dynamic, with the possibility of different experts being chosen at each timestep. This architecture ensures that while each token is exposed to a broad spectrum of parameters, it actively utilizes only a subset during inference. + +In this release, we are pleased to announce the support for Mixtral models. We've enhanced our FastGen codebase by the integration of the Mixtral model implementation, refinements to our high-performance kernels for efficient top-k gating, and updates to Rotary Positional Encoding (RoPE) implementation. These advancements ensure that users can fully exploit the capabilities of DeepSpeed-FastGen for executing Mixtral model inference, thereby achieving heightened performance and efficiency. + +## Phi-2 + +Microsoft Research has introduced a suite of small language models (SLMs) named "Phi," notable for their exceptional performance across a spectrum of benchmarks. The latest addition to this suite, [Phi-2](https://www.microsoft.com/en-us/research/blog/phi-2-the-surprising-power-of-small-language-models/), is a language model boasting 2.7 billion parameters. It stands out as a testament to outstanding reasoning and language understanding capabilities, exemplifying state-of-the-art performance within the realm of base language models featuring fewer than 13 billion parameters. Notably, Phi-2 achieves parity with or surpasses models up to 25 times its size on complex benchmarks, a feat attributed to pioneering innovations in model scaling and meticulous training data curation. + +Owing to its compact size, Phi-2 emerges as an ideal model for both researchers and deployment scenarios, promising a reduction in inference costs. To efficiently support the Phi-2 model family, we introduce partial RoPE support in our DeepSpeed-FastGen kernels. + +## Falcon + +Falcon is a family of large language models (LLMs) developed by the Technology Innovation Institute (TII). The Falcon models include Falcon 7B, Falcon-40B and its larger counterpart, Falcon-180B, the largest openly available language model to date. + +A closer examination of the architectural nuances within the Falcon series reveals notable distinctions. Specifically, the Falcon 7B model diverges slightly from Falcon-40B; notably, Falcon-40B incorporates an additional layer norm preceding the parallel MLP layer, a feature absent in the Falcon 7B model. In contrast, Falcon-180B adheres to the same architecture as Falcon-40B but stands out as a scaled-up version. + +# 3. Performance Optimizations and Evaluation + +SplitFuse effectively enhances utilization by simultaneously computing prompts and decoding (generating tokens). However, we observed a significant overhead for scheduling ragged batching, especially when generating a large number of tokens from numerous concurrent requests. In this release, we've minimized this scheduling overhead for querying KV cache states. As a result, there's a notable improvement in the performance for scenarios with a large number of generation steps. + +In general for long prompts and a smaller number of generated tokens, we can fully utilize the benefits of SplitFuse, which combines prompt processing and decoding (token generation) in a single forward pass. 
This provides a significant advantage over vLLM in these scenarios as shown in our [previous blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen). For short prompts and a larger number of generated tokens, where most forward passes run purely for decoding, our highly optimized engine and the efficient scheduler for ragged batching demonstrate impressive performance. + +We follow the benchmarking methodology we presented in our [previous blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen#a-benchmarking-methodology). + +*NOTE: All the benchmarks in this blog use the recommended DeepSpeed-FastGen persistent deployment mode.* + +### Mixtral + +We developed a new MoE module, which contains kernels optimized for our inference engine. The enhancements in the decoding phase, included in this release, significantly improve throughput and efficiency in generating a large number of tokens as shown in *Figure 1*. + +
+
+ + *Figure 1: Throughput-latency curve of Mixtral using A100. A normal distribution was applied to prompt and generation lengths with averages of (1200, 2600) and (60, 128), respectively, and a 30% variance*
+
+
+We show the throughput-latency curve of Mixtral-8x7B-v0.1 running on A100 with a tensor parallelism degree of 4. First, we show the scenarios where the prompt lengths are longer than the number of generation steps (i.e., tokens), which is typical of popular use cases like chatbots. From *Figure 1*, DeepSpeed-FastGen provides 2.4X higher throughput for a prompt length of 1200 and 60 generation steps. In addition to the performance for the long prompt scenarios, we present new results for shorter prompts and a larger number of generation steps in *Figure 2*. Our performance advantage still holds.
+
+
+
+ + *Figure 2: Throughput-latency curve of Mixtral using A100. A normal distribution was applied to prompt and generation lengths with averages of 500 and (150, 500, 1024), respectively, and a 30% variance*
+
+
+As we can see in *Figure 2*, DeepSpeed-FastGen shows higher throughput and lower latency thanks to the scheduling performance improvements presented in this blog.
+
+### Phi-2
+
+
+
+ + *Figure 3: Throughput-latency curve of Phi-2 using A100. A normal distribution was applied to prompt and generation lengths with averages of (1200, 1900) and (60, 128), respectively, and a 30% variance*
+
+
+From *Figure 3*, DeepSpeed-FastGen provides 1.5X higher throughput for a prompt length of 1900 and 60 generation steps. For other scenarios, our throughput-latency evaluations of the Phi-2 model show a similar pattern, with DeepSpeed-FastGen providing either greater throughput at equivalent latency or lower latency at the same throughput.
+
+### Falcon
+
+Given the substantial size of the Falcon-40B and Falcon-180B models, the majority of the computation is dedicated to forward passes, while the overhead of scheduling and token sampling is relatively minor.
+
+
+
+ + *Figure 4: Throughput-latency curve of Falcon 40B using A100. A normal distribution was applied to prompt and generation lengths with averages of (1200, 1900) and (60, 128), respectively, and a 30% variance*
+
+ +
+
+ + *Figure 5: Throughput-latency curve of Falcon 180B using A100. A normal distribution was applied to prompt and generation lengths with averages of (1200, 1900) and (60, 128), respectively, and a 30% variance*
+
+ +As seen in *Figure 4* and *Figure 5*, DeepSpeed-FastGen is able to provide higher throughput and lower latency compared to vLLM for Falcon-40B and Falcon-180B. + +# 4. Feature Enhancements + +In this section we introduce several feature enhancements that have been released since we first introduced DeepSpeed-FastGen. + +## Performance improvements +We achieve a notable improvement in performance by minimizing the scheduling overhead for querying KV cache states as discussed in [Performance Optimizations](#performance-optimizations). + +See [PR-4965](https://github.com/deepspeedai/DeepSpeed/pull/4965), [PR-377](https://github.com/deepspeedai/DeepSpeed-MII/pull/377) for more details. + +## Support for safetensor checkpoints +Some HuggingFace-hosted model checkpoint weights are provided only in the safetensor format. We extend our HuggingFace checkpoint engine to work with the safetensor format to support even more models! + +See [PR-4659](https://github.com/deepspeedai/DeepSpeed/pull/4659), [PR-296](https://github.com/deepspeedai/DeepSpeed-MII/pull/296) for more details. + +## Added RESTful API + +We add the option to automatically stand up a RESTful API when creating DeepSpeed-FastGen persistent deployments in DeepSpeed-MII. This API provides a way for users to send prompts to their deployments and receive responses using HTTP POST methods and tools like `curl` or python's `request` package. The RESTful API provides the same high throughput and low latency performance as our python APIs. For more information, please see [MII RESTful API](https://github.com/deepspeedai/DeepSpeed-MII#restful-api). + +See [PR-348](https://github.com/deepspeedai/DeepSpeed-MII/pull/348), [PR-328](https://github.com/deepspeedai/DeepSpeed-MII/pull/328), [PR-294](https://github.com/deepspeedai/DeepSpeed-MII/pull/294) for more details. + +## Added deployment and generate options + +We extend the customizability of DeepSpeed-FastGen deployments and text-generation. Users can now specify a `device_map` when creating non-persistent pipelines and persistent deployments that controls which GPUs to use for hosting a model. Additionally, the interfaces between pipelines and deployments now match and include options for setting top-p, top-k, and temperature values. For additional information about the user-exposed options, please see [MII Pipeline](https://github.com/deepspeedai/DeepSpeed-MII#non-persistent-pipeline) and [MII Deployment](https://github.com/deepspeedai/DeepSpeed-MII#persistent-deployment). + +See [PR-331](https://github.com/deepspeedai/DeepSpeed-MII/pull/331), [PR-280](https://github.com/deepspeedai/DeepSpeed-MII/pull/280), [PR-275](https://github.com/deepspeedai/DeepSpeed-MII/pull/275), [PR-268](https://github.com/deepspeedai/DeepSpeed-MII/pull/268), [PR-295](https://github.com/deepspeedai/DeepSpeed-MII/pull/295), for more details. + +## Mitigate risk of deadlock + +In use cases where many prompts are sent to a deployment in a small time window, deadlock can occur in the DeepSpeed-FastGen inference engine, resulting in no text-generation progress is made on any prompts. To mitigate this, we ensure that there is a sufficient margin in the KV cache when scheduling requests. While not completely resolved, we continue to investigate a fix for these situations that arrive when the deployment is under heavy load. + +See [PR-274](https://github.com/deepspeedai/DeepSpeed-MII/pull/274) for more details. + +## Inference Checkpoints + +We add the capability to create inference engine snapshots to DeepSpeed-FastGen. 
This reduces the loading time for large models in future deployments. + +See [PR-4664](https://github.com/deepspeedai/DeepSpeed/pull/4664) for more details. + +## General stability and bug fixes + +We include many bug fixes and stability improvements to DeepSpeed-FastGen. This includes fixing issues with some OPT model size variants, bugs with MII configuration options, and improved error messages. + +See [PR-4938](https://github.com/deepspeedai/DeepSpeed/pull/4938), [PR-4920](https://github.com/deepspeedai/DeepSpeed/pull/4920), [PR-4739](https://github.com/deepspeedai/DeepSpeed/pull/4739), [PR-4694](https://github.com/deepspeedai/DeepSpeed/pull/4694), [PR-4634](https://github.com/deepspeedai/DeepSpeed/pull/4634), [PR-367](https://github.com/deepspeedai/DeepSpeed-MII/pull/367), [PR-350](https://github.com/deepspeedai/DeepSpeed-MII/pull/350), for more details. + +# 5. Community Engagement + +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. + +*We would like to recognize the contribution from our user community in adding support for the [Qwen](https://arxiv.org/abs/2309.16609) model family to DeepSpeed-FastGen in [PR-4913](https://github.com/deepspeedai/DeepSpeed/pull/4913).* + +# 6. Try Out DeepSpeed-FastGen + +We are very excited to share this DeepSpeed-FastGen release. + +* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeed-MII) + +DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, + +* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. +* You can also follow us on our [English Twitter](https://twitter.com/DeepSpeedAI), [Japanese Twitter](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. 
+ +The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs: + +* Performance improvements +* Quantization support +* New hardware backends through collaboration with partners + +**"Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) repositories if you like our work!** diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-dark.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-dark.png new file mode 100644 index 000000000000..1121fa9dafd6 Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-dark.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-light.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-light.png new file mode 100644 index 000000000000..35f60788331c Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/fastgen-hero-light.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-180B_tp8.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-180B_tp8.png new file mode 100644 index 000000000000..6ccfcb0fe17f Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-180B_tp8.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-40b_tp2.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-40b_tp2.png new file mode 100644 index 000000000000..b08401cca7cb Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_falcon-40b_tp2.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_1.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_1.png new file mode 100644 index 000000000000..519781246289 Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_1.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_2.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_2.png new file mode 100644 index 000000000000..f2bf11cda74b Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_mistralai-Mixtral-8x7B-v0.1_tp4_2.png differ diff --git a/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_phi-2_tp1.png b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_phi-2_tp1.png new file mode 100644 index 000000000000..7e92417a64fe Binary files /dev/null and b/blogs/deepspeed-fastgen/2024-01-19/assets/images/th_lat_curve_phi-2_tp1.png differ diff --git a/blogs/deepspeed-fastgen/README.md b/blogs/deepspeed-fastgen/README.md index c309a9def53f..4217e5d4a996 100644 --- a/blogs/deepspeed-fastgen/README.md +++ b/blogs/deepspeed-fastgen/README.md @@ -23,11 +23,11 @@ Large language models (LLMs) like GPT-4 and LLaMA have emerged as a dominant workload in serving a wide range of applications infused with AI at every level. From general chat models to document summarization, and from autonomous driving to copilots at every layer of the software stack, the demand to deploy and serve these models at scale has skyrocketed. 
While frameworks like DeepSpeed, PyTorch, and several others can regularly achieve good hardware utilization during LLM training, the interactive nature of these applications and the poor arithmetic intensity of tasks like open-ended text generation have become the bottleneck for inference throughput in existing systems. -To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we provide detailed examples of how text generation works for LLMs in two distinct phases called prompt processing and generation. When systems treat them as distinct phases, generation will be preempted by prompt processing that risks breaking the service level agreements (SLAs). +To this end, frameworks like [vLLM](https://arxiv.org/pdf/2309.06180.pdf) powered by PagedAttention and research systems like [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) have significantly improved the performance of inference for LLMs. However, these systems still struggle to provide consistent quality of service, particularly for workloads with longer prompts. These long prompt workloads are becoming increasingly important as more and more models, like [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b), and systems, such as [DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses), support context windows stretching to tens of thousands of tokens. To better understand the problem space, we provide detailed examples of how text generation works for LLMs in two distinct phases called prompt processing and generation. When systems treat them as distinct phases, generation will be preempted by prompt processing that risks breaking the service level agreements (SLAs). Today, we are glad to present DeepSpeed-FastGen, a system that overcomes these limitations by leveraging the proposed Dynamic SplitFuse technique and offers up to 2.3x higher effective throughput compared to state-of-the-art systems like vLLM. DeepSpeed-FastGen leverages the combination of DeepSpeed-MII and DeepSpeed-Inference to provide an easy-to-use serving system. -**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) release: +**Quick Start:** Trying DeepSpeed-FastGen is as simple as installing the latest [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) release: ```bash pip install deepspeed-mii @@ -209,7 +209,7 @@ In addition to the deep analysis on A100, we provide additional benchmarking res ## 5. DeepSpeed-FastGen: Implementation and Usage -DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) as illustrated in the figure below. 
Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations. +DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) and [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed) as illustrated in the figure below. Together, both of these software packages provide various components of the system including the frontend APIs, the host and device infrastructure to schedule batches using Dynamic SplitFuse, optimized kernel implementations, and the tools to construct new model implementations.
@@ -219,7 +219,7 @@ DeepSpeed-FastGen is the synergistic composition of [DeepSpeed-MII](https://gith The fastest way to get started with our alpha release of DeepSpeed-FastGen is: `pip install deepspeed-mii`. -Please follow our [Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/microsoft/DeepSpeed-MII). +Please follow our [Getting Started](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii) guide for more details. For usage and reporting issues, please use the [DeepSpeed-MII Github repository](https://github.com/deepspeedai/DeepSpeed-MII). ### A. Supported Models @@ -228,13 +228,20 @@ We currently support the following model architectures in this alpha release of * [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2) * [Mistral](https://huggingface.co/models?other=mistral) * [OPT](https://huggingface.co/models?other=opt) +* [Falcon](https://huggingface.co/models?other=falcon) +* [Mixtral](https://huggingface.co/models?other=mixtral) +* [Phi-2](https://huggingface.co/models?other=phi-msft) +* [Phi-3](https://huggingface.co/models?other=phi3) +* [Qwen](https://huggingface.co/models?other=qwen) +* [Qwen2](https://huggingface.co/models?other=qwen2) +* [Qwen2-MoE](https://huggingface.co/models?other=qwen2_moe) All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer. -We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. +We plan to add additional models in the coming weeks and months after the initial release. If there are specific model architectures you would like supported, please [file an issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) and let us know. ### B. Deployment options -All of the examples below are runnable in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii). Once installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment: +All of the examples below are runnable in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii). Once installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment: #### Non-persistent pipeline @@ -273,20 +280,20 @@ client.terminate_server() ### C. Advanced Installation Information -For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. 
However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/microsoft/DeepSpeed-Kernels#source). +For ease of use and a significant reduction in lengthy compile times that many projects require in this space, we distribute a pre-compiled Python wheel covering the majority of our custom kernels through a new library called [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels). We have found this library to be very portable across environments with NVIDIA GPUs with compute capabilities 8.0+ (Ampere+), CUDA 11.6+, and Ubuntu 20+. In most cases, you shouldn't even need to know this library exists as it is a dependency of DeepSpeed-MII and will be installed with it. However, if for whatever reason you need to compile our kernels manually please see our [advanced installation docs](https://github.com/deepspeedai/DeepSpeed-Kernels#source). # 6. Try Out DeepSpeed-FastGen We are very excited to share this DeepSpeed-FastGen alpha release. -* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/microsoft/DeepSpeed-MII) +* To get started, please visit our GitHub page for DeepSpeed-MII: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGen is part of the bigger DeepSpeed ecosystem comprising a multitude of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. -* You can also follow us on our [English Twitter](https://twitter.com/MSFTDeepSpeed), [Japanese Twitter](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. +* You can also follow us on our [English Twitter](https://twitter.com/DeepSpeedAI), [Japanese Twitter](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +DeepSpeed welcomes your contributions! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, and companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. 
The following items are on our roadmap and we plan to engage with our community on these through our GitHub issues and PRs: @@ -295,7 +302,7 @@ The following items are on our roadmap and we plan to engage with our community - New hardware backends through collaboration with partners - Release performance benchmarks (used to generate plots in this blog) -**"Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) repositories if you like our work!** +**"Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) repositories if you like our work!** # 7. Acknowledgements diff --git a/blogs/deepspeed-fastgen/chinese/README.md b/blogs/deepspeed-fastgen/chinese/README.md index fb9cc7319ab6..2885799912ae 100644 --- a/blogs/deepspeed-fastgen/chinese/README.md +++ b/blogs/deepspeed-fastgen/chinese/README.md @@ -23,11 +23,11 @@ GPT-4 和 LLaMA 这样的大型语言模型(LLMs)已在各个层次上成为了集成 AI 的主流服务应用。从常规聊天模型到文档摘要,从自动驾驶到各个软件中的Copilot功能,这些模型的部署和服务需求正在迅速增加。像 DeepSpeed、PyTorch 和其他几个框架可以在 LLM 训练期间实现良好的硬件利用率。但它们在与用户互动及处理开放式文本生成等任务时,受限于这些操作的计算密集度相对较低,现有系统往往在推理吞吐量上遇到瓶颈。 -为了解决这一问题, [vLLM](https://arxiv.org/pdf/2309.06180.pdf) 这样由 PagedAttention 驱动的框架和 [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) 这样的系统显著提高了 LLM 推理的性能。然而,这些系统在面对长提示的工作负载时,依旧难以提供良好的服务质量。随着越来越多的模型(例如 [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b))和系统(例如[DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses))支持延伸到数万个令牌的上下文窗口,这些长提示工作负载变得越来越重要。为了更好地理解问题,我们在下文中提供了详细的示例来说明 LLM 的文本生成是如何在“提示处理”和“生成”的这两个阶段中工作的。当系统将它们视为不同的阶段时,生成阶段将被提示处理所抢占,这可能会破坏服务级别协议(SLAs)。 +为了解决这一问题, [vLLM](https://arxiv.org/pdf/2309.06180.pdf) 这样由 PagedAttention 驱动的框架和 [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) 这样的系统显著提高了 LLM 推理的性能。然而,这些系统在面对长提示的工作负载时,依旧难以提供良好的服务质量。随着越来越多的模型(例如 [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b))和系统(例如[DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses))支持延伸到数万个令牌的上下文窗口,这些长提示工作负载变得越来越重要。为了更好地理解问题,我们在下文中提供了详细的示例来说明 LLM 的文本生成是如何在“提示处理”和“生成”的这两个阶段中工作的。当系统将它们视为不同的阶段时,生成阶段将被提示处理所抢占,这可能会破坏服务级别协议(SLAs)。 今天,我们很高兴地介绍 DeepSpeed-FastGen 框架,它通过采用我们提出的动态 SplitFuse 技术,能够提供比vLLM 等先进系统高出多达 2.3 倍的有效吞吐量。DeepSpeed-FastGen 是 DeepSpeed-MII 和 DeepSpeed-Inference 的结合,提供了一个易于使用的服务系统。 -**快速开始:** 要使用 DeepSpeed-FastGen 只需安装最新的 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) 发行版: +**快速开始:** 要使用 DeepSpeed-FastGen 只需安装最新的 [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) 发行版: ```bash pip install deepspeed-mii @@ -207,7 +207,7 @@ DeepSpeed-FastGen 提供了副本级负载均衡,可以将请求均匀分布 ## 5. DeepSpeed-FastGen:软件实现与使用指南 -DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII) 和 [DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed) 的协同组合,如下图所示。这两个软件包共同提供了系统的各个组成部分,包括前端 API、用于使用动态 SplitFuse 调度批次的主机和设备基础设施、优化的内核实现,以及构建新模型实现的工具。 +DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII) 和 [DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed) 的协同组合,如下图所示。这两个软件包共同提供了系统的各个组成部分,包括前端 API、用于使用动态 SplitFuse 调度批次的主机和设备基础设施、优化的内核实现,以及构建新模型实现的工具。
@@ -217,7 +217,7 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII 使用我们的 alpha 版 DeepSpeed-FastGen 最快的入门方式是:`pip install deepspeed-mii`。 -请按照我们的 [入门指南](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii) 获取更多细节。如需使用和报告问题,请使用 [DeepSpeed-MII Github 仓库](https://github.com/microsoft/DeepSpeed-MII)。 +请按照我们的 [入门指南](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii) 获取更多细节。如需使用和报告问题,请使用 [DeepSpeed-MII Github 仓库](https://github.com/deepspeedai/DeepSpeed-MII)。 ### A. 支持的模型 @@ -226,13 +226,17 @@ DeepSpeed-FastGen 是 [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII * [LLaMA](https://huggingface.co/models?other=llama) 和 [LLaMA-2](https://huggingface.co/models?other=llama-2) * [Mistral](https://huggingface.co/models?other=mistral) * [OPT](https://huggingface.co/models?other=opt) +* [Falcon](https://huggingface.co/models?other=falcon) +* [Mixtral](https://huggingface.co/models?other=mixtral) +* [Phi-2](https://huggingface.co/models?other=phi-msft) +* [Qwen](https://huggingface.co/models?other=qwen) 所有当前模型都利用了后端的 [HuggingFace](https://github.com/huggingface) API 来提供模型权重和模型对应的分词器。 -> 我们计划在最初发布后的几周和几个月内添加更多模型。如果您希望支持特定的模型架构,请[提交问题](https://github.com/microsoft/DeepSpeed-MII/issues)来让我们知道。 +> 我们计划在最初发布后的几周和几个月内添加更多模型。如果您希望支持特定的模型架构,请[提交问题](https://github.com/deepspeedai/DeepSpeed-MII/issues)来让我们知道。 ### B. 部署选项 -以下所有示例均可在 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) 中运行。安装后,您有两种部署方式:交互式非持久管道或持久化服务部署: +以下所有示例均可在 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) 中运行。安装后,您有两种部署方式:交互式非持久管道或持久化服务部署: #### 非持久管道 @@ -270,20 +274,20 @@ client.terminate_server() ### C. 高级安装方式 -为了使用方便并显著减少许多其他框架所需的冗长编译时间,我们通过名为 [DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels) 的新库分发了覆盖我们大部分自定义内核的预编译 Python wheel。我们发现这个库在环境中非常便携,只要这些环境具有 NVIDIA GPU 计算能力 8.0+(Ampere+)、CUDA 11.6+ 和 Ubuntu 20+。在大多数情况下,您甚至不需要知道这个库的存在,因为它是 DeepSpeed-MII 的依赖项,并将自动与之一起安装。然而,如果您因任何原因需要手动编译我们的内核,请参阅我们的[高级安装文档](https://github.com/microsoft/DeepSpeed-Kernels#source)。 +为了使用方便并显著减少许多其他框架所需的冗长编译时间,我们通过名为 [DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels) 的新库分发了覆盖我们大部分自定义内核的预编译 Python wheel。我们发现这个库在环境中非常便携,只要这些环境具有 NVIDIA GPU 计算能力 8.0+(Ampere+)、CUDA 11.6+ 和 Ubuntu 20+。在大多数情况下,您甚至不需要知道这个库的存在,因为它是 DeepSpeed-MII 的依赖项,并将自动与之一起安装。然而,如果您因任何原因需要手动编译我们的内核,请参阅我们的[高级安装文档](https://github.com/deepspeedai/DeepSpeed-Kernels#source)。 # 6. 
尝试 DeepSpeed-FastGen 我们非常高兴分享 DeepSpeed-FastGen 的首个 alpha 版本。 -* 要开始,请访问我们的 DeepSpeed-MII GitHub 页面: [GitHub 登陆页面](https://github.com/microsoft/DeepSpeed-MII) +* 要开始,请访问我们的 DeepSpeed-MII GitHub 页面: [GitHub 登陆页面](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGen 是更大的 DeepSpeed 生态系统的一部分,该生态系统包含了多种深度学习系统和建模技术。要了解更多, * 请访问我们的[网站](https://www.deepspeed.ai/),详细查看博客文章、教程和有用的文档。 -* 您也可以通过我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed)、[日本 Twitter](https://twitter.com/MSFTDeepSpeedJP) 和[中文知乎](https://www.zhihu.com/people/deepspeed) 关注我们,以获取 DeepSpeed 的最新消息。 +* 您也可以通过我们的[英文 Twitter](https://twitter.com/DeepSpeedAI)、[日本 Twitter](https://twitter.com/DeepSpeedAI_JP) 和[中文知乎](https://www.zhihu.com/people/deepspeed) 关注我们,以获取 DeepSpeed 的最新消息。 -DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上报告问题、贡献 PR,并参与讨论。有关更多详细信息,请参见我们的[贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们愿意与大学、研究实验室和公司合作,比如那些在深度学习研究上共同工作,应用 DeepSpeed 来赋能真实世界的 AI 模型和应用等。对于那些不适合在 GitHub 上提出的请求(以及其他请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上报告问题、贡献 PR,并参与讨论。有关更多详细信息,请参见我们的[贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们愿意与大学、研究实验室和公司合作,比如那些在深度学习研究上共同工作,应用 DeepSpeed 来赋能真实世界的 AI 模型和应用等。对于那些不适合在 GitHub 上提出的请求(以及其他请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 以下项目在我们的路线图上,我们计划通过我们的 GitHub 问题和 PR 与我们的社区在这些项目上进行交流: @@ -292,7 +296,7 @@ DeepSpeed 欢迎您的贡献!我们鼓励您在 [DeepSpeed GitHub](https://git - 通过与合作伙伴的合作支持新硬件后端 - 发布性能测试套件(例如此博客中生成的图表) -如果您喜欢我们的工作,请为我们的 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) 仓库打上“星标”! +如果您喜欢我们的工作,请为我们的 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 和 [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) 仓库打上“星标”! # 7. 
致谢 diff --git a/blogs/deepspeed-fastgen/japanese/README.md b/blogs/deepspeed-fastgen/japanese/README.md index 9729854afcf0..2ce25e62c551 100644 --- a/blogs/deepspeed-fastgen/japanese/README.md +++ b/blogs/deepspeed-fastgen/japanese/README.md @@ -24,14 +24,14 @@ AIを様々な目的に利用する幅広いアプリケーションで、GPT-4やLLaMAのような大規模言語モデル(LLM)が、主要なワークロードになってきています。一般的なチャットモデルから、文書の要約、自動運転、ソフトウェアスタックの各層におけるプログラミングの補助まで、これらのモデルを大規模に展開・提供する需要が急増しています。DeepSpeedやPyTorchをはじめとするフレームワークは、一般に、LLMの訓練では良好なハードウェアの利用効率を達成できるものの、オープンエンドのテキスト生成などの課題では、GPUなどのハードウェア上で一度に実行される計算量が少ないことが、既存システムにおいて推論スループットのボトルネックとなっています。 PagedAttentionを搭載した [vLLM](https://arxiv.org/pdf/2309.06180.pdf) や [Orca](https://www.usenix.org/system/files/osdi22-yu.pdf) のような既存システムは、こうした課題を解決するために設計され、LLMの推論性能を大幅に向上させました。しかしこれらのシステムは依然として、特に長いプロンプトを含むワークロードにおいて、一貫したサービス品質の提供という点で課題を残しています。 -数千トークンに及ぶコンテキストウィンドウをサポートするモデルやシステム、例えば [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) や [DeepSpeed Ulysses](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) などが増えるにつれて、これらの長いプロンプトのワークロードはますます重要になってきています。 +数千トークンに及ぶコンテキストウィンドウをサポートするモデルやシステム、例えば [MPT-StoryWriter](https://www.mosaicml.com/blog/mpt-7b) や [DeepSpeed Ulysses](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses) などが増えるにつれて、これらの長いプロンプトのワークロードはますます重要になってきています。 これらの問題をより深く理解するために、LLMによるテキスト生成がどのように機能するか説明します。LLMによるテキスト生成は、プロンプト処理と生成と呼ばれる2つの異なるフェーズから構成されます。システムがこれらを全く独立に扱うと、生成のフェーズは、プロンプト処理によって中断されることになります。その結果、システムのレイテンシなどを定めた SLA (Service Level Agreement) に違反する可能性が高くなります。 このブログで紹介するDeepSpeed-FastGenは、新たに提案するDynamic SplitFuse技術などを活用することでこうした課題を解決し、vLLMなどの最新の既存システムと比較して最大2.3倍の実効スループットを実現するシステムです。 DeepSpeed-FastGenは、DeepSpeed-MIIとDeepSpeed-Inferenceの組み合わせにより、使いやすいテキスト生成機能を実現します。 -**クイックスタート:** 最新の[DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)をインストールするだけで、 DeepSpeed-FastGenを試すことができます。 +**クイックスタート:** 最新の[DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII)をインストールするだけで、 DeepSpeed-FastGenを試すことができます。 ```bash @@ -218,7 +218,7 @@ A100 GPUを用いた分析に加えて、H100とA6000を使用したベンチマ ## 5. DeepSpeed-FastGen: 実装と使い方 -DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII)と[DeepSpeed-Inference](https://github.com/microsoft/DeepSpeed)を融合的に組み合わせたものです。これらのソフトウェアパッケージは、フロントエンドAPI、Dynamic SplitFuseを使用してバッチをスケジュールするホストおよびデバイスインフラストラクチャ、最適化されたカーネル実装、新しいモデル実装を構築するためのツールなど、システムの様々なコンポーネントを提供します。 +DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-MII](https://github.com/deepspeedai/DeepSpeed-MII)と[DeepSpeed-Inference](https://github.com/deepspeedai/DeepSpeed)を融合的に組み合わせたものです。これらのソフトウェアパッケージは、フロントエンドAPI、Dynamic SplitFuseを使用してバッチをスケジュールするホストおよびデバイスインフラストラクチャ、最適化されたカーネル実装、新しいモデル実装を構築するためのツールなど、システムの様々なコンポーネントを提供します。
@@ -228,7 +228,7 @@ DeepSpeed-FastGenは、以下の図に示されているように、[DeepSpeed-M DeepSpeed-FastGenのアルファリリースを使い始める最も簡単な方法は、 ``pip install deepspeed-mii`` を実行することです。 -詳細については、[Getting Started](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii)ガイドを参照してください。使用法や問題の報告には、[DeepSpeed-MII Github リポジトリ](https://github.com/microsoft/DeepSpeed-MII)を使用してください。 +詳細については、[Getting Started](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii)ガイドを参照してください。使用法や問題の報告には、[DeepSpeed-MII Github リポジトリ](https://github.com/deepspeedai/DeepSpeed-MII)を使用してください。 ### A. 対応モデル @@ -240,11 +240,11 @@ DeepSpeed-FastGenのアルファリリースを使い始める最も簡単な方 現在のすべてのモデルは、モデルの重みとモデルに対応するトークナイザーの両方を提供するために、バックエンドで [HuggingFace](https://github.com/huggingface) を利用しています。 -初期リリース後の数週間と数ヶ月に追加のモデルを追加する予定です。サポートを希望する特定のモデルアーキテクチャがある場合は、[issue](https://github.com/microsoft/DeepSpeed-MII/issues) を登録してください。。 +初期リリース後の数週間と数ヶ月に追加のモデルを追加する予定です。サポートを希望する特定のモデルアーキテクチャがある場合は、[issue](https://github.com/deepspeedai/DeepSpeed-MII/issues) を登録してください。。 ### B. デプロイメントのオプション -以下の例はすべて [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/mii) で実行可能です。インストール後、デプロイメントのオプションとして、対話型の非永続パイプラインまたは永続的なサービス提供デプロイメントの2つのオプションがあります。 +以下の例はすべて [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/mii) で実行可能です。インストール後、デプロイメントのオプションとして、対話型の非永続パイプラインまたは永続的なサービス提供デプロイメントの2つのオプションがあります。 #### 非永続パイプライン @@ -284,21 +284,21 @@ client.terminate_server() ### C. インストールの詳細情報 類似の他のプロジェクトでは、カスタムカーネルのコンパイルに非常に時間がかかることがよくあります。 -DeepSpeed-FastGenでは、このコンパイル時間を大幅に短縮し、利便性を向上するため、主要なカスタムカーネルの大部分を事前コンパイルしたPython wheelを、[DeepSpeed-Kernels](https://github.com/microsoft/DeepSpeed-Kernels)という新しいライブラリを通じて配布しています。 +DeepSpeed-FastGenでは、このコンパイル時間を大幅に短縮し、利便性を向上するため、主要なカスタムカーネルの大部分を事前コンパイルしたPython wheelを、[DeepSpeed-Kernels](https://github.com/deepspeedai/DeepSpeed-Kernels)という新しいライブラリを通じて配布しています。 このライブラリは、NVIDIA GPUのコンピュート能力が8.0以上(Ampere+)、CUDA 11.6以上、Ubuntu 20以上の環境で非常に移植性が高いことがわかっています。 -このライブラリは、DeepSpeed-MIIの依存関係としてインストールされるため、ほとんどの場合では、このライブラリの存在を知る必要はありません。しかし、何らかの理由でカーネルを手動でコンパイルする必要がある場合は、インストールに関する[詳細ドキュメント](https://github.com/microsoft/DeepSpeed-Kernels#source)をご覧ください。 +このライブラリは、DeepSpeed-MIIの依存関係としてインストールされるため、ほとんどの場合では、このライブラリの存在を知る必要はありません。しかし、何らかの理由でカーネルを手動でコンパイルする必要がある場合は、インストールに関する[詳細ドキュメント](https://github.com/deepspeedai/DeepSpeed-Kernels#source)をご覧ください。 # 6. 
DeepSpeed-FastGen を使ってみる このDeepSpeed-FastGenアルファリリースをユーザの皆さんと共有できることを非常に嬉しく思います。 -* 使用を始めるにあたっては、DeepSpeed-MIIのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeed-MII) +* 使用を始めるにあたっては、DeepSpeed-MIIのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/deepspeedai/DeepSpeed-MII) DeepSpeed-FastGenは、Deep Learningシステムやモデリングテクノロジーを数多く含む、より大きなDeepSpeedエコシステムの一部です。さらに詳しい情報が必要な方は、 [詳細なブログ記事]、チュートリアル、役立つドキュメントがある私たちの [ウェブサイト](https://www.deepspeed.ai/) をご覧ください。 -DeepSpeedの最新情報については、[英語のTwitter](https://twitter.com/MSFTDeepSpeed)、[日本語のTwitter](https://twitter.com/MSFTDeepSpeedJP)、[中国語の知乎](https://www.zhihu.com/people/deepspeed)をフォローしてください。 +DeepSpeedの最新情報については、[英語のTwitter](https://twitter.com/DeepSpeedAI)、[日本語のTwitter](https://twitter.com/DeepSpeedAI_JP)、[中国語の知乎](https://www.zhihu.com/people/deepspeed)をフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。[contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) にはより詳細な情報があります。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。[contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) にはより詳細な情報があります。 また、深層学習の研究や、実世界のAIモデルやアプリケーションへのDeepSpeedの適用に取り組む大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 以下の項目は、今後のロードマップです。GitHubの問題やPRを通じてコミュニティと協力して取り組む予定です: @@ -308,7 +308,7 @@ DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeed - パートナーとのコラボレーションによる新しいハードウェアバックエンド - ブログに掲載したプロットを生成するパフォーマンスベンチマークのリリース -このプロジェクトが気に入ったら、ぜひ [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) と [DeepSpeedMII GitHub](https://github.com/microsoft/DeepSpeed-MII/) のリポジトリに "スター" をつけてください。 +このプロジェクトが気に入ったら、ぜひ [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) と [DeepSpeedMII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) のリポジトリに "スター" をつけてください。 # 7. 謝辞 diff --git a/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md new file mode 100644 index 000000000000..a880836ea2e2 --- /dev/null +++ b/blogs/deepspeed-fp6/03-05-2024/README-Chinese.md @@ -0,0 +1,143 @@ +
+ +# DeepSpeed-FP6:大型语言模型中以FP6为核心的强大推理服务 + +
+ +
+
+(hero figure: assets/hero-figure.png)
+
+ + +要引用DeepSpeed-FP6,请引用以下两篇arxiv报告 - ZeroQuant(4+2) 和 FP6-LLM: + +``` +@article{wu2023zeroquant, + title={Zeroquant(4+2): Redefining llms quantization with a new fp6-centric strategy for diverse generative tasks}, + author={Wu, Xiaoxia and Xia, Haojun and Youn, Stephen and Zheng, Zhen and Chen, Shiyang and Bakhtiari, Arash and Wyatt, Michael and Aminabadi, Reza Yazdani and He, Yuxiong and Ruwase, Olatunji and Song, Leon and others}, + journal={arXiv preprint arXiv:2312.08583}, + year={2023} +} + +@article{xia2024fp6, + title={FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design}, + author={Xia, Haojun and Zheng, Zhen and Wu, Xiaoxia and Chen, Shiyang and Yao, Zhewei and Youn, Stephen and Bakhtiari, Arash and Wyatt, Michael and Zhuang, Donglin and Zhou, Zhongzhu and others}, + journal={arXiv preprint arXiv:2401.14112}, + year={2024} +} +``` + + +# Table of Contents +1. [为什么选择6位浮点(FP6)](#introduction) +2. [FP6的系统支持](#system-fp6) +3. [FP6的LLMs服务系统](#serving-llm) +4. [如何开始](#how-to-start) +5. [软件改进](#software-improvements) +6. [致谢和贡献](#ac) + +# 1. 为什么选择6位浮点 +大型语言模型(LLMs)领域正处于迅猛发展之中,模型量化是提升推理服务性能的关键技术之一。 我们的研究旨在提高计算效率和存储空间,同时保持模型质量。 + +**深入研究INT4的挑战** 在最近的研究成果 ZeroQuant(4+2)[1] 中, 我们探索了INT4量化技术(如GPTQ算法) 在大语言模型(LLMs)中的表现能力。虽然这些技术可以减小模型大小和参数存储量,但由于过拟合问题, 它们在更一般的许多任务中往往表现不佳,包括代码生成和摘要等更多生成任务。因此, 当前迫切需要新的方法来提高LLMs的效率和有效性。 + + **FP6的突破** 我们对不同量化方法的探索将我们带到了FP6精度标准。尽管FP6数据格式在当前AI硬件的高效支持中存在挑战(我们将在下一节中解决这一挑战),该格式在各种任务的性能和灵活性方面均表现出色。值得注意的是,使用FP6量化的模型,如StarCoder-15B,在代码生成方面达到了与FP16模型相当的结果,而较小的模型(如BART-406M)在摘要方面达到了标准FP16性能水平。为了提高FP6在当前主流AI硬件上的执行效率,我们提出了一种4+2新颖的FP6 GPU kernel方案。这一创新使FP6成为提高LLMs效率的有效途径。更多详细信息请参阅我们的研究论文 ZeroQuant(4+2)[1]。 + + +# 2. FP6的系统支持 + +**开创性的全栈GPU KERNEL设计** FP6量化的一个挑战是缺乏针对这种不规则位宽的高效GPU KERNEL设计。在我们最近的研究中(FP6-LLM[2]),我们设计并实现了TC-FPx,第一个具有Tensor Core支持的用于FP6和各种量化位宽(6位、5位、3位等)的浮点权重的GPU系统设计方案,缓解了LLM推理期间的“内存墙”问题。TC-FPx打破了底层GPU硬件的限制,允许GPU支持涉及任意位宽模型权重的矩阵乘法计算。在TC-FPx中,Tensor Cores用于矩阵乘法的密集计算,而SIMT cores在运行时有效地用于权重反量化,将模型权重反量化为FP16类型,Tensor Core基于此进行计算。它具有以下关键创新: +
+ fp6 design + +
+ +* 运行前比特层级的数据排布转换。用以解决权重具有不规则位宽时不友好的内存访问挑战,实现GPU内存的最优访问; + +* 运行时的高效SIMT计算。用以最小化权重反量化的运行时开销; + +* 全栈的高效流水线设计。其SIMT计算、Tensor Core计算和GPU内存访问进行高效调度,最大程度提升性能。 + + + +平均而言,我们的FP6 kernel在NVIDIA A100 GPU上进行(因decoder的矩阵形状狭长而导致参数矩阵的访存成为瓶颈的)矩阵乘法时,处理速度比FP16 cuBLAS基准提高了2.1倍。值得注意的是,通过FP6量化实现的FP6内核使LLaMA-70b模型能够在单个A100 GPU上运行。这一显著成就使得其在batch小于32的LLM推理任务中,性能比FP16基准高出1.69到2.65倍。目前,TC-FPx内核仅支持NVIDIA Ampere GPU,并且仅在A100 GPU上进行了测试和验证。 + + +# 3. 使用FP6服务LLMs + +我们已成功将FP6量化内核[3]集成到DeepSpeed-FastGen中,实现了运行时的即时量化。这一增强功能允许通过DeepSpeed-FastGen中的统一配置选项来高效量化和部署大型语言模型。通过我们的接口,用户可以输入HuggingFace模型名称或本地checkpoint目录。输入后,我们的系统将启动指定模型的加载,对每个线性层实现FP6量化,并将量化的权重进行比特层级的数据排布转换。转换后的张量随后作为更新后的权重,而原始的FP16权重被丢弃以优化内存使用。在推理阶段,FP6内核将利用这些6位的权重进行计算。 + +我们在两个A100 GPU-80G上评估了LLaMA-2-70b模型使用FP6量化的服务性能,实现了1.5倍的推理延迟减少和3.5倍的推理吞吐量增加,与FP16基线相比。FP6量化为模型推理提供了两个关键好处:它使大型语言模型(LLMs)能够在更少的GPU上部署——例如,LLaMA-70b在单个A100-80G GPU上就能以FP6形式运行,而FP16模型至少需要两个GPU。此外,它显著加快了小batch之下内存访问为瓶颈的线性层计算。此外,FP6量化减少了模型权重的GPU内存需求,允许同时服务更多查询,从而提高了服务吞吐量。 + +我们的系统在处理长序列生成时表现出很高的效率。如图1所示,对于超过提示长度的生成长度,我们的系统展现出显著的性能优势。随着生成序列长度的延伸,FP6与FP16之间的性能差异加大。这一趋势主要归因于解码长度扩展时,推理过程变得越来越受内存访问瓶颈限制,有利于我们的权重量化的GPU kernel,相对于FP16实现更大的kernel速度提升。需要强调的是,较长解码场景中内存访问瓶颈增强的两个因素如下: + +首先,KV缓存的内存使用随序列长度增加而增加,减少了可容纳的batch大小并导致线性层的矩阵计算瓶颈变为参数的访存。 + +其次,在DeepSpeed-FastGen的prefill-decoding-mixed-batch技术背景下,对于decoding较长的情况,用于和decoding进行mixed-batching的prefill切块会相对不足,这导致纯粹用于decoding的batch频率增加,进一步加剧了访存的瓶颈。 +

+ (三张服务性能图:assets/servingllm/100-250.png、100-500.png、100-1000.png,说明见下方图1)

+ +图1:在DeepSpeed-MII中,使用128个请求和32个客户端,对LLaMA-2-70B模型在2xA100-80g上进行端到端服务性能测试。我们尝试了128、256和512之间不同数量的请求,发现加速效果相似。 + +尽管FP6量化带来了显著的好处,但当前实现仍面临一些限制。值得注意的是,在GEMM因batch较大或有充足的GPU内存而使得瓶颈变为Tensor Core计算时,我们的仅限权重的量化kernel可能无法保持其性能优势,尤其是与厂商的优化库如cuBlas相比。然而,我们系统的低内存占用仍是一个关键优势。目前的支持限于非混合专家(Non-MoE)结构,我们正在努力将支持扩展到MoE结构。此外,当前系统仅与FP16输入模型兼容,因为当前实现的FP6 kernel仅支持处理FP16的激活。 + +
+ +# 4. 如何开始 + +DeepSpeed-FP6的量化和推理体验简单方便。这里我们以LLaMa-2-70B模型为例: +```python +import mii +pipe = mii.pipeline("NousResearch/Llama-2-70b-hf", quantization_mode='wf6af16') +response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) +print(response) +``` + +您需要安装以下内容 + +``` +pip install deepspeed-mii +pip install qtorch +``` + +要使用我们的DeepSpeed-FP6进行基准测试,请访问以下脚本: +```bash +https://github.com/deepspeedai/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh +``` + +也请访问[FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) 获取FP6的独立kernel。不要忘了给仓库加星标以表达您的支持! + + +# 5. 软件改进 + + +我们的DeepSpeed-FP6目前仅支持线性GEMM。我们期待未来能够支持MoE GEMM。我们将继续根据您的反馈和支持改进DeepSpeed-FP6。DeepSpeed-FP6是更大DeepSpeed生态系统的一部分,包括一系列深度学习系统和建模技术。要了解更多, + +* 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。 +* 在我们的 [英文 X(Twitter)](https://twitter.com/DeepSpeedAI)、[日语 X(Twitter)](https://twitter.com/DeepSpeedAI_JP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 + +我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 + +* 如果你喜欢我们的工作,请在[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/), [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) 和 [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/)仓库“点赞”! + + +# 6. 致谢和贡献 +我们感谢悉尼大学和罗格斯大学的合作。我们还感谢开源库 [aspuru-guzik-group/qtorch](https://github.com/aspuru-guzik-group/qtorch). + +贡献: +Xiaoxia Wu\* $^1$, Zhen Zheng\* $^1$, Haojun Xia\* $^2$, Arash Bakhtiari $^1$, Michael Wyatt $^1$, Shiyang Chen $^3$, Stephen Youn $^1$, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase $^1$, Zhewei Yao, Leon Song $^1$ $^2$(项目负责人) + +\* 平等贡献 1: 微软 2: 悉尼大学 3: 罗格斯大学 + +文献: + +[1] ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks. arXiv. https://arxiv.org/abs/2312.08583 + +[2] FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design. arXiv. https://arxiv.org/abs/2401.14112 + +[3] FP6-LLM kernel release. GitHub. https://github.com/usyd-fsalab/fp6_llm diff --git a/blogs/deepspeed-fp6/03-05-2024/README.md b/blogs/deepspeed-fp6/03-05-2024/README.md new file mode 100755 index 000000000000..9455938d38fc --- /dev/null +++ b/blogs/deepspeed-fp6/03-05-2024/README.md @@ -0,0 +1,147 @@ +
+ +# DeepSpeed-FP6: The Power of FP6-Centric Serving for Large Language Models + +
+ +
+
+(hero figure: assets/hero-figure.png)
+
+ + +To cite DeepSpeed-FP6, please cite the following two arxiv reports - ZeroQuant(4+2) and FP6-LLM: + +``` +@article{wu2023zeroquant, + title={Zeroquant(4+2): Redefining llms quantization with a new fp6-centric strategy for diverse generative tasks}, + author={Wu, Xiaoxia and Xia, Haojun and Youn, Stephen and Zheng, Zhen and Chen, Shiyang and Bakhtiari, Arash and Wyatt, Michael and Aminabadi, Reza Yazdani and He, Yuxiong and Ruwase, Olatunji and Song, Leon and others}, + journal={arXiv preprint arXiv:2312.08583}, + year={2023} +} + +@article{xia2024fp6, + title={FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design}, + author={Xia, Haojun and Zheng, Zhen and Wu, Xiaoxia and Chen, Shiyang and Yao, Zhewei and Youn, Stephen and Bakhtiari, Arash and Wyatt, Michael and Zhuang, Donglin and Zhou, Zhongzhu and others}, + journal={arXiv preprint arXiv:2401.14112}, + year={2024} +} +``` + + +# Table of Contents +1. [Why 6-bit Floating Point (FP6)](#introduction) +2. [System Support for FP6](#system-fp6) +3. [LLMs Serving with FP6](#serving-llm) +4. [How to Start](#how-to-start) +5. [Software Improvements](#software-improvements) +6. [Acknowledgments and Contributions](#ac) + +# 1. Why 6-bit Floating Point (FP6) + + +In the evolving landscape of Large Language Models (LLMs) like GPT, our research aims to boost computational efficiency and storage while preserving model quality. This focus brings us to tackle the complex challenges of 4-bit quantization, where optimizing performance, efficiency, and accuracy is crucial. + +**Exploring the Challenges of 4-bit Quantization** In our recent research findings -- ZeroQuant (4+2)[1], we explore the capabilities of INT4 quantization techniques (like the GPTQ algorithm) for serving Large Language Models (LLMs). While these techniques reduce memory and computational requirements, they often perform poorly on a broad array of tasks, including generative tasks such as code generation and summarization, due to overfitting issues. This highlights the urgent need for new quantization approaches that simultaneously improve both the efficiency and effectiveness of LLMs. + +**Breakthroughs with FP6 Precision** Our exploration of different quantization methods led us to the FP6 precision standard. Despite the challenges in integrating and accelerating FP6 with current AI hardware -- which we will address in the next section - this format excels in performance and flexibility across various tasks. Notably, we observe that for generative tasks, FP6 quantization can match the performance of the half-precision (FP16) format. For example, with FP6 quantization, StarCoder-15B achieves comparable code generation results to the FP16 variant, while a smaller model, such as BART-460M, achieves comparable summarization performance to the standard FP16 equivalent. In order to preserve these quality gains, while matching the system efficiency of INT4 quantization on AI hardware, we propose a novel 4+2 FP6 scheme. This innovation makes FP6 a promising direction for improving the efficiency of LLMs, marking a significant leap in AI technology advancement. For more details, please refer to our research paper - ZeroQuant (4+2)[1]. + + +# 2. System Support for FP6 + +**Pioneering Full-Stack GPU Kernel Design** A key challenge of FP6 quantization is the lack of efficient GPU kernel designs for this irregular, i.e., "non-power of 2", bit-width. 
In our recent research — FP6-LLM [2], we introduce TC-FPx, the first full-stack GPU system design scheme with unified Tensor Core support of floating point weights for FP6 and other irregular quantization bit-widths (6-bit, 5-bit, 3-bit, etc.). TC-FPx breaks the limitations of the underlying GPU hardware, allowing the GPU to support linear layer calculations on model weights of arbitrary bit width. By increasing the number of bit-width options for efficient quantization, TC-FPx significantly mitigates the "memory wall" challenges of LLM inference. In TC-FPx, Tensor Cores are utilized for intensive computation of matrix multiplications, while SIMT cores are effectively leveraged for weight dequantization, transforming the x-bit model weights to FP16 type during runtime before feeding them to Tensor Cores. It has the following key innovations: +
+ fp6 design + +
+
+* *Ahead-of-time Bit-level Pre-packing*: resolves the challenge of unfriendly memory access for weights with irregular bit-widths, enabling optimal GPU memory access.
+
+* *SIMT-Efficient GPU Runtime*: minimizes the runtime overhead of weight de-quantization.
+
+* *The software pipeline of the TC-FPx kernel*: efficiently utilizes SIMT cores, Tensor Cores, and the GPU memory hierarchy for high performance.
+
+On average, the TC-FPx kernel demonstrates a 2.1-fold improvement in processing speed over the FP16 cuBLAS benchmark during memory-intensive General Matrix Multiply (GEMM) operations on NVIDIA A100 GPUs. Notably, the implementation of the FP6 kernel through FP6 quantization facilitates the operation of LLaMA-70b on a single A100 GPU. This remarkable feat results in a normalized inference throughput that is 1.69 to 2.65 times higher than the FP16 benchmark when conducting inference tasks with a batch size under 32. Currently, the TC-FPx kernel only supports NVIDIA Ampere GPUs and has only been tested and verified on A100 GPUs.
+
+
+# 3. LLMs Serving with FP6
+
+We have successfully integrated the FP6 quantization kernel [3] into DeepSpeed-FastGen, facilitating on-the-fly, weight-only quantization. This enhancement permits the efficient quantization and deployment of large language models (LLMs) through a unified configuration option within DeepSpeed-FastGen. Detailed information regarding this feature will be provided in due course. Through our interface, users have the flexibility to load a model checkpoint from either the HuggingFace hub or a local directory. While loading the checkpoint, our system applies FP6 round-to-nearest quantization on each linear layer and transforms the quantized weights into 6-bit prepacked tensors. These tensors then serve as the model weights for inference, while the original FP16 weights are discarded to release memory. Throughout the inference stage, the FP6 kernels leverage the 6-bit prepacked weights, ensuring a seamless experience for users engaging with our platform.
+
+We assessed the LLaMA-70b model's serving performance using FP6 quantization on two A100-80G GPUs, and observed a *1.5x* reduction in inference latency and a *3.5x* increase in inference throughput compared to the FP16 baseline. FP6 quantization offers two key benefits for model inference: it enables the deployment of large language models (LLMs) on fewer GPUs (for instance, LLaMA-70b fits on a single A100-80G GPU with FP6, versus at least two GPUs required for the FP16 baseline). Additionally, it significantly accelerates linear layers in memory-bound scenarios, which are common in LLM inference. Moreover, FP6 quantization reduces the GPU memory requirements for model weights, allowing more queries to be served simultaneously and thus increasing serving throughput.
+
+Our system demonstrates exceptional efficiency in handling long generation sequences. As illustrated in Figure 1, for generation lengths surpassing the prompt length, our system exhibits a notable performance advantage. The disparity in performance between FP6 and the FP16 baseline widens as the generation sequence length grows. This trend is primarily attributed to the inference process becoming increasingly memory-bound as the decoding length expands, which favors our weight-quantized GPU kernels by enabling faster compute relative to the FP16 baseline. It is important to highlight two factors contributing to the increased memory constraints in longer decoding scenarios.
+ - Firstly, the memory usage for the KV cache escalates with the sequence length, reducing the feasible batch sizes and leading to memory-bound GEMM operations. + - Secondly, within the context of DeepSpeed-FastGen's prefill-decoding-mixed-batch technique, scenarios involving extended token generation encounter a reduction in prefill-chunks available for mixing with decodings. This results in a higher frequency of batches dedicated solely to decodings, further intensifying the memory-bound conditions. + +

+ (three serving-performance plots: assets/servingllm/100-250.png, 100-500.png, 100-1000.png; see the Figure 1 caption below)

+
+  *Figure 1*: End-to-end serving performance in DeepSpeed-MII with 32 clients and a total of 128 requests, for the LLaMA-2-70B model on 2xA100-80G with two-way tensor parallelism. We experimented with different numbers of requests (128, 256, and 512) and found that the speedup is similar.
+
+Despite the significant benefits of FP6 quantization, the current implementation faces limitations. Notably, in scenarios where GEMM operations become compute-bound due to large batch sizes or sufficient GPU memory, our weight-only quantization kernel may not sustain its latency advantage, especially against optimized libraries like cuBLAS. However, our system's memory efficiency remains a key benefit. Currently, support is limited to Non-Mixture of Experts (Non-MoE) structures, with efforts underway to extend support to MoE structures. Additionally, the system is compatible only with FP16 input models, as the FP6 kernel processes FP16 activations exclusively.
+
+ +# 4. How to begin with DeepSpeed-FP6 + +The quantization-and-inference experience of DeepSpeed-FP6 is straightforward and convenient. Here we give an example based on LLaMa-2-70B model: + +```python +import mii +pipe = mii.pipeline("NousResearch/Llama-2-70b-hf", quantization_mode='wf6af16') +response = pipe(["DeepSpeed is", "Seattle is"], max_new_tokens=128) +print(response) +``` + +You need to install the following: +``` +pip install deepspeed-mii +pip install qtorch +``` + +To benchmark with our DeepSpeed-FP6, please visit the following script: +```bash +https://github.com/deepspeedai/DeepSpeedExamples/blob/master/benchmarks/inference/mii/run_fp6.sh +``` + +Please also visit the [FP6-LLM github](https://github.com/usyd-fsalab/fp6_llm) for the standalone kernel of FP6. Don't forget to star the repo to show your support! + + +# 5. Software Improvements + + +Currently, DeepSpeed-FP6 supports only dense models with MoE models support upcoming. We will continue to improve DeepSpeed-FP6 with your feedback and support. DeepSpeed-FP6 is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more, + +* Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. +* Follow us on our [English X(Twitter)](https://twitter.com/DeepSpeedAI), [Japanese X(Twitter)](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. + +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. + +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeed-MII GitHub](https://github.com/deepspeedai/DeepSpeed-MII/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! + + +# 6. Acknowledgments and Contributions +We thank the collaboration of the University of Sydney and Rutgers University. We also thank the open-source library [aspuru-guzik-group/qtorch](https://github.com/aspuru-guzik-group/qtorch). + +Contributions: +Xiaoxia Wu\* $^1$, Zhen Zheng\* $^1$, Haojun Xia\* $^2$, Arash Bakhtiari $^1$, Michael Wyatt $^1$, Shiyang Chen $^3$, Stephen Youn $^1$, Reza Yazdani Aminabadi, Yuxiong He, Olatunji Ruwase $^1$, Zhewei Yao, Leon Song $^1$ $^2$ (project lead) + +\* Equal Contribution +1: Microsoft +2: University of Sydney +3: Rutgers University + +Reference: + +[1] ZeroQuant(4+2): Redefining LLMs Quantization with a New FP6-Centric Strategy for Diverse Generative Tasks. arXiv. https://arxiv.org/abs/2312.08583 + +[2] FP6-LLM: Efficiently Serving Large Language Models Through FP6-Centric Algorithm-System Co-Design. arXiv. https://arxiv.org/abs/2401.14112 + +[3] FP6-LLM kernel release. GitHub. 
https://github.com/usyd-fsalab/fp6_llm diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png b/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png new file mode 100644 index 000000000000..5024332a8f33 Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/fp6-design.png differ diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png b/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png new file mode 100644 index 000000000000..61a5061dc954 Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/hero-figure.png differ diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png new file mode 100644 index 000000000000..c1095ee0053b Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-1000.png differ diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png new file mode 100644 index 000000000000..aeeaab55466d Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-250.png differ diff --git a/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png new file mode 100644 index 000000000000..eb3c1ac12a7b Binary files /dev/null and b/blogs/deepspeed-fp6/03-05-2024/assets/servingllm/100-500.png differ diff --git a/blogs/deepspeed-gds/README.md b/blogs/deepspeed-gds/README.md new file mode 100644 index 000000000000..536b6f984af0 --- /dev/null +++ b/blogs/deepspeed-gds/README.md @@ -0,0 +1,88 @@ +
+ +# DeepNVMe: Improving DL Applications through I/O Optimizations + +
+
+# Introduction
+
+Deep Learning (DL) continues to drive unprecedented advancements across important Artificial Intelligence domains including language, speech, video, and multimodal applications. A key factor in these advancements is dramatic scalability on multiple dimensions including model size, sequence length, and hardware parallelism. From a system perspective, DL scalability puts significant pressure on essential subsystems including computation, memory, communication, and storage. However, existing DL optimization efforts have mostly neglected the storage subsystem, making I/O operations such as data loading, model checkpointing, and offloading the main bottlenecks of large-scale DL. To address this problem, DeepSpeed has created a suite of I/O optimizations collectively called DeepNVMe.
+
+DeepNVMe improves the performance and efficiency of I/O-bound DL applications by accelerating I/O operations and reducing hardware requirements. It achieves this by leveraging storage innovations such as Non-Volatile Memory Express (NVMe) Solid State Drives (SSDs) and NVIDIA Magnum IO™ GPUDirect® Storage (GDS). In this blog, we show the benefits of DeepNVMe using microbenchmarks and an inference application. In experiments conducted on an Azure NC96ads\_A100\_v4 VM, we observed that DeepNVMe saturates the available NVMe bandwidth for data transfers with GPU or CPU memory, achieving up to 10GB/sec reads and 5GB/sec writes.
+
+# Background
+High-performance access to persistent storage is a common challenge in many computing domains, including DL. Thus, a significant number of hardware and software solutions have been proposed. DeepNVMe builds on three such solutions: (1) NVMe SSDs, (2) NVIDIA GDS, and (3) Linux Asynchronous I/O (libaio). We will briefly describe each of these technologies.
+
+NVMe SSDs are flash-based storage devices that are replacing much slower hard disk drives (HDDs) as primary persistent storage in modern servers. For example, an Azure NC96ads\_A100\_v4 VM is equipped with four NVMe SSDs which are individually capable of 3.25 GB/sec reads and can be combined in a RAID-0 configuration for a theoretical aggregate read bandwidth of 13 GB/sec. NVIDIA GDS enables direct transfers between NVMe and GPU memory, thus avoiding the inefficiencies of the traditional approach of staging data through intermediate CPU memory (a bounce buffer). NVIDIA GDS is generally available in CUDA versions 11.4 and above. Finally, libaio is an asynchronous I/O stack introduced in Linux to better extract the raw performance of fast storage devices like NVMe SSDs compared to the traditional I/O stack.
+
+# DeepNVMe: an Optimization Module for Deep Learning I/O
+
+DeepNVMe is a Python module that we developed with two key design principles. First, it leverages the storage technologies discussed above to implement powerful optimizations such as non-blocking I/O operations, bulk submission of I/O operations, parallelization of an individual I/O operation, and a lightweight runtime. Second, it exposes these I/O optimizations through a simple POSIX-like interface to foster easy integration into DL applications while avoiding the complexities of the underlying technologies.
+
+# Evaluation
+
+Our experiments are conducted on an Azure NC96ads\_A100\_v4 VM with setup details summarized in Table 1. For multi-device experiments, the SSDs are combined in a RAID-0 configuration.
+
+
+Table 1: Experimental setup details +
+
+## Microbenchmark Performance
+
+We used three benchmarking tools for our evaluations. The first is fio, the popular I/O benchmarking tool written in C. The second is gdsio from NVIDIA for benchmarking GDS performance. The third is ds\_io, a Python tool that we created for easy integration with DeepNVMe and to be more representative of DL applications, which are commonly Python-based.
+
+## High-Performance I/O with CPU Buffers via NVMe Scaling
+
+Our first set of microbenchmark evaluations used fio and ds\_io to measure the performance of transferring 1GB of data between NVMe and CPU memory. We configured fio to use the libaio backend for these experiments. The results are summarized in Figure 1, from which we make two observations. First, DeepNVMe demonstrates high performance as it roughly matches fio, despite being more representative of DL applications. Second, DeepNVMe scales I/O performance almost linearly with the available NVMe bandwidth, achieving rates of 10GB/sec reads and 5GB/sec writes.
+
+Figure 1: Using DeepNVMe to scale data transfers between NVMe and CPU buffer +
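+
+To make the POSIX-like interface concrete, the sketch below shows what a DeepNVMe-style read of a file from NVMe into pinned CPU memory could look like. This is a minimal illustration, not the ds\_io tool itself: the handle construction arguments and the `async_pread`/`wait` method names are assumptions for illustration.
+
+```python
+# Minimal sketch (assumed API names): read a file from NVMe into pinned CPU memory.
+import os
+import torch
+from deepspeed.ops.op_builder import AsyncIOBuilder
+
+aio_ops = AsyncIOBuilder().load()
+# Assumed tuning knobs: block size, queue depth, single_submit, overlap_events, thread count.
+handle = aio_ops.aio_handle(1024 * 1024, 32, False, True, 1)
+
+path = "/mnt/nvme_raid0/input.bin"  # hypothetical file on the RAID-0 NVMe volume
+buffer = torch.empty(os.path.getsize(path), dtype=torch.uint8).pin_memory()
+
+handle.async_pread(buffer, path)  # non-blocking submission of the read request
+num_completed = handle.wait()     # block until the read completes
+print(f"completed {num_completed} request(s), read {buffer.numel()} bytes")
+```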
+ +## High-Performance I/O with GPU Buffers via NVMe Scaling + +Our second set of microbenchmark evaluations used gdsio and ds\_io to measure the performance of 1GB data transfer between NVMe and GPU memory. For this experiment, we configure ds\_io to use both the traditional bounce buffer approach and the more efficient GDS approach. The results are summarized in Figure 2, from which we make three observations. First, we see that GDS improves performance in DeepNVMe compared to the traditional bounce buffer approach, with up to 37% speedup. Second, DeepNVMe demonstrates high performance by matching (and sometimes surpassing) gdsio despite being more representative of DL applications. Third, we see that DeepNVMe, with and without GDS, scales I/O performance with available NVMe bandwidth. With GDS, DeepNVMe achieves a maximum of 9.6GB/sec reads and 5GB/sec writes, and without GDS achieves 7GB/sec reads and 4GB/sec writes. + + + +
+Figure 2: Using DeepNVMe to scale data transfers between NVMe and GPU memory +
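+
+The difference between the two paths compared in Figure 2 can be summarized in code. The sketch below is conceptual: the handle objects and their `async_pread`/`wait` methods are stand-ins for DeepNVMe's interface, not its exact API.
+
+```python
+# Conceptual comparison of the NVMe -> GPU data paths (assumed handle API).
+import torch
+
+def bounce_buffer_read(handle, path: str, nbytes: int) -> torch.Tensor:
+    """Traditional path: NVMe -> pinned CPU buffer -> copy into GPU memory."""
+    cpu_buf = torch.empty(nbytes, dtype=torch.uint8).pin_memory()
+    handle.async_pread(cpu_buf, path)  # read into the CPU bounce buffer
+    handle.wait()
+    return cpu_buf.to("cuda", non_blocking=True)  # extra hop over PCIe
+
+def gds_read(gds_handle, path: str, nbytes: int) -> torch.Tensor:
+    """GDS path: DMA directly into GPU memory, skipping the CPU bounce buffer."""
+    gpu_buf = torch.empty(nbytes, dtype=torch.uint8, device="cuda")
+    gds_handle.async_pread(gpu_buf, path)  # assumes a GDS-enabled handle accepts CUDA tensors
+    gds_handle.wait()
+    return gpu_buf
+```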
+
+## ZeRO-Inference: Generative AI Performance
+
+ZeRO-Inference is an AI democratization technology that reduces the hardware cost of inferencing massive models by using DeepNVMe to offload model weights to CPU or NVMe memory. ZeRO-Inference is well suited for throughput-oriented applications, such as offline inferencing, and for scenarios with limited hardware budgets. We use a token generation workload to evaluate DeepNVMe performance for NVMe offloading.
+
+## High-Performance Offloading via NVMe Scaling
+
+We measure the generation throughput of inferencing a LLAMA3-70B model on a single NVIDIA A100-80GB with a prompt length of 512, a generation length of 32, and a batch size of 96. We scale the number of NVMe SSDs from 1 to 4 and present the results for ZeRO-Inference with and without GDS in Figure 3. We make two observations from these results. First, GDS consistently provides better performance than the bounce buffer approach, achieving 10-18% faster token generation. Second, DeepNVMe, with and without GDS, scales generation performance with the available NVMe bandwidth. With four NVMe SSDs, DeepNVMe achieves generation throughput rates of 7 tokens per second with GDS and 6 tokens per second without GDS. Our profiling results suggest that DeepNVMe will continue to scale with more NVMe bandwidth, making it an economical option for boosting generative application performance.
+
+Figure 3: Using DeepNVMe to scale LLAMA3-70B token generation performance with NVMe offloading. +
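+
+For readers who want to try this style of offloading, the following is a minimal ZeRO-Inference-style configuration sketch. The keys follow DeepSpeed's ZeRO stage-3 configuration schema; the NVMe path and buffer values are illustrative assumptions, not the exact settings behind Figure 3.
+
+```python
+# Minimal sketch of a DeepSpeed config that offloads parameters to NVMe via DeepNVMe.
+ds_config = {
+    "train_micro_batch_size_per_gpu": 1,
+    "fp16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {
+            "device": "nvme",
+            "nvme_path": "/local_nvme",  # assumed mount point of the RAID-0 NVMe volume
+            "pin_memory": True,
+        },
+    },
+    "aio": {  # asynchronous I/O tuning knobs surfaced through the DeepSpeed config
+        "block_size": 1048576,
+        "queue_depth": 8,
+        "single_submit": False,
+        "overlap_events": True,
+    },
+}
+# The dict would typically be passed to deepspeed.initialize(model=..., config=ds_config)
+# before running token generation with the wrapped model.
+```
+
+Whether the NVMe transfers go through a bounce buffer or GDS is handled by the DeepNVMe backend; the results in Figure 3 report both variants.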
+
+# Summary
+
+In this blog post, we introduced DeepNVMe, an I/O optimization technology created to tackle the emergence of I/O operations as key bottlenecks of Deep Learning scalability. DeepNVMe enables fast and efficient data transfers between persistent storage and DL application memory through optimizations built on popular storage technologies such as NVMe SSDs and NVIDIA GDS. We showed the benefits of using DeepNVMe for LLAMA3-70B token generation on a single A100-80GB GPU with NVMe offloading, for which it achieves up to 7 tokens per second in generation throughput on an Azure NC96ads\_A100\_v4 VM. DeepNVMe will be open-sourced and generally available in DeepSpeed versions >= [0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0). In future blogs, we will report DeepNVMe improvements for other I/O-bound DL applications such as model checkpointing and data loading.
+
+
+# Acknowledgements
+This work is the result of a deep collaboration between Microsoft and NVIDIA. The contributors include Joe Mayer, Martin Cai, and Olatunji Ruwase from Microsoft; and Kiran Modukuri, Vahid Noormofidi, Sourab Gupta, and Sandeep Joshi from NVIDIA.
diff --git a/blogs/deepspeed-gds/chinese/README.md b/blogs/deepspeed-gds/chinese/README.md
new file mode 100644
index 000000000000..9fa9d7150c42
--- /dev/null
+++ b/blogs/deepspeed-gds/chinese/README.md
@@ -0,0 +1,77 @@
+
+ +# DeepNVMe: 通过I/O优化提高深度学习应用性能 + +
+ +# 引言 + +深度学习(DL)在语言、语音、视频和多模态应用等重要人工智能领域不断推动着前所未有的进展。这些进展的关键因素是模型大小、序列长度和硬件并行性等多个维度上的显著可扩展性。从系统角度来看,深度学习的可扩展性给计算、内存、通信和存储等关键子系统带来了巨大的压力。然而,现有的深度学习优化工作大多忽略了存储子系统,使得数据加载、模型检查点和卸载等I/O操作成为大规模深度学习中的主要瓶颈。为了解决这个问题,DeepSpeed开发了一整套I/O优化技术,统称为DeepNVMe。 + +DeepNVMe通过加速I/O操作和减少硬件需求,提高了I/O受限的深度学习应用的性能和效率。它通过利用存储创新,如非易失性内存快速通道(NVMe)固态硬盘(SSD)和NVIDIA Magnum IOTM GPUDirect®存储(GDS)实现这一目标。在本文中,我们通过微基准测试和推理应用来展示DeepNVMe的优势。在对Azure NC96ads\_A100\_v4虚拟机进行的实验中,我们观察到DeepNVMe能充分利用可用的NVMe带宽进行GPU或CPU内存的数据传输,读取速度达到10GB/秒,写入速度达到5GB/秒。 + +# 背景 + +高性能访问持久存储是许多计算领域(包括深度学习)中的一个常见挑战。因此,已经提出了大量的硬件和软件解决方案。DeepNVMe基于三种解决方案: (1) NVMe SSDs,(2) NVIDIA GDS,(3) Linux异步I/O(libaio)。我们将简要介绍每项技术。 + +NVMe SSDs是基于闪存的存储设备,正在取代传统的硬盘驱动器(HDD),成为现代服务器的主要持久存储。例如,Azure NC96ads\_A100\_v4虚拟机配备了四个NVMe SSD,每个SSD可提供3.25GB/秒的读取速度,并且可以组合成RAID-0配置,理论上的总读取带宽为13GB/秒。NVIDIA GDS可以实现NVMe和GPU内存之间的直接数据传输,从而避免了传统使用中间CPU内存(缓冲区)方法的低效。NVIDIA GDS在CUDA 11.4及以上版本中可用。最后,libaio是Linux引入的异步I/O栈,它比传统的I/O栈更有效地提取NVMe SSD等高速存储设备的原始性能。 + +# DeepNVMe: 深度学习I/O优化模块 + +DeepNVMe是一个Python模块,我们开发时遵循了两个关键设计原则。首先,它利用上述存储技术,实现了强大的优化,如非阻塞I/O操作、批处理I/O操作提交、单个I/O操作的并行化以及轻量级运行时。其次,它通过一个简单的POSIX-like接口让用户使用I/O优化,便于深度学习应用集成,同时避免了底层技术的复杂性。 + +# 评估 + +我们的实验在Azure NC96ads\_A100\_v4虚拟机上进行,实验设置的详细信息见表1。对于多设备实验,SSD是以RAID-0配置组合使用的。 + + + +
+表1: 实验设置详细信息 +
+ +## 微基准性能测试 + +我们使用了三种基准测试工具进行评估。第一个是fio,这是一个用C语言编写的流行I/O基准测试工具。第二个是来自NVIDIA的gdsio,用于基准测试GDS性能。第三个是ds\_io,这是我们创建的Python工具,便于与DeepNVMe集成,并且更能代表常见的基于Python的深度学习应用。 + +## 通过NVMe扩展CPU缓冲区,从而提高I/O性能 + +我们的第一组微基准评估使用fio和ds\_io,测量1GB数据在NVMe和CPU内存之间的传输性能。我们配置fio使用libaio后端进行这些实验。结果总结在图1中,我们可以得出两个结论。首先,DeepNVMe表现出高性能,尽管它更能代表深度学习应用,但其性能与fio大致相当。其次,DeepNVMe的I/O性能几乎与可用的NVMe带宽成线性扩展,达到了10GB/秒的读取速度和5GB/秒的写入速度。 + + + +
+图1: 使用DeepNVMe扩展NVMe与CPU缓冲区之间的数据传输 +
+ +## 通过NVMe扩展GPU缓冲区,从而提高I/O性能 + +我们的第二组微基准评估使用gdsio和ds\_io,测量1GB数据在NVMe和GPU内存之间的传输性能。在此实验中,我们配置ds\_io同时使用传统的缓冲区方法和更高效的GDS方法。结果总结在图2中,我们可以得出三个结论。首先,我们看到GDS提高了DeepNVMe的性能,相比传统缓冲区方法,速度提高了最多37%。其次,DeepNVMe表现出高性能,尽管它更能代表深度学习应用,但其性能与gdsio相匹配(有时甚至超过)。第三,我们看到DeepNVMe,无论是否使用GDS,都能根据可用的NVMe带宽扩展I/O性能。使用GDS时,DeepNVMe的读取速度最高达到9.6GB/秒,写入速度为5GB/秒;不使用GDS时,读取速度为7GB/秒,写入速度为4GB/秒。 + + + +
+图2: 使用DeepNVMe扩展NVMe与GPU内存之间的数据传输 +
+ +## ZeRO-Inference: 生成式AI性能 + +ZeRO-Inference是一项AI普及技术,通过使用DeepNVMe将模型权重卸载(Offload)到CPU或NVMe内存,降低了推理大规模模型的硬件成本。ZeRO-Inference非常适合于面向吞吐量的应用,如离线推理,和硬件预算有限的场景。我们使用token生成工作负载来评估DeepNVMe在NVMe卸载下的性能。 + +## 通过NVMe扩展的高性能卸载(Offload) + +我们测量了在单个NVIDIA A100-80GB上推理LLAMA3-70B模型的生成吞吐量,使用512的提示长度、32的生成长度和96的批量大小。我们将NVMe SSD的数量从1扩展到4,并呈现了ZeRO-Inference在有GDS和没有GDS的情况下的结果,如图3所示。我们从这些结果中得出两个结论。首先,GDS始终提供比传统缓冲区方法更好的性能,token生成速度提高了10-18%。其次,DeepNVMe,无论是否使用GDS,都能根据可用的NVMe带宽扩展生成性能。在四个NVMe SSD的情况下,DeepNVMe的生成吞吐量分别为每秒7个token(使用GDS)和每秒6个token(不使用GDS)。我们的分析结果表明,DeepNVMe将在更多的NVMe带宽下继续扩展,是提升生成应用性能的经济选择。 + + + +
+图3: 使用DeepNVMe通过NVMe卸载(offload)扩展LLAMA3-70B的token生成性能 +
+ +# 总结 + +在本文中,我们介绍了DeepNVMe,一项为了解决I/O操作成为深度学习可扩展性关键瓶颈而创建的I/O优化技术。DeepNVMe通过基于流行存储技术(如NVMe SSD和NVIDIA GDS)的优化,实现了持久存储与深度学习应用内存之间的快速高效数据传输。我们展示了在Azure NC96ads\_A100\_v4虚拟机上,DeepNVMe通过NVMe卸载支持LLAMA3-70B的token生成,最高达到每秒7个token的生成吞吐量。DeepNVMe将在DeepSpeed版本>= 0.15.0中开源,并广泛发布。在未来的博客中,我们将报告DeepNVMe在其他I/O受限的深度学习应用中的改进,如模型检查点和数据加载。 + +# 致谢 + +这项工作是微软和NVIDIA之间深入合作的结果。贡献者包括微软的Joe Mayer、Martin Cai和Olatunji Ruwase;NVIDIA的Kiran Modukuri、Vahid Noormofidi、Sourab Gupta和Sandeep Joshi。 diff --git a/blogs/deepspeed-gds/japanese/README.md b/blogs/deepspeed-gds/japanese/README.md new file mode 100644 index 000000000000..26320d00ab94 --- /dev/null +++ b/blogs/deepspeed-gds/japanese/README.md @@ -0,0 +1,77 @@ +
+ +# DeepNVMe: I/O最適化による深層学習アプリケーションの高速化 + +
+ +# はじめに + +深層学習(Deep Learning)は、言語、音声、ビデオ、マルチモーダルアプリケーションなどの重要なAIの応用領域において、かつてない進歩を続けています。この進歩の鍵となる要因は、モデルサイズ、シーケンス長、ハードウェア並列性などの複数の次元での劇的なスケーラビリティです。システムの観点から見ると、深層学習のスケーラビリティは計算、メモリ、通信、ストレージなどの重要なサブシステムに大きな負荷をかけます。しかし、既存の取り組みは、ストレージサブシステムの最適化はほとんど扱われておらず、データロード、モデルチェックポイント、オフロードなどのI/O操作が大規模な深層学習の主要なボトルネックとなっています。この問題に対処するために、DeepSpeedは一連のI/O最適化機能を「DeepNVMe」と呼ばれる形で提供します。 + +DeepNVMeは、I/O操作の高速化とハードウェア要件の緩和によって、I/Oがボトルネックとなる深層学習アプリケーションのパフォーマンスと効率を向上させます。これを実現するために、Non-Volatile Memory Express(NVMe)やSSD、NVIDIA Magnum IO ``TM `` GPUDirect® Storage(GDS)などのストレージ技術を活用しています。このブログでは、マイクロベンチマークと推論アプリケーションの性能評価結果に基づいて、DeepNVMeの利点を示します。Azure NC96ads_A100_v4 VMで実施された実験では、DeepNVMeがGPUまたはCPUメモリへのデータ転送で利用可能なNVMe帯域幅を最大限に活用し、最大10GB/秒の読み取りと5GB/秒の書き込みを達成しました。 + +# 背景 + +永続ストレージへの高性能アクセスは、深層学習を含む多くのコンピューティングドメインで共通の課題です。これに対して、多くのハードウェアおよびソフトウェアソリューションが提案されています。DeepNVMeは、以下の3つのソリューションを基に構築されています。(1) NVMe SSD、(2) NVIDIA GDS、(3) Linux非同期I/O(libaio)。これらの技術について簡単に説明します。 + +NVMe SSDは、現代のサーバーで主要な永続ストレージとして、従来の遅いハードディスクドライブ(HDD)に取って代わるフラッシュベースのストレージデバイスです。たとえば、Azure NC96ads_A100_v4 VMには4つのNVMe SSDが装備されており、それぞれが3.25 GB/秒の読み取り速度を持ち、RAID-0構成で組み合わせると理論上の合計読み取り帯域幅は13 GB/秒となります。NVIDIA GDSは、NVMeとGPUメモリ間の直接転送を可能にすることで、中間のCPUメモリ(バウンスバッファ)を使用する従来のアプローチの非効率を回避します。NVIDIA GDSは、CUDAバージョン11.4以上で利用可能です。最後に、libaioは、従来のI/Oスタックと比較して、NVMe SSDのような高速ストレージデバイスの性能をより引き出すためにLinuxに導入された非同期I/Oスタックです。 + +# DeepNVMe: 深層学習のためのI/O最適化モジュール + +DeepNVMeは、以下の2つの主要な設計原則に基づいて開発されたPythonモジュールです。第一に、上記のストレージ技術を活用して、ノンブロッキングI/O操作、I/O操作の一括送信、個々のI/O操作の並列化、軽量なランタイムなどの最適化を実装しています。第二に、これらのI/O最適化をシンプルなPOSIXライクなインターフェースを通じて提供し、深層学習アプリケーションへの容易な統合を促進し、基盤となっている複雑な技術を直接扱うことなく、その性能を活用することを可能にします。 + +# 評価 + +実験は、Azure NC96ads_A100_v4 VMで実施されました。設定の詳細は表1の通りです。 + + + +
+表1: 実験設定の詳細 +
+ +## マイクロベンチマーク + +評価には3つのベンチマークツールを使用しました。一つ目は、C言語で書かれた一般的なI/Oベンチマークツールであるfioです。次に、GDSパフォーマンスのベンチマークを行うためのNVIDIAのgdsioです。最後に、DeepNVMeとの容易な統合のために我々た作成したds_ioです。ds_ioは、深層学習アプリケーションで代表的に使用されるPythonで作成されています。 + +## CPUバッファを使用したNVMeスケーリングによる高性能I/O + +最初のマイクロベンチマーク評価では、fioとds_ioを使用して、NVMeとCPUメモリ間で1GBのデータを転送するパフォーマンスを測定しました。これらの実験ではfioをlibaioバックエンドに設定しました。結果は図1の通りです。ここから、2つの点が読み取れます。第一に、DeepNVMeは、深層学習アプリケーションにおける性能改善を目指したものであるにも関わらず、このマイクロベンチマークでもfioに匹敵する高性能を示しています。第二に、DeepNVMeは、利用可能なNVMe帯域幅にほぼ線形にスケールし、10GB/秒の読み取りおよび5GB/秒の書き込み速度を達成しています。 + + + +
+図1: DeepNVMeを使用したNVMeとCPUバッファ間のデータ転送のスケーリング +
+ +## GPUバッファを使用したNVMeスケーリングによる高性能I/O + +二つ目のマイクロベンチマーク評価では、gdsioとds_ioを使用して、NVMeとGPUメモリ間で1GBのデータ転送のパフォーマンスを測定しました。この実験では、ds_ioを従来のバウンスバッファアプローチとより効率的なGDSアプローチの両方で設定します。結果は図2の通りです。ここから、次の3点が観察できます。第一にGDSを用いるケースで、従来のバウンスバッファアプローチと比較して、DeepNVMeは最大で37%のスピードアップを実現しています。第二に、DeepNVMeは、深層学習アプリケーションのために作成されたものであるにも関わらず、gdsioに匹敵する(時にはそれを上回る)高性能を示します。第三に、DeepNVMeは、GDSの有無にかかわらず、NVMe帯域幅を最大限に活用できます。GDSを使用した場合、DeepNVMeは最大9.6GB/秒の読み取りおよび5GB/秒の書き込み速度を達成し、GDSを使用しない場合は7GB/秒の読み取りおよび4GB/秒の書き込み速度を達成します。 + + + +
+図2: DeepNVMeを使用したNVMeとGPUメモリ間のデータ転送のスケーリング +
+ +## ZeRO-Inference: 生成AIパフォーマンス + +ZeRO-Inferenceは、モデルの重み(パラメータ)をCPUまたはNVMeメモリにオフロードすることで、大規模モデルの推論に必要なハードウェアコストを削減し、限られたハードウェア資源しかないユーザでも大規模モデルを活用できるようにするための技術です。ZeRO-Inferenceは、オフライン推論などのスループット指向のアプリケーションや、ハードウェア予算が限られているシナリオに適しています。DeepNVMeのNVMeオフロードのパフォーマンスを評価するために、トークン生成ワークロードを使用します。 + +## NVMeスケーリングによる高性能オフロード + +LLAMA3-70Bモデルの推論を単一のNVIDIA A100-80GBで、プロンプト長512、生成長32、バッチサイズ96で実行し、生成スループットを測定します。NVMe SSDの数を1から4までスケーリングし、GDSの有無でZeRO-Inferenceの結果を図3に示します。この結果から、2つの観察ができます。第一に、GDSはバウンスバッファアプローチと比較して一貫して優れたパフォーマンスを提供し、トークン生成を10-18%高速化します。第二に、DeepNVMeは、GDSの有無にかかわらず、利用可能なNVMe帯域幅にスケールします。4つのNVMe SSDを使用する場合、DeepNVMeはGDSを使用して1秒あたり7トークン、GDSを使用しない場合は1秒あたり6トークンの生成スループットを達成します。プロファイリング結果は、DeepNVMeがより多くのNVMe帯域幅で引き続きスケールし、生成アプリケーションのパフォーマンスを低コストで向上できることを示しています。 + + + +
+図3: DeepNVMeを使用したLLAMA3-70Bトークン生成パフォーマンスのNVMeオフロードによるスケーリング +
+ +# まとめ + +このブログ記事では、深層学習のスケーラビリティにおいて主要なボトルネックとなるI/O操作を最適化する、DeepNVMeを紹介しました。DeepNVMeは、NVMe SSDやNVIDIA GDSなどのストレージ技術に基づいた最適化を通じて、永続ストレージと深層学習アプリケーションのデータ転送を高速かつ効率的に実現します。Azure NC96ads_A100_v4 VMでの単一A100-80GB GPUを使用したLLAMA3-70Bトークン生成において、DeepNVMeを使用することで、NVMeオフロードで最大7トークン/秒の生成スループットを達成しました。DeepNVMeはオープンソース化され、DeepSpeedバージョン[0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0).以上で利用可能です。今後のブログでは、モデルチェックポイントやデータロードなどの他のI/Oがボトルネックとなる深層学習アプリケーションに対するDeepNVMeの改善について報告します。 + +# 謝辞 + +この成果は、MicrosoftとNVIDIAの協力によるものです。MicrosoftからはJoe Mayer、Martin Cai、Olatunji Ruwase、NVIDIAからはKiran Modukuri、Vahid Noormofidi、Sourab Gupta、Sandeep Joshiが貢献しました。 diff --git a/blogs/deepspeed-gds/media/figure1.png b/blogs/deepspeed-gds/media/figure1.png new file mode 100755 index 000000000000..08db7d2f8afa Binary files /dev/null and b/blogs/deepspeed-gds/media/figure1.png differ diff --git a/blogs/deepspeed-gds/media/figure2.png b/blogs/deepspeed-gds/media/figure2.png new file mode 100755 index 000000000000..35be5d4c4015 Binary files /dev/null and b/blogs/deepspeed-gds/media/figure2.png differ diff --git a/blogs/deepspeed-gds/media/figure3.png b/blogs/deepspeed-gds/media/figure3.png new file mode 100755 index 000000000000..7175236f886b Binary files /dev/null and b/blogs/deepspeed-gds/media/figure3.png differ diff --git a/blogs/deepspeed-gds/media/table1.png b/blogs/deepspeed-gds/media/table1.png new file mode 100755 index 000000000000..bba571369932 Binary files /dev/null and b/blogs/deepspeed-gds/media/table1.png differ diff --git a/blogs/deepspeed-offloadpp/README.md b/blogs/deepspeed-offloadpp/README.md index 1441da5a35c0..f58173b7bc8b 100644 --- a/blogs/deepspeed-offloadpp/README.md +++ b/blogs/deepspeed-offloadpp/README.md @@ -4,7 +4,7 @@ Deep learning has been successfully adopted in a wide range of applications such ZeRO offers memory efficient data parallel training scheme. For training large models like LLMs using ZeRO, GPU memory size is still often insufficient to hold all the model parameters. Thus, ZeRO-Offload is introduced to solve this insufficient GPU memory issue. ZeRO-Offload releases GPU memory pressure by offloading data and compute to the CPU side while minimizing CPU-GPU data copy overhead. Given CPU memory is often orders-of-magnitude larger than GPU memory, ZeRO-Offload was the first piece of work that enables billion-level parameter training even with very limited GPU memory resources (e.g., to an extreme: single GPU). ZeRO-Offload provides excellent performance when model size is multiple times larger than total GPU memory size. -However, system efficiency is still far from optimal when adopting ZeRO-Offload in some scenarios. Especially in the cases like small batch training, model that could not fit into GPU memory but not orders-of-magnitude bigger than GPU memory capacity, CPU offload not only introduce long end-to-end latency, but also underutilize GPU computation resources. To reduce memory copy latency as well as inefficient utilization of GPU introduced in these offload cases, we propose ZeRO-Offload++, which leverages both CPU and GPU coherently. ZeRO-Offload++ mainly includes 3 new features as _Twin-Flow_, MemCpy reduction, CPUAdam optimization. Now we release our __Twin-Flow__ feature. +However, system efficiency is still far from optimal when adopting ZeRO-Offload in some scenarios. 
Especially in the cases like small batch training, model that could not fit into GPU memory but not orders-of-magnitude bigger than GPU memory capacity, CPU offload not only introduce long end-to-end latency, but also underutilized GPU computation resources. To reduce memory copy latency as well as inefficient utilization of GPU introduced in these offload cases, we propose ZeRO-Offload++, which leverages both CPU and GPU coherently. ZeRO-Offload++ mainly includes 3 new features as _Twin-Flow_, MemCpy reduction, CPUAdam optimization. Now we release our __Twin-Flow__ feature. The key benefits are: * With _Twin-Flow_, ZeRO-Offload++ achieves up to **6x** training speedup compared with ZeRO-Offload. @@ -43,7 +43,7 @@ We conduct our performance evaluations over both A100 and H100 DGX machine and t ## Tutorials -Examples and Tutorials are [here](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md) +Examples and Tutorials are [here](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/examples_deepspeed/offload_pp/README.md) ## Contributors: diff --git a/blogs/deepspeed-triton/README.md b/blogs/deepspeed-triton/README.md index 071b5d4bc6d0..57922c5e1a23 100644 --- a/blogs/deepspeed-triton/README.md +++ b/blogs/deepspeed-triton/README.md @@ -19,7 +19,7 @@ Table 1. The average speedup (see NOTE below for more detail) For those transformer operators in float16, we have implemented kernels written in Triton language that replace ordinary CUDA kernels or torch operators. The Triton kernels we implemented include softmax, layer-normalization, residual-addition and all the matrix multiplications except MLP layers (see NOTE below for details). -In our experiments, Triton kernels help to reduce the average latecy (over difference sequence lengths) by 6\~24% (depending on model and hardware) when compared to the latency with CUDA-only kernels. +In our experiments, Triton kernels help to reduce the average latency (over difference sequence lengths) by 6\~24% (depending on model and hardware) when compared to the latency with CUDA-only kernels. Figures below show the latency reduction in more detail. @@ -65,7 +65,7 @@ We use an example of Bert-base here. ```python pip install deepspeed[triton] -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/inference/huggingface/fill-mask deepspeed --num_gpus 1 test-bert.py --triton @@ -76,7 +76,7 @@ To run a performance benchmark, you can use the following command: ```python pip install deepspeed[triton] -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/benchmarks/inference deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype fp16 --kernel-inject --deepspeed --graphs --triton @@ -84,7 +84,7 @@ deepspeed --num_gpus 1 triton-bert-benchmark.py --model bert-base-cased --dtype # NOTE -* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/microsoft/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation. +* For more information on how to use DeepSpeed, please visit our [GitHub Page](https://github.com/deepspeedai/DeepSpeedExamples) and our [website](https://www.deepspeed.ai/), where you can find blog posts, tutorials, and documentation. 
* This feature is currently only supported for BERT, RoBERTa, and other BERT-like models, and not yet for text-generation models. diff --git a/blogs/deepspeed-ucp/README.md b/blogs/deepspeed-ucp/README.md new file mode 100644 index 000000000000..3420e72c238e --- /dev/null +++ b/blogs/deepspeed-ucp/README.md @@ -0,0 +1,273 @@ +
+ +# DeepSpeed Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training + +
+ + + +To cite DeepSpeed Universal Checkpoint, please cite our [arxiv report](https://arxiv.org/abs/2406.18820): + +``` +@article{lian2024-ucp, +title={Universal Checkpointing: Efficient and Flexible Checkpointing for +Large Scale Distributed Training}, +author={Xinyu Lian and Sam Ade Jacobs and Lev Kurilenko and Masahiro Tanaka +and Stas Bekman and Olatunji Ruwase and Minjia Zhang}, +journal={arxiv preprint arxiv:406.18820}, +year={2024}, + +} +``` + +# Introduction + +Checkpointing is a crucial technique for reducing the cost of training +machine learning models, as it enables saving the model state during the process. +This way, if the system fails, the training can resume from the most recent checkpoint +instead of from the beginning. Additionally, checkpointing allows for +evaluating the model performance at various stages of training, which +facilitates hyperparameter tuning and finetuning for different and +varied downstream tasks. + +However, there are challenges in the design, implementation and usage of +checkpointing especially in distributed training and finetuning +scenarios. Parallel training methods such as ZeRO data parallelism (ZeRO-DP), +pipeline parallelism (PP), tensor parallelism (TP) and sequence +parallelism (SP) are popular technologies for accelerating LLMs training. +However, elastic and flexible composition of these different parallelism +topologies with checkpointing is not currently available, in part, because +these techniques shard model and/or optimizer states making it difficult to +resume training with a checkpoint that was created on a different number of GPUs or +accelerators. + +In this release, we are excited to introduce DeepSpeed Universal +Checkpointing (*UCP*), a most comprehensive solution to the problem of +distributed checkpointing. *UCP* enables efficient checkpoint creation +while providing the flexibility of resuming on arbitrary parallelism +strategies and hardware configurations. *UCP* also unlocks unprecedented +capabilities for large-scale training such as improved resilience to +hardware failures through continued training on remaining healthy +hardware, and reduced training time through opportunistic exploitation +of elastic capacity. + +In summary, this release of *UCP* unlocks the following capabilities: + +- Flexible checkpoints reshape along any of the training parallelism + techniques (i.e., PP, TP, DP, ZeRO-DP, SP, MoE) + +- Elastic resource management, scale up or scale down of training and + finetuning accelerator resources + +- Real world examples with support for multiple commercial-scale models + (i.e., BLOOM, Megatron GPT, LLAMA, Microsoft Phi) + +# Core Design + +The key insight of DeepSpeed *UCP* is the selection of the optimal +representation in each phase of the checkpointing life cycle: +distributed representation for saving, and consolidated representation +for loading. This is achieved using two key mechanisms. First, the +universal checkpoint format, which consists of a consolidated +representation of each model parameter, and metadata for mapping +parameter fragments to the ranks of an arbitrary parallel training +configuration. Second, the universal checkpoint language, a simple but +powerful and robust specification language for converting distributed +checkpoints into the universal checkpoint format. + +## Universal Checkpoint Format + + + +Figure 1: UCP overview: top row and bottom row are Source and Target +parallelism configurations respectively. 
The middle row shows UCP as +an intermediate format of translating from Source to Target. + +Figure 1 shows high level schematic description of *UCP* conversion +process and format. Conversion starts with top block of checkpointing in +any parallel format e.g, DP, TP, PP, SP. Saving in the native format of parallel training avoids any overhead of +consolidating into a single global checkpoint. To ensure that +a checkpoint saved in one parallel configuration (herein called *Source*) can be +easily converted and loaded for continuous training in another parallel configuration (herein called *Target*), +we introduce the idea of atomic checkpoint as an intermediate format. + +The concept of atomic checkpoint is central to *UCP*. These are +fine-grained files containing the consolidated representation of each +model parameter, along with optimizer states. The atomic checkpoint +format is useful for three reasons. First, the atomic representation of +checkpoints decouples the dependencies of distributed checkpoints and +specific parallelism techniques and hardware configurations. As such, +one does not need to implement individual converters for each *Source* +and *Target* pair. Instead, *UCP* can act as a common interchange format +between different distributed training techniques, which then can be +easily transformed into other distributed training strategies, as shown +in Fig 2. By keeping the consolidated representation of each model +parameter, *UCP* enables easy splitting and flexible mapping of model states +or fragmented states to different GPUs on a parameter-by-parameter +basis, effectively reducing the working memory needed to load large +model checkpoints. Second, the *UCP* conversion happens lazily and +on-demand, e.g., when a training process detects a change of parallelism +technique and hardware configuration. In other words, the existing +distributed checkpoint saving logic does not need any change. Third, the +structure of the *UCP* also makes it easy to handle advanced techniques +in distributed training, such as mixed-precision training. In practice, +researchers and practitioners may switch between fp16 and bfloat16 mixed +precision training. By keeping the fp32 weight/optimizer values, the +training can resume either with fp16 or bfloat16. + +## Universal Checkpoint Language + + + +Figure 2: UCP language helps transform distributed checkpoints into the +UCP format and load UCP checkpoints based on the Target parallel +technique and new hardware configuration. + + +While *UCP* provides a common interface for different parallelism +strategies, the development of transformation from arbitrary distributed +checkpoints to *UCP* can still incur a high engineering and +implementation cost. This is because the number of distributed checkpoint files +and their contents can vary across the different parallel training techniques. + +To tackle this challenge, *UCP* provides *UCP* language, which is a +simple but powerful specification language for converting a distributed checkpoint +into the atomic checkpoint format, described in previous +section. *UCP* does this in two ways. First, it provides a declarative +system with pre-defined *parameter patterns*, which cover a wide range +of parallelism strategies for model states. Parameter patterns contain +runtime information about how a parameter is partitioned across GPUs. 
+For instance, *nopattern* means that a parameter is uniquely associated +with a GPU rank, which is the most common pattern seen in techniques +such as ZeRO-1/2 and PP (see our technical report for a completed list +of currently supported parameter patterns). Second, *UCP* language +provides a set of common operators that facilitate the transformation of +distributed checkpoints into consolidated atomic checkpoints. At a +high-level, as illustrated in Figure 3, *UCP* language is invoked when +support for a new *Target* is needed or the hardware +configuration changes. It first transforms distributed checkpoints into +the *UCP* format. It then loads the *UCP* checkpoints based on the +*Target* parallel technique and new hardware configuration. + +# Key Results + +We evaluate *UCP* through a series of experiments on training LLMs. We +focus on the decoder-only Transformers: an architecture chosen due to +its state-of-the-art performance. Some of the largest models are also +decoder-based, making flexible and efficient checkpointing especially +important. In this blog, we present results of correctness verification +across different models and parallel strategies. For more results on +parallel efficiency analysis, detailed system and model architectures +and training hyperparameters, please see our technical report referenced +above. + +*UCP* provides flexible checkpointing from a *Source* parallelism +strategy to a different *Target* with different hardware configurations. +To verify this capability, we conduct correctness tests of *UCP* with +two groups of experiments. + +## Single Source to Multiple Target + + + +Figure 3: Training curves of loading UCP checkpoints into different +Target at iteration 101 with various GPU counts and parallelism +strategies + +To test if UCP allows resuming training with different parallelism +strategies and hardware configuration, we first train the GPT-3 model +using a configuration of TP=2, PP=2, DP=2 (ZeRO-1), and SP=1. Due to +constraints in time and resources, we limited the experiment to the +first 200 iterations. We convert the checkpoints saved at the 100th +iteration to *UCP* checkpoints and resume training with these *UCP* +checkpoints using different GPU counts and parallelism strategies. We +record the LM loss (average losses across the data parallel group) for +each iteration. Figure 3 illustrates that the training can be seamlessly +resumed with *UCP* checkpoints using different *Target* parallelism +strategies, achieving consistent convergence if the training were to +continue with the *Source* strategy. + +## Multiple Source to Single Target + + + +Figure 4: Training curves of transforming different Source parallelism +strategies at iteration 100 to UCP and loading UCP with a different +Target. + +Figure 4 shows the training curves from multiple *Source* configurations +to a single *Target*. Given a fixed random seed, we first train the +GPT-3 model using different *Source* configurations. We then convert +their distributed checkpoints saved at the 100th iteration to *UCP* +checkpoints and resume training with a configuration of TP=2, PP=2, +DP=1, and SP=1. The results show that regardless of the different +*Source* configurations, their checkpoints can all be converted into +*UCP* and resume training with a different configuration. Most +importantly, the resumed training curves match the curves from the +*Source* at iterations 101--200. 
These results validate the +effectiveness of *UCP* of converting an arbitrary configuration to a +different configuration for resumed training. + +## Varying Model Architectures + +*UCP* is model architecture agnostic. As such, it is not only compatible +with GPT models but also flexible enough to support various other model +architectures and sizes. Figures 5, 6 and 7 show the training +convergence for LLaMA 7B, BLOOM 176B, and a variant of Mixtral-7x8B MoE, +when resuming from *UCP* at the middle of training with new parallelism +strategies. These figures show that training is seamlessly resumed with +*UCP*, achieving consistent convergence that aligns with the initial +training phase across these diverse models. These results suggest that +*UCP* is quite flexible for various model architectures and sizes. + + + +Figure 5: Training curve with LLaMA model architecture. Source is +TP=PP=DP=2. Training is resumed at iteration 101 with new Targets +TP=DP=2, PP=1 and TP=PP=2, DP=1 + + + +Figure 6: Training curve of BLOOM model architecture. Source is TP=2, +PP=24, DP=8. Training is resumed at iteration 94767 with a new Targets +TP=2, DP=4, PP=24. + + + +Figure 7: Training curve with a variant of the Mixtral-MoE model +architecture. Source is TP=1, PP=2, DP=4. Training is resumed at +iteration 501 with a new Target TP=PP=DP=2. + +# General Availability of DeepSpeed Universal Checkpoint + +We are excited to release DeepSpeed Universal Checkpoint. DeepSpeed +Universal Checkpoint is available in DeepSpeed versions >= +[0.14.4](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.14.4), +has been fully integrated with [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) ([commit c3a13be](https://github.com/deepspeedai/Megatron-DeepSpeed/commit/c3a13be721da0d0de16c338d0d665b0f7d13d14f)). +Detailed tutorial on usage is available on +[DeepSpeed tutorial page](https://www.deepspeed.ai/tutorials/universal-checkpointing/). + +We welcome contributions and collaboration from the broader open-source +community. DeepSpeed Universal Checkpoint is part of the bigger +DeepSpeed ecosystem of large-scale AI training and inference. For more +details on all DeepSpeed technologies and innovations, please visit our +[website]((https://www.deepspeed.ai/)) and follow us +on X, formerly Twitter, ([English](https://twitter.com/DeepSpeedAI), +[Japanese](https://twitter.com/DeepSpeedAI_JP)) +and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed). + +# Acknowledgements and Contributions +We thank the collaboration of University of Illinois at Urbana-Champaign, +Statosphere, and Intel Habana. + +Contributions: +Xinyu Lian $^1$, Sam Ade Jacobs $^2$, Lev Kurilenko $^2$, Masahiro Tanaka $^2$, +Stas Bekman $^3$, Olatunji Ruwase $^2$, Minjia Zhang $^1$, Moshe Island $^4$ + +1: University of Illinois at Urbana-Champaign +2: Microsoft +3: StasoSphere +4: Intel Habana diff --git a/blogs/deepspeed-ucp/chinese/README.md b/blogs/deepspeed-ucp/chinese/README.md new file mode 100644 index 000000000000..4260595671dc --- /dev/null +++ b/blogs/deepspeed-ucp/chinese/README.md @@ -0,0 +1,124 @@ + +
+ +# DeepSpeed通用检查点:用于大规模分布式训练的高效灵活检查点系统 + +
+ + + +要引用DeepSpeed通用检查点,请引用我们的[arxiv报告](https://arxiv.org/abs/2406.18820): + +``` +@article{lian2024-ucp, +title={Universal Checkpointing: Efficient and Flexible Checkpointing for +Large Scale Distributed Training}, +author={Xinyu Lian and Sam Ade Jacobs and Lev Kurilenko and Masahiro Tanaka +and Stas Bekman and Olatunji Ruwase and Minjia Zhang}, +journal={arxiv preprint arxiv:406.18820}, +year={2024}, + +} +``` + +# 引言 + +检查点是降低训练大型语言模型成本的关键技术,它使我们在训练过程中可以保存模型状态。这样,如果训练失败,训练可以从最后保存的点继续,而不是从头开始。此外,检查点还允许在训练的不同阶段评估模型性能,从而便于进行超参数调整以及针对不同和多样化下游任务的微调。 + +然而,在分布式训练和微调场景中设计、实施和使用检查点存在困难。ZeRO数据并行(ZeRO-DP)、流水线并行(PP)、张量并行(TP)和序列并行(SP)等方法是加速大型语言模型训练的出色技术,但与传统的默认(Torch)保存和加载检查点机制不兼容。此外,目前尚无技术支持将这些不同的并行拓扑与检查点灵活组合,部分原因是这些技术将模型和/或优化器状态分片,使得在不同GPU或加速器数量上创建的检查点难以用于恢复训练。 + +在此,我们很高兴地发布DeepSpeed通用检查点(*UCP*),这是解决分布式检查点问题的最全面的解决方案。*UCP*在高效创建检查点的同时,提供了在任意并行策略和硬件配置上恢复的灵活性。*UCP*还解锁了大规模训练的前所未有的能力,例如通过在剩余健康硬件上继续训练来提高对硬件故障的抵抗力,以及通过机会性利用弹性容量来减少训练时间。 + +简单来说,当前版本的*UCP*解锁了以下功能: + +- 灵活的检查点可沿任何训练并行技术(即PP、TP、DP、ZeRO-DP、SP、MoE)重塑训练 + +- 弹性资源管理,在训练和微调中随意增加或减少硬件资源 + +- 支持多种商业规模模型的真实世界用例(例如BLOOM、Megatron GPT、LLAMA、Microsoft Phi) + +# 核心设计 + +DeepSpeed *UCP*的关键洞察是在检查点生命周期的每个阶段选择最佳表示:分布式表示用于保存,合并表示用于加载。这通过两个关键机制实现。首先,通用检查点格式,它包括每个模型参数的合并表示和用于将参数片段映射到任意模型并行配置的训练级别的元数据。其次,通用检查点语言,这是一个简单但强大且健壮的规范语言,用于将分布式检查点转换为通用检查点格式。 + +## 通用检查点格式 + + + +图1:UCP概述:顶部行和底部行分别为源并行配置和目标并行配置。中间行显示UCP作为从源到目标的转换中介块。 + +图1显示了*UCP*转换过程和格式的整体概念性描述。转换从任何并行策略格式的检查点顶部块开始。允许以训练的本地格式保存消除了可能因同步全局检查点保存而产生的任何开销。为确保保存的检查点(称为*源*)可以轻松转换并加载到任何并行策略以进行连续训练(称为*目标*),我们引入了作为中介块的原子检查点格式的概念。 + +原子检查点是*UCP*的核心概念。这些是包含每个模型参数的合并表示及其优化器状态的细粒度文件。原子检查点格式有三个用途。首先,原子检查点的表示解除了分布式检查点与特定并行技术和硬件配置的依赖。因此,无需为每个*源*到*目标*实现单独的转换器。相反,*UCP*可以充当不同分布式训练技术之间的通用交换格式,然后可以轻松地转换为其他分布式训练策略,如图2所示。通过保持每个模型参数的合并表示,*UCP*可以轻松地将模型状态或片段状态拆分并灵活地映射到不同GPU上,有效减少加载大型模型检查点所需的工作内存。其次,*UCP*转换是懒惰和按需进行的,例如,当训练过程检测到并行技术和硬件配置的变化时。换句话说,现有的分布式检查点保存逻辑不需要任何改变。第三,*UCP*的结构还易于处理分布式训练中的高级技术,例如混合精度训练。在实践中,研究人员和从业者可能在fp16和bfloat16混合精度训练之间切换。通过保持fp32的权重/优化器值,训练可以继续使用fp16或bfloat16恢复。 + +## 通用检查点语言 + + + +图2:UCP语言帮助将分布式检查点转换为UCP格式,并根据目标并行技术和新硬件配置加载UCP检查点。 + + +虽然*UCP*为不同的并行策略提供了一个公共接口,但从任意分布式检查点到*UCP*的转换仍然可能具有不菲的工程和实施成本。这是因为分布式训练中的每个GPU都调用一个持久方法(例如,在PyTorch中使用torch.save())将其拥有的GPU模型状态保存到磁盘上的检查点文件中,而每个检查点的具体内容在不同技术之间会有所不同。 + +为了应对这一挑战,*UCP*提供了*UCP*语言,这是一个简单但强大的规范语言,用于将几种类型的分布式检查点转换为前一节中描述的通用格式。*UCP*以两种方式实现这一点。首先,它提供了一个具有预定义*参数模式*的声明式系统,这些模式涵盖了模型状态的广泛并行 + +策略。参数模式包含有关参数如何在GPU之间分区的运行时信息。例如,*nopattern*表示一个参数与某个GPU唯一相关,这是ZeRO-1/2和PP等技术中最常见的模式(参见我们的技术报告,以获得当前支持的参数模式完整列表)。其次,*UCP*语言提供了一组常见操作符,以便将分布式检查点转换为合并的原子检查点。从高层次来看,如图3所示,当需要新的*目标*并行技术或硬件配置发生变化时,将调用*UCP*语言。它首先将分布式检查点转换为*UCP*格式。然后根据*目标*并行技术和新硬件配置加载*UCP*检查点。 + +# 关键结果 + +我们通过一系列实验评估*UCP*,专注于仅解码器的Transformers架构,这是由于其最先进的性能。一些最大的模型也是基于解码器的,这使得灵活高效的检查点尤为重要。在本博客中,我们展示了在不同模型和并行策略下正确性验证的结果。有关并行效率分析、详细的系统和模型架构以及训练超参数的更多结果,请参阅上面引用的技术报告。 + +*UCP*提供了从一个*源*并行策略到不同的*目标*和不同硬件配置的灵活检查点。为验证这一能力,我们进行了正确性测试的两组实验。 + +## 单源到多目标 + + + +图3:在第101次迭代时使用不同目标加载UCP检查点的训练曲线,具有不同GPU数量和并行策略 + +为测试UCP是否允许使用不同并行策略和硬件配置恢复训练,我们首先使用TP=2、PP=2、DP=2(ZeRO-1)和SP=1的配置训练GPT-3模型。由于时间和资源的限制,我们将实验限制在前200次迭代。我们将在第100次迭代保存的检查点转换为*UCP*检查点,并使用不同GPU数量和并行策略恢复训练。我们记录了每次迭代的LM损失(数据并行组的平均损失)。图3显示,训练可以使用不同的*目标*并行策略无缝地使用*UCP*检查点恢复,如果训练继续使用*源*策略,将实现一致的收敛。 + +## 多源到单目标 + + + +图4:在第100次迭代将不同源并行策略转换为UCP并加载UCP的训练曲线,具有不同的目标。 + +图4显示了从多个*源*配置到单一*目标*的训练曲线。在固定随机种子的情况下,我们首先使用不同的*源*配置训练GPT-3模型。然后我们将它们在第100次迭代保存的分布式检查点转换为*UCP*检查点,并使用TP=2、PP=2、DP=1和SP=1的配置恢复训练。结果显示,无论不同的*源*配置如何,它们的检查点都可以转换为*UCP*并使用不同的配置恢复训练。最重要的是,恢复的训练曲线与第101--200次迭代的*源*曲线匹配。这些结果验证了*UCP*将任意配置转换为不同配置以恢复训练的有效性。 + +## 不同模型架构的变化 + 
+*UCP*与模型架构无关。因此,它不仅与GPT模型兼容,而且足够灵活,可以支持各种其他模型架构和大小。图5、6和7显示了使用新并行策略从*UCP*中恢复训练时的训练收敛情况。这些图表显示,训练可以使用*UCP*无缝恢复,实现与初始训练阶段一致的收敛,这与这些不同模型相符。这些结果表明,*UCP*对于各种模型架构和大小都非常灵活。 + + + +图5:使用LLaMA模型架构的训练曲线。源是TP=PP=DP=2。训练在第101次迭代时使用新目标TP=DP=2, PP=1和TP=PP=2, DP=1恢复 + + + +图6:使用BLOOM模型架构的训练曲线。源是TP=2, PP=24, DP=8。训练在第94767次迭代时使用新目标TP=2, DP=4, PP=24恢复。 + + + +图7:使用Mixtral-MoE模型架构变种的训练曲线。源是TP=1, PP=2, DP=4。训练在第501次迭代时使用新目标TP=PP=DP=2恢复。 + +# DeepSpeed通用检查点的普遍可用性 + +我们很高兴发布DeepSpeed通用检查点。DeepSpeed通用检查点已与Megatron-DeepSpeed的重构版本完全集成,并可通过DeepSpeed和Megatron-DeepSpeed的GitHub仓库访问。详细的使用教程可在[DeepSpeed教程页面](https://www.deepspeed.ai/tutorials/universal-checkpointing/)上找到。 + +我们欢迎来自更广泛开源社区的贡献和合作。DeepSpeed通用检查点是大规模AI训练和推理DeepSpeed生态系统的一部分。有关所有DeepSpeed技术和创新的更多详细信息,请访问我们的[网站](https://www.deepspeed.ai/)并在X(前Twitter)([英文](https://twitter.com/DeepSpeedAI),[日文](https://twitter.com/DeepSpeedAI_JP))和[中文知乎](https://www.zhihu.com/people/deepspeed)上关注我们。 + +# 致谢和贡献 +我们感谢伊利诺伊大学厄巴纳-香槟分校、Statosphere和英特尔Habana的合作。 + +贡献者: +Xinyu Lian $^1$, Sam Ade Jacobs $^2$, Lev Kurilenko $^2$, Masahiro Tanaka $^2$, +Stas Bekman $^3$, Olatunji Ruwase $^2$, Minjia Zhang $^1$, Moshe Island $^4$ + +1: 伊利诺伊大学厄巴纳-香槟分校 +2: 微软 +3: Statosphere +4: 英特尔Habana diff --git a/blogs/deepspeed-ucp/japanese/README.md b/blogs/deepspeed-ucp/japanese/README.md new file mode 100644 index 000000000000..24da72298dd8 --- /dev/null +++ b/blogs/deepspeed-ucp/japanese/README.md @@ -0,0 +1,115 @@ +
+ +# DeepSpeed Universal Checkpointing: 大規模分散学習のための効率的かつ柔軟なチェックポイント + +
+ + + +DeepSpeed Universal Checkpointを引用する際は、こちらの[arxiv report](https://arxiv.org/abs/2406.18820)を参照してください。 + +``` +@article{lian2024-ucp, +title={Universal Checkpointing: Efficient and Flexible Checkpointing for +Large Scale Distributed Training}, +author={Xinyu Lian and Sam Ade Jacobs and Masahiro Tanaka and Lev +Kurilenko and Stas Bekman and Olatunji Ruwase and Minjia Zhang}, +journal={arxiv preprint arxiv:406.18820}, +year={2024}, + +} +``` + +# はじめに + +モデルの状態を保存するをチェックポイントは、システム障害が発生した場合に途中から学習を再開するために、LLMのトレーニングコストを削減するための重要な技術です。さらに、学習のさまざまな段階でモデルのパフォーマンスを評価することができるため、ハイパーパラメータの調整や異なる下流タスクのためのファインチューニングが容易になります。 + +しかし、特に分散学習やファインチューニングのシナリオにおいて、チェックポイントの設計、実装、および使用には多くの課題があります。DeepSpeedが備えるZeROを用いたデータ並列化(ZeRO-DP)、パイプライン並列化(PP)、テンソル並列化(TP)、およびシーケンス並列化(SP)などのいくつかの方法は、LLM学習を加速するための優れた技術ですが、一般的なチェックポイント保存と読み込みのメカニズムと互換性がありません。さらに、これらの異なる並列化を用いたエラスティックで柔軟な組み合わせは、現在サポートされていません。主な理由の一つは、こうした並列化技術がモデルおよび/またはオプティマイザの状態を分割するため、異なるGPUまたはアクセラレータの数に基づいて作成されたチェックポイントから学習を再開することが困難であるためです。 + +このリリースでは、分散チェックポイントの問題に対する包括的なソリューションであるDeepSpeed Universal Checkpointing (*UCP*) を紹介します。*UCP*は、任意の並列化戦略とハードウェア構成で再開する柔軟性を提供しながら、効率的なチェックポイント作成を可能にします。また、*UCP*は、ハードウェア障害の際にも、残りの正常なハードウェアでのトレーニングの継続を可能にするため、キャパシティがエラスティックに変化するハードウェアを活用でき、トレーニング時間を短縮するなど、大規模学習を最大限に効率化できます。 + +現在のリリースには、*UCP*の次の機能が含まれます。 + +- 任意のトレーニング並列技術(例:PP、TP、DP、ZeRO-DP、SP、MoE)に沿った柔軟なチェックポイントの再構成 +- ファインチューニングを含む学習およびアクセラレータリソースのエラスティックなリソース管理、スケールアップまたはスケールダウン +- BLOOM、Megatron GPT、LLAMA、Microsoft Phiなどの複数の商用規模モデルのサポートを伴う実利用例 + +# UCPの設計 + +DeepSpeed *UCP*における中心的な考え方は、チェックポイントライフサイクルの各段階で最適な表現を選択することです。保存のための分散表現と、読み込みのための統合表現です。これは、2つの重要なメカニズムを使用して実現されます。一つ目は、各モデルパラメータの統合表現と、パラメータのフラグメントを任意のモデル並列化構成におけるランク(プロセスのインデックス)にマッピングするためのメタデータからなるユニバーサルチェックポイントフォーマットです。二つ目は、分散チェックポイントをユニバーサルチェックポイント形式に変換するためのシンプルで強力かつ堅牢な仕様言語であるユニバーサルチェックポイント言語です。 + +## ユニバーサルチェックポイントフォーマット + + + +図1:*UCP*の概要:上段と下段はそれぞれソースとターゲットの並列化構成です。中央の段は、ソースからターゲットへの翻訳の仲介ブロックとしての*UCP*を示しています。 + +図1は、*UCP*の変換プロセスとフォーマットの抽象レベルの概略図を示しています。変換は、DP、TP、PP、SPなどの任意の並列戦略形式のチェックポイントから始まります。訓練結果のモデルやオプティマイザ状態をネイティブ形式で保存することで、同期されたグローバルチェックポイントの保存に伴うオーバーヘッドを回避します。保存されたチェックポイント(以下、*ソース*と呼びます)を任意の並列戦略に簡単に変換してロードできるようにするために、中間ブロックとして原子チェックポイント (atomic checkpoint) 形式のアイデアを導入します。 + +原子チェックポイントの概念は、*UCP*の中心となるものです。これらは、各モデルパラメータの統合表現とオプティマイザ状態を含む細粒度のファイルです。原子チェックポイント形式は、次の3つの理由で有用です。まず、チェックポイントの原子表現は、分散チェックポイントと特定の並列技術およびハードウェア構成の依存関係を切り離します。そのため、*ソース*から*ターゲット*への個別のコンバータを実装する必要はありません。代わりに、*UCP*は異なる分散トレーニング技術間の共通交換形式として機能し、他の分散トレーニング戦略に簡単に変換できます(図2参照)。各モデルパラメータの統合表現を保持することで、*UCP*はモデル状態またはフラグメント状態をパラメータごとに異なるGPUに柔軟にマッピングし、大規模モデルチェックポイントを読み込むために必要な作業メモリを効果的に削減します。第二に、*UCP*の変換は遅延してオンデマンドで行われます。たとえば、トレーニングプロセスが並列技術とハードウェア構成の変更を検出したときです。つまり、既存の分散チェックポイント保存ロジックには変更が必要ありません。第三に、*UCP*の構造により、混合精度トレーニングなどの高度な技術を分散トレーニングで簡単に処理できます。実際には、研究者や実務者はfp16とbfloat16の混合精度トレーニングを切り替えることがあります。fp32の重み/オプティマイザの値を保持することで、トレーニングはfp16またはbfloat16のいずれかで再開できます。 + +## ユニバーサルチェックポイント言語 + + + +図2:*UCP*言語は、分散チェックポイントを*UCP*形式に変換し、新しいハードウェア構成とターゲットの並列技術に基づいて*UCP*チェックポイントを読み込みます。 + +*UCP*は異なる並列戦略に対する共通インターフェースを提供しますが、任意の分散チェックポイントから*UCP*への変換の開発には依然として高いエンジニアリングおよび実装コストがかかる場合があります。これは、分散トレーニングの各GPUが保存のためのメソッド(例:PyTorchのtorch.save())を呼び出して、所有するGPUモデル状態のチェックポイントファイルをディスクに保存し、各チェックポイントの正確な内容が異なる技術によって異なるためです。 + 
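+
+参考までに、分散チェックポイントを*UCP*形式へ変換し、新しい並列構成で読み込み直すまでの流れを、最小限のスケッチとして以下に示します。変換スクリプトのパスやオプション名(`--input_folder`、`--output_folder`)、および設定キー`load_universal`は、後述のチュートリアルページの記載を想定したものであり、実際のリリースとは異なる可能性があります。
+
+```python
+# 想定例:分散チェックポイント(例:ZeRO / Megatron-DeepSpeed 形式)を UCP 形式へ
+# 変換してから、新しい並列構成(ターゲット)で読み込み直す流れのスケッチ。
+#
+# 変換(DeepSpeed 付属スクリプトを想定):
+#   python deepspeed/checkpoint/ds_to_universal.py \
+#       --input_folder  checkpoints/global_step100 \
+#       --output_folder checkpoints_universal/global_step100
+import torch
+import deepspeed
+
+model = torch.nn.Linear(1024, 1024)  # 実際には学習対象のモデルに置き換える
+ds_config = {
+    "train_micro_batch_size_per_gpu": 1,
+    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
+    "zero_optimization": {"stage": 1},
+    "checkpoint": {"load_universal": True},  # UCP 形式の読み込みを有効化(想定キー)
+}
+engine, _, _, _ = deepspeed.initialize(model=model,
+                                       model_parameters=model.parameters(),
+                                       config=ds_config)
+# 変換済みの UCP チェックポイントから、新しい並列構成で学習を再開
+engine.load_checkpoint("checkpoints_universal", tag="global_step100")
+# 以降は通常どおり学習ループを継続
+```
+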
+この課題に取り組むために、*UCP*は*UCP*言語を提供します。これは、前述の共通形式にいくつかの種類の分散チェックポイントを変換するためのシンプルで強力な仕様言語です。*UCP*はこれを2つの方法で行います。まず、モデル状態の並列戦略の広範な範囲をカバーする事前定義された*パラメータパターン*を持つ宣言型システムを提供します。パラメータパターンには、パラメータがGPU間でどのように分割されているかについてのランタイム情報が含まれています。たとえば、*nopattern*は、パラメータがGPUランクに一意に関連付けられていることを意味し、これはZeRO-1/2やPPなどの技術で最も一般的に見られるパターンです(現在サポートされているパラメータパターンの完全なリストについては、技術レポートを参照してください)。第二に、*UCP*言語は、分散チェックポイントを統合された原子チェックポイントに変換するための一般的な演算子のセットを提供します。抽象的なレベルで見ると、図2に示すように、ターゲットへの移行後に新しい並列技術が必要な場合やハードウェア構成が変更された場合に、*UCP*言語が使用されます。最初に、分散チェックポイントを*UCP*形式に変換し、次にターゲットの並列技術と新しいハードウェア構成に基づいて*UCP*チェックポイントを読み込みます。 + +# 主要な結果 + +我々は、LLMの訓練に関する一連の実験を通じて*UCP*を評価します。デコーダーのみのトランスフォーマーに焦点を当てました。これは最先端のパフォーマンスを持つアーキテクチャです。いくつかの最大のモデルもデコーダーベースであるため、柔軟で効率的なチェックポイントは特に重要です。このブログでは、さまざまなモデルと並列戦略にわたる正確性の検証結果を紹介します。並列効率分析、詳細なシステムおよびモデルアーキテクチャ、および訓練のハイパーパラメータに関する詳細な結果については、上記の技術レポートを参照してください。 + +*UCP*は、異なるハードウェア構成を持つ異なる*ターゲットの*並列戦略に対する*ソース*並列戦略からの柔軟なチェックポイントを提供します。この能力を検証するために、2つの実験グループで*UCP*の正確さを確認しました。 + +## シングルソースから複数のターゲットへ + + + +図3:さまざまなGPU数と並列戦略で*ターゲット*に*UCP*チェックポイントをロードする訓練lossの曲線(イテレーション100で保存・ロード) + +*UCP*が異なる並列戦略とハードウェア構成での訓練再開を可能にするかどうかをテストするために、まずTP=2、PP=2、DP=2(ZeRO-1)、SP=1の構成でGPT-3モデルを訓練します。時間とリソースの制約のため、この実験は最初の200イテレーションに限定しました。100イテレーション目で保存されたチェックポイントを*UCP*チェックポイントに変換し、異なるGPU数と並列戦略を使用してこれらの*UCP*チェックポイントで訓練を再開します。各イテレーションのLM損失(データ並列グループ全体の平均損失)を記録しました。図3は、異なる*ターゲット*並列戦略を使用して*UCP*チェックポイントで訓練をシームレスに再開し、*ソース*戦略を継続して訓練する場合と一致する収束を達成することを示しています。 + +## 複数ソースからシングルターゲットへ + + + +図4:100イテレーション目で異なるソース並列戦略を*UCP*に変換し、異なるターゲットで*UCP*をロードする訓練lossの曲線 + +図4は、複数の*ソース*構成から単一の*ターゲット*へのlossの曲線を示しています。固定されたランダムシードを使用して、まずGPT-3モデルを異なる*ソース*構成で訓練します。次に、100イテレーション目で保存された分散チェックポイントを*UCP*チェックポイントに変換し、TP=2、PP=2、DP=1、SP=1の構成でトレーニングを再開します。結果は、異なる*ソース*構成にもかかわらず、そのチェックポイントはすべて*UCP*に変換され、異なる構成で訓練を再開できることを示しています。最も重要なのは、再開されたlossの曲線が、イテレーション101~200での*ソース*の曲線と一致することです。これらの結果は、訓練再開時に任意の構成を異なる構成に変換する*UCP*の効果を検証しています。 + +## 異なるモデルアーキテクチャへの対応 + +*UCP*はモデルアーキテクチャに依存しません。したがって、GPTモデルとの互換性だけでなく、さまざまなモデルアーキテクチャとサイズをサポートする柔軟性も備えています。図5、6、7は、新しい並列戦略で*UCP*から訓練を再開したときのLLaMA 7B、BLOOM 176B、およびMixtral-7x8B MoEを元にしたモデルのトレーニング収束を示しています。これらの図は、トレーニングが*UCP*でシームレスに再開され、これらの多様なモデル全体で訓練の初期フェーズと一致する収束を達成することを示しています。これらの結果は、さまざまなモデルアーキテクチャとサイズに対する*UCP*の柔軟性を示しています。 + +A graph of training step Description automatically generated + +図5:LLaMAモデルアーキテクチャの訓練lossの曲線。ソースはTP=PP=DP=2。訓練はイテレーション101で新しいターゲットTP=DP=2、PP=1およびTP=PP=2、DP=1で再開しました。 + +A graph with numbers and lines Description automatically generated + +図6:BLOOMモデルアーキテクチャの訓練lossの曲線。ソースはTP=2、PP=24、DP=8。訓練はイテレーション94767で新しいターゲットTP=2、DP=4、PP=24で再開しました。 + +A graph of training step Description automatically generated + +図7:Mixtral-MoEモデルアーキテクチャに基づくモデルの訓練lossの曲線。ソースはTP=1、PP=2、DP=4。訓練はイテレーション501で新しいターゲットTP=PP=DP=2で再開しました。 + +# DeepSpeed Universal Checkpointの一般公開 + +DeepSpeed Universal Checkpointは、リベースされたMegatron-DeepSpeedバージョンに完全に統合されており、DeepSpeedおよびMegatron-DeepSpeedのGitHubリポジトリを通じてアクセスできます。使用に関する詳細なチュートリアルは、[DeepSpeedチュートリアルページ](https://www.deepspeed.ai/tutorials/universal-checkpointing/)にあります。 + +DeepSpeedでは、広範なオープンソースコミュニティからの貢献とコラボレーションを受け入れています。DeepSpeed Universal Checkpointは、大規模AIトレーニングおよび推論のためのDeepSpeedエコシステムの一部です。すべてのDeepSpeed技術とイノベーションについての詳細は、[ウェブサイト](https://www.deepspeed.ai/)をご覧いただき、X(旧Twitter)での[英語](https://twitter.com/DeepSpeedAI)、[日本語](https://twitter.com/DeepSpeedAI_JP)、および[中国のZhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。 + +# 謝辞と貢献 + +University of Illinois at Urbana-Champaign、Statosphere、およびIntel Habanaとの協力に感謝します。 + +コントリビュータ: +Xinyu Lian $^1$, Sam Ade Jacobs $^2$, Lev Kurilenko $^2$, Masahiro Tanaka $^2$, 
Stas Bekman $^3$, Olatunji Ruwase $^2$, Minjia Zhang $^1$, Moshe Island $^4$ + +1: University of Illinois at Urbana-Champaign +2: Microsoft +3: StasoSphere +4: Intel Habana diff --git a/blogs/deepspeed-ucp/media/flowchart.png b/blogs/deepspeed-ucp/media/flowchart.png new file mode 100644 index 000000000000..d5198ca00e03 Binary files /dev/null and b/blogs/deepspeed-ucp/media/flowchart.png differ diff --git a/blogs/deepspeed-ucp/media/image1.png b/blogs/deepspeed-ucp/media/image1.png new file mode 100755 index 000000000000..c9663de91cc2 Binary files /dev/null and b/blogs/deepspeed-ucp/media/image1.png differ diff --git a/blogs/deepspeed-ucp/media/image2.png b/blogs/deepspeed-ucp/media/image2.png new file mode 100644 index 000000000000..4262aa26600f Binary files /dev/null and b/blogs/deepspeed-ucp/media/image2.png differ diff --git a/blogs/deepspeed-ucp/media/image3.png b/blogs/deepspeed-ucp/media/image3.png new file mode 100755 index 000000000000..101a19c86ae5 Binary files /dev/null and b/blogs/deepspeed-ucp/media/image3.png differ diff --git a/blogs/deepspeed-ucp/media/image4.png b/blogs/deepspeed-ucp/media/image4.png new file mode 100755 index 000000000000..b4f083e8eeba Binary files /dev/null and b/blogs/deepspeed-ucp/media/image4.png differ diff --git a/blogs/deepspeed-ucp/media/image5.png b/blogs/deepspeed-ucp/media/image5.png new file mode 100755 index 000000000000..f0195ebc8d11 Binary files /dev/null and b/blogs/deepspeed-ucp/media/image5.png differ diff --git a/blogs/deepspeed-ucp/media/image6.png b/blogs/deepspeed-ucp/media/image6.png new file mode 100644 index 000000000000..19405123e79a Binary files /dev/null and b/blogs/deepspeed-ucp/media/image6.png differ diff --git a/blogs/deepspeed-ucp/media/image7.png b/blogs/deepspeed-ucp/media/image7.png new file mode 100644 index 000000000000..c2d383110a59 Binary files /dev/null and b/blogs/deepspeed-ucp/media/image7.png differ diff --git a/blogs/deepspeed-ucp/media/image8.png b/blogs/deepspeed-ucp/media/image8.png new file mode 100644 index 000000000000..0014db8b688f Binary files /dev/null and b/blogs/deepspeed-ucp/media/image8.png differ diff --git a/blogs/deepspeed-ulysses/README.md b/blogs/deepspeed-ulysses/README.md index 7ea7a4535e90..ed377fec1039 100644 --- a/blogs/deepspeed-ulysses/README.md +++ b/blogs/deepspeed-ulysses/README.md @@ -32,7 +32,7 @@ that process speech, images and waveforms concurrently require long context reasoning over high dimensional inputs with extremely large sequences. Similarly, chapter and book level summarization (estimated at tens and hundreds of thousands of words) are of great importance in -conversational AI and abstractive summarization tasks. +conversational AI and abstract summarization tasks. Long sequence length is equally critical for AI for science opening doors for better understanding of structure biology, health care, @@ -233,7 +233,7 @@ at different sequence length and GPU count.* Next, we evaluate Ulysses on 7 billion (7B) and 30 billion (30B) parameter GPT dense attention models and compare against Megatron-LM's sequence -parallelism (Megatron LM) and Colosal AI sequence parallelism (ColAI-SP) on +parallelism (Megatron LM) and Colossal AI sequence parallelism (ColAI-SP) on 32 and 64 A100 GPUs respectively. The results of these evaluations are shown in Figures 3 and 4. @@ -362,9 +362,9 @@ on what is possible when long context window is no longer a limitation. DeepSpeed-Ulysses is part of the bigger DeepSpeed ecosystem of large-scale AI training and inference. 
For more details on all DeepSpeed technologies and innovations, please visit our [website]((https://www.deepspeed.ai/)) and follow us -on X, formerly Twitter, ([English](https://twitter.com/MSFTDeepSpeed), [Japanese](https://twitter.com/MSFTDeepSpeedJP)) and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed). +on X, formerly Twitter, ([English](https://twitter.com/DeepSpeedAI), [Japanese](https://twitter.com/DeepSpeedAI_JP)) and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed). We are open to collaborations with universities, research labs, and companies. For such requests (and other requests unsuitable for GitHub), please directly email to . If you like -our work, please "Star" our [repo](https://github.com/microsoft/DeepSpeed). +our work, please "Star" our [repo](https://github.com/deepspeedai/DeepSpeed). diff --git a/blogs/deepspeed-ulysses/chinese/README.md b/blogs/deepspeed-ulysses/chinese/README.md index 20af8b91fbea..7ba941214cf4 100644 --- a/blogs/deepspeed-ulysses/chinese/README.md +++ b/blogs/deepspeed-ulysses/chinese/README.md @@ -150,6 +150,6 @@ DeepSpeed-Ulysses已经完全与Megatron-DeepSpeed代码库集成并经过测试 我们很高兴宣布推出DeepSpeed-Ulysses,您可以通过DeepSpeed GitHub获取代码。详细的使用教程在[DeepSpeed教程页面](https://www.deepspeed.ai/tutorials/ds-sequence/)上提供。 -我们欢迎各种形式的贡献和合作,以共同推动当长上下文窗口不再是限制时的各种创新。DeepSpeed-Ulysses是大规模AI训练和推理的更大DeepSpeed生态系统的一部分。有关所有DeepSpeed技术和创新的更多细节,请访问我们的[网站]((https://www.deepspeed.ai/)),并关注我们在X(Twitter)上的帐号([英文](https://twitter.com/MSFTDeepSpeed),[日文](https://twitter.com/MSFTDeepSpeedJP))和[中文知乎](https://www.zhihu.com/people/deepspeed)。 +我们欢迎各种形式的贡献和合作,以共同推动当长上下文窗口不再是限制时的各种创新。DeepSpeed-Ulysses是大规模AI训练和推理的更大DeepSpeed生态系统的一部分。有关所有DeepSpeed技术和创新的更多细节,请访问我们的[网站]((https://www.deepspeed.ai/)),并关注我们在X(Twitter)上的帐号([英文](https://twitter.com/DeepSpeedAI),[日文](https://twitter.com/DeepSpeedAI_JP))和[中文知乎](https://www.zhihu.com/people/deepspeed)。 我们愿意与大学、研究实验室和公司合作。有关此类请求(以及不适合在GitHub上提出的其他请求),请直接发送电子邮件至。 diff --git a/blogs/deepspeed-ulysses/japanese/README.md b/blogs/deepspeed-ulysses/japanese/README.md index 88a0e375ce70..ca847cd07ada 100644 --- a/blogs/deepspeed-ulysses/japanese/README.md +++ b/blogs/deepspeed-ulysses/japanese/README.md @@ -153,6 +153,6 @@ dist_attn = DistributedAttention(attn, get_sequence_parallel_group()) DeepSpeed-Ulyssesは、DeepSpeedのGitHubを通じてアクセス可能です。使用方法に関する詳しいチュートリアルは、[DeepSpeedのチュートリアルページ ](https://www.deepspeed.ai/tutorials/ds-sequence/)にあります。 -長いコンテキストを扱う際の制約を取り除くことによって何が可能になるのか、ユーザの皆様と共に様々な可能性を探求するため、幅広い協力やコラボレーションを歓迎します。DeepSpeed-Ulyssesは、大規模なAIの訓練と推論のためのより大きなDeepSpeedエコシステムの一部です。DeepSpeedの多くの技術や革新的な機能の詳細については、[ウェブサイト](https://www.deepspeed.ai/)をご覧いただくか、X(以前のTwitter。[英語版](https://twitter.com/MSFTDeepSpeed)、[日本語版](https://twitter.com/MSFTDeepSpeedJP))や、中国の[Zhihu](https://www.zhihu.com/people/deepspeed)でフォローしてください。 +長いコンテキストを扱う際の制約を取り除くことによって何が可能になるのか、ユーザの皆様と共に様々な可能性を探求するため、幅広い協力やコラボレーションを歓迎します。DeepSpeed-Ulyssesは、大規模なAIの訓練と推論のためのより大きなDeepSpeedエコシステムの一部です。DeepSpeedの多くの技術や革新的な機能の詳細については、[ウェブサイト](https://www.deepspeed.ai/)をご覧いただくか、X(以前のTwitter。[英語版](https://twitter.com/DeepSpeedAI)、[日本語版](https://twitter.com/DeepSpeedAI_JP))や、中国の[Zhihu](https://www.zhihu.com/people/deepspeed)でフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull 
Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md index e91ff1ecd51e..d4cc268fbe89 100644 --- a/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md @@ -146,7 +146,7 @@ DeepSpeed-VisualChat 是一个易于使用的训练框架,具有很好的可 使用 DeepSpeed-VisualChat 训练模型是简单和方便的。这里我们给出了基于 CLIP 视觉编码器和 LLaMa-7B 模型的一个例子: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -161,21 +161,21 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat 为了支持更大的模型推理,我们已经将 Hugging Face 大模型推理集成到我们的 DeepSpeed-VisualChat API 中。因此,用户可以根据 GPU 内存容量和模型大小选择不同数量的 GPU。 -请参考我们的 [GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) 了解更多细节。 +请参考我们的 [GitHub 主页](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) 了解更多细节。 # 7. 发布:今天尝试 DeepSpeed-VisualChat! 我们非常兴奋地分享 DeepSpeed-VisualChat 现已开源并供 AI 社区使用。 -* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* 要开始使用,请访问我们的 DeepSpeed-VisualChat GitHub 页面:[GitHub 主页](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。 +* 我们将继续在您的反馈和支持下改进 DeepSpeed-VisualChat。我们的 [路线图](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) 显示了目前支持的功能以及未来计划支持的功能。 DeepSpeed-VisualChat 是更大的 DeepSpeed 生态系统的一部分,其中包括一系列深度学习系统和建模技术。要了解更多信息, * 请访问我们的 [网站](https://www.deepspeed.ai/) 了解详细的博客文章、教程和文档。 -* 在我们的 [英文 X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[日语 X(Twitter)](https://twitter.com/MSFTDeepSpeedJP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 +* 在我们的 [英文 X(Twitter)](https://twitter.com/DeepSpeedAI)、[日语 X(Twitter)](https://twitter.com/DeepSpeedAI_JP) 和 [中文知乎](https://www.zhihu.com/people/deepspeed) 上关注我们,以获取 DeepSpeed 的最新消息。 -我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 +我们欢迎您为 DeepSpeed 做出贡献!我们鼓励您报告问题、贡献 PRs、并在 [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) 页面上参加讨论。有关更多详细信息,请查看我们的 [贡献指南](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)。我们对与大学、研究实验室、公司等进行合作持开放态度,例如共同进行深度学习研究、应用 DeepSpeed 为现实世界的 AI 模型和应用提供支持等等。对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 deepspeed-info@microsoft.com。 -* 如果你喜欢我们的工作,请在 [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) 上为我们的仓库点“星”。 +* 如果你喜欢我们的工作,请在 [DeepSpeed 
GitHub](https://github.com/deepspeedai/DeepSpeed/) 和 [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) 上为我们的仓库点“星”。 diff --git a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md index ec23e005bff6..4697c15f2bb3 100755 --- a/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md @@ -54,7 +54,7 @@ GPTやLLaMaのような大規模言語モデル(LLM)は、テキスト生成
- Different attention mehanisms + Different attention mechanisms *図2: 異なるアテンションの機構: 「ユーザー:画像を説明してください」という入力文と3つの画像トークン(I-token1、I-token2、I-token3)と組み合わせて与えた場合の、それぞれのattention機構の構成を示しています。左側では、標準的なcausal attentionによって、画像トークンをテキストとして扱う様子を示しています。中央は、テキストトークンに対する標準的なcausal attentionを維持しながら、画像に適用されるcross attentionを使用する様子を示しています。右側では、画像トークンはself attentionのみを行い、テキストトークンはテキスト/画像トークンへのアテンションを独立に計算するという、新しいマルチモーダルのためのアテンションの提案を、オレンジ色のマスクで強調して示しています。この仕組みは、Q, Kをクエリとキーとしたとき、 softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$)として定義されます。M $`\in`$ R10x10としたとき、$`M_1`$=[M==1], and $`M_2`$=[M==2] です。*
@@ -152,7 +152,7 @@ DeepSpeed-VisualChatは使いやすく、かつ優れたスケーラビリティ DeepSpeed-VisualChatの訓練は簡単かつ便利に実行できます。ここではCLIPビジュアルエンコーダーとLLaMa-7Bモデルを使用する例を示します: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -168,21 +168,21 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat より大規模なモデル推論をサポートするために、我々はHugging Faceの大規模モデル推論をDeepSpeed-VisualChat APIに組み込みました。そのため、ユーザーはGPUメモリ容量とモデルサイズに基づいて、異なるGPU数を選択することができます。 -詳細は[ランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。 +詳細は[ランディングページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat)をご参照ください。 # 7. 早速使ってみましょう! DeepSpeed-VisualChatがオープンソース化され、AIコミュニティで利用できるようになったことを大変嬉しく思います。 -* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* まずは、DeepSpeed-VisualChatのGitHubページをご覧ください: [GitHubランディングページ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。 +* DeepSpeed-VisualChatは、皆様からのフィードバックとサポートにより改良を続けていきます。私たちの[ロードマップ](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-)は、現在サポートされている機能と将来的に計画している機能を示しています。 DeepSpeed-VisualChatは、さまざまなDeep Learningシステムやモデリング技術を含む、より大きなDeepSpeedエコシステムの一部です。詳細については、以下をご覧ください。 * 私たちの[ウェブサイト](https://www.deepspeed.ai/)で、詳細なブログ記事、チュートリアル、役立つドキュメントを提供しています。 -* DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/MSFTDeepSpeed)、[Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。 +* DeepSpeedの最新ニュースは、[English X(Twitter)](https://twitter.com/DeepSpeedAI)、[Japanese X(Twitter)](https://twitter.com/DeepSpeedAI_JP)、[Chinese Zhihu](https://www.zhihu.com/people/deepspeed)をフォローしてください。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 -* 私たちの[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください! +* 私たちの[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)および[DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/)リポジトリが気に入ったら、ぜひスターをつけてください! 
diff --git a/blogs/deepspeed-visualchat/10-03-2023/README.md b/blogs/deepspeed-visualchat/10-03-2023/README.md index eac9d06c3cea..ce354fd10812 100755 --- a/blogs/deepspeed-visualchat/10-03-2023/README.md +++ b/blogs/deepspeed-visualchat/10-03-2023/README.md @@ -55,7 +55,7 @@ The model architecture of DeepSpeed-VisualChat, as depicted in *Figure 1*, is co There are two common attention mechanisms used to connect the visual and textual components in a multi-modal model: causal attention, as used in MiniGPT and QWen-VL, and cross attention, as used in Otter and Flamingo.
- Different attention mehanisms + Different attention mechanisms *Figure 2: Different Attention Mechanisms: Examine the differing attention mechanisms using an input sentence "User: Please describe the image." coupled with three Image tokens (I-token1, I-token2, I-token3). On the left, we demonstrate standard causal attention, treating image tokens as text. In the middle, we present cross attention applied to images, while maintaining standard causal attention for text tokens. On the right, we illustrate our innovative multi-modal attention proposal where image tokens only perform self-attention, and text tokens attend to text/image tokens independently, highlighted with an orange mask. This mechanism is defined by: softmax($`QK^T \odot M_1`$)+ softmax($`QK^T \odot M_2`$) with Q and K as query and key, $`M_1`$=[M==1], and $`M_2`$=[M==2], with M $`\in`$ R10x10 in this case.*
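
To make the formula in the caption concrete, the snippet below is a minimal PyTorch sketch of the masked-attention combination for the 10-token example (3 image tokens followed by 7 text tokens). The specific contents of M, the head dimension, and the token split are assumptions for illustration only; this is not the DeepSpeed-VisualChat implementation, just the arithmetic softmax($`QK^T \odot M_1`$) + softmax($`QK^T \odot M_2`$) that the caption describes.

```python
import torch

def masked_softmax(scores, mask):
    # Softmax over the last dim restricted to positions where mask is True;
    # rows with no allowed positions become all zeros instead of NaN.
    filled = scores.masked_fill(~mask, torch.finfo(scores.dtype).min)
    probs = torch.softmax(filled, dim=-1)
    return probs * mask.any(dim=-1, keepdim=True)

T, n_img, d = 10, 3, 64                       # 3 image tokens, then 7 text tokens (illustrative)
is_img = torch.arange(T) < n_img
causal = torch.tril(torch.ones(T, T, dtype=torch.bool))

# Assumed mask M in R^{10x10}: 1 marks image->image and causal text->text attention,
# 2 marks text->image attention, 0 marks positions that are never attended.
M = torch.zeros(T, T, dtype=torch.long)
M[is_img[:, None] & is_img[None, :]] = 1                # image tokens attend among themselves
M[~is_img[:, None] & ~is_img[None, :] & causal] = 1     # text attends causally to text
M[~is_img[:, None] & is_img[None, :]] = 2               # text attends to the (earlier) image tokens

Q, K = torch.randn(T, d), torch.randn(T, d)
scores = Q @ K.T                                        # QK^T as written in the caption
attn = masked_softmax(scores, M == 1) + masked_softmax(scores, M == 2)
print(attn.shape)                                       # torch.Size([10, 10])
```
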
@@ -153,7 +153,7 @@ DeepSpeed-VisualChat is an easy-to-use training framework with great scalability The training experience of DeepSpeed-VisualChat is straightforward and convenient. Here we give an example based on the CLIP visual encoder and the LLaMa-7B model: ``` -git clone https://github.com/microsoft/DeepSpeedExamples.git +git clone https://github.com/deepspeedai/DeepSpeedExamples.git cd DeepSpeedExamples/applications/DeepSpeed-VisualChat/ pip install -r requirements.txt cd training @@ -167,22 +167,22 @@ bash chat_scripts/run.sh # You need to change necessary variables, e.g, ckpt pat ``` To support larger model inference, we have incorporated Hugging Face large model inference into our DeepSpeed-VisualChat API. Therefore, users can choose a different number of GPUs based on the GPU memory capacity and the model size. -Please refer to our [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details. +Please refer to our [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) for more details. # 7. Release: Try DeepSpeed-VisualChat today! We are very excited to share that DeepSpeed-VisualChat is now open-sourced and available to the AI community. -* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) +* To get started, please visit our GitHub page for DeepSpeed-VisualChat: [GitHub Landing Page](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat) -* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/microsoft/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future. +* We will continue to improve DeepSpeed-VisualChat with your feedback and support. Our [roadmap](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/applications/DeepSpeed-VisualChat/README.md#-deepspeed-visualchats-roadmap-) shows currently supported features as well as ones that are planned for the future. DeepSpeed-VisualChat is a component of the larger DeepSpeed ecosystem, which includes a range of Deep Learning systems and modeling technologies. To learn more, * Please visit our [website](https://www.deepspeed.ai/) for detailed blog posts, tutorials, and helpful documentation. -* Follow us on our [English X(Twitter)](https://twitter.com/MSFTDeepSpeed), [Japanese X(Twitter)](https://twitter.com/MSFTDeepSpeedJP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. +* Follow us on our [English X(Twitter)](https://twitter.com/DeepSpeedAI), [Japanese X(Twitter)](https://twitter.com/DeepSpeedAI_JP), and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed) for latest news on DeepSpeed. -We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) page. Please see our [contributing guide](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. 
We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. +We welcome your contributions to DeepSpeed! We encourage you to report issues, contribute PRs, and join discussions on the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) page. Please see our [contributing guide](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md) for more details. We are open to collaborations with universities, research labs, companies, such as those working together on deep learning research, applying DeepSpeed to empower real-world AI models and applications, and so on. For such requests (and other requests unsuitable for GitHub), please directly email to deepspeed-info@microsoft.com. -* "Star" our [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/microsoft/DeepSpeedExamples/) repositories if you like our work! +* "Star" our [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/) and [DeepSpeedExamples GitHub](https://github.com/deepspeedai/DeepSpeedExamples/) repositories if you like our work! diff --git a/blogs/deepspeed4science/chinese/README.md b/blogs/deepspeed4science/chinese/README.md index dabc4ab077f2..d3bebfec598f 100644 --- a/blogs/deepspeed4science/chinese/README.md +++ b/blogs/deepspeed4science/chinese/README.md @@ -123,11 +123,11 @@ DeepSpeed4Science的旅程始于两个开创性的基于LLM的结构生物学研 *图9:由不同框架在不同规模下支持的两个GenSLMs模型的最大序列长度。使用NVIDIA DGX,每个节点有八个40G A100 GPU。*
-具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/microsoft/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码异步处理和位置码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态异步处理等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于那些更喜欢相对位置编码技术这样的算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也进行了集成。 +具体在系统层面,我们发布了包括[长序列支持和其他新优化](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)的最新的[Megatron-DeepSpeed框架](https://github.com/deepspeedai/Megatron-DeepSpeed)。科学家现在可以通过我们新添加的内存优化技术(如注意力掩码异步处理和位置码分割)、张量并行、流水线并行、序列并行、基于ZeRO的数据并行和模型状态异步处理等技术的协同组合,用更长的序列训练他们的GenSLMs等大型科学模型。图9展示了我们的新版本使GenSLMs的25B和33B模型的最长序列长度分别比之前的Megatron-DeepSpeed版本增加了12倍和14倍。在支持的序列长度方面,这个新Megatron-DeepSpeed框架也显著地超过了NVIDIA的Megatron-LM(对于25B和33B模型分别高达9.8倍和9.1倍)。例如,阿贡实验室团队的GenSLMs 25B模型在64个GPU上的原始序列长度为42K,而现在可以用512K的核苷酸序列进行训练。这在不损失准确性的条件下大大提高了模型质量和科学发现的范围。对于那些更喜欢相对位置编码技术这样的算法策略的领域科学家,这个[新版本](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)也进行了集成。 ## 总结和路线图 -我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请给我们发送电子邮件至。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)上报告问题、贡献PR、参与讨论。 +我们非常自豪和兴奋地宣布DeepSpeed4Science计划以及几个研发亮点和成果。从今天开始,我们将在[deepspeed4science.ai](https://deepspeed4science.ai/)上介绍我们的新计划,包括关于我们的外部合作者的信息,以及当前和未来的DeepSpeed4Science技术发布。我们的一个高层次目标是推广广泛解决大规模科学发现的主要系统痛点的AI系统技术。我们希望全球的科学家们能够从DeepSpeed4Science通过开源软件解锁的新功能中受益。我们期待更好地了解阻碍您的科学发现的AI系统设计挑战。我们真诚地欢迎您的参与,帮助构建一个更有前途的AI4Science未来。请给我们发送电子邮件至。我们鼓励您在我们的[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)上报告问题、贡献PR、参与讨论。 ## 致谢 diff --git a/blogs/deepspeed4science/japanese/README.md b/blogs/deepspeed4science/japanese/README.md index 276528650ab5..21b56d2dd944 100644 --- a/blogs/deepspeed4science/japanese/README.md +++ b/blogs/deepspeed4science/japanese/README.md @@ -123,11 +123,11 @@ DeepSpeed4Scienceは、構造生物学研究(タンパク質構造予測や平 *図9: 異なるスケールで異なるフレームワークがサポートする2つのGenSLMsモデルの最大シーケンス長。1ノードあたり8個の40G A100 GPUを搭載したNVIDIA DGXノードを使用。*
-システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/microsoft/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術を相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。 +システムレベルでは、非常に長いシーケンスをサポートするための最新の[Megatron-DeepSpeedフレームワーク](https://github.com/deepspeedai/Megatron-DeepSpeed)を、[他の新しい最適化とともにリリースします](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support)。科学者は、(アテンションマスクと位置の埋め込みに関する)新しく追加されたメモリ最適化手法、テンソル並列処理、パイプライン並列処理、シーケンス並列処理、ZeROスタイルのデータ並列処理、モデル状態のオフロードなどの技術を相乗的な組み合わせにより、GenSLMsのような大規模な科学モデルをはるかに長いシーケンスで訓練できるようになりました。図9は、新しいリリースにより、GenSLMsの25Bおよび33Bモデルで、以前のMegatron-DeepSpeedよりもそれぞれ最大12倍および14倍の最長シーケンス長を処理できることを示しています。サポートされているシーケンス長に関しては、この新しいMegatron-DeepSpeedは、25Bモデルと33Bモデルでそれぞれ最大9.8倍と9.1倍でNVIDIAのMegatron-LMを大幅に上回っています。たとえば、GenSLMsの25Bモデルは、64個のGPUでのアルゴンヌチームの元の42Kシーケンス長と比較して、512Kのヌクレオチド配列で訓練できるようになりました。これにより、精度を損なうことなく、モデルの品質と科学的発見の範囲が大幅に向上します。Relative position embeddingなどのアルゴリズム戦略を必要とする科学者向けの追加サポートも、[このリリース](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/)に統合されています。 ## まとめとロードマップ -DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせはまでお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/)でお願いします。 +DeepSpeed4Scienceイニシアティブを、いくつかのR&Dのハイライトや成果と共に発表できることを嬉しく思います。本日から、外部の協力者に関する情報や、現在および将来のDeepSpeed4Scienceテクノロジーリリースなど、新しいイニシアティブでの活動を[deepspeed4science.ai](https://deepspeed4science.ai/)上で進めていきます。私たちの高レベルな目標の1つは、大規模な科学的発見のための主要なシステムの問題点に広く対処するAIシステムテクノロジーを一般化することです。世界中の科学者によって、オープンソースのソフトウェアを通じてDeepSpeed4Scienceによって利用可能になる新機能が活用されることを願っています。科学的発見の障害となるAIシステム設計の課題を解決していくことを楽しみにしています。AI4Scienceの有望な未来を築くために、皆様の参加を歓迎します。お問い合わせはまでお願いします。問題の報告や、PRを通じての貢献、ディスカッションへの参加は、[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/)でお願いします。 ## 謝辞 diff --git a/blogs/intel-inference/README.md b/blogs/intel-inference/README.md new file mode 100644 index 000000000000..917117523abc --- /dev/null +++ b/blogs/intel-inference/README.md @@ -0,0 +1,143 @@ + +# Llama 2 Inference on 4th Gen Intel® Xeon® Scalable Processor with DeepSpeed + +> This blog is co-published with Intel and can also be viewed on [Intel's website](https://www.intel.com/content/www/us/en/developer/articles/technical/xllama-2-on-xeon-scalable-processor-with-deepspeed.html). 
+ +## Introduction + +Transformer models have revolutionized natural language processing with their ability to capture complex semantic and syntactic relationships. However, these models also pose significant challenges for efficient inference, especially for large language models (LLMs) that have billions of parameters. For example, running half-precision inference of Megatron-Turing 530B would require 40 A100-40GB GPUs [1]. To address challenges associated with the inference of large-scale transformer models, the DeepSpeed team at Microsoft* developed DeepSpeed Inference [2]. It provides high-performance multi-GPU inferencing capabilities and introduces several features to efficiently serve transformer-based PyTorch models using GPU. Today, we are very excited to share that DeepSpeed Inference has been implemented for the 4th Gen Intel® Xeon® scalable processor. + +## 4th Gen Intel Xeon Processor + +Intel launched the 4th gen Intel Xeon processor in January 2023. This CPU has built-in accelerators for AI, data analytics, networking, storage and HPC. Tile Matrix Multiplication (TMUL) is the built-in AI accelerator. It executes the Intel® Advanced Matrix Extensions (Intel®AMX). Intel AMX can significantly speed up deep learning (DL) applications, both in inference and training. Other notable new features in 4th gen Intel Xeon processors that can speed up DL applications include PCI Express Gen5 (PCIe 5.0) and DDR5. PCIe 5.0 doubles the I/O bandwidth from PCIe 4.0, increasing the bandwidth between CPU and connected devices. DDR5 offers up to 1.5x bandwidth increase over DDR4 [3]. + +4th gen Intel Xeon with Intel AMX sped up training of BERT-large by 4x compared to 3rd gen Intel Xeon [4]. TMUL executes Intel AMX instructions on data loaded in 2D registers, hence the name tiles. These instructions operate on 8-bit integer (INT8) or 16-bit bfloat (BF16) datatype. 4th gen Intel Xeon with Intel AMX can attain 2048 INT8 operations per cycle compared to 256 INT8 operations per cycle in 3rd gen Intel Xeon with Intel Advanced Vector Extensions 512 Neural Network Instructions (Intel AVX-512 VNNI). Its BF16 performance is 1024 operations per cycle compared to its FP32 performance of 64 operations per cycle. Therefore, Intel AMX can significantly speed up DL applications when INT8 or BF16 datatype is used for matrix multiplication or convolution computations, the common operations in transformer or convolution-based models. + +## DeepSpeed enabled for 4th Gen Intel Xeon + +DeepSpeed is a DL optimization software for scaling and speeding up DL training and inference. DeepSpeed Inference refers to the feature set in DeepSpeed implemented to speed up inference of transformer models [2]. It initially supported only CUDA GPU. We recently added support for CPU, specifically 4th gen Intel Xeon. Features currently implemented for 4th gen Intel Xeon include automatic tensor parallelism (AutoTP), BF16 and INT8 datatype support, and binding cores to rank. + +DeepSpeed builds on top of PyTorch, which has been highly optimized for CPU inference and training. Intel® Extension for PyTorch* adds state-of-the-art optimizations for popular LLMs architectures, including highly efficient matrix multiplication kernels to speed-up linear layers and customized operators to reduce the memory footprint [5]. The runtime software components for DeepSpeed Inference on CPU are shown below in Figure 1. Intel® oneAPI Deep Neural Network Library (oneDNN) uses Intel AVX-512 VNNI and Intel AMX optimizations [6]. 
Intel® oneAPI Collective Communications Library (oneCCL) implements the communication patterns used in DL [7]. Intel® Neural Compressor (INC) was used to convert the LLMs from the FP32 datatype to BF16 or INT8 [8]. + + 
+
+Figure 1. Software components for DeepSpeed Inference on CPU +
+
+## Technologies Introduced
+
+To accelerate running LLMs with DeepSpeed on 4th gen Intel Xeon, we introduced technologies into both DeepSpeed and Intel Extension for PyTorch.
+
+1. Extended the DeepSpeed Accelerator Abstraction Interface to provide CPU support [9]. We implemented CPU as a DeepSpeed Accelerator, which allows CPU support to be plugged into DeepSpeed in a device-agnostic manner. Device-agnostic DeepSpeed model scripts that use the DeepSpeed Accelerator Abstraction Interface can run on CPU devices without modification.
+2. Fine-grained core binding. We introduced two new DeepSpeed command line arguments, `--bind_cores_to_rank` and `--bind_core_list`, to allow core binding with DeepSpeed AutoTP [10] on a node with multiple sockets or on a single socket with multiple sub-NUMA clusters (SNC). Using `numactl` for each tensor parallel worker, we can bind workers to cores and NUMA memory. This reduces interference between workers and uses memory bandwidth and cores more effectively.
+3. Optimized shared memory (SHM) based AllReduce communication primitive for a single CPU node. We implemented a low-latency SHM-based AllReduce primitive which utilizes the shared memory of a single-node CPU system.
+4. Optimizations in Intel Extension for PyTorch:
+
+   a. oneDNN, Tensor Processing Primitives (TPP) and customized linear kernels for weight-only quantization.
+
+   b. Indirect Access KV Cache, which reduces memory reorder overhead when using a KV cache.
+
+   c. Subgraph fusion to reduce memory footprint.
+
+   d. Fusion of the AllReduce between multi-head attention and the multilayer perceptron in a transformer layer when there is no dependency between them.
+
+## How to run DeepSpeed on CPU
+
+Software required for DeepSpeed Inference on CPU (specific versions can be found in the Configuration section):
+* PyTorch
+* Intel Extension for PyTorch [6]
+* oneCCL binding for PyTorch [11]
+* oneCCL [7]
+* DeepSpeed [12]
+
+After installing the required software, we can run inference for a model on CPU. Device-agnostic interfaces are used to load and run the model. These device-agnostic interfaces are accessed through `deepspeed.accelerator.get_accelerator()` as shown below in Listing 1. Refer to the DeepSpeed tutorial on DeepSpeed accelerator interfaces [13] for further details.
+
+```python
+# Listing 1. An example of using the device-agnostic interface to get the accelerator device and to load and run a model.
+import os
+import torch
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+...
+# load model checkpoint into model
+model = model.eval().to(get_accelerator().device_name())
+
+# WORLD_SIZE is set by the deepspeed launcher; default to 1 for a single process
+ds_world_size = int(os.getenv('WORLD_SIZE', '1'))
+
+engine = deepspeed.init_inference(model=model, mp_size=ds_world_size, \
+                                  dtype=torch.bfloat16, replace_method="auto", \
+                                  replace_with_kernel_inject=False)
+
+model = engine.module
+...
+# evaluate model
+```
+
+Execute the inference script with DeepSpeed using a command of the following form:
+
+```bash
+deepspeed --bind_cores_to_rank <inference_script.py> [script args]
+```
+
+This command detects the number of sockets on the host and launches as many inference workers as there are sockets. The LLM workload runs in parallel on the inference workers with DeepSpeed AutoTP [10]. AutoTP distributes the inference computation among the workers and reduces inference latency. For example, if the host has two sockets, this command will launch two inference workers that process the input sample in parallel. The `--bind_cores_to_rank` argument instructs DeepSpeed to split the CPU cores and distribute them to each rank evenly. This ensures that each inference worker uses an exclusive set of CPU cores and avoids interfering with the others. If this argument is not specified, scheduling of the workers onto CPU cores is left to the operating system, which may not be optimal.
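+
+To make the even split concrete, the short sketch below computes the kind of contiguous core range each rank would receive. It is illustrative only and is not DeepSpeed's launcher implementation; the function name and the use of the `RANK`/`WORLD_SIZE` environment variables are assumptions made for the example.
+
+```python
+# Illustrative sketch: an even, contiguous split of CPU cores across ranks,
+# similar in spirit to what --bind_cores_to_rank arranges via numactl.
+import os
+
+def split_cores_evenly(num_cores, world_size, rank):
+    cores_per_rank = num_cores // world_size      # ignore any remainder for simplicity
+    start = rank * cores_per_rank
+    return list(range(start, start + cores_per_rank))
+
+if __name__ == "__main__":
+    rank = int(os.getenv("RANK", "0"))            # set by the launcher in a real run
+    world_size = int(os.getenv("WORLD_SIZE", "1"))
+    cores = split_cores_evenly(os.cpu_count() or 1, world_size, rank)
+    print(f"rank {rank} would be pinned to cores {cores[0]}-{cores[-1]}")
+```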
+
+Intel Extension for PyTorch is compatible with DeepSpeed AutoTP and can therefore be used to further optimize AutoTP models generated by DeepSpeed.
+
+```python
+# Use Intel Extension for PyTorch to optimize the model
+...
+model = engine.module
+import intel_extension_for_pytorch as ipex
+model = ipex.optimize_transformers(model.eval(), dtype=torch.bfloat16, inplace=True)
+...
+```
+Examples of LLM optimizations for DeepSpeed AutoTP models with Intel Extension for PyTorch are available at [14].
+
+## Results
+
+DeepSpeed enables optimal distribution of LLM inference across two 4th gen Intel Xeon sockets. Intel AMX on 4th gen Intel Xeon can be used to accelerate BF16 matrix multiplication operations. Support for Intel AMX is available through Intel Extension for PyTorch. Performance speedups for GPT-J-6B and Llama-2-13B from DeepSpeed AutoTP on 2 sockets are shown in Figure 2 below. GPT-J-6B has 6 billion parameters, requiring 12 GB of memory for its weights. Llama-2-13B has 13 billion parameters, requiring 26 GB of memory for the weights. The metric used is latency improvement; both prompt latency and per-token latency improved, as shown by the speedups in the plot.
+
+
+Figure 2. Performance speedups from 1-socket to 2-socket 4th gen Intel Xeon with DeepSpeed AutoTP. Higher speedup represents higher performance. Per-token latency is measured over the 2nd and subsequent tokens. in/out refers to the input token size and output token size. Beam search size was 4. See the Configuration section for details; results may vary.
+
+
+## Summary
+DeepSpeed Inference has been enabled for 4th gen Intel Xeon with Intel AMX to accelerate matrix multiplications common in DL workloads. DeepSpeed Inference leverages 4th Gen Intel Xeon to speed up inference of GPT-J-6B and Llama-2-13B. We will continue to improve it for new devices and new LLMs. Intel Data Center GPU Max is a new GPU designed for AI for which DeepSpeed will also be enabled [15].
+
+## Contributors
+This work was made possible through deep collaboration between software engineers and researchers at Intel and Microsoft. The contributors of this work include Guokai Ma, Kiefer Kuah, Yejing Lai, Liangang Zhang, Xiaofei Feng, Xu Deng, Mengfei Li, Jianan Gu, Haihao Shen, and Fan Zhao from Intel; Olatunji Ruwase, Martin Cai, and Yuxiong He from Microsoft.
+
+## Configuration
+1-node, 2x Intel® Xeon® Platinum 8480+, 56 cores, HT On, Turbo On, 1024 GB (16x64GB DDR5 4800 MT/s [4800 MT/s]), BIOS version Intel Corporation SE5C7411.86B.9525.D13.2302071333, 02/07/2023, ucode version 0x2b000190, Red Hat Enterprise Linux 8.6, kernel version 4.18.0-372.9.1.el8.x86_64, gcc 11.2.1, PyTorch 2.1.0.dev20230618+cpu, DeepSpeed 0.9.5+3f5e4931, ipex 2.1.0+git31b7cd6, GPT-J-6B, LLaMA-2-13B.
+
+## References
+
+[1] Microsoft, "ZeRO-Inference: Democratizing massive model inference," 9 September 2022. [Online]. Available: https://www.deepspeed.ai/2022/09/09/zero-inference.html. [Accessed 12 April 2023].
+
+[2] R. Y. Aminabadi, S. Rajbhandari, M. Zhang, A. A. Awan, C. Li, D. Li, E. Zheng, J. Rasley, S. Smith, O. Ruwase and Y. He, "DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale," 30 June 2022. [Online]. Available: https://arxiv.org/abs/2207.00032.
+
+[3] Intel, "4th Gen Intel® Xeon® Scalable Processors," [Online]. Available: https://www.intel.com/content/www/us/en/products/docs/processors/xeon-accelerated/4th-gen-xeon-scalable-processors-product-brief.html. [Accessed 12 April 2023].
+
+[4] Intel, "Accelerate AI Workloads with Intel® AMX," [Online]. Available: https://www.intel.com/content/www/us/en/products/docs/accelerator-engines/advanced-matrix-extensions/ai-solution-brief.html. [Accessed 12 April 2023].
+
+[5] Intel, "Large Language Models (LLM) Optimizations Overview," [Online]. Available: https://intel.github.io/intel-extension-for-pytorch/cpu/2.1.0+cpu/tutorials/llm.html.
+
+[6] Intel, "Intel® Extension for PyTorch," [Online]. Available: https://github.com/intel/intel-extension-for-pytorch.
+
+[7] Intel, "oneAPI Collective Communications Library (oneCCL)," [Online]. Available: https://github.com/oneapi-src/oneCCL.
+
+[8] Intel, "Intel® Neural Compressor," [Online]. Available: https://github.com/intel/neural-compressor.
+
+[9] Microsoft, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/deepspeedai/DeepSpeed/blob/master/docs/_tutorials/accelerator-abstraction-interface.md.
+
+[10] Microsoft, "Automatic Tensor Parallelism for HuggingFace Models," [Online]. Available: https://www.deepspeed.ai/tutorials/automatic-tensor-parallelism.
+
+[11] Intel, "Intel® oneCCL Bindings for PyTorch," [Online]. Available: https://github.com/intel/torch-ccl.
+
+[12] Microsoft, "deepspeed," [Online]. Available: https://github.com/deepspeedai/deepspeed.
+
+[13] Intel, "DeepSpeed Accelerator Abstraction Interface," [Online]. Available: https://github.com/deepspeedai/DeepSpeed/pull/3184.
+
+[14] Intel, "Intel® Extension for PyTorch large language model example," [Online]. Available: https://github.com/intel/intel-extension-for-pytorch/tree/llm_feature_branch/examples/cpu/inference/python/llm.
+
+[15] Intel, "Intel® Data Center GPU Max Series," [Online]. Available: https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html.
diff --git a/blogs/intel-inference/assets/intel-results.png b/blogs/intel-inference/assets/intel-results.png
new file mode 100755
index 000000000000..e65aae3d4a8c
Binary files /dev/null and b/blogs/intel-inference/assets/intel-results.png differ
diff --git a/blogs/intel-inference/assets/software-arch.png b/blogs/intel-inference/assets/software-arch.png
new file mode 100755
index 000000000000..da147f4b9672
Binary files /dev/null and b/blogs/intel-inference/assets/software-arch.png differ
diff --git a/blogs/ulysses-offload/README.md b/blogs/ulysses-offload/README.md
new file mode 100644
index 000000000000..5dfd63ccdc32
--- /dev/null
+++ b/blogs/ulysses-offload/README.md
@@ -0,0 +1,263 @@
+# Ulysses-Offload: Democratizing Long Context LLM Training
+
+
+
+Figure 1: Ulysses-Offload supports 16x longer sequence lengths than NVIDIA Megatron-SP and DeepSpeed Ulysses at 55% Model FLOPs Utilization (MFU).
+
+
+To cite this work, or for more technical depth on this release, please see our [arXiv report](https://arxiv.org/abs/2408.16978):
+
+```
+@article{yao2024ulysses,
+  title={Training Ultra Long Context Language Model with Fully Pipelined Distributed Transformer},
+  author={Jinghan Yao and Sam Ade Jacobs and Masahiro Tanaka and Olatunji Ruwase and Aamir Shafi and Hari Subramoni and Dhabaleswar K. (DK) Panda},
+  journal={https://arxiv.org/abs/2408.16978},
+  year={2024}
+}
+```
+
+## Introduction
+
+In the rapidly evolving field of generative AI and scientific ML, the ability to train large (language) models with ultra-long context capabilities is becoming increasingly important. These models are essential for a variety of complex tasks, such as understanding lengthy documents, generating images and videos, and processing extensive sequences in computational biology. However, training such models efficiently poses significant challenges due to the enormous GPU memory required.
+
+Building on DeepSpeed Ulysses, our previous project that developed system optimizations for training extremely long sequence transformer models, we are excited to present Ulysses-Offload in this release. Ulysses-Offload is an innovative, resource-efficient technique that offers comparable benefits to DeepSpeed Ulysses and other previous long-context optimization methods, but with a lower hardware budget. Ulysses-Offload makes ultra-long-context large language model (LLM) training and finetuning accessible to everyone, including those with limited GPU resources. Ulysses-Offload enables training with context lengths of up to 2 million tokens using just 4 NVIDIA A100-40GB GPUs. Ulysses-Offload supports 16x longer sequence lengths than NVIDIA Megatron-SP and DeepSpeed Ulysses at 55% Model FLOPs Utilization (MFU) (see Figure 1). The next section highlights the key innovations of Ulysses-Offload, and subsequent sections provide additional details on the design and usability of Ulysses-Offload, followed by experimental results.
+
+## Key Innovations
+
+### 1. Fully Pipelined Distributed Transformer (FPDT)
+
+The core innovation of our work is the Fully Pipelined Distributed Transformer (FPDT).
+This approach leverages pipelined sequence chunking, which allows for the training of LLMs with sequence lengths of up to 2 million tokens on just 4 A100-40GB GPUs. By breaking the sequence down into manageable chunks and processing them in a pipelined manner, Ulysses-Offload significantly reduces the memory footprint while maintaining high computational efficiency. This method ensures that the GPUs are utilized effectively, even when dealing with extremely long sequences.
+
+### 2. Memory Optimization
+
+One of the critical aspects of our approach is the comprehensive analysis and optimization of the memory footprint during LLM training. We target the reduction of redundant intermediate buffers in both the forward and backward passes of the training process. By optimizing the use of GPU and host CPU memory, we can train larger models with longer sequences without running into GPU memory limitations. This optimization is crucial for enabling the training of ultra-long context models on a limited number of GPUs. It is worth noting that Ulysses-Offload memory optimization is orthogonal and complementary to the model-parameter-focused memory optimization techniques used by DeepSpeed ZeRO and PyTorch FSDP: Ulysses-Offload optimizes the memory footprint of activations associated with long sequences, while ZeRO and FSDP optimize the memory footprint of model parameters.
+
+### 3. Compatibility and Flexibility
+
+Ulysses-Offload is designed to be agnostic to existing training techniques and works efficiently across different LLM models, including popular architectures such as GPT and Llama. This flexibility ensures that our approach can be easily integrated into various training workflows. Additionally, Ulysses-Offload is compatible with advanced memory optimization techniques such as DeepSpeed ZeRO and PyTorch FSDP, further enhancing its usability and performance.
+
+## Core Design of Ulysses-Offload
+
+Figure 2 illustrates the core structure of Ulysses-Offload. Ulysses-Offload leverages multiple memory hierarchies in modern GPU clusters, thus boosting hardware efficiency and cost-effectiveness while achieving very high Model FLOPs Utilization (MFU). The design of Ulysses-Offload centers around pipelining, scheduling, and memory management. These well-known optimization techniques are essential for scaling LLM context length to the million-token scale with a few GPUs and will be discussed in the subsequent subsections.
+
+
+
+Figure 2: Core design
+
+### Pipelining and Scheduling
+
+Ulysses-Offload employs a sequence chunking and pipelined computation design to manage the memory and computational load efficiently. In a traditional Transformer model, the input (hidden state) tensor is projected into q, k, and v tensors. Each of these tensors can be denoted *\[B, S, H, D\]*, where *B* is the batch size, *S* is the sequence length, *H* is the number of heads, and *D* is the hidden dimension per head. With sequence parallelism such as DeepSpeed Ulysses, the input tensor is partitioned along the sequence dimension across a sequence parallel group of size *P*, that is *\[B, S/P, H, D\]*, prior to the alltoall collective communication. The alltoall collective gathers the partitioned tensors along the sequence dimension and scatters them along the head dimension, essentially transforming the tensor from *\[B, S/P, H, D\]* to *\[B, S, H/P, D\]*. Post attention computation, a second alltoall communication transforms *\[B, S, H/P, D\]* back to *\[B, S/P, H, D\]*.
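+
+The sketch below shows one way this sequence-to-head layout change could be expressed with `torch.distributed`. It is a minimal illustration under the assumptions above (equal *S/P* shards, *H* divisible by *P*) and is not the DeepSpeed Ulysses or Ulysses-Offload implementation.
+
+```python
+# Illustrative sketch of the Ulysses-style sequence-to-head all-to-all:
+#   input  [B, S/P, H,   D]  (sequence-sharded)
+#   output [B, S,   H/P, D]  (head-sharded)
+import torch
+import torch.distributed as dist
+
+def seq_to_head_all_to_all(x: torch.Tensor, group=None) -> torch.Tensor:
+    P = dist.get_world_size(group)                 # sequence-parallel degree
+    B, s_local, H, D = x.shape                     # s_local = S / P
+    assert H % P == 0, "number of heads must be divisible by P"
+    h_local = H // P
+    # Slice the head dimension into P pieces and move the slice index to the front,
+    # so that slice p is sent to rank p by all_to_all_single.
+    x = x.reshape(B, s_local, P, h_local, D).permute(2, 0, 1, 3, 4).contiguous()
+    out = torch.empty_like(x)
+    dist.all_to_all_single(out, x, group=group)    # requires an initialized NCCL/MPI group
+    # out[p] holds rank p's sequence shard for our local heads; stitch the P shards
+    # back together along the sequence dimension.
+    return out.permute(1, 0, 2, 3, 4).reshape(B, P * s_local, h_local, D)
+```
+
+Running the same routine with the roles of the sequence and head dimensions swapped gives the reverse head-to-sequence transform applied after attention.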
+
+In our Ulysses-Offload design, the input sequence is partitioned at a much finer granularity than in DeepSpeed Ulysses. In other words, we change the sequence partitioning such that the per-GPU *S/P* sequence is further subdivided into *u* smaller chunks. Thus, the input tensors are now represented as *\[B, S/uP, H, D\]*. We denote these chunks as $T_i$, where $i \in \{0, 1, \ldots, u-1\}$. As shown in Figure 1, $T_i$ is projected to query $q_i$, key $k_i$, and value $v_i$. Then, similar to DeepSpeed Ulysses, an alltoall collective communication gathers the partitioned tensors along the sequence dimension and scatters them along the head dimension. In our chunk design, the sequence length of each chunk is reduced by a factor of *u* compared to Ulysses. Please note that our Ulysses-Offload chunking procedure is generally applicable to other sequence parallelism techniques.
+
+
+
+Figure 3: Core design with offload description
+
+Figure 3 gives an example of how to perform the computation of chunk $T_m$. After the alltoall collective communication, GPU $j$ receives $\widehat{q}_m$, $\widehat{k}_m$, and $\widehat{v}_m$. We then fetch the previous sequence chunks, one chunk at a time, from host memory to GPU $j$, perform online attention with the current $\widehat{q}_m$, and update the output chunk accordingly. Note that, strictly speaking, at any given time only one set of chunks $\widehat{k}_i$ and $\widehat{v}_i$ is placed in the GPU's HBM, reducing the memory footprint to $\frac{1}{u}$ of the non-offloading version when double buffering is not used. With double buffering, the memory footprint is reduced to $\frac{2}{u}$.
+
+### Memory Management
+
+Ulysses-Offload optimizes memory usage by carefully managing the allocation and deallocation of buffers during training. This involves:
+
+1. Double Buffering:
+
+    - Two sets of buffers are maintained to overlap computation with data transfer (a sketch follows this list).
+
+    - While one set of buffers is used for computation, the other set is preloaded with the next chunk of data.
+
+2. Hierarchical Memory Utilization:
+
+    - GPU High Bandwidth Memory (HBM) is used for active computation.
+
+    - Host memory is used to store intermediate results that are not immediately needed, reducing the pressure on GPU memory.
+
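+As a rough illustration of the online attention and double-buffered chunk fetching described above, the sketch below streams KV chunks from host memory through two device-side buffers while maintaining running softmax statistics. It is a simplified, single-head example under assumed shapes and is not the Ulysses-Offload (FPDT) implementation; in the real pipeline the copies are issued from pinned host memory on a dedicated CUDA stream so that they overlap with compute.
+
+```python
+# Illustrative sketch: online (streaming) softmax attention over KV chunks kept in
+# host memory, using two device buffers in a double-buffered fashion.
+import math
+import torch
+
+def chunked_offload_attention(q, k_chunks_cpu, v_chunks_cpu, device="cuda"):
+    T, D = q.shape                                   # q: one query chunk already on the device
+    scale = 1.0 / math.sqrt(D)
+    acc = torch.zeros(T, D, device=device)           # running (unnormalized) output
+    row_sum = torch.zeros(T, 1, device=device)       # running softmax denominator
+    row_max = torch.full((T, 1), float("-inf"), device=device)
+
+    buffers = [None, None]                           # double buffer for (k, v) chunks
+    buffers[0] = (k_chunks_cpu[0].to(device, non_blocking=True),
+                  v_chunks_cpu[0].to(device, non_blocking=True))
+    for i in range(len(k_chunks_cpu)):
+        k, v = buffers[i % 2]
+        if i + 1 < len(k_chunks_cpu):                # prefetch the next chunk into the other buffer
+            buffers[(i + 1) % 2] = (k_chunks_cpu[i + 1].to(device, non_blocking=True),
+                                    v_chunks_cpu[i + 1].to(device, non_blocking=True))
+        scores = (q @ k.T) * scale                   # [T, C] attention scores for this KV chunk
+        new_max = torch.maximum(row_max, scores.max(dim=-1, keepdim=True).values)
+        correction = torch.exp(row_max - new_max)    # rescale previously accumulated statistics
+        p = torch.exp(scores - new_max)
+        row_sum = row_sum * correction + p.sum(dim=-1, keepdim=True)
+        acc = acc * correction + p @ v
+        row_max = new_max
+    return acc / row_sum
+```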
+## Integration with Existing Frameworks
+
+Ulysses-Offload is designed to integrate seamlessly with popular deep learning frameworks such as PyTorch. Ulysses-Offload provides user-friendly APIs that abstract the complexities of pipelined training and memory management. Users can adopt Ulysses-Offload with minimal changes to existing codebases.
+
+## Experimental Results
+
+
+
+Figure 4: Supported sequence lengths and corresponding Model FLOPs Utilization (MFU) using Megatron-SP, Ulysses, and our proposed Ulysses-Offload (FPDT). OOM denotes the point where increasing the sequence length will cause memory issues. We show Ulysses-Offload's performance when the sequence length is larger than 128K, as shorter sequences can be properly handled by existing strategies.
+
+### Extended Sequence Lengths
+
+In our experimental setup, we compare Ulysses-Offload with two existing methods: Microsoft DeepSpeed Ulysses and NVIDIA Megatron-SP. Both DeepSpeed Ulysses and Megatron-SP employ similar approaches to sequence parallelism but differ in the collective communication used for gathering sequences before the attention block. The former utilizes alltoall communication, whereas the latter employs allgather. Ulysses-Offload builds upon the DeepSpeed Ulysses approach. The primary advantage of Ulysses-Offload is its capability to support the training of large language models (LLMs) with ultra-long sequence lengths using fewer GPUs. As shown in Figure 4, our method enables the training of 8B parameter models with sequence lengths of 2 million tokens using only 4 GPUs. For even larger models, such as GPT-30B and Llama-70B parameter models, Ulysses-Offload supports sequence lengths of up to 3 million and 4 million tokens using 16 GPUs and 32 GPUs, respectively. This represents a 16x increase in sequence length compared to current state-of-the-art solutions (see Figure 5), making Ulysses-Offload a game-changer for tasks that require processing long sequences.
+
+### High Hardware Efficiency
+
+As shown in Figure 4, with model sizes ranging from GPT-2.7B to Llama-80B parameters, Ulysses-Offload achieves over 55% Model FLOPs Utilization (MFU), ensuring that the hardware resources are utilized effectively. This high level of efficiency is maintained even when dealing with extremely long sequences (up to 4 million tokens of context), making Ulysses-Offload an ideal solution for training large-scale LLMs. By maximizing the use of available hardware, Ulysses-Offload reduces the overall cost and complexity of training long-context models. Our [technical report](https://arxiv.org/abs/2408.16978) offers further insights into optimizing sequence chunks to balance the trade-off between memory usage and MFU.
+
+
+
+Figure 5: A comprehensive analysis of long-context LLM training with different training techniques: tensor parallelism (TP), activation checkpointing (AC), activation checkpointing with CPU offloading (OC), Ulysses (UL), and our approach Ulysses-Offload (FPDT).
+
+## Implementation and Usability
+
+Ulysses-Offload is designed to be easily integrated with popular deep learning frameworks such as DeepSpeed, Megatron-DeepSpeed and PyTorch. Users can adopt our approach with minimal changes to their existing training pipeline, making it accessible to a broad audience. The integration process involves setting up the sequence chunk pipeline and configuring the memory optimization techniques, both of which are straightforward and well-documented (see tutorial).
+
+Our pipeline design and memory optimization techniques are straightforward to implement, making Ulysses-Offload accessible to researchers and practitioners aiming to train long-context LLMs efficiently. We provide a detailed [technical report](https://arxiv.org/abs/2408.16978), documentation, and examples to guide users through the setup process, ensuring a smooth transition to using Ulysses-Offload. Additionally, Ulysses-Offload, in the tradition of DeepSpeed, provides a user-friendly API that abstracts the complexities of mixed precision training and memory optimization, allowing users to focus on their research and development tasks.
+
+## General Availability of DeepSpeed Ulysses-Offload
+
+We are excited to release Ulysses-Offload. Ulysses-Offload has been fully integrated with Megatron-DeepSpeed and is accessible through both the DeepSpeed and Megatron-DeepSpeed GitHub repos. See the detailed [tutorial](https://www.deepspeed.ai/tutorials/ulysses-offload/) for usage.
+
+We invite the community to explore our implementation, contribute to further advancements, and join us in pushing the boundaries of what is possible in LLM and AI.
+This release is part of the bigger DeepSpeed ecosystem of large-scale AI training, finetuning, and inference. For more details on all DeepSpeed technologies and innovations, please visit our [website](https://www.deepspeed.ai/) and follow us on X, formerly Twitter, ([English](https://twitter.com/DeepSpeedAI), [Japanese](https://twitter.com/DeepSpeedAI_JP)) and [Chinese Zhihu](https://www.zhihu.com/people/deepspeed).
diff --git a/blogs/ulysses-offload/media/image1.png b/blogs/ulysses-offload/media/image1.png
new file mode 100644
index 000000000000..df63ef819763
Binary files /dev/null and b/blogs/ulysses-offload/media/image1.png differ
diff --git a/blogs/ulysses-offload/media/image2.png b/blogs/ulysses-offload/media/image2.png
new file mode 100644
index 000000000000..fee5ad05d313
Binary files /dev/null and b/blogs/ulysses-offload/media/image2.png differ
diff --git a/blogs/ulysses-offload/media/image3.png b/blogs/ulysses-offload/media/image3.png
new file mode 100644
index 000000000000..7777b1b245dd
Binary files /dev/null and b/blogs/ulysses-offload/media/image3.png differ
diff --git a/blogs/ulysses-offload/media/image4.png b/blogs/ulysses-offload/media/image4.png
new file mode 100644
index 000000000000..05226922851b
Binary files /dev/null and b/blogs/ulysses-offload/media/image4.png differ
diff --git a/blogs/ulysses-offload/media/image5.png b/blogs/ulysses-offload/media/image5.png
new file mode 100644
index 000000000000..447037975d1a
Binary files /dev/null and b/blogs/ulysses-offload/media/image5.png differ
diff --git a/blogs/windows/08-2024/README.md b/blogs/windows/08-2024/README.md
new file mode 100644
index 000000000000..5148fa80d122
--- /dev/null
+++ b/blogs/windows/08-2024/README.md
@@ -0,0 +1,101 @@
+
+ +# DeepSpeed on Windows + +
+
+# Introduction
+
+DeepSpeed is a popular open-source deep learning optimization library that makes distributed training and inference easy, efficient, and effective. DeepSpeed has been widely used to train a variety of state-of-the-art models, including Phi-3, Megatron-Turing-530B, BLOOM-176B, and Arctic, because of its rich suite of sophisticated optimizations (e.g., ZeRO, 3D parallelism, MoE, etc.). However, the lack of native support for Microsoft Windows, the most popular operating system, means that DeepSpeed innovations are inaccessible to many AI developers and users. To address this problem, we started an effort to make DeepSpeed run natively with full features on Windows, while ensuring the same ease-of-use enjoyed on Linux.
+
+In this blog, we are pleased to announce some early achievements on this journey: DeepSpeed can now be installed on Windows and run natively for single-GPU training, finetuning, and inferencing. Importantly, both the installation and usage experiences are identical to those on Linux. Furthermore, the finetuning and inferencing workloads demonstrate the functioning of three critical DeepSpeed features: HuggingFace Transformers integration, LoRA support, and CPU offloading. DeepSpeed on Windows is available in DeepSpeed versions 0.14.5 and above. In the rest of this blog, we present examples to demonstrate these achievements.
+
+# Evaluation Environment
+We conducted the experiments on a Surface Laptop Studio 2 running Windows 11 Version 23H2 and Build 22631.3880. The laptop is equipped with a single NVIDIA RTX A2000 GPU with 4GB VRAM. We used PyTorch version 2.3.0 and HuggingFace Transformers version 4.41.2. The example scripts used are from the [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples); therefore, you need to clone the repo before running any of the following examples.
+
+# Installation
+DeepSpeed can be installed on Windows in one of two ways. The easier way is to use the pip package manager, while the other is to build from source. The prerequisites in both cases are Python 3.x and PyTorch with CUDA support.
+
+## Installing via pip
+To install DeepSpeed, simply run: `pip install deepspeed`. This will install the latest version of DeepSpeed (0.14.5 at this time). Unlike the Linux counterpart, the Windows version comes with all the operators already prebuilt, so there is no need to have a CUDA SDK or C++ compiler installed.
+
+
+ +
+ +
+ pip installation of DeepSpeed on Windows. +
+ + +## Building from Source +To build DeepSpeed from source, you need to clone the DeepSpeed repository and run the `build_win.bat` compilation script. + + +## Validating Installation +Regardless of the installation choice, you can check that the installation was successful by running ds_report. The output should look like this: + + +
+ +
+ +
+ ds_report output confirming Windows installation of DeepSpeed. +
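+
+In addition to `ds_report`, a quick programmatic check from Python can confirm that DeepSpeed and its accelerator abstraction see the GPU. This is just an illustrative snippet, not part of the example scripts:
+
+```python
+# Minimal sanity check of a Windows DeepSpeed installation (illustrative).
+import torch
+import deepspeed
+from deepspeed.accelerator import get_accelerator
+
+print("DeepSpeed version:", deepspeed.__version__)
+print("Accelerator:", get_accelerator().device_name())  # expect "cuda" on an NVIDIA GPU
+print("CUDA available:", torch.cuda.is_available())
+```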
+
+# Pretraining Examples
+We use an image classification model, CIFAR10, and a language model, BERT, to demonstrate pretraining on Windows with DeepSpeed.
+
+## Pretraining CIFAR10
+The scripts and code required for the CIFAR10 pretraining example are available in the following path: `DeepSpeedExamples\training\cifar`. You can launch the CIFAR10 pretraining experiment using the following command: `deepspeed cifar10_deepspeed.py --deepspeed`. The final output should look something like this:
+
+ +
+ +
+ Pretraining CIFAR10 model on Windows using DeepSpeed. +
+ +## Pretraining BERT +The scripts and codes for the BERT pretraining example are available in the following path: DeepSpeedExamples\training\HelloDeepSpeed. You can launch the BERT pretraining experiment using the following command: `deepspeed train_bert_ds.py --checkpoint_dir experiment_deepspeed`. The final output should look like this: + +
+ +
+ +
+ Pretraining BERT model on Windows using DeepSpeed. +
+ +# Fine Tuning Example +We demonstrate fine tuning capability by using the supervised fine tuning (SFT) step of DeepSpeed-Chat application. We conduct SFT of the HuggingFace facebook/opt-125m model while enabling LoRA and CPU offloading memory optimizations. The command line for running this example is as follows:\ +`deepspeed training\step1_supervised_finetuning\main.py --model_name_or_path facebook/opt-125m --gradient_accumulation_steps 8 --lora_dim 128 --only_optimize_lora --print_loss --zero_stage 2 --deepspeed --dtype bf16 --offload --output_dir output`.\ +The output should look like this: + +
+ +
+ +
+ Supervised Finetuning of facebook/opt-125m model on Windows using DeepSpeed. +
+ +# Inference Example +We demonstrate inference capability by using ZeRO-Inference for token generation. ZeRO-Inference reduces hardware cost of inferencing by offloading to CPU or NVMe memories. We use the example scripts here to run token generation using Llama-2-7B model from HuggingFace. We offload the model weights to CPU memory since the 4GB VRAM is insufficient to host both the model and the generation working set. We use the following command line to generate 32 tokens from a prompt of 8 tokens:\ +`deepspeed run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 64 --prompt-len 8 --gen-len 32 --cpu-offload`.\ +The output will look something like this: + +
+ +
+ +
+ LLAMA2-7B token generation on Windows using ZeRO-Inference. +
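+
+For reference, the CPU offloading used by ZeRO-Inference is driven by a ZeRO stage 3 configuration with parameter offload. The dictionary below is a hedged sketch of such a configuration passed to `deepspeed.initialize`; the exact configuration built by `run_model.py` may differ.
+
+```python
+# Illustrative ZeRO-Inference style config: ZeRO stage 3 with weights offloaded to CPU.
+import deepspeed
+
+ds_config = {
+    "train_micro_batch_size_per_gpu": 1,   # batch-size field expected by DeepSpeed config validation
+    "bf16": {"enabled": True},
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {"device": "cpu", "pin_memory": True},
+    },
+}
+# engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+```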
+ +# Summary +Enabling DeepSpeed, a popular deep learning framework, to run natively on Windows, the most popular operating system, is a crucial step towards empowering every person and every organization to benefit from the ongoing AI revolution. In this blog, we have shared early results of our work towards this goal. Although Windows support of DeepSpeed is a work-in-progress, we hope that the above updates are encouraging and already useful to users. The next items on our roadmap include running on multiple GPUs, weight quantization, and performance studies. + +# Acknowledgements +This work is a result of significant contributions from current and former DeepSpeed members including Costin Eseanu, Logan Adams, Elton Zheng, Reza Yazdani Aminabadi, Martin Cai, and Olatunji Ruwase. We also acknowledge the valuable contributions of DeepSpeed users who righteously demanded this feature, provided critical workarounds, partial solutions, and constructive feedback, and most importantly, stuck with us. diff --git a/blogs/windows/08-2024/chinese/README.md b/blogs/windows/08-2024/chinese/README.md new file mode 100644 index 000000000000..5d62705df3ae --- /dev/null +++ b/blogs/windows/08-2024/chinese/README.md @@ -0,0 +1,103 @@ +
+ +# 在Windows系统上使用DeepSpeed + +
+ +# 简介 + +DeepSpeed是一个广受欢迎的开源深度学习优化库,它使得分布式训练和推理变得简单、高效且有效。凭借其众多复杂的优化技术(如ZeRO、3D并行、MoE等),DeepSpeed已被成功应用于包括Phi-3、Megatron-Turing-530B、BLOOM-176B和Arctic在内的多种前沿模型的训练。然而,由于缺乏对主流操作系统微软 Windows的原生支持,许多AI开发者与用户无法充分利用DeepSpeed的创新。为此,我们致力于让DeepSpeed在Windows上实现原生全功能运行,并保持与Linux相同的易用性。 + +在这篇博客中,我们很高兴地宣布我们开发工作中的一些早期成果:DeepSpeed 现在可以在 Windows 上安装并原生支持单 GPU 的训练、微调和推理。重要的是,安装和使用体验与 Linux 上完全相同。此外,微调和推理工作展示了 DeepSpeed 的三个关键特性:HuggingFace Transformers 的集成、LoRA 的支持和 CPU offload。DeepSpeed 在 Windows 上的支持从 DeepSpeed 0.14.5 开始。接下来,我们将通过一些例子展示这些成就。 + +# 测试环境 + +我们在一台运行 Windows 11 23H2 版本号 22631.3880 的 Surface Laptop Studio 2 上进行了测试。该笔记本配备了一块 4GB 显存的 NVIDIA RTX A2000 GPU。我们使用了 Pytorch 2.3.0 和 HuggingFace Transformers 4.41.2。测试所用的脚本来自 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) 代码仓库,因此在运行以下任何示例前,你需要克隆该仓库。 + +# 安装指南 +DeepSpeed可以通过两种方式在Windows系统上安装。较为简单的方式是使用pip包管理器安装,另一种方法是从源代码安装。两种安装方式的前提条件都是系统已经安装了Python 3.x 和支持CUDA的Pytorch. + +## 通过pip安装 +要安装 DeepSpeed,只需运行:`pip install deepspeed`。它将安装最新版本的 DeepSpeed(目前为 0.14.5)。与 Linux 版本不同的是,Windows 版本已经预先编译了内部的自定义算子,因此不需要安装 CUDA 或 C++ 编译器。 + +
+ +
+ +
+ 通过pip在Windows上安装Deepspeed. +
+ + +## 通过源代码安装 +克隆DeepSpeed代码仓库后,运行build_win.bat脚本进行编译安装。 + + +## 验证安装 +无论选择哪种安装方式,你都可以通过运行 ds_report 来检查安装是否成功。输出应该如下所示: + + +
+ +
+ +
+ ds_report的输出结果,用于验证安装是否成功. +
+ +# 预训练(Pretraining) +我们使用图像分类模型 CIFAR10 和语言模型 BERT 来演示在 Windows 上使用 DeepSpeed 进行预训练。 + +## CIFAR10模型预训练 +用于 CIFAR10 预训练的脚本和代码可以在以下路径找到:`DeepSpeedExamples\training\cifar`。你可以运行以下命令启动 CIFAR10 预训练实验:`deepspeed cifar10_deepspeed.py --deepspeed`。最终输出应类似于: +
+ +
+ +
+ 在 Windows 上使用 Deepspeed 进行 CIFAR10 模型预训练 +
+ +## BERT模型预训练 +用于 BERT 预训练的脚本和代码可以在以下路径找到:`DeepSpeedExamples\training\HelloDeepSpeed`。你可以使用以下命令启动 BERT 预训练实验:`deepspeed train_bert_ds.py --checkpoint_dir experiment_deepspeed`。最终输出应如下所示: + +
+ +
+ +
+ 在 Windows 上使用 Deepspeed 进行 BERT 模型预训练 +
+ +# 微调(Fine Tuning) +我们使用 DeepSpeed-Chat 应用的监督微调(SFT)步骤来演示微调能力。我们对 HuggingFace 的 facebook/opt-125m 模型进行监督微调,同时启用 LoRA 和 CPU offload进行内存优化。运行命令行如下:\ +`deepspeed training\step1_supervised_finetuning\main.py --model_name_or_path facebook/opt-125m --gradient_accumulation_steps 8 --lora_dim 128 --only_optimize_lora --print_loss --zero_stage 2 --deepspeed --dtype bf16 --offload --output_dir output`\ +输出应如下所示: + +
+ +
+ +
+ 在 Windows 上使用 DeepSpeed 对 facebook/opt-125m 监督微调 +
+ +# 推理 +我们使用 ZeRO-Inference 的token生成来演示推理能力。ZeRO-Inference 通过转移存储到 CPU 内存或 NVMe 硬盘内存来减少推理的硬件成本。我们使用以下脚本运行 HuggingFace 的 Llama-2-7B 模型来进行 token 生成。由于 4GB 显存无法容纳模型和生成所需的内存,我们将模型权重转移到 CPU 内存。我们使用以下命令行从 8个token的提示词中生成 32 个token:\ +`deepspeed run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 64 --prompt-len 8 --gen-len 32 --cpu-offload`\ +输出应类似于: + +
+ +
+ +
+ 在 Windows 上使用 ZeRO-Inference 进行 LLAMA2-7B 模型的token生成 +
+
+# 总结
+
+使得DeepSpeed,一个流行的深度学习框架,能够原生运行在最流行的操作系统 Windows 上,是让每个人和组织从当前的人工智能革命中受益的重要一步。在这篇博客中,我们分享了我们为实现这一目标所取得的早期成果。尽管 DeepSpeed 对 Windows 的支持仍在继续开发中,我们希望上述结果已经能够对我们的用户有实用价值,并且鼓舞他们。我们接下来的工作计划涵盖多GPU支持、权重量化以及性能优化。
+
+# 致谢
+这个项目的完成得益于现任和前任 DeepSpeed 成员的大力合作,包括 Costin Eseanu、Logan Adams、Elton Zheng、Reza Yazdani Aminabadi、Martin Cai 和 Olatunji Ruwase。我们还要感谢那些及时提出此项需求、提供关键的临时解决方法、部分解决方案和建设性反馈的 DeepSpeed 用户,最重要的是,他们始终与我们同行。
diff --git a/blogs/windows/08-2024/japanese/README.md b/blogs/windows/08-2024/japanese/README.md
new file mode 100644
index 000000000000..c2f5b9ee2143
--- /dev/null
+++ b/blogs/windows/08-2024/japanese/README.md
@@ -0,0 +1,123 @@
+
+ +# DeepSpeedのWindowsサポート + +
+ +# はじめに + +DeepSpeedは、分散学習と推論を簡単かつ効率的に行うための人気のあるオープンソースの深層学習最適化ライブラリです。DeepSpeedは、その豊富かつ高度な最適化機能(例:ZeRO、3D parallelism, MoEなど)のおかげで、Phi-3、Megatron-Turing-530B、BLOOM-176B、Arcticなどの最先端モデルの学習に広く利用されています。しかし、最も普及しているオペレーティングシステムであるMicrosoft Windowsをネイティブにサポートしていなかったため、多くのAI開発者やユーザーが、DeepSpeedの革新的な機能を利用できない状態でした。この問題を解決するため、DeepSpeedの完全な機能をWindows上でネイティブに実行し、Linux上と同じ使いやすさを実現するための取り組みを開始しました。 + +このブログでは、この取り組みの最初の成果をお知らせします。現在、DeepSpeedはWindowsにインストールし、単一GPUでの学習、ファインチューニング、および推論をネイティブに実行できるようになりました。ここで重要なこととして、インストールと利用は、Linuxとまったく同じように行えます。ファインチューニングと推論のワークロードを通じて、HuggingFace Transformers との統合、LoRAのサポート、CPUオフロードの3つの重要なDeepSpeedの機能が、正しく動作していることが確認できました。このWindowsサポートは、バージョン0.14.5以降で利用可能です。このブログの残りの部分では、これらの成果を示す例を紹介します。 + +# テスト環境 + +Windows 11 Version 23H2 および Build 22631.3880 を実行している Surface Laptop Studio 2 でテストを行いました。このハードウェアには、4GBのVRAMを搭載した NVIDIA RTX A2000 GPU が1つ搭載されています。また、PyTorchバージョン 2.3.0 および HuggingFace Transformersバージョン 4.41.2 を使用しました。使用したサンプルスクリプトは[DeepSpeedExamplesリポジトリ](https://github.com/deepspeedai/DeepSpeedExamples)から取得できます。以下の例を実行する前にリポジトリをクローンしてください。 + +# インストール + +DeepSpeedは、2つの方法でWindowsにインストールできます。より簡単な方法は、pipパッケージマネージャーを使用することで、もう一方はソースからビルドする方法です。どちらの場合も、Python 3.xとCUDAサポート付きのPyTorchが必要です。 + +## pipを使用したインストール + +DeepSpeedをインストールするには、単に次のコマンドを実行します: `pip install deepspeed`。 +これにより、最新バージョンのDeepSpeed(現時点では0.14.5)がインストールされます。Linux版とは異なり、Windows版ではすべてのオペレーターがすでにビルド済みであるため、CUDA SDKやC++コンパイラをインストールする必要はありません。 + +
+ +
+ +
+ pipによるWindowsへのDeepSpeedのインストール +
+ + +## ソースからのビルド + +ソースからDeepSpeedをビルドするには、DeepSpeedリポジトリをクローンし、コンパイルスクリプトである `build_win.bat` を実行する必要があります。 + +## インストールの検証 + +インストール方法にかかわらず、`ds_report`を実行してインストールが成功したかどうかを確認できます。出力は次のようになります: + +
+ +
+ +
+ DeepSpeedのWindowsインストールを確認するds_reportの出力 +
+
+# 事前学習の例
+
+Windows上でDeepSpeedを使用した事前学習の例として、画像分類モデルCIFAR10と言語モデルBERTの実行例を示します。
+
+## CIFAR10の事前学習
+
+CIFAR10の事前学習に必要なスクリプトとコードは、次のパスにあります: `DeepSpeedExamples\training\cifar`
+
+以下のコマンドを使用してCIFAR10の事前学習を開始できます: `deepspeed cifar10_deepspeed.py --deepspeed`
+
+出力は次のようになります。
+
+ +
+ +
+ DeepSpeedによるWindowsでのCIFAR10モデルの事前学習 +
+ +## BERTの事前学習 + +BERTの事前学習に必要なスクリプトとコードは、次のパスにあります: `DeepSpeedExamples\training\HelloDeepSpeed` + +以下のコマンドを使用してBERTの事前学習を開始できます: `deepspeed train_bert_ds.py --checkpoint_dir experiment_deepspeed` + +出力は次のようになります。 + +
+ +
+ +
+ DeepSpeedによるWindowsでのBERTモデルの事前学習 +
+ +# ファインチューニングの例 + +DeepSpeed-Chatアプリケーションの教師ありファインチューニング(supervised fine tuning; SFT)を使用して、ファインチューニングの機能を示します。LoRAおよびCPUオフロードメモリ最適化を有効にして、 HuggingFace の `facebook/opt-125m` モデルのSFTを実施します。この例を実行するためのコマンドラインは次のとおりです: `deepspeed training\step1_supervised_finetuning\main.py --model_name_or_path facebook/opt-125m --gradient_accumulation_steps 8 --lora_dim 128 --only_optimize_lora --print_loss --zero_stage 2 --deepspeed --dtype bf16 --offload --output_dir output` + +出力は次のようになります。 + +
+ +
+ +
+ DeepSpeedを使用したWindowsでの facebook/opt-125m モデルのファインチューニング +
+
+# 推論の例
+
+推論の機能を示すために、トークン生成のためのZeRO-Inferenceを使用します。ZeRO-Inferenceは、CPUまたはNVMeメモリにオフロードすることで推論のハードウェアコストを削減します。ここでは、サンプルスクリプトを使用して、HuggingFaceのLlama-2-7Bモデルを使用したトークン生成を実行します。4GBのVRAMではモデルと生成処理の両方を実行するのに十分ではないため、モデルパラメータをCPUメモリにオフロードします。
+
+次のコマンドラインを使用して、8トークンのプロンプトから32トークンを生成します: `deepspeed run_model.py --model meta-llama/Llama-2-7b-hf --batch-size 64 --prompt-len 8 --gen-len 32 --cpu-offload`
+
+出力は次のようになります。
+
+ +
+ +
+ DeepSpeedのZeRO-InferenceによるWindowsでのLLAMA2-7Bのトークン生成 +
+ +# まとめ + +最も広く使われているオペレーティングシステムであるWindowsで、深層学習フレームワークであるDeepSpeedをネイティブに実行できるようにすることは、多くの人と組織が、今まさに進行中のAI革命の恩恵を受けるための重要な一歩です。このブログでは、この目標に向けたプロジェクトの、最初の成果を共有しました。Windowsのサポートは現在進行中のプロジェクトですが、今回の成果が多くのユーザにとって活用され、またさらに発展していけることを願っています。次のロードマップには、複数のGPUでの実行、モデルパラメータの量子化、パフォーマンスの詳細な分析が含まれます。 + +# 謝辞 + +このプロジェクトは、Costin Eseanu、Logan Adams、Elton Zheng、Reza Yazdani Aminabadi、Martin Cai、Olatunji Ruwaseを含むDeepSpeedメンバーによる大きな貢献の結果です。また、この機能を必要とし、様々な問題の解決策や、建設的なフィードバックを提供し、私たちと共に歩んでくれたDeepSpeedユーザーの重要な貢献に感謝します。 diff --git a/blogs/windows/08-2024/media/bert_training.png b/blogs/windows/08-2024/media/bert_training.png new file mode 100644 index 000000000000..c5935e47747e Binary files /dev/null and b/blogs/windows/08-2024/media/bert_training.png differ diff --git a/blogs/windows/08-2024/media/cifar10_training.png b/blogs/windows/08-2024/media/cifar10_training.png new file mode 100644 index 000000000000..99f3fa25bc70 Binary files /dev/null and b/blogs/windows/08-2024/media/cifar10_training.png differ diff --git a/blogs/windows/08-2024/media/ds_report.png b/blogs/windows/08-2024/media/ds_report.png new file mode 100644 index 000000000000..43d82d724ed2 Binary files /dev/null and b/blogs/windows/08-2024/media/ds_report.png differ diff --git a/blogs/windows/08-2024/media/llama2-7b_inference.png b/blogs/windows/08-2024/media/llama2-7b_inference.png new file mode 100644 index 000000000000..f5874468a854 Binary files /dev/null and b/blogs/windows/08-2024/media/llama2-7b_inference.png differ diff --git a/blogs/windows/08-2024/media/opt125m_finetuning.png b/blogs/windows/08-2024/media/opt125m_finetuning.png new file mode 100644 index 000000000000..ed6d1522e3b3 Binary files /dev/null and b/blogs/windows/08-2024/media/opt125m_finetuning.png differ diff --git a/blogs/windows/08-2024/media/win_pip_install_deepspeed.png b/blogs/windows/08-2024/media/win_pip_install_deepspeed.png new file mode 100644 index 000000000000..3b87c95ef144 Binary files /dev/null and b/blogs/windows/08-2024/media/win_pip_install_deepspeed.png differ diff --git a/blogs/zeropp/chinese/README.md b/blogs/zeropp/chinese/README.md index e4a6b5279de5..09aac2cef948 100644 --- a/blogs/zeropp/chinese/README.md +++ b/blogs/zeropp/chinese/README.md @@ -174,7 +174,7 @@ ZeRO++ 已集成到 DeepSpeed-Chat 中,以支持 ChatGPT 类模型的 RLHF 训 DeepSpeed-ZeRO++ 是 DeepSpeed 生态系统的一部分。 要了解更多信息,请访问我们的网站,在那里您可以找到详细的博客文章、教程和有用的文档。 -您还可以在我们的[英文 Twitter](https://twitter.com/MSFTDeepSpeed)、[日文 Twitter](https://twitter.com/MSFTDeepSpeedJP) 和[中文知乎](https://www.zhihu.com/people/deepspeed) 上获取最新的 DeepSpeed 新闻。 +您还可以在我们的[英文 Twitter](https://twitter.com/DeepSpeedAI)、[日文 Twitter](https://twitter.com/DeepSpeedAI_JP) 和[中文知乎](https://www.zhihu.com/people/deepspeed) 上获取最新的 DeepSpeed 新闻。 DeepSpeed 欢迎您的贡献! 
我们鼓励您在 DeepSpeed GitHub 页面上报告问题、贡献 PR 并加入讨论。 有关更多详细信息,请参阅我们的贡献指南。 我们对与大学、研究实验室和公司的合作持开放态度。 对于此类请求(以及其他不适合 GitHub 的请求),请直接发送电子邮件至 。 diff --git a/blogs/zeropp/japanese/README.md b/blogs/zeropp/japanese/README.md index a4d4e68f6b02..5a76930e3f96 100644 --- a/blogs/zeropp/japanese/README.md +++ b/blogs/zeropp/japanese/README.md @@ -174,9 +174,9 @@ ZeRO++の技術的な詳細については、arXivにアップロードされた DeepSpeed-ZeRO++は、DeepSpeedエコシステムの一部です。詳細については、我々の[Webサイト](https://www.deepspeed.ai/)をご覧ください。詳細なブログ記事、チュートリアル、ドキュメントが掲載されています。 -また、[英語版Twitter](https://twitter.com/MSFTDeepSpeed)、[日本語版Twitter](https://twitter.com/MSFTDeepSpeedJP)、[中国語版Zhihuアカウント](https://www.zhihu.com/people/deepspeed)でも最新のDeepSpeedニュースを発信しています。 +また、[英語版Twitter](https://twitter.com/DeepSpeedAI)、[日本語版Twitter](https://twitter.com/DeepSpeedAI_JP)、[中国語版Zhihuアカウント](https://www.zhihu.com/people/deepspeed)でも最新のDeepSpeedニュースを発信しています。 -DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/microsoft/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 +DeepSpeedは、皆様の開発への参加を歓迎しています。DeepSpeedのGitHubページで、バグ報告、Pull Request、ディスカッションへの参加が可能です。詳細は[ガイドライン](https://github.com/deepspeedai/DeepSpeed/blob/master/CONTRIBUTING.md)をご覧ください。また、大学、研究所、企業とのコラボレーションも行っています。こうしたコラボレーションについてのご要望(およびGitHubには適さないその他の話題)については まで直接メールをお送りください。 **Contributors:** diff --git a/build_win.bat b/build_win.bat index ec8c8a362a78..627694dbe8a0 100644 --- a/build_win.bat +++ b/build_win.bat @@ -1,19 +1,16 @@ @echo off +set CUDA_HOME=%CUDA_PATH% +set DISTUTILS_USE_SDK=1 + set DS_BUILD_AIO=0 +set DS_BUILD_CUTLASS_OPS=0 +set DS_BUILD_EVOFORMER_ATTN=0 +set DS_BUILD_FP_QUANTIZER=0 +set DS_BUILD_GDS=0 +set DS_BUILD_RAGGED_DEVICE_OPS=0 set DS_BUILD_SPARSE_ATTN=0 -echo Administrative permissions required. Detecting permissions... - -net session >nul 2>&1 -if %errorLevel% == 0 ( - echo Success: Administrative permissions confirmed. -) else ( - echo Failure: Current permissions inadequate. 
- goto end -) - - -python setup.py bdist_wheel +python -m build --wheel --no-isolation :end diff --git a/csrc/adagrad/cpu_adagrad.cpp b/csrc/adagrad/cpu_adagrad.cpp index 563255176500..e276ad0856dd 100644 --- a/csrc/adagrad/cpu_adagrad.cpp +++ b/csrc/adagrad/cpu_adagrad.cpp @@ -5,55 +5,38 @@ #include "cpu_adagrad.h" #include +#include #include +#include #include #include #include -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Adagrad_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) { float step_size = -1 * _alpha; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = grads[k]; float variance = _exp_avg_sq[k]; if (_weight_decay > 0) { grad = param * _weight_decay + grad; } @@ -64,58 +47,30 @@ void Adagrad_Optimizer::Step_1(float* _params, grad += _eps; grad = momentum / grad; param = grad * step_size + param; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; // STORE UPDATE TERM TO GRAD'S MEMORY grads[k] = grad * step_size; _exp_avg_sq[k] = variance; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Adagrad_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_adagrad_optimizer(int optimizer_id, @@ -149,25 +104,77 @@ int create_adagrad_optimizer(int optimizer_id, return 0; } -void Adagrad_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adagrad_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>( - &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg_sq, + size_t _param_size) +{ + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), + (ds_state_precision_t*)(_exp_avg_sq), + _param_size); +} + +std::map, + std::function, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg_sq, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg_sq.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Adagrad optimizer with param type "s + + c10::toString(params_type) + " and state type "s + + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, params.data_ptr(), grads.data_ptr(), exp_avg_sq.data_ptr(), param_size); } int ds_adagrad_step(int optimizer_id, @@ -183,58 +190,13 @@ int ds_adagrad_step(int optimizer_id, auto grads_c = grads.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step); opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} + invoke(opt, params_c, grads_c, exp_avg_sq_c, params_c.numel()); -int ds_adagrad_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float epsilon, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step); - opt->update_state(lr, epsilon, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_sq_ptr, - params_c.numel(), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } @@ -248,9 +210,6 @@ int destroy_adagrad_optimizer(int optimizer_id) PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)"); - m.def("adagrad_update_copy", - 
&ds_adagrad_step_plus_copy, - "DeepSpeed CPU Adagrad update and param copy (C++)"); m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); } diff --git a/csrc/adam/cpu_adam.cpp b/csrc/adam/cpu_adam.cpp index 96809827f3e1..263c443cb4d4 100644 --- a/csrc/adam/cpu_adam.cpp +++ b/csrc/adam/cpu_adam.cpp @@ -8,9 +8,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("adam_update", &ds_adam_step, "DeepSpeed CPU Adam update (C++)"); - m.def("adam_update_copy", - &ds_adam_step_plus_copy, - "DeepSpeed CPU Adam update and param copy (C++)"); m.def("create_adam", &create_adam_optimizer, "DeepSpeed CPU Adam (C++)"); m.def("destroy_adam", &destroy_adam_optimizer, "DeepSpeed CPU Adam destroy (C++)"); } diff --git a/csrc/adam/cpu_adam_impl.cpp b/csrc/adam/cpu_adam_impl.cpp index 9a4a8d956519..465aae7b9a34 100644 --- a/csrc/adam/cpu_adam_impl.cpp +++ b/csrc/adam/cpu_adam_impl.cpp @@ -5,42 +5,29 @@ #include #include +#include #include +#include #include #include #include #include "cpu_adam.h" -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif - +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Adam_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) { float betta1_minus1 = 1 - _betta1; @@ -48,26 +35,15 @@ void Adam_Optimizer::Step_1(float* _params, float step_size = -1 * _alpha / _bias_correction1; float w_decay = -1 * _alpha * _weight_decay; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = _exp_avg[k]; float variance = _exp_avg_sq[k]; if (_weight_decay > 0 && !_adamw_mode) { grad = param * _weight_decay + grad; } @@ -83,66 +59,31 @@ void Adam_Optimizer::Step_1(float* _params, grad = momentum / grad; if (_weight_decay > 0 && _adamw_mode) { param += w_decay * param; } param = grad * step_size + param; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; _exp_avg[k] = momentum; _exp_avg_sq[k] = variance; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Adam_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_adam_optimizer(int optimizer_id, @@ -185,33 +126,86 @@ int create_adam_optimizer(int optimizer_id, return 0; } -void Adam_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Adam_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, - _params, - grads, - _exp_avg, - _exp_avg_sq, - _param_size, - dev_params, - half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _exp_avg_sq, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), (_exp_avg_sq + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg, + void* _exp_avg_sq, + size_t _param_size) +{ + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), + (ds_state_precision_t*)(_exp_avg), + (ds_state_precision_t*)(_exp_avg_sq), + _param_size); +} + +std::map, + std::function, void*, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Adam optimizer with param type "s + c10::toString(params_type) + + " and state type "s + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, + params.data_ptr(), + grads.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + param_size); } int ds_adam_step(int optimizer_id, @@ -232,75 +226,13 @@ int ds_adam_step(int optimizer_id, auto exp_avg_c = exp_avg.contiguous(); auto exp_avg_sq_c = exp_avg_sq.contiguous(); - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step, beta1, beta2); opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.numel(), - nullptr, - (params.options().dtype() == at::kHalf)); + invoke(opt, params_c, grads_c, exp_avg_c, exp_avg_sq_c, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} - -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& device_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto device_params_c = device_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto exp_avg_sq_c = exp_avg_sq.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* device_params_ptr = (ds_half_precision_t*)device_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); - - std::shared_ptr opt = - 
std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, epsilon, weight_decay, bias_correction); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - exp_avg_sq_ptr, - params_c.numel(), - device_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } diff --git a/csrc/adam/multi_tensor_adam.cu b/csrc/adam/multi_tensor_adam.cu index 1b697d989b1a..a1fc7d15aec9 100644 --- a/csrc/adam/multi_tensor_adam.cu +++ b/csrc/adam/multi_tensor_adam.cu @@ -23,14 +23,14 @@ This file is adapted from fused adam in NVIDIA/apex, commit a109f85 #define BLOCK_SIZE 512 #define ILP 4 -typedef enum { +typedef enum : int { ADAM_MODE_0 = 0, // L2 regularization mode ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) } adamMode_t; using MATH_T = float; -template +template struct AdamFunctor { __device__ __forceinline__ void operator()(int chunk_size, volatile int* noop_gmem, @@ -48,13 +48,13 @@ struct AdamFunctor { // if(*noop_gmem == 1) // return; - int tensor_loc = tl.block_to_tensor[blockIdx.x]; + index_t tensor_loc = tl.block_to_tensor[blockIdx.x]; // potentially use to pass in list of scalar // int tensor_num = tl.start_tensor_this_launch + tensor_loc; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.sizes[tensor_loc]; + index_t chunk_idx = tl.block_to_chunk[blockIdx.x]; + index_t n = tl.sizes[tensor_loc]; T* g = (T*)tl.addresses[0][tensor_loc]; g += chunk_idx * chunk_size; @@ -71,7 +71,8 @@ struct AdamFunctor { n -= chunk_idx * chunk_size; // see note in multi_tensor_scale_kernel.cu - for (int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * ILP) { + for (index_t i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * ILP) { MATH_T r_g[ILP]; MATH_T r_p[ILP]; MATH_T r_m[ILP]; @@ -146,23 +147,57 @@ void multi_tensor_adam_cuda(int chunk_size, bias_correction2 = 1 - std::pow(beta2, step); } + size_t max_size = 0; + bool requires_64bit_indexing = false; + for (auto it = tensor_lists.begin(); it != tensor_lists.end(); it++) { + for (auto it2 = it->begin(); it2 != it->end(); it2++) { + if (it2->numel() > max_size) { + max_size = it2->numel(); + if (max_size >= INT_MAX) { + requires_64bit_indexing = true; + break; + } + } + } + if (requires_64bit_indexing) { break; } + } + // Assume single type across p,g,m1,m2 now - DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), - 0, - "adam", - multi_tensor_apply<4>(BLOCK_SIZE, - chunk_size, - noop_flag, - tensor_lists, - AdamFunctor(), - beta1, - beta2, - bias_correction1, - bias_correction2, - epsilon, - lr, - (adamMode_t)mode, - weight_decay);) + if (requires_64bit_indexing) { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>((int64_t)BLOCK_SIZE, + (int64_t)chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } else { + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) + } AT_CUDA_CHECK(cudaGetLastError()); } diff --git a/csrc/adam/multi_tensor_apply.cuh b/csrc/adam/multi_tensor_apply.cuh index 12f41cb49c6b..342376c141be 100644 --- 
a/csrc/adam/multi_tensor_apply.cuh +++ b/csrc/adam/multi_tensor_apply.cuh @@ -35,7 +35,7 @@ struct TensorListMetadata { }; template -__global__ void multi_tensor_apply_kernel(int chunk_size, +__global__ void multi_tensor_apply_kernel(int64_t chunk_size, volatile int* noop_flag, T tl, U callable, @@ -46,8 +46,8 @@ __global__ void multi_tensor_apply_kernel(int chunk_size, } template -void multi_tensor_apply(int block_size, - int chunk_size, +void multi_tensor_apply(int64_t block_size, + int64_t chunk_size, const at::Tensor& noop_flag, const std::vector>& tensor_lists, T callable, @@ -91,9 +91,9 @@ void multi_tensor_apply(int block_size, tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); loc_tensor_info++; - int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + auto chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; - for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + for (auto chunk = 0; chunk < chunks_this_tensor; chunk++) { // std::cout << chunks_this_tensor << std::endl; tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; tl.block_to_chunk[loc_block_info] = chunk; diff --git a/csrc/aio/common/deepspeed_aio_common.cpp b/csrc/aio/common/deepspeed_aio_common.cpp index 32b0e8a32394..9d7ff5093017 100644 --- a/csrc/aio/common/deepspeed_aio_common.cpp +++ b/csrc/aio/common/deepspeed_aio_common.cpp @@ -68,8 +68,8 @@ static void _get_aio_latencies(std::vector>& raw_l std::accumulate(lat_usec.begin(), lat_usec.end(), 0) / lat_usec.size(); } -static void _do_io_submit_singles(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_singles(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -89,8 +89,8 @@ static void _do_io_submit_singles(const long long int n_iocbs, } } -static void _do_io_submit_block(const long long int n_iocbs, - const long long int iocb_index, +static void _do_io_submit_block(const int64_t n_iocbs, + const int64_t iocb_index, std::unique_ptr& aio_ctxt, std::vector>& submit_times) { @@ -109,18 +109,18 @@ static void _do_io_submit_block(const long long int n_iocbs, assert(submit_ret > 0); } -static int _do_io_complete(const long long int min_completes, - const long long int max_completes, +static int _do_io_complete(const int64_t min_completes, + const int64_t max_completes, std::unique_ptr& aio_ctxt, std::vector>& reap_times) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long int n_completes = io_pgetevents(aio_ctxt->_io_ctxt, - min_completes, - max_completes, - aio_ctxt->_io_events.data(), - nullptr, - nullptr); + int64_t n_completes = io_pgetevents(aio_ctxt->_io_ctxt, + min_completes, + max_completes, + aio_ctxt->_io_events.data(), + nullptr, + nullptr); reap_times.push_back(std::chrono::high_resolution_clock::now() - start_time); assert(n_completes >= min_completes); return n_completes; @@ -134,7 +134,7 @@ void do_aio_operation_sequential(const bool read_op, { struct io_prep_context prep_ctxt(read_op, xfer_ctxt, aio_ctxt->_block_size, &aio_ctxt->_iocbs); - const auto num_io_blocks = static_cast( + const auto num_io_blocks = static_cast( ceil(static_cast(xfer_ctxt->_num_bytes) / aio_ctxt->_block_size)); #if DEBUG_DS_AIO_PERF const auto io_op_name = std::string(read_op ? 
"read" : "write"); @@ -145,15 +145,14 @@ void do_aio_operation_sequential(const bool read_op, std::vector> submit_times; std::vector> reap_times; const auto max_queue_bytes = - static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); + static_cast(aio_ctxt->_queue_depth * aio_ctxt->_block_size); auto start = std::chrono::high_resolution_clock::now(); - for (long long iocb_index = 0; iocb_index < num_io_blocks; - iocb_index += aio_ctxt->_queue_depth) { + for (int64_t iocb_index = 0; iocb_index < num_io_blocks; iocb_index += aio_ctxt->_queue_depth) { const auto start_offset = iocb_index * aio_ctxt->_block_size; const auto start_buffer = (char*)xfer_ctxt->_mem_buffer + start_offset; const auto n_iocbs = - min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); + min(static_cast(aio_ctxt->_queue_depth), (num_io_blocks - iocb_index)); const auto num_bytes = min(max_queue_bytes, (xfer_ctxt->_num_bytes - start_offset)); prep_ctxt.prep_iocbs(n_iocbs, num_bytes, start_buffer, start_offset); @@ -268,6 +267,10 @@ void report_file_error(const char* filename, const std::string file_op, const in int open_file(const char* filename, const bool read_op) { const int flags = read_op ? (O_RDONLY | O_DIRECT) : (O_WRONLY | O_CREAT | O_DIRECT); +#if defined(__ENABLE_CANN__) + int* flags_ptr = (int*)&flags; + *flags_ptr = read_op ? (O_RDONLY) : (O_WRONLY | O_CREAT); +#endif const int mode = 0600; const auto fd = open(filename, flags, mode); if (fd == -1) { @@ -281,13 +284,14 @@ int open_file(const char* filename, const bool read_op) int regular_read(const char* filename, std::vector& buffer) { - long long int num_bytes; - const auto f_size = get_file_size(filename, num_bytes); - assert(f_size != -1); - buffer.resize(num_bytes); const auto fd = open(filename, O_RDONLY, 0600); assert(fd != -1); - long long int read_bytes = 0; + struct stat fs; + const auto result = fstat(fd, &fs); + assert(result != -1); + int64_t num_bytes = fs.st_size; + buffer.resize(num_bytes); + int64_t read_bytes = 0; auto r = 0; do { const auto buffer_ptr = buffer.data() + read_bytes; @@ -297,16 +301,15 @@ int regular_read(const char* filename, std::vector& buffer) } while (r > 0); if (read_bytes != num_bytes) { - std::cerr << "read error " - << " read_bytes (read) = " << read_bytes << " num_bytes (fstat) = " << num_bytes - << std::endl; + std::cerr << "read error " << " read_bytes (read) = " << read_bytes + << " num_bytes (fstat) = " << num_bytes << std::endl; } assert(read_bytes == num_bytes); close(fd); return 0; } -static bool _validate_buffer(const char* filename, void* aio_buffer, const long long int num_bytes) +static bool _validate_buffer(const char* filename, void* aio_buffer, const int64_t num_bytes) { std::vector regular_buffer; const auto reg_ret = regular_read(filename, regular_buffer); @@ -314,7 +317,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long std::cout << "regular read of " << filename << " returned " << regular_buffer.size() << " bytes" << std::endl; - if (static_cast(regular_buffer.size()) != num_bytes) { return false; } + if (static_cast(regular_buffer.size()) != num_bytes) { return false; } return (0 == memcmp(aio_buffer, regular_buffer.data(), regular_buffer.size())); } @@ -322,7 +325,7 @@ static bool _validate_buffer(const char* filename, void* aio_buffer, const long bool validate_aio_operation(const bool read_op, const char* filename, void* aio_buffer, - const long long int num_bytes) + const int64_t num_bytes) { const auto msg_suffix = 
std::string("deepspeed_aio_") + std::string(read_op ? "read()" : "write()") + diff --git a/csrc/aio/common/deepspeed_aio_common.h b/csrc/aio/common/deepspeed_aio_common.h index 2940de945ee8..aa4e49f4f4ed 100644 --- a/csrc/aio/common/deepspeed_aio_common.h +++ b/csrc/aio/common/deepspeed_aio_common.h @@ -35,4 +35,4 @@ int regular_read(const char* filename, std::vector& buffer); bool validate_aio_operation(const bool read_op, const char* filename, void* aio_buffer, - const long long int num_bytes); + const int64_t num_bytes); diff --git a/csrc/aio/common/deepspeed_aio_utils.cpp b/csrc/aio/common/deepspeed_aio_utils.cpp index 763b2c253a34..fb269b58315f 100644 --- a/csrc/aio/common/deepspeed_aio_utils.cpp +++ b/csrc/aio/common/deepspeed_aio_utils.cpp @@ -18,10 +18,15 @@ const int c_block_size = 128 * 1024; const int c_io_queue_depth = 8; io_xfer_ctxt::io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t buffer_offset, + const int64_t num_bytes, const void* buffer) - : _fd(fd), _base_offset(file_offset), _mem_buffer(buffer), _num_bytes(num_bytes) + : _fd(fd), + _file_base_offset(file_offset), + _buffer_base_offset(buffer_offset), + _mem_buffer(buffer), + _num_bytes(num_bytes) { } @@ -36,14 +41,15 @@ io_prep_context::io_prep_context(const bool read_op, void io_prep_context::prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset) + const int64_t start_offset) { assert(static_cast(n_iocbs) <= _iocbs->size()); for (auto i = 0; i < n_iocbs; ++i) { const auto shift = i * _block_size; - const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_base_offset + shift; - const auto xfer_offset = _xfer_ctxt->_base_offset + start_offset + shift; + const auto xfer_buffer = (char*)start_buffer + _xfer_ctxt->_buffer_base_offset + shift; + const auto xfer_offset = _xfer_ctxt->_file_base_offset + start_offset + shift; auto byte_count = _block_size; + if ((shift + _block_size) > num_bytes) { byte_count = num_bytes - shift; } if (_read_op) { @@ -64,25 +70,25 @@ io_prep_generator::io_prep_generator(const bool read_op, _next_iocb_index(0) { _num_io_blocks = - static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); + static_cast(ceil(static_cast(xfer_ctxt->_num_bytes) / block_size)); _remaining_io_blocks = _num_io_blocks; } int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* iocbs) { if ((_remaining_bytes) == 0 || (_remaining_io_blocks == 0)) { - assert(static_cast(_remaining_bytes) == _remaining_io_blocks); + assert(static_cast(_remaining_bytes) == _remaining_io_blocks); return 0; } assert(static_cast(n_iocbs) <= iocbs->size()); - auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); + auto actual_n_iocbs = min(static_cast(n_iocbs), _remaining_io_blocks); for (auto i = 0; i < actual_n_iocbs; ++i, ++_next_iocb_index) { - const auto xfer_offset = _xfer_ctxt->_base_offset + (_next_iocb_index * _block_size); - const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + xfer_offset; - const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); - + const auto xfer_buffer = (char*)_xfer_ctxt->_mem_buffer + _xfer_ctxt->_buffer_base_offset + + (_next_iocb_index * _block_size); + const auto xfer_offset = _xfer_ctxt->_file_base_offset + (_next_iocb_index * _block_size); + const auto num_bytes = min(static_cast(_block_size), _remaining_bytes); if (_read_op) { io_prep_pread(iocbs->at(i), _xfer_ctxt->_fd, xfer_buffer, num_bytes, 
xfer_offset); } else { @@ -95,7 +101,7 @@ int io_prep_generator::prep_iocbs(const int n_iocbs, std::vector* return actual_n_iocbs; } -int get_file_size(const char* filename, long long int& size) +int get_file_size(const char* filename, int64_t& size) { struct stat st; if (stat(filename, &st) == -1) { return -1; } @@ -103,7 +109,7 @@ int get_file_size(const char* filename, long long int& size) return 0; } -void* ds_page_aligned_alloc(const size_t size, const bool lock) +void* ds_page_aligned_alloc(const int64_t size, const bool lock) { void* ptr; int retval; diff --git a/csrc/aio/common/deepspeed_aio_utils.h b/csrc/aio/common/deepspeed_aio_utils.h index 9c58c2286610..6b7599acecb4 100644 --- a/csrc/aio/common/deepspeed_aio_utils.h +++ b/csrc/aio/common/deepspeed_aio_utils.h @@ -30,13 +30,15 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. struct io_xfer_ctxt { const int _fd; - const long long int _base_offset; + const int64_t _file_base_offset; + const int64_t _buffer_base_offset; const void* _mem_buffer; - const long long int _num_bytes; + const int64_t _num_bytes; io_xfer_ctxt(const int fd, - const long long int file_offset, - const long long int num_bytes, + const int64_t file_offset, + const int64_t buffer_offset, + const int64_t num_bytes, const void* buffer); }; @@ -54,7 +56,7 @@ struct io_prep_context { void prep_iocbs(const int n_iocbs, const size_t num_bytes, const void* start_buffer, - const long long int start_offset); + const int64_t start_offset); }; struct io_prep_generator { @@ -62,10 +64,10 @@ struct io_prep_generator { const std::unique_ptr& _xfer_ctxt; const size_t _block_size; - long long int _remaining_bytes; - long long int _num_io_blocks; - long long int _remaining_io_blocks; - long long int _next_iocb_index; + int64_t _remaining_bytes; + int64_t _num_io_blocks; + int64_t _remaining_io_blocks; + int64_t _next_iocb_index; io_prep_generator(const bool read_op, const std::unique_ptr& xfer_ctxt, @@ -74,6 +76,6 @@ struct io_prep_generator { int prep_iocbs(const int n_iocbs, std::vector* iocbs); }; -void* ds_page_aligned_alloc(const size_t size, const bool lock = false); +void* ds_page_aligned_alloc(const int64_t size, const bool lock = false); -int get_file_size(const char* filename, long long int& size); +int get_file_size(const char* filename, int64_t& size); diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp new file mode 100644 index 000000000000..945251397225 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.cpp @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_aio_op_desc.h" + +using namespace std; + +io_op_desc_t::io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset) + : _read_op(read_op), + _buffer(buffer), + _fd(fd), + _filename(filename), + _file_num_bytes(file_num_bytes), + _file_offset(file_offset), + _intra_op_parallelism(intra_op_parallelism), + _num_bytes_per_thread(static_cast(buffer.nbytes()) / intra_op_parallelism), + _validate(validate) +{ +} + +char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void io_op_desc_t::finish() {} + +void io_op_desc_t::validate() {} + +void io_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ +} diff --git a/csrc/aio/py_lib/deepspeed_aio_op_desc.h b/csrc/aio/py_lib/deepspeed_aio_op_desc.h new file mode 100644 index 000000000000..ac1cdf90f78b --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_aio_op_desc.h @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#ifndef _IO_OP_DESC_T_ +#define _IO_OP_DESC_T_ +#include +#include +#include "deepspeed_py_aio.h" + +struct io_op_desc_t { + const bool _read_op; + torch::Tensor _buffer; + int _fd; + const std::string _filename; + const int64_t _file_num_bytes; + const int _intra_op_parallelism; + const int64_t _num_bytes_per_thread; + torch::Tensor _contiguous_buffer; + const bool _validate; + const int64_t _file_offset; + + io_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset); + + virtual void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + virtual char* data_ptr() const; + + virtual void validate(); + + virtual void finish(); +}; +#endif // _IO_OP_DESC_T_ diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.cpp b/csrc/aio/py_lib/deepspeed_aio_thread.cpp index e9c6a8505858..30c3b4914397 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.cpp +++ b/csrc/aio/py_lib/deepspeed_aio_thread.cpp @@ -11,32 +11,6 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. using namespace std; -io_op_desc_t::io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate) - : _read_op(read_op), - _buffer(buffer), - _fd(fd), - _filename(filename), - _num_bytes(num_bytes), - _validate(validate) -{ - _cpu_buffer = (_buffer.is_cuda() || _buffer.is_xpu()) ? 
_buffer.to(torch::kCPU).pin_memory() - : _buffer; - _contiguous_buffer = _cpu_buffer.contiguous(); -} - -char* io_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } - -void io_op_desc_t::fini() -{ - if (_read_op && _buffer.is_cuda()) { _buffer.copy_(_cpu_buffer.to(torch::kCUDA)); } - if (_read_op && _buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } -} - deepspeed_aio_thread_t::deepspeed_aio_thread_t(const int tid, deepspeed_aio_config_t& aio_config) : _tid(tid), _aio_config(aio_config), @@ -63,18 +37,7 @@ void deepspeed_aio_thread_t::run() } if (next_io_op) { - const auto base_offset = next_io_op->_num_bytes * _tid; - - std::unique_ptr xfer_ctxt(new io_xfer_ctxt( - next_io_op->_fd, base_offset, next_io_op->_num_bytes, next_io_op->data_ptr())); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential( - next_io_op->_read_op, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } + next_io_op->run(_tid, _aio_ctxt, &_aio_config); { std::lock_guard lock(_complete_sync._mutex); diff --git a/csrc/aio/py_lib/deepspeed_aio_thread.h b/csrc/aio/py_lib/deepspeed_aio_thread.h index 20799ecbb018..a192804db13d 100644 --- a/csrc/aio/py_lib/deepspeed_aio_thread.h +++ b/csrc/aio/py_lib/deepspeed_aio_thread.h @@ -10,28 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include #include -#include "deepspeed_py_aio.h" - -struct io_op_desc_t { - const bool _read_op; - torch::Tensor _buffer; - int _fd; - const std::string _filename; - const long long int _num_bytes; - torch::Tensor _cpu_buffer; - torch::Tensor _contiguous_buffer; - const bool _validate; - - io_op_desc_t(const bool read_op, - const torch::Tensor& buffer, - const int fd, - const char* filename, - const long long int num_bytes, - const bool validate); - - char* data_ptr() const; - void fini(); -}; +#include "deepspeed_cpu_op.h" struct thread_sync_t { std::mutex _mutex; diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.cpp b/csrc/aio/py_lib/deepspeed_cpu_op.cpp new file mode 100644 index 000000000000..56fb33fb1886 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_cpu_op.cpp @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "deepspeed_cpu_op.h" +#include "deepspeed_pin_tensor.h" + +using namespace std; + +cpu_op_desc_t::cpu_op_desc_t( + const bool read_op, + const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset), + _cpu_buffer(buffer), + _pinned_tensor_mgr(pinned_tensor_mgr), + _is_managed_bounce_buffer(false) +{ + // Need to use CPU bounce buffer if buffer is not a page-locked DRAM memory. 
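+    // Concretely: accelerator-resident tensors (CUDA/XPU/NPU) always take the bounce-buffer
+    // path, and so does pageable CPU memory that is neither torch-pinned nor allocated by
+    // _pinned_tensor_mgr. For write ops the bounce buffer is filled from _buffer below; for
+    // read ops the data is copied back into _buffer in finish().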
+ _use_bounce_buffer = + !(_buffer.is_cpu() && (_buffer.is_pinned() || _pinned_tensor_mgr->is_managed(_buffer))); + if (_use_bounce_buffer) { + _alloc_bounce_buffer(); + if (!_read_op) { _cpu_buffer.copy_(_buffer); } + } + _contiguous_buffer = _cpu_buffer.contiguous(); +} + +char* cpu_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void cpu_op_desc_t::finish() +{ + if (_use_bounce_buffer) { + if (_read_op) { + if (_buffer.is_cuda()) { + _buffer.copy_(_cpu_buffer.to(torch::Device(torch::kCUDA, _buffer.get_device()), + /*non_blocking=*/true)); + } + if (_buffer.is_xpu()) { _buffer.copy_(_cpu_buffer.to(torch::kXPU)); } + if (_buffer.is_cpu()) { _buffer.copy_(_cpu_buffer); } +#if defined(__ENABLE_CANN__) + if (torch_npu::utils::is_npu(_buffer)) { + auto device = at::Device("npu:0"); + _buffer.copy_(_cpu_buffer.to(device)); + } +#endif + } + + _free_bounce_buffer(); + } +} + +void cpu_op_desc_t::validate() +{ + validate_aio_operation(_read_op, _filename.c_str(), data_ptr(), _file_num_bytes); +} + +void cpu_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _intra_op_parallelism); + const auto buffer_base_offset = _num_bytes_per_thread * tid; + const auto file_base_offset = _file_offset + (_num_bytes_per_thread * tid); + + std::unique_ptr xfer_ctxt(new io_xfer_ctxt( + _fd, file_base_offset, buffer_base_offset, _num_bytes_per_thread, data_ptr())); + + if (aio_config->_overlap_events) { + do_aio_operation_overlap(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } else { + do_aio_operation_sequential(_read_op, aio_ctxt, xfer_ctxt, aio_config, nullptr); + } +} + +void cpu_op_desc_t::_alloc_bounce_buffer() +{ + auto options = torch::TensorOptions() + .dtype(_buffer.dtype()) + .layout(_buffer.layout()) + .device(torch::kCPU) + .requires_grad(false); + +#if defined(__CUDA_ARCH__) + _cpu_buffer = torch::empty(_buffer.numel(), options).pin_memory(); +#else + _is_managed_bounce_buffer = true; + _cpu_buffer = _pinned_tensor_mgr->alloc(_buffer.numel(), options); +#endif +} + +void cpu_op_desc_t::_free_bounce_buffer() +{ + if (_is_managed_bounce_buffer) { _pinned_tensor_mgr->free(_cpu_buffer); } +} diff --git a/csrc/aio/py_lib/deepspeed_cpu_op.h b/csrc/aio/py_lib/deepspeed_cpu_op.h new file mode 100644 index 000000000000..debaf4a90731 --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_cpu_op.h @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include "deepspeed_aio_op_desc.h" + +struct cpu_op_desc_t : io_op_desc_t { + torch::Tensor _cpu_buffer; + bool _use_bounce_buffer; + bool _is_managed_bounce_buffer; + const std::unique_ptr& _pinned_tensor_mgr; + + cpu_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const std::unique_ptr& pinned_tensor_mgr, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset); + + void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + char* data_ptr() const; + + void validate(); + + void finish(); + + void _alloc_bounce_buffer(); + void _free_bounce_buffer(); +}; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp index 752823dc7dd2..a97a4ac18ba8 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.cpp +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.cpp @@ -15,21 +15,28 @@ deepspeed_pin_tensor_t::~deepspeed_pin_tensor_t() { for (auto iter = _locked_tensors.begin(); iter != _locked_tensors.end(); ++iter) { munlock(iter->first, iter->second); + std::free((void*)iter->first); } _locked_tensors.clear(); } -torch::Tensor deepspeed_pin_tensor_t::alloc(const size_t num_elem, const at::ScalarType& elem_type) +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, + const torch::TensorOptions& options) { - const auto num_bytes = num_elem * elementSize(elem_type); + const auto scalar_dtype = torch::typeMetaToScalarType(options.dtype()); + const auto num_bytes = num_elem * torch::elementSize(scalar_dtype); auto pinned_buffer = ds_page_aligned_alloc(num_bytes, true); assert(nullptr != pinned_buffer); _locked_tensors[pinned_buffer] = num_bytes; - auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU); + return at::from_blob(pinned_buffer, static_cast(num_elem), options); +} - return at::from_blob(pinned_buffer, static_cast(num_bytes), options); +torch::Tensor deepspeed_pin_tensor_t::alloc(const int64_t num_elem, const at::ScalarType& elem_type) +{ + auto options = torch::TensorOptions().dtype(elem_type).device(torch::kCPU).requires_grad(false); + return alloc(num_elem, options); } bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) @@ -37,9 +44,18 @@ bool deepspeed_pin_tensor_t::free(torch::Tensor& locked_tensor) auto addr = locked_tensor.data_ptr(); if (_locked_tensors.find(addr) != _locked_tensors.end()) { munlock(addr, _locked_tensors[addr]); + std::free(addr); _locked_tensors.erase(addr); return true; } return false; } + +bool deepspeed_pin_tensor_t::is_managed(const torch::Tensor& buffer) +{ + if (!buffer.is_cpu()) { return false; } + auto addr = buffer.data_ptr(); + if (_locked_tensors.find(addr) != _locked_tensors.end()) { return true; } + return false; +}; diff --git a/csrc/aio/py_lib/deepspeed_pin_tensor.h b/csrc/aio/py_lib/deepspeed_pin_tensor.h index 4350a4ac7df6..4b8ad7e76085 100644 --- a/csrc/aio/py_lib/deepspeed_pin_tensor.h +++ b/csrc/aio/py_lib/deepspeed_pin_tensor.h @@ -15,13 +15,16 @@ Functionality for managing CPU tensors occupying page-locked memory. 
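For orientation, the following is a minimal standalone sketch (not part of this patch) of the bookkeeping pattern that deepspeed_pin_tensor_t implements: page-locked allocations are recorded in a pointer-to-size map so that free() knows how many bytes to munlock and release, and so that is_managed() can tell cpu_op_desc_t that a buffer may skip the bounce-buffer path. The sketch assumes a POSIX system, substitutes posix_memalign with 4 KiB alignment for ds_page_aligned_alloc, and returns raw pointers instead of the torch::Tensor wrappers produced by the real class.

#include <sys/mman.h>
#include <cstdint>
#include <cstdlib>
#include <map>

struct pinned_registry_t {
    std::map<void*, int64_t> _locked;

    void* alloc(int64_t num_bytes)
    {
        void* ptr = nullptr;
        if (posix_memalign(&ptr, 4096, num_bytes) != 0) { return nullptr; }
        if (mlock(ptr, num_bytes) != 0) {  // pin pages so they cannot be swapped out
            std::free(ptr);
            return nullptr;
        }
        _locked[ptr] = num_bytes;  // remember the size for later munlock()/free()
        return ptr;
    }

    bool free(void* ptr)
    {
        auto it = _locked.find(ptr);
        if (it == _locked.end()) { return false; }
        munlock(it->first, it->second);
        std::free(it->first);
        _locked.erase(it);
        return true;
    }

    bool is_managed(void* ptr) const { return _locked.count(ptr) != 0; }

    ~pinned_registry_t()
    {
        // Release any allocations the caller never freed explicitly.
        for (auto& kv : _locked) {
            munlock(kv.first, kv.second);
            std::free(kv.first);
        }
        _locked.clear();
    }
};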
#include "deepspeed_py_aio.h" struct deepspeed_pin_tensor_t { - std::map _locked_tensors; + std::map _locked_tensors; deepspeed_pin_tensor_t() = default; ~deepspeed_pin_tensor_t(); - torch::Tensor alloc(const size_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const at::ScalarType& elem_type); + torch::Tensor alloc(const int64_t num_elem, const torch::TensorOptions& options); bool free(torch::Tensor& locked_tensor); + + bool is_managed(const torch::Tensor& buffer); }; diff --git a/csrc/aio/py_lib/deepspeed_py_aio.cpp b/csrc/aio/py_lib/deepspeed_py_aio.cpp index 387b713f2bfc..1ff0397043fa 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio.cpp @@ -4,9 +4,6 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ @@ -54,8 +51,10 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, if (fd == -1) { return -1; } auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); + const auto num_write_bytes = static_cast(buffer.nbytes()); + + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_write_bytes, write_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { @@ -72,9 +71,8 @@ int deepspeed_py_aio_write(const torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } @@ -87,7 +85,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const bool validate) { const auto start_time = std::chrono::high_resolution_clock::now(); - long long num_file_bytes; + int64_t num_file_bytes; if (-1 == get_file_size(filename, num_file_bytes)) { const auto error_code = errno; report_file_error(filename, " fstat for read", error_code); @@ -99,9 +97,10 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, if (fd == -1) { return -1; } auto read_buffer = (char*)buffer.data_ptr(); - assert(static_cast(buffer.nbytes()) == num_file_bytes); + assert(static_cast(buffer.nbytes()) == num_file_bytes); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, 0, 0, num_file_bytes, read_buffer)); std::unique_ptr aio_ctxt(new aio_context(config._block_size, config._queue_depth)); if (config._overlap_events) { @@ -118,8 +117,7 @@ int deepspeed_py_aio_read(torch::Tensor& buffer, const std::chrono::duration fn_time = std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; return 0; } diff --git a/csrc/aio/py_lib/deepspeed_py_aio.h b/csrc/aio/py_lib/deepspeed_py_aio.h index 11d5225de9f1..ba794db5440d 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio.h +++ b/csrc/aio/py_lib/deepspeed_py_aio.h @@ -4,10 +4,7 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft 
DeepSpeed Team -Licensed under the MIT license. - -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +Functionality for swapping tensors to/from (NVMe) storage devices. */ #include diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp index c21e92de9449..2b1093e99286 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.cpp @@ -4,295 +4,25 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ #include "deepspeed_py_aio_handle.h" +#include using namespace std; -static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } - deepspeed_aio_handle_t::deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads) - : _aio_ctxt(new aio_context(block_size, queue_depth)), - _single_submit(single_submit), - _overlap_events(overlap_events), - _num_threads(num_threads), - _aio_config(block_size, queue_depth, single_submit, overlap_events, false), - _num_pending_ops(0), - _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) -{ - for (auto i = 0; i < num_threads; ++i) { - _thread_contexts.push_back(std::make_shared(i, _aio_config)); - } - - for (auto& ctxt : _thread_contexts) { - _threads.push_back(std::thread(_start_aio_thread, ctxt)); - } -} - -deepspeed_aio_handle_t::~deepspeed_aio_handle_t() -{ - _stop_threads(); - for (auto& thr : _threads) { thr.join(); } -} - -const int deepspeed_aio_handle_t::get_block_size() const -{ - return _aio_ctxt ? _aio_ctxt->_block_size : -1; -} - -const int deepspeed_aio_handle_t::get_queue_depth() const -{ - return _aio_ctxt ? 
_aio_ctxt->_queue_depth : -1; -} - -const bool deepspeed_aio_handle_t::get_single_submit() const { return _single_submit; } - -const bool deepspeed_aio_handle_t::get_overlap_events() const { return _overlap_events; } - -const int deepspeed_aio_handle_t::get_thread_count() const { return _num_threads; } - -int deepspeed_aio_handle_t::read(torch::Tensor& buffer, const char* filename, const bool validate) -{ - const auto start_time = std::chrono::high_resolution_clock::now(); - - assert(_aio_ctxt); - - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto read_buffer = (char*)buffer.data_ptr(); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_file_bytes, read_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - - close(fd); - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; -} - -int deepspeed_aio_handle_t::write(const torch::Tensor& buffer, - const char* filename, - const bool validate) + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, + queue_depth, + single_submit, + overlap_events, + intra_op_parallelism) { - assert(_aio_ctxt); - - const auto start_time = std::chrono::high_resolution_clock::now(); - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto write_buffer = (char*)buffer.data_ptr(); - const auto num_write_bytes = static_cast(buffer.nbytes()); - std::unique_ptr xfer_ctxt(new io_xfer_ctxt(fd, 0, num_write_bytes, write_buffer)); - - if (_aio_config._overlap_events) { - do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } else { - do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); - } - const std::chrono::duration aio_time = - std::chrono::high_resolution_clock::now() - start_time; - - close(fd); - - if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } - - const std::chrono::duration fn_time = - std::chrono::high_resolution_clock::now() - start_time; - std::cout << "Elapsed time(usec): " - << "aio = " << aio_time.count() * 1e6 << " call = " << fn_time.count() * 1e6 - << std::endl; - return 0; } -void deepspeed_aio_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) -{ - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_work_queue.push(scheduled_op); - } - ctxt->_work_sync._cond_var.notify_one(); - } - _num_pending_ops++; -} - -std::shared_ptr deepspeed_aio_handle_t::_wait_for_aio_work() -{ - std::shared_ptr completed_op = nullptr; - for (auto& ctxt : _thread_contexts) { - std::unique_lock lock(ctxt->_complete_sync._mutex); - ctxt->_complete_sync._cond_var.wait(lock, - [ctxt] { return !ctxt->_complete_queue.empty(); }); - completed_op = 
ctxt->_complete_queue.front(); - ctxt->_complete_queue.pop(); - } - return completed_op; -} - -void deepspeed_aio_handle_t::_stop_threads() -{ - assert(0 == _num_pending_ops); - for (auto& ctxt : _thread_contexts) { - { - std::lock_guard lock(ctxt->_work_sync._mutex); - ctxt->_time_to_exit = true; - } - ctxt->_work_sync._cond_var.notify_one(); - } -} - -int deepspeed_aio_handle_t::wait() -{ - assert(_num_pending_ops > 0); - auto num_completed_ops = 0; - - while (_num_pending_ops > 0) { - auto completed_op = _wait_for_aio_work(); - - completed_op->fini(); - - close(completed_op->_fd); - - if (completed_op->_validate) { - validate_aio_operation(completed_op->_read_op, - completed_op->_filename.c_str(), - completed_op->data_ptr(), - _num_threads * completed_op->_num_bytes); - } - --_num_pending_ops; - ++num_completed_ops; - } - - return num_completed_ops; -} - -bool deepspeed_aio_handle_t::_is_valid_parallel_aio_op(const bool read_op, - const long long int num_bytes) -{ - const auto op_string = read_op ? "Read" : "Write"; - if (num_bytes % get_thread_count()) { - std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes - << " not divisible by thread count = " << get_thread_count() << std::endl; - return false; - } - - return true; -} - -int deepspeed_aio_handle_t::pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - long long num_file_bytes; - if (-1 == get_file_size(filename, num_file_bytes)) { - const auto error_code = errno; - report_file_error(filename, " fstat for read", error_code); - return -1; - } - const auto buffer_bytes = static_cast(buffer.nbytes()); - if (buffer_bytes != num_file_bytes) { - std::cout << filename << ": buffer nbytes != file bytes " << buffer_bytes - << " != " << num_file_bytes << std::endl; - } - assert(static_cast(buffer.nbytes()) == num_file_bytes); - assert((num_file_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(true, num_file_bytes)) { return -1; } - - const auto fd = open_file(filename, true); - if (fd == -1) { return -1; } - - auto scheduled_op = std::make_shared( - true, buffer, fd, filename, (num_file_bytes / _num_threads), validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async) -{ - const auto num_write_bytes = static_cast(buffer.nbytes()); - assert((num_write_bytes % _num_threads) == 0); - - if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } - - const auto fd = open_file(filename, false); - if (fd == -1) { return -1; } - - auto scheduled_op = std::make_shared( - false, buffer, fd, filename, (num_write_bytes / _num_threads), validate); - - _schedule_aio_work(scheduled_op); - - if (async) { return 0; } - - return wait(); -} - -int deepspeed_aio_handle_t::sync_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::sync_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, false); -} - -int deepspeed_aio_handle_t::async_pread(torch::Tensor& buffer, const char* filename) -{ - return pread(buffer, filename, false, true); -} - -int deepspeed_aio_handle_t::async_pwrite(const torch::Tensor& buffer, const char* filename) -{ - return pwrite(buffer, filename, false, true); -} - -at::Tensor 
deepspeed_aio_handle_t::new_cpu_locked_tensor(const size_t num_elem, - const torch::Tensor& example_tensor) -{ - return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); -} - -bool deepspeed_aio_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor) -{ - return _pinned_tensor_mgr->free(locked_tensor); -} +deepspeed_aio_handle_t::~deepspeed_aio_handle_t() {} diff --git a/csrc/aio/py_lib/deepspeed_py_aio_handle.h b/csrc/aio/py_lib/deepspeed_py_aio_handle.h index 3a254c3814a2..1398df9a56c9 100644 --- a/csrc/aio/py_lib/deepspeed_py_aio_handle.h +++ b/csrc/aio/py_lib/deepspeed_py_aio_handle.h @@ -9,69 +9,14 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include -#include "deepspeed_aio_thread.h" -#include "deepspeed_pin_tensor.h" - -struct deepspeed_aio_handle_t { - std::unique_ptr _aio_ctxt; - const bool _single_submit; - const bool _overlap_events; - const int _num_threads; - deepspeed_aio_config_t _aio_config; - - std::vector> _thread_contexts; - std::vector _threads; - int _num_pending_ops; - std::unique_ptr _pinned_tensor_mgr; +#include "deepspeed_py_io_handle.h" +struct deepspeed_aio_handle_t : deepspeed_io_handle_t { deepspeed_aio_handle_t(const int block_size, const int queue_depth, const bool single_submit, const bool overlap_events, - const int num_threads); + const int intra_op_parallelism); ~deepspeed_aio_handle_t(); - - const int get_block_size() const; - const int get_queue_depth() const; - const bool get_single_submit() const; - const bool get_overlap_events() const; - const int get_thread_count() const; - - int read(torch::Tensor& buffer, const char* filename, const bool validate); - - int write(const torch::Tensor& buffer, const char* filename, const bool validate); - - int pread(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int pwrite(const torch::Tensor& buffer, - const char* filename, - const bool validate, - const bool async); - - int sync_pread(torch::Tensor& buffer, const char* filename); - - int sync_pwrite(const torch::Tensor& buffer, const char* filename); - - int async_pread(torch::Tensor& buffer, const char* filename); - - int async_pwrite(const torch::Tensor& buffer, const char* filename); - - // TODO: Make API's args to be shape and dtype. - torch::Tensor new_cpu_locked_tensor(const size_t num_elem, const torch::Tensor& example_tensor); - - bool free_cpu_locked_tensor(torch::Tensor&); - - int wait(); - - void _stop_threads(); - - void _schedule_aio_work(std::shared_ptr scheduled_op); - - std::shared_ptr _wait_for_aio_work(); - - bool _is_valid_parallel_aio_op(const bool read_op, const long long int num_bytes); }; diff --git a/csrc/aio/py_lib/deepspeed_py_copy.cpp b/csrc/aio/py_lib/deepspeed_py_copy.cpp index 8a59107dd347..f5480e9d9d83 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.cpp +++ b/csrc/aio/py_lib/deepspeed_py_copy.cpp @@ -4,13 +4,13 @@ // DeepSpeed Team /* -Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +Functionality for swapping tensors to/from (NVMe) storage devices. 
*/ #include "deepspeed_py_copy.h" #include -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) or defined(__AVX256__) union AVX_Data { diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h index 19ba28317d00..f443571a3e7b 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.h +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -4,9 +4,6 @@ // DeepSpeed Team /* -Copyright 2020 The Microsoft DeepSpeed Team -Licensed under the MIT license. - Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.cpp b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp new file mode 100644 index 000000000000..64d7c2e0541e --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.cpp @@ -0,0 +1,322 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_io_handle.h" +#include + +using namespace std; + +static void _start_aio_thread(std::shared_ptr ctxt) { ctxt->run(); } + +deepspeed_io_handle_t::deepspeed_io_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int intra_op_parallelism) + : _aio_ctxt(new aio_context(block_size, queue_depth)), + _single_submit(single_submit), + _overlap_events(overlap_events), + _intra_op_parallelism(intra_op_parallelism), + _aio_config(block_size, queue_depth, single_submit, overlap_events, false), + _num_pending_ops(0), + _pinned_tensor_mgr(new deepspeed_pin_tensor_t()) +{ + for (auto i = 0; i < intra_op_parallelism; ++i) { + _thread_contexts.push_back(std::make_shared(i, _aio_config)); + } + + for (auto& ctxt : _thread_contexts) { + _threads.push_back(std::thread(_start_aio_thread, ctxt)); + } +} + +deepspeed_io_handle_t::~deepspeed_io_handle_t() +{ + _stop_threads(); + for (auto& thr : _threads) { thr.join(); } +} + +const int deepspeed_io_handle_t::get_block_size() const +{ + return _aio_ctxt ? _aio_ctxt->_block_size : -1; +} + +const int deepspeed_io_handle_t::get_queue_depth() const +{ + return _aio_ctxt ? 
_aio_ctxt->_queue_depth : -1; +} + +const bool deepspeed_io_handle_t::get_single_submit() const { return _single_submit; } + +const bool deepspeed_io_handle_t::get_overlap_events() const { return _overlap_events; } + +const int deepspeed_io_handle_t::get_intra_op_parallelism() const { return _intra_op_parallelism; } + +int deepspeed_io_handle_t::read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset) +{ + const auto start_time = std::chrono::high_resolution_clock::now(); + + assert(_aio_ctxt); + + int64_t num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + assert(static_cast(buffer.nbytes()) == num_file_bytes); + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto read_buffer = (char*)buffer.data_ptr(); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_file_bytes, read_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(true, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + + close(fd); + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + if (validate) { validate_aio_operation(true, filename, read_buffer, num_file_bytes); } + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; + return 0; +} + +int deepspeed_io_handle_t::write(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset) +{ + assert(_aio_ctxt); + + const auto start_time = std::chrono::high_resolution_clock::now(); + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto write_buffer = (char*)buffer.data_ptr(); + const auto num_write_bytes = static_cast(buffer.nbytes()); + std::unique_ptr xfer_ctxt( + new io_xfer_ctxt(fd, file_offset, 0, num_write_bytes, write_buffer)); + + if (_aio_config._overlap_events) { + do_aio_operation_overlap(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } else { + do_aio_operation_sequential(false, _aio_ctxt, xfer_ctxt, &_aio_config, nullptr); + } + const std::chrono::duration aio_time = + std::chrono::high_resolution_clock::now() - start_time; + + close(fd); + + if (validate) { validate_aio_operation(false, filename, write_buffer, num_write_bytes); } + + const std::chrono::duration fn_time = + std::chrono::high_resolution_clock::now() - start_time; + std::cout << "Elapsed time(usec): " << "aio = " << aio_time.count() * 1e6 + << " call = " << fn_time.count() * 1e6 << std::endl; + return 0; +} + +void deepspeed_io_handle_t::_schedule_aio_work(std::shared_ptr scheduled_op) +{ + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_work_queue.push(scheduled_op); + } + ctxt->_work_sync._cond_var.notify_one(); + } + _num_pending_ops++; +} + +std::shared_ptr deepspeed_io_handle_t::_wait_for_aio_work() +{ + std::shared_ptr completed_op = nullptr; + for (auto& ctxt : _thread_contexts) { + std::unique_lock lock(ctxt->_complete_sync._mutex); + ctxt->_complete_sync._cond_var.wait(lock, + [ctxt] { return !ctxt->_complete_queue.empty(); }); + completed_op = ctxt->_complete_queue.front(); + 
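+        // _schedule_aio_work() pushes each logical op onto every worker thread's queue, so an
+        // op counts as complete only after a completion has been popped from every thread's
+        // completion queue (one pop per context in this loop).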
ctxt->_complete_queue.pop(); + } + return completed_op; +} + +void deepspeed_io_handle_t::_stop_threads() +{ + assert(0 == _num_pending_ops); + for (auto& ctxt : _thread_contexts) { + { + std::lock_guard lock(ctxt->_work_sync._mutex); + ctxt->_time_to_exit = true; + } + ctxt->_work_sync._cond_var.notify_one(); + } +} + +int deepspeed_io_handle_t::wait() +{ + assert(_num_pending_ops > 0); + auto num_completed_ops = 0; + + while (_num_pending_ops > 0) { + auto completed_op = _wait_for_aio_work(); + + if (completed_op->_validate) { completed_op->validate(); } + + completed_op->finish(); + + close(completed_op->_fd); + + --_num_pending_ops; + ++num_completed_ops; + } + + return num_completed_ops; +} + +bool deepspeed_io_handle_t::_is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes) +{ + const auto op_string = read_op ? "Read" : "Write"; + if (num_bytes % get_intra_op_parallelism()) { + std::cout << "deepspeed_aio failure: parallel " << op_string << " num_bytes = " << num_bytes + << " not divisible by thread count = " << get_intra_op_parallelism() << std::endl; + return false; + } + + return true; +} + +std::shared_ptr deepspeed_io_handle_t::_create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const bool validate, + const int64_t file_offset) +{ + return std::make_shared(read_op, + buffer, + _pinned_tensor_mgr, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate, + file_offset); +} + +int deepspeed_io_handle_t::pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async, + const int64_t file_offset) +{ + int64_t num_file_bytes; + if (-1 == get_file_size(filename, num_file_bytes)) { + const auto error_code = errno; + report_file_error(filename, " fstat for read", error_code); + return -1; + } + + // buffer can exceed file size to enable 4k alignment + const auto buffer_bytes = static_cast(buffer.nbytes()); + assert((num_file_bytes % _intra_op_parallelism) == 0); + + if (!_is_valid_parallel_aio_op(true, buffer_bytes)) { return -1; } + + const auto fd = open_file(filename, true); + if (fd == -1) { return -1; } + + auto scheduled_op = + _create_io_op_desc(true, buffer, fd, filename, num_file_bytes, validate, file_offset); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_io_handle_t::pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async, + const int64_t file_offset) +{ + const auto num_write_bytes = static_cast(buffer.nbytes()); + assert((num_write_bytes % _intra_op_parallelism) == 0); + + if (!_is_valid_parallel_aio_op(false, num_write_bytes)) { return -1; } + + const auto fd = open_file(filename, false); + if (fd == -1) { return -1; } + + auto scheduled_op = + _create_io_op_desc(false, buffer, fd, filename, num_write_bytes, validate, file_offset); + + _schedule_aio_work(scheduled_op); + + if (async) { return 0; } + + return wait(); +} + +int deepspeed_io_handle_t::sync_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) +{ + return pread(buffer, filename, false, false, file_offset); +} + +int deepspeed_io_handle_t::sync_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) +{ + return pwrite(buffer, filename, false, false, file_offset); +} + +int deepspeed_io_handle_t::async_pread(torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) +{ + 
return pread(buffer, filename, false, true, file_offset); +} + +int deepspeed_io_handle_t::async_pwrite(const torch::Tensor& buffer, + const char* filename, + const int64_t file_offset) +{ + return pwrite(buffer, filename, false, true, file_offset); +} + +at::Tensor deepspeed_io_handle_t::new_cpu_locked_tensor(const int64_t num_elem, + const torch::Tensor& example_tensor) +{ + return _pinned_tensor_mgr->alloc(num_elem, example_tensor.scalar_type()); +} + +bool deepspeed_io_handle_t::free_cpu_locked_tensor(torch::Tensor& locked_tensor) +{ + return _pinned_tensor_mgr->free(locked_tensor); +} diff --git a/csrc/aio/py_lib/deepspeed_py_io_handle.h b/csrc/aio/py_lib/deepspeed_py_io_handle.h new file mode 100644 index 000000000000..dfcb4125ab9a --- /dev/null +++ b/csrc/aio/py_lib/deepspeed_py_io_handle.h @@ -0,0 +1,94 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include +#include +#include "deepspeed_aio_thread.h" +#include "deepspeed_pin_tensor.h" + +struct deepspeed_io_handle_t { + std::unique_ptr _aio_ctxt; + const bool _single_submit; + const bool _overlap_events; + const int _intra_op_parallelism; + deepspeed_aio_config_t _aio_config; + + std::vector> _thread_contexts; + std::vector _threads; + int _num_pending_ops; + std::unique_ptr _pinned_tensor_mgr; + + deepspeed_io_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int intra_op_parallelism); + + virtual ~deepspeed_io_handle_t() = 0; + + const int get_block_size() const; + const int get_queue_depth() const; + const bool get_single_submit() const; + const bool get_overlap_events() const; + const int get_intra_op_parallelism() const; + + int read(torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); + + int write(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const int64_t file_offset); + + int pread(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async, + const int64_t file_offset); + + int pwrite(const torch::Tensor& buffer, + const char* filename, + const bool validate, + const bool async, + const int64_t file_offset); + + int sync_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); + + int sync_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); + + int async_pread(torch::Tensor& buffer, const char* filename, const int64_t file_offset); + + int async_pwrite(const torch::Tensor& buffer, const char* filename, const int64_t file_offset); + + // TODO: Make API's args to be shape and dtype. 
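+    // Illustrative usage only (hypothetical file path; deepspeed_aio_handle_t is the concrete
+    // subclass bound to Python in py_ds_aio.cpp):
+    //   deepspeed_aio_handle_t h(1 << 20 /*block_size*/, 128 /*queue_depth*/,
+    //                            false /*single_submit*/, false /*overlap_events*/,
+    //                            4 /*intra_op_parallelism*/);
+    //   torch::Tensor buf = h.new_cpu_locked_tensor(1024, torch::empty({1}));
+    //   h.sync_pwrite(buf, "/tmp/ds_swap.bin", 0 /*file_offset*/);
+    //   h.sync_pread(buf, "/tmp/ds_swap.bin", 0);
+    //   h.free_cpu_locked_tensor(buf);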
+ torch::Tensor new_cpu_locked_tensor(const int64_t num_elem, + const torch::Tensor& example_tensor); + + bool free_cpu_locked_tensor(torch::Tensor&); + + int wait(); + + void _stop_threads(); + + void _schedule_aio_work(std::shared_ptr scheduled_op); + + std::shared_ptr _wait_for_aio_work(); + + bool _is_valid_parallel_aio_op(const bool read_op, const int64_t num_bytes); + + virtual std::shared_ptr _create_io_op_desc(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const bool validate, + const int64_t file_offset); +}; diff --git a/csrc/aio/py_lib/py_ds_aio.cpp b/csrc/aio/py_lib/py_ds_aio.cpp old mode 100755 new mode 100644 index 9033549bc0d2..bf298b691b81 --- a/csrc/aio/py_lib/py_ds_aio.cpp +++ b/csrc/aio/py_lib/py_ds_aio.cpp @@ -10,6 +10,7 @@ Functionality for swapping optimizer tensors to/from (NVMe) storage devices. #include #include "deepspeed_py_aio_handle.h" #include "deepspeed_py_copy.h" +using namespace pybind11::literals; PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { @@ -20,27 +21,96 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) m.def("deepspeed_memcpy", &deepspeed_py_memcpy, "DeepSpeed Memory Copy"); py::class_(m, "aio_handle") - .def(py::init()) + .def(py::init(), + "AIO handle constructor", + "block_size"_a = 1024 * 1024, + "queue_depth"_a = 128, + "single_submit"_a = false, + "overlap_events"_a = false, + "intra_op_parallelism"_a = 1) .def("get_block_size", &deepspeed_aio_handle_t::get_block_size) .def("get_queue_depth", &deepspeed_aio_handle_t::get_queue_depth) .def("get_single_submit", &deepspeed_aio_handle_t::get_single_submit) .def("get_overlap_events", &deepspeed_aio_handle_t::get_overlap_events) - .def("get_thread_count", &deepspeed_aio_handle_t::get_thread_count) + .def("get_intra_op_parallelism", &deepspeed_aio_handle_t::get_intra_op_parallelism) - .def("read", &deepspeed_aio_handle_t::read) - .def("write", &deepspeed_aio_handle_t::write) + .def("read", + &deepspeed_aio_handle_t::read, + "Synchronous and non-parallel file read. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "file_offset"_a = 0) - .def("pread", &deepspeed_aio_handle_t::pread) - .def("pwrite", &deepspeed_aio_handle_t::pwrite) + .def("write", + &deepspeed_aio_handle_t::write, + "Synchronous and non-parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "file_offset"_a = 0) - .def("sync_pread", &deepspeed_aio_handle_t::sync_pread) - .def("sync_pwrite", &deepspeed_aio_handle_t::sync_pwrite) - .def("async_pread", &deepspeed_aio_handle_t::async_pread) - .def("async_pwrite", &deepspeed_aio_handle_t::async_pwrite) + .def("pread", + &deepspeed_aio_handle_t::pread, + "Parallel file read with configurable parallelism. Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a, + "file_offset"_a = 0) - .def("new_cpu_locked_tensor", &deepspeed_aio_handle_t::new_cpu_locked_tensor) - .def("free_cpu_locked_tensor", &deepspeed_aio_handle_t::free_cpu_locked_tensor) + .def("pwrite", + &deepspeed_aio_handle_t::pwrite, + "Parallel file write with configurable parallelism. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "validate"_a, + "async"_a, + "file_offset"_a = 0) - .def("wait", &deepspeed_aio_handle_t::wait); + .def("sync_pread", + &deepspeed_aio_handle_t::sync_pread, + "Synchronous parallel file read. 
Returns count of completed read ops", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("sync_pwrite", + &deepspeed_aio_handle_t::sync_pwrite, + "Synchronous parallel file write. Returns count of completed write ops", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("async_pread", + &deepspeed_aio_handle_t::async_pread, + "Asynchronous parallel file read. Returns 0 on success, and " + "following wait() returns count of completed ops.", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("async_pwrite", + &deepspeed_aio_handle_t::async_pwrite, + "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " + "count of completed ops.", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("new_cpu_locked_tensor", + &deepspeed_aio_handle_t::new_cpu_locked_tensor, + "Allocate pinned CPU tensor.", + "num_elem"_a, + "example_tensor"_a) + + .def("free_cpu_locked_tensor", + &deepspeed_aio_handle_t::free_cpu_locked_tensor, + "Free pinned CPU tensor.", + "tensor"_a) + + .def("wait", + &deepspeed_aio_handle_t::wait, + "Wait for (ongoing) asynchronous operations to complete"); } diff --git a/csrc/aio/py_test/aio_bench_generate_param.py b/csrc/aio/py_test/aio_bench_generate_param.py index 09d0e03c7ef6..7a0ab59ed73d 100644 --- a/csrc/aio/py_test/aio_bench_generate_param.py +++ b/csrc/aio/py_test/aio_bench_generate_param.py @@ -41,9 +41,9 @@ def convert_to_param(key): return { "single_submit": "true" if key[0] == "single" else "false", "overlap_events": "true" if key[1] == "overlap" else "false", - "thread_count": int(key[3]), - "queue_depth": int(key[4]), - "block_size": int(key[5]) + "thread_count": int(key[5]), + "queue_depth": int(key[3]), + "block_size": int(key[4]) } diff --git a/csrc/aio/py_test/aio_bench_perf_sweep.py b/csrc/aio/py_test/aio_bench_perf_sweep.py index 7d55f7ded65c..ba95150b11e1 100644 --- a/csrc/aio/py_test/aio_bench_perf_sweep.py +++ b/csrc/aio/py_test/aio_bench_perf_sweep.py @@ -10,75 +10,47 @@ import argparse import json import itertools -import subprocess import shutil -from test_ds_aio_utils import refine_integer_value +from ds_aio_job import Job, run_job from perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ - READ_IO_DIR, WRITE_IO_DIR, READ_LOG_DIR, WRITE_LOG_DIR + READ_LOG_DIR, WRITE_LOG_DIR from deepspeed.ops.op_builder import AsyncIOBuilder OTHER_OPTIONS = '--handle' PERF_SCRIPT = 'test_ds_aio.py' DEFAULT_SWEEP_CONFIG = { - "block_size": ["128K", "256K"], - "queue_depth": [4, 16, 32], - "overlap_events": [True, False], - "io_parallel": [2, 8], - "single_submit": [False] + "block_size": ["128K", "1M"], + "queue_depth": [32, 64, 128], + "sequential_requests": [True, False], + "single_submit": [False], + "io_parallel": [1, 2, 8], } -class Job(object): - - def __init__(self, cmd_line, output_file=None, work_dir=None): - self.cmd_line = cmd_line - self.output_file = output_file - self.work_dir = work_dir - self.output_fd = None - - def cmd(self): - return self.cmd_line - - def get_stdout(self): - return self.output_fd - - def get_stderr(self): - return self.output_fd - - def get_cwd(self): - return self.work_dir - - def open_output_file(self): - if self.output_file is not None: - self.output_fd = open(self.output_file, 'w') - - def close_output_file(self): - if self.output_fd is not None: - self.output_fd.close() - self.output_fd = None - - class SweepConfig(object): def __init__(self, args): - self.nvme_dir = args.nvme_dir - self.io_size = args.io_size + 
self.folder_to_device_mapping = get_ftd_map(args.nvme_dir) self.search_space = get_sweep_config_dict(args.sweep_config) + self.search_space.update(self.folder_to_device_mapping) self.read = not args.no_read self.write = not args.no_write self.flush_cache = not args.no_sudo self.log_dir = args.log_dir - self.loops = args.loops - self.other_options = f'{OTHER_OPTIONS} --loops {args.loops}' + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' + if args.gpu: + self.other_options += ' --gpu' + if args.gds: + self.other_options += ' --use_gds' def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--nvme_dir', + nargs='+', required=True, - type=str, help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') @@ -92,6 +64,10 @@ def parse_arguments(): default="400M", help='Number of I/O bytes to read/write for performance measurements.') + parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + + parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator') + parser.add_argument( '--no_sudo', action='store_true', @@ -118,6 +94,12 @@ def dump_cmd_lines(cmd_lines): print(f'{i}: {cmd}') +def get_ftd_map(nvme_dir_list): + ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)] + ftd_arg = [' '.join(ftd for ftd in ftd_list)] + return {'folder_to_device_mapping': ftd_arg} + + def get_sweep_config_dict(sweep_config_json): if sweep_config_json is None: return DEFAULT_SWEEP_CONFIG @@ -148,16 +130,6 @@ def flatten_options(key, value_list): return cmd_list -def run_job(job): - args = ' '.join(job.cmd()) - print(f'args = {args}') - job.open_output_file() - proc = subprocess.run(args=args, shell=True, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) - job.close_output_file() - assert proc.returncode == 0, \ - f"This command failed: {job.cmd()}" - - def launch_sweep(sweep_jobs, sync_job, flush_cache_job): for perf_job in sweep_jobs: if flush_cache_job is not None: @@ -176,7 +148,12 @@ def create_cmd_tags(cmd_line): if len(fields) == 1: tags[fields[0]] = None elif len(fields) == 2: - tags[fields[0]] = fields[1] + if fields[0] == '--folder_to_device_mapping': + tags[fields[0]] = len(fields[1:]) + else: + tags[fields[0]] = fields[1] + elif len(fields) > 2: + tags[fields[0]] = len(fields[1:]) return tags @@ -184,16 +161,16 @@ def get_log_file(io_op_desc, cmd_line): QUEUE_DEPTH = "--queue_depth" BLOCK_SIZE = "--block_size" SINGLE_SUBMIT = "--single_submit" - OVERLAP_EVENTS = "--overlap_events" - THREAD_COUNT = "--threads" + SEQUENTIAL_REQUESTS = "--sequential_requests" + FTD_MAP = "--folder_to_device_mapping" IO_PARALLEL = "--io_parallel" tag_map = { QUEUE_DEPTH: "d", BLOCK_SIZE: "bs", SINGLE_SUBMIT: "single", - OVERLAP_EVENTS: "overlap", - THREAD_COUNT: "t", + SEQUENTIAL_REQUESTS: "sequential", + FTD_MAP: "ftd", IO_PARALLEL: "p" } @@ -201,14 +178,14 @@ def get_log_file(io_op_desc, cmd_line): QUEUE_DEPTH: 1, BLOCK_SIZE: "1M", SINGLE_SUBMIT: "block", - OVERLAP_EVENTS: "sequential", - THREAD_COUNT: 1, + SEQUENTIAL_REQUESTS: "overlap", + FTD_MAP: 1, IO_PARALLEL: 1 } def get_default_value(tag): value = tag_default[tag] - if tag in [SINGLE_SUBMIT, OVERLAP_EVENTS]: + if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]: return value return f'{tag_map[tag]}{value}' @@ -218,7 +195,7 @@ def 
get_config_value(tag, value): return tag_key return f'{tag_key}{value}' - tag_list = [SINGLE_SUBMIT, OVERLAP_EVENTS, THREAD_COUNT, IO_PARALLEL, QUEUE_DEPTH, BLOCK_SIZE] + tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL] log_tags = [io_op_desc] cmd_tags = create_cmd_tags(cmd_line) for tag in tag_list: @@ -252,40 +229,14 @@ def async_io_setup(): return AsyncIOBuilder().is_compatible() -def get_block_size_and_count(io_bytes): - block_size = 1 - block_count = io_bytes - bytes_in_KB = 1024 - - while block_count % bytes_in_KB == 0: - block_size *= bytes_in_KB - block_count /= bytes_in_KB - - return int(block_size), int(block_count) - - -def create_read_file(sweep_config): - read_folder = os.path.join(sweep_config.nvme_dir, f'{READ_IO_DIR}') - os.makedirs(read_folder, exist_ok=True) - read_file_name = os.path.join(read_folder, f'random_{sweep_config.io_size}B.pt') - block_size, block_count = get_block_size_and_count(refine_integer_value(sweep_config.io_size)) - dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={read_file_name} bs={block_size} count={block_count}']) - print(f'[Start] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') - run_job(dd_job) - print(f'[Done] Create read file of {sweep_config.io_size} bytes by running {dd_job.cmd()} ....') - return read_folder, read_file_name - - def remove_folder(folder): assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found" shutil.rmtree(folder) def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - read_folder, read_file_name = create_read_file(sweep_config) - read_option = f'--read_file {read_file_name}' - read_cmd_lines = [[f'{read_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] - #dump_cmd_lines(read_cmd_lines) + read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + #dump_cmd_lines(cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}') os.makedirs(log_folder, exist_ok=True) @@ -294,15 +245,9 @@ def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) - remove_folder(read_folder) - def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): - write_folder = os.path.join(sweep_config.nvme_dir, f'{WRITE_IO_DIR}') - os.makedirs(write_folder, exist_ok=True) - write_file_name = os.path.join(write_folder, f'random_{sweep_config.io_size}B.pt') - write_option = f'--write_size {sweep_config.io_size} --write_file {write_file_name}' - write_cmd_lines = [[f'{write_option} {sweep_config.other_options}'] + cmd for cmd in cmd_lines] + write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines] #dump_cmd_lines(write_cmd_lines) log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}') @@ -312,8 +257,6 @@ def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines): launch_sweep(sweep_jobs=perf_jobs, sync_job=sync_job, flush_cache_job=flush_cache_job) - remove_folder(write_folder) - def main(): print("Running performance sweep of deepspeed nvme library") diff --git a/csrc/aio/py_test/ds_aio_args.py b/csrc/aio/py_test/ds_aio_args.py new file mode 100644 index 000000000000..346feabe4810 --- /dev/null +++ b/csrc/aio/py_test/ds_aio_args.py @@ -0,0 +1,175 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import argparse +import os +from test_ds_aio_utils import refine_integer_value +from deepspeed.accelerator import get_accelerator + +MAPPING_DELIMITER = ':' + + +def refine_args(args): + if args.io_size and type(args.io_size) == str: + args.io_size = refine_integer_value(args.io_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + return args + + +def _get_mapping_dict(args): + if args.folder is not None: + d = {i: args.folder for i in range(args.multi_process)} + else: + d = {} + for m in args.folder_to_device_mapping: + fields = m.split(MAPPING_DELIMITER) + d[fields[1]] = fields[0] + + return d + + +def _validate_folder_mapping(args): + no_error = True + error_messages = [] + invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m] + if len(invalid_mappings) > 0: + error_messages.append( + f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}') + no_error = False + + folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping] + invalid_folders = [d for d in folder_list if not os.path.exists(d)] + if len(invalid_folders) > 0: + error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}') + no_error = False + + if args.gpu: + device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping] + invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()] + if len(invalid_device_list) > 0: + error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}') + no_error = False + + return no_error, error_messages + + +def validate_args(args): + no_error = True + error_messages = [] + + if args.folder is not None and len(args.folder_to_device_mapping) > 0: + error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + no_error = False + elif args.folder is None and len(args.folder_to_device_mapping) == 0: + error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + no_error = False + + # Validate --folder + if args.folder is not None and not os.path.exists(args.folder): + no_error = False + error_messages.append(f'Invalid folder in --folder: {args.folder} ') + + # Validate --folder_mapping_to_device + if len(args.folder_to_device_mapping) > 0: + no_mapping_error, mapping_error_messages = _validate_folder_mapping(args) + no_error = no_error and no_mapping_error + error_messages += mapping_error_messages + + # Validate --gpu, --use_gds + if args.use_gds and not args.gpu: + error_messages.append(f'--gpu must be set to transfer with --use_gds') + no_error = False + + if not no_error: + print(f'Found {len(error_messages)} validation errors') + for i, msg in enumerate(error_messages): + print(f'{i+1}: {msg}') + + return no_error + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.') + + parser.add_argument('--folder_to_device_mapping', + default=[], + nargs='+', + help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' + 'Can be specified multiple times for multi-process runs,' + 'e.g. 
--folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu' + 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') + + parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') + + parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)') + + parser.add_argument('--multi_process', + type=int, + default=1, + help='Number of parallel processes doing I/O (default 1).') + + parser.add_argument('--block_size', + type=str, + default='1M', + help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).') + + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') + + parser.add_argument( + '--sequential_requests', + action='store_true', + help= + 'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).' + ) + + parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions') + + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO') + + parser.add_argument('--slow_bounce_buffer', + action='store_true', + help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def get_validated_args(): + args = parse_arguments() + args = refine_args(args) + if not validate_args(args): + quit() + print(f'Successful validation of command line arguments') + + peer_tag = 'gpu' if args.gpu else 'process' + args.mapping_dict = _get_mapping_dict(args) + args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()] + assert len(args.mapping_dict) == len(args.mapping_list) + print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping') + for i, (device_id, folder) in enumerate(args.mapping_list): + print(f'[{i}]: {peer_tag} {device_id} <----> {folder}') + + return args diff --git a/csrc/aio/py_test/ds_aio_basic.py b/csrc/aio/py_test/ds_aio_basic.py index ad2a4349cd0c..9b3c7cbfc49f 100755 --- a/csrc/aio/py_test/ds_aio_basic.py +++ b/csrc/aio/py_test/ds_aio_basic.py @@ -9,10 +9,9 @@ import torch import os import time +from deepspeed.ops.aio import AsyncIOBuilder from multiprocessing import Pool, Barrier from test_ds_aio_utils import report_results, task_log, task_barrier -from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import AsyncIOBuilder def pre_basic(args, tid, read_op): @@ -21,7 +20,7 @@ def pre_basic(args, tid, read_op): file = args.read_file if read_op else f'{args.write_file}.{tid}' task_log(tid, f'Allocate tensor of size {num_bytes} bytes') - buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device 
{buffer.device}') ctxt = {} @@ -56,7 +55,7 @@ def main_basic_read(pool_params): args, tid, ctxt = pool_params start_time = time.time() AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, - args.single_submit, args.overlap_events, args.validate) + args.single_submit, not args.sequential_requests, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -67,7 +66,7 @@ def main_basic_write(pool_params): args, tid, ctxt = pool_params start_time = time.time() AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, - args.single_submit, args.overlap_events, args.validate) + args.single_submit, not args.sequential_requests, args.validate) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -90,16 +89,17 @@ def get_schedule(args, read_op): def _aio_handle_tasklet(pool_params): args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) # Create schedule schedule = get_schedule(args, read_op) task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run pre task task_log(tid, f'running pre-task') ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run main tasks in a loop ctxt["main_task_sec"] = 0 @@ -107,14 +107,14 @@ def _aio_handle_tasklet(pool_params): task_log(tid, f'running main task {i}') start_time = time.time() ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) stop_time = time.time() ctxt["main_task_sec"] += stop_time - start_time # Run post task task_log(tid, f'running post-task') ctxt = schedule["post"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops @@ -125,9 +125,10 @@ def _init_tasklet(b): def aio_basic_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_handle.py b/csrc/aio/py_test/ds_aio_handle.py index d35b2713edae..6913e9090bf5 100755 --- a/csrc/aio/py_test/ds_aio_handle.py +++ b/csrc/aio/py_test/ds_aio_handle.py @@ -10,40 +10,56 @@ import os import time from multiprocessing import Pool, Barrier -from test_ds_aio_utils import report_results, task_log, task_barrier +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import AsyncIOBuilder + +BUFFER = 'buffer' +BOUNCE_BUFFER = 'bounce_buffer' def pre_handle(args, tid, read_op): io_string = "Read" if read_op else "Write" - num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size - file = args.read_file if read_op else f'{args.write_file}.{tid}' - - io_parallel = args.io_parallel if 
args.io_parallel else 1 - handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, - args.overlap_events, io_parallel) - task_log(tid, f'Created deepspeed aio handle') - + gds = True if args.use_gds else False + device_id, folder = args.mapping_list[tid] + filename = create_filename(folder, args.read, args.io_size, tid) + if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): + create_file(filename, args.io_size) + + task_log(tid, f'Allocate tensor of size {args.io_size} bytes') + bounce_buffer = None if args.gpu: - buffer = torch.empty(num_bytes, dtype=torch.uint8, device=get_accelerator().device_name()) + device_name = get_accelerator().device_name(device_id) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) + if not (args.slow_bounce_buffer or gds): + bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, + device='cpu').pin_memory() else: - if args.use_accelerator_pin_memory: - buffer = get_accelerator().pin_memory(torch.empty(num_bytes, dtype=torch.uint8, device='cpu')) - else: - buffer = handle.new_cpu_locked_tensor(num_bytes, torch.empty(0, dtype=torch.uint8)) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, + f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', + force=True) - task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + io_parallel = args.io_parallel if args.io_parallel else 1 + if gds: + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + handle.pin_device_tensor(buffer) + else: + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + task_log(tid, f'created deepspeed aio handle') ctxt = {} - ctxt['file'] = file - ctxt['num_bytes'] = num_bytes + ctxt['file'] = filename + ctxt['num_bytes'] = args.io_size ctxt['handle'] = handle - ctxt['buffer'] = buffer + ctxt['gds'] = gds + ctxt[BUFFER] = buffer + ctxt[BOUNCE_BUFFER] = bounce_buffer ctxt['elapsed_sec'] = 0 - task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') - return ctxt @@ -61,8 +77,12 @@ def pre_handle_write(pool_params): def post_handle(pool_params): _, _, ctxt = pool_params - ctxt["buffer"].detach() - ctxt["buffer"] = None + for buf in [BUFFER, BOUNCE_BUFFER]: + if ctxt[buf] is not None: + if ctxt['gds']: + ctxt['handle'].unpin_device_tensor(ctxt[buf]) + ctxt[buf].detach() + ctxt[buf] = None return ctxt @@ -71,20 +91,31 @@ def main_parallel_read(pool_params): handle = ctxt['handle'] start_time = time.time() - ret = handle.pread(ctxt['buffer'], ctxt['file'], args.validate, True) + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, 0, True) assert ret != -1 handle.wait() + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time - return ctxt def main_parallel_write(pool_params): args, tid, ctxt = pool_params + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + handle = ctxt['handle'] start_time = time.time() - ret = 
handle.pwrite(ctxt['buffer'], ctxt['file'], args.validate, True) + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True) assert ret != -1 handle.wait() end_time = time.time() @@ -98,8 +129,11 @@ def main_handle_read(pool_parms): handle = ctxt['handle'] start_time = time.time() - ret = handle.read(ctxt['buffer'], ctxt['file'], args.validate) + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate) assert ret != -1 + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -108,9 +142,18 @@ def main_handle_read(pool_parms): def main_handle_write(pool_parms): args, tid, ctxt = pool_parms + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + handle = ctxt['handle'] start_time = time.time() - ret = handle.write(ctxt['buffer'], ctxt['file'], args.validate) + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate) assert ret != -1 end_time = time.time() ctxt['elapsed_sec'] += end_time - start_time @@ -123,27 +166,28 @@ def get_schedule(args, read_op): if read_op: schedule['pre'] = pre_handle_read schedule['post'] = post_handle - schedule['main'] = main_parallel_read if args.io_parallel else main_handle_read + schedule['main'] = main_parallel_read else: schedule['pre'] = pre_handle_write schedule['post'] = post_handle - schedule['main'] = main_parallel_write if args.io_parallel else main_handle_write + schedule['main'] = main_parallel_write return schedule def _aio_handle_tasklet(pool_params): args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) # Create schedule schedule = get_schedule(args, read_op) task_log(tid, f'schedule = {schedule}') - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run pre task task_log(tid, f'running pre-task') ctxt = schedule["pre"]((args, tid)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) # Run main tasks in a loop ctxt["main_task_sec"] = 0 @@ -151,14 +195,14 @@ def _aio_handle_tasklet(pool_params): task_log(tid, f'running main task {i}') start_time = time.time() ctxt = schedule["main"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) stop_time = time.time() ctxt["main_task_sec"] += stop_time - start_time # Run post task task_log(tid, f'running post-task') ctxt = schedule["post"]((args, tid, ctxt)) - task_barrier(aio_barrier, args.threads) + task_barrier(aio_barrier, num_processes) return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops @@ -169,9 +213,10 @@ def _init_tasklet(b): def aio_handle_multiprocessing(args, read_op): - b = Barrier(args.threads) - pool_params = [(args, p, read_op) for p in range(args.threads)] - with Pool(processes=args.threads, initializer=_init_tasklet, initargs=(b, )) as p: + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, 
initializer=_init_tasklet, initargs=(b, )) as p: pool_results = p.map(_aio_handle_tasklet, pool_params) report_results(args, read_op, pool_results) diff --git a/csrc/aio/py_test/ds_aio_job.py b/csrc/aio/py_test/ds_aio_job.py new file mode 100644 index 000000000000..e9579a48fe4d --- /dev/null +++ b/csrc/aio/py_test/ds_aio_job.py @@ -0,0 +1,49 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import subprocess +import shlex + + +class Job(object): + + def __init__(self, cmd_line, output_file=None, work_dir=None): + self.cmd_line = cmd_line + self.output_file = output_file + self.work_dir = work_dir + self.output_fd = None + + def cmd(self): + return self.cmd_line + + def get_stdout(self): + return self.output_fd + + def get_stderr(self): + return self.output_fd + + def get_cwd(self): + return self.work_dir + + def open_output_file(self): + if self.output_file is not None: + self.output_fd = open(self.output_file, 'w') + + def close_output_file(self): + if self.output_fd is not None: + self.output_fd.close() + self.output_fd = None + + +def run_job(job): + args = shlex.split(' '.join(job.cmd())) + print(f'args = {args}') + job.open_output_file() + proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) + job.close_output_file() + assert proc.returncode == 0, \ + f"This command failed: {job.cmd()}" diff --git a/csrc/aio/py_test/run_read_sweep.sh b/csrc/aio/py_test/run_read_sweep.sh index b9d7e050454a..59d82996a0e2 100755 --- a/csrc/aio/py_test/run_read_sweep.sh +++ b/csrc/aio/py_test/run_read_sweep.sh @@ -1,13 +1,22 @@ #!/bin/bash -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " +if [[ $# -lt 2 ]]; then + echo "Usage: $0 " exit 1 fi +function prep_folder() +{ + folder=$1 + if [[ -d ${folder} ]]; then + rm -f ${folder}/* + else + mkdir -p ${folder} + fi +} function validate_environment() { - validate_cmd="python ./validate_async_io.py" + validate_cmd="TORCH_EXTENSIONS_DIR=./torch_extentions python3 ./validate_async_io.py" eval ${validate_cmd} res=$? if [[ $res != 0 ]]; then @@ -17,18 +26,27 @@ function validate_environment() fi } +function fileExists() { + local file="$1" + if [[ -f "$file" ]]; then + return 0 + else + return 1 + fi +} validate_environment -INPUT_FILE=$1 -if [[ ! 
-f ${INPUT_FILE} ]]; then - echo "Input file not found: ${INPUT_FILE}" - exit 1 -fi - -LOG_DIR=$2/aio_perf_sweep +IO_SIZE=$1 +LOG_DIR=./aio_perf_sweep +MAP_DIR=$2/aio +GPU_MEM=$3 +USE_GDS=$4 RUN_SCRIPT=./test_ds_aio.py -READ_OPT="--read_file ${INPUT_FILE}" +READ_OPT="--read" + +prep_folder ${MAP_DIR} +prep_folder ${LOG_DIR} if [[ -d ${LOG_DIR} ]]; then rm -f ${LOG_DIR}/* @@ -36,37 +54,60 @@ else mkdir -p ${LOG_DIR} fi -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " -SYNC="sync" +if [[ ${GPU_MEM} == "gpu" ]]; then + gpu_opt="--gpu" +else + gpu_opt="" +fi +if [[ ${USE_GDS} == "gds" ]]; then + gds_opt="--use_gds" +else + gds_opt="" +fi + +DISABLE_CACHE="sudo sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +SYNC="sudo sync" -for sub in single block; do - if [[ $sub == "single" ]]; then - sub_opt="--single_submit" +for xtype in cpu gpu gds; do + if [[ $xtype == "cpu" ]]; then + gpu_opt="" + gds_opt="" + elif [[ $xtype == "gpu" ]]; then + gpu_opt="--gpu" + gds_opt="" else - sub_opt="" + gpu_opt="--gpu" + gds_opt="--use_gds" fi - for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" + for sub in single block; do + if [[ $sub == "single" ]]; then + sub_opt="--single_submit" else - ov_opt="" + sub_opt="" fi - for t in 1 2 4 8; do - for p in 1 ; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" - LOG="${LOG_DIR}/read_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - echo ${DISABLE_CACHE} - echo ${cmd} - echo ${SYNC} + for ov in overlap sequential; do + if [[ $ov == "sequential" ]]; then + ov_opt="--sequential_requests" + else + ov_opt="" + fi + for p in 1 2 4 8; do + for t in 1 2 4 8; do + for d in 8 16 32 64 128; do + for bs in 128K 256K 512K 1M 2M 4M 8M 16M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder_to_device_mapping /mnt/nvme01:0" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --io_parallel ${t}" + LOG="${LOG_DIR}/read_${xtype}_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" + cmd="/usr/bin/time python ${RUN_SCRIPT} ${READ_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" - eval ${DISABLE_CACHE} - eval ${cmd} - eval ${SYNC} - sleep 2 + echo ${DISABLE_CACHE} + echo ${cmd} + echo ${SYNC} + eval ${DISABLE_CACHE} + eval ${cmd} + eval ${SYNC} + sleep 2 + done done done done diff --git a/csrc/aio/py_test/run_write_sweep.sh b/csrc/aio/py_test/run_write_sweep.sh index 99f2113dda6f..a54d1c8d7bed 100755 --- a/csrc/aio/py_test/run_write_sweep.sh +++ b/csrc/aio/py_test/run_write_sweep.sh @@ -25,25 +25,33 @@ function validate_environment() validate_environment -if [[ $# -ne 3 ]]; then - echo "Usage: $0 " - exit 1 -fi - -SIZE="$1M" -WRITE_DIR=$2 -LOG_DIR=$3/aio_perf_sweep +IO_SIZE=$1 +LOG_DIR=$2/aio_perf_sweep +MAP_DIR=$2/aio +GPU_MEM=$3 +USE_GDS=$4 +RUN_SCRIPT=./test_ds_aio.py -OUTPUT_FILE=${WRITE_DIR}/ds_aio_write_${SIZE}B.pt -WRITE_OPT="--write_file ${OUTPUT_FILE} --write_size ${SIZE}" +OUTPUT_FILE=${MAP_DIR}/ds_aio_write_${SIZE}B.pt +WRITE_OPT="" -prep_folder ${WRITE_DIR} +prep_folder ${MAP_DIR} prep_folder ${LOG_DIR} -RUN_SCRIPT=./test_ds_aio.py -DISABLE_CACHE="sync; sudo bash -c 'echo 1 > /proc/sys/vm/drop_caches' " +if [[ ${GPU_MEM} == "gpu" ]]; then + gpu_opt="--gpu" +else + gpu_opt="" +fi +if [[ ${USE_GDS} == "gds" ]]; then + gds_opt="--use_gds" +else + gds_opt="" +fi + 
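+# Example invocation (illustrative only; the mount point and size are placeholders):
+#   bash run_write_sweep.sh 400M /mnt/nvme0 gpu gds
+# where $1 is the I/O size, $2 is the folder root (logs in $2/aio_perf_sweep, files in $2/aio),
+# $3 is "gpu" to use device buffers, and $4 is "gds" to sweep with GPUDirect Storage.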
+DISABLE_CACHE="sync; bash -c 'echo 1 > /proc/sys/vm/drop_caches' " SYNC="sync" for sub in single block; do @@ -53,19 +61,19 @@ for sub in single block; do sub_opt="" fi for ov in overlap sequential; do - if [[ $ov == "overlap" ]]; then - ov_opt="--overlap_events" + if [[ $ov == "sequential" ]]; then + ov_opt="--sequential_requests" else ov_opt="" fi - for t in 1 2 4 8; do - for p in 1; do - for d in 1 2 4 8 16 32; do - for bs in 128K 256K 512K 1M; do - SCHED_OPTS="${sub_opt} ${ov_opt} --handle --threads ${t}" - OPTS="--io_parallel ${p} --queue_depth ${d} --block_size ${bs}" + for p in 1 2 4 8; do + for t in 1 2 4 8; do + for d in 32 64 128; do + for bs in 256K 512K 1M; do + SCHED_OPTS="${sub_opt} ${ov_opt} --handle ${gpu_opt} ${gds_opt} --folder ${MAP_DIR}" + OPTS="--queue_depth ${d} --block_size ${bs} --io_size ${IO_SIZE} --multi_process ${p} --io_parallel ${t}" LOG="${LOG_DIR}/write_${sub}_${ov}_t${t}_p${p}_d${d}_bs${bs}.txt" - cmd="python ${RUN_SCRIPT} ${WRITE_OPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" + cmd="python ${RUN_SCRIPT} ${OPTS} ${SCHED_OPTS} &> ${LOG}" echo ${DISABLE_CACHE} echo ${cmd} echo ${SYNC} diff --git a/csrc/aio/py_test/test_ds_aio.py b/csrc/aio/py_test/test_ds_aio.py index e6242cb35789..6de72755e9e5 100755 --- a/csrc/aio/py_test/test_ds_aio.py +++ b/csrc/aio/py_test/test_ds_aio.py @@ -6,79 +6,19 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ -import os -import argparse import multiprocessing as mp from ds_aio_basic import aio_basic_multiprocessing from ds_aio_handle import aio_handle_multiprocessing -from test_ds_aio_utils import refine_args - - -def parse_arguments(): - parser = argparse.ArgumentParser() - - parser.add_argument('--read_file', type=str, default=None, help='Read file.') - - parser.add_argument('--write_file', type=str, default=None, help='Write file.') - - parser.add_argument('--write_size', type=str, default=None, help='Number of bytes to write.') - - parser.add_argument('--block_size', type=str, default='1M', help='I/O block size.') - - parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth.') - - parser.add_argument('--threads', type=int, default=1, help='Thread parallelism count.') - - parser.add_argument('--single_submit', - action='store_true', - help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') - - parser.add_argument('--overlap_events', - action='store_true', - help='Overlap I/O submission and completion requests.') - - parser.add_argument('--validate', action='store_true', help='Perform validation in library.') - - parser.add_argument('--handle', action='store_true', help='Use AIO handle.') - - parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') - - parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') - - parser.add_argument('--gpu', action='store_true', help='Use GPU memory') - - parser.add_argument('--use_accelerator_pin_memory', - action='store_true', - help='Obtain pinned (CPU page-locked) tensors from accelerator') - - args = parser.parse_args() - print(f'args = {args}') - return args - - -def validate_args(args): - if args.read_file and not os.path.isfile(args.read_file): - print(f'args validation error: {args.read_file} not found') - return False - - return True +from ds_aio_args import get_validated_args def main(): print(f'Testing deepspeed_aio python frontend') - args = parse_arguments() - refine_args(args) - if not validate_args(args): - quit() - + args = 
get_validated_args() mp.set_start_method('spawn') multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing - if args.read_file: - multiprocess_function(args, True) - - if args.write_file: - multiprocess_function(args, False) + multiprocess_function(args, args.read) if __name__ == "__main__": diff --git a/csrc/aio/py_test/test_ds_aio_utils.py b/csrc/aio/py_test/test_ds_aio_utils.py index 6aad114c0bdc..968ff4a60ef9 100755 --- a/csrc/aio/py_test/test_ds_aio_utils.py +++ b/csrc/aio/py_test/test_ds_aio_utils.py @@ -6,12 +6,17 @@ Functionality of swapping optimizer tensors to/from (NVMe) storage devices. """ +import os +from ds_aio_job import Job, run_job + BYTES_PER_GB = 1024**3 +BYTES_PER_MB = 1024**2 +BYTES_PER_KB = 1024 LOG_TIDS = [0] -def task_log(tid, msg): - if tid in LOG_TIDS: +def task_log(tid, msg, force=False): + if force or tid in LOG_TIDS: print(f'tid {tid}: {msg}') @@ -31,16 +36,29 @@ def report_results(args, read_op, pool_results): total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) task_latency_sec = max([sec for _, sec, _ in pool_results]) - task_speed_GB = total_bytes / task_latency_sec / BYTES_PER_GB + task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB print(f'Task {io_string} Latency = {task_latency_sec} sec') print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') e2e_latency_sec = max([sec for sec, _, _ in pool_results]) - e2e_speed_GB = total_bytes / e2e_latency_sec / BYTES_PER_GB + e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') +def get_block_size_and_count(io_bytes): + if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0: + block_size = BYTES_PER_MB + block_size_string = '1M' + else: + assert io_bytes % BYTES_PER_KB == 0 + block_size = BYTES_PER_KB + block_size_string = '1K' + block_count = io_bytes / block_size + + return block_size_string, int(block_count) + + def refine_integer_value(value): unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} @@ -50,9 +68,14 @@ def refine_integer_value(value): return int(value) -def refine_args(args): - if args.write_size and type(args.write_size) == str: - args.write_size = refine_integer_value(args.write_size) +def create_filename(folder, read_op, size, tid): + io_string = "read" if read_op else "write" + return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}') + - if args.block_size and type(args.block_size) == str: - args.block_size = refine_integer_value(args.block_size) +def create_file(filename, num_bytes): + block_size, block_count = get_block_size_and_count(num_bytes) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}']) + print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....') + run_job(dd_job) + print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....') diff --git a/csrc/aio/py_test/validate_async_io.py b/csrc/aio/py_test/validate_async_io.py index 019ec05d49d3..10fb638347bc 100644 --- a/csrc/aio/py_test/validate_async_io.py +++ b/csrc/aio/py_test/validate_async_io.py @@ -7,3 +7,4 @@ """ from deepspeed.ops.op_builder import AsyncIOBuilder assert AsyncIOBuilder().is_compatible() +assert AsyncIOBuilder().load() diff --git a/csrc/common/custom_cuda_kernel.cu b/csrc/common/custom_cuda_kernel.cu deleted file mode 100644 index f46bf303125c..000000000000 --- 
a/csrc/common/custom_cuda_kernel.cu +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 - -// DeepSpeed Team - -#include "custom_cuda_layers.h" - -__global__ void param_update_kernel(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - - if (id < size) { output[id] = (__half)input[id]; } -} - -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel<<>>(input, output, size); -} - -__global__ void param_update_kernel_half(const float* input, __half* output, int size) -{ - int id = blockIdx.x * blockDim.x + threadIdx.x; - __half2* output_cast = reinterpret_cast<__half2*>(output); - if (id < size) { - float input_f = input[id]; - __half2* input_h = reinterpret_cast<__half2*>(&input_f); - output_cast[id] = *input_h; - } -} - -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream) -{ - int threads = 1024; - size /= 2; - dim3 grid_dim((size - 1) / threads + 1); - dim3 block_dim(threads); - - param_update_kernel_half<<>>(input, output, size); -} diff --git a/csrc/cpu/comm/ccl.cpp b/csrc/cpu/comm/ccl.cpp index 9a04b6f873f2..d28509e59266 100644 --- a/csrc/cpu/comm/ccl.cpp +++ b/csrc/cpu/comm/ccl.cpp @@ -5,281 +5,24 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include "shm.h" -// states for collectives -enum coll_state { - coll_begin = 0, - // coll states for naive allreduce - coll_allreduce_naive__copy_in_done, // this state is for rank != 0 - coll_allreduce_naive__reduce_done, // this state is for rank == 0 - coll_allreduce_naive__copy_out_done, // this state is for rank != 0 -}; - -// SHM building blocks -struct SharedData { - const char* name; - int descriptor; - void* bytes; - size_t nbytes; -}; - -void shared_open(SharedData* data, const char* name, size_t nbytes) -{ - int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); - if (d != -1) { - void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0); - data->name = name; - data->descriptor = d; - data->bytes = bytes; - data->nbytes = nbytes; - } else { - printf("shared_open %s failed\n", name); - data->descriptor = -1; - } -} - -void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) -{ - int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); - if (d != -1) { - if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } - } else { - printf("shared_create %s failed\n", name); - } -} - -void shared_close(SharedData* data) -{ - if (data->descriptor != -1) { - munmap(data->bytes, data->nbytes); - shm_unlink(data->name); - } -} - -// SHM based allreduce helper functions -// buffer that holds shm name -#define NAME_BUF_SIZE 1000 -#define MAX_BUF_SIZE 1048576 -#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" -SharedData allreduce_buffer; -struct allreduce_workspace { - enum coll_state state; - char buffer[MAX_BUF_SIZE]; -}; -struct allreduce_workspace* workspace; - -void wait_buffer_state_until(int index, enum coll_state state) -{ - volatile enum coll_state* state_ptr = &(workspace[index].state); - - while (*state_ptr != state) - ; -} - -void wait_buffer_state_until_not(int index, enum coll_state state) -{ - volatile enum coll_state* state_ptr = &(workspace[index].state); - - while (*state_ptr == state) - ; -} 
- -__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); -inline __m512 cvt_bf16_to_fp32(const __m256i src) -{ - auto y = _mm512_cvtepu16_epi32(src); - return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); -} - -inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); -inline __m256i cvt_fp32_to_bf16(const __m512 src) -{ - __m512i value = _mm512_castps_si512(src); - __m512i nan = _mm512_set1_epi32(0xffff); - auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); - __m512i ones = _mm512_set1_epi32(0x1); - __m512i vec_bias = _mm512_set1_epi32(0x7fff); - // uint32_t lsb = (input >> 16) & 1; - auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); - // uint32_t rounding_bias = 0x7fff + lsb; - t_value = _mm512_add_epi32(t_value, vec_bias); - // input += rounding_bias; - t_value = _mm512_add_epi32(t_value, value); - // input = input >> 16; - t_value = _mm512_srli_epi32(t_value, 16); - // Check NaN before converting back to bf16 - t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); - return _mm512_cvtusepi32_epi16(t_value); -} - -void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in) - __attribute__((target("avx512bw"))); - -void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) - __attribute__((target("avx512bw"))); - -void reduce_2_fp32_buffers(int num_elements, void* in_out, void* in) - __attribute__((target("avx512bw"))); - -void reduce_fp32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) - __attribute__((target("avx512bw"))); - -// N_REDUCE_LIMIT is the number of buffers that can be reduced together in one shot. -// Compared with do N-1 2-reduces which needs 2*(N-1) read and N-1 write, -// N-reduce only needs N read and 1 write, this saves 2/3 memory bandwidth. -// When increase N_REDUCE_LIMIT to a bigger number, do the following steps -// 1. Extend REPEAT_ macros list down below -// 2. 
Extend switch cases which call "REPEAT(X, ...)" down below -#define N_REDUCE_LIMIT 8 - -void reduce_all_buffers(struct allreduce_workspace* workspace, - int num_elements, - c10::ScalarType scalar_type, - int num_buffers) -{ - switch (scalar_type) { - case c10::ScalarType::BFloat16: - if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { - reduce_bf16_buffers(num_elements, num_buffers, workspace); - } else { - for (int i = 1; i < num_buffers; i++) { - reduce_2_bf16_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); - } - } - break; - case c10::ScalarType::Float: - if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) { - reduce_fp32_buffers(num_elements, num_buffers, workspace); - } else { - for (int i = 1; i < num_buffers; i++) { - reduce_2_fp32_buffers(num_elements, workspace[0].buffer, workspace[i].buffer); - } - } - break; - default: assert(!"Should not get here"); - } -} - -#define REPEAT(N, x) REPEAT_##N(x) -#define REPEAT_1(x) x(1) -#define REPEAT_2(x) \ - REPEAT_1(x); \ - x(2) -#define REPEAT_3(x) \ - REPEAT_2(x); \ - x(3) -#define REPEAT_4(x) \ - REPEAT_3(x); \ - x(4) -#define REPEAT_5(x) \ - REPEAT_4(x); \ - x(5) -#define REPEAT_6(x) \ - REPEAT_5(x); \ - x(6) -#define REPEAT_7(x) \ - REPEAT_6(x); \ - x(7) - -#define CVT_ADD_BF16(x) \ - do { \ - auto in##x##_val = \ - cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x].buffer + i))); \ - inout_val = _mm512_add_ps(inout_val, in##x##_val); \ - } while (0) - -// Reduce functions down below use vectorized algorithm, the number of bytes processed each -// iteration depends on vector length. 256bit vector ==> 32 bytes, 512bit vector ==> 64 bytes -// If you change implementation of reduce_2_bf16_buffers or reduce_2_fp32_buffers, check -// whether this number needs to be changed -#define VECTOR_LENGTH_IN_BYTES 32 - -// num_elements must be divisible by 16 (caller check) -void reduce_bf16_buffers(int num_elements, int num_buffers, struct allreduce_workspace* workspace) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 2; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0].buffer + i))); - switch (num_buffers) { - case 8: REPEAT(7, CVT_ADD_BF16); break; - case 7: REPEAT(6, CVT_ADD_BF16); break; - case 6: REPEAT(5, CVT_ADD_BF16); break; - case 5: REPEAT(4, CVT_ADD_BF16); break; - case 4: REPEAT(3, CVT_ADD_BF16); break; - case 3: REPEAT(2, CVT_ADD_BF16); break; - default: assert(!"Should not get here."); - } - _mm256_storeu_si256((__m256i*)(workspace[0].buffer + i), cvt_fp32_to_bf16(inout_val)); - } -} +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif -void reduce_2_bf16_buffers(int num_elements, void* in_out, void* in1) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 2; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in_out + i))); - auto in1_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)((char*)in1 + i))); - inout_val = _mm512_add_ps(inout_val, in1_val); - _mm256_storeu_si256((__m256i*)((char*)in_out + i), cvt_fp32_to_bf16(inout_val)); - } -} - -#define CVT_ADD_F32(x) \ - do { \ - auto in##x##_val = _mm256_loadu_ps((float*)(workspace[x].buffer + i)); \ - inout_val = _mm256_add_ps(inout_val, in##x##_val); \ - } while (0) +// Communication settings +static int world_rank = -1; +static int world_size = -1; -// num_elements must be divisible by 16 (caller check) -void reduce_fp32_buffers(int num_elements, int num_buffers, struct allreduce_workspace* 
workspace) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 4; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = _mm256_loadu_ps((float*)(workspace[0].buffer + i)); - switch (num_buffers) { - case 8: REPEAT(7, CVT_ADD_F32); break; - case 7: REPEAT(6, CVT_ADD_F32); break; - case 6: REPEAT(5, CVT_ADD_F32); break; - case 5: REPEAT(4, CVT_ADD_F32); break; - case 4: REPEAT(3, CVT_ADD_F32); break; - case 3: REPEAT(2, CVT_ADD_F32); break; - default: assert(!"Should not get here."); - } - _mm256_storeu_ps((float*)(workspace[0].buffer + i), inout_val); - } -} - -void reduce_2_fp32_buffers(int num_elements, void* in_out, void* in1) -{ -#pragma omp parallel for - for (int i = 0; i < num_elements * 4; i += VECTOR_LENGTH_IN_BYTES) { - auto inout_val = _mm256_loadu_ps((float*)((char*)in_out + i)); - auto in1_val = _mm256_loadu_ps((float*)((char*)in1 + i)); - inout_val = _mm256_add_ps(inout_val, in1_val); - _mm256_storeu_ps((float*)((char*)in_out + i), inout_val); - } -} - -// Communicatiooon settings -int world_rank = -1; -int world_size = -1; - -std::set _comm_ids; -std::set _colors; -std::vector _ccl_comms; -ccl::shared_ptr_class sub_kvs; -std::map, int> group_to_comm_id; +static std::set _comm_ids; +static std::set _colors; +static std::vector _ccl_comms; +static ccl::shared_ptr_class sub_kvs; +static std::map, int> group_to_comm_id; ccl::communicator& _get_comm_from_group() { return _ccl_comms[0]; } ccl::communicator& _get_comm_from_group(py::object group) { return _ccl_comms[0]; } @@ -300,11 +43,11 @@ ccl::communicator& _get_comm_from_group(std::vector ranks) #define KVS_CREATE_SUCCESS 0 #define KVS_CREATE_FAILURE -1 -bool is_initialized = 0; +static bool is_initialized = 0; -ccl::shared_ptr_class kvs; +static ccl::shared_ptr_class kvs; -bool all_ranks_local_p = false; +static bool all_ranks_local_p = false; void initialize(int size, int rank, torch::Tensor& kvs_data) { @@ -336,30 +79,8 @@ void initialize(int size, int rank, torch::Tensor& kvs_data) if (addr_string == NULL) { addr_string = ""; } auto port_string = std::getenv("MASTER_PORT"); if (port_string == NULL) { port_string = ""; } - char shm_name[NAME_BUF_SIZE]; - snprintf(shm_name, - NAME_BUF_SIZE, - "%s_%d_%s_%s", - SHM_BUFFER_NAME, - getuid(), - addr_string, - port_string); - // create shared workspace for SHM based allreduce - if (all_ranks_local_p) { - if (rank == 0) { - workspace = - (struct allreduce_workspace*)malloc(size * sizeof(struct allreduce_workspace)); - shared_create( - &allreduce_buffer, shm_name, workspace, size * sizeof(struct allreduce_workspace)); - workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; - for (int i = 0; i < size; i++) { workspace[i].state = coll_begin; } - } - CCLCHECK(ccl::barrier(_get_comm_from_group()).wait()); - if (rank != 0) { - shared_open(&allreduce_buffer, shm_name, size * sizeof(struct allreduce_workspace)); - } - workspace = (struct allreduce_workspace*)allreduce_buffer.bytes; - } + + if (all_ranks_local_p) { shm_initialize(size, rank, addr_string, port_string); } } /* @@ -526,19 +247,22 @@ void all_reduce_caching(torch::Tensor& data, .wait()); } -static void parallel_memcpy(void* to, void* from, size_t n_bytes) - __attribute__((target("avx512bw"))); -static void parallel_memcpy(void* to, void* from, size_t n_bytes) +void inference_all_reduce(torch::Tensor& data, py::object op) { -#pragma omp parallel for - for (int i = 0; i < n_bytes; i += VECTOR_LENGTH_IN_BYTES) { - auto val = _mm256_loadu_si256((__m256i*)((char*)from + i)); - 
_mm256_storeu_si256((__m256i*)((char*)to + i), val); - } -} +#ifdef DO_PROFILE + static double total_time = 0.0; + static double total_time_sq = 0.0; + static int count = -16; // warmup + static double max_time = 0.0; + static double min_time = DBL_MAX; + // make sure all rank reach this point before measuring time + // turn on this if you suspect each rank didn't reach here at the same time (stragger) + // if (all_ranks_local_p) { + // barrier_wait(0, world_size); + //} + auto start = std::chrono::system_clock::now(); +#endif -void inference_all_reduce(torch::Tensor& data, py::object op, std::vector group, bool async_op) -{ static py::object ReduceOp = py::module_::import("deepspeed.comm").attr("ReduceOp"); static auto ReduceOpSum = (int)py::int_(ReduceOp.attr("SUM").attr("value")); @@ -555,59 +279,40 @@ void inference_all_reduce(torch::Tensor& data, py::object op, std::vector g default: data_type_fallback = true; } - if (data_type_fallback || (data_size % VECTOR_LENGTH_IN_BYTES) != 0 || !all_ranks_local_p) { + if (data_type_fallback || !all_ranks_local_p) { // fallback to oneccl allreduce CCLCHECK(ccl::allreduce(data.data_ptr(), data.data_ptr(), data.numel(), get_ccl_datatype(data.scalar_type()), get_ccl_reduce_op(op, data), - _get_comm_from_group(group)) + _get_comm_from_group()) .wait()); - return; + } else { + all_reduce_outer_loop(data, numel, data_size); } - for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) { - auto data_ptr = ((char*)(data.data_ptr()) + offset); - size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? MAX_BUF_SIZE : data_size - offset; - size_t chunk_el = chunk_size / (data_size / numel); - - parallel_memcpy(workspace[world_rank].buffer, data_ptr, chunk_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_in_done; - - if (world_rank == 0) { - // compute allreduce result on rank 0 - for (int i = 1; i < world_size; i++) { - // wait until the other rank copy the buffer - wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done); - } - reduce_all_buffers(workspace, chunk_el, data.scalar_type(), world_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__reduce_done; - parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); - } - if (world_rank != 0) { - wait_buffer_state_until(0, coll_allreduce_naive__reduce_done); - parallel_memcpy(data_ptr, workspace[0].buffer, chunk_size); - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_allreduce_naive__copy_out_done; - } - if (world_rank == 0) { - for (int i = 1; i < world_size; i++) { - wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done); - } - std::atomic_thread_fence(std::memory_order_release); - workspace[world_rank].state = coll_begin; - } - if (world_rank != 0) { - // if rank 0 spin too fast it could be in state 1 of next allreduce - // in this case wait_buffer_state_until(0, 0) may cause deadlock - // what we are certain is when rank 0 finishes the state won't be 2 - wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done); - workspace[world_rank].state = coll_begin; +#ifdef DO_PROFILE + auto end = std::chrono::system_clock::now(); + count++; + if (count > 0) { + double elapsed = std::chrono::duration_cast(end - start).count(); + if (elapsed > max_time) { max_time = elapsed; } + if (elapsed < min_time) { min_time = elapsed; } + total_time += elapsed; + total_time_sq += elapsed * elapsed; + if (world_rank == 0 
&& count == 1000) { + auto avg = total_time / count; + auto sd = + sqrt(total_time_sq / count - total_time * total_time / (count * count)) / avg * 100; + printf(" C++ kernel\t\t %.2f\t %.2f\t%.2f\t %.2f\n", + min_time, + max_time, + total_time / count, + sd); } } +#endif } void barrier(std::vector group, bool async_op) diff --git a/csrc/cpu/comm/shm.cpp b/csrc/cpu/comm/shm.cpp new file mode 100644 index 000000000000..be44681ca062 --- /dev/null +++ b/csrc/cpu/comm/shm.cpp @@ -0,0 +1,692 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include +#include +#include +#include +#include +#include "shm.h" + +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif + +// states for collectives +enum coll_state { + coll_begin = 0, + coll_allreduce_naive__copy_in_done, + coll_allreduce_naive__reduce_done, + // alternative state when allreduce is working on alternative buffer + // of the double buffer. + coll_alt1_allreduce_naive__copy_in_done, + coll_alt2_allreduce_naive__copy_in_done, + coll_alt1_allreduce_naive__reduce_done, +}; + +// SHM building blocks +struct SharedData { + const char* name; + int descriptor; + void* bytes; + size_t nbytes; +}; + +void shared_open(SharedData* data, const char* name, size_t nbytes) +{ + int d = shm_open(name, O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + void* bytes = mmap(NULL, nbytes, PROT_READ | PROT_WRITE, MAP_SHARED, d, 0); + data->name = name; + data->descriptor = d; + data->bytes = bytes; + data->nbytes = nbytes; + } else { + if (errno != ENOENT) { + // don't print if shm can not be found because we want to loop over from + // caller again until the other ranks created the shm + printf("shared_open %s failed, errno=%d\n", name, errno); + } + data->descriptor = -1; + } +} + +void shared_create(SharedData* data, const char* name, void* bytes, size_t nbytes) +{ + int d = shm_open(name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR); + if (d != -1) { + if (nbytes = write(d, bytes, nbytes)) { shared_open(data, name, nbytes); } + } else { + printf("shared_create %s failed\n", name); + } +} + +void shared_close(SharedData* data) +{ + if (data->descriptor != -1) { + munmap(data->bytes, data->nbytes); + shm_unlink(data->name); + } +} + +static int world_size; + +// SHM based allreduce helper functions +// buffer that holds shm name +#define NAME_BUF_SIZE 1000 +#define MAX_BUF_SIZE 1048576 * 32 +#define NAIVE_ALLREDUCE_THRESHOLD 1048576 +#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer" +struct allreduce_workspace { + enum coll_state states[2]; // idx=0 -- state for symmetric_naive_all_reduce + // idx=1 -- state for distributed_naive_all_reduce + // double buffer to avoid syncing between rounds + // offset=0 -- 2*NAIVE_ALLREDUCE_THRESHOLD : buffer for symmetric_naive_all_reduce + // after that : buffer for distributed_naive_all_reduce + char buffer[2 * NAIVE_ALLREDUCE_THRESHOLD + 2 * MAX_BUF_SIZE]; +}; + +#define BUFFER0_OFFSET(current_buffer) current_buffer* NAIVE_ALLREDUCE_THRESHOLD +#define BUFFER1_OFFSET(current_buffer) 2 * NAIVE_ALLREDUCE_THRESHOLD + current_buffer* MAX_BUF_SIZE + +struct allreduce_workspace** workspace; + +// buffer for small messages, double buffer +char** symmetric_buffer[2]; +// buffer for large messages, double buffer +char** distributed_buffer[2]; + +void wait_buffer_state_until_2(int index, + enum coll_state state0, + enum coll_state state1, + int state_group) +{ + volatile enum coll_state* state_ptr = &(workspace[index]->states[state_group]); + + while (1) { 
+ volatile enum coll_state cur_state = *state_ptr; + if (cur_state == state0 || cur_state == state1) break; + } +} + +__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); +inline __m512 cvt_bf16_to_fp32(const __m256i src) +{ + auto y = _mm512_cvtepu16_epi32(src); + return _mm512_castsi512_ps(_mm512_bslli_epi128(y, 2)); +} + +inline __m256i cvt_fp32_to_bf16(const __m512 src) __attribute__((target("avx512bw"))); +inline __m256i cvt_fp32_to_bf16(const __m512 src) +{ + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +} + +__m512 cvt_fp16_to_fp32(const __m256i src) __attribute__((target("avx512bw"))); +inline __m512 cvt_fp16_to_fp32(const __m256i src) { return _mm512_cvtph_ps(src); } + +inline __m256i cvt_fp32_to_fp16(const __m512 src) __attribute__((target("avx512bw"))); +inline __m256i cvt_fp32_to_fp16(const __m512 src) +{ + return _mm512_cvtps_ph(src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + +void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) + __attribute__((target("avx512bw"))); + +void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) + __attribute__((target("avx512bw"))); + +void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) + __attribute__((target("avx512bw"))); + +void reduce_all_buffers(int start_elements, + int num_elements, + c10::ScalarType scalar_type, + int to_buffer_idx, + char* to_buffer, + char** buffers) +{ + switch (scalar_type) { + case c10::ScalarType::BFloat16: + reduce_bf16_buffers(start_elements, num_elements, to_buffer, buffers); + break; + case c10::ScalarType::Half: + reduce_fp16_buffers(start_elements, num_elements, to_buffer, buffers); + break; + case c10::ScalarType::Float: + reduce_fp32_buffers(start_elements, num_elements, to_buffer, buffers); + break; + default: assert(!"Should not get here"); + } +} + +#define CVT_ADD_BF16(x) \ + do { \ + auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[x] + i))); \ + inout_val = _mm512_add_ps(inout_val, in##x##_val); \ + } while (0) + +// Reduce functions down below use vectorized algorithm, the number of bytes processed each +// iteration depends on vector length. 256bit vector ==> 32 bytes, 512bit vector ==> 64 bytes +// If you change implementation of reduce_bf16_buffers, etc. 
, check whether this number needs +// to be changed +#define VECTOR_LENGTH_IN_BYTES 32 + +void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) +{ + const int element_size = 2; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size; + i += VECTOR_LENGTH_IN_BYTES) { + auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[0] + i))); + switch (world_size) { + case 16: CVT_ADD_BF16(15); + case 15: CVT_ADD_BF16(14); + case 14: CVT_ADD_BF16(13); + case 13: CVT_ADD_BF16(12); + case 12: CVT_ADD_BF16(11); + case 11: CVT_ADD_BF16(10); + case 10: CVT_ADD_BF16(9); + case 9: CVT_ADD_BF16(8); + case 8: CVT_ADD_BF16(7); + case 7: CVT_ADD_BF16(6); + case 6: CVT_ADD_BF16(5); + case 5: CVT_ADD_BF16(4); + case 4: CVT_ADD_BF16(3); + case 3: CVT_ADD_BF16(2); + case 2: CVT_ADD_BF16(1); + case 1: break; + default: + for (int j = 1; j < world_size; j++) { + auto in_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[j] + i))); + inout_val = _mm512_add_ps(inout_val, in_val); + } + } + _mm256_storeu_si256((__m256i*)(to_buffer + i), cvt_fp32_to_bf16(inout_val)); + } + + // process remaining part + int i = (start_elements + main_elements) * element_size; + while (remain_elements > 0) { + float val = 0.0f; + for (int j = 0; j < world_size; j++) { val += *(at::BFloat16*)(buffers[j] + i); } + *(at::BFloat16*)(to_buffer + i) = val; + remain_elements--; + i += element_size; + } +} + +#define CVT_ADD_FP16(x) \ + do { \ + auto in##x##_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[x] + i))); \ + inout_val = _mm512_add_ps(inout_val, in##x##_val); \ + } while (0) + +void reduce_fp16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) +{ + const int element_size = 2; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size; + i += VECTOR_LENGTH_IN_BYTES) { + auto inout_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[0] + i))); + switch (world_size) { + case 16: CVT_ADD_FP16(15); + case 15: CVT_ADD_FP16(14); + case 14: CVT_ADD_FP16(13); + case 13: CVT_ADD_FP16(12); + case 12: CVT_ADD_FP16(11); + case 11: CVT_ADD_FP16(10); + case 10: CVT_ADD_FP16(9); + case 9: CVT_ADD_FP16(8); + case 8: CVT_ADD_FP16(7); + case 7: CVT_ADD_FP16(6); + case 6: CVT_ADD_FP16(5); + case 5: CVT_ADD_FP16(4); + case 4: CVT_ADD_FP16(3); + case 3: CVT_ADD_FP16(2); + case 2: CVT_ADD_FP16(1); + case 1: break; + default: + for (int j = 1; j < world_size; j++) { + auto in_val = cvt_fp16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[j] + i))); + inout_val = _mm512_add_ps(inout_val, in_val); + } + } + _mm256_storeu_si256((__m256i*)(to_buffer + i), cvt_fp32_to_fp16(inout_val)); + } + + // process remaining part + int i = (start_elements + main_elements) * element_size; + while (remain_elements > 0) { + float val = 0.0f; + for (int j = 0; j < world_size; j++) { val += *(at::Half*)(buffers[j] + i); } + *(at::Half*)(to_buffer + i) = val; + remain_elements--; + i += element_size; + } +} 
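The bf16/fp16/fp32 reduce kernels above all follow the same pattern: load the lane from buffers[0], then let a switch on world_size fall through so that exactly world_size - 1 CVT_ADD_* additions are emitted, with a plain loop as the default for rank counts beyond the unrolled cases. A minimal scalar sketch of that control flow (the function name and the 4-rank cap here are illustrative only, not part of the patch):

static float reduce_one_element(const float** buffers, int world_size, int i)
{
    // Mirrors the switch in reduce_bf16_buffers/reduce_fp16_buffers/reduce_fp32_buffers:
    // each case adds one peer buffer and intentionally falls through to the next.
    float acc = buffers[0][i];
    switch (world_size) {
        case 4: acc += buffers[3][i];  // fall through
        case 3: acc += buffers[2][i];  // fall through
        case 2: acc += buffers[1][i];  // fall through
        case 1: break;
        default:  // beyond the unrolled cases, fall back to a loop over peers
            for (int j = 1; j < world_size; j++) { acc += buffers[j][i]; }
    }
    return acc;
}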
+ +#define CVT_ADD_F32(x) \ + do { \ + auto in##x##_val = _mm256_loadu_ps((float*)(buffers[x] + i)); \ + inout_val = _mm256_add_ps(inout_val, in##x##_val); \ + } while (0) + +void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers) +{ + const int element_size = 4; + const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size; + int main_elements = num_elements - (num_elements % vector_length); + int remain_elements = num_elements % vector_length; + + // process aligned part +#pragma omp parallel for + for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size; + i += VECTOR_LENGTH_IN_BYTES) { + auto inout_val = _mm256_loadu_ps((float*)(buffers[0] + i)); + switch (world_size) { + case 16: CVT_ADD_F32(15); + case 15: CVT_ADD_F32(14); + case 14: CVT_ADD_F32(13); + case 13: CVT_ADD_F32(12); + case 12: CVT_ADD_F32(11); + case 11: CVT_ADD_F32(10); + case 10: CVT_ADD_F32(9); + case 9: CVT_ADD_F32(8); + case 8: CVT_ADD_F32(7); + case 7: CVT_ADD_F32(6); + case 6: CVT_ADD_F32(5); + case 5: CVT_ADD_F32(4); + case 4: CVT_ADD_F32(3); + case 3: CVT_ADD_F32(2); + case 2: CVT_ADD_F32(1); + case 1: break; + default: + for (int j = 1; j < world_size; j++) { + auto in_val = _mm256_loadu_ps((float*)(buffers[j] + i)); + inout_val = _mm256_add_ps(inout_val, in_val); + } + } + _mm256_storeu_ps((float*)(to_buffer + i), inout_val); + } + + // process remaining part + int i = (start_elements + main_elements) * element_size; + while (remain_elements > 0) { + float val = 0.0f; + for (int j = 0; j < world_size; j++) { val += *(float*)(buffers[j] + i); } + *(float*)(to_buffer + i) = val; + remain_elements--; + i += element_size; + } +} + +static bool is_initialized = 0; +static int world_rank; + +void shm_initialize(int size, int rank, char* addr_string, char* port_string) +{ + if (is_initialized) return; + is_initialized = 1; + + world_size = size; + world_rank = rank; + + char shm_name_prefix[NAME_BUF_SIZE]; + char shm_name[NAME_BUF_SIZE]; + snprintf(shm_name_prefix, + NAME_BUF_SIZE, + "%s_%d_%s_%s", + SHM_BUFFER_NAME, + getuid(), + addr_string, + port_string); + // create shared workspace for SHM based allreduce + SharedData allreduce_buffer; + // allocate workspace_buf for current rank + struct allreduce_workspace* workspace_buf; + struct allreduce_workspace* workspace_buf_other; + workspace_buf = (struct allreduce_workspace*)malloc(sizeof(struct allreduce_workspace)); + snprintf(shm_name, NAME_BUF_SIZE, "%s_%d", shm_name_prefix, rank); + shared_create(&allreduce_buffer, shm_name, workspace_buf, sizeof(struct allreduce_workspace)); + workspace_buf = (struct allreduce_workspace*)allreduce_buffer.bytes; + workspace_buf->states[0] = coll_alt2_allreduce_naive__copy_in_done; + workspace_buf->states[1] = coll_begin; + + // create the workspace pointer list + workspace = (struct allreduce_workspace**)malloc(size * sizeof(struct allreduce_workspace*)); + symmetric_buffer[0] = (char**)malloc(size * sizeof(char**)); + symmetric_buffer[1] = (char**)malloc(size * sizeof(char**)); + distributed_buffer[0] = (char**)malloc(size * sizeof(char**)); + distributed_buffer[1] = (char**)malloc(size * sizeof(char**)); + + // map shm of all ranks + for (int i = 0; i < size; i++) { + if (i != rank) { + snprintf(shm_name, NAME_BUF_SIZE, "%s_%d", shm_name_prefix, i); + // printf("open %s, %d\n", shm_name, rank); + do { + shared_open(&allreduce_buffer, shm_name, sizeof(struct allreduce_workspace)); + } while (allreduce_buffer.descriptor == -1 && errno == ENOENT); 
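// The ENOENT retry above is what makes startup order-independent: each rank creates
// only its own <prefix>_<rank> segment, and spins here until rank i has created its
// segment (shared_open stays silent for ENOENT precisely so this loop can poll).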
+ workspace_buf_other = (struct allreduce_workspace*)allreduce_buffer.bytes; + workspace[i] = workspace_buf_other; + } else { + workspace[i] = workspace_buf; + } + symmetric_buffer[0][i] = workspace[i]->buffer + BUFFER0_OFFSET(0); + symmetric_buffer[1][i] = workspace[i]->buffer + BUFFER0_OFFSET(1); + distributed_buffer[0][i] = workspace[i]->buffer + BUFFER1_OFFSET(0); + distributed_buffer[1][i] = workspace[i]->buffer + BUFFER1_OFFSET(1); + } +} + +static void parallel_memcpy(void* to, void* from, size_t n_bytes) + __attribute__((target("avx512bw"))); +static void parallel_memcpy(void* to, void* from, size_t n_bytes) +{ + auto aligned_bytes = n_bytes - (n_bytes % VECTOR_LENGTH_IN_BYTES); + // process aligned part +#pragma omp parallel for + for (int i = 0; i < aligned_bytes; i += VECTOR_LENGTH_IN_BYTES) { + auto val = _mm256_loadu_si256((__m256i*)((char*)from + i)); + _mm256_storeu_si256((__m256i*)((char*)to + i), val); + } + + // process remaining part + for (int i = aligned_bytes; i < n_bytes; i++) { *((char*)to + i) = *((char*)from + i); } +} + +#define positive_mod(num, mod) ((((num) % (mod)) + (mod)) % (mod)) +#define rank_mod(rank) positive_mod(rank, world_size) +size_t slice_size(size_t chunk_el, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + return slice_idx == world_size - 1 ? slice_size + (chunk_el % world_size) : slice_size; +} + +char* slice_data(char* data_ptr, size_t chunk_el, int el_size, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + size_t el_offset = slice_size * slice_idx; + return data_ptr + el_offset * el_size; +} + +size_t slice_el_start(size_t chunk_el, int slice_idx) +{ + size_t slice_size = chunk_el / world_size; + return slice_size * slice_idx; +} + +/* + Symmetrical naive all_reduce + step 0: before enter the function ith times, state is copy(i-1) + step 1: each rank copy data from input (data_ptr) to SHM buffer[i] + step 2: set own state to copy(i) + step 3: wait each other rank's state equal or later than copy(i) + step 4: reduce across SHM buffer(ith) directly into output (data_ptr) +*/ +void symmetric_naive_all_reduce(char* data_ptr, + c10::ScalarType scalar_type, + size_t chunk_size, + size_t chunk_el) +{ +#ifdef DO_PROFILE + static double total_t1_t0 = 0.0; + static double total_t2_t1 = 0.0; + static double total_t3_t2 = 0.0; + static int count = -16; // warmup + auto t0 = std::chrono::system_clock::now(); +#endif + + /* + We can't have infinite number of buffers and states. 2 sets of buffer + and 3 sets of states is just enough. Consider current rank is in step 3, + with it's own state set to copy(i), the other rank will them have the + following situations: + ------------------------------------------------ + my state | can I proceed? | the other rank state + ================================================ + | N | copy(i-1) + |----------------|--------------------- + copy(i) | Y | copy(i) + |----------------|--------------------- + | Y | copy(i+1) + ------------------------------------------------ + * When I have state as copy(i), the other rank cannot have state + copy(i-2) or before. In that case I'll be in state copy(i-1) and cannot + proceed to copy(i). + * The other rank cannot have state copy(i+2) or beyond because my + state is still copy(i), copy(i+1) is as far as the other rank could go. 
+       * From a rank's POV, all the other ranks can be divided into three sets:
+         - Lagging ranks: ranks that are still working on the previous iteration
+         - Syncing ranks: ranks that are working on the current iteration
+         - Leading ranks: ranks that are working on the next iteration
+       * We can have 3 sets of states, one set for syncing ranks; one set for
+         lagging ranks; one set for leading ranks. With 3 sets of states, we can
+         distinguish between lagging and leading ranks.
+       * Note that from any rank's POV, leading ranks and lagging ranks do not
+         appear at the same time. Either all other ranks are syncing or
+         lagging, or all other ranks are syncing or leading. Otherwise leading
+         and lagging ranks would be 2 iterations apart and this should not happen.
+       * So we have 2 sets of buffers, one buffer is used by the current iter;
+         one buffer is used by either lagging ranks or leading ranks.
+     */
+    const int state_group = 0;
+    static int current_buffer = 0;
+    static int state_idx = 0;
+
+    enum coll_state copy_current, copy_next;
+
+    switch (state_idx) {
+        case 0:
+            copy_current = coll_allreduce_naive__copy_in_done;
+            copy_next = coll_alt1_allreduce_naive__copy_in_done;
+            break;
+        case 1:
+            copy_current = coll_alt1_allreduce_naive__copy_in_done;
+            copy_next = coll_alt2_allreduce_naive__copy_in_done;
+            break;
+        case 2:
+            copy_current = coll_alt2_allreduce_naive__copy_in_done;
+            copy_next = coll_allreduce_naive__copy_in_done;
+            break;
+        default: assert(!"Should not get here.");
+    }
+    state_idx = (state_idx + 1) % 3;
+
+    parallel_memcpy(symmetric_buffer[current_buffer][world_rank], data_ptr, chunk_size);
+    std::atomic_thread_fence(std::memory_order_release);
+    workspace[world_rank]->states[state_group] = copy_current;
+
+#ifdef DO_PROFILE
+    auto t1 = std::chrono::system_clock::now();
+#endif
+
+    for (int i = 0; i < world_size; i++) {
+        // wait until rank i has copied its buffer
+        if (i != world_rank) { wait_buffer_state_until_2(i, copy_current, copy_next, state_group); }
+    }
+#ifdef DO_PROFILE
+    auto t2 = std::chrono::system_clock::now();
+#endif
+
+    // each rank reduces the buffer independently, so there is no need for synchronization afterward
+    reduce_all_buffers(
+        0, chunk_el, scalar_type, world_rank, data_ptr, symmetric_buffer[current_buffer]);
+
+    // switch buffer
+    current_buffer = 1 - current_buffer;
+
+#ifdef DO_PROFILE
+    auto t3 = std::chrono::system_clock::now();
+
+    count++;
+    if (count > 0) {
+        total_t1_t0 += std::chrono::duration_cast(t1 - t0).count();
+        total_t2_t1 += std::chrono::duration_cast(t2 - t1).count();
+        total_t3_t2 += std::chrono::duration_cast(t3 - t2).count();
+        if (world_rank == 0 && count == 1000) {
+            printf("symmetric_naive_all_reduce time breakdown:\n");
+            printf("\tcopy input buffer: %.2f\n", total_t1_t0 / count);
+            printf("\twait for copy: %.2f\n", total_t2_t1 / count);
+            printf("\treduce: %.2f\n", total_t3_t2 / count);
+        }
+    }
+#endif
+}
+
+// naive distributed allreduce: each rank does a naive reduce on its own slice
+void distributed_naive_reduce(char* data_ptr,
+                              c10::ScalarType scalar_type,
+                              size_t chunk_size,
+                              size_t chunk_el)
+{
+#ifdef DO_PROFILE
+    static double total_t1_t0 = 0.0;
+    static double total_t2_t1 = 0.0;
+    static double total_t3_t2 = 0.0;
+    static double total_t4_t3 = 0.0;
+    static double total_t5_t4 = 0.0;
+    static int count = -16;  // warmup
+    auto t0 = std::chrono::system_clock::now();
+#endif
+
+    const int state_group = 1;
+    static int current_buffer = 0;
+    static int state_idx = 0;
+
+    enum coll_state copy_current, copy_next, reduce_current;
+
+    // similar to
symmetric_naive_allreduce, but here we only need two sets of + // states, because distributed naive reduce has two barriers in the algorithm + switch (state_idx) { + case 0: + copy_current = coll_allreduce_naive__copy_in_done; + reduce_current = coll_allreduce_naive__reduce_done; + copy_next = coll_alt1_allreduce_naive__copy_in_done; + break; + case 1: + copy_current = coll_alt1_allreduce_naive__copy_in_done; + reduce_current = coll_alt1_allreduce_naive__reduce_done; + copy_next = coll_allreduce_naive__copy_in_done; + break; + default: assert(!"Should not get here."); + } + state_idx = (state_idx + 1) % 2; + + int data_size = chunk_size / chunk_el; + parallel_memcpy(distributed_buffer[current_buffer][world_rank], data_ptr, chunk_size); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->states[state_group] = copy_current; + +#ifdef DO_PROFILE + auto t1 = std::chrono::system_clock::now(); +#endif + + for (int i = 0; i < world_size; i++) { + // wait until all the other ranks copy the buffer + if (i != world_rank) + wait_buffer_state_until_2(i, copy_current, reduce_current, state_group); + } + +#ifdef DO_PROFILE + auto t2 = std::chrono::system_clock::now(); +#endif + + // reduce scatter + reduce_all_buffers(slice_el_start(chunk_el, world_rank), + slice_size(chunk_el, world_rank), + scalar_type, + world_rank, + distributed_buffer[current_buffer][world_rank], + distributed_buffer[current_buffer]); + std::atomic_thread_fence(std::memory_order_release); + workspace[world_rank]->states[state_group] = reduce_current; + +#ifdef DO_PROFILE + auto t3 = std::chrono::system_clock::now(); +#endif + + for (int i = 0; i < world_size; i++) { + // wait until all the other ranks reduce the buffer + if (i != world_rank) wait_buffer_state_until_2(i, reduce_current, copy_next, state_group); + } + + auto t4 = std::chrono::system_clock::now(); + + for (int i = 0; i < world_size; i++) { + int rank = (i + world_rank) % world_size; + parallel_memcpy( + slice_data(data_ptr, chunk_el, data_size, rank), + slice_data( + distributed_buffer[current_buffer][rank], chunk_el, chunk_size / chunk_el, rank), + slice_size(chunk_el, rank) * data_size); + } + + current_buffer = 1 - current_buffer; + +#ifdef DO_PROFILE + auto t5 = std::chrono::system_clock::now(); + count++; + if (count > 0) { + total_t1_t0 += std::chrono::duration_cast(t1 - t0).count(); + total_t2_t1 += std::chrono::duration_cast(t2 - t1).count(); + total_t3_t2 += std::chrono::duration_cast(t3 - t2).count(); + total_t4_t3 += std::chrono::duration_cast(t4 - t3).count(); + total_t5_t4 += std::chrono::duration_cast(t5 - t4).count(); + if (world_rank == 0 && count == 1000) { + printf("distributed_naive_reduce time breakdown:\n"); + printf("\tcopy input buffer: %.2f\n", total_t1_t0 / count); + printf("\twait for copy: %.2f\n", total_t2_t1 / count); + printf("\treduce: %.2f\n", total_t3_t2 / count); + printf("\twait for reduce finish: %.2f\n", total_t4_t3 / count); + printf("\tcopy out: %.2f\n", total_t5_t4 / count); + } + } +#endif +} + +void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size) +{ + for (int offset = 0; offset < data_size; offset += MAX_BUF_SIZE) { + auto data_ptr = ((char*)(data.data_ptr()) + offset); + size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? 
MAX_BUF_SIZE : data_size - offset; + size_t chunk_el = chunk_size / (data_size / numel); + if (chunk_size < NAIVE_ALLREDUCE_THRESHOLD) + symmetric_naive_all_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el); + else + distributed_naive_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el); + } +} diff --git a/csrc/cpu/comm/shm.h b/csrc/cpu/comm/shm.h new file mode 100644 index 000000000000..7f73197a8caa --- /dev/null +++ b/csrc/cpu/comm/shm.h @@ -0,0 +1,12 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#ifndef __SHM_COLLECTIVES__ +#define __SHM_COLLECTIVES__ +#define VECTOR_LENGTH_IN_BYTES 32 +void shm_initialize(int size, int rank, char* addr_string, char* port_string); +void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size); +void barrier_wait(int root_idx, int num_ranks); +#endif diff --git a/csrc/cpu/comm/shm_interface.cpp b/csrc/cpu/comm/shm_interface.cpp new file mode 100644 index 000000000000..5be5cb799a7b --- /dev/null +++ b/csrc/cpu/comm/shm_interface.cpp @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include "shm.h" + +// #define DO_PROFILE +#ifdef DO_PROFILE +#include +#include +#endif + +// Communication settings +static int world_rank = -1; +static int world_size = -1; + +static bool is_initialized = 0; + +static bool all_ranks_local_p = false; + +void initialize(int size, int rank) +{ + if (is_initialized) return; + + // Check whether all ranks is on the same physical machine. + // If true, we will use an SHM based low latency allreduce + + auto ls_string = std::getenv("LOCAL_SIZE"); + int ls = 0; + if (ls_string != NULL) { ls = std::stoi(std::getenv("LOCAL_SIZE")); } + + if (size >= 1 && size == ls) { all_ranks_local_p = true; } + + world_size = size; + world_rank = rank; + is_initialized = 1; + + auto addr_string = std::getenv("MASTER_ADDR"); + if (addr_string == NULL) { addr_string = ""; } + auto port_string = std::getenv("MASTER_PORT"); + if (port_string == NULL) { port_string = ""; } + + if (all_ranks_local_p) { shm_initialize(size, rank, addr_string, port_string); } +} + +void inference_all_reduce_(torch::Tensor& data, int op); + +// Success - return 0 +// Fail (cannot hornor the request and need to fall back) - return -1 +void inference_all_reduce_(torch::Tensor& data, int op) +{ + assert(op == 0); +#ifdef DO_PROFILE + static double total_time = 0.0; + static double total_time_sq = 0.0; + static int count = -16; // warmup + static double max_time = 0.0; + static double min_time = DBL_MAX; + // make sure all rank reach this point before measuring time + // turn on this if you suspect each rank didn't reach here at the same time (stragger) + // if (all_ranks_local_p) { barrier_wait(0, world_size); } + auto start = std::chrono::system_clock::now(); +#endif + + auto numel = data.numel(); + + int data_size = 0; + bool data_type_fallback = false; + + switch (data.scalar_type()) { + case c10::ScalarType::BFloat16: data_size = numel * 2; break; + case c10::ScalarType::Half: data_size = numel * 2; break; + case c10::ScalarType::Float: data_size = numel * 4; break; + default: data_type_fallback = true; + } + + if (data_type_fallback) return; + + all_reduce_outer_loop(data, numel, data_size); + +#ifdef DO_PROFILE + auto end = std::chrono::system_clock::now(); + count++; + if (count > 0) { + double elapsed = std::chrono::duration_cast(end - start).count(); + if (elapsed > max_time) { max_time = 
elapsed; } + if (elapsed < min_time) { min_time = elapsed; } + total_time += elapsed; + total_time_sq += elapsed * elapsed; + if (world_rank == 0 && count == 1000) { + auto avg = total_time / count; + auto sd = + sqrt(total_time_sq / count - total_time * total_time / (count * count)) / avg * 100; + printf(" C++ kernel\t\t %.2f\t %.2f\t%.2f\t %.2f\n", + min_time, + max_time, + total_time / count, + sd); + } + } +#endif + return; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("initialize", &initialize, "shm initialize"); } + +TORCH_LIBRARY(deepspeed, m) +{ + m.def("inference_all_reduce(Tensor self) -> Tensor"); + m.def("inference_all_reduce_(Tensor(a!) self) -> Tensor(a!)"); +} + +torch::Tensor inference_all_reduce_meta(const torch::Tensor& self_) +{ + torch::Tensor result_ = torch::empty_like(self_); + return result_; +} + +torch::Tensor& inference_all_reduce__meta(torch::Tensor& self_) { return self_; } + +torch::Tensor& inference_all_reduce__cpu(torch::Tensor& self_) +{ + TORCH_INTERNAL_ASSERT(self_.device().type() == torch::DeviceType::CPU); + torch::Tensor self_tensor = self_.contiguous(); + inference_all_reduce_(self_tensor, 0); + return self_; +} + +torch::Tensor inference_all_reduce_cpu(const torch::Tensor& self_) +{ + torch::Tensor result = self_.clone(); + inference_all_reduce__cpu(result); + return result; +} + +#include +// The boilerplate functionalization logic, that teaches functionalization +// how to map x_() calls into x() calls. +// Long term, we'd like to not require users to write this logic. +// HOWEVER, if you have a custom op that is mutable, +// You will still need to write an out-of-place version of that op! +at::Tensor& inference_all_reduce__functionalization_glue(at::Tensor& x) +{ + // We expect all tensor inputs to our op to be "functional tensors" + TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(x)); + // First, sync and unwrap and functional tensors + at::functionalization::impl::sync(x); + auto x_ = at::functionalization::impl::from_functional_tensor(x); + // Grab the dispatcher entry corresponding to the out-of-place op, "x" + static auto op_handle = c10::Dispatcher::singleton() + // specify namespace::op_name, op_overload_name + .findSchemaOrThrow("deepspeed::inference_all_reduce", "") + // Specify the C++ schema of the out-of-place op. + .typed(); + // Next, redispatch to the out-of-place op, x() (user called x_, we call x) + at::Tensor tmp_output; + { + at::AutoDispatchSkipFunctionalize guard; + tmp_output = op_handle.call(x_); + } + // Finally, tell functionalization about this mutation. 
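// (Roughly: replace_() points x's functional wrapper at tmp_output, commit_update()
// records that x was mutated, and sync() propagates the new value to anything
// aliasing x; this follows the functionalization boilerplate described above.)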
+ at::functionalization::impl::replace_(x, tmp_output); + at::functionalization::impl::commit_update(x); + at::functionalization::impl::sync(x); + return x; +} + +TORCH_LIBRARY_IMPL(deepspeed, CPU, m) +{ + m.impl("inference_all_reduce", inference_all_reduce_cpu); + m.impl("inference_all_reduce_", inference_all_reduce__cpu); +} + +TORCH_LIBRARY_IMPL(deepspeed, Meta, m) +{ + m.impl("inference_all_reduce", inference_all_reduce_meta); + m.impl("inference_all_reduce_", inference_all_reduce__meta); +} + +TORCH_LIBRARY_IMPL(deepspeed, Functionalize, m) +{ + m.impl("inference_all_reduce_", inference_all_reduce__functionalization_glue); +} diff --git a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h index 2a4300c5cac1..c102234a4dfb 100644 --- a/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h +++ b/csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h @@ -125,11 +125,10 @@ struct CheckArch { std::cerr << #PTR " is not correctly aligned\n"; \ return false; \ } -#define EVOFORMER_CHECK(COND, ERR) \ - if (!(COND)) { \ - std::cerr << "[Evoformer Attention]" \ - << "'" #COND "' failed: " << ERR << "\n"; \ - return false; \ +#define EVOFORMER_CHECK(COND, ERR) \ + if (!(COND)) { \ + std::cerr << "[Evoformer Attention]" << "'" #COND "' failed: " << ERR << "\n"; \ + return false; \ } #endif diff --git a/csrc/fp_quantizer/fp_quantize.cpp b/csrc/fp_quantizer/fp_quantize.cpp new file mode 100644 index 000000000000..903d84270d32 --- /dev/null +++ b/csrc/fp_quantizer/fp_quantize.cpp @@ -0,0 +1,126 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "fp_quantize.h" + +#include +#include +#include + +#define DISPATCH_QUANTIZE(T_TYPE, C_TYPE, mantisa, exponent) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_quantization((C_TYPE*)val.data_ptr(), \ + (uint8_t*)out.data_ptr(), \ + num_groups, \ + group_size, \ + at::cuda::getCurrentCUDAStream(), \ + q_range, \ + q_bits, \ + q_mantisa_bits, \ + stochastic_rounding); \ + } + +at::Tensor quantize(torch::Tensor& out, + torch::Tensor& val, + torch::Tensor& scale, + int group_size, + int stochastic_rounding, + int q_bits, + int q_mantisa_bits) +{ + int total_elems = at::numel(val); + float q_range = q_bits == 8 ? (q_mantisa_bits == 3 ? 480.0 : 114688.0) : // fp8 ranges + (q_bits == 12 ? 510.0 : // fp12 range + (q_bits == 6 ? 28.0 : // fp6 range + 6.0)); // fp4 range (using power 2); TODO (Reza): add the power-4 + // in case accuracy is not matching! 
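// The q_range constants above are the largest normal magnitudes of the target formats,
// assuming no encodings are reserved for Inf/NaN (an assumption used here only to
// explain the constants): max_normal(e, m) = (2 - 2^-m) * 2^(2^(e-1)).
//   fp8  E4M3: (2 - 1/8)   * 2^8  = 480        fp8 E5M2: (2 - 1/4) * 2^16 = 114688
//   fp12 E4M7: (2 - 1/128) * 2^8  = 510        fp6 E3M2: (2 - 1/4) * 2^4  = 28
//   fp4  E2M1: (2 - 1/2)   * 2^2  = 6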
+ int num_groups = total_elems / group_size; + + DISPATCH_QUANTIZE(kHalf, __half, 23, 8); +#ifdef BF16_AVAILABLE + DISPATCH_QUANTIZE(kBFloat16, __nv_bfloat16, 23, 8); +#endif + + return out; +} + +#define DISPATCH_DEQUANTIZE(T_TYPE, C_TYPE, mantisa) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_dequantization((uint8_t*)val_q.data_ptr(), \ + (C_TYPE*)val.data_ptr(), \ + num_groups, \ + group_size, \ + q_mantisa_bits, \ + q_exponent_bits, \ + at::cuda::getCurrentCUDAStream()); \ + return; \ + } + +void dequantize(torch::Tensor& val, + torch::Tensor& val_q, + torch::Tensor& scale, + int group_size, + int q_mantisa_bits, + int q_exponent_bits) +{ + int total_elems = at::numel(val); + + int num_groups = total_elems / group_size; + + DISPATCH_DEQUANTIZE(kHalf, __half, 10); +#ifdef BF16_AVAILABLE + DISPATCH_DEQUANTIZE(kBFloat16, __nv_bfloat16, 7); +#endif +} + +#define DISPATCH_DEQUANTIZE_INDEX(T_TYPE, C_TYPE, mantisa) \ + if (val.options().dtype() == torch::T_TYPE) { \ + launch_selective_dequantization((uint8_t*)val_q.data_ptr(), \ + (C_TYPE*)val.data_ptr(), \ + (int32_t*)indexes.data_ptr(), \ + num_groups, \ + group_size, \ + num_indexes, \ + q_mantisa_bits, \ + q_exponent_bits, \ + at::cuda::getCurrentCUDAStream()); \ + return; \ + } +void selective_dequantize(torch::Tensor& val, + torch::Tensor& val_q, + torch::Tensor& indexes, + int group_size, + int q_mantisa_bits, + int q_exponent_bits) +{ + int total_elems = at::numel(val); + int num_indexes = indexes.size(0); + int num_groups = total_elems / group_size; + + DISPATCH_DEQUANTIZE_INDEX(kHalf, __half, 10); +#ifdef BF16_AVAILABLE + DISPATCH_DEQUANTIZE_INDEX(kBFloat16, __nv_bfloat16, 7); +#endif +} + +at::Tensor get_scales(torch::Tensor& out, int num_groups) +{ + auto options = at::TensorOptions() + .dtype(torch::kFloat) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + auto scales = + torch::from_blob(out.data_ptr(), {num_groups, 1}, {out.stride(0) / 4, 1}, options); + return scales; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("quantize", &quantize, "quantize function"); + m.def("dequantize", &dequantize, "dequantize function"); + m.def("get_scales", &get_scales, "get scales function"); + m.def("selective_dequantize", &selective_dequantize, "selective dequantize function"); +} diff --git a/csrc/fp_quantizer/fp_quantize.cu b/csrc/fp_quantizer/fp_quantize.cu new file mode 100644 index 000000000000..66ea7392e011 --- /dev/null +++ b/csrc/fp_quantizer/fp_quantize.cu @@ -0,0 +1,532 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include "context.h" +#include "fp_quantize.h" +#include "memory_access_utils.h" +#include "reduction_utils.h" + +#include +#include + +#include +#include + +#ifdef BF16_AVAILABLE +#include +#endif +#include + +using ROp = reduce::ROpType; + +namespace quantization { + +constexpr int access_granularity = 16; +constexpr int quanitzed_access_granularity = 4; +constexpr int quanitzed_access_granularity_6bits = 2; +constexpr int threads = 256; +constexpr int warps = threads / 32; + +} // namespace quantization + +template +__device__ void round(uint32_t& mantisa, uint32_t& dst_exponent, curandStatePhilox4_32_10_t* state) +{ + constexpr uint32_t mantisa_mask = (1 << (_mantisa_bits - q_mantisa_bits)) - 1; + uint32_t offset = stochastic_rounding ? 
(curand_poisson(state, 10) & mantisa_mask) + : 1 << (_mantisa_bits - q_mantisa_bits - 1); + mantisa += offset; + dst_exponent += (((mantisa & ~mantisa_mask) == (1 << _mantisa_bits)) ? 1 : 0); +} + +template +__device__ void clip(uint32_t& exponent, uint32_t& mantisa) +{ + constexpr uint32_t max_exponent = (1 << (q_exponent_bits - 1)) + (1 << (_exponent_bits - 1)); + constexpr uint32_t min_exponent = + (1 << (_exponent_bits - 1)) - ((1 << (q_exponent_bits - 1)) - 1); + if (exponent > max_exponent) { + exponent = max_exponent; + mantisa = (((uint32_t)-1) >> (32 - q_mantisa_bits)) << 1; //.11 .. 10 + } + if (exponent < min_exponent) { + exponent = min_exponent; + mantisa = 0; + } +} + +template +__global__ void apply_quantization(T* val, + uint8_t* q_val, + int group_size, + std::pair seed, + float q_range) +{ + int tidx = threadIdx.x; + int wid = tidx >> 5; + int lane = tidx & 0x1f; + int gid = blockIdx.x * quantization::warps + wid; + + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint32_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint32_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint32_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + // CG helpers + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + constexpr uint32_t load_stride = vector_size * hw_warp_size; + constexpr uint32_t store_stride = (total_q_bits * vector_size / 8) * hw_warp_size; + const uint32_t thread_offset = lane * vector_size; + const uint32_t store_thread_offset = lane * (total_q_bits * vector_size / 8); + const uint32_t base_load_offset = gid * group_size + thread_offset; + const uint32_t base_store_offset = + gid * ((group_size * total_q_bits / 8) + 4) + + store_thread_offset; // 4-byte for saving the scale per group + const T* load_base_ptr = val + base_load_offset; + T tmp_buf[unroll * vector_size]; + T cur_max; + reduce::init(&cur_max); + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + curandStatePhilox4_32_10_t state; + curand_init(seed.first, idx, seed.second, &state); + +#pragma unroll + for (int i = 0; i < unroll; i++) { + if (i * load_stride + thread_offset < group_size) { + mem_access::load_global( + &tmp_buf[vector_size * i], load_base_ptr + i * load_stride); + for (int j = 0; j < vector_size; j++) + cur_max = reduce::element(cur_max, __habs(tmp_buf[i * vector_size + j])); + } + } + reduce::_block(tb, warp, &cur_max); + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + uint8_t* store_base_ptr = q_val + base_store_offset; + float scale = (float)q_range / conversion::to(cur_max); +#pragma unroll + for (int i = 0; i < unroll; i++) { + if (i * load_stride + thread_offset < group_size) { + uint64_t q_buf = 0; + uint64_t q_buf1 = 0; +#pragma unroll + for (int j = 0; j < vector_size; j++) { + float val_f = conversion::to(tmp_buf[i * vector_size + j]) * scale; + uint32_t* data = reinterpret_cast(&val_f); + uint32_t sign = (data[0] & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint32_t cur_exponent = (data[0] & _exponent_mask) >> _mantisa_bits; + uint32_t dst_mantisa = (data[0] & _mantisa_mask); + + uint32_t dst_exponent = cur_exponent; + + round<_mantisa_bits, q_mantisa_bits, stochastic_rounding>( + dst_mantisa, dst_exponent, &state); + if (cur_exponent != 0) + clip<_mantisa_bits, _exponent_bits, q_mantisa_bits, q_exponent_bits>( 
+ dst_exponent, dst_mantisa); + + dst_mantisa = (dst_mantisa & mantisa_mask) >> (_mantisa_bits - q_mantisa_bits); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + if (total_q_bits == 8 || total_q_bits == 4 || total_q_bits == 6) + q_buf = q_buf | + ((uint64_t)((uint8_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << j * total_q_bits); + else if (total_q_bits == 12) { + if (j < 5) + q_buf = + q_buf | + ((uint64_t)((uint16_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << j * total_q_bits); + else + q_buf1 = + q_buf1 | + ((uint64_t)((uint16_t)(sign << (q_exponent_bits + q_mantisa_bits) | + (dst_exponent << q_mantisa_bits) | dst_mantisa)) + << (j - 5) * total_q_bits); + } + } + if (total_q_bits == 12) { + uint64_t last_nibble_mask = 0xf; + last_nibble_mask = q_buf1 & last_nibble_mask; + q_buf = (last_nibble_mask << 60) | q_buf; + q_buf1 >>= 4; + } + uint8_t* int8_data = reinterpret_cast(&q_buf); + uint8_t* int8_data1 = reinterpret_cast(&q_buf1); + if (total_q_bits == 6) { + mem_access::store_global( + store_base_ptr + i * store_stride, int8_data); + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity_6bits, + int8_data + quantization::quanitzed_access_granularity_6bits); + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity_6bits * 2, + int8_data + 2 * quantization::quanitzed_access_granularity_6bits); + } else { + mem_access::store_global( + store_base_ptr + i * store_stride, int8_data); + + if (total_q_bits > 4) { + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity, + int8_data + quantization::quanitzed_access_granularity); + if (total_q_bits == 12) { + mem_access::store_global( + store_base_ptr + i * store_stride + + quantization::quanitzed_access_granularity * 2, + int8_data1); + } + } + } + } + } + if (lane == 0) { + float q_scale = conversion::to(cur_max) / (float)q_range; + uint8_t* scale_as_int8 = reinterpret_cast(&q_scale); + uint32_t scale_offset = + gid * ((group_size * total_q_bits / 8) + 4) + (group_size * total_q_bits / 8); + if (total_q_bits != 6) + mem_access::store_global( + q_val + scale_offset, scale_as_int8); + else { + mem_access::store_global( + q_val + scale_offset, scale_as_int8); + mem_access::store_global( + q_val + scale_offset + quantization::quanitzed_access_granularity_6bits, + scale_as_int8 + quantization::quanitzed_access_granularity_6bits); + } + } +} + +template +__global__ void apply_dequantization(uint8_t* val, T* q_val, int group_size, int total_num_elements) +{ + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.x * blockDim.x + threadIdx.x) * vector_size; + + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + const uint32_t g_index = (tidx / group_size); + const uint32_t group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (tidx % group_size) 
* quantized_bits / 8; + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + T* store_base_ptr = q_val + tidx; + float scale; + + uint8_t* scale_as_int8 = reinterpret_cast(&scale); + if (quantized_bits == 6) { + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + mem_access::load_global( + scale_as_int8 + quantization::quanitzed_access_granularity_6bits, + val + g_index * (group_size_bytes + 4) + group_size_bytes + + quantization::quanitzed_access_granularity_6bits); + } else + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { + mem_access::load_global( + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); + } + } + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); +#pragma unroll + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = (uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } + + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); + } + mem_access::store_global(store_base_ptr, store_buf); + } +} + +#define LAUNCH_FOR_QUANTIZATION_UNROLL(COUNT) \ + case COUNT: \ + apply_quantization \ + <<>>(val, q_val, group_size, seed, q_range); \ + break; + +template +void launch_quantization(T* val, + uint8_t* q_val, + int num_groups, + int group_size, + cudaStream_t stream, + float q_range, + int q_bits, + int q_mantisa_bits, + int stochastic_rounding) +{ + const dim3 grid((num_groups + quantization::warps - 1) / quantization::warps); + const dim3 block(quantization::threads); + + std::pair seed = FPContext::Instance().IncrementOffset(16); + + constexpr int vals_per_unroll = hw_warp_size * quantization::access_granularity / sizeof(T); + + const int copy_unroll = (group_size + vals_per_unroll - 1) / vals_per_unroll; + 
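// The switch key below packs exponent_bits * mantissa_bits + stochastic_rounding into
// one integer (q_bits - q_mantisa_bits - 1 is the exponent width), matching the cases
// enumerated by QUANT_SWITCH in includes/fp_quantize.h:
//   fp8 E4M3 -> 4*3 = 12 (13 with stochastic rounding)   fp8 E5M2 -> 5*2 = 10 (11)
//   fp12 E4M7 -> 4*7 = 28 (29)   fp6 E3M2 -> 3*2 = 6 (7)   fp4 E2M1 -> 2*1 = 2 (else branch)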
QUANT_SWITCH((q_bits - q_mantisa_bits - 1) * q_mantisa_bits + stochastic_rounding, [&] { + switch (copy_unroll) { + LAUNCH_FOR_QUANTIZATION_UNROLL(1) + LAUNCH_FOR_QUANTIZATION_UNROLL(2) + LAUNCH_FOR_QUANTIZATION_UNROLL(3) + LAUNCH_FOR_QUANTIZATION_UNROLL(4) + LAUNCH_FOR_QUANTIZATION_UNROLL(5) + LAUNCH_FOR_QUANTIZATION_UNROLL(6) + } + }); +} +#define INSTANTIATE_LAUNCH_QUANTIZATION(T, mantisa, exponent) \ + template void launch_quantization( \ + T*, uint8_t*, int, int, cudaStream_t, float q_range, int, int, int); +// fp8(E4M3), nearest-rounding +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_QUANTIZATION(__nv_bfloat16, 23, 8); +#endif +INSTANTIATE_LAUNCH_QUANTIZATION(__half, 23, 8); + +template +void launch_dequantization(uint8_t* val, + T* q_val, + int num_groups, + int group_size, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream) +{ + int blocks = ((num_groups * group_size) - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(blocks); + const dim3 block(quantization::threads); + DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { + apply_dequantization + <<>>(val, q_val, group_size, (num_groups * group_size)); + }); +} +#define INSTANTIATE_LAUNCH_DEQUANTIZATION(T, mantisa) \ + template void launch_dequantization(uint8_t*, T*, int, int, int, int, cudaStream_t); +// fp8(E4M3) +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_DEQUANTIZATION(__nv_bfloat16, 7); +#endif +INSTANTIATE_LAUNCH_DEQUANTIZATION(__half, 10); + +template +__global__ void apply_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int group_size, + int total_num_elements) +{ + int index = indexes[blockIdx.x]; + constexpr uint32_t vector_size = quantization::access_granularity / sizeof(T); + int tidx = (blockIdx.y * blockDim.x + threadIdx.x) * vector_size; + int input_index = index * total_num_elements + tidx; + constexpr int quantized_bits = _mantisa_bits + _exponent_bits + 1; + constexpr int q_exponent_bits = total_q_bits - q_mantisa_bits - 1; + constexpr uint16_t _mantisa_mask = (1 << _mantisa_bits) - 1; + constexpr uint16_t _exponent_mask = ((1 << _exponent_bits) - 1) << _mantisa_bits; + constexpr uint16_t _sign_mask = 1 << (_mantisa_bits + _exponent_bits); + const uint32_t g_index = (input_index / group_size); + const uint32_t group_size_bytes = (group_size * quantized_bits / 8); + const uint8_t* load_base_ptr = + val + g_index * (group_size_bytes + 4) + (input_index % group_size) * quantized_bits / 8; + + int mantisa_mask = ((1 << q_mantisa_bits) - 1); + mantisa_mask <<= (_mantisa_bits - q_mantisa_bits); + + T* store_base_ptr = q_val + tidx + blockIdx.x * total_num_elements; + float scale; + + uint8_t* scale_as_int8 = reinterpret_cast(&scale); + if (quantized_bits == 6) { + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + mem_access::load_global( + scale_as_int8 + quantization::quanitzed_access_granularity_6bits, + val + g_index * (group_size_bytes + 4) + group_size_bytes + + quantization::quanitzed_access_granularity_6bits); + } else + mem_access::load_global( + scale_as_int8, val + g_index * (group_size_bytes + 4) + group_size_bytes); + + if (tidx < total_num_elements) { + uint64_t q_buf_in; + uint64_t q_buf_in1; + uint8_t* int8_data = reinterpret_cast(&q_buf_in); + uint8_t* int8_data1 = reinterpret_cast(&q_buf_in1); + if (quantized_bits == 6) { + mem_access::load_global( + int8_data, load_base_ptr); + mem_access::load_global( + int8_data + 
quantization::quanitzed_access_granularity_6bits, + load_base_ptr + quantization::quanitzed_access_granularity_6bits); + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity_6bits * 2, + load_base_ptr + quantization::quanitzed_access_granularity_6bits * 2); + } else { + mem_access::load_global(int8_data, + load_base_ptr); + if (quantized_bits > 4) { + mem_access::load_global( + int8_data + quantization::quanitzed_access_granularity, + load_base_ptr + quantization::quanitzed_access_granularity); + if (quantized_bits == 12) { + mem_access::load_global( + int8_data1, load_base_ptr + quantization::quanitzed_access_granularity * 2); + } + } + } + T store_buf[vector_size]; + uint16_t* q_buf = reinterpret_cast(store_buf); +#pragma unroll + for (int j = 0; j < vector_size; j++) { + uint16_t new_data; + if (j < 5 || quantized_bits != 12) { + new_data = (uint16_t)(q_buf_in >> (j * quantized_bits)); + } else { + if (j == 5) { + new_data = (uint16_t)(q_buf_in1); + new_data = (uint16_t)((new_data << 4) | (q_buf_in >> 60)); + } else + new_data = (uint16_t)(q_buf_in1 >> ((j - 6) * quantized_bits + 8)); + } + + uint16_t sign = (new_data & _sign_mask) >> (_mantisa_bits + _exponent_bits); + uint16_t dst_exponent = (new_data & _exponent_mask) >> _mantisa_bits; + uint16_t dst_mantisa = (new_data & _mantisa_mask); + + if (dst_exponent != (1 << q_exponent_bits) - 1) + dst_exponent = (dst_exponent - ((1 << (_exponent_bits - 1)) - 1)) + + (1 << (q_exponent_bits - 1)) - 1; + + q_buf[j] = + ((sign << (q_exponent_bits + q_mantisa_bits)) | (dst_exponent << q_mantisa_bits) | + (dst_mantisa << (q_mantisa_bits - _mantisa_bits))); + float up_cast = conversion::to(store_buf[j]); + store_buf[j] = conversion::to(up_cast * scale); + } + mem_access::store_global(store_base_ptr, store_buf); + } +} + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream) +{ + int total_elements_per_index = (num_groups / num_indexes) * group_size; + int blocks = (total_elements_per_index - 1) / + (quantization::threads * (quantization::access_granularity / sizeof(T))) + + 1; + const dim3 grid(num_indexes, blocks); + const dim3 block(quantization::threads); + DEQUANT_SWITCH(q_mantisa_bits * q_exponent_bits, [&] { + apply_selective_dequantization + <<>>(val, q_val, indexes, group_size, total_elements_per_index); + }); +} +#define INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(T, mantisa) \ + template void launch_selective_dequantization( \ + uint8_t*, T*, int32_t*, int, int, int, int, int, cudaStream_t); +// fp8(E4M3) +#ifdef BF16_AVAILABLE +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__nv_bfloat16, 7); +#endif +INSTANTIATE_LAUNCH_SELECTIVE_DEQUANTIZATION(__half, 10); diff --git a/csrc/fp_quantizer/includes/context.h b/csrc/fp_quantizer/includes/context.h new file mode 100644 index 000000000000..5bd9badbcb4f --- /dev/null +++ b/csrc/fp_quantizer/includes/context.h @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include +#include +#include +#include +#include "cublas_v2.h" +#include "cuda.h" +#include "curand.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#define WARP_SIZE 32 + +class FPContext { +public: + FPContext() : _seed(42) + { + curandCreateGenerator(&_gen, CURAND_RNG_PSEUDO_DEFAULT); + curandSetPseudoRandomGeneratorSeed(_gen, 123); + } + + virtual ~FPContext() {} + + static FPContext& Instance() + { + static FPContext _ctx; + return _ctx; + } + + curandGenerator_t& GetRandGenerator() { return _gen; } + + cudaStream_t GetCurrentStream() + { + // get current pytorch stream. + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + return stream; + } + + std::pair IncrementOffset(uint64_t offset_inc) + { + uint64_t offset = _curr_offset; + _curr_offset += offset_inc; + return std::pair(_seed, offset); + } + + void SetSeed(uint64_t new_seed) { _seed = new_seed; } + +private: + curandGenerator_t _gen; + cublasHandle_t _cublasHandle; + uint64_t _seed; + uint64_t _curr_offset; +}; diff --git a/csrc/fp_quantizer/includes/fp_quantize.h b/csrc/fp_quantizer/includes/fp_quantize.h new file mode 100644 index 000000000000..60c75541f603 --- /dev/null +++ b/csrc/fp_quantizer/includes/fp_quantize.h @@ -0,0 +1,128 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#include +#include + +#include + +#ifdef BF16_AVAILABLE +#include +#endif +#include +#include + +#define QUANT_SWITCH(Q_BITS, ...) \ + [&] { \ + if (12 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + __VA_ARGS__(); \ + } else if (13 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + __VA_ARGS__(); \ + } else if (10 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (11 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 8; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (28 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 12; \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + __VA_ARGS__(); \ + } else if (29 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 12; \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + __VA_ARGS__(); \ + } else if (6 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 6; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (7 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 6; \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + __VA_ARGS__(); \ + } else if (2 == Q_BITS) { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 0; \ + constexpr int CONST_Q_BITS = 4; \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + __VA_ARGS__(); \ + } else { \ + constexpr int CONST_STOCHASTIC_ROUNDING = 1; \ + constexpr int CONST_Q_BITS = 4; \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + __VA_ARGS__(); \ + } \ + }() + +#define DEQUANT_SWITCH(Q_MANTISA_EXPONENT_BITS, ...) 
\ + [&] { \ + if (12 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 3; \ + constexpr int CONST_Q_EXPONENT_BITS = 4; \ + __VA_ARGS__(); \ + } else if (10 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + constexpr int CONST_Q_EXPONENT_BITS = 5; \ + __VA_ARGS__(); \ + } else if (28 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 7; \ + constexpr int CONST_Q_EXPONENT_BITS = 4; \ + __VA_ARGS__(); \ + } else if (6 == Q_MANTISA_EXPONENT_BITS) { \ + constexpr int CONST_Q_MANTISA_BITS = 2; \ + constexpr int CONST_Q_EXPONENT_BITS = 3; \ + __VA_ARGS__(); \ + } else { \ + constexpr int CONST_Q_MANTISA_BITS = 1; \ + constexpr int CONST_Q_EXPONENT_BITS = 2; \ + __VA_ARGS__(); \ + } \ + }() + +template +void launch_quantization(T* val, + uint8_t* q_val, + int num_groups, + int group_size, + cudaStream_t stream, + float q_range, + int q_bits, + int q_mantisa_bits, + int stochastic_rounding); + +template +void launch_dequantization(uint8_t* val, + T* q_val, + int num_groups, + int group_size, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream); + +template +void launch_selective_dequantization(uint8_t* val, + T* q_val, + int32_t* indexes, + int num_groups, + int group_size, + int num_indexes, + int q_mantisa_bits, + int q_exponent_bits, + cudaStream_t stream); diff --git a/csrc/gds/py_lib/deepspeed_gds_op.cpp b/csrc/gds/py_lib/deepspeed_gds_op.cpp new file mode 100644 index 000000000000..b7055c8cc72b --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_gds_op.cpp @@ -0,0 +1,161 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_gds_op.h" + +using namespace std; + +// For when there is more than 1 device +static std::map> base_ptr_registry; + +static void _safe_handle_register(const int fd, CUfileDescr_t& cf_descr, CUfileHandle_t& cf_handle) +{ + memset((void*)&cf_descr, 0, sizeof(CUfileDescr_t)); + cf_descr.handle.fd = fd; + cf_descr.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD; + CUfileError_t status = cuFileHandleRegister(&cf_handle, &cf_descr); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + close(fd); + exit(EXIT_FAILURE); + } +} + +static void* _find_base_ptr(const int64_t device, char* buf_ptr) +{ + void* base_ptr = nullptr; + int64_t last = -1; + int64_t ptr_diff; + for (const auto& value : base_ptr_registry[device]) { + ptr_diff = buf_ptr - (char*)value; + if (last == -1 && ptr_diff >= 0) { + last = ptr_diff; + base_ptr = value; + } else if (ptr_diff < last && ptr_diff >= 0) { + last = ptr_diff; + base_ptr = value; + } + } + if (!base_ptr || buf_ptr < base_ptr) { + std::cerr << "BASE PTR ERROR :" << base_ptr << " BUF PTR " << (void*)buf_ptr << std::endl; + for (const auto& value : base_ptr_registry[device]) { + std::cerr << "BASE PTR AVAIL :" << value << std::endl; + } + exit(EXIT_FAILURE); + } + + return base_ptr; +} + +void gds_op_desc_t::add_buffer_to_registry(const torch::Tensor& buffer) +{ + const int64_t device = buffer.get_device(); + void* reg_ptr = buffer.data_ptr(); + + // TODO: add checking to make sure pointer isn't already in set + const auto it = base_ptr_registry.find(device); + if (it == base_ptr_registry.end()) { + std::set new_ptr_set; + new_ptr_set.insert(reg_ptr); + base_ptr_registry.insert(std::pair>(device, new_ptr_set)); + } else { + 
base_ptr_registry[device].insert(reg_ptr); + } + + check_cudaruntimecall(cudaSetDevice(device)); + CUfileError_t status = cuFileBufRegister(reg_ptr, buffer.nbytes(), 0); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "buffer register failed:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } +} + +void gds_op_desc_t::remove_buffer_from_registry(const torch::Tensor& buffer) +{ + const int64_t device = buffer.get_device(); + void* reg_ptr = buffer.data_ptr(); + + // std::cout << "DEREG PTR " << reg_ptr << std::endl; + check_cudaruntimecall(cudaSetDevice(device)); + cuFileBufDeregister(reg_ptr); + + // Remove from tracked registry + base_ptr_registry[device].erase(reg_ptr); +} + +gds_op_desc_t::gds_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset) + : io_op_desc_t(read_op, + buffer, + fd, + filename, + file_num_bytes, + intra_op_parallelism, + validate, + file_offset) +{ + _contiguous_buffer = _buffer.contiguous(); + const int64_t device = _buffer.get_device(); + check_cudaruntimecall(cudaSetDevice(device)); + _base_ptr = _find_base_ptr(device, (char*)_contiguous_buffer.data_ptr()); + + _safe_handle_register(fd, _cf_descr, _cf_handle); +} + +char* gds_op_desc_t::data_ptr() const { return (char*)_contiguous_buffer.data_ptr(); } + +void gds_op_desc_t::finish() { cuFileHandleDeregister(_cf_handle); } + +void gds_op_desc_t::validate() +{ + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + const auto cpu_buffer = _buffer.to(torch::kCPU); + validate_aio_operation( + _read_op, _filename.c_str(), (char*)(cpu_buffer.data_ptr()), _file_num_bytes); +} + +void gds_op_desc_t::run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config) +{ + assert(tid < _intra_op_parallelism); + check_cudaruntimecall(cudaSetDevice(_buffer.get_device())); + const auto buf_offset = data_ptr() + (_num_bytes_per_thread * tid) - (char*)_base_ptr; + const auto tid_file_offset = _file_offset + (_num_bytes_per_thread * tid); + + if (_read_op) { + auto ret = + cuFileRead(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } + } else { + auto ret = + cuFileWrite(_cf_handle, _base_ptr, _num_bytes_per_thread, tid_file_offset, buf_offset); + if (ret < 0) { _report_error(ret, errno, tid_file_offset); } + } +} + +void gds_op_desc_t::_report_error(const ssize_t return_code, + const int error_num, + const off_t offset) +{ + const auto op_string = _read_op ? "read failed with " : "write failed with "; + const auto error_string = IS_CUFILE_ERR(return_code) ? "cuFile error: " : "posix error: "; + const auto error_code = IS_CUFILE_ERR(return_code) ? cuFileGetErrorString(return_code) + : cuFileGetErrorString(error_num); + std::cerr << op_string << error_string << error_code << " return code = " << return_code + << " filename = " << _filename.c_str() << " num bytes = " << _num_bytes_per_thread + << " offset = " << offset << std::endl; + exit(EXIT_FAILURE); +} diff --git a/csrc/gds/py_lib/deepspeed_gds_op.h b/csrc/gds/py_lib/deepspeed_gds_op.h new file mode 100644 index 000000000000..d955527b1ba3 --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_gds_op.h @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include +#include +#include +#include + +#include "deepspeed_aio_op_desc.h" +#include "deepspeed_gds_utils.h" + +struct gds_op_desc_t : io_op_desc_t { + CUfileDescr_t _cf_descr; + CUfileHandle_t _cf_handle; + void* _base_ptr; + + gds_op_desc_t(const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const int intra_op_parallelism, + const bool validate, + const int64_t file_offset); + + void run(const int tid, + std::unique_ptr& aio_ctxt, + deepspeed_aio_config_t* aio_config); + + char* data_ptr() const; + + void validate(); + + void finish(); + + void _report_error(const ssize_t return_code, const int error_num, const off_t offset); + + static void add_buffer_to_registry(const torch::Tensor& buffer); + + static void remove_buffer_from_registry(const torch::Tensor& buffer); +}; diff --git a/csrc/gds/py_lib/deepspeed_gds_utils.h b/csrc/gds/py_lib/deepspeed_gds_utils.h new file mode 100644 index 000000000000..12b014d90988 --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_gds_utils.h @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +// CUDA/cuFile includes +#include +#include +#include "cufile.h" + +// Macro for checking cuda errors following a cuda launch or api call +#define cudaCheckError() \ + { \ + cudaError_t e = cudaGetLastError(); \ + if (e != cudaSuccess) { \ + printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define check_cudadrivercall(fn) \ + do { \ + CUresult res = fn; \ + if (res != CUDA_SUCCESS) { \ + const char* str = nullptr; \ + cuGetErrorName(res, &str); \ + std::cerr << "cuda driver api call failed " << #fn << " res : " << res << ", " \ + << __LINE__ << ":" << str << std::endl; \ + std::cerr << "EXITING program!!!" << std::endl; \ + exit(1); \ + } \ + } while (0) + +#define check_cudaruntimecall(fn) \ + do { \ + cudaError_t res = fn; \ + if (res != cudaSuccess) { \ + const char* str = cudaGetErrorName(res); \ + std::cerr << "cuda runtime api call failed " << #fn << __LINE__ << ":" << str \ + << std::endl; \ + std::cerr << "EXITING program!!!" << std::endl; \ + exit(1); \ + } \ + } while (0) + +#define check_cuFileCall(fn, api_msg) \ + do { \ + CUfileError_t status = fn; \ + if (status.err != CU_FILE_SUCCESS) { \ + std::cout << api_msg << " failed with error " << CUFILE_ERRSTR(status.err) \ + << std::endl; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// +// cuda driver error description +// +static inline const char* GetCuErrorString(CUresult curesult) +{ + const char* descp; + if (cuGetErrorName(curesult, &descp) != CUDA_SUCCESS) descp = "unknown cuda error"; + return descp; +} + +// +// cuFile APIs return both cuFile specific error codes as well as POSIX error codes +// for ease, the below template can be used for getting the error description depending +// on its type. + +// POSIX +template ::value, std::nullptr_t>::type = nullptr> +std::string cuFileGetErrorString(T status) +{ + status = std::abs(status); + return IS_CUFILE_ERR(status) ? 
std::string(CUFILE_ERRSTR(status)) + : std::string(std::strerror(status)); +} + +// CUfileError_t +template ::value, std::nullptr_t>::type = nullptr> +std::string cuFileGetErrorString(T status) +{ + std::string errStr = cuFileGetErrorString(static_cast(status.err)); + if (IS_CUDA_ERR(status)) errStr.append(".").append(GetCuErrorString(status.cu_err)); + return errStr; +} diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp new file mode 100644 index 000000000000..f11245c75a5e --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.cpp @@ -0,0 +1,125 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* + GPUDirect Storage functionality for swapping optimizer tensors to/from (NVMe) storage devices. +*/ + +#include "deepspeed_py_gds_handle.h" +#include +#include "deepspeed_gds_op.h" + +using namespace std; + +int deepspeed_gds_handle_t::s_cuFile_init = 0; + +deepspeed_gds_handle_t::deepspeed_gds_handle_t(const int block_size, + const int queue_depth, + const bool single_submit, + const bool overlap_events, + const int intra_op_parallelism) + : deepspeed_io_handle_t(block_size, queue_depth, single_submit, overlap_events, 1), + _intra_gds_op_parallelism(intra_op_parallelism) +{ + _init_cuFile(block_size, queue_depth); +} + +deepspeed_gds_handle_t::~deepspeed_gds_handle_t() { _close_cuFile(); } + +const int deepspeed_gds_handle_t::get_intra_op_parallelism() const +{ + return _intra_gds_op_parallelism; +} + +void deepspeed_gds_handle_t::_init_cuFile(const int block_size, const int queue_depth) +{ + if (deepspeed_gds_handle_t::s_cuFile_init == 0) { + std::string depthStr = std::to_string(queue_depth); + std::string threadsStr = std::to_string(_intra_gds_op_parallelism); + std::string json1 = R"({"execution": {"max_io_queue_depth": )" + depthStr + ", "; + std::string json2 = R"("max_request_parallelism": )" + threadsStr + ", "; + std::string json3 = R"("max_io_threads": )" + threadsStr + ", "; + std::string json4 = R"("parallel_io": true, "min_io_threshold_size_kb": 8192}})"; + std::ofstream outFile("local_cufile.json"); + if (outFile.is_open()) { + outFile << json1 + json2 + json3 + json4; + outFile.close(); + } else { + std::cerr << "Can't open local cufile" << std::endl; + exit(EXIT_FAILURE); + } + // TODO: Address the following issues with this code + // (1) Fix C++14 warning + // (2) Create file in a different location than PWD + // (3) Handle multi-GPU/multi-rank scenarios: should cufile be shared, is per-rank cufile + // safe? 
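+        // Example: with queue_depth=128 and intra_op_parallelism=1 (the defaults exposed by
+        // the Python bindings below), the generated local_cufile.json is:
+        //   {"execution": {"max_io_queue_depth": 128, "max_request_parallelism": 1,
+        //    "max_io_threads": 1, "parallel_io": true, "min_io_threshold_size_kb": 8192}}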
+ putenv("CUFILE_ENV_PATH_JSON=$PWD/local_cufile.json"); + cuFileDriverOpen(); + cudaCheckError(); + size_t direct_io_size = (size_t)block_size / 1024; + CUfileError_t status = cuFileDriverSetMaxDirectIOSize(direct_io_size); + if (status.err != CU_FILE_SUCCESS) { + std::cerr << "file register error:" << cuFileGetErrorString(status) << std::endl; + exit(EXIT_FAILURE); + } + } + deepspeed_gds_handle_t::s_cuFile_init++; +} + +void deepspeed_gds_handle_t::_close_cuFile() +{ + deepspeed_gds_handle_t::s_cuFile_init--; + if (deepspeed_gds_handle_t::s_cuFile_init == 0) { cuFileDriverClose(); } +} + +torch::Tensor deepspeed_gds_handle_t::new_pinned_device_tensor(const size_t num_elem, + const torch::Tensor& example_tensor) +{ + auto options = torch::TensorOptions().dtype(example_tensor.scalar_type()).device(torch::kCUDA); + auto dev_tensor = torch::empty(num_elem, options); + pin_device_tensor(dev_tensor); + return dev_tensor; +} + +bool deepspeed_gds_handle_t::free_pinned_device_tensor(torch::Tensor& buffer) +{ + unpin_device_tensor(buffer); + return true; +} + +bool deepspeed_gds_handle_t::pin_device_tensor(const torch::Tensor& buffer) +{ + gds_op_desc_t::add_buffer_to_registry(buffer); + return true; +} + +bool deepspeed_gds_handle_t::unpin_device_tensor(const torch::Tensor& buffer) +{ + gds_op_desc_t::remove_buffer_from_registry(buffer); + return true; +} + +std::shared_ptr deepspeed_gds_handle_t::_create_io_op_desc( + const bool read_op, + const torch::Tensor& buffer, + const int fd, + const char* filename, + const int64_t file_num_bytes, + const bool validate, + const int64_t file_offset) +{ + if (buffer.is_cuda()) { + return std::make_shared(read_op, + buffer, + fd, + filename, + file_num_bytes, + _intra_op_parallelism, + validate, + file_offset); + } + return deepspeed_io_handle_t::_create_io_op_desc( + read_op, buffer, fd, filename, file_num_bytes, validate, file_offset); +} diff --git a/csrc/gds/py_lib/deepspeed_py_gds_handle.h b/csrc/gds/py_lib/deepspeed_py_gds_handle.h new file mode 100644 index 000000000000..25f68e177b2c --- /dev/null +++ b/csrc/gds/py_lib/deepspeed_py_gds_handle.h @@ -0,0 +1,49 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Functionality for swapping optimizer tensors to/from (NVMe) storage devices. 
+*/
+
+#include
+#include
+#include "deepspeed_py_io_handle.h"
+
+struct deepspeed_gds_handle_t : deepspeed_io_handle_t {
+    const int _intra_gds_op_parallelism;
+
+    deepspeed_gds_handle_t(const int block_size,
+                           const int queue_depth,
+                           const bool single_submit,
+                           const bool overlap_events,
+                           const int intra_op_parallelism);
+
+    ~deepspeed_gds_handle_t();
+
+    torch::Tensor new_pinned_device_tensor(const size_t num_elem,
+                                           const torch::Tensor& example_tensor);
+
+    bool free_pinned_device_tensor(torch::Tensor&);
+
+    bool pin_device_tensor(const torch::Tensor& buffer);
+
+    bool unpin_device_tensor(const torch::Tensor& buffer);
+
+    void _init_cuFile(const int block_size, const int queue_depth);
+
+    void _close_cuFile();
+
+    const int get_intra_op_parallelism() const;
+
+    std::shared_ptr _create_io_op_desc(const bool read_op,
+                                       const torch::Tensor& buffer,
+                                       const int fd,
+                                       const char* filename,
+                                       const int64_t file_num_bytes,
+                                       const bool validate,
+                                       const int64_t file_offset);
+
+    static int s_cuFile_init;
+};
diff --git a/csrc/gds/py_lib/py_ds_gds.cpp b/csrc/gds/py_lib/py_ds_gds.cpp
new file mode 100644
index 000000000000..2f165ee2c32a
--- /dev/null
+++ b/csrc/gds/py_lib/py_ds_gds.cpp
@@ -0,0 +1,130 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0
+
+// DeepSpeed Team
+
+/*
+Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
+*/
+
+#include
+#include "deepspeed_py_gds_handle.h"
+using namespace pybind11::literals;
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+{
+    py::class_(m, "gds_handle")
+        .def(py::init(),
+             "GDS handle constructor",
+             "block_size"_a = 1024 * 1024,
+             "queue_depth"_a = 128,
+             "single_submit"_a = false,
+             "overlap_events"_a = false,
+             "intra_op_parallelism"_a = 1)
+
+        .def("get_block_size", &deepspeed_gds_handle_t::get_block_size)
+        .def("get_queue_depth", &deepspeed_gds_handle_t::get_queue_depth)
+        .def("get_single_submit", &deepspeed_gds_handle_t::get_single_submit)
+        .def("get_overlap_events", &deepspeed_gds_handle_t::get_overlap_events)
+        .def("get_intra_op_parallelism", &deepspeed_gds_handle_t::get_intra_op_parallelism)
+
+        .def("read",
+             &deepspeed_gds_handle_t::read,
+             "Synchronous and non-parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "file_offset"_a = 0)
+
+        .def("write",
+             &deepspeed_gds_handle_t::write,
+             "Synchronous and non-parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "file_offset"_a = 0)
+
+        .def("pread",
+             &deepspeed_gds_handle_t::pread,
+             "Parallel file read with option of parallelism. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a,
+             "file_offset"_a = 0)
+
+        .def("pwrite",
+             &deepspeed_gds_handle_t::pwrite,
+             "Parallel file write with option of parallelism. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "validate"_a,
+             "async"_a,
+             "file_offset"_a = 0)
+
+        .def("sync_pread",
+             &deepspeed_gds_handle_t::sync_pread,
+             "Synchronous parallel file read. Returns count of completed read ops",
+             "buffer"_a,
+             "filename"_a,
+             "file_offset"_a = 0)
+
+        .def("sync_pwrite",
+             &deepspeed_gds_handle_t::sync_pwrite,
+             "Synchronous parallel file write. Returns count of completed write ops",
+             "buffer"_a,
+             "filename"_a,
+             "file_offset"_a = 0)
+
+        .def("async_pread",
+             &deepspeed_gds_handle_t::async_pread,
+             "Asynchronous parallel file read. 
Returns 0 on success, and " + "following wait() returns count of completed ops.", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("async_pwrite", + &deepspeed_gds_handle_t::async_pwrite, + "Asynchronous parallel file write. Returns 0 on success, and following wait() returns " + "count of completed ops.", + "buffer"_a, + "filename"_a, + "file_offset"_a = 0) + + .def("new_cpu_locked_tensor", + &deepspeed_gds_handle_t::new_cpu_locked_tensor, + "Allocate pinned CPU tensor.", + "num_elem"_a, + "example_tenosr"_a) + + .def("free_cpu_locked_tensor", + &deepspeed_gds_handle_t::free_cpu_locked_tensor, + "Free pinned CPU tensor.", + "tensor"_a) + + .def("new_pinned_device_tensor", + &deepspeed_gds_handle_t::new_pinned_device_tensor, + "Allocate pinned device tensor.", + "num_elem"_a, + "example_tenosr"_a) + + .def("free_pinned_device_tensor", + &deepspeed_gds_handle_t::free_pinned_device_tensor, + "Free pinned device tensor.", + "tensor"_a) + + .def("pin_device_tensor", + &deepspeed_gds_handle_t::pin_device_tensor, + "Pin device tensor.", + "tensor"_a) + + .def("unpin_device_tensor", + &deepspeed_gds_handle_t::unpin_device_tensor, + "Unpin device tensor.", + "tensor"_a) + + .def("wait", + &deepspeed_gds_handle_t::wait, + "Wait for (ongoing) asynchronous operations to complete"); +} diff --git a/csrc/gds/py_test/validate_gds.py b/csrc/gds/py_test/validate_gds.py new file mode 100644 index 000000000000..b34b1194f582 --- /dev/null +++ b/csrc/gds/py_test/validate_gds.py @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +from deepspeed.ops.op_builder import GDSBuilder +assert GDSBuilder().is_compatible(True) +assert GDSBuilder().load(True) diff --git a/csrc/includes/cpu_adagrad.h b/csrc/includes/cpu_adagrad.h index 59888adf17c3..6f500250f033 100644 --- a/csrc/includes/cpu_adagrad.h +++ b/csrc/includes/cpu_adagrad.h @@ -9,84 +9,35 @@ // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c #include +#include #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ + ds_state_precision_t* _exp_avg_sq, \ + size_t _param_size); class Adagrad_Optimizer { public: Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index 
= false; -#endif - } - ~Adagrad_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Adagrad_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif inline void IncrementStep(size_t step) { _step++; @@ -107,29 +58,22 @@ class Adagrad_Optimizer { float _betta1_t; float _betta2_t; size_t _step; - -#if defined(__ENABLE_CUDA__) - bool _buf_index; - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; AVX_Data eps_4; eps_4.data = SIMD_SET(_eps); @@ -145,24 +89,19 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + i, half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, grads + i, false); + simd_load(momentum_4, grads + i); AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); + simd_load(variance_4, _exp_avg_sq + i); AVX_Data param_4[span]; - simd_load(param_4, _params + i, half_precision); + simd_load(param_4, _params + i); if (_weight_decay > 0) { simd_fma(grad_4, param_4, weight_decay4, grad_4); } @@ -172,37 +111,9 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, simd_div(grad_4, momentum_4, grad_4); simd_fma(param_4, grad_4, step_size_4, param_4); - simd_store(_params + i, param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg_sq + i, variance_4, false); + simd_store(_params + i, param_4); + simd_store(_exp_avg_sq + i, variance_4); } -#if 
defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memoryCopySize /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; -#endif } *rounded_size = new_rounded_size; } diff --git a/csrc/includes/cpu_adam.h b/csrc/includes/cpu_adam.h index 44d3ed3cac61..a7db6fda3705 100644 --- a/csrc/includes/cpu_adam.h +++ b/csrc/includes/cpu_adam.h @@ -13,29 +13,13 @@ #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -#include -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - float* _exp_avg_sq, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ + ds_state_precision_t* _exp_avg, \ + ds_state_precision_t* _exp_avg_sq, \ + size_t _param_size); class Adam_Optimizer { public: @@ -55,56 +39,21 @@ class Adam_Optimizer { _step(0), _adamw_mode(adamw_mode) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index = false; -#endif - } - ~Adam_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Adam_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif inline void IncrementStep(size_t step, float beta1, float beta2) { if (beta1 != _betta1 || beta2 != _betta2) { @@ -154,32 +103,24 @@ class Adam_Optimizer { float 
_bias_correction2; bool _adamw_mode; - -#if defined(__ENABLE_CUDA__) - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; - bool _buf_index; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Adam_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - float* _exp_avg_sq, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + ds_state_precision_t* _exp_avg_sq, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; - int rshft = half_precision ? 1 : 0; AVX_Data betta1_4; betta1_4.data = SIMD_SET(_betta1); @@ -212,25 +153,19 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream((_streams[_buf_index].stream()); - } -#endif #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + (i >> rshft), half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); + simd_load(momentum_4, _exp_avg + i); AVX_Data variance_4[span]; - simd_load(variance_4, _exp_avg_sq + i, false); + simd_load(variance_4, _exp_avg_sq + i); AVX_Data param_4[span]; - simd_load(param_4, _params + (i >> rshft), half_precision); + simd_load(param_4, _params + i); if (_weight_decay > 0 && !_adamw_mode) { simd_fma(grad_4, param_4, weight_decay4, grad_4); @@ -251,38 +186,10 @@ void Adam_Optimizer::Step_AVX(size_t* rounded_size, simd_fma(param_4, grad_4, step_size_4, param_4); - simd_store(_params + (i >> rshft), param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg + i, momentum_4, false); - simd_store(_exp_avg_sq + i, variance_4, false); - } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; + simd_store(_params + i, param_4); + simd_store(_exp_avg + i, momentum_4); + simd_store(_exp_avg_sq + i, variance_4); } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memoryCopySize /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; -#endif } *rounded_size = new_rounded_size; } @@ -310,18 +217,4 @@ int ds_adam_step(int optimizer_id, torch::Tensor& exp_avg, torch::Tensor& exp_avg_sq); -int ds_adam_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float 
epsilon, - float weight_decay, - bool bias_correction, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& exp_avg_sq, - torch::Tensor& gpu_params); - int destroy_adam_optimizer(int optimizer_id); diff --git a/csrc/includes/cpu_lion.h b/csrc/includes/cpu_lion.h index d83fe9473332..beaf357a3211 100644 --- a/csrc/includes/cpu_lion.h +++ b/csrc/includes/cpu_lion.h @@ -13,28 +13,12 @@ #include #include "simd.h" -#if defined(__ENABLE_CUDA__) -#include -#include -#include "cuda.h" -#include "custom_cuda_layers.h" -typedef __half ds_half_precision_t; -#elif defined(__ENABLE_CANN__) -#include "acl/acl.h" -#include "torch_npu/csrc/core/npu/NPUStream.h" -typedef c10::Half ds_half_precision_t; -#else -#include -typedef unsigned short ds_half_precision_t; -#endif - -#define STEP(SPAN) \ - void Step_##SPAN(float* _params, \ - float* grads, \ - float* _exp_avg, \ - size_t _param_size, \ - ds_half_precision_t* dev_param = nullptr, \ - bool half_precision = false); +#define STEP(SPAN) \ + template \ + void Step_##SPAN(ds_params_precision_t* _params, \ + ds_params_precision_t* grads, \ + ds_state_precision_t* _exp_avg, \ + size_t _param_size); class Lion_Optimizer { public: @@ -44,55 +28,21 @@ class Lion_Optimizer { float weight_decay = 0) : _alpha(alpha), _betta1(betta1), _betta2(betta2), _weight_decay(weight_decay), _step(0) { -#if defined(__ENABLE_CUDA__) - cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _streams[0] = TrainingContext::Instance().GetCurrentStream(); - _streams[1] = TrainingContext::Instance().GetNewStream(); - _buf_index = false; -#elif defined(__ENABLE_CANN__) - aclrtMallocHost((void**)_doubled_buffer, TILE * sizeof(float)); - aclrtMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float)); - - _buf_index = false; -#endif - } - ~Lion_Optimizer() - { -#if defined(__ENABLE_CUDA__) - cudaFreeHost(_doubled_buffer[0]); - cudaFreeHost(_doubled_buffer[1]); -#elif defined(__ENABLE_CANN__) - aclrtFreeHost(_doubled_buffer[0]); - aclrtFreeHost(_doubled_buffer[1]); -#endif } + ~Lion_Optimizer() {} #if defined(__AVX512__) or defined(__AVX256__) - template + template void Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - size_t param_size, - ds_half_precision_t* dev_param = nullptr, - bool half_precision = false); + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + size_t param_size); #endif STEP(1) STEP(4) STEP(8) -#if defined(__ENABLE_CUDA__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]); - } -#elif defined(__ENABLE_CANN__) - inline void SynchronizeStreams() - { - for (int i = 0; i < 2; i++) aclrtSynchronizeStream(_streams[i].stream()); - } -#endif + inline void IncrementStep(size_t step, float beta1, float beta2) { _step++; @@ -114,31 +64,23 @@ class Lion_Optimizer { float _betta2; float _weight_decay; size_t _step; - -#if defined(__ENABLE_CUDA__) - float* _doubled_buffer[2]; - cudaStream_t _streams[2]; - bool _buf_index; -#elif defined(__ENABLE_CANN__) - float* _doubled_buffer[2]; - c10_npu::NPUStream _streams[2] = {c10_npu::getCurrentNPUStream(), - c10_npu::getNPUStreamFromPool()}; - bool _buf_index; -#endif }; #if defined(__AVX512__) or defined(__AVX256__) -template +template void Lion_Optimizer::Step_AVX(size_t* rounded_size, - float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - 
ds_half_precision_t* dev_params, - bool half_precision) + ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { +#if !defined(__AVX512__) + if (std::is_same_v || + std::is_same_v) { + return; + } +#endif size_t new_rounded_size = 0; - int rshft = half_precision ? 1 : 0; constexpr float neg1 = -1.0f; AVX_Data neg1_4; @@ -169,21 +111,17 @@ void Lion_Optimizer::Step_AVX(size_t* rounded_size, size_t copy_size = TILE; if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif + #pragma omp parallel for for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { AVX_Data grad_4[span]; - simd_load(grad_4, grads + (i >> rshft), half_precision); + simd_load(grad_4, grads + i); AVX_Data momentum_4[span]; - simd_load(momentum_4, _exp_avg + i, false); + simd_load(momentum_4, _exp_avg + i); AVX_Data param_4[span]; - simd_load(param_4, _params + (i >> rshft), half_precision); + simd_load(param_4, _params + i); AVX_Data tmp_4[span]; @@ -201,37 +139,9 @@ void Lion_Optimizer::Step_AVX(size_t* rounded_size, simd_mul(momentum_4, momentum_4, betta2_4); simd_fma(momentum_4, grad_4, betta2_minus1_4, momentum_4); - simd_store(_params + (i >> rshft), param_4, half_precision); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) { - simd_store(_doubled_buffer[_buf_index] + (i - t), param_4, half_precision); - } -#endif - simd_store(_exp_avg + i, momentum_4, false); - } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - if (half_precision) - launch_param_update_half( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - else - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, copy_size, _streams[_buf_index]); - - _buf_index = !_buf_index; + simd_store(_params + i, param_4); + simd_store(_exp_avg + i, momentum_4); } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - if (half_precision) memoryCopySize /= 2; - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; -#endif } *rounded_size = new_rounded_size; } @@ -254,15 +164,4 @@ int ds_lion_step(int optimizer_id, torch::Tensor& grads, torch::Tensor& exp_avg); -int ds_lion_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& gpu_params); - int destroy_lion_optimizer(int optimizer_id); diff --git a/csrc/includes/cublas_wrappers.h b/csrc/includes/cublas_wrappers.h index b57ff79923fc..2721fb990c7e 100644 --- a/csrc/includes/cublas_wrappers.h +++ b/csrc/includes/cublas_wrappers.h @@ -17,6 +17,7 @@ #include #endif #include +#include int cublas_gemm_ex(cublasHandle_t handle, cublasOperation_t transa, @@ -29,7 +30,9 @@ int cublas_gemm_ex(cublasHandle_t handle, const float* A, const float* B, float* C, -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo algo = 
rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); @@ -46,7 +49,8 @@ int cublas_gemm_ex(cublasHandle_t handle, const __half* A, const __half* B, __half* C, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -67,7 +71,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT); @@ -88,7 +93,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, int stride_B, int stride_C, int batch, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo algo = rocblas_gemm_algo_standard); #else cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP); diff --git a/csrc/includes/custom_cuda_layers.h b/csrc/includes/custom_cuda_layers.h index 265eb7b12444..21f19749d4cf 100644 --- a/csrc/includes/custom_cuda_layers.h +++ b/csrc/includes/custom_cuda_layers.h @@ -272,9 +272,6 @@ void launch_fuse_transpose_bias_kernel(const T* inp, int cols, cudaStream_t stream); -void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream); -void launch_param_update_half(const float* input, __half* output, int size, cudaStream_t stream); - void launch_token_sort(int32_t* indices, int layers, int batch_size, diff --git a/csrc/includes/ds_kernel_utils.h b/csrc/includes/ds_kernel_utils.h index 8e4888109fcd..f8b16ee6a315 100644 --- a/csrc/includes/ds_kernel_utils.h +++ b/csrc/includes/ds_kernel_utils.h @@ -23,7 +23,7 @@ used throughout the codebase. 
#ifdef __HIP_PLATFORM_AMD__ // constexpr variant of warpSize for templating -constexpr int hw_warp_size = 64; +constexpr int hw_warp_size = ROCM_WAVEFRONT_SIZE; #define HALF_PRECISION_AVAILABLE = 1 #include #include diff --git a/csrc/includes/feed_forward.h b/csrc/includes/feed_forward.h index 46e3ba748d52..d2056403d265 100644 --- a/csrc/includes/feed_forward.h +++ b/csrc/includes/feed_forward.h @@ -48,7 +48,9 @@ class FeedForward { weights, input_ptr, out, -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(config_.gemm_algos[0])); #else cublasGemmAlgo_t(config_.gemm_algos[0])); @@ -77,7 +79,8 @@ class FeedForward { input_ptr, out_grad, weights_grad, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(config_.gemm_algos[1])); #else cublasGemmAlgo_t(config_.gemm_algos[1])); @@ -94,7 +97,8 @@ class FeedForward { weights, out_grad, inp_grad_out, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(config_.gemm_algos[2])); #else cublasGemmAlgo_t(config_.gemm_algos[2])); diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h index 278515174523..de5b55cd3df1 100644 --- a/csrc/includes/gemm_test.h +++ b/csrc/includes/gemm_test.h @@ -67,7 +67,9 @@ class GemmTest { B, A, C, -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -86,7 +88,8 @@ class GemmTest { A, C, B, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -105,7 +108,8 @@ class GemmTest { B, C, A, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -121,8 +125,11 @@ class GemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; +#elif defined(__HIP_PLATFORM_AMD__) + for (int algo = (int)HIPBLAS_GEMM_DEFAULT; algo <= (int)HIPBLAS_GEMM_DEFAULT; #else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; @@ -211,7 +218,8 @@ class StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -245,7 +253,8 @@ class StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -276,7 +285,8 @@ class 
StridedGemmTest { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) static_cast(algo)); #else static_cast(algo)); @@ -292,11 +302,17 @@ class StridedGemmTest { float fast_latency = (std::numeric_limits::max)(); int fast_algo = 0; -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard; +#else +#ifdef __HIP_PLATFORM_AMD__ + for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; + algo <= (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; #else for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP; algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP; +#endif #endif algo++) { int warm_up = 5; diff --git a/csrc/includes/memory_access_utils.h b/csrc/includes/memory_access_utils.h index 6789714d27c7..cb990b75bbe8 100644 --- a/csrc/includes/memory_access_utils.h +++ b/csrc/includes/memory_access_utils.h @@ -868,6 +868,35 @@ __device__ __forceinline__ void store_global<4, StorePolicy::CacheStreaming>(voi #endif } +template <> +__device__ __forceinline__ void store_global<2>(void* dst, const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + +template <> +__device__ __forceinline__ void store_global<2, StorePolicy::CacheGlobal>(void* dst, + const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + +template <> +__device__ __forceinline__ void store_global<2, StorePolicy::CacheStreaming>(void* dst, + const void* src) +{ + const int16_t* data = reinterpret_cast(src); + + int16_t* dst_cast = reinterpret_cast(dst); + dst_cast[0] = data[0]; +} + /////////// Store Shared /////////// template <> diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h index 45828832d8d2..5bdc96061a31 100644 --- a/csrc/includes/quantization.h +++ b/csrc/includes/quantization.h @@ -52,6 +52,36 @@ void launch_swizzled_quant(int8_t* q_data, int devices_per_node, cudaStream_t stream); +void launch_loco_swizzled_quant(int8_t* quantized_data, + float* quantized_scales, + const __half* uncompressed_data, + __half* error_feedback, + const float err_beta, + int num_bits, + quantize::Type quant_type, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream); + +void launch_loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int num_gpus, + int num_bits, + quantize::Type quant_type, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream); + void launch_dequant_reduce(int8_t* reduced_data, float* reduced_scales, const int8_t* input_data, diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h index 26db86ec1e0b..61630d0aae57 100644 --- a/csrc/includes/quantization_utils.h +++ b/csrc/includes/quantization_utils.h @@ -24,6 +24,7 @@ constexpr int max_threads = 1024; Class to hold the quantization parameters for a given tensor. Holds the implementation of the quantization operation. 
*/ + template class Params { public: diff --git a/csrc/includes/reduction_utils.h b/csrc/includes/reduction_utils.h index eb8efab77ac1..eb9afb66a894 100644 --- a/csrc/includes/reduction_utils.h +++ b/csrc/includes/reduction_utils.h @@ -159,6 +159,12 @@ DS_D_INLINE float element(const float lhs, const float rhs) return lhs + rhs; } +template <> +DS_D_INLINE double element(const double lhs, const double rhs) +{ + return lhs + rhs; +} + template <> DS_D_INLINE float element(const float lhs, const float rhs) { @@ -189,6 +195,19 @@ DS_D_INLINE __half element(const __half lhs, const __half rhs) #endif } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 element(const __nv_bfloat16 lhs, const __nv_bfloat16 rhs) +{ +#if __CUDA_ARCH__ >= 800 + // Intrinsic limited to Ampere + newer + return __hmax(lhs, rhs); +#else + return (lhs > rhs) ? lhs : rhs; +#endif +} +#endif + template <> DS_D_INLINE __half element(const __half lhs, const __half rhs) { @@ -220,6 +239,21 @@ DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) #endif } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat162 element(const __nv_bfloat162 lhs, const __nv_bfloat162 rhs) +{ +#if __CUDA_ARCH__ >= 800 + return __hmax2(lhs, rhs); +#else + __nv_bfloat162 ret_val; + ret_val.x = (lhs.x > rhs.x) ? lhs.x : rhs.x; + ret_val.y = (lhs.y > rhs.y) ? lhs.y : rhs.y; + return ret_val; +#endif +} +#endif + template <> DS_D_INLINE __half2 element(const __half2 lhs, const __half2 rhs) { @@ -295,6 +329,11 @@ DS_D_INLINE float init() { return 0.0f; } +template <> +DS_D_INLINE double init() +{ + return (double)0.0f; +} template <> DS_D_INLINE float init() @@ -331,6 +370,15 @@ DS_D_INLINE __half init() return __half(neg_inf); } +#ifdef BF16_AVAILABLE +template <> +DS_D_INLINE __nv_bfloat16 init() +{ + constexpr __nv_bfloat16_raw neg_inf = {0xFF80}; + return __nv_bfloat16(neg_inf); +} +#endif + template <> DS_D_INLINE __half2 init() { diff --git a/csrc/includes/simd.h b/csrc/includes/simd.h index f77568be7835..a205026ec7c1 100644 --- a/csrc/includes/simd.h +++ b/csrc/includes/simd.h @@ -12,8 +12,22 @@ #define TILE (128 * 1024 * 1024) #if defined(__AVX512__) or defined(__AVX256__) +#include -#define ROUND_DOWN(size, step) ((size) & ~((step)-1)) +template +inline T readAs(const void* src) +{ + T res; + std::memcpy(&res, src, sizeof(T)); + return res; +} +template +inline void writeAs(void* dst, const T& val) +{ + std::memcpy(dst, &val, sizeof(T)); +} + +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) #if defined(__AVX512__) #define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) @@ -30,11 +44,52 @@ #define SIMD_XOR(x, y) _mm512_xor_ps(x, y) #define SIMD_WIDTH 16 -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) -#define SIMD_STORE2(x, d, h) \ - ((h) ? 
_mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm512_storeu_ps(x, d)) +static __m512 load_16_bf16_as_f32(const void* data) +{ + __m256i a = readAs<__m256i>(data); // use memcpy to avoid aliasing + __m512i b = _mm512_cvtepu16_epi32(a); // convert 8 u16 to 8 u32 + __m512i c = _mm512_slli_epi32(b, 16); // logical shift left of all u32 by + // 16 bits (representing bf16->f32) + return readAs<__m512>(&c); // use memcpy to avoid aliasing +} + +static void store_16_f32_as_bf16_nearest(__m512 v, void* data) +{ + __m512i u32 = readAs<__m512i>(&v); + + // flow assuming non-nan: + + // uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + __m512i b = _mm512_srli_epi32(u32, 16); + __m512i lsb_mask = _mm512_set1_epi32(0x00000001); + __m512i c = _mm512_and_si512(b, lsb_mask); + __m512i bias_constant = _mm512_set1_epi32(0x00007fff); + __m512i rounding_bias = _mm512_add_epi32(c, bias_constant); + + // uint16_t res = static_cast((U32 + rounding_bias) >> 16); + __m512i d = _mm512_add_epi32(u32, rounding_bias); + __m512i e = _mm512_srli_epi32(d, 16); + __m256i non_nan_res = _mm512_cvtusepi32_epi16(e); + + // handle nan (exp is all 1s and mantissa != 0) + // if ((x & 0x7fffffffU) > 0x7f800000U) + __m512i mask_out_sign = _mm512_set1_epi32(0x7fffffff); + __m512i non_sign_bits = _mm512_and_si512(u32, mask_out_sign); + __m512i nan_threshold = _mm512_set1_epi32(0x7f800000); + __mmask16 nan_mask = _mm512_cmp_epi32_mask(non_sign_bits, nan_threshold, _MM_CMPINT_GT); + + // mix in results with nans as needed + __m256i nans = _mm256_set1_epi16(0x7fc0); + __m256i res = _mm256_mask_mov_epi16(non_nan_res, nan_mask, nans); + + writeAs(data, res); +} +#define SIMD_LOAD_BF16(x) load_16_bf16_as_f32(x) +#define SIMD_STORE_BF16(x, d) store_16_f32_as_bf16_nearest(d, x) + +#define SIMD_LOAD_FP16(x) _mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) +#define SIMD_STORE_FP16(x, d) \ + _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) #define INTV __m256i #elif defined(__AVX256__) @@ -52,11 +107,11 @@ #define SIMD_XOR(x, y) _mm256_xor_ps(x, y) #define SIMD_WIDTH 8 -#define SIMD_LOAD2(x, h) \ - ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) -#define SIMD_STORE2(x, d, h) \ - ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ - : _mm256_storeu_ps(x, d)) +#define SIMD_LOAD_BF16(x) static_assert(false && "AVX256 does not support BFloat16") +#define SIMD_STORE_BF16(x, d) static_assert(false && "AVX256 does not support BFloat16") +#define SIMD_LOAD_FP16(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) +#define SIMD_STORE_FP16(x, d) \ + _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) #define INTV __m128i #endif @@ -70,20 +125,66 @@ union AVX_Data { // float data_f[16]; }; -template -inline void simd_store(float* dst, AVX_Data* src, bool half_precision) +template +inline typename std::enable_if_t, void> simd_store(T* dst, + AVX_Data* src) { - size_t width = (half_precision ? 
SIMD_WIDTH / 2 : SIMD_WIDTH); + size_t width = SIMD_WIDTH; #pragma unroll - for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } + for (size_t i = 0; i < span; ++i) { SIMD_STORE_FP16((float*)(dst + width * i), src[i].data); } } -template -inline void simd_load(AVX_Data* dst, float* src, bool half_precision) + +template +inline typename std::enable_if_t, void> simd_store(T* dst, + AVX_Data* src) { - size_t width = (half_precision ? 1 : SIMD_WIDTH); +#ifdef __AVX512__ + size_t width = SIMD_WIDTH; #pragma unroll - for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } + for (size_t i = 0; i < span; ++i) { SIMD_STORE_BF16((float*)(dst + width * i), src[i].data); } +#else + throw std::runtime_error("AVX512 required for BFloat16"); +#endif +} + +template +inline typename std::enable_if_t, void> simd_store(T* dst, AVX_Data* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { SIMD_STORE(dst + width * i, src[i].data); } } + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, + T* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD_FP16((float*)(src + width * i)); } +} + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, + T* src) +{ +#ifdef __AVX512__ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD_BF16((float*)(src + width * i)); } +#else + throw std::runtime_error("AVX512 required for BFloat16"); +#endif +} + +template +inline typename std::enable_if_t, void> simd_load(AVX_Data* dst, T* src) +{ + size_t width = SIMD_WIDTH; +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD(src + width * i); } +} + template inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) { diff --git a/csrc/includes/strided_batch_gemm.h b/csrc/includes/strided_batch_gemm.h index 86d1e3dea11a..9767fcf589b8 100644 --- a/csrc/includes/strided_batch_gemm.h +++ b/csrc/includes/strided_batch_gemm.h @@ -77,7 +77,9 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(_config.gemm_algos[0])); #else cublasGemmAlgo_t(_config.gemm_algos[0])); @@ -105,7 +107,8 @@ class StridedBatchGemm { stride_b, stride_c, _config.batch_size, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(_config.gemm_algos[0])); #else cublasGemmAlgo_t(_config.gemm_algos[0])); @@ -149,7 +152,8 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(_config.gemm_algos[1])); #else cublasGemmAlgo_t(_config.gemm_algos[1])); @@ -178,7 +182,8 @@ class StridedBatchGemm { stride_b, stride_c, bsz, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo(_config.gemm_algos[2])); #else cublasGemmAlgo_t(_config.gemm_algos[2])); diff --git a/csrc/lion/cpu_lion.cpp 
b/csrc/lion/cpu_lion.cpp index a0562eac9c4a..c5cf3e9e9235 100644 --- a/csrc/lion/cpu_lion.cpp +++ b/csrc/lion/cpu_lion.cpp @@ -8,9 +8,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("lion_update", &ds_lion_step, "DeepSpeed CPU Lion update (C++)"); - m.def("lion_update_copy", - &ds_lion_step_plus_copy, - "DeepSpeed CPU Lion update and param copy (C++)"); m.def("create_lion", &create_lion_optimizer, "DeepSpeed CPU Lion (C++)"); m.def("destroy_lion", &destroy_lion_optimizer, "DeepSpeed CPU Lion destroy (C++)"); } diff --git a/csrc/lion/cpu_lion_impl.cpp b/csrc/lion/cpu_lion_impl.cpp index 28314cf5b6e1..6a98162314f9 100644 --- a/csrc/lion/cpu_lion_impl.cpp +++ b/csrc/lion/cpu_lion_impl.cpp @@ -6,34 +6,28 @@ #include #include #include +#include #include +#include #include #include #include #include "cpu_lion.h" -#if defined(__ENABLE_CUDA__) -#include -#include "cublas_v2.h" -#include "cuda.h" -#include "curand.h" -#include "custom_cuda_layers.h" -#endif - +using namespace std::string_literals; static std::unordered_map> s_optimizers; // C++ interface -void Lion_Optimizer::Step_1(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_1(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<1>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) { float betta1_minus1 = 1 - _betta1; @@ -41,26 +35,15 @@ void Lion_Optimizer::Step_1(float* _params, float alpha = _alpha; float after_decay = 1 - alpha * _weight_decay; - ds_half_precision_t* grads_cast_h; - ds_half_precision_t* params_cast_h; - if (half_precision) { - grads_cast_h = reinterpret_cast(grads); - params_cast_h = reinterpret_cast(_params); - } for (size_t t = rounded_size; t < _param_size; t += TILE) { size_t copy_size = TILE; if ((t + TILE) > _param_size) copy_size = _param_size - t; size_t offset = copy_size + t; -#if defined(__ENABLE_CUDA__) - if ((t / TILE) >= 2) { cudaStreamSynchronize(_streams[_buf_index]); } -#elif defined(__ENABLE_CANN__) - if ((t / TILE) >= 2) { aclrtSynchronizeStream(_streams[_buf_index].stream()); } -#endif #pragma omp parallel for for (size_t k = t; k < offset; k++) { - float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; - float param = half_precision ? 
(float)params_cast_h[k] : _params[k]; + float grad = (float)grads[k]; + float param = (float)_params[k]; float momentum = _exp_avg[k]; float tmp = momentum * _betta1; tmp = grad * betta1_minus1 + tmp; @@ -74,56 +57,28 @@ void Lion_Optimizer::Step_1(float* _params, } momentum = momentum * _betta2; momentum = grad * betta2_minus1 + momentum; -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - if (dev_params) _doubled_buffer[_buf_index][k - t] = param; -#endif - if (half_precision) - params_cast_h[k] = (ds_half_precision_t)param; - else - _params[k] = param; + _params[k] = param; _exp_avg[k] = momentum; } -#if defined(__ENABLE_CUDA__) - if (dev_params) { - launch_param_update( - _doubled_buffer[_buf_index], dev_params + t, (copy_size), _streams[_buf_index]); - - _buf_index = !_buf_index; - } -#elif defined(__ENABLE_CANN__) - if (dev_params) { - size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]); - aclrtMemcpy(dev_params + t, - memcpy_size, - _doubled_buffer[_buf_index], - memcpy_size, - aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE); - - _buf_index = !_buf_index; - } -#endif } } } -void Lion_Optimizer::Step_4(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_4(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<4>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) Step_1((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); } int create_lion_optimizer(int optimizer_id, @@ -162,24 +117,76 @@ int create_lion_optimizer(int optimizer_id, return 0; } -void Lion_Optimizer::Step_8(float* _params, - float* grads, - float* _exp_avg, - size_t _param_size, - ds_half_precision_t* dev_params, - bool half_precision) +template +void Lion_Optimizer::Step_8(ds_params_precision_t* _params, + ds_params_precision_t* grads, + ds_state_precision_t* _exp_avg, + size_t _param_size) { size_t rounded_size = 0; #if defined(__AVX512__) or defined(__AVX256__) - Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _param_size, dev_params, half_precision); + Step_AVX<8>(&rounded_size, _params, grads, _exp_avg, _param_size); #endif if (_param_size > rounded_size) Step_4((_params + rounded_size), (grads + rounded_size), (_exp_avg + rounded_size), - (_param_size - rounded_size), - (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), - half_precision); + (_param_size - rounded_size)); +} + +template +void step_invoker(std::shared_ptr opt, + void* _params, + void* grads, + void* _exp_avg, + size_t _param_size) +{ + opt->Step_8((ds_params_precision_t*)(_params), + (ds_params_precision_t*)(grads), + (ds_state_precision_t*)(_exp_avg), + _param_size); +} + +std::map, + std::function, void*, void*, void*, size_t)>> + invokers; + +// Fill map with template functions for each type +template +void create_invoker() +{ + invokers[std::tuple(c10::CppTypeToScalarType(), + c10::CppTypeToScalarType())] = + step_invoker; +} +struct InvokerInitializer { + InvokerInitializer() + { + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + create_invoker(); + } +} _invoker_initializer; + +void invoke(std::shared_ptr opt, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + size_t param_size) +{ + c10::ScalarType params_type = at::typeMetaToScalarType(params.options().dtype()); + c10::ScalarType state_type = at::typeMetaToScalarType(exp_avg.options().dtype()); + + auto it = invokers.find(std::tuple(params_type, state_type)); + if (it == invokers.end()) { + throw std::runtime_error("Lion optimizer with param type "s + c10::toString(params_type) + + " and state type "s + c10::toString(state_type) + + " is not supported on current hardware"s); + } + + it->second(opt, params.data_ptr(), grads.data_ptr(), exp_avg.data_ptr(), param_size); } int ds_lion_step(int optimizer_id, @@ -196,67 +203,13 @@ int ds_lion_step(int optimizer_id, auto grads_c = grads.contiguous(); auto exp_avg_c = exp_avg.contiguous(); - // assert(params.options().dtype() == grads.options().dtype()); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - std::shared_ptr opt = std::static_pointer_cast(s_optimizers[optimizer_id]); opt->IncrementStep(step, beta1, beta2); opt->update_state(lr, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - params_c.numel(), - nullptr, - (params.options().dtype() == at::kHalf)); + invoke(opt, params_c, grads_c, exp_avg_c, params_c.numel()); -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - opt->SynchronizeStreams(); -#endif - return 0; -} - -int ds_lion_step_plus_copy(int optimizer_id, - size_t step, - float lr, - float beta1, - float beta2, - float weight_decay, - torch::Tensor& params, - torch::Tensor& grads, - torch::Tensor& exp_avg, - torch::Tensor& gpu_params) -{ -#if defined(__ENABLE_CUDA__) or defined(__ENABLE_CANN__) - auto params_c = params.contiguous(); - auto gpu_params_c = gpu_params.contiguous(); - auto exp_avg_c = exp_avg.contiguous(); - auto grads_c = grads.contiguous(); - - float* params_ptr = (float*)params_c.data_ptr(); - float* grads_ptr = (float*)grads_c.data_ptr(); - ds_half_precision_t* gpu_params_ptr = (ds_half_precision_t*)gpu_params_c.data_ptr(); - float* exp_avg_ptr = (float*)exp_avg_c.data_ptr(); - - std::shared_ptr opt = - std::static_pointer_cast(s_optimizers[optimizer_id]); - opt->IncrementStep(step, beta1, beta2); - opt->update_state(lr, weight_decay); - opt->Step_8(params_ptr, - grads_ptr, - exp_avg_ptr, - params_c.numel(), - gpu_params_ptr, - (params.options().dtype() == at::kHalf)); - - opt->SynchronizeStreams(); -#else - assert(false); -#endif return 0; } diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp index a4210897092d..b48eaacd0881 100644 --- 
a/csrc/quantization/pt_binding.cpp +++ b/csrc/quantization/pt_binding.cpp @@ -176,6 +176,53 @@ at::Tensor dequantize_int8_to_half_experimental(at::Tensor& data_in, return output; } +std::vector ds_loco_swizzle_quant(at::Tensor& input_vals, + at::Tensor& error_feedback, + float err_beta, + int groups, + int num_bits, + quantize::Type quant_type, + int pipeline_size, + int nodes, + int devices_per_node) +{ + auto scales_options = at::TensorOptions() + .dtype(at::kFloat) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1; + auto scales = torch::empty({groups, scales_elems}, scales_options); + + auto output_options = at::TensorOptions() + .dtype(at::kChar) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + const int quantization_scalar = 8 / num_bits; + const int compressed_vals = at::numel(input_vals) / quantization_scalar; + + auto output = torch::empty({compressed_vals}, output_options); + const int elems_per_group = at::numel(input_vals) / groups; + + launch_loco_swizzled_quant(reinterpret_cast(output.data_ptr()), + reinterpret_cast(scales.data_ptr()), + reinterpret_cast(input_vals.data_ptr()), + reinterpret_cast<__half*>(error_feedback.data_ptr()), + err_beta, + num_bits, + quant_type, + groups, + elems_per_group, + pipeline_size, + nodes, + devices_per_node, + at::cuda::getCurrentCUDAStream()); + + return {output, scales}; +} + std::vector ds_swizzle_quant(at::Tensor& input_vals, int groups, int num_bits, @@ -241,7 +288,7 @@ std::vector quantized_reduction(at::Tensor& input_vals, .device(at::kCUDA) .requires_grad(false); - std::vector sz(input_vals.sizes().begin(), input_vals.sizes().end()); + std::vector sz(input_vals.sizes().begin(), input_vals.sizes().end()); sz[sz.size() - 1] = sz.back() / devices_per_node; // num of GPU per nodes const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node; auto output = torch::empty(sz, output_options); @@ -265,6 +312,61 @@ std::vector quantized_reduction(at::Tensor& input_vals, return {output, scales}; } +std::vector loco_quantized_reduction(at::Tensor& input_vals, + at::Tensor& input_scales, + at::Tensor& error_feedback, + float err_beta, + int in_groups, + int out_groups, + int num_bits, + quantize::Type quant_type, + int devices_per_node) +{ + auto scales_options = at::TensorOptions() + .dtype(at::kFloat) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + const int scales_elems = (quantize::requires_offset(quant_type)) ? 
2 : 1; + + auto scales = torch::empty({out_groups, scales_elems}, scales_options); + + auto output_options = at::TensorOptions() + .dtype(at::kChar) + .layout(at::kStrided) + .device(at::kCUDA) + .requires_grad(false); + + std::vector sz(input_vals.sizes().begin(), input_vals.sizes().end()); + sz[sz.size() - 1] = sz.back() / devices_per_node; + + const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node; + + auto output = torch::empty(sz, output_options); + + const int elems_per_in_group = elems_per_in_tensor / (in_groups / devices_per_node); + const int elems_per_out_group = elems_per_in_tensor / out_groups; + + launch_loco_dequant_reduce((int8_t*)output.data_ptr(), + (float*)scales.data_ptr(), + (const int8_t*)input_vals.data_ptr(), + (const float*)input_scales.data_ptr(), + devices_per_node, + num_bits, + quant_type, + out_groups, + elems_per_out_group, + elems_per_in_tensor, + in_groups / devices_per_node, + elems_per_in_group, + (__half2*)error_feedback.data_ptr(), + err_beta, + at::cuda::getCurrentCUDAStream()); + + return {output, scales}; +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("ds_quantize_fp32", &ds_quantize, "DeepSpeed Quantize with fp32 (CUDA)"); @@ -295,4 +397,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "Dequantize int8 to half (experimental)"); m.def("swizzle_quant", &ds_swizzle_quant); m.def("quantized_reduction", &quantized_reduction); + m.def("loco_swizzle_quant", &ds_loco_swizzle_quant, "LoCo Swizzled Quantization Kernel"); + m.def("loco_quantized_reduction", + &loco_quantized_reduction, + "LoCo Quantization and Reduction Kernel"); } diff --git a/csrc/quantization/quant_reduce.cu b/csrc/quantization/quant_reduce.cu index 26db1118c831..4100c5174b80 100644 --- a/csrc/quantization/quant_reduce.cu +++ b/csrc/quantization/quant_reduce.cu @@ -261,3 +261,297 @@ void launch_dequant_reduce(int8_t* reduced_data, } } } + +/* +Modified loco_dequant_reduce function that performs dequantization and reduction, +and incorporates error-feedback by updating the error_feedback tensor in-place. +*/ + +template +__global__ void __launch_bounds__(1024) loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + int num_tensors, + __half2* error_feedback, + const float err_beta) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + constexpr int mem_granularity = (numBits == 8) ? 
8 : 4; + constexpr int elems_per_load = mem_granularity / sizeof(int8_t); + constexpr int storage_values = 16 / sizeof(__half2); + + const int block_offset = tb.group_index().x * elems_per_out_group; + const int elem_offset = tb.thread_index().x * elems_per_load; + const int base_offset = block_offset + elem_offset; + const int stride = tb.group_dim().x * elems_per_load; + + constexpr int scaling_factor = elems_per_load / storage_values; + const int block_offset_err = block_offset / scaling_factor; + const int elem_offset_err = tb.thread_index().x * storage_values; + const int base_offset_err = block_offset_err + elem_offset_err; + const int stride_err = tb.group_dim().x * storage_values; + + __half2 local_buffer[totalChunks * storage_values]; + __half2 err_buffer[totalChunks * storage_values]; + + quantize::GroupStats stats; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + __half2* iteration_buffer = local_buffer + i * storage_values; + __half2* iter_err_buffer = err_buffer + i * storage_values; + +#pragma unroll + for (int j = 0; j < storage_values; j++) { + iteration_buffer[j] = reduce::init(); + } + + const int iter_offset = i * stride + base_offset; + const int iter_offset_err = i * stride_err + base_offset_err; + const int iter_scale_idx = iter_offset / elems_per_in_group; + bool do_loads = i * stride + elem_offset < elems_per_out_group; + + if (numTensors > 0) { +#pragma unroll + for (int j = 0; j < numTensors; j++) { + if (do_loads) { + int8_t load_buffer[elems_per_load]; + + mem_access::load_global( + load_buffer, input_data + j * elems_per_in_tensor + iter_offset); + + quantize::Params params( + input_scales + j * groups_per_in_tensor, iter_scale_idx); + + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, load_buffer, params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = + reduce::element(iteration_buffer[k], dequant_buffer[k]); + } + } + } + } else { +#pragma unroll 4 + for (int j = 0; j < num_tensors; j++) { + if (do_loads) { + int8_t load_buffer[elems_per_load]; + + mem_access::load_global( + load_buffer, input_data + j * elems_per_in_tensor + iter_offset); + + quantize::Params params( + input_scales + j * groups_per_in_tensor, iter_scale_idx); + + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, load_buffer, params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = + reduce::element(iteration_buffer[k], dequant_buffer[k]); + } + } + } + } + mem_access::load_global( + iter_err_buffer, error_feedback + iter_offset_err, do_loads); +#pragma unroll + for (int k = 0; k < storage_values; k++) { + iteration_buffer[k] = __hadd2(iteration_buffer[k], iter_err_buffer[k]); + stats.update(iteration_buffer[k]); + } + } + + auto params = stats.template get_params(tb, warp); + + // Initialize dequantization parameters based on params + auto de_params = params; + de_params.scale = 1.0f / params.scale; + if constexpr (quantType == quantize::Type::Asymmetric) { de_params.offset = params.offset; } + + if (tb.thread_index().x == 0) { params.store(reduced_scales, tb.group_index().x); } + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + const int iter_offset = i * stride + base_offset; + const int iter_offset_err = i * stride_err + base_offset_err; + __half2* iteration_buffer = local_buffer + i * storage_values; + __half2* iter_err_buffer = err_buffer + i * storage_values; + + if (i * stride + elem_offset < elems_per_out_group) { + // 
----------- Begin Error-Feedback Modification ----------- + int8_t local_output[elems_per_load]; + quantize::_chunk(local_output, iteration_buffer, params); + mem_access::store_global(reduced_data + iter_offset, local_output); + + // Dequantize the quantized output to compute the dequantized value + __half2 dequant_buffer[storage_values]; + dequantize::chunk(dequant_buffer, local_output, de_params); + +#pragma unroll + for (int k = 0; k < storage_values; k++) { + // __half2 to float2 + float2 iter_buf_f = __half22float2(iteration_buffer[k]); + float2 dequant_buf_f = __half22float2(dequant_buffer[k]); + + // Update within float precision + float2 new_error_f; + new_error_f.x = iter_buf_f.x - dequant_buf_f.x; + new_error_f.y = iter_buf_f.y - dequant_buf_f.y; + + float2 iter_err_buf_f = __half22float2(iter_err_buffer[k]); + + iter_err_buf_f.x = err_beta * iter_err_buf_f.x + (1.0f - err_beta) * new_error_f.x; + iter_err_buf_f.y = err_beta * iter_err_buf_f.y + (1.0f - err_beta) * new_error_f.y; + + // float2 back to __half2 + iter_err_buffer[k] = __float22half2_rn(iter_err_buf_f); + } + mem_access::store_global(error_feedback + iter_offset_err, + iter_err_buffer); + } + } +} + +#define LAUNCH_LOCO_DEQUANT_REDUCE(num_chunks) \ + loco_dequant_reduce \ + <<>>(reduced_data, \ + reduced_scales, \ + input_data, \ + input_scales, \ + elems_per_out_group, \ + elems_per_in_tensor, \ + groups_per_in_tensor, \ + elems_per_in_group, \ + num_tensors, \ + error_feedback, \ + err_beta); + +template +void launch_loco_dequant_reduce_impl(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + int num_tensors, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream) +{ + constexpr int elems_per_thread = numBits; + const int one_step_threads = + next_pow2((elems_per_out_group + elems_per_thread - 1) / (elems_per_thread)); + const int threads = (one_step_threads < 1024) ? one_step_threads : 1024; + + dim3 block(threads); + dim3 grid(out_groups); + + const int elems_per_step = threads * elems_per_thread; + const int unroll_raw = (elems_per_out_group + elems_per_step - 1) / elems_per_step; + + const int unroll = (unroll_raw >= 4) ? 
pow2_round<1>(unroll_raw) : unroll_raw; + + if (unroll == 1) { + LAUNCH_LOCO_DEQUANT_REDUCE(1); + } else if (unroll == 2) { + LAUNCH_LOCO_DEQUANT_REDUCE(2); + } else if (unroll == 3) { + LAUNCH_LOCO_DEQUANT_REDUCE(3); + } else if (unroll == 4) { + LAUNCH_LOCO_DEQUANT_REDUCE(4); + } else if (unroll == 6) { + LAUNCH_LOCO_DEQUANT_REDUCE(6); + } else if (unroll == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE(8); + } else if (unroll == 10) { + LAUNCH_LOCO_DEQUANT_REDUCE(10); + } else if (unroll == 12) { + LAUNCH_LOCO_DEQUANT_REDUCE(12); + } else { + assert(false); + } +} + +#define LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(NUM_BITS, NUM_GPUS, QUANT_TYPE) \ + launch_loco_dequant_reduce_impl(reduced_data, \ + reduced_scales, \ + input_data, \ + input_scales, \ + out_groups, \ + elems_per_out_group, \ + elems_per_in_tensor, \ + groups_per_in_tensor, \ + elems_per_in_group, \ + num_gpus, \ + error_feedback, \ + err_beta, \ + stream); + +void launch_loco_dequant_reduce(int8_t* reduced_data, + float* reduced_scales, + const int8_t* input_data, + const float* input_scales, + int num_gpus, + int num_bits, + quantize::Type quant_type, + int out_groups, + int elems_per_out_group, + int elems_per_in_tensor, + int groups_per_in_tensor, + int elems_per_in_group, + __half2* error_feedback, + const float err_beta, + cudaStream_t stream) +{ + if (quant_type == quantize::Type::Symmetric) { + if (num_bits == 4) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 8, quantize::Type::Symmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 16, quantize::Type::Symmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, -1, quantize::Type::Symmetric); + } + } else if (num_bits == 8) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 8, quantize::Type::Symmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 16, quantize::Type::Symmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, -1, quantize::Type::Symmetric); + } + } + } else if (quant_type == quantize::Type::Asymmetric) { + if (num_bits == 4) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 8, quantize::Type::Asymmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, 16, quantize::Type::Asymmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(4, -1, quantize::Type::Asymmetric); + } + } else if (num_bits == 8) { + if (num_gpus == 8) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 8, quantize::Type::Asymmetric); + } else if (num_gpus == 16) { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, 16, quantize::Type::Asymmetric); + } else { + LAUNCH_LOCO_DEQUANT_REDUCE_IMPL(8, -1, quantize::Type::Asymmetric); + } + } + } +} diff --git a/csrc/quantization/swizzled_quantize.cu b/csrc/quantization/swizzled_quantize.cu index 5a02a0ae8120..a4b6096c81af 100644 --- a/csrc/quantization/swizzled_quantize.cu +++ b/csrc/quantization/swizzled_quantize.cu @@ -3,6 +3,7 @@ // DeepSpeed Team +#include "dequantization_utils.h" #include "memory_access_utils.h" #include "quantization_utils.h" #include "reduction_utils.h" @@ -194,3 +195,233 @@ void launch_swizzled_quant(int8_t* q_data, } } } + +template +__global__ void loco_swizzled_quant_kernel(int8_t* quantized_data, + float* quantized_scales, + const __half* uncompressed_data, + __half* error_feedback, + const float err_beta, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node) +{ + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // Indexing offsets, same as normal 
quantization for in-case + const int block_rank_data = + blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y; + const int block_offset_data = block_rank_data * elems_per_group; + const int elem_offset = tb.thread_index().x * quantize::h_per_load; + const int base_offset_data = block_offset_data + elem_offset; + const int stride = tb.size() * quantize::h_per_load; + const __half* uncompressed_data_base = uncompressed_data + base_offset_data; + + const int partition_id = blockIdx.z; + const int partition_offset = partition_id / devices_per_node; + const int partition_base = (partition_id % devices_per_node) * nodes; + const int pipelining_offset = blockIdx.y * (devices_per_node * nodes); + const int output_partition = (pipelining_offset + partition_base + partition_offset); + const int block_rank_err = output_partition * gridDim.x + blockIdx.x; + + const int block_offset_err = block_rank_err * elems_per_group; + const int base_offset_err = block_offset_err + elem_offset; + __half* error_feedback_base = error_feedback + base_offset_err; + + __half2 local_buffer[totalChunks * quantize::h2_per_load]; + __half2 err_buffer[totalChunks * quantize::h2_per_load]; + + quantize::GroupStats stats; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + __half2* iteration_buffer = local_buffer + i * quantize::h2_per_load; + __half2* iter_err_buffer = err_buffer + i * quantize::h2_per_load; + const int i_stride = i * stride; + bool do_loads = (elem_offset + i_stride) < elems_per_group; + + mem_access::load_global( + iteration_buffer, uncompressed_data_base + i_stride, do_loads); + + mem_access::load_global( + iter_err_buffer, error_feedback_base + i_stride, do_loads); + +#pragma unroll + for (int j = 0; j < quantize::h2_per_load; j++) { + iteration_buffer[j] = __hadd2(iteration_buffer[j], iter_err_buffer[j]); + stats.update(iteration_buffer[j]); + } + } + + auto params = stats.template get_params(tb, warp); + + // Initialize dequantization parameters based on params + auto de_params = params; + de_params.scale = 1.0f / params.scale; + if constexpr (quantType == quantize::Type::Asymmetric) { de_params.offset = params.offset; } + + if (threadIdx.x == 0) { params.store(quantized_scales, block_rank_err); } + + constexpr int out_scalar_effect = 8 / numBits; + const int out_block_offset = block_rank_err * elems_per_group / out_scalar_effect; + const int out_base_offset = out_block_offset + elem_offset / out_scalar_effect; + int8_t* out_base = quantized_data + out_base_offset; + + const int out_stride = stride / out_scalar_effect; + constexpr int num_int8_out = quantize::h_per_load / out_scalar_effect; + +#pragma unroll + for (int i = 0; i < totalChunks; i++) { + const int i_stride = i * stride; + __half2* iteration_buffer = local_buffer + i * quantize::h2_per_load; + __half2* iter_err_buffer = err_buffer + i * quantize::h2_per_load; + + if (i_stride + elem_offset < elems_per_group) { + int8_t local_output[quantize::h_per_load / out_scalar_effect]; + quantize::_chunk(local_output, iteration_buffer, params); + mem_access::store_global(out_base + i * out_stride, local_output); + + // Dequantize the quantized output to compute the dequantized value + __half2 dequant_buffer[quantize::h2_per_load]; + dequantize::chunk(dequant_buffer, local_output, de_params); + +// Compute new error: sum - dequant_buffer +#pragma unroll + for (int k = 0; k < quantize::h2_per_load; k++) { + // __half2 to float2 + float2 iter_buf_f = __half22float2(iteration_buffer[k]); + float2 dequant_buf_f = 
__half22float2(dequant_buffer[k]); + + // Update within float precision + float2 new_error_f; + new_error_f.x = iter_buf_f.x - dequant_buf_f.x; + new_error_f.y = iter_buf_f.y - dequant_buf_f.y; + + float2 iter_err_buf_f = __half22float2(iter_err_buffer[k]); + + iter_err_buf_f.x = err_beta * iter_err_buf_f.x + (1.0f - err_beta) * new_error_f.x; + iter_err_buf_f.y = err_beta * iter_err_buf_f.y + (1.0f - err_beta) * new_error_f.y; + + // float2 back to __half2 + iter_err_buffer[k] = __float22half2_rn(iter_err_buf_f); + } + __half2* error_feedback_base_h2 = reinterpret_cast<__half2*>(error_feedback_base); + mem_access::store_global(error_feedback_base_h2 + i_stride / 2, + iter_err_buffer); + } + } +} + +#define LAUNCH_LOCO_SWIZZLE_QUANT(total_chunks, threads) \ + loco_swizzled_quant_kernel \ + <<>>(output_data, \ + params, \ + input_data, \ + error_feedback, \ + err_beta, \ + groups, \ + elems_per_group, \ + pipelining, \ + nodes, \ + devices_per_node); + +template +void launch_loco_swizzled_quant_impl(int8_t* output_data, + float* params, + const __half* input_data, + __half* error_feedback, + const float err_beta, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream) +{ + const int one_step_threads = + next_pow2((elems_per_group + swiz_quant::h_per_step - 1) / swiz_quant::h_per_step); + const int max_threads = (one_step_threads < swiz_quant::max_threads) ? one_step_threads + : swiz_quant::max_threads; + const int threads = (max_threads < swiz_quant::min_threads) ? swiz_quant::min_threads + : max_threads; + + dim3 block(threads); + const int groups_per_partition = groups / (nodes * devices_per_node); + assert(groups_per_partition % pipelining == 0); + const int contiguous_groups = groups_per_partition / pipelining; + const int partitions = nodes * devices_per_node; + dim3 grid(contiguous_groups, pipelining, partitions); + + const int elems_per_step = threads * swiz_quant::h_per_step; + const int external_unroll = ((elems_per_group + elems_per_step - 1) / elems_per_step); + const int total_unroll = external_unroll * swiz_quant::step_granularity; + + assert(total_unroll % 2 == 0); + + if (threads == 32) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 32); + } else if (threads == 64) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 64); + } else if (threads == 128) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 128); + } else if (threads == 256) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 256); + } else if (threads == 512) { + if (total_unroll == 2) { + LAUNCH_LOCO_SWIZZLE_QUANT(2, 512); + } else if (total_unroll == 4) { + LAUNCH_LOCO_SWIZZLE_QUANT(4, 512); + } else if (total_unroll == 6) { + LAUNCH_LOCO_SWIZZLE_QUANT(6, 512); + } else if (total_unroll == 8) { + LAUNCH_LOCO_SWIZZLE_QUANT(8, 512); + } else if (total_unroll == 10) { + LAUNCH_LOCO_SWIZZLE_QUANT(10, 512); + } + } +} + +#define DISPATCH_LOCO_SWIZZLE_QUANT(num_bits, qtype) \ + launch_loco_swizzled_quant_impl(output_data, \ + params, \ + input_data, \ + error_feedback, \ + err_beta, \ + groups, \ + elems_per_group, \ + pipelining, \ + nodes, \ + devices_per_node, \ + stream); + +void launch_loco_swizzled_quant(int8_t* output_data, + float* params, + const __half* input_data, + __half* error_feedback, + const float err_beta, + int num_bits, + quantize::Type q_type, + int groups, + int elems_per_group, + int pipelining, + int nodes, + int devices_per_node, + cudaStream_t stream) +{ + if (num_bits == 4) { + if (q_type == quantize::Type::Asymmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(4, quantize::Type::Asymmetric); + } else if (q_type == 
quantize::Type::Symmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(4, quantize::Type::Symmetric); + } + } else if (num_bits == 8) { + if (q_type == quantize::Type::Asymmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(8, quantize::Type::Asymmetric); + } else if (q_type == quantize::Type::Symmetric) { + DISPATCH_LOCO_SWIZZLE_QUANT(8, quantize::Type::Symmetric); + } + } +} diff --git a/csrc/random_ltd/token_sort.cu b/csrc/random_ltd/token_sort.cu index 3049471cfe34..3c1dff49429f 100644 --- a/csrc/random_ltd/token_sort.cu +++ b/csrc/random_ltd/token_sort.cu @@ -16,7 +16,7 @@ constexpr int mem_vals = granularity / sizeof(int32_t); constexpr int max_buffer_size = (threads + 1) * mem_vals; #ifdef __HIP_PLATFORM_AMD__ -constexpr int warp_size = 64; +constexpr int warp_size = ROCM_WAVEFRONT_SIZE; #else constexpr int warp_size = 32; #endif diff --git a/csrc/transformer/cublas_wrappers.cu b/csrc/transformer/cublas_wrappers.cu index 7821a8759ab0..d982e65b8a81 100644 --- a/csrc/transformer/cublas_wrappers.cu +++ b/csrc/transformer/cublas_wrappers.cu @@ -5,7 +5,9 @@ #include "cublas_wrappers.h" -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -33,7 +35,8 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -67,20 +70,39 @@ int cublas_gemm_ex(cublasHandle_t handle, k, (const void*)alpha, (const void*)A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (transa == CUBLAS_OP_N) ? m : k, (const void*)B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (transb == CUBLAS_OP_N) ? k : n, (const void*)beta, C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif m, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -96,7 +118,8 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -124,7 +147,8 @@ int cublas_gemm_ex(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -158,20 +182,39 @@ int cublas_gemm_ex(cublasHandle_t handle, k, (const void*)alpha, (const void*)A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif (transa == CUBLAS_OP_N) ? m : k, (const void*)B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif (transb == CUBLAS_OP_N) ? 
k : n, (const void*)beta, (void*)C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif m, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -187,7 +230,8 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -223,7 +267,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -263,24 +308,43 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, k, alpha, A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif m, stride_C, batch, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -297,7 +361,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -333,7 +398,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -373,24 +439,43 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, k, alpha, A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif (op_B == CUBLAS_OP_N) ? 
k : n, stride_B, beta, C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_16F, +#else CUDA_R_16F, +#endif m, stride_C, batch, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { diff --git a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu index a06dbb48fd33..bbb8a7f00b1f 100644 --- a/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu +++ b/csrc/transformer/inference/csrc/apply_rotary_pos_emb.cu @@ -99,7 +99,7 @@ __global__ void apply_rotary_pos_half(T* mixed_query, rope_theta, \ max_out_tokens); -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) and ROCM_WAVEFRONT_SIZE == 64 #define LAUNCH_FOR_ALIGNMENT(ALIGNMENT) \ if (threads_per_head == 4) { \ LAUNCH_ROT_POS_EMB_HALF(4, ALIGNMENT); \ diff --git a/csrc/transformer/inference/csrc/pt_binding.cpp b/csrc/transformer/inference/csrc/pt_binding.cpp index b7277d1e1678..19dbe73726f7 100644 --- a/csrc/transformer/inference/csrc/pt_binding.cpp +++ b/csrc/transformer/inference/csrc/pt_binding.cpp @@ -163,7 +163,9 @@ at::Tensor einsum_sec_sm_ecm(at::Tensor& Q, at::Tensor& W) (T*)W.data_ptr(), (T*)Q.data_ptr(), (T*)O.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -216,7 +218,8 @@ void attention_unfused(at::Tensor& prev_key_cont, seq_len * k, seq_len * soft_len, bsz * heads, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -253,7 +256,8 @@ void attention_unfused(at::Tensor& prev_key_cont, seq_len * soft_len, seq_len * k, bsz * heads, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -388,7 +392,8 @@ void attention_unfused(T* prev_key_cont, seq_len * k, seq_len * soft_len, bsz * heads, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -421,7 +426,8 @@ void attention_unfused(T* prev_key_cont, seq_len * soft_len, seq_len * k, bsz * heads, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -446,14 +452,17 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, unsigned layer_id, unsigned num_layers, at::Tensor& alibi, - float rope_theta) + float rope_theta, + bool is_prompt, + std::optional token_idx, + std::optional position_ids) { unsigned bsz = query_key_value.size(0); unsigned seq_len = query_key_value.size(1); int k = query_key_value.size(2) / (heads + 2 * 
(num_kv > 0 ? num_kv : heads)); unsigned hidden_dim = heads * k; - bool is_prompt = (seq_len > 1); + is_prompt = (seq_len > 1); if (is_prompt) InferenceContext::Instance().reset_tokens(seq_len); unsigned soft_len = InferenceContext::Instance().current_tokens(); @@ -536,22 +545,23 @@ std::vector ds_softmax_context(at::Tensor& query_key_value, 1); if (layer_id == num_layers - 1) InferenceContext::Instance().advance_tokens(); - auto prev_key = torch::from_blob(workspace + offset, - {bsz, heads, all_tokens, k}, - {hidden_dim * InferenceContext::Instance().GetMaxTokenLength(), - k * InferenceContext::Instance().GetMaxTokenLength(), - k, - 1}, - options); - - auto prev_value = - torch::from_blob(workspace + offset + value_offset, - {bsz, heads, all_tokens, k}, - {hidden_dim * InferenceContext::Instance().GetMaxTokenLength(), - k * InferenceContext::Instance().GetMaxTokenLength(), - k, - 1}, - options); + auto prev_key = torch::from_blob( + workspace + offset, + {bsz, heads, all_tokens, k}, + {hidden_dim * static_cast(InferenceContext::Instance().GetMaxTokenLength()), + k * static_cast(InferenceContext::Instance().GetMaxTokenLength()), + k, + 1}, + options); + + auto prev_value = torch::from_blob( + workspace + offset + value_offset, + {bsz, heads, all_tokens, k}, + {hidden_dim * static_cast(InferenceContext::Instance().GetMaxTokenLength()), + k * static_cast(InferenceContext::Instance().GetMaxTokenLength()), + k, + 1}, + options); return {output, prev_key, prev_value}; } @@ -886,7 +896,8 @@ void quantized_gemm(void* output, weight16, (T*)input, (T*)output, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -931,7 +942,8 @@ at::Tensor qkv_unfused_cublas(at::Tensor& output, (T*)weight.data_ptr(), workspace, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1003,7 +1015,8 @@ std::vector ds_rms_qkv(at::Tensor& input, (T*)weight.data_ptr(), (T*)rms_norm.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1089,7 +1102,8 @@ void quantized_gemm(at::Tensor& output, (T*)weight16.data_ptr(), (T*)input.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1135,7 +1149,8 @@ at::Tensor ds_linear_layer(at::Tensor& input, (T*)weight.data_ptr(), (T*)input_cont.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1353,7 +1368,8 @@ at::Tensor ds_vector_matmul(at::Tensor& input, (T*)weight.data_ptr(), (T*)input.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); 
#else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1439,7 +1455,8 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, (T*)weight.data_ptr(), inp_norm, intermediate, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1483,7 +1500,8 @@ at::Tensor mlp_unfused_cublas(at::Tensor& output, (T*)weight1.data_ptr(), intermediate, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1578,7 +1596,9 @@ std::vector ds_rms_mlp_gemm(at::Tensor& input, auto output = at::from_blob(output_ptr, input.sizes(), options); auto inp_norm = at::from_blob(inp_norm_ptr, input.sizes(), options); auto intermediate_gemm = - at::from_blob(intermediate_ptr, {input.size(0), input.size(1), mlp_1_out_neurons}, options); + at::from_blob(intermediate_ptr, + {input.size(0), input.size(1), static_cast(mlp_1_out_neurons)}, + options); auto act_func_type = static_cast(activation_type); @@ -1617,7 +1637,8 @@ std::vector ds_rms_mlp_gemm(at::Tensor& input, (T*)weight_interm.data_ptr(), (T*)inp_norm.data_ptr(), intermediate_ptr, -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1680,7 +1701,8 @@ std::vector ds_rms_mlp_gemm(at::Tensor& input, (T*)weight_out.data_ptr(), intermediate_ptr, (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard, #else CUBLAS_GEMM_DEFAULT_TENSOR_OP, @@ -1742,7 +1764,8 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, (T*)weight.data_ptr(), (T*)input.data_ptr(), (T*)intermediate.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -1776,7 +1799,8 @@ at::Tensor fused_gemm_gelu(at::Tensor& input, (T*)weight_out.data_ptr(), (T*)intermediate.data_ptr(), (T*)output.data_ptr(), -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_gemm_algo_standard); #else CUBLAS_GEMM_DEFAULT_TENSOR_OP); @@ -2010,7 +2034,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) "DeepSpeed memory allocation for GPT inference with " #_name " (CUDA)"); \ m.def("dequantize_" #_name, \ &ds_dequantize<_dtype>, \ - "DeepSpeed dequantize with " #_name " (CUDA)") + "DeepSpeed dequantize with " #_name " (CUDA)"); DEF_OPS(fp32, float); DEF_OPS(fp16, __half); diff --git a/csrc/transformer/inference/includes/inference_cublas_wrappers.h b/csrc/transformer/inference/includes/inference_cublas_wrappers.h index 640751b12c8f..40c3e443941d 100644 --- a/csrc/transformer/inference/includes/inference_cublas_wrappers.h +++ b/csrc/transformer/inference/includes/inference_cublas_wrappers.h @@ -18,7 +18,9 @@ #endif #include -#ifdef __HIP_PLATFORM_AMD__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || 
(TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -49,7 +51,8 @@ int cublas_gemm_ex(cublasHandle_t handle, #endif { const int ldb = (b_stride == -1) ? ((transb == CUBLAS_OP_N) ? k : n) : b_stride; -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_ex(handle, transa, transb, @@ -83,20 +86,39 @@ int cublas_gemm_ex(cublasHandle_t handle, k, (const void*)alpha, (const void*)A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (transa == CUBLAS_OP_N) ? m : k, (const void*)B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif ldb, (const void*)beta, C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif m, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -113,7 +135,8 @@ int cublas_gemm_ex(cublasHandle_t handle, } template -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_gemm_ex(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, @@ -144,7 +167,8 @@ int cublas_gemm_ex(cublasHandle_t handle, #endif { const int ldb = (b_stride == -1) ? ((transb == CUBLAS_OP_N) ? k : n) : b_stride; -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r : rocblas_datatype_bf16_r; rocblas_status status = rocblas_gemm_ex(handle, @@ -171,8 +195,12 @@ int cublas_gemm_ex(cublasHandle_t handle, algo, 0, 0); +#else +#ifdef __HIP_PLATFORM_AMD__ + constexpr auto cublas_dtype_16 = std::is_same::value ? HIPBLAS_R_16F : HIPBLAS_R_16B; #else constexpr auto cublas_dtype_16 = std::is_same::value ? 
CUDA_R_16F : CUDA_R_16BF; +#endif cublasStatus_t status = cublasGemmEx(handle, transa, transb, @@ -190,11 +218,18 @@ int cublas_gemm_ex(cublasHandle_t handle, (void*)C, cublas_dtype_16, m, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -210,7 +245,8 @@ int cublas_gemm_ex(cublasHandle_t handle, return 0; } -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -246,7 +282,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_status status = rocblas_gemm_strided_batched_ex(handle, op_A, @@ -286,24 +323,43 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, k, alpha, A, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (op_A == CUBLAS_OP_N) ? m : k, stride_A, B, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif (op_B == CUBLAS_OP_N) ? k : n, stride_B, beta, C, +#ifdef __HIP_PLATFORM_AMD__ + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif m, stride_C, batch, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -321,7 +377,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, } template -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) int cublas_strided_batched_gemm(rocblas_handle handle, int m, int n, @@ -357,7 +414,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, cublasGemmAlgo_t algo) #endif { -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) constexpr auto rocblas_dtype_16 = std::is_same::value ? rocblas_datatype_f16_r : rocblas_datatype_bf16_r; rocblas_status status = @@ -390,8 +448,12 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, algo, 0, 0); +#else +#ifdef __HIP_PLATFORM_AMD__ + constexpr auto cublas_dtype_16 = std::is_same::value ? HIPBLAS_R_16F : HIPBLAS_R_16B; #else constexpr auto cublas_dtype_16 = std::is_same::value ? 
CUDA_R_16F : CUDA_R_16BF; +#endif cublasStatus_t status = cublasGemmStridedBatchedEx(handle, op_A, op_B, @@ -413,11 +475,18 @@ int cublas_strided_batched_gemm(cublasHandle_t handle, m, stride_C, batch, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif algo); #endif -#ifdef __HIP_PLATFORM_AMD__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { diff --git a/csrc/xpu/adagrad/cpu_adagrad.cpp b/csrc/xpu/adagrad/cpu_adagrad.cpp new file mode 100644 index 000000000000..dc727f8fa216 --- /dev/null +++ b/csrc/xpu/adagrad/cpu_adagrad.cpp @@ -0,0 +1,196 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "cpu_adagrad.h" +#include +#include +#include +#include +#include +#include + +static std::unordered_map> s_optimizers; + +// C++ interface + +void Adagrad_Optimizer::Step_1(float* _params, + float* grads, + float* _exp_avg_sq, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<1>( + &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) { + float step_size = -1 * _alpha; + ds_half_precision_t* grads_cast_h; + ds_half_precision_t* params_cast_h; + if (half_precision) { + grads_cast_h = reinterpret_cast(grads); + params_cast_h = reinterpret_cast(_params); + } + for (size_t t = rounded_size; t < _param_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > _param_size) copy_size = _param_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t k = t; k < offset; k++) { + float grad = half_precision ? (float)grads_cast_h[k] : grads[k]; + float param = half_precision ? (float)params_cast_h[k] : _params[k]; + float momentum = grads[k]; + float variance = _exp_avg_sq[k]; + if (_weight_decay > 0) { grad = param * _weight_decay + grad; } + + variance += grad * grad; + + grad = sqrt(variance); + grad += _eps; + grad = momentum / grad; + param = grad * step_size + param; + if (half_precision) + params_cast_h[k] = (ds_half_precision_t)param; + else + _params[k] = param; + // STORE UPDATE TERM TO GRAD'S MEMORY + grads[k] = grad * step_size; + _exp_avg_sq[k] = variance; + } + } + } +} + +void Adagrad_Optimizer::Step_4(float* _params, + float* grads, + float* _exp_avg_sq, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<4>( + &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) + Step_1((_params + rounded_size), + (grads + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? 
(dev_params + rounded_size) : dev_params), + half_precision); +} + +int create_adagrad_optimizer(int optimizer_id, + float alpha = 1e-2, + float eps = 1e-8, + float weight_decay = 0, + bool should_log = false) +{ + auto opt = std::make_shared(alpha, eps, weight_decay); + + s_optimizers[optimizer_id] = opt; + + if (should_log) { + std::string avx_type = ""; +#if defined(__AVX512__) + avx_type = "AVX512"; +#else +#if defined(__AVX256__) + avx_type = "AVX2"; +#else + avx_type = "scalar"; +#endif +#endif + + printf("Adagrad Optimizer #%d is created with %s arithmetic capability.\n", + optimizer_id, + avx_type.c_str()); + printf("Config: alpha=%f, weight_decay=%f\n", alpha, weight_decay); + } + + return 0; +} + +void Adagrad_Optimizer::Step_8(float* _params, + float* grads, + float* _exp_avg_sq, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t rounded_size = 0; +#if defined(__AVX512__) or defined(__AVX256__) + Step_AVX<8>( + &rounded_size, _params, grads, _exp_avg_sq, _param_size, dev_params, half_precision); +#endif + if (_param_size > rounded_size) + Step_4((_params + rounded_size), + (grads + rounded_size), + (_exp_avg_sq + rounded_size), + (_param_size - rounded_size), + (dev_params != nullptr ? (dev_params + rounded_size) : dev_params), + half_precision); +} + +int ds_adagrad_step(int optimizer_id, + size_t step, + float lr, + float epsilon, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg_sq) +{ + auto params_c = params.contiguous(); + auto grads_c = grads.contiguous(); + auto exp_avg_sq_c = exp_avg_sq.contiguous(); + + float* params_ptr = (float*)params_c.data_ptr(); + float* grads_ptr = (float*)grads_c.data_ptr(); + float* exp_avg_sq_ptr = (float*)exp_avg_sq_c.data_ptr(); + + std::shared_ptr opt = + std::static_pointer_cast(s_optimizers[optimizer_id]); + opt->IncrementStep(step); + opt->update_state(lr, epsilon, weight_decay); + opt->Step_8(params_ptr, grads_ptr, exp_avg_sq_ptr, params_c.numel()); + + return 0; +} + +int ds_adagrad_step_plus_copy(int optimizer_id, + size_t step, + float lr, + float epsilon, + float weight_decay, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg_sq, + torch::Tensor& gpu_params) +{ + assert(false); + return 0; +} + +int destroy_adagrad_optimizer(int optimizer_id) +{ + s_optimizers.erase(optimizer_id); + + return 0; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("adagrad_update", &ds_adagrad_step, "DeepSpeed CPU Adagrad update (C++)"); + m.def("adagrad_update_copy", + &ds_adagrad_step_plus_copy, + "DeepSpeed CPU Adagrad update and param copy (C++)"); + m.def("create_adagrad", &create_adagrad_optimizer, "DeepSpeed CPU Adagrad (C++)"); + m.def("destroy_adagrad", &destroy_adagrad_optimizer, "DeepSpeed CPU Adagrad destroy (C++)"); +} diff --git a/csrc/xpu/adam/fused_adam_frontend.cpp b/csrc/xpu/adam/fused_adam_frontend.cpp new file mode 100755 index 000000000000..13b390248608 --- /dev/null +++ b/csrc/xpu/adam/fused_adam_frontend.cpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +void multi_tensor_adam_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const float epsilon, + const int step, + const int mode, + const int bias_correction, + const float weight_decay); + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("multi_tensor_adam", + &multi_tensor_adam_cuda, + "Compute and apply gradient update to parameters for Adam optimizer"); +} diff --git a/csrc/xpu/adam/multi_tensor_adam.dp.cpp b/csrc/xpu/adam/multi_tensor_adam.dp.cpp new file mode 100644 index 000000000000..0720a020247a --- /dev/null +++ b/csrc/xpu/adam/multi_tensor_adam.dp.cpp @@ -0,0 +1,159 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include + +#include + +#include +#include "multi_tensor_apply.dp.hpp" +#include "type_shim.h" + +#define BLOCK_SIZE 512 +#define ILP 4 + +typedef enum : int { + ADAM_MODE_0 = 0, // L2 regularization mode + ADAM_MODE_1 = 1 // Decoupled weight decay mode(AdamW) +} adamMode_t; + +using MATH_T = float; + +template +struct AdamFunctor { + __inline__ __attribute__((always_inline)) void operator()(int chunk_size, + volatile int* noop_gmem, + TensorListMetadata<4>& tl, + const float beta1, + const float beta2, + const float beta1_correction, + const float beta2_correction, + const float epsilon, + const float lr, + adamMode_t mode, + const float decay) + { + auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); + int tensor_loc = tl.block_to_tensor[item_ct1.get_group(2)]; + + int chunk_idx = tl.block_to_chunk[item_ct1.get_group(2)]; + int n = tl.sizes[tensor_loc]; + + T* g = (T*)tl.addresses[0][tensor_loc]; + g += chunk_idx * chunk_size; + + T* p = (T*)tl.addresses[1][tensor_loc]; + p += chunk_idx * chunk_size; + + T* m = (T*)tl.addresses[2][tensor_loc]; + m += chunk_idx * chunk_size; + + T* v = (T*)tl.addresses[3][tensor_loc]; + v += chunk_idx * chunk_size; + + n -= chunk_idx * chunk_size; + + // see note in multi_tensor_scale_kernel.cu + for (int i_start = 0; i_start < n && i_start < chunk_size; + i_start += item_ct1.get_local_range(2) * ILP) { + MATH_T r_g[ILP]; + MATH_T r_p[ILP]; + MATH_T r_m[ILP]; + MATH_T r_v[ILP]; +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); + if (i < n && i < chunk_size) { + r_g[ii] = g[i]; + r_p[ii] = p[i]; + r_m[ii] = m[i]; + r_v[ii] = v[i]; + } else { + r_g[ii] = MATH_T(0); + r_p[ii] = MATH_T(0); + r_m[ii] = MATH_T(0); + r_v[ii] = MATH_T(0); + } + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + if (mode == ADAM_MODE_0) { // L2 + r_g[ii] = r_g[ii] + (decay * r_p[ii]); + r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; + r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; + MATH_T next_m_unbiased = r_m[ii] / beta1_correction; + MATH_T next_v_unbiased = r_v[ii] / beta2_correction; + MATH_T denom = sycl::sqrt(next_v_unbiased) + epsilon; + MATH_T update = next_m_unbiased / denom; + r_p[ii] = r_p[ii] - (lr * update); + } else { // weight decay + r_m[ii] = beta1 * r_m[ii] + (1 - beta1) * r_g[ii]; + r_v[ii] = beta2 * r_v[ii] + (1 - beta2) * r_g[ii] * r_g[ii]; + MATH_T next_m_unbiased = r_m[ii] / beta1_correction; + MATH_T next_v_unbiased = r_v[ii] / beta2_correction; + MATH_T denom = 
sycl::sqrt(next_v_unbiased) + epsilon; + MATH_T update = (next_m_unbiased / denom) + (decay * r_p[ii]); + r_p[ii] = r_p[ii] - (lr * update); + } + } +#pragma unroll + for (int ii = 0; ii < ILP; ii++) { + int i = i_start + item_ct1.get_local_id(2) + ii * item_ct1.get_local_range(2); + if (i < n && i < chunk_size) { + p[i] = r_p[ii]; + m[i] = r_m[ii]; + v[i] = r_v[ii]; + } + } + } + } +}; + +void multi_tensor_adam_cuda(int chunk_size, + at::Tensor noop_flag, + std::vector> tensor_lists, + const float lr, + const float beta1, + const float beta2, + const float epsilon, + const int step, + const int mode, + const int bias_correction, + const float weight_decay) +{ + using namespace at; + + // Handle bias correction mode + float bias_correction1 = 1.0f, bias_correction2 = 1.0f; + if (bias_correction == 1) { + bias_correction1 = 1 - std::pow(beta1, step); + bias_correction2 = 1 - std::pow(beta2, step); + } + + // Assume single type across p,g,m1,m2 now + DISPATCH_DOUBLE_FLOAT_AND_HALF(tensor_lists[0][0].scalar_type(), + 0, + "adam", + multi_tensor_apply<4>(BLOCK_SIZE, + chunk_size, + noop_flag, + tensor_lists, + AdamFunctor(), + beta1, + beta2, + bias_correction1, + bias_correction2, + epsilon, + lr, + (adamMode_t)mode, + weight_decay);) +} diff --git a/csrc/xpu/adam/multi_tensor_apply.dp.hpp b/csrc/xpu/adam/multi_tensor_apply.dp.hpp new file mode 100644 index 000000000000..14a130e2a23e --- /dev/null +++ b/csrc/xpu/adam/multi_tensor_apply.dp.hpp @@ -0,0 +1,226 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#include +#include +#include +#include +#include +#include "compat.h" + +#include +#include +#include + +namespace at { +namespace cuda { +sycl::queue* getCurrentCUDAStream() +{ + c10::xpu::XPUStream stream = c10::xpu::getCurrentXPUStream(); + auto& queue = stream.queue(); + return &queue; +} + +sycl::queue* getStreamFromPool(bool) +{ + // not implemented + return nullptr; +} +} // namespace cuda +} // namespace at +// #include + +// This header is the one-stop shop for all your multi-tensor apply needs. + +// TODO: Kernel arg size limit may be <4KB for some other cards (ie Jetson) +constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; + +template +struct TensorListMetadata { + void* addresses[n][depth_to_max_tensors[n - 1]]; + int sizes[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; // I fear this needs to be a full int. + int start_tensor_this_launch; +}; + +template +class multi_tensor_apply_kernel { +public: + multi_tensor_apply_kernel(int chunk_size, + volatile int* noop_flag, + T tl, + U callable, + ArgTypes... args) + : chunk_size(chunk_size), noop_flag(noop_flag), tl(tl), callable(callable), args(args...) + { + } + + // This should be identical to original __global__ function + static void inline __global__function(int chunk_size, + volatile int* noop_flag, + T tl, + U callable, + ArgTypes... 
args) + { + callable(chunk_size, noop_flag, tl, args...); + } + + // If global function template contains parameter pack, + // we only deal with parameter pack at the end of template parameter list + template + static void inline __tuple_expand_driver(int chunk_size, + volatile int* noop_flag, + T tl, + U callable, + Tuple args, + std::index_sequence) + { + __global__function(chunk_size, noop_flag, tl, callable, std::get(args)...); + } + + // + // Because __global__ function can't really use any reference types, we can sure that args + // are all good behaviors + // + void operator()(sycl::nd_item<3>) const + { + __tuple_expand_driver(chunk_size, + noop_flag, + tl, + callable, + args, + std::make_index_sequence()); + } + +private: + int chunk_size; + volatile int* noop_flag; + T tl; + U callable; + std::tuple args; +}; + +// to make sure multi_tensor_apply_kernel can be used in sycl::buffer +namespace sycl { +template +struct is_device_copyable> : std::true_type {}; +} // namespace sycl + +template +void multi_tensor_apply(int block_size, + int chunk_size, + const at::Tensor& noop_flag, + const std::vector>& tensor_lists, + T callable, + ArgTypes... args) +{ + TORCH_CHECK(tensor_lists.size() == depth, "tensor_lists.size() != depth"); + int len0 = tensor_lists[0].size(); + TORCH_CHECK(len0 > 0, "tensor_lists[0].size() is not > 0"); + auto ref_device = tensor_lists[0][0].device(); + TORCH_CHECK(ref_device.type() == at::kXPU, "expected input to be on cuda"); + for (int l = 0; l < tensor_lists.size(); l++) // No range-based for because I need indices + { + TORCH_CHECK(tensor_lists[l].size() == len0, "Size mismatch among tensor lists"); + for (int t = 0; t < tensor_lists[l].size(); t++) { + // TODO: Print which tensor fails. + bool contiguous_memory = tensor_lists[l][t].is_contiguous(); +#ifdef VERSION_GE_1_5 + contiguous_memory = (contiguous_memory || + tensor_lists[l][t].is_contiguous(at::MemoryFormat::ChannelsLast)); +#endif + TORCH_CHECK(contiguous_memory, "A tensor was not contiguous."); + TORCH_CHECK(tensor_lists[l][t].device() == ref_device, + "A tensor was not on the same device as the first tensor"); + TORCH_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(), "Size mismatch"); + } + } + + int ntensors = tensor_lists[0].size(); + + TensorListMetadata tl; + + /* const at::cuda::OptionalCUDAGuard device_guard(device_of(tensor_lists[0][0])); */ + auto stream = at::cuda::getCurrentCUDAStream(); + + tl.start_tensor_this_launch = 0; + int loc_block_info = 0; + int loc_tensor_info = 0; + for (int t = 0; t < ntensors; t++) { + tl.sizes[loc_tensor_info] = tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) + tl.addresses[d][loc_tensor_info] = tensor_lists[d][t].data_ptr(); + loc_tensor_info++; + + int chunks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1) / chunk_size; + + for (int chunk = 0; chunk < chunks_this_tensor; chunk++) { + // std::cout << chunks_this_tensor << std::endl; + tl.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tl.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + bool tensors_full = (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks_this_tensor - 1); + bool blocks_full = (loc_block_info == depth_to_max_blocks[depth - 1]); + bool last_chunk = (t == ntensors - 1 && chunk == chunks_this_tensor - 1); + if (tensors_full || blocks_full || last_chunk) { + // using accscalar_t = acc_type; + /* multi_tensor_apply_kernel, T, ArgTypes...> + * fn(chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); */ + 
if constexpr (sizeof(multi_tensor_apply_kernel( + chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)) < + 2048) { + ((sycl::queue*)(stream)) + ->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + multi_tensor_apply_kernel( + chunk_size, noop_flag.DATA_PTR(), tl, callable, args...)); + } else { + auto capture = multi_tensor_apply_kernel( + chunk_size, noop_flag.DATA_PTR(), tl, callable, args...); + sycl::buffer params(const_cast(&capture), + sycl::range<1>(1)); + stream->submit([&](sycl::handler& cgh) { + auto device_params = + params.template get_access(cgh); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, loc_block_info) * + sycl::range<3>(1, 1, block_size), + sycl::range<3>(1, 1, block_size)), + [=](sycl::nd_item<3> item) { device_params[0](item); }); + }); + } + 0; + + // Reset. The control flow possibilities here make my brain hurt. + loc_block_info = 0; + if (chunk == chunks_this_tensor - 1) { + // std::cout << "Hit case 1 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + loc_tensor_info = 0; + tl.start_tensor_this_launch = t + 1; + } else { + // std::cout << "Hit case 2 " << cond1 << " " << cond2 << " " << cond3 << + // std::endl; + tl.sizes[0] = tl.sizes[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) + tl.addresses[d][0] = tl.addresses[d][loc_tensor_info - 1]; + loc_tensor_info = 1; + tl.start_tensor_this_launch = t; + } + } + } + } +} diff --git a/csrc/xpu/common/custom_cuda_kernel.dp.cpp b/csrc/xpu/common/custom_cuda_kernel.dp.cpp new file mode 100644 index 000000000000..cfd004ef1357 --- /dev/null +++ b/csrc/xpu/common/custom_cuda_kernel.dp.cpp @@ -0,0 +1,92 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +inline void has_capability_or_fail(const sycl::device& dev, + const std::initializer_list& props) +{ + for (const auto& it : props) { + if (dev.has(it)) continue; + switch (it) { + case sycl::aspect::fp64: + throw std::runtime_error("'double' is not supported in '" + + dev.get_info() + "' device"); + break; + case sycl::aspect::fp16: + throw std::runtime_error("'half' is not supported in '" + + dev.get_info() + "' device"); + break; + default: +#define __SYCL_ASPECT(ASPECT, ID) \ + case sycl::aspect::ASPECT: return #ASPECT; +#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID) +#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE) + auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string { + switch (AspectNum) { +#include +#include + default: return "unknown aspect"; + } + }; +#undef __SYCL_ASPECT_DEPRECATED_ALIAS +#undef __SYCL_ASPECT_DEPRECATED +#undef __SYCL_ASPECT + throw std::runtime_error("'" + getAspectNameStr(it) + "' is not supported in '" + + dev.get_info() + "' device"); + } + break; + } +} + +void param_update_kernel(const float* input, sycl::half* output, int size) +{ + auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); + int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); + + if (id < size) { output[id] = (sycl::half)input[id]; } +} + +void launch_param_update(const float* input, sycl::half* output, int size, sycl::queue* stream) +{ + int threads = 1024; + + sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); + sycl::range<3> block_dim(1, 1, threads); + + { + has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( + 
sycl::nd_range<3>(grid_dim * block_dim, block_dim), + [=](sycl::nd_item<3> item_ct1) { param_update_kernel(input, output, size); }); + } +} + +void param_update_kernel_half(const float* input, sycl::half* output, int size) +{ + auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); + int id = item_ct1.get_group(2) * item_ct1.get_local_range(2) + item_ct1.get_local_id(2); + sycl::half2* output_cast = reinterpret_cast(output); + if (id < size) { + float input_f = input[id]; + sycl::half2* input_h = reinterpret_cast(&input_f); + output_cast[id] = *input_h; + } +} + +void launch_param_update_half(const float* input, sycl::half* output, int size, sycl::queue* stream) +{ + int threads = 1024; + size /= 2; + sycl::range<3> grid_dim(1, 1, (size - 1) / threads + 1); + sycl::range<3> block_dim(1, 1, threads); + + { + has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); + stream->parallel_for( + sycl::nd_range<3>(grid_dim * block_dim, block_dim), + [=](sycl::nd_item<3> item_ct1) { param_update_kernel_half(input, output, size); }); + } +} diff --git a/csrc/xpu/includes/compat.h b/csrc/xpu/includes/compat.h new file mode 100755 index 000000000000..6d54446d472e --- /dev/null +++ b/csrc/xpu/includes/compat.h @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* +Copyright NVIDIA/apex +This file is adapted from fused adam in NVIDIA/apex, commit a109f85 +*/ + +#ifndef TORCH_CHECK +#define TORCH_CHECK AT_CHECK +#endif + +#ifdef VERSION_GE_1_3 +#define DATA_PTR data_ptr +#else +#define DATA_PTR data +#endif diff --git a/csrc/xpu/includes/cpu_adagrad.h b/csrc/xpu/includes/cpu_adagrad.h new file mode 100644 index 000000000000..660f860917f6 --- /dev/null +++ b/csrc/xpu/includes/cpu_adagrad.h @@ -0,0 +1,120 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#define NOMINMAX // Windows idiosyncrasy + // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c + +#include +#include +#include "simd.h" + +typedef unsigned short ds_half_precision_t; + +#define STEP(SPAN) \ + void Step_##SPAN(float* _params, \ + float* grads, \ + float* _exp_avg_sq, \ + size_t _param_size, \ + ds_half_precision_t* dev_param = nullptr, \ + bool half_precision = false); + +class Adagrad_Optimizer { +public: + Adagrad_Optimizer(float alpha = 1e-2, float eps = 1e-8, float weight_decay = 0) + : _alpha(alpha), _eps(eps), _weight_decay(weight_decay) + { + } + ~Adagrad_Optimizer() {} +#if defined(__AVX512__) or defined(__AVX256__) + template + void Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* _exp_avg_sq, + size_t param_size, + ds_half_precision_t* dev_param = nullptr, + bool half_precision = false); +#endif + STEP(1) + STEP(4) + STEP(8) + inline void IncrementStep(size_t step) + { + _step++; + if (_step != step) { _step = step; } + } + inline void update_state(float lr, float epsilon, float weight_decay) + { + _alpha = lr; + _eps = epsilon; + _weight_decay = weight_decay; + } + +private: + float _alpha; + float _eps; + float _weight_decay; + + float _betta1_t; + float _betta2_t; + size_t _step; +}; + +#if defined(__AVX512__) or defined(__AVX256__) +template +void Adagrad_Optimizer::Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* _exp_avg_sq, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t new_rounded_size = 0; + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + float step_size = -1 * _alpha; + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + AVX_Data weight_decay4; + if (_weight_decay > 0) weight_decay4.data = SIMD_SET(_weight_decay); + new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); + for (size_t t = 0; t < new_rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { + AVX_Data grad_4[span]; + simd_load(grad_4, grads + i, half_precision); + + AVX_Data momentum_4[span]; + simd_load(momentum_4, grads + i, false); + + AVX_Data variance_4[span]; + simd_load(variance_4, _exp_avg_sq + i, false); + + AVX_Data param_4[span]; + simd_load(param_4, _params + i, half_precision); + + if (_weight_decay > 0) { simd_fma(grad_4, param_4, weight_decay4, grad_4); } + + simd_fma(variance_4, grad_4, grad_4, variance_4); + simd_sqrt(grad_4, variance_4); + simd_add(grad_4, grad_4, eps_4); + simd_div(grad_4, momentum_4, grad_4); + simd_fma(param_4, grad_4, step_size_4, param_4); + + simd_store(_params + i, param_4, half_precision); + simd_store(_exp_avg_sq + i, variance_4, false); + } + } + *rounded_size = new_rounded_size; +} +#endif diff --git a/csrc/xpu/includes/cpu_adam.h b/csrc/xpu/includes/cpu_adam.h new file mode 100644 index 000000000000..7bc0364c569d --- /dev/null +++ b/csrc/xpu/includes/cpu_adam.h @@ -0,0 +1,237 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#define NOMINMAX // Windows idiosyncrasy + // https://stackoverflow.com/questions/4913922/possible-problems-with-nominmax-on-visual-c + +#include +#include +#include +#include "simd.h" + +#include +typedef unsigned short ds_half_precision_t; + +#define STEP(SPAN) \ + void Step_##SPAN(float* _params, \ + float* grads, \ + float* _exp_avg, \ + float* _exp_avg_sq, \ + size_t _param_size, \ + ds_half_precision_t* dev_param = nullptr, \ + bool half_precision = false); + +class Adam_Optimizer { +public: + Adam_Optimizer(float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0, + bool adamw_mode = true) + : _alpha(alpha), + _betta1(betta1), + _betta2(betta2), + _eps(eps), + _weight_decay(weight_decay), + _betta1_t(1.0), + _betta2_t(1.0), + _step(0), + _adamw_mode(adamw_mode) + { + } + ~Adam_Optimizer() {} + +#if defined(__AVX512__) or defined(__AVX256__) + template + void Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t param_size, + ds_half_precision_t* dev_param = nullptr, + bool half_precision = false); +#endif + STEP(1) + STEP(4) + STEP(8) + inline void IncrementStep(size_t step, float beta1, float beta2) + { + if (beta1 != _betta1 || beta2 != _betta2) { + _step = step; + _betta1 = beta1; + _betta2 = beta2; + _betta1_t = std::pow(_betta1, step); + _betta2_t = std::pow(_betta2, step); + } else { + _step++; + if (_step != step) { + _betta1_t = std::pow(_betta1, step); + _betta2_t = std::pow(_betta2, step); + _step = step; + } else { + _betta1_t *= _betta1; + _betta2_t *= _betta2; + } + } + } + inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction) + { + _alpha = lr; + _eps = epsilon; + _weight_decay = weight_decay; + + _bias_correction1 = 1.0f; + _bias_correction2 = 1.0f; + if (bias_correction == 1) { + _bias_correction1 = 1 - _betta1_t; + _bias_correction2 = 1 / sqrt(1 - _betta2_t); + } + } + +private: + float _alpha; + float _betta1; + float _betta2; + float _eps; + float _weight_decay; + + float _betta1_t; + float _betta2_t; + size_t _step; + + float _bias_correction1; + float _bias_correction2; + + bool _adamw_mode; +}; + +#if defined(__AVX512__) or defined(__AVX256__) +template +void Adam_Optimizer::Step_AVX(size_t* rounded_size, + float* _params, + float* grads, + float* _exp_avg, + float* _exp_avg_sq, + size_t _param_size, + ds_half_precision_t* dev_params, + bool half_precision) +{ + size_t new_rounded_size = 0; + int rshft = half_precision ? 1 : 0; + + AVX_Data betta1_4; + betta1_4.data = SIMD_SET(_betta1); + AVX_Data betta2_4; + betta2_4.data = SIMD_SET(_betta2); + + float betta1_minus1 = 1 - _betta1; + float betta2_minus1 = 1 - _betta2; + AVX_Data betta1_minus1_4; + betta1_minus1_4.data = SIMD_SET(betta1_minus1); + AVX_Data betta2_minus1_4; + betta2_minus1_4.data = SIMD_SET(betta2_minus1); + + AVX_Data bias2_sqrt; + bias2_sqrt.data = SIMD_SET(_bias_correction2); + + AVX_Data eps_4; + eps_4.data = SIMD_SET(_eps); + + float step_size = -1 * _alpha / _bias_correction1; + AVX_Data step_size_4; + step_size_4.data = SIMD_SET(step_size); + + float w_decay = -1 * _alpha * _weight_decay; + AVX_Data weight_decay4; + if (_weight_decay > 0) + weight_decay4.data = (_adamw_mode ? 
SIMD_SET(w_decay) : SIMD_SET(_weight_decay)); + new_rounded_size = ROUND_DOWN(_param_size, SIMD_WIDTH * span); + for (size_t t = 0; t < new_rounded_size; t += TILE) { + size_t copy_size = TILE; + if ((t + TILE) > new_rounded_size) copy_size = new_rounded_size - t; + size_t offset = copy_size + t; +#pragma omp parallel for + for (size_t i = t; i < offset; i += SIMD_WIDTH * span) { + AVX_Data grad_4[span]; + simd_load(grad_4, grads + (i >> rshft), half_precision); + + AVX_Data momentum_4[span]; + simd_load(momentum_4, _exp_avg + i, false); + + AVX_Data variance_4[span]; + simd_load(variance_4, _exp_avg_sq + i, false); + + AVX_Data param_4[span]; + simd_load(param_4, _params + (i >> rshft), half_precision); + + if (_weight_decay > 0 && !_adamw_mode) { + simd_fma(grad_4, param_4, weight_decay4, grad_4); + } + + simd_mul(momentum_4, momentum_4, betta1_4); + simd_fma(momentum_4, grad_4, betta1_minus1_4, momentum_4); + simd_mul(variance_4, variance_4, betta2_4); + simd_mul(grad_4, grad_4, grad_4); + simd_fma(variance_4, grad_4, betta2_minus1_4, variance_4); + simd_sqrt(grad_4, variance_4); + simd_fma(grad_4, grad_4, bias2_sqrt, eps_4); + simd_div(grad_4, momentum_4, grad_4); + + if (_weight_decay > 0 && _adamw_mode) { + simd_fma(param_4, param_4, weight_decay4, param_4); + } + + simd_fma(param_4, grad_4, step_size_4, param_4); + + simd_store(_params + (i >> rshft), param_4, half_precision); + simd_store(_exp_avg + i, momentum_4, false); + simd_store(_exp_avg_sq + i, variance_4, false); + } + } + *rounded_size = new_rounded_size; +} +#endif + +int create_adam_optimizer(int optimizer_id, + float alpha = 1e-3, + float betta1 = 0.9, + float betta2 = 0.999, + float eps = 1e-8, + float weight_decay = 0, + bool adamw_mode = true, + bool should_log = false); + +int ds_adam_step(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float epsilon, + float weight_decay, + bool bias_correction, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq); + +int ds_adam_step_plus_copy(int optimizer_id, + size_t step, + float lr, + float beta1, + float beta2, + float epsilon, + float weight_decay, + bool bias_correction, + torch::Tensor& params, + torch::Tensor& grads, + torch::Tensor& exp_avg, + torch::Tensor& exp_avg_sq, + torch::Tensor& gpu_params); + +int destroy_adam_optimizer(int optimizer_id); diff --git a/csrc/xpu/includes/simd.h b/csrc/xpu/includes/simd.h new file mode 100644 index 000000000000..097e2d8585cc --- /dev/null +++ b/csrc/xpu/includes/simd.h @@ -0,0 +1,198 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#pragma once + +#if (__x86_64__ || __i386__) +#include +#include +#endif + +#define TILE (128 * 1024 * 1024) +#if defined(__AVX512__) or defined(__AVX256__) + +#define ROUND_DOWN(size, step) ((size) & ~((step) - 1)) + +#if defined(__AVX512__) +#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm512_loadu_ps(x) +#define SIMD_SET(x) _mm512_set1_ps(x) +#define SIMD_ADD(x, y) _mm512_add_ps(x, y) +#define SIMD_MUL(x, y) _mm512_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm512_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm512_div_ps(x, y) +#define SIMD_AND(x, y) _mm512_and_ps(x, y) +#define SIMD_ANDNOT(x, y) _mm512_andnot_ps(x, y) +#define SIMD_OR(x, y) _mm512_or_ps(x, y) +#define SIMD_XOR(x, y) _mm512_xor_ps(x, y) +#define SIMD_WIDTH 16 + +#define SIMD_LOAD2(x, h) \ + ((h) ? 
_mm512_cvtph_ps(_mm256_castps_si256(_mm256_loadu_ps(x))) : _mm512_loadu_ps(x)) +#define SIMD_STORE2(x, d, h) \ + ((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ + : _mm512_storeu_ps(x, d)) + +#define INTV __m256i +#elif defined(__AVX256__) +#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d) +#define SIMD_LOAD(x) _mm256_loadu_ps(x) +#define SIMD_SET(x) _mm256_set1_ps(x) +#define SIMD_ADD(x, y) _mm256_add_ps(x, y) +#define SIMD_MUL(x, y) _mm256_mul_ps(x, y) +#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c) +#define SIMD_SQRT(x) _mm256_sqrt_ps(x) +#define SIMD_DIV(x, y) _mm256_div_ps(x, y) +#define SIMD_AND(x, y) _mm256_and_ps(x, y) +#define SIMD_ANDNOT(x, y) _mm256_andnot_ps(x, y) +#define SIMD_OR(x, y) _mm256_or_ps(x, y) +#define SIMD_XOR(x, y) _mm256_xor_ps(x, y) +#define SIMD_WIDTH 8 + +#define SIMD_LOAD2(x, h) \ + ((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x)) +#define SIMD_STORE2(x, d, h) \ + ((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \ + : _mm256_storeu_ps(x, d)) + +#define INTV __m128i +#endif + +union AVX_Data { +#if defined(__AVX512__) + __m512 data; +#elif defined(__AVX256__) + __m256 data; +#endif + // float data_f[16]; +}; + +template +inline void simd_store(float* dst, AVX_Data* src, bool half_precision) +{ + size_t width = (half_precision ? SIMD_WIDTH / 2 : SIMD_WIDTH); +#pragma unroll + for (size_t i = 0; i < span; ++i) { SIMD_STORE2(dst + width * i, src[i].data, half_precision); } +} +template +inline void simd_load(AVX_Data* dst, float* src, bool half_precision) +{ + size_t width = (half_precision ? 1 : SIMD_WIDTH); +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_LOAD2(src + width * i, half_precision); } +} +template +inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { + dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a[i].data); + } +} +template +inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { + dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r.data, src_a.data); + } +} +template +inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { + dst[i].data = SIMD_FMA(src_m_l[i].data, src_m_r[i].data, src_a[i].data); + } +} +template +inline void simd_sqrt(AVX_Data* dst, AVX_Data* src) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_SQRT(src[i].data); } +} +template +inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ADD(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_MUL(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, 
AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_DIV(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_and(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_AND(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_andnot(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { + dst[i].data = SIMD_ANDNOT(src_a_l[i].data, src_a_r[i].data); + } +} +template +inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_or(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_OR(src_a_l[i].data, src_a_r[i].data); } +} +template +inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r.data); } +} +template +inline void simd_xor(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r) +{ +#pragma unroll + for (size_t i = 0; i < span; ++i) { dst[i].data = SIMD_XOR(src_a_l[i].data, src_a_r[i].data); } +} + +#endif diff --git a/csrc/xpu/includes/type_shim.h b/csrc/xpu/includes/type_shim.h new file mode 100644 index 000000000000..1897afd1fea2 --- /dev/null +++ b/csrc/xpu/includes/type_shim.h @@ -0,0 +1,155 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +/* Taken from NVIDIA/apex commit 855808f3fc268e9715d613f3c2e56469d8c986d8 */ +#include +/* #include */ +#include + +// Forward/backward compatibility hack around +// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288 +// pending more future-proof guidance from upstream. +// struct TypeShim +// { +// const at::Type& payload; +// TypeShim(const at::Type& type) : payload(type) {} +// // Enable trivial conversion to a const at::Type& for pre-3aeb78 +// operator const at::Type&(){ return payload; }; +// // Enable dispatch switch statements to take *this directly for post-3aeb78 +// //operator at::ScalarType(){ return payload.; }; +// }; + +#define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) \ + switch (TYPE) { \ + case at::ScalarType::Float: { \ + using scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: { \ + using scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +#define DISPATCH_DOUBLE_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...) 
\ + switch (TYPE) { \ + case at::ScalarType::Double: { \ + using scalar_t_##LEVEL = double; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Float: { \ + using scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Half: { \ + using scalar_t_##LEVEL = at::Half; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::BFloat16: { \ + using scalar_t_##LEVEL = at::BFloat16; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +#define DISPATCH_DOUBLE_AND_FLOAT(TYPE, LEVEL, NAME, ...) \ + switch (TYPE) { \ + case at::ScalarType::Double: { \ + using scalar_t_##LEVEL = double; \ + __VA_ARGS__; \ + break; \ + } \ + case at::ScalarType::Float: { \ + using scalar_t_##LEVEL = float; \ + __VA_ARGS__; \ + break; \ + } \ + default: AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \ + } + +template +__inline__ __attribute__((always_inline)) T +reduce_block_into_lanes(T* x, + T val, + int lanes = 1, + bool share_result = false) // lanes is intended to be <= 32. +{ + auto item_ct1 = sycl::ext::oneapi::experimental::this_nd_item<3>(); + int tid = item_ct1.get_local_id(2) + item_ct1.get_local_id(1) * item_ct1.get_local_range(2); + int blockSize = item_ct1.get_local_range(2) * + item_ct1.get_local_range(1); // blockSize is intended to be a multiple of 32. + + if (blockSize >= 64) { + x[tid] = val; + /* + DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control + flow. You may need to adjust the code. + */ + /* + DPCT1065:6: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if + there is no access to global memory. + */ + item_ct1.barrier(); + } + +#pragma unroll + for (int i = (blockSize >> 1); i >= 64; i >>= 1) { + if (tid < i) x[tid] = x[tid] + x[tid + i]; + /* + DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control + flow. You may need to adjust the code. + */ + /* + DPCT1065:7: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if + there is no access to global memory. + */ + item_ct1.barrier(); + } + + T final; + + if (tid < 32) { + if (blockSize >= 64) + final = x[tid] + x[tid + 32]; + else + final = val; + // __SYNCWARP(); + +#pragma unroll + for (int i = 16; i >= lanes; i >>= 1) + final = final + __shfl_down_sync(0xffffffff, final, i); + } + + if (share_result) { + if (tid < lanes) x[tid] = final; // EpilogueOp + // Make sure the smem result is visible to all warps. + /* + DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control + flow. You may need to adjust the code. + */ + /* + DPCT1065:8: Consider replacing sycl::nd_item::barrier() with + sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if + there is no access to global memory. + */ + item_ct1.barrier(); + } + + return final; +} diff --git a/csrc/xpu/packbits/packing.cpp b/csrc/xpu/packbits/packing.cpp new file mode 100644 index 000000000000..fb09bb16efdd --- /dev/null +++ b/csrc/xpu/packbits/packing.cpp @@ -0,0 +1,100 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include +#include +#include +#include + +using namespace sycl; +using namespace xpu; + +void packbitskernel(const float* input, uint8_t* output, const int input_size, id<1> item_ct1) +{ + // get the sign bit of each float and pack them into byte + int i = item_ct1; + for (int j = 0; j < 8; ++j) { + int k = i * 8 + j; + int bit = k < input_size && (!sycl::signbit(input[k])); + output[i] |= bit << (7 - j); + } +} + +void unpackbitskernel(const uint8_t* input, float* output, id<1> item_ct1) +{ + // use the bit value to set float, bit 0 -> float -1, bit 1 -> float 1 + int i = item_ct1; + output[i] = (float((input[i / 8] >> (7 - i % 8)) & 1) - 0.5) * 2; +} + +sycl::queue get_current_queue(at::Device device) +{ + c10::impl::VirtualGuardImpl impl(device.type()); + c10::Stream _stream = impl.getStreamFromGlobalPool(device, /*isHighPriority=*/false); + sycl::queue queue = xpu::get_queue_from_stream(_stream); + return queue; +} + +/* +pack float tensor into uint8 tensor. Every eight float elements get packed into one uint8 +if float x >= 0, will be packed as a '1' bit, or will be packed as '0' +Arguments: + tensor: A bool tensor that get packed. + input_size: numel of input tensor + rank: device id in order to get corresponding stream +*/ +at::Tensor packbits(at::Tensor tensor, int input_size, int rank) +{ + at::Device device = "xpu:" + std::to_string(rank); + sycl::queue q = get_current_queue(device); + + int packed_size = (input_size + 7) / 8; + auto unit8_options = at::TensorOptions().dtype(at::kByte).device(at::kXPU); + at::Tensor packed = torch::zeros({packed_size}, unit8_options); + + float* input = (float*)tensor.data_ptr(); + uint8_t* output = (uint8_t*)packed.data_ptr(); + + auto event = q.submit([&](sycl::handler& cgh) { + cgh.parallel_for<>(range(packed_size), [=](id<1> item_ct1) { + packbitskernel(input, output, input_size, item_ct1); + }); + }); + + return packed; +} + +/* +unpack uint8 tensor into float tensor. Every uint8 element get unpacked into eight float +a '1' bit will be converted to a float(1), a '0' bit will be converted to a float(-1). +Arguments: + tensor: A uint8 tensor that get unpacked. 
+ input_size: numel of input tensor + rank: device id in order to get corresponding stream +*/ +at::Tensor unpackbits(at::Tensor tensor, int input_size, int rank) +{ + at::Device device = "xpu:" + std::to_string(rank); + sycl::queue q = get_current_queue(device); + + auto float_options = at::TensorOptions().dtype(at::kFloat).device(at::kXPU); + at::Tensor unpacked = torch::empty({input_size * 8}, float_options); + + uint8_t* input = (uint8_t*)tensor.data_ptr(); + float* output = (float*)unpacked.data_ptr(); + + auto event = q.submit([&](sycl::handler& cgh) { + cgh.parallel_for<>(range(input_size * 8), + [=](id<1> item_ct1) { unpackbitskernel(input, output, item_ct1); }); + }); + + return unpacked; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) +{ + m.def("packbits", &packbits, "DeepSpeed XPU packbits (C++)"); + m.def("unpackbits", &unpackbits, "DeepSpeed XPU unpackbits (C++)"); +} diff --git a/deepspeed/__init__.py b/deepspeed/__init__.py index 87b959f56be0..fd1f421b8954 100755 --- a/deepspeed/__init__.py +++ b/deepspeed/__init__.py @@ -12,16 +12,21 @@ from torch.optim.lr_scheduler import _LRScheduler from packaging import version as pkg_version -try: - import triton # noqa: F401 # type: ignore - HAS_TRITON = True -except ImportError: +# Skip Triton import for AMD due to pytorch-triton-rocm module breaking device API in DeepSpeed +if not (hasattr(torch.version, 'hip') and torch.version.hip is not None): + try: + import triton # noqa: F401 # type: ignore + HAS_TRITON = True + except ImportError: + HAS_TRITON = False +else: HAS_TRITON = False from . import ops from . import module_inject from .accelerator import get_accelerator +from .constants import TORCH_DISTRIBUTED_DEFAULT_PORT from .runtime.engine import DeepSpeedEngine, DeepSpeedOptimizerCallable, DeepSpeedSchedulerCallable from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER from .runtime.hybrid_engine import DeepSpeedHybridEngine @@ -32,13 +37,13 @@ from .runtime.config import DeepSpeedConfig, DeepSpeedConfigError from .runtime.activation_checkpointing import checkpointing from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig -from .module_inject import replace_transformer_layer, revert_transformer_layer +from .module_inject import replace_transformer_layer, revert_transformer_layer, set_autotp_mode from .utils import log_dist, OnDevice, logger from .comm.comm import init_distributed -from .runtime import zero -from .runtime import DeepSpeedOptimizer, ZeROOptimizer +from .runtime import zero, domino +from .runtime.compiler import is_compile_supported from .pipe import PipelineModule @@ -67,10 +72,12 @@ def initialize(args=None, model_parameters: Optional[torch.nn.Module] = None, training_data: Optional[torch.utils.data.Dataset] = None, lr_scheduler: Optional[Union[_LRScheduler, DeepSpeedSchedulerCallable]] = None, + distributed_port: int = TORCH_DISTRIBUTED_DEFAULT_PORT, mpu=None, dist_init_required: Optional[bool] = None, collate_fn=None, config=None, + mesh_param=None, config_params=None): """Initialize the DeepSpeed Engine. @@ -91,6 +98,8 @@ def initialize(args=None, lr_scheduler: Optional: Learning Rate Scheduler Object or a Callable that takes an Optimizer and returns a Scheduler object. 
The scheduler object should define a get_lr(), step(), state_dict(), and load_state_dict() methods + distributed_port: Optional: Master node (rank 0)'s free port that needs to be used for communication during distributed training + mpu: Optional: A model parallelism unit object that implements get_{model,data}_parallel_{rank,group,world_size}() @@ -132,18 +141,32 @@ def initialize(args=None, global dist from deepspeed import comm as dist dist_backend = get_accelerator().communication_backend_name() - dist.init_distributed(dist_backend=dist_backend, dist_init_required=dist_init_required) + dist.init_distributed(dist_backend=dist_backend, + distributed_port=distributed_port, + dist_init_required=dist_init_required) + ##TODO: combine reuse mpu as mesh device and vice versa # Set config using config_params for backwards compat if config is None and config_params is not None: config = config_params + mesh_device = None + if mesh_param: + logger.info(f"mesh_param to Initialize mesh device: {mesh_param}") + mesh_device = dist.initialize_mesh_device(mesh_param, ("data_parallel", "sequence_parallel")) + #if config file has sequence parallelize and data parallelize, then use them to initialize mesh device + elif config is not None: + if "sequence_parallel_size" in config and "data_parallel_size" in config: + logger.info(f"config to Initialize mesh device: {config}") + mesh_device = dist.initialize_mesh_device((config["data_parallel_size"], config["sequence_parallel_size"]), \ + ("data_parallel", "sequence_parallel")) + # Check for deepscale_config for backwards compat if hasattr(args, "deepscale_config") and args.deepscale_config is not None: logger.warning("************ --deepscale_config is deprecated, please use --deepspeed_config ************") if hasattr(args, "deepspeed_config"): - assert (args.deepspeed_config is - None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" + assert (args.deepspeed_config + is None), "Not sure how to proceed, we were given both a deepscale_config and deepspeed_config" args.deepspeed_config = args.deepscale_config args.deepscale_config = None @@ -152,9 +175,8 @@ def initialize(args=None, assert config is None, "Not sure how to proceed, we were given deepspeed configs in the deepspeed arguments and deepspeed.initialize() function call" config = args.deepspeed_config assert config is not None, "DeepSpeed requires --deepspeed_config to specify configuration file" - if not isinstance(model, PipelineModule): - config_class = DeepSpeedConfig(config, mpu) + config_class = DeepSpeedConfig(config, mpu, mesh_device=mesh_device) if config_class.hybrid_engine.enabled: engine = DeepSpeedHybridEngine(args=args, model=model, @@ -178,6 +200,7 @@ def initialize(args=None, dist_init_required=dist_init_required, collate_fn=collate_fn, config=config, + mesh_device=mesh_device, config_class=config_class) else: assert mpu is None, "mpu must be None with pipeline parallelism" @@ -198,7 +221,12 @@ def initialize(args=None, # Restore zero.Init context if necessary zero.partition_parameters.restore_init_context() - return_items = [engine, engine.optimizer, engine.training_dataloader, engine.lr_scheduler] + return_items = [ + engine, + engine.optimizer, + engine.training_dataloader, + engine.lr_scheduler, + ] return tuple(return_items) @@ -234,12 +262,6 @@ def _add_core_arguments(parser): type=str, help='Deprecated DeepSpeed json configuration file.') - group.add_argument('--deepspeed_mpi', - default=False, - action='store_true', - help="Run via MPI, 
this will attempt to discover the necessary variables to initialize torch " - "distributed from the MPI environment") - return parser @@ -292,7 +314,7 @@ def init_inference(model, config=None, **kwargs): .. code-block:: python generator.model = deepspeed.init_inference(generator.model, - mp_size=world_size, + tensor_parallel={"tp_size": world_size}, dtype=torch.half, replace_with_kernel_inject=True) string = generator("DeepSpeed is") @@ -342,3 +364,34 @@ def init_inference(model, config=None, **kwargs): engine = InferenceEngine(model, config=ds_inference_config) return engine + + +def tp_model_init(model, tp_size, dtype): + """ + Initialize the model for tensor parallelism. + + Args: + model (torch.nn.Module): The model to be initialized. + tp_size (int): The tensor parallelism size. + dtype (torch.dtype): The data type to be used for the model. + + Returns: + torch.nn.Module: The initialized model with tensor parallelism. + """ + # avoid re-entry + assert not hasattr( + model, 'ds_autotp_parsed'), "ds_autotp_parsed' attribute already exists in the model, re-entry is not allowed." + + set_autotp_mode(training=True) + + from deepspeed.runtime.tensor_parallel import TpTrainingManager + # The expected usage here is for it to be invoked by transformers package. + + #TODO: We should provide a custom TP mapping solution without using autoTP + #as modifying the autoTP logic may be more difficult for users compared to configuring it + + model = TpTrainingManager(model=model, tp_size=tp_size, dtype=dtype).module + + setattr(model, 'ds_autotp_parsed', True) + + return model diff --git a/deepspeed/autotuning/README.md b/deepspeed/autotuning/README.md index b1fa435364d2..1a9adfede948 100755 --- a/deepspeed/autotuning/README.md +++ b/deepspeed/autotuning/README.md @@ -214,7 +214,7 @@ If `"stage"` is not defined or set as `"all"`, then the overwriting applies to a Currently, the DeepSpeed Autotuner does not tune offloading behaviors but instead uses the values defined in the offload section of the DeepSpeed configuration file. See [Parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) and [Optimizer offloading](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) for details. -If using NVME for offloading, users can run a benchmark offline to select the optimal `aio` setup in DeepSpeed. Refer to [profiling NVMe and configuring aio param section](https://github.com/microsoft/DeepSpeed/issues/998). +If using NVME for offloading, users can run a benchmark offline to select the optimal `aio` setup in DeepSpeed. Refer to [profiling NVMe and configuring aio param section](https://github.com/deepspeedai/DeepSpeed/issues/998). ## Autotuning Output @@ -336,13 +336,13 @@ The Autotuner stops exploring the space when any of the following conditions mee ## Using Autotuning with Hugging Face -Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/transformers/main_classes/deepspeed.html?highlight=gradient_accumulation_steps#shared-configuration). +Hugging Face users can set some configurations values to ["auto"](https://huggingface.co/docs/transformers/deepspeed#deepspeed-and-trainer-parameters). `"auto"` means the value will be set to the default in Hugging Face or be overwritten using the supplied values from the command line arguments. In DeepSpeed Autotuning, if the user-provided DeepSpeed configuration file has "auto" keywords, they are treated as the value "auto". 
## GPT2-large Example -This section shows an example of using DeepSpeed autotuning. For more examples, refer to [autotuning](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. +This section shows an example of using DeepSpeed autotuning. For more examples, refer to [autotuning](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. Example training script: @@ -412,4 +412,4 @@ Notation: Hugging Face (HF), DeepSpeed (DS), ZeRO stage (z), gradient accumulati | ---------- | -------------------- | ------------------------ | ------------------------------ | | GPT2-large | 27.874 (mbs = 1) | 56.797 (z = 1, mbs = 2), | 69.061 (z = 1, mbs = 3) | -As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. +As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. diff --git a/deepspeed/autotuning/autotuner.py b/deepspeed/autotuning/autotuner.py index c77415beb358..442a732b8baa 100755 --- a/deepspeed/autotuning/autotuner.py +++ b/deepspeed/autotuning/autotuner.py @@ -71,7 +71,7 @@ def __init__(self, args, active_resources): logger.info(f"Created autotuning experiments directory: {self.exps_dir}") except: logger.error( - f"Failed to create {self.exps_dir}, please check `exps_dir` in the autotuning config file is accessible by all the nodes in the job." + f"Failed to create {self.exps_dir}, please check exps_dir in the autotuning config file is accessible by all the nodes in the job." ) exit(-1) @@ -84,7 +84,7 @@ def __init__(self, args, active_resources): logger.info(f"Created autotuning results directory: {self.exps_dir}") except: logger.error( - f"Failed to create {self.results_dir}, please check `results_dir` in the autotuning config file is accessible by all the nodes in the job." + f"Failed to create {self.results_dir}, please check results_dir in the autotuning config file is accessible by all the nodes in the job." 
) exit(-1) @@ -248,8 +248,8 @@ def mp_size(self): return self.autotuning_config.mp_size def max_train_micro_batch_size_per_gpu(self): - if self.max_train_batch_size( - ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + if self.max_train_batch_size() and self.max_train_batch_size( + ) > 0: # if the user specifies a max_train_batch_size max_train_micro_batch_size = self.max_train_batch_size() * self.mp_size() // ( self.exp_num_gpus * self.exp_num_nodes) # gradient accumulation steps >=1 return min(self.autotuning_config.max_train_micro_batch_size_per_gpu, max_train_micro_batch_size) @@ -530,6 +530,9 @@ def tune_space(self, tuning_space, prev_max_mbs=0, prev_best_mbs=0, prev_best_me # calculate max micro batch size using gpu memory, model instantiation memory and activation memory # calculated_max_micro_batch_size = (memory_per_gpu - instantiation_memory) // activation_memory_micro_batch_size_1 + logger.info(f'GPU memory: {self.gpu_mem}') + logger.info(f'Instantiation mem required for stage {stage}: {self.get_instantiation_memory_required_per_gpu(stage)}') + logger.info(f'Activation mem: {self.activation_mem}') calculated_max_micro_batch_size = int( self.gpu_mem - self.get_instantiation_memory_required_per_gpu(stage)) // self.activation_mem logger.info( @@ -683,6 +686,7 @@ def model_info_profile_run(self): exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') with open(exp_path, 'w', buffering=BUFSIZE) as fd: @@ -703,6 +707,8 @@ def model_info_profile_run(self): with open(model_info_path, 'r') as f: model_info = hjson.load(f) return model_info + else: + print(f'Could not find model_info at {model_info_path}') def update_records(self, space_name, exp, metric_val, num_exps): if space_name not in self.records: @@ -761,6 +767,7 @@ def run_tuning_micro_batch_sizes(self, tuning_micro_batch_sizes, max_train_batch exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') with open(exp_path, 'w', buffering=BUFSIZE) as fd: @@ -962,8 +969,8 @@ def get_min_max_micro_batch_size(self, stage, min_micro_batch_size, calculated_m low = mid + 1 self.update_records(tuning_space_name, exp, metric_val, 1) used_micro_batch_sizes.append(mid) - if prev_metric_val and ( - (metric_val - prev_metric_val) / prev_metric_val) < METRIC_PERCENT_DIFF_CONST: + if prev_metric_val and ((metric_val - prev_metric_val) / + prev_metric_val) < METRIC_PERCENT_DIFF_CONST: logger.info(f"performance plateaus at mbs = {low}") break prev_metric_val = metric_val @@ -1024,8 +1031,8 @@ def get_tuning_micro_batch_size_list(self, min_micro_batch_size, max_micro_batch # NUM_GPUS=$(( ${NUM_WORKERS} * ${NUM_GPUS_PER_WORKER} )) # DP_SIZE=$(( ${NUM_GPUS} / (${PP_SIZE} * ${MP_SIZE}) )) # GRAD_ACC_STEPS=$(( ${TARGET_GLOBAL_BATCH_SIZE} / (${BATCH_SIZE} * ${DP_SIZE}) )) - if self.max_train_batch_size( - ) and self.max_train_batch_size() > 0: # if the user specifies a max_train_batch_size + if self.max_train_batch_size() and self.max_train_batch_size( + ) > 0: # if the user specifies a max_train_batch_size max_train_batch_size_per_gpu = self.max_train_batch_size() * self.mp_size() // (self.exp_num_gpus * self.exp_num_nodes) else: @@ -1055,6 +1062,7 @@ def run_ds_config(self, 
ds_config, exp_name): exp_config[DS_CONFIG] = ds_config exp_config['num_gpus'] = self.exp_num_gpus exp_config['num_nodes'] = self.exp_num_nodes + exp_config['hostfile'] = self.args.hostfile exp_path = os.path.join(self.exps_dir, f'{exp_name}.json') logger.debug(f'run_ds_config exp_name = {exp_name}') diff --git a/deepspeed/autotuning/scheduler.py b/deepspeed/autotuning/scheduler.py index 40978aa00ab9..29610beaccbe 100755 --- a/deepspeed/autotuning/scheduler.py +++ b/deepspeed/autotuning/scheduler.py @@ -5,7 +5,6 @@ import copy -from numpy import BUFSIZE import json import subprocess import sys @@ -18,6 +17,8 @@ from tqdm import tqdm from ..utils import logger +from ..launcher.constants import MVAPICH_LAUNCHER, PDSH_LAUNCHER, OPENMPI_LAUNCHER, SLURM_LAUNCHER +from .constants import * from .constants import AUTOTUNING, AUTOTUNING_METRIC_PATH from .utils import get_val_by_key, search_error, was_interruptted """ @@ -89,8 +90,14 @@ def schedule_experiments(self, exp_paths): def run_job(self, exp: dict, reservations): exp_id = exp["exp_id"] + exp["master_addr"] = self.args.master_addr exp["master_port"] = self.args.master_port + exp_id exp["result_dir"] = os.path.join(self.results_dir, exp['name']) + exp["hostfile"] = self.args.hostfile + exp["launcher"] = self.args.launcher + exp["no_ssh_check"] = self.args.no_ssh_check + if self.args.launcher == 'slurm' and hasattr(self.args, 'comment'): + exp["comment"] = self.args.comment user_script = self.args.user_script user_args = self.args.user_args @@ -315,13 +322,26 @@ def run_experiment(exp: dict, reservations, user_script, user_args): slots = ",".join(map(str, reservation.slots)) include_str += f"{reservation.node.host}:{slots}@" include_str = include_str[:-1] + master_addr = exp["master_addr"] master_port = exp["master_port"] - exp["launcher_args"] = [ - "--include", - f"{include_str}", - "--master_port", - str(master_port), - ] + hostfile = exp["hostfile"] + launcher_args = ["--launcher", exp["launcher"]] + if master_addr: + launcher_args += ["--master_addr", master_addr] + if exp["launcher"] not in (MVAPICH_LAUNCHER, OPENMPI_LAUNCHER, SLURM_LAUNCHER): + launcher_args += [ + "--include", + f"{include_str}", + "--master_port", + str(master_port), + ] + if hostfile != '': + launcher_args += ["--hostfile", hostfile] + if 'comment' in exp: + launcher_args += ["--comment", exp["comment"]] + if exp['no_ssh_check']: + launcher_args += ["--no_ssh_check"] + exp["launcher_args"] = launcher_args logger.debug(f'launcher args={exp["launcher_args"]}') exp["user"] = get_user() @@ -387,7 +407,7 @@ def run_experiment(exp: dict, reservations, user_script, user_args): err.flush() os.fsync(out) os.fsync(err) - + clean_up(exp, reservations) logger.info(f"Done running exp_id = {exp['exp_id']}, exp_name = {exp['name']}, with resource = {include_str}") @@ -406,16 +426,22 @@ def clean_up(exp: dict, reservations): nodes_str = nodes_str[:-1] logger.debug(f"Cleaning up exp_id = {exp['exp_id']} on the following workers: {nodes_str}") - # PDSH flags for max node fan out and specific hosts to launch on - # See https://linux.die.net/man/1/pdsh for flag details - pdsh_cmd = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', nodes_str] + + if exp['launcher'] == 'slurm': + runner_cmd = ['srun', '-w', nodes_str] + if 'comment' in exp: + runner_cmd += ['--comment', exp['comment']] + else: + # PDSH flags for max node fan out and specific hosts to launch on + # See https://linux.die.net/man/1/pdsh for flag details + runner_cmd = ['pdsh', '-f', str(PDSH_MAX_FAN_OUT), '-w', nodes_str] kill_cmd 
= [ 'pkill', '-f', exp['name'], ] - cmd = pdsh_cmd + kill_cmd + cmd = runner_cmd + kill_cmd logger.debug("cmd = {}".format(' '.join(cmd))) result = subprocess.Popen(cmd, env=env) diff --git a/deepspeed/autotuning/utils.py b/deepspeed/autotuning/utils.py index 8c9a5fa85bf2..b851353520fb 100644 --- a/deepspeed/autotuning/utils.py +++ b/deepspeed/autotuning/utils.py @@ -42,7 +42,7 @@ def find_replace_str(value, replace_dict): if not isinstance(value, str): return str(value) - matches = re.findall(r"\$[A-Za-z0-9_]+", value) + matches = re.findall(r"\$[\w]+", value) for var in matches: var_key = var.replace("$", "").lower() if var_key == "nvme_path": diff --git a/deepspeed/checkpoint/__init__.py b/deepspeed/checkpoint/__init__.py index c9822693867d..551b83205772 100644 --- a/deepspeed/checkpoint/__init__.py +++ b/deepspeed/checkpoint/__init__.py @@ -5,7 +5,7 @@ from .reshape_meg_2d import reshape_meg_2d_parallel -from .deepspeed_checkpoint import DeepSpeedCheckpoint +from .deepspeed_checkpoint import DeepSpeedCheckpoint, NeoxCheckpoint from .utils import (get_layer_ckpt_name_for_rank, get_model_ckpt_name_for_rank, get_zero_ckpt_name_for_rank) @@ -15,6 +15,6 @@ from .zero_checkpoint import ZeROCheckpoint -from .universal_checkpoint import enable_universal_checkpoint +from .universal_checkpoint import enable_universal_checkpoint, SubparamShape from .constants import * diff --git a/deepspeed/checkpoint/constants.py b/deepspeed/checkpoint/constants.py index f809a0c39270..046bc242002f 100644 --- a/deepspeed/checkpoint/constants.py +++ b/deepspeed/checkpoint/constants.py @@ -16,6 +16,7 @@ BASE_OPTIMIZER_STATE = 'base_optimizer_state' BASE_OPTIMIZER_STATE_STEP = 'base_optimizer_state_step' SINGLE_PARTITION_OF_FP32_GROUPS = "single_partition_of_fp32_groups" +PARAM_GROUPS = 'param_groups' GROUP_PADDINGS = 'group_paddings' PARTITION_COUNT = 'partition_count' ZERO_STAGE = 'zero_stage' @@ -73,6 +74,8 @@ # Similarly, load_hp_checkpoint_state has to take the needed actions when loading from universal. 
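The comment above describes the contract these constants implement: the converter stores small per-parameter hint keys next to each merged tensor, and load_hp_checkpoint_state reads the same keys back when deciding how to re-slice it. A loose, hypothetical illustration of that round trip (the constant values and the path are made up for the example, not taken from this file):

    import torch

    PARAM = 'param'        # illustrative values only
    CAT_DIM = 'cat_dim'

    ckpt_dict = {PARAM: torch.zeros(8, 4), CAT_DIM: 1}   # merged slice plus its handling hint
    torch.save(ckpt_dict, '/tmp/fp32.pt')

    loaded = torch.load('/tmp/fp32.pt', weights_only=False)
    chunk_dim = loaded.get(CAT_DIM, 0)                   # loader falls back to dim 0 when no hint is present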
PARAM_N_SUB_PARAMS = "param_n_sub_params" +SUB_PARAM_SHAPE = "sub_param_shape" + # Regex list of parameters that require special handling VOCABULARY_PARAMETER_PATTERNS = 'vocabulary_parameter_patterns' PIPELINE_REPLICATED_PARAMETER_PATTERNS = 'pipeline_replicated_parameter_patterns' @@ -80,3 +83,5 @@ PARAMETER_WITH_ROW_PARALLELISM_PATTERNS = 'parameter_with_row_parallelism_patterns' TP_REPLICATED_PARAMETER_PATTERNS = 'tp_replicated_parameter_patterns' PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0 = 'parameter_with_2_sub_params_cat_dim_0' +PARAMETER_WITH_SUB_PARAMS = 'parameter_with_sub_params' +SUB_PARAMS_SHAPE = 'sub_params_shape' diff --git a/deepspeed/checkpoint/deepspeed_checkpoint.py b/deepspeed/checkpoint/deepspeed_checkpoint.py index 77634222d292..9ffc83595e32 100644 --- a/deepspeed/checkpoint/deepspeed_checkpoint.py +++ b/deepspeed/checkpoint/deepspeed_checkpoint.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import re from typing import Dict import torch @@ -21,6 +22,7 @@ ARGS_KEY = 'args' CHECKPOINT_INFO_KEY = 'checkpoint_info' ITERATION_KEY = 'iteration' +LAYER_FILE_PREFIX_PATTERN = r'layer_(\d+)-model_.*' SEQUENTIAL_LAYERS = [ 'input_layernorm.weight', 'input_layernorm.bias', 'self_attention.dense.bias', 'post_attention_layernorm.weight', @@ -32,9 +34,18 @@ class DeepSpeedCheckpoint(object): - def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): + def __init__(self, + dir, + tp_degree=None, + pp_degree=None, + dp_degree=None, + final_layer_norm_idx=FINAL_LAYER_NORM_INDEX): + self.final_layer_norm_idx = final_layer_norm_idx self.dir = dir - self._validate_folder(dir) + + pipeline_parallel = len(get_files_with_prefix(get_files(dir), LAYER_FILE_PREFIX)) > 0 + + self._validate_folder(dir, pipeline_parallel) self.zero_checkpoint = ZeROCheckpoint(dir) @@ -70,9 +81,21 @@ def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None): self.pp_to_transformer_map = self._build_pp_transformer_map() self.transformer_file_map = self._build_transformer_file_map() self.tp_to_embedding_map = self._build_tp_other_layer_map(EMBEDDING_LAYER_INDEX) - self.tp_to_final_norm_map = self._build_tp_other_layer_map(FINAL_LAYER_NORM_INDEX) + self.tp_to_final_norm_map = self._build_tp_other_layer_map(self.final_layer_norm_idx) self._build_global_state() + @property + def original_tp_degree(self): + return self.zero_checkpoint.get_src_tp_degree() + + @property + def original_pp_degree(self): + return self.zero_checkpoint.get_src_pp_degree() + + @property + def zero_files(self): + return self.zero_checkpoint.file_list + def is_change_tp_degree(self): return self.tp_degree != self.zero_checkpoint.get_src_tp_degree() @@ -105,7 +128,7 @@ def show_transformer_file_map(self): self._dump_mapping(self.transformer_file_map, 'rank_to_transformer_files') def _build_global_state(self): - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) self.global_state[ARGS_KEY] = sd.get(ARGS_KEY, None) @@ -122,18 +145,21 @@ def get_embedding_layer_id(self): return self.layer_keys[EMBEDDING_LAYER_INDEX] def get_final_norm_layer_id(self): - return self.layer_keys[FINAL_LAYER_NORM_INDEX] + return self.layer_keys[self.final_layer_norm_idx] def get_iteration(self): if not ITERATION_KEY in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], 
map_location=torch.device('cpu'), weights_only=False) self.global_state[ITERATION_KEY] = sd.get(ITERATION_KEY, 0) return self.global_state[ITERATION_KEY] def get_embedding_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_embedding_map.keys() - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in self.tp_to_embedding_map[tp_index]] + sd_list = [ + torch.load(fname, map_location=torch.device('cpu'), weights_only=False) + for fname in self.tp_to_embedding_map[tp_index] + ] sd = self._merge_state_dicts(sd_list) return sd @@ -143,7 +169,7 @@ def get_embedding_files(self, tp_index: int) -> list: def _get_checkpoint_value(self, key): if not key in self.global_state: - sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu')) + sd = torch.load(self.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) self.global_state[key] = sd.get(key, None) return self.global_state[key] @@ -158,7 +184,7 @@ def get_2d_parallel_state(self, tp_index: int, pp_index: int) -> dict: assert tp_index < self.tp_degree assert pp_index < self.pp_degree fname_list = self.get_2d_parallel_files(tp_index=tp_index, pp_index=pp_index) - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] + sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list] merged_sd = None for sd in sd_list: @@ -174,7 +200,7 @@ def get_transformer_state(self, tp_index: int, pp_index: int) -> list: assert pp_index < self.pp_degree t_list = [] for fname_list in self.transformer_file_map[(tp_index, pp_index)]: - sd_list = [torch.load(fname, map_location=torch.device('cpu')) for fname in fname_list] + sd_list = [torch.load(fname, map_location=torch.device('cpu'), weights_only=False) for fname in fname_list] sd = self._merge_state_dicts(sd_list) t_list.append(sd) return t_list @@ -185,7 +211,7 @@ def get_pp_transformer_map(self, pp_index: int) -> list: def get_final_norm_state(self, tp_index: int) -> Dict: assert tp_index in self.tp_to_final_norm_map.keys() - sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu')) + sd = torch.load(self.tp_to_final_norm_map[tp_index][0], map_location=torch.device('cpu'), weights_only=False) return sd def get_final_norm_files(self, tp_index: int) -> list: @@ -193,7 +219,10 @@ def get_final_norm_files(self, tp_index: int) -> list: return self.tp_to_final_norm_map[tp_index] def _build_tp_other_layer_map(self, layer_index: int): - assert layer_index < len(self.layer_files) + data_map = {} + if len(self.layer_files) < 1: + return data_map + assert layer_index <= len(self.layer_files) layer_files = get_files_with_prefix(self.layer_files, self.layer_keys[layer_index]) layer_file_partitions = partition_data(layer_files, self.tp_degree) data_map = {i: flist for i, flist in enumerate(layer_file_partitions)} @@ -207,9 +236,13 @@ def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list: def _build_pp_transformer_map(self): data_map = {} - transformer_layers = self.layer_keys[1:-1] - layers_per_pp = len(transformer_layers) // self.pp_degree - data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)} + if self.pp_degree > 0: + transformer_layers = self.layer_keys[1:self.final_layer_norm_idx] + layers_per_pp = len(transformer_layers) // self.pp_degree + data_map = { + i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] + for i in range(0, self.pp_degree) + } return data_map 
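_build_pp_transformer_map now slices the layer keys with the configurable final_layer_norm_idx instead of a hard-coded [1:-1] and only builds the map when pp_degree > 0. A minimal standalone sketch of the resulting mapping, using hypothetical layer keys rather than real checkpoint file names:

    layer_keys = [f'layer_{i:02d}' for i in range(10)]      # hypothetical keys: layer_00 .. layer_09
    final_layer_norm_idx = -1                               # default: final norm is the last key
    pp_degree = 2

    pp_map = {}
    if pp_degree > 0:
        transformer_layers = layer_keys[1:final_layer_norm_idx]   # drop embedding and final norm
        layers_per_pp = len(transformer_layers) // pp_degree
        pp_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp]
                  for i in range(pp_degree)}
    print(pp_map)   # {0: ['layer_01', ..., 'layer_04'], 1: ['layer_05', ..., 'layer_08']}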
def _dump_mapping(self, data_map, map_tag=None): @@ -219,16 +252,16 @@ def _dump_mapping(self, data_map, map_tag=None): print(f'{k} = {v}') def _build_transformer_file_map(self): - transformer_layer_keys = self.layer_keys[1:-1] + transformer_layer_keys = self.layer_keys[1:self.final_layer_norm_idx] file_map = {} # XXX: this is not guaranteed - layers_per_pp = len(transformer_layer_keys) // self.pp_degree - if layers_per_pp == 0: - layers_per_pp = 1 + layers_per_pp = 1 + if self.pp_degree > 0: + layers_per_pp = len(transformer_layer_keys) // self.pp_degree #print(f"{transformer_layer_keys} {layers_per_pp}") for key_index, layer_key in enumerate(transformer_layer_keys): pp_index = key_index // layers_per_pp - layer_files = get_files_with_prefix(self.layer_files, layer_key) + layer_files = get_files_with_prefix(self.layer_files, layer_key + '-') layer_file_partitions = partition_data(layer_files, self.tp_degree) for tp_index in range(self.tp_degree): map_key = (tp_index, pp_index) @@ -240,8 +273,8 @@ def _build_transformer_file_map(self): def _sanity_check(self): assert len(self.mp_rank_files) % self.tp_degree == 0 - assert len(self.layer_keys) > 2 assert self.zero_checkpoint.num_files % (self.pp_degree * self.tp_degree) == 0 + assert self.zero_checkpoint.num_files % (self.tp_degree) == 0 # XXX: fix me - isn't always the case # only true with --pp-partition-method 'type:transformer|embedding' \ # assert (len(self.layer_keys) - 2) % self.pp_degree == 0 @@ -253,11 +286,13 @@ def validate_files(self): def _get_layer_keys(self): key_set = set() - key_len = len(LAYER_FILE_PREFIX) + 2 for file_path in self.layer_files: _, fname = os.path.split(file_path) - key_set.add(fname[:key_len]) - return sorted(list(key_set)) + layer_id = re.search(LAYER_FILE_PREFIX_PATTERN, fname).group(1) + key_set.add(layer_id) + sorted_ids = sorted(list(key_set), key=int) + layer_keys = [LAYER_FILE_PREFIX + str(layer_id) for layer_id in sorted_ids] + return layer_keys def _merge_state_dicts(self, sd_list): merged_sd = {} @@ -270,12 +305,29 @@ def _merge_state_dicts(self, sd_list): return merged_sd + def _validate_folder(self, dir, pipeline_parallel): + basic_folder_validation(dir) + + file_list = get_files(dir) + file_prefix_list = [MODEL_FILE_PREFIX] + if pipeline_parallel: + file_prefix_list.extend([LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']) + for file_prefix in file_prefix_list: + ckpt_files = get_files_with_prefix(file_list, file_prefix) + assert len(ckpt_files) > 0, f'{dir} seems a bogus DeepSpeed checkpoint folder: Cannot find {file_prefix}* files in there.' 
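_get_layer_keys now parses the layer id out of each file name with LAYER_FILE_PREFIX_PATTERN and sorts numerically, rather than slicing a fixed-width prefix. A small standalone sketch with made-up file names shows why the key=int sort matters once layer ids exceed two digits:

    import re

    LAYER_FILE_PREFIX = 'layer_'
    LAYER_FILE_PREFIX_PATTERN = r'layer_(\d+)-model_.*'

    fnames = ['layer_100-model_00-model_states.pt',
              'layer_99-model_00-model_states.pt',
              'layer_01-model_00-model_states.pt']

    ids = {re.search(LAYER_FILE_PREFIX_PATTERN, f).group(1) for f in fnames}
    layer_keys = [LAYER_FILE_PREFIX + i for i in sorted(ids, key=int)]
    print(layer_keys)   # ['layer_01', 'layer_99', 'layer_100']; a plain string sort would put 100 before 99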
+ + +class NeoxCheckpoint(DeepSpeedCheckpoint): + def _validate_folder(self, dir): basic_folder_validation(dir) file_list = get_files(dir) - for file_prefix in [MODEL_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']: + for file_prefix in [ + MODEL_FILE_PREFIX, + LAYER_FILE_PREFIX + ]: ckpt_files = get_files_with_prefix(file_list, file_prefix) assert len( ckpt_files diff --git a/deepspeed/checkpoint/ds_to_universal.py b/deepspeed/checkpoint/ds_to_universal.py index 8be187aa89c2..f7b75eee66d0 100755 --- a/deepspeed/checkpoint/ds_to_universal.py +++ b/deepspeed/checkpoint/ds_to_universal.py @@ -6,35 +6,44 @@ # DeepSpeed Team from functools import partial +from itertools import chain import argparse import glob import itertools -import multiprocessing +import math +from concurrent.futures import ProcessPoolExecutor import os import re import shutil import torch import tqdm -# from pprint import pprint +#from pprint import pprint from deepspeed.checkpoint import DeepSpeedCheckpoint from deepspeed.checkpoint import ( OPTIMIZER_STATE_DICT, + ZERO_STAGE, BASE_OPTIMIZER_STATE, SINGLE_PARTITION_OF_FP32_GROUPS, + PARAM_GROUPS, PARAM_SLICE_MAPPINGS, PARAM_SHAPES, PARAM, CAT_DIM, PARAM_N_SUB_PARAMS, + SUB_PARAM_SHAPE, VOCAB_TENSOR, UNIVERSAL_CHECKPOINT_INFO, + UNIVERSAL_CHECKPOINT_VERSION_KEY, + UNIVERSAL_CHECKPOINT_VERSION_VALUE, VOCABULARY_PARAMETER_PATTERNS, PIPELINE_REPLICATED_PARAMETER_PATTERNS, TP_REPLICATED_PARAMETER_PATTERNS, PARAMETER_TO_AVERAGE_PATTERNS, PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, + PARAMETER_WITH_SUB_PARAMS, + SubparamShape, ) @@ -60,11 +69,27 @@ def parse_arguments(): dest='strict', action='store_false', help='Do not perform validity checks on converted checkpoint.') + parser.add_argument('--inject_missing_state', + action='store_true', + help='Inject missing checkpoint state into the checkpoint if it is absent.') args = parser.parse_args() print(f'args = {args}') return args +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + def _create_checkpoint_paths(base_folder, iteration, tp_degree, pp_degree): path_list = [] iter_folder = f'iter_{iteration:07d}' @@ -110,6 +135,9 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): fp32=fp32_groups[param_group_id], ) + if "step" in state_groups[param_group_id]: + flat_state["step"] = state_groups[param_group_id]["step"] + for name, fragment_mapping in param_slice_mappings[param_group_id].items(): if pp_index > 0 and any(re.match(pattern, name) for pattern in pipeline_replicated_params): # Skip tied weights that are replicated in first and last pp stages @@ -121,9 +149,33 @@ def extract_zero_shards(dir, ds_checkpoint, indices_3D): fragment_mapping.start, fragment_mapping.numel) +def extract_zero_shards_stage3(optim_files, param_shapes, dp_degree, temp_dir, dp_index): + state_dict = torch.load(optim_files[dp_index], map_location='cpu', weights_only=False) + + flat_state = dict( + exp_avg=state_dict[OPTIMIZER_STATE_DICT]['optimizer_state_dict']['state'][0]["exp_avg"], + exp_avg_sq=state_dict[OPTIMIZER_STATE_DICT]['optimizer_state_dict']['state'][0]["exp_avg_sq"], + fp32=state_dict[OPTIMIZER_STATE_DICT]['fp32_flat_groups'][0], + ) + + offset = 0 + for name, shape in param_shapes.items(): + unpartitioned_numel = 
shape.numel() + partitioned_numel, _ = _zero_partitioned_param_info(unpartitioned_numel, dp_degree) + padding_free_numel = min(partitioned_numel, abs(unpartitioned_numel - dp_index * partitioned_numel)) + for state_key in flat_state.keys(): + dump_param_fragment(temp_dir, 0, dp_index, state_key, flat_state[state_key], name, offset, + padding_free_numel) + offset += partitioned_numel + + cnt = 0 +def dp_index_to_str(dp_index): + return f"{dp_index:0>2d}" + + def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, param_name, offset, numel): global cnt # temp hack @@ -132,23 +184,47 @@ def dump_param_fragment(dir, tp_index, dp_index, state_name, state_flat_tensor, os.makedirs(param_base_path, exist_ok=True) cnt += 1 - counter = f"{dp_index:0>2d}" - path = os.path.join(param_base_path, f"{state_name}.{counter}") + path = os.path.join(param_base_path, f"{state_name}.{dp_index_to_str(dp_index)}") #print(f"{param_name}: {offset}: {numel} => {path}") - t = state_flat_tensor.narrow(0, offset, numel).clone() - _save_checkpoint(path, t) + # State might be a python int or a tensor + if state_name != "step" and torch.is_tensor(state_flat_tensor): + state_flat_tensor = state_flat_tensor.narrow(0, offset, numel).clone() + _save_checkpoint(path, state_flat_tensor) -def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape): +def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape=None): slices = [] for tp_index in range(tp_degree): prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}") - paths = sorted(list(glob.glob(f"{prefix_path}.*"))) - shards = [torch.load(p) for p in paths] - slice = torch.cat(shards, dim=0).reshape(slice_shape) + paths = glob.glob(f"{prefix_path}.*") + + if len(paths) == 0: + continue + + pattern = re.compile(f"{prefix_path}\\.([0-9]+)") + dp_indices = set() + for p in paths: + m = pattern.match(p) + if m: + dp_indices.add(int(m.group(1))) + else: + raise ValueError(f"Cannot parse dp_rank from {p}") + + paths = [f"{prefix_path}.{dp_index_to_str(dp_index)}" for dp_index in sorted(list(dp_indices))] + shards = [torch.load(p, weights_only=False) for p in paths] + + if state == "step": + assert all(v == shards[0] for v in shards), "All shards must have the same step value" + slice = shards[0] + else: + if slice_shape is None: + slice = torch.cat(shards, dim=0) + else: + slice = torch.cat(shards, dim=0).reshape(slice_shape) + slices.append(slice) return slices @@ -165,8 +241,11 @@ def merge_tp_slices(ds_checkpoint, dir, slice_dir, tp_degree, name_and_shape): parameters_with_row_parallelism = universal_checkpoint_info.get(PARAMETER_WITH_ROW_PARALLELISM_PATTERNS, []) vocabulary_parameters = universal_checkpoint_info.get(VOCABULARY_PARAMETER_PATTERNS, []) parameters_with_2_sub_params_cat_dim_0 = universal_checkpoint_info.get(PARAMETER_WITH_2_SUB_PARAMS_CAT_DIM_0, []) + parameter_with_sub_params = universal_checkpoint_info.get(PARAMETER_WITH_SUB_PARAMS, []) + unmatched_patterns = set(replicated_parameters + parameters_to_average + parameters_with_row_parallelism + vocabulary_parameters + parameters_with_2_sub_params_cat_dim_0) + unmatched_patterns.update(chain.from_iterable(SubparamShape(**s).patterns for s in parameter_with_sub_params)) def get_matched_pattern(patterns_, name_): matched_ = [pattern_ for pattern_ in patterns_ if re.match(pattern_, name_)] @@ -177,6 +256,21 @@ def get_matched_pattern(patterns_, name_): return pattern_ return None + def get_matched_sub_params_pattern(name_): + for subparam_shape_dict in 
parameter_with_sub_params: + subparam_shape = SubparamShape(**subparam_shape_dict) + for pattern_ in subparam_shape.patterns: + if re.match(pattern_, name_): + unmatched_patterns.discard(pattern_) + return subparam_shape + return None + + matched_sub_params_shape = get_matched_sub_params_pattern(name) + + step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape) + if step_merged: + _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0]) + for state in ("fp32", "exp_avg", "exp_avg_sq"): slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape) final_path = os.path.join(param_base_path, f"{state}.pt") @@ -200,6 +294,26 @@ def get_matched_pattern(patterns_, name_): param = torch.cat([merged_chunks_0, merged_chunks_1], dim=cat_dim) ckpt_dict[CAT_DIM] = cat_dim ckpt_dict[PARAM_N_SUB_PARAMS] = 2 + elif matched_sub_params_shape: + merged_chunks = [] + partition_dim = matched_sub_params_shape.partition_dim + + sub_dim_sizes = matched_sub_params_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in matched_sub_params_shape.shape] + partition_shape = [d // tp_degree if i == partition_dim else d for i, d in enumerate(partition_shape)] + slices = [s.view(partition_shape) for s in slices] + + offset = 0 + for sub_dim_size in sub_dim_sizes: + part_sub_dim_size = sub_dim_size // tp_degree + merged_chunks.append( + torch.cat([s.narrow(partition_dim, offset, part_sub_dim_size) for s in slices], dim=partition_dim)) + offset += part_sub_dim_size + param = torch.cat(merged_chunks, dim=partition_dim) + ckpt_dict[SUB_PARAM_SHAPE] = matched_sub_params_shape else: cat_dim = 1 if get_matched_pattern(parameters_with_row_parallelism, name) else 0 # print(f"merge {name} with CAT DIM: {cat_dim}") @@ -221,19 +335,28 @@ def get_matched_pattern(patterns_, name_): return unmatched_patterns -def _get_chunks(l, n): - for i in range(0, len(l), n): - yield l[i:i + n] +def merge_zero3_slices(dp_degree, dir, slice_dir, name): + slice_base_path = os.path.join(slice_dir, name) + param_base_path = os.path.join(dir, name) + + for state in ("fp32", "exp_avg", "exp_avg_sq"): + slices = _merge_zero_shards(slice_base_path, state, 1) + final_path = os.path.join(param_base_path, f"{state}.pt") + _save_checkpoint(final_path, slices[0]) def _do_parallel_work(do_work, work_chunks, num_workers): - pool = multiprocessing.Pool(num_workers) results = [] - for batch in tqdm.tqdm(work_chunks): - res = pool.map(do_work, batch) - results.extend(res) - pool.close() - pool.join() + if num_workers > 1: + with ProcessPoolExecutor(max_workers=num_workers) as executor: + future_list = [executor.submit(do_work, work) for work in work_chunks] + for f in tqdm.tqdm(future_list): + results.append(f.result()) + else: + # No parallel pass for unit testing + # We can't create child processes in tests + for work in tqdm.tqdm(work_chunks): + results.append(do_work(work)) return results @@ -241,21 +364,21 @@ def _extract_zero_shard_files(args, ds_checkpoint, temp_dir): _3d_range_list = list( itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree), range(ds_checkpoint.dp_degree))) - # pprint(f'{_3d_range_list=}') - work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers)) - # pprint(f'{work_chunks=}') + #pprint(f'{_3d_range_list=}') - # extract_zero_shards(temp_dir, ds_checkpoint, _3d_range_list[0]) do_work = partial(extract_zero_shards, temp_dir, 
ds_checkpoint) - _do_parallel_work(do_work, work_chunks, args.num_extract_workers) + _do_parallel_work(do_work, _3d_range_list, args.num_extract_workers) + + +def _extract_zero_shard_files_stage3(args, optim_files, param_shapes, dp_degree, temp_dir): + do_work = partial(extract_zero_shards_stage3, optim_files, param_shapes, dp_degree, temp_dir) + _do_parallel_work(do_work, list(range(dp_degree)), args.num_extract_workers) def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): - work_chunks = list(_get_chunks(list(slice_shapes.items()), args.num_merge_workers)) - #pprint(work_chunks) zero_output_folder = os.path.join(args.output_folder, "zero") do_work = partial(merge_tp_slices, ds_checkpoint, zero_output_folder, temp_dir, ds_checkpoint.tp_degree) - unmatched_patterns_lists = _do_parallel_work(do_work, work_chunks, args.num_merge_workers) + unmatched_patterns_lists = _do_parallel_work(do_work, list(slice_shapes.items()), args.num_merge_workers) # verify that all patterns were used # if a pattern was not used by any of the workers, then it was not used at all -> assert/alert @@ -267,60 +390,150 @@ def _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir): print(f'Warning: Unused patterns={unmatched_patterns} while merging tp slices') +def _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir): + zero_output_folder = os.path.join(args.output_folder, "zero") + do_work = partial(merge_zero3_slices, dp_degree, zero_output_folder, temp_dir) + _do_parallel_work(do_work, param_shapes.keys(), args.num_merge_workers) + + +def _zero_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _parse_model_states_stage3(files): + return torch.load(files[0], map_location=torch.device('cpu'), weights_only=False)[PARAM_SHAPES] + + def _save_optimizer_state(args, ds_checkpoint): sharded_states = [BASE_OPTIMIZER_STATE, PARAM_SLICE_MAPPINGS, SINGLE_PARTITION_OF_FP32_GROUPS] sd = ds_checkpoint.get_zero_checkpoint_state(pp_index=0, tp_index=0, dp_index=0) optim_sd = sd[OPTIMIZER_STATE_DICT] output_sd = {k: v for k, v in optim_sd.items() if k not in sharded_states} + output_sd[PARAM_GROUPS] = optim_sd[BASE_OPTIMIZER_STATE][PARAM_GROUPS] + zero_output_folder = os.path.join(args.output_folder, "zero") + output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") + _save_checkpoint(output_file_path, output_sd) + + +def _save_optimizer_state_stage3(args, optim_files): + sd = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False) + output_sd = sd[OPTIMIZER_STATE_DICT] + output_sd[PARAM_GROUPS] = output_sd[OPTIMIZER_STATE_DICT][PARAM_GROUPS] zero_output_folder = os.path.join(args.output_folder, "zero") output_file_path = os.path.join(zero_output_folder, f"optimizer_state.pt") _save_checkpoint(output_file_path, output_sd) +def _get_optim_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def _get_model_state_files(checkpoint_dir): + return _get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def _get_checkpoint_files(checkpoint_dir, glob_pattern): + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + 
+ return ckpt_files + + +def _get_zero_stage(optim_files): + state_dict = torch.load(optim_files[0], map_location=torch.device('cpu'), weights_only=False) + optimizer_state = state_dict[OPTIMIZER_STATE_DICT] + zero_stage = optimizer_state.get(ZERO_STAGE, 1) + return zero_stage + + +def _inject_missing_state(ds_checkpoint): + if UNIVERSAL_CHECKPOINT_INFO not in ds_checkpoint.global_state: + sd = torch.load(ds_checkpoint.mp_rank_files[0], map_location=torch.device('cpu'), weights_only=False) + if UNIVERSAL_CHECKPOINT_INFO not in sd: + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO] = {} + ds_checkpoint.global_state[UNIVERSAL_CHECKPOINT_INFO][ + UNIVERSAL_CHECKPOINT_VERSION_KEY] = UNIVERSAL_CHECKPOINT_VERSION_VALUE + + def _check_for_required_state(ds_checkpoint): universal_checkpoint_info = ds_checkpoint.get_checkpoint_info(UNIVERSAL_CHECKPOINT_INFO) assert universal_checkpoint_info is not None, f'Required {UNIVERSAL_CHECKPOINT_INFO} state is missing in checkpoint. Verify that client creates this state.' -def main(): +def main(args): print(f'Convert DeepSpeed Checkpoint to Universal Checkpoint') - args = parse_arguments() print(f'Converting DeepSpeed checkpoint in {args.input_folder} to Universal checkpoint in {args.output_folder}') - ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) - _check_for_required_state(ds_checkpoint) + optim_files = _get_optim_files(args.input_folder) + zero_stage = _get_zero_stage(optim_files) + + if zero_stage <= 2: + ds_checkpoint = DeepSpeedCheckpoint(args.input_folder) + if args.inject_missing_state: + _inject_missing_state(ds_checkpoint) + else: + _check_for_required_state(ds_checkpoint) + + iteration = ds_checkpoint.get_iteration() + #_create_latest_file(args.output_folder, iteration) + checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, + ds_checkpoint.pp_degree) - iteration = ds_checkpoint.get_iteration() - #_create_latest_file(args.output_folder, iteration) - checkpoint_paths = _create_checkpoint_paths(args.output_folder, iteration, ds_checkpoint.tp_degree, - ds_checkpoint.pp_degree) + slice_shapes = [] + for mp_rank_file in ds_checkpoint.mp_rank_files: + mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu'), weights_only=False) + slice_shapes += mp_sd[PARAM_SHAPES] - slice_shapes = [] - for mp_rank_file in ds_checkpoint.mp_rank_files: - mp_sd = torch.load(mp_rank_file, map_location=torch.device('cpu')) - slice_shapes += mp_sd[PARAM_SHAPES] + # fix back to normal flat dict, merge duplicates for tp>1 + slice_shapes = dict((k, v) for d in slice_shapes for k, v in d.items()) + temp_dir = os.path.join(args.output_folder, 'tmp') - # fix back to normal flat dict, merge duplicates for tp>1 - slice_shapes = dict((k, v) for d in slice_shapes for k, v in d.items()) - temp_dir = os.path.join(args.output_folder, 'tmp') + print('*** 1. Extracting ZeRO fragments') + _extract_zero_shard_files(args, ds_checkpoint, temp_dir) - print('*** 1. Extracting ZeRO fragments') - _extract_zero_shard_files(args, ds_checkpoint, temp_dir) + print('*** 2. Merging slices .....') + _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) - print('*** 2. Merging slices') - _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir) + print('*** 3. Saving common optimizer states') + _save_optimizer_state(args, ds_checkpoint) - print('*** 3. 
Saving common optimizer states') - _save_optimizer_state(args, ds_checkpoint) + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) - if not args.keep_temp_folder: - shutil.rmtree(temp_dir, ignore_errors=True) + # Copy mp* files into output folder + for f in glob.glob(os.path.join(args.input_folder, 'mp*')): + shutil.copy2(f, args.output_folder) - # Copy mp* files into output folder - for f in glob.glob(os.path.join(args.input_folder, 'mp*')): - shutil.copy2(f, args.output_folder) + else: + model_files = _get_model_state_files(args.input_folder) + param_shapes = _parse_model_states_stage3(model_files) + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + dp_degree = len(model_files) + + temp_dir = os.path.join(args.output_folder, 'tmp') + + print('*** 1. Extracting ZeRO fragments') + _extract_zero_shard_files_stage3(args, optim_files, param_shapes, dp_degree, temp_dir) + + print('*** 2. Merging slices .....') + _merge_zero3_slice_files(args, param_shapes, dp_degree, temp_dir) + + print('*** 3. Saving common optimizer states') + _save_optimizer_state_stage3(args, optim_files) + + if not args.keep_temp_folder: + shutil.rmtree(temp_dir, ignore_errors=True) + + # Copy *model_states files into output folder + for f in glob.glob(os.path.join(args.input_folder, '*model_states.pt')): + shutil.copy2(f, args.output_folder) # Update latest to output folder checkpoint_root_folder, step_folder = os.path.split(args.output_folder) @@ -332,4 +545,5 @@ def main(): if __name__ == "__main__": - main() + args = parse_arguments() + main(args) diff --git a/deepspeed/checkpoint/reshape_3d_utils.py b/deepspeed/checkpoint/reshape_3d_utils.py index b5bf41e2d160..76e21892c482 100644 --- a/deepspeed/checkpoint/reshape_3d_utils.py +++ b/deepspeed/checkpoint/reshape_3d_utils.py @@ -70,10 +70,20 @@ def can_reshape(self, target_3d_desc): return len(err_msg) == 0, err_msg +def get_num_pp0_files(file_list): + num_layer1_files = len(get_files_with_prefix(file_list, f'{LAYER_FILE_PREFIX}01')) + num_layer2_files = len(get_files_with_prefix(file_list, f'{LAYER_FILE_PREFIX}02')) + if (num_layer1_files > 0) or (num_layer2_files == 0): + return num_layer1_files + else: + return num_layer2_files + + + def get_model_3d_descriptor(dir): file_list = get_files(dir) zero_file_list = get_zero_files(dir) - num_pp0_files = len(get_files_with_prefix(file_list, f'{LAYER_FILE_PREFIX}01')) + num_pp0_files = get_num_pp0_files(file_list) if num_pp0_files > 0: tp_degree = num_pp0_files pp_degree = len(get_files_with_prefix(file_list, MODEL_FILE_PREFIX)) // tp_degree @@ -81,7 +91,7 @@ def get_model_3d_descriptor(dir): else: tp_degree = len(get_files_with_prefix(file_list, MODEL_FILE_PREFIX)) dp_degree = max(1, len(zero_file_list) // tp_degree) - pp_degree = 0 + pp_degree = 1 return model_3d_desc(pp_degree, tp_degree, dp_degree) diff --git a/deepspeed/checkpoint/reshape_utils.py b/deepspeed/checkpoint/reshape_utils.py index 15b6ce28b2fd..85e124dbe3bc 100644 --- a/deepspeed/checkpoint/reshape_utils.py +++ b/deepspeed/checkpoint/reshape_utils.py @@ -4,9 +4,10 @@ # DeepSpeed Team import os +import re import torch from collections import OrderedDict -from .constants import (ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX) +from .constants import (ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX, MODEL_FILE_PREFIX) def basic_folder_validation(dir): @@ -38,12 +39,28 @@ def get_files(dir): return file_list +def sort_zero_files(files, prefix): + pattern = 
f"{prefix}([0-9]+)_{MODEL_FILE_PREFIX}([0-9]+)" + rank_pairs = [] + for f in files: + m = re.search(pattern, f) + if m: + dp_rank = int(m.group(1)) + mp_rank = int(m.group(2)) + rank_pairs.append((dp_rank, mp_rank, f)) + else: + raise ValueError(f"Cannot parse dp_rank and mp_rank from {f}") + + sorted_files = sorted(rank_pairs, key=lambda x: (x[0], x[1])) + return [f for _, _, f in sorted_files] + + def get_zero_files(dir): file_list = get_files(dir) for prefix in [ZERO_FILE_PREFIX, FP16_ZERO_FILE_PREFIX, BF16_ZERO_FILE_PREFIX]: zero_files = get_files_with_prefix(file_list, prefix) if len(zero_files) > 0: - return zero_files + return sort_zero_files(zero_files, prefix) return [] @@ -60,12 +77,25 @@ def _key_list_to_string(key_list): return '.'.join(key_list) +def _to_list_if_int(list_or_int): + if isinstance(list_or_int, int): + return [list_or_int] + else: + return list_or_int + + def merge_state_dict(dict_a, dict_b, key_list): merged_dict = type(dict_a)({}) for key, value in dict_b.items(): if key in dict_a.keys(): - merged_dict[key] = merge_state(dict_a[key], dict_b[key], [str(key)]) + # TODO: Fix ugliest hack ever + if key == PARTITION_COUNT: + count_a = _to_list_if_int(dict_a[key]) + count_b = _to_list_if_int(dict_b[key]) + merged_dict[key] = merge_state(count_a, count_b, [str(key)]) + else: + merged_dict[key] = merge_state(dict_a[key], dict_b[key], [str(key)]) else: merged_dict[key] = value diff --git a/deepspeed/checkpoint/universal_checkpoint.py b/deepspeed/checkpoint/universal_checkpoint.py index 5849a834cdd3..266d5a063595 100644 --- a/deepspeed/checkpoint/universal_checkpoint.py +++ b/deepspeed/checkpoint/universal_checkpoint.py @@ -4,23 +4,42 @@ # DeepSpeed Team import os +import re import torch import types -from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS) +from typing import List, Tuple, Union +from dataclasses import dataclass +from .constants import (FP32_WEIGHT_KEY, PARAM, VOCAB_TENSOR, CAT_DIM, PARAM_N_SUB_PARAMS, SUB_PARAM_SHAPE) + + +@dataclass +class SubparamShape: + patterns: List[str] + shape: Tuple[Union[Tuple[int], int]] + partition_dim: int def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): hp_mapping = self._hp_mapping - optim_state_keys = hp_mapping.get_optim_state_keys() - hp_keys = [FP32_WEIGHT_KEY] + optim_state_keys - checkpoint_files = {key: os.path.join(folder, f"{key}.pt") for key in hp_keys} + hp_mapping.optim_fragment = {} - for file in checkpoint_files.values(): - assert os.path.isfile(file), f'{file} is not a valid file' + hp_keys = [] + for file in os.listdir(folder): + # We expect files named something like "exp_avg.pt", "exp_avg_sq.pt", "fp32.pt" + pattern = r'(.+).pt' + match = re.search(pattern, file) + if match: + hp_keys.append(match.group(1)) + step = None for key in hp_keys: - ckpt_file = checkpoint_files[key] - ckpt_dict = torch.load(ckpt_file) + ckpt_file = os.path.join(folder, f"{key}.pt") + ckpt_dict = torch.load(ckpt_file, weights_only=False) + + if key == "step": + step = ckpt_dict + continue + full_hp_param = ckpt_dict[PARAM] # need to deal with slices that were averaged. 
@@ -62,17 +81,36 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): assert full_param_numel == tp_world_size * tp_slice_numel, \ f'Loading {ckpt_file} full param numel {full_param_numel} != tensor slice numel {tp_slice_numel} * tp_world_size {tp_world_size}' - dst_tensor = hp_mapping.hp_fragment if key == FP32_WEIGHT_KEY else hp_mapping.get_optim_state_fragment(key) # print(f"{full_hp_param.shape=} {full_param_numel=} {folder=}") # print(f"{dst_tensor.shape=} {dst_tensor.numel()=}{folder=}") + sub_param_shape = ckpt_dict.get(SUB_PARAM_SHAPE, None) # since when we do many to 1 on tp we cat sometimes on dim=0 and other times on dim=1 we have to do exactly the same in reverse # special case is when a single parameter is effectively a container for multiple sub parameters # (more details at PARAM_N_SUB_PARAMS definition) chunk_dim = ckpt_dict.get(CAT_DIM, 0) n_sub_params = ckpt_dict.get(PARAM_N_SUB_PARAMS, 1) - if n_sub_params > 1: + if sub_param_shape: + partition_dim = sub_param_shape.partition_dim + sub_dim_sizes = sub_param_shape.shape[partition_dim] + if not isinstance(sub_dim_sizes, tuple): + sub_dim_sizes = (sub_dim_sizes, ) + + partition_shape = [sum(d) if isinstance(d, tuple) else d for d in sub_param_shape.shape] + full_hp_param = full_hp_param.view(partition_shape) + + offset = 0 + merged_chunks = [] + for sub_dim_size in sub_dim_sizes: + sub_params_tp_slice = full_hp_param.narrow(partition_dim, + offset, sub_dim_size).chunk(tp_world_size, + dim=partition_dim)[tp_rank] + merged_chunks.append(sub_params_tp_slice) + offset += sub_dim_size + tp_hp_slice = torch.cat(merged_chunks, dim=partition_dim) + + elif n_sub_params > 1: sub_params = full_hp_param.chunk(n_sub_params, dim=chunk_dim) sub_params_tp_slice = [p.chunk(tp_world_size, dim=chunk_dim)[tp_rank] for p in sub_params] tp_hp_slice = torch.cat(sub_params_tp_slice, dim=chunk_dim) @@ -84,13 +122,23 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size): lp_frag_address = hp_mapping.lp_fragment_address tp_hp_fragment = tp_hp_slice.narrow(0, lp_frag_address.start, lp_frag_address.numel) - assert dst_tensor.numel() == lp_frag_address.numel, \ - f'Load checkpoint {key} dst_tensor numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' # print(f"{key} SHAPE: {tp_hp_slice.shape=}") # print(f"{key} SHAPE: {dst_tensor.shape=}") # print(f"{key} SHAPE: {tp_hp_fragment.shape=}") - dst_tensor.data.copy_(tp_hp_fragment.data) + + if key == FP32_WEIGHT_KEY: + dst_tensor = hp_mapping.get_hp_fragment() + assert dst_tensor.numel() == lp_frag_address.numel, \ + f'Load checkpoint {key} dst numel {dst_tensor.numel()} != src numel {lp_frag_address.numel}' + dst_tensor.data.copy_(tp_hp_fragment.data) + else: + assert tp_hp_fragment.numel() == lp_frag_address.numel, \ + f'Load checkpoint {key} dst numel {tp_hp_fragment.numel()} != src numel {lp_frag_address.numel}' + + hp_mapping.optim_fragment[key] = tp_hp_fragment.clone().detach() + + return step def enable_universal_checkpoint(param_list): diff --git a/deepspeed/checkpoint/utils.py b/deepspeed/checkpoint/utils.py index c305e8884e83..5964da00728e 100644 --- a/deepspeed/checkpoint/utils.py +++ b/deepspeed/checkpoint/utils.py @@ -51,7 +51,12 @@ def clone_tensors_for_torch_save(item, device=torch.device('cpu')): - copy of ``item`` with cloned tensors on target device """ if torch.is_tensor(item): - return item.detach().clone().to(device) + if type(device) is str: + device = torch.device(device) + if device == item.device: + return item.detach().clone() + else: + 
return item.detach().to(device) elif isinstance(item, list): return [clone_tensors_for_torch_save(v, device) for v in item] elif isinstance(item, tuple): diff --git a/deepspeed/checkpoint/zero_checkpoint.py b/deepspeed/checkpoint/zero_checkpoint.py index c65745d3dd0c..b5721555b07a 100644 --- a/deepspeed/checkpoint/zero_checkpoint.py +++ b/deepspeed/checkpoint/zero_checkpoint.py @@ -54,7 +54,7 @@ def get_state_for_rank(self, pp_index, tp_index, dp_index, keys_to_ignore=[], st state_file_list = self.get_files_for_rank(pp_index, tp_index, dp_index) merged_sd = None for state_file in state_file_list: - sd = torch.load(state_file, map_location=torch.device('cpu')) + sd = torch.load(state_file, map_location=torch.device('cpu'), weights_only=False) for key in keys_to_ignore: sd.pop(key, None) @@ -105,9 +105,11 @@ def _strip_tensor_paddings(self, sd): if group_paddings[key] == 0: continue for state_name, state_value in group_state.items(): - if torch.is_tensor(state_value): + if state_name != "step" and torch.is_tensor(state_value): raw_length = state_value.numel() - group_paddings[key] group_state[state_name] = torch.narrow(state_value, 0, 0, raw_length).clone() + else: + group_state[state_name] = state_value def _clear_group_paddings(self, sd): group_paddings = self._get_optimizer_state(sd, GROUP_PADDINGS) @@ -128,7 +130,7 @@ def _get_param_group_states(self, sd): return None base_optimizer_state = optimizer_state.get(BASE_OPTIMIZER_STATE, None) - if base_optimizer_state is None: + if base_optimizer_state is None or isinstance(base_optimizer_state, list): return None return base_optimizer_state.get(GROUP_STATE_KEY, None) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 6e915d0ca430..8753cf9f70ed 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -8,13 +8,14 @@ import torch from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import NotImplementedBuilder from .reduce_op import ReduceOp from .torch import TorchBackend def build_ccl_op(): builder = get_accelerator().create_op_builder("CCLCommBuilder") - if builder is None: + if builder is None or isinstance(builder, NotImplementedBuilder): return None ccl_cpp_module = builder.load() print(f'DeepSpeed {builder.absolute_name()} built successfully') @@ -61,7 +62,8 @@ def is_initialized(self): def run_collective(self, name, **kwargs): if name in self.available_coll: - kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) + if 'group' in kwargs: + kwargs['group'] = self.get_all_ranks_from_group(kwargs['group']) if 'dst' in kwargs: kwargs['dst'] = kwargs['group'].index(kwargs['dst']) if 'src' in kwargs: @@ -71,23 +73,38 @@ def run_collective(self, name, **kwargs): return CCLHandler(self.ccl_comm_op) else: func = "super(CCLBackend, self)." 
+ name - return eval(func)(*(kwargs.values())) + eval(func)(*(kwargs.values())) + return CCLHandler(self.ccl_comm_op) def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): use_caching = False if use_caching: match_id = f"{tensor.size()}-{op}" - return self.run_collective(name="all_reduce_caching", - tensor=tensor, - op=op, - match_id=match_id, - group=group, - async_op=async_op) + name = "all_reduce_caching" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op) + else: + return self.run_collective(name=name, + tensor=tensor, + op=op, + match_id=match_id, + group=group, + async_op=async_op) else: - return self.run_collective(name="all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) - - def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False): - return self.run_collective(name="inference_all_reduce", tensor=tensor, op=op, group=group, async_op=async_op) + name = "all_reduce" + if name in self.available_coll: + group = self.get_all_ranks_from_group(group) + return self.ccl_comm_op.all_reduce(tensor, op, group, async_op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op) + + def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None): + name = "inference_all_reduce" + if name in self.available_coll: + return self.ccl_comm_op.inference_all_reduce(tensor, op) + else: + return self.run_collective(name=name, tensor=tensor, op=op, group=None, async_op=False) def broadcast(self, tensor, src, group=None, async_op=False): return self.run_collective(name="broadcast", tensor=tensor, src=src, group=group, async_op=async_op) @@ -120,11 +137,11 @@ def all_to_all_single(self, output, input, output_split_sizes, input_split_sizes input_split_sizes=input_split_sizes, group=group) - def send(self, tensor, dst, group=None, async_op=False): - return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, async_op=async_op) + def send(self, tensor, dst, group=None, tag=0): + return self.run_collective(name="send", tensor=tensor, dst=dst, group=group, tag=tag) - def recv(self, tensor, src, group=None, async_op=False): - return self.run_collective(name="recv", tensor=tensor, src=src, group=group, async_op=async_op) + def recv(self, tensor, src, group=None, tag=0): + return self.run_collective(name="recv", tensor=tensor, src=src, group=group, tag=tag) def gather(self, tensor, gather_list, dst, group=None, async_op=False): return self.run_collective(name="gather", tensor=tensor, gather_list=gather_list, dst=dst, group=group) @@ -170,7 +187,7 @@ def get_all_ranks_from_group(self, group): while True: results.append(super(CCLBackend, self).get_global_rank(group, rank)) rank += 1 - except ValueError: + except (ValueError, RuntimeError): pass if tuple(results) not in self.groups: self._new_group(results, group) diff --git a/deepspeed/comm/comm.py b/deepspeed/comm/comm.py old mode 100644 new mode 100755 index 568211645f40..800bc6078c3a --- a/deepspeed/comm/comm.py +++ b/deepspeed/comm/comm.py @@ -160,7 +160,7 @@ def init_deepspeed_backend(ds_backend, timeout, init_method): ccl_backend = CCLBackend(rank=rank, world_size=size, timeout=timeout, init_method=init_method) utils.logger.info(f"Initialize {ds_backend} backend") elif ds_backend == HCCL_BACKEND: - utils.logger.warn("HCCL backend in DeepSpeed not yet implemented") + utils.logger.debug("HCCL backend in DeepSpeed not yet 
implemented") else: utils.logger.debug(f"DeepSpeed does not support {ds_backend} backend") @@ -224,6 +224,12 @@ def broadcast(tensor, src, group=None, async_op=False, prof=False, log_name='bro return cdb.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) +@timed_op +def broadcast_object_list(object_list, src, group=None, device=None): + global cdb + return cdb.broadcast_object_list(object_list=object_list, src=src, group=group, device=device) + + @timed_op def all_gather(tensor_list, tensor, @@ -505,7 +511,7 @@ def inference_all_reduce(tensor, log_name='all_reduce', debug=get_caller_func()): global cdb - return cdb.inference_all_reduce(tensor, op, group, async_op) + return cdb.inference_all_reduce(tensor, op, group) @timed_op @@ -595,11 +601,26 @@ def get_all_ranks_from_group(group=None): while True: group_ranks.append(cdb.get_global_rank(group, rank)) rank += 1 - except RuntimeError: + except (RuntimeError, ValueError): pass return group_ranks +def initialize_mesh_device(mesh_shape, mesh_dim_names): + global cdb + assert cdb is not None and cdb.is_initialized( + ), 'DeepSpeed backend not set, please initialize it using init_process_group()' + mesh_device = None + if hasattr(cdb, 'init_device_mesh'): + utils.logger.info(f"Initializing mesh device with backend {cdb.name} \ + with shape {mesh_shape} and dim names {mesh_dim_names}") + mesh_device = cdb.init_device_mesh(mesh_shape, mesh_dim_names) + else: + if get_rank() == 0: + utils.logger.warning_once(f"Backend {cdb.name} does not support mesh device initialization") + return mesh_device + + # Main DeepSpeed Comms. public API. def init_distributed(dist_backend=None, auto_mpi_discovery=True, @@ -614,11 +635,11 @@ def init_distributed(dist_backend=None, ''' Initialize dist backend, potentially performing MPI discovery if needed Arguments: - dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo + dist_backend: Optional (str). torch distributed backend, e.g., nccl, mpi, gloo, hccl auto_mpi_discovery Optional (bool). if distributed environment variables are not set, attempt to discover them from MPI distributed_port: Optional (int). torch distributed backend port verbose: Optional (bool). verbose logging - timeout: Optional (timedelta). Timeout for operations executed against the process group. Default value equals 30 minutes. + timeout: Optional (timedelta). Timeout for operations executed against the process group. The default value of 30 minutes can be overridden by the environment variable `DEEPSPEED_TIMEOUT`. init_method: Optional (string). Torch distributed, URL specifying how to initialize the process group. Default is “env://” if no init_method or store is specified. config: Optional (dict). DeepSpeed configuration for setting up comms options (e.g. Comms profiling) rank: Optional (int). The current manually specified rank. Some init_method like “tcp://” need the rank and world_size as well (see: https://pytorch.org/docs/stable/distributed.html#tcp-initialization) @@ -682,9 +703,14 @@ def mpi_discovery(distributed_port=TORCH_DISTRIBUTED_DEFAULT_PORT, verbose=True) master_addr = None if rank == 0: - hostname_cmd = ["hostname -I"] - result = subprocess.check_output(hostname_cmd, shell=True) - master_addr = result.decode('utf-8').split()[0] + import shlex + try: + hostname_cmd = shlex.split("hostname -I") + result = subprocess.check_output(hostname_cmd) + master_addr = result.decode('utf-8').split()[0] + except subprocess.CalledProcessError: # hostname -I not available (e.g. 
on macOS) + import socket + master_addr = socket.gethostbyname(socket.gethostname()) master_addr = comm.bcast(master_addr, root=0) # Determine local rank by assuming hostnames are unique diff --git a/deepspeed/comm/config.py b/deepspeed/comm/config.py index 1c441bb6bfe9..57501c9dd237 100644 --- a/deepspeed/comm/config.py +++ b/deepspeed/comm/config.py @@ -3,20 +3,12 @@ # DeepSpeed Team -from .constants import * -from ..pydantic_v1 import BaseModel - +from deepspeed.runtime.config_utils import DeepSpeedConfigModel -class CommsConfig(BaseModel): - - class Config: - validate_all = True - validate_assignment = True - use_enum_values = True - extra = 'forbid' +from .constants import * -class CommsLoggerConfig(CommsConfig): +class CommsLoggerConfig(DeepSpeedConfigModel): enabled: bool = COMMS_LOGGER_ENABLED_DEFAULT prof_all: bool = COMMS_LOGGER_PROF_ALL_DEFAULT prof_ops: list = COMMS_LOGGER_PROF_OPS_DEFAULT diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py old mode 100644 new mode 100755 index 531f54d2628f..efa0640fb87b --- a/deepspeed/comm/torch.py +++ b/deepspeed/comm/torch.py @@ -3,11 +3,14 @@ # DeepSpeed Team +import deepspeed from deepspeed import utils from .utils import * from .backend import * from .comm import * +from ..runtime import compiler +from deepspeed.utils.torch import required_torch_version import os DS_COMM_ALL_GATHER_OFF = False @@ -17,25 +20,19 @@ DS_COMM_REDUCE_OFF = False -def is_torch_ver_eq_2_0(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR == 2 and TORCH_MINOR == 0: - return True - return False +def disable_compiler_collective(func): + if required_torch_version(min_version=2.3): + return func + return compiler.disable(func) -def is_torch_ver_ge_2_1(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - return True - return False - - -def torch_ver_ge_1_13(): - TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[:2]) - if TORCH_MAJOR >= 1 and TORCH_MINOR >= 13: - return True - return False +def build_shm_op(): + builder = get_accelerator().create_op_builder("ShareMemCommBuilder") + if builder is None or not deepspeed.ops.__compatible_ops__[builder.NAME]: + return None + shm_cpp_module = builder.load() + print(f'DeepSpeed {builder.absolute_name()} built successfully') + return shm_cpp_module def has_coalescing_manager(): @@ -44,13 +41,13 @@ def has_coalescing_manager(): def has_all_reduce_coalesced(): - return hasattr(torch.distributed, "all_reduce_coalesced") and torch_ver_ge_1_13() + return hasattr(torch.distributed, "all_reduce_coalesced") and required_torch_version(min_version=1.13) def get_coalescing_manager(group, device, reqs, async_op): - if is_torch_ver_eq_2_0(): + if required_torch_version(min_version=2.0, max_version=2.0): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, reqs=reqs) - elif is_torch_ver_ge_2_1(): + elif required_torch_version(min_version=2.1): return torch.distributed.distributed_c10d._coalescing_manager(group, device=device, async_ops=async_op) else: return torch.distributed.distributed_c10d._coalescing_manager(group, reqs) @@ -107,6 +104,7 @@ class TorchBackend(Backend): def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name='torch'): super(TorchBackend, self).__init__() + self.shm_comm_op = build_shm_op() self.has_all_reduce_coalesced = has_all_reduce_coalesced() self.has_coalescing_manager = has_coalescing_manager() self.all_gather_function = 
self.get_all_gather_function() @@ -118,8 +116,11 @@ def __init__(self, backend, timeout, init_method, rank=-1, world_size=-1, name=' # it is not so we can run on a single GPU without doing any init_process_group self.single_gpu_mode = True self.init_process_group(backend, timeout, init_method, rank, world_size) + if self.shm_comm_op != None: + self.shm_comm_op.initialize(self.get_world_size(), self.get_rank()) @classmethod + @disable_compiler_collective def get_all_gather_function(self): if hasattr(torch.distributed, "all_gather_into_tensor"): return torch.distributed.all_gather_into_tensor @@ -128,6 +129,7 @@ def get_all_gather_function(self): return None @classmethod + @disable_compiler_collective def get_reduce_scatter_function(self): if hasattr(torch.distributed, "reduce_scatter_tensor"): return torch.distributed.reduce_scatter_tensor @@ -150,14 +152,19 @@ def init_process_group(self, backend, timeout, init_method, rank, world_size): world_size=world_size) self.using_mpi = torch.distributed.get_backend() == 'mpi' + @disable_compiler_collective def all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): op = self._reduce_op(op) return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) - def inference_all_reduce(self, tensor, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): - op = self._reduce_op(op) - return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=async_op) + def inference_all_reduce(self, tensor, op, group=None): + if not hasattr(torch.ops, 'deepspeed') or not hasattr(torch.ops.deepspeed, 'inference_all_reduce_'): + op = self._reduce_op(op) + return torch.distributed.all_reduce(tensor=tensor, op=op, group=group, async_op=False) + else: + return torch.ops.deepspeed.inference_all_reduce_(tensor) + @disable_compiler_collective def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group=None, async_op=False): """ proxy func to torch.distributed.all_reduce_coalesced, which is included in PyTorch 1.13 and above @@ -168,6 +175,7 @@ def all_reduce_coalesced(self, tensors, op=torch.distributed.ReduceOp.SUM, group op = self._reduce_op(op) return torch.distributed.all_reduce_coalesced(tensors=tensors, op=op, group=group, async_op=async_op) + @disable_compiler_collective def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): if DS_COMM_REDUCE_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -175,6 +183,7 @@ def reduce(self, tensor, dst, op=ReduceOp.SUM, group=None, async_op=False): return Noop() return torch.distributed.reduce(tensor=tensor, dst=dst, op=self._reduce_op(op), group=group, async_op=async_op) + @disable_compiler_collective def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_op=False): if DS_COMM_REDUCE_SCATTER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -187,6 +196,7 @@ def reduce_scatter(self, output, input_list, op=ReduceOp.SUM, group=None, async_ group=group, async_op=async_op) + @disable_compiler_collective def broadcast(self, tensor, src, group=None, async_op=False): if DS_COMM_BROADCAST_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -195,6 +205,11 @@ def broadcast(self, tensor, src, group=None, async_op=False): else: return torch.distributed.broadcast(tensor=tensor, src=src, group=group, async_op=async_op) + @disable_compiler_collective + def broadcast_object_list(self, object_list, src, group=None, device=None): + return torch.distributed.broadcast_object_list(object_list=object_list, src=src, 
group=group, device=device) + + @disable_compiler_collective def all_gather(self, tensor_list, tensor, group=None, async_op=False): if DS_COMM_ALL_GATHER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -203,6 +218,7 @@ def all_gather(self, tensor_list, tensor, group=None, async_op=False): else: return torch.distributed.all_gather(tensor_list=tensor_list, tensor=tensor, group=group, async_op=async_op) + @disable_compiler_collective def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_op=False): if self.has_all_gather_into_tensor(): return self.all_gather_function(output_tensor=output_tensor, @@ -210,6 +226,7 @@ def all_gather_into_tensor(self, output_tensor, input_tensor, group=None, async_ group=group, async_op=async_op) + @disable_compiler_collective def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=False): if DS_COMM_ALL_GATHER_OFF: if int(os.getenv('RANK', '0')) == 0: @@ -227,6 +244,7 @@ def all_gather_base(self, output_tensor, input_tensor, group=None, async_op=Fals "please consider upgrading your pytorch installation.") pass + @disable_compiler_collective def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_op=False): """""" assert len(output_tensors) == len(input_tensors), "" @@ -250,6 +268,7 @@ def all_gather_coalesced(self, output_tensors, input_tensors, group=None, async_ else: reqs[-1].wait() + @disable_compiler_collective def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, group=None, async_op=False): if self.has_reduce_scatter_tensor(): return self.reduce_scatter_function(output_tensor, @@ -263,6 +282,7 @@ def reduce_scatter_tensor(self, output_tensor, input_tensor, op=ReduceOp.SUM, gr "please consider upgrading your pytorch installation.") pass + @disable_compiler_collective def all_to_all_single(self, output, input, @@ -277,21 +297,27 @@ def all_to_all_single(self, group=group, async_op=async_op) + @disable_compiler_collective def all_to_all(self, output_tensor_list, input_tensor_list, group=None, async_op=False): return torch.distributed.all_to_all(output_tensor_list, input_tensor_list, group=group, async_op=async_op) + @disable_compiler_collective def send(self, tensor, dst, group=None, tag=0): return torch.distributed.send(tensor=tensor, dst=dst, group=group, tag=tag) + @disable_compiler_collective def recv(self, tensor, src=None, group=None, tag=0): return torch.distributed.recv(tensor=tensor, src=src, group=group, tag=tag) + @disable_compiler_collective def isend(self, tensor, dst, group=None, tag=0): return torch.distributed.isend(tensor=tensor, dst=dst, group=group, tag=tag) + @disable_compiler_collective def irecv(self, tensor, src=None, group=None, tag=0): return torch.distributed.irecv(tensor=tensor, src=src, group=group, tag=tag) + @disable_compiler_collective def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False): return torch.distributed.gather(tensor=tensor, gather_list=gather_list, @@ -299,6 +325,7 @@ def gather(self, tensor, gather_list=None, dst=0, group=None, async_op=False): group=group, async_op=async_op) + @disable_compiler_collective def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False): return torch.distributed.scatter(tensor=tensor, scatter_list=scatter_list, @@ -306,11 +333,13 @@ def scatter(self, tensor, scatter_list=None, src=0, group=None, async_op=False): group=group, async_op=async_op) + @disable_compiler_collective def barrier(self, group=torch.distributed.GroupMember.WORLD, async_op=False, 
device_ids=None): if group is None: group = torch.distributed.GroupMember.WORLD return torch.distributed.barrier(group=group, async_op=async_op, device_ids=device_ids) + @disable_compiler_collective def monitored_barrier(self, group=torch.distributed.GroupMember.WORLD, timeout=None, wait_all_ranks=False): if group is None: group = torch.distributed.GroupMember.WORLD @@ -367,6 +396,19 @@ def _reduce_op(self, op): op = torch.distributed.ReduceOp.BXOR return op + def init_device_mesh(self, mesh_shape, mesh_dim_names): + if not required_torch_version(min_version=2.2): + raise RuntimeError(f"Current torch version does not have device mesh" + f"api (torch.__version__: {torch.__version__})") + if not required_torch_version(max_version=2.4): + return torch.distributed.device_mesh.init_device_mesh(get_accelerator().device_name(), + mesh_shape, + mesh_dim_names=mesh_dim_names) + else: + return torch.distributed.device_mesh.init_device_mesh(get_accelerator().current_device_name(), + mesh_shape, + mesh_dim_names=mesh_dim_names) + # This will become a light-weight wrapper around torch.distributed functions # TODO: create some example to show how this wrapper can help profile communication diff --git a/deepspeed/elasticity/elastic_agent.py b/deepspeed/elasticity/elastic_agent.py index 91a217cb9971..8fd4293d312c 100644 --- a/deepspeed/elasticity/elastic_agent.py +++ b/deepspeed/elasticity/elastic_agent.py @@ -6,7 +6,7 @@ from torch.distributed.elastic.agent.server.local_elastic_agent import LocalElasticAgent from typing import Any, Dict, Optional, Tuple from datetime import datetime -from torch.distributed.elastic.agent.server.api import log, _get_socket_with_port +from torch.distributed.elastic.utils.distributed import get_free_port from torch.distributed.elastic.metrics import put_metric from torch.distributed.elastic.agent.server.api import ( RunResult, @@ -24,6 +24,10 @@ from contextlib import closing import subprocess +from torch.distributed.elastic.utils.logging import get_logger + +log = get_logger(__name__) + class DSElasticAgent(LocalElasticAgent): @@ -39,15 +43,20 @@ def __init__( self.ds_env = env @staticmethod - def _set_master_addr_port(store: Store, master_addr: Optional[str], master_port: Optional[int]): + def _set_master_addr_port(store: Store, + master_addr: Optional[str], + master_port: Optional[int], + local_addr: Optional[str] = None): if master_port is None: - sock = _get_socket_with_port() + sock = get_free_port() with closing(sock): master_port = sock.getsockname()[1] if master_addr is None: # master_addr = _get_fq_hostname() - result = subprocess.check_output("hostname -I", shell=True) + import shlex + safe_cmd = shlex.split("hostname -I") + result = subprocess.check_output(safe_cmd) master_addr = result.decode('utf-8').split()[0] store.set("MASTER_ADDR", master_addr.encode(encoding="UTF-8")) @@ -151,8 +160,8 @@ def _invoke_run(self, role: str = "default") -> RunResult: f" Waiting {self._exit_barrier_timeout} seconds for other agents to finish.") self._exit_barrier() return run_result - elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED - } or len(participants) > len(rdzv_handler._state_holder.state.participants): + elif state in {WorkerState.UNHEALTHY, WorkerState.FAILED} or len(participants) > len( + rdzv_handler._state_holder.state.participants): if self._remaining_restarts > 0: log.info(f"[{role}] Worker group {state.name}. 
" f"{self._remaining_restarts}/{spec.max_restarts} attempts left;" diff --git a/deepspeed/elasticity/utils.py b/deepspeed/elasticity/utils.py index 848ea8646449..78ae0352cf6e 100644 --- a/deepspeed/elasticity/utils.py +++ b/deepspeed/elasticity/utils.py @@ -3,7 +3,7 @@ # DeepSpeed Team -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version def is_torch_elastic_compatible(): diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py index 912262453748..37e33b1e8739 100644 --- a/deepspeed/env_report.py +++ b/deepspeed/env_report.py @@ -9,7 +9,7 @@ import subprocess import argparse from .ops.op_builder.all_ops import ALL_OPS -from .git_version_info import installed_ops, torch_info +from .git_version_info import installed_ops, torch_info, accelerator_name from deepspeed.accelerator import get_accelerator GREEN = '\033[92m' @@ -51,7 +51,8 @@ def op_report(verbose=True): for op_name, builder in ALL_OPS.items(): dots = "." * (max_dots - len(op_name)) is_compatible = OKAY if builder.is_compatible(verbose) else no - is_installed = installed if installed_ops.get(op_name, False) else no + is_installed = installed if installed_ops.get(op_name, + False) and accelerator_name == get_accelerator()._name else no dots2 = '.' * ((len(h[1]) + (max_dots2 - len(h[1]))) - (len(is_installed) - color_len)) print(op_name, dots, is_installed, dots2, is_compatible) print("-" * (max_dots + max_dots2 + len(h[0]) + len(h[1]))) @@ -109,7 +110,7 @@ def installed_cann_version(): def get_shm_size(): try: shm_stats = os.statvfs('/dev/shm') - except (OSError, FileNotFoundError, ValueError): + except (OSError, FileNotFoundError, ValueError, AttributeError): return "UNKNOWN", None shm_size = shm_stats.f_frsize * shm_stats.f_blocks diff --git a/deepspeed/git_version_info.py b/deepspeed/git_version_info.py index 635842c760ea..70c536d2f78e 100644 --- a/deepspeed/git_version_info.py +++ b/deepspeed/git_version_info.py @@ -18,5 +18,14 @@ from .ops.op_builder.all_ops import ALL_OPS installed_ops = dict.fromkeys(ALL_OPS.keys(), False) - compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) + accelerator_name = "" torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"} + +# compatible_ops list is recreated for each launch +from .ops.op_builder.all_ops import ALL_OPS + +compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) +for op_name, builder in ALL_OPS.items(): + op_compatible = builder.is_compatible() + compatible_ops[op_name] = op_compatible + compatible_ops["deepspeed_not_implemented"] = False diff --git a/deepspeed/inference/config.py b/deepspeed/inference/config.py index 1d5018aaa75b..6df61f7c8841 100644 --- a/deepspeed/inference/config.py +++ b/deepspeed/inference/config.py @@ -5,38 +5,25 @@ import torch import deepspeed -from deepspeed.pydantic_v1 import Field, validator +from pydantic import Field, field_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from typing import Dict, Union +from typing import Dict, Union, Optional from enum import Enum class DtypeEnum(Enum): - # The torch dtype must always be the first value (so we return torch.dtype) - fp16 = torch.float16, "torch.float16", "fp16", "float16", "half" - fp32 = torch.float32, "torch.float32", "fp32", "float32", "float" - bf16 = torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat" - int8 = torch.int8, "torch.int8", "int8" - - # Copied from https://stackoverflow.com/a/43210118 - # Allows 
us to use multiple values for each Enum index and returns first - # listed value when Enum is called - def __new__(cls, *values): - obj = object.__new__(cls) - # first value is canonical value - obj._value_ = values[0] - for other_value in values[1:]: - cls._value2member_map_[other_value] = obj - obj._all_values = values - return obj - - def __repr__(self): - return "<%s.%s: %s>" % ( - self.__class__.__name__, - self._name_, - ", ".join([repr(v) for v in self._all_values]), - ) + fp16 = (torch.float16, "torch.float16", "fp16", "float16", "half") + fp32 = (torch.float32, "torch.float32", "fp32", "float32", "float") + bf16 = (torch.bfloat16, "torch.bfloat16", "bf16", "bfloat16", "bfloat") + int8 = (torch.int8, "torch.int8", "int8") + + @classmethod + def from_str(cls, value: str): + for dtype in cls: + if value in dtype.value: + return dtype + raise ValueError(f"'{value}' is not a valid DtypeEnum") class MoETypeEnum(str, Enum): @@ -53,6 +40,9 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel): tp_size: int = 1 """ Number of devices to split the model across using tensor parallelism. """ + tp_grain_size: int = 64 + "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size." + mpu: object = None """ A model parallelism unit object that implements @@ -91,24 +81,24 @@ class QuantTypeEnum(str, Enum): class BaseQuantConfig(DeepSpeedConfigModel): - enabled = True - num_bits = 8 + enabled: bool = True + num_bits: int = 8 q_type: QuantTypeEnum = QuantTypeEnum.sym q_groups: int = 1 class WeightQuantConfig(BaseQuantConfig): - enabled = True + enabled: bool = True quantized_initialization: Dict = {} post_init_quant: Dict = {} class ActivationQuantConfig(BaseQuantConfig): - enabled = True + enabled: bool = True class QKVQuantConfig(DeepSpeedConfigModel): - enabled = True + enabled: bool = True class QuantizationConfig(DeepSpeedConfigModel): @@ -120,9 +110,9 @@ class QuantizationConfig(DeepSpeedConfigModel): # todo: brainstorm on how to do ckpt loading for DS inference class InferenceCheckpointConfig(DeepSpeedConfigModel): - checkpoint_dir: str = None - save_mp_checkpoint_path: str = None - base_dir: str = None + checkpoint_dir: Optional[str] = None + save_mp_checkpoint_path: Optional[str] = None + base_dir: Optional[str] = None class DeepSpeedInferenceConfig(DeepSpeedConfigModel): @@ -136,7 +126,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): `(attention_output projection, transformer output projection)` """ - dtype: DtypeEnum = torch.float16 + dtype: torch.dtype = torch.float16 """ Desired model data type, will convert model to this type. Supported target types: `torch.half`, `torch.int8`, `torch.float` @@ -184,6 +174,15 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): values for :any:`DeepSpeedMoEConfig`. """ + keep_module_on_host: bool = False + """ + When loading checkpoints to model parameters, they are moved to the device. In very large models + this might fill the device and cause OOM. Setting this flag to true, will keep checkpoints on + host and not move them directly to the device (giving an option to quantize checkpoint data before + moving it to the device for example). + Set only for models with injection policies and auto TP. + """ + quant: QuantizationConfig = {} """ NOTE: only works for int8 dtype. 
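(Illustration, not part of the patch.) The keep_module_on_host and tensor_parallel.tp_grain_size fields added in the hunk above are ordinary DeepSpeedInferenceConfig entries, so they can be supplied directly to deepspeed.init_inference. A minimal sketch; the model name and tp_size below are placeholders:

    import torch
    import deepspeed
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")  # placeholder model
    engine = deepspeed.init_inference(
        model,
        dtype=torch.float16,
        tensor_parallel={"tp_size": 2, "tp_grain_size": 64},  # tp_grain_size is new in this PR
        keep_module_on_host=True,  # keep loaded checkpoint weights on the host (new in this PR)
    )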
@@ -198,7 +197,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): """ #todo: refactor the following 3 into the new checkpoint_config - checkpoint: Union[str, Dict] = None + checkpoint: Optional[Union[str, Dict]] = None """ Path to deepspeed compatible checkpoint or path to JSON with load policy. """ @@ -214,7 +213,7 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): specifying whether the inference-module is created with empty or real Tensor """ - save_mp_checkpoint_path: str = None + save_mp_checkpoint_path: Optional[str] = None """ The path for which we want to save the loaded model with a checkpoint. This feature is used for adjusting the parallelism degree to help alleviate the @@ -243,19 +242,21 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): replace_method: str = Field( "auto", - deprecated=True, - deprecated_msg="This parameter is no longer needed, please remove from your call to DeepSpeed-inference") + json_schema_extra={ + "deprecated": True, + "deprecated_msg": "This parameter is no longer needed, please remove from your call to DeepSpeed-inference" + }) - injection_policy: Dict = Field(None, alias="injection_dict") + injection_policy: Optional[Dict] = Field(None, alias="injection_dict") """ Dictionary mapping a client nn.Module to its corresponding injection policy. e.g., `{BertLayer : deepspeed.inference.HFBertLayerPolicy}` """ - injection_policy_tuple: tuple = None + injection_policy_tuple: Optional[tuple] = None """ TODO: Add docs """ - config: Dict = Field(None, alias="args") # todo: really no need for this field if we can refactor + config: Optional[Dict] = Field(None, alias="args") # todo: really no need for this field if we can refactor max_out_tokens: int = Field(1024, alias="max_tokens") """ @@ -274,31 +275,49 @@ class DeepSpeedInferenceConfig(DeepSpeedConfigModel): transposed_mode: bool = Field(False, alias="transposed_mode") - mp_size: int = Field(1, deprecated=True, new_param="tensor_parallel.tp_size") + mp_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.tp_size"}) """ Desired model parallel size, default is 1 meaning no model parallelism. Deprecated, please use the ``tensor_parallel` config to control model parallelism. 
""" - mpu: object = Field(None, deprecated=True, new_param="tensor_parallel.mpu") - ep_size: int = Field(1, deprecated=True, new_param="moe.ep_size") - ep_group: object = Field(None, alias="expert_group", deprecated=True, new_param="moe.ep_group") - ep_mp_group: object = Field(None, alias="expert_mp_group", deprecated=True, new_param="moe.ep_mp_group") - moe_experts: list = Field([1], deprecated=True, new_param="moe.moe_experts") - moe_type: MoETypeEnum = Field(MoETypeEnum.standard, deprecated=True, new_param="moe.type") - - @validator("moe") + mpu: object = Field(None, json_schema_extra={"deprecated": True, "new_param": "tensor_parallel.mpu"}) + ep_size: int = Field(1, json_schema_extra={"deprecated": True, "new_param": "moe.ep_size"}) + ep_group: object = Field(None, + alias="expert_group", + json_schema_extra={ + "deprecated": True, + "new_param": "moe.ep_group" + }) + ep_mp_group: object = Field(None, + alias="expert_mp_group", + json_schema_extra={ + "deprecated": True, + "new_param": "moe.ep_mp_group" + }) + moe_experts: list = Field([1], json_schema_extra={"deprecated": True, "new_param": "moe.moe_experts"}) + moe_type: MoETypeEnum = Field(MoETypeEnum.standard, + json_schema_extra={ + "deprecated": True, + "new_param": "moe.type" + }) + + @field_validator("dtype", mode="before") + def validate_dtype(cls, field_value, values): + if isinstance(field_value, str): + return DtypeEnum.from_str(field_value).value[0] + if isinstance(field_value, torch.dtype): + return field_value + raise TypeError(f"Invalid type for dtype: {type(field_value)}") + + @field_validator("moe") def moe_backward_compat(cls, field_value, values): if isinstance(field_value, bool): return DeepSpeedMoEConfig(moe=field_value) return field_value - @validator("use_triton") + @field_validator("use_triton") def has_triton(cls, field_value, values): if field_value and not deepspeed.HAS_TRITON: raise ValueError('Triton needs to be installed to use deepspeed with triton kernels') return field_value - - class Config: - # Get the str representation of the datatype for serialization - json_encoders = {torch.dtype: lambda x: str(x)} diff --git a/deepspeed/inference/engine.py b/deepspeed/inference/engine.py index 149d20cd9305..0a74d19e91f5 100755 --- a/deepspeed/inference/engine.py +++ b/deepspeed/inference/engine.py @@ -6,6 +6,7 @@ import torch import time import os +import deepspeed from deepspeed import comm as dist from deepspeed.utils.logging import log_dist @@ -13,7 +14,7 @@ from packaging import version as pkg_version from deepspeed.runtime.checkpoint_engine.torch_checkpoint_engine import TorchCheckpointEngine from deepspeed.utils.timer import SynchronizedWallClockTimer - +from deepspeed.runtime.compiler import is_compile_supported from ..runtime.state_dict_factory import SDLoaderFactory from ..runtime.weight_quantizer import WeightQuantization from ..module_inject import replace_transformer_layer, generic_injection @@ -26,7 +27,7 @@ from ..module_inject.auto_tp import AutoTP from ..module_inject.replace_policy import generic_policies -from ..module_inject.auto_tp_model_utils import build_bloom_alibi_tensor, build_mpt_atten_bias_tensor, build_mpt_alibi_tensor +from ..module_inject.auto_tp_model_utils import build_bloom_alibi_tensor, build_mpt_atten_bias_tensor, build_mpt_alibi_tensor, get_alibi_mask from ..ops.transformer.inference.ds_attention import DeepSpeedSelfAttention from ..model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference @@ -51,12 +52,7 @@ def __init__(self, model, config): 
DS_INFERENCE_ENABLED = True super().__init__() - - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. - from ..model_implementations.transformers.ds_transformer import inference_module - if inference_module is not None: + if DeepSpeedTransformerInference.workspace is not None: self.destroy() self.module = model @@ -71,6 +67,10 @@ def __init__(self, model, config): if hasattr(self.module, "config"): TransformerPolicy.hf_model_config = self.module.config + if config.dtype not in get_accelerator().supported_dtypes(): + raise ValueError( + f"Data type {config.dtype} is not supported by {get_accelerator().device_name()} accelerator") + # todo: keep this self.injection_dict because we don't use to change config.injection_policy API # todo: this will get changed when Molly's PR on auto injection dict is merged self.injection_dict = config.injection_policy @@ -79,7 +79,6 @@ def __init__(self, model, config): self.mp_group = config.tensor_parallel.tp_group self.mpu = config.tensor_parallel.mpu - #self._validate_args(self.mpu, config.replace_with_kernel_inject) self.quantize_merge_count = 1 self.quantization_scales = None @@ -109,11 +108,6 @@ def __init__(self, model, config): assert pkg_version.parse(torch.__version__) >= pkg_version.parse("1.10"), \ "If you want to use cuda graph, please upgrade torch to at least v1.10" - # Check if model passed to engine is loaded w/ meta tensors, in which case - # kernel injection must be enabled. - # NOTE: This check assumes a Hugging Face hierarchy for the device type i.e. module.device.type - self.model_meta_device = self.module.device.type == 'meta' if hasattr(self.module, "device") else False - # convert model to intended dtype if config.dtype: self._convert_to_dtype(config) @@ -170,7 +164,12 @@ def __init__(self, model, config): self._apply_injection_policy(config, client_module) device = get_accelerator().current_device_name() - self.module.to(device) + # NOTE: This check assumes a Hugging Face hierarchy for the device type i.e. module.device.type + is_meta_device = hasattr(self.module, "device") and self.module.device.type == 'meta' + if is_meta_device: + self.module.to_empty(device=device) + elif not config.keep_module_on_host: + self.module.to(device) if config.tensor_parallel.tp_size > 1: _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) @@ -182,17 +181,14 @@ def __init__(self, model, config): # Check if local CUDA graphs can be created in replacement modules self.local_cuda_graph = self._local_cuda_graph_used(self.module) + self._is_compiled = False def destroy(self): - # Have to import here because inference_module is a global, but python - # globals only work at the module level and will not be updated unless - # we import it each time we init a new inference engine. 
- from ..model_implementations.transformers.ds_transformer import inference_module DeepSpeedTransformerInference.layer_id = 0 DeepSpeedSelfAttention.num_layers = 0 - if inference_module is not None: - inference_module.release_workspace() - inference_module = None + if DeepSpeedTransformerInference.workspace.is_allocated(): + DeepSpeedTransformerInference.workspace.release_workspace() + DeepSpeedTransformerInference.workspace = None def profile_model_time(self, use_cuda_events=True): if not self.model_profile_enabled and not self._config.enable_cuda_graph: @@ -220,6 +216,10 @@ def build_alibi_tensor(self): if hasattr(self.module.transformer, 'build_mpt_alibi_tensor'): self.module.transformer.build_mpt_alibi_tensor_orig = self.module.transformer.build_mpt_alibi_tensor self.module.transformer.__class__.build_mpt_alibi_tensor = build_mpt_alibi_tensor + if hasattr(self.module, 'model'): + if hasattr(self.module.model, 'get_alibi_mask'): + self.module.model.get_alibi_mask_orig = self.module.model.get_alibi_mask + self.module.model.__class__.get_alibi_mask = get_alibi_mask def build_attn_bias(self): if hasattr(self.module, 'transformer'): @@ -298,29 +298,6 @@ def _init_quantization_setting(self, quantization_setting): f"mlp_extra_grouping = {self.mlp_extra_grouping}, " f"quantize_groups = {self.quantize_groups}", [0]) - # TODO: remove this function and add this functionality to pydantic config checking - def _validate_args(self, mpu, replace_with_kernel_inject): - # TODO: to support SD pipeline we need to avoid this check for now - if replace_with_kernel_inject and not isinstance(self.module, Module): - raise ValueError(f"model must be a torch.nn.Module, got {type(self.module)}") - if not isinstance(self._config.tensor_parallel.tp_size, int) or self._config.tensor_parallel.tp_size < 1: - raise ValueError(f"mp_size must be an int >= 1, got {self._config.tensor_parallel.tp_size}") - - if mpu: - methods = ["get_model_parallel_group", "get_data_parallel_group"] - for method in methods: - if not hasattr(mpu, method): - raise ValueError(f"mpu is missing {method}") - if self._config.checkpoint is not None and not isinstance(self._config.checkpoint, (str, dict)): - raise ValueError(f"checkpoint must be None, str or dict, got {type(self._config.checkpoint)}") - - supported_dtypes = [None, torch.half, torch.int8, torch.float] - if self._config.dtype not in supported_dtypes: - raise ValueError(f"{self._config.dtype} not supported, valid dtype: {supported_dtypes}") - - if self.injection_dict is not None and not isinstance(self.injection_dict, dict): - raise ValueError(f"injection_dict must be None or a dict, got: {self.injection_dict}") - def load_model_with_checkpoint(self, r_module): self.mp_replace = ReplaceWithTensorSlicing( mp_group=self.mp_group, mp_size=self._config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1) @@ -450,7 +427,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): checkpoint = sd_loader['checkpoints'] if type(checkpoint) is list: - self.sd = torch.load(checkpoint[0], map_location='cpu') + self.sd = torch.load(checkpoint[0], map_location='cpu', weights_only=False) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) @@ -458,7 +435,7 @@ def _load_checkpoint(self, load_dir, load_module_strict=True, tag=None): for i in range(1, len(checkpoint)): if not dist.is_initialized() or dist.get_rank() == 0: print(f"loading checkpoint ({i})") - self.sd = torch.load(checkpoint[i], map_location=get_accelerator().device_name()) + self.sd = 
torch.load(checkpoint[i], map_location=get_accelerator().device_name(), weights_only=False) self.key_list = list(self.sd.keys()) self.load_model_with_checkpoint(self.module) else: @@ -524,11 +501,11 @@ def _create_cuda_graph(self, *inputs, **kwargs): get_accelerator().current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._cuda_graphs = torch.cuda.CUDAGraph() + self._cuda_graphs = get_accelerator().create_graph() self.static_inputs = inputs self.static_kwargs = kwargs - with torch.cuda.graph(self._cuda_graphs): + with get_accelerator().capture_to_graph(self._cuda_graphs): self.static_output = self.module(*self.static_inputs, **self.static_kwargs) self.cuda_graph_created = True @@ -540,7 +517,7 @@ def _graph_replay(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_kwargs[k].copy_(kwargs[k]) - self._cuda_graphs.replay() + get_accelerator().replay_graph(self._cuda_graphs) return self.static_output def model_times(self): @@ -616,14 +593,33 @@ def _generate(self, *inputs, **kwargs): if num_beams > 1: raise NotImplementedError("DeepSpeed does not support `num_beams` > 1, if this is important to you please " - "add your request to: https://github.com/microsoft/DeepSpeed/issues/2506") + "add your request to: https://github.com/deepspeedai/DeepSpeed/issues/2506") if ("input_ids" in kwargs) and (kwargs["input_ids"].dim() == 2): for input_tensor in kwargs["input_ids"]: tensor_length = input_tensor.shape[-1] if tensor_length > self._config.max_out_tokens: raise RuntimeError( - f"Input with size {tensor_length} exceeds maximum length of {self._config.max_out_tokens}. Please increase `max_tokens` in the DeepSpeed Inference Config." + f"Input with size {tensor_length} exceeds maximum length of {self._config.max_out_tokens}. Please increase max_tokens in the DeepSpeed Inference Config." ) return self.module.generate(*inputs, **kwargs) + + def compile(self, backend=get_accelerator().get_compile_backend(), compile_kwargs={}) -> None: + """ + Compile the module using the specified backend and kwargs. 
+ """ + if not is_compile_supported(): + raise RuntimeError("compile is not supported in your version of PyTorch.") + + if self._is_compiled: + return + + # Avoid graph breaks + deepspeed.utils.nvtx.enable_nvtx = False + self.module.compile(backend=backend, **compile_kwargs) + self._is_compiled = True + + @property + def is_compiled(self) -> bool: + return self._is_compiled diff --git a/deepspeed/inference/quantization/layers.py b/deepspeed/inference/quantization/layers.py index c90354aca90f..e9a7e5629f1b 100644 --- a/deepspeed/inference/quantization/layers.py +++ b/deepspeed/inference/quantization/layers.py @@ -86,7 +86,7 @@ def __init__(self, config: Dict, pre_quant_layer: nn.Embedding) -> None: device=pre_quant_layer.weight.device, dtype=pre_quant_layer.weight.dtype) - assert pre_quant_layer.max_norm == None, 'Not supported' + assert pre_quant_layer.max_norm is None, 'Not supported' assert pre_quant_layer.norm_type == 2, 'Not supported' assert pre_quant_layer.scale_grad_by_freq == False, 'Not supported' assert pre_quant_layer.sparse == False, 'Not supported' diff --git a/deepspeed/inference/quantization/utils.py b/deepspeed/inference/quantization/utils.py index 712abc384a44..a5e8f28bdec9 100644 --- a/deepspeed/inference/quantization/utils.py +++ b/deepspeed/inference/quantization/utils.py @@ -14,14 +14,14 @@ device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu' -quantizer_cuda_module = None +quantizer_module = None -def get_quantizer_cuda_module(): - global quantizer_cuda_module - if quantizer_cuda_module is None: - quantizer_cuda_module = deepspeed.ops.op_builder.QuantizerBuilder().load() - return quantizer_cuda_module +def get_quantizer_module(): + global quantizer_module + if quantizer_module is None: + quantizer_module = deepspeed.ops.op_builder.QuantizerBuilder().load() + return quantizer_module def tensor_clamp(tensor: Tensor, min, max) -> Tensor: @@ -107,19 +107,19 @@ def dequantize(self, tensor: Tensor, quant_scale: Tensor, quant_min: Tensor) -> if self.config['group_size'] % 8 == 0 and \ (self.config['num_bits'] == 4 or self.config['num_bits'] == 8) and \ self.config['group_dim'] == len(tensor.shape) - 1 and \ - self.dtype == torch.float16 and device == 'cuda': + self.dtype == torch.float16 and device == get_accelerator().device_name(): last_dimension_size = self.config['group_size'] if self.config['num_bits'] == 4: last_dimension_size = last_dimension_size // 2 - quantized_tensor = get_quantizer_cuda_module().dequantize_int4_to_half_experimental( + quantized_tensor = get_quantizer_module().dequantize_int4_to_half_experimental( tensor.reshape(-1, last_dimension_size), quant_scale, quant_min, tensor.numel() // last_dimension_size, self.config['group_size']) shape = list(tensor.shape) shape[-1] = shape[-1] * 2 elif self.config['num_bits'] == 8: # last_dimension_size = last_dimension_size // 2 - quantized_tensor = get_quantizer_cuda_module().dequantize_int8_to_half_experimental( + quantized_tensor = get_quantizer_module().dequantize_int8_to_half_experimental( tensor.reshape(-1, last_dimension_size), quant_scale, quant_min, tensor.numel() // last_dimension_size, self.config['group_size']) shape = list(tensor.shape) diff --git a/deepspeed/inference/v2/allocator.py b/deepspeed/inference/v2/allocator.py index bebdcf83aee3..fcc0d94c0f82 100644 --- a/deepspeed/inference/v2/allocator.py +++ b/deepspeed/inference/v2/allocator.py @@ -5,17 +5,27 @@ from functools import reduce from typing import Iterable - +from collections import defaultdict import torch 
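(Illustration, not part of the patch.) The engine.py hunk above replaces torch.cuda.CUDAGraph / torch.cuda.graph with the accelerator-neutral helpers create_graph, capture_to_graph and replay_graph. A sketch of that capture/replay flow in isolation; module and sample are placeholders, the accelerator is assumed to support graph capture, and the real engine additionally warms the module up on a side stream before capturing:

    import torch
    from deepspeed.accelerator import get_accelerator

    def capture(module, sample):
        graph = get_accelerator().create_graph()
        with get_accelerator().capture_to_graph(graph):
            static_output = module(sample)   # recorded into the graph; inputs must stay static
        return graph, static_output

    def replay(graph):
        get_accelerator().replay_graph(graph)  # re-executes the recorded kernels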
from deepspeed.accelerator import get_accelerator -def empty_from(tensor: torch.Tensor, shape: Iterable[int]) -> torch.Tensor: - shape_size = reduce(lambda x, y: x * y, shape) - if shape_size == 0: - raise ValueError("Cannot create empty tensor with size 0") - return tensor.flatten()[:shape_size].view(shape) +class Allocator: + cache = defaultdict(dict) + + def empty_from(tensor: torch.Tensor, shape: Iterable[int]) -> torch.Tensor: + try: + return Allocator.cache[tensor][shape] + except KeyError: + shape_size = reduce(lambda x, y: x * y, shape) + if shape_size == 0: + raise ValueError("Cannot create empty tensor with size 0") + Allocator.cache[tensor][shape] = tensor.flatten()[:shape_size].view(shape) + return Allocator.cache[tensor][shape] + + +empty_from = Allocator.empty_from def on_device(method) -> torch.Tensor: diff --git a/deepspeed/inference/v2/checkpoint/huggingface_engine.py b/deepspeed/inference/v2/checkpoint/huggingface_engine.py index 029e3f7774c0..b17bb886838f 100644 --- a/deepspeed/inference/v2/checkpoint/huggingface_engine.py +++ b/deepspeed/inference/v2/checkpoint/huggingface_engine.py @@ -8,26 +8,28 @@ import torch from .base_engine import CheckpointEngineBase from typing import Iterable, Tuple +from functools import partial from ..logging import inference_logger class HuggingFaceCheckpointEngine(CheckpointEngineBase): - def __init__(self, model_name_or_path: str, auth_token: str = None) -> None: + def __init__(self, model_name_or_path: str, auth_token: str = None, **hf_kwargs) -> None: super().__init__() from transformers import AutoConfig, GenerationConfig self.model_name_or_path = model_name_or_path self.auth_token = auth_token - self.model_config = AutoConfig.from_pretrained(self.model_name_or_path) - self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) + self.model_config = AutoConfig.from_pretrained(self.model_name_or_path, **hf_kwargs) # Define this property here so we can use it in the model implementation if not hasattr(self.model_config, "max_seq_length"): - self.model_config.max_seq_length = self.model_config.max_position_embeddings - else: - self.model_config.max_seq_length = self.generation_config.max_length - + if hasattr(self.model_config, "max_position_embeddings"): + self.model_config.max_seq_length = self.model_config.max_position_embeddings + else: + generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) + self.model_config.max_seq_length = generation_config.max_length + self._local_checkpoint_dir = None self._all_ckpt_paths = self._fetch_checkpoint_files() def _fetch_checkpoint_files(self): @@ -38,20 +40,30 @@ def _fetch_checkpoint_files(self): # currently coming from the ckpt engine init but maybe a catch all kwargs for other # snapshot download parameters would be more flexible. - # NOTE(jeff): allow_patterns here are explicitly not using safetensors or other - # checkpoint files that may be present. 
Example of all files in the llama-2-7b - # repo here: https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main - from huggingface_hub import snapshot_download + from huggingface_hub import snapshot_download, list_repo_tree + + def model_has_safetensors(model_name_or_path: str) -> bool: + if os.path.isdir(model_name_or_path): + file_list = os.listdir(model_name_or_path) + else: + file_list = [rf.path for rf in list_repo_tree(model_name_or_path)] + for f in file_list: + if f.endswith(".safetensors"): + return True + return False if os.path.isdir(self.model_name_or_path): self._local_checkpoint_dir = self.model_name_or_path else: + # We need to download the checkpoint files from HF + if model_has_safetensors(self.model_name_or_path): + # Prioritize downloading safetensors if they are available + allow_patterns = ["*.safetensors", "*.json"] + else: + # Fallback to bin files when safetensors are not present + allow_patterns = ["*.bin", "*.json", "*.pt"] self._local_checkpoint_dir = snapshot_download(self.model_name_or_path, - allow_patterns=[ - "*.bin", - "*.json", - "*.pt", - ], + allow_patterns=allow_patterns, revision=None, token=self.auth_token) @@ -59,11 +71,22 @@ def _fetch_checkpoint_files(self): self._local_checkpoint_dir ), f"Checkpoint dir {self._local_checkpoint_dir} is not a directory, cannot load checkpoint." - model_param_json = os.path.join(self._local_checkpoint_dir, "pytorch_model.bin.index.json") + # Set the appropriate file names based on whether we have safetensors or not + if model_has_safetensors(self._local_checkpoint_dir): + from safetensors.torch import load_file + model_param_json_fname = "model.safetensors.index.json" + model_file_fname = "model.safetensors" + self._checkpoint_load_fn = load_file + else: + model_param_json_fname = "pytorch_model.bin.index.json" + model_file_fname = "pytorch_model.bin" + self._checkpoint_load_fn = partial(torch.load, map_location="cpu", weights_only=False) + + model_param_json = os.path.join(self._local_checkpoint_dir, model_param_json_fname) if not os.path.isfile(model_param_json): # We don't need any json as all such HF models will have pytorch_model.bin - all_checkpoint_files = [os.path.join(self._local_checkpoint_dir, 'pytorch_model.bin')] + all_checkpoint_files = [os.path.join(self._local_checkpoint_dir, model_file_fname)] else: param_map = json.load(open(model_param_json, "r")) @@ -84,7 +107,13 @@ def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]: """ for checkpoint in self._all_ckpt_paths: inference_logger().info(f"Loading checkpoint: {checkpoint}") - checkpoint_sd = torch.load(checkpoint, map_location='cpu') + checkpoint_sd = self._checkpoint_load_fn(checkpoint) + + # If the model has tied embeddings, we need to make sure the lm_head weights are tied to the embeddings weights + if hasattr(self.model_config, "tie_word_embeddings") and self.model_config.tie_word_embeddings: + if self.model_config.model_type == "qwen2": + checkpoint_sd["lm_head.weight"] = checkpoint_sd["model.embed_tokens.weight"] + param_keys = list(checkpoint_sd.keys()) for param_name in param_keys: param = checkpoint_sd[param_name] diff --git a/deepspeed/inference/v2/config_v2.py b/deepspeed/inference/v2/config_v2.py index 64e7e29b1844..325b57d8f56a 100644 --- a/deepspeed/inference/v2/config_v2.py +++ b/deepspeed/inference/v2/config_v2.py @@ -3,7 +3,8 @@ # DeepSpeed Team -from deepspeed.pydantic_v1 import Field +from pydantic import Field +from typing import Optional from deepspeed.runtime.config_utils import DeepSpeedConfigModel from .ragged import 
DSStateManagerConfig @@ -16,6 +17,16 @@ class DeepSpeedTPConfig(DeepSpeedConfigModel): """ Number of devices to split the model across using tensor parallelism. """ +class QuantizationConfig(DeepSpeedConfigModel): + """ Configure tensor parallelism settings """ + + quantization_mode: Optional[str] = None + """ The quantization mode in string format. The supported modes are as follows: + - 'wf6af16', weight-only quantization with FP6 weight and FP16 activation. + """ + # TODO: may reuse the constants in deepspeed/compression/constants.py + + class RaggedInferenceEngineConfig(DeepSpeedConfigModel): """ Sets parameters for DeepSpeed Inference Engine. """ @@ -29,3 +40,5 @@ class RaggedInferenceEngineConfig(DeepSpeedConfigModel): """ Configuration for managing persistent state """ + + quantization: QuantizationConfig = {} diff --git a/deepspeed/inference/v2/engine_factory.py b/deepspeed/inference/v2/engine_factory.py index ecca9f3c1b34..9c3188dfebb8 100644 --- a/deepspeed/inference/v2/engine_factory.py +++ b/deepspeed/inference/v2/engine_factory.py @@ -17,6 +17,13 @@ OPTPolicy, Llama2Policy, MistralPolicy, + MixtralPolicy, + FalconPolicy, + PhiPolicy, + Phi3Policy, + QwenPolicy, + Qwen2Policy, + Qwen2MoePolicy, ) from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata @@ -93,7 +100,7 @@ def build_hf_engine(path: str, if model_config.model_type == "opt": if not model_config.do_layer_norm_before: raise ValueError( - "Detected OPT-350m model. This model is not currently supported. If this is not the 350m model, please open an issue: https://github.com/microsoft/DeepSpeed-MII/issues" + "Detected OPT-350m model. This model is not currently supported. 
If this is not the 350m model, please open an issue: https://github.com/deepspeedai/DeepSpeed-MII/issues" ) policy = OPTPolicy(model_config, checkpoint_engine=checkpoint_engine) elif model_config.model_type == "llama": @@ -104,6 +111,24 @@ def build_hf_engine(path: str, assert version.parse(transformers.__version__) >= version.parse("4.34.0"), \ f"Mistral requires transformers >= 4.34.0, you have version {transformers.__version__}" policy = MistralPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "mixtral": + # Ensure we're using the correct version of transformers for mistral + import transformers + assert version.parse(transformers.__version__) >= version.parse("4.36.1"), \ + f"Mistral requires transformers >= 4.36.1, you have version {transformers.__version__}" + policy = MixtralPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "falcon": + policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "phi": + policy = PhiPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "phi3": + policy = Phi3Policy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "qwen": + policy = QwenPolicy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "qwen2": + policy = Qwen2Policy(model_config, checkpoint_engine=checkpoint_engine) + elif model_config.model_type == "qwen2_moe": + policy = Qwen2MoePolicy(model_config, checkpoint_engine=checkpoint_engine) else: raise ValueError(f"Unsupported model type {model_config.model_type}") diff --git a/deepspeed/inference/v2/engine_v2.py b/deepspeed/inference/v2/engine_v2.py index ff73f7ea9680..4a358310377f 100644 --- a/deepspeed/inference/v2/engine_v2.py +++ b/deepspeed/inference/v2/engine_v2.py @@ -104,7 +104,10 @@ def _initialize_tp_group(self): ranks = list(range(self._config.tensor_parallel.tp_size)) return dist.new_group(ranks=ranks) - def put(self, batch_uids: Iterable[int], batch_tokens: Iterable[torch.Tensor]) -> torch.Tensor: + def put(self, + batch_uids: Iterable[int], + batch_tokens: Iterable[torch.Tensor], + do_checks: bool = True) -> torch.Tensor: """ Put a ragged batch onto the inference engine. This will perform one forward and return a Tensor of the shape [len(batch_uids), *output_shape]. Logits for the non-final tokens @@ -113,12 +116,14 @@ def put(self, batch_uids: Iterable[int], batch_tokens: Iterable[torch.Tensor]) - Arguments: batch_uids: Iterable of uids for the batch on the host batch_tokens: Iterable of token tensors for the batch on the host + do_checks: Check schedulability when it is set to True. You can skip this check for better performance when it has already been completed. """ - token_lens = [len(tokens) for tokens in batch_tokens] - schedule_check = self.can_schedule(batch_uids, token_lens) - if schedule_check != SchedulingResult.Success: - raise SchedulingError(schedule_check) + if do_checks: + token_lens = [len(tokens) for tokens in batch_tokens] + schedule_check = self.can_schedule(batch_uids, token_lens) + if schedule_check != SchedulingResult.Success: + raise SchedulingError(schedule_check) self._batch.clear() for uid, tokens in zip(batch_uids, batch_tokens): @@ -128,7 +133,7 @@ def put(self, batch_uids: Iterable[int], batch_tokens: Iterable[torch.Tensor]) - host_seq_desc.pre_forward(tokens.numel()) # We can disable checks since we already validated schedulability. 
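(Usage sketch, not part of the patch.) With the do_checks flag added to put() above, a caller that has already validated a batch via can_schedule can skip the duplicate check; engine, uids and tokens are placeholders, and the SchedulingResult import path is assumed to match the one used by engine_v2.py:

    from deepspeed.inference.v2.scheduling_utils import SchedulingResult  # path assumed

    token_lens = [len(t) for t in tokens]
    if engine.can_schedule(uids, token_lens) == SchedulingResult.Success:
        logits = engine.put(uids, tokens, do_checks=False)   # skip the redundant re-check
    else:
        logits = engine.put(uids, tokens)                    # checked path; raises SchedulingError on failure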
- self._batch.insert_sequence(host_seq_desc, tokens, do_checks=False) + self._batch.insert_sequence(host_seq_desc, tokens, do_checks=do_checks) # Send all metadata to the device self._batch.finalize() @@ -225,6 +230,15 @@ def can_schedule(self, uids: Iterable[int], lengths: Iterable[int]) -> Schedulin return SchedulingResult.Success + def get_remaining_block_capacity(self, uid: int) -> int: + """ + Get the remaining capacity of the last block already allocated. + """ + seq_desc = self._state_manager.get_sequence(uid) + if seq_desc is None: + return 0 + return self._model.get_remaining_block_capacity(seq_desc) + def flush(self, uid: int) -> None: """ Remove all state associated with a sequence from the inference engine. diff --git a/deepspeed/inference/v2/kernels/core_ops/__init__.py b/deepspeed/inference/v2/kernels/core_ops/__init__.py index bbb53e5b58a2..1d16b484a560 100644 --- a/deepspeed/inference/v2/kernels/core_ops/__init__.py +++ b/deepspeed/inference/v2/kernels/core_ops/__init__.py @@ -8,3 +8,4 @@ from .cuda_layer_norm import * from .cuda_rms_norm import * from .gated_activations import * +from .cuda_linear import * diff --git a/deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation.cu b/deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu similarity index 100% rename from deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation.cu rename to deepspeed/inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu diff --git a/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h b/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h index 450991b3c387..294db7528699 100644 --- a/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h +++ b/deepspeed/inference/v2/kernels/core_ops/blas_kernels/blas_utils.h @@ -13,7 +13,7 @@ #endif #include #include -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ #include #endif #include @@ -33,7 +33,7 @@ class BlasContext { std::cerr << message << std::endl; throw std::runtime_error(message); } -#ifndef __HIP_PLATFORM_HCC__ +#ifndef __HIP_PLATFORM_AMD__ cublasSetMathMode(_handle, CUBLAS_TENSOR_OP_MATH); #endif } @@ -55,7 +55,9 @@ class BlasContext { enum class BlasType { FP32, FP16, BF16 }; -#ifdef __HIP_PLATFORM_HCC__ +// TODO HIP: Remove backward compatibility for torch<=2.0 in future +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_operation get_trans_op(bool do_trans) { return (do_trans) ? rocblas_operation_transpose : rocblas_operation_none; @@ -76,9 +78,15 @@ cublasOperation_t get_trans_op(bool do_trans) { return (do_trans) ? 
CUBLAS_OP_T cublasDataType_t get_datatype(BlasType type) { switch (type) { +#ifdef __HIP_PLATFORM_AMD__ + case BlasType::FP32: return HIPBLAS_R_32F; + case BlasType::FP16: return HIPBLAS_R_16F; + case BlasType::BF16: return HIPBLAS_R_16B; +#else case BlasType::FP32: return CUDA_R_32F; case BlasType::FP16: return CUDA_R_16F; case BlasType::BF16: return CUDA_R_16BF; +#endif default: throw std::runtime_error("Unsupported BlasType"); } } @@ -99,7 +107,8 @@ int blas_gemm_ex(void* C, const float* beta, BlasType type) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_operation_t transa_op = get_trans_op(transa); rocblas_operation_t transb_op = get_trans_op(transb); @@ -151,11 +160,18 @@ int blas_gemm_ex(void* C, C, abc_type, ldc, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { @@ -190,7 +206,8 @@ int blas_strided_batched_gemm(void* C, int batch, BlasType type) { -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) rocblas_operation_t transa_op = get_trans_op(transa); rocblas_operation_t transb_op = get_trans_op(transb); @@ -253,11 +270,18 @@ int blas_strided_batched_gemm(void* C, ldc, stride_C, batch, +#if defined(__HIP_PLATFORM_AMD__) && defined(HIPBLAS_V2) + HIPBLAS_COMPUTE_32F, +#elif defined(__HIP_PLATFORM_AMD__) + HIPBLAS_R_32F, +#else CUDA_R_32F, +#endif CUBLAS_GEMM_DEFAULT_TENSOR_OP); #endif -#ifdef __HIP_PLATFORM_HCC__ +#if defined(__HIP_PLATFORM_AMD__) && \ + ((TORCH_VERSION_MAJOR < 2) || (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 0)) if (status != rocblas_status_success) { #else if (status != CUBLAS_STATUS_SUCCESS) { diff --git a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp index 58df88e56136..3f36a6bf01cb 100644 --- a/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp +++ b/deepspeed/inference/v2/kernels/core_ops/core_ops.cpp @@ -10,6 +10,7 @@ #include "blas.h" #include "gated_activation_kernels.h" #include "layer_norm.h" +#include "linear_kernels.h" #include "rms_norm.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) @@ -33,4 +34,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // rms_norm.h m.def("rms_norm", &rms_norm, "DeepSpeed rms norm in CUDA"); m.def("rms_pre_norm", &rms_pre_norm, "DeepSpeed rms pre norm in CUDA"); + + // linear_kernels.h + m.def("cuda_wf6af16_linear", &cuda_wf6af16_linear, "DeepSpeed Wf6Af16 linear in CUDA"); + m.def( + "preprocess_weight", &preprocess_weight, "preprocess the FP16 weight to be 2bit and 4 bit"); } diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu similarity index 99% rename from deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu rename to deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu index 15f52c46622b..fb6dd0578f1d 100644 --- a/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu +++ 
b/deepspeed/inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu @@ -252,7 +252,6 @@ __global__ void fused_residual_ln(T* output, for (int i = 0; i < unRoll; i++) { T* iteration_buffer = local_buffer + i * T_per_load; T residual_buffer[T_per_load]; - T bias_buffer[T_per_load]; mem_access::load_global( iteration_buffer, input_base + i * stride, thread_offset + i * stride < elems_per_row); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/__init__.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py similarity index 69% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/__init__.py rename to deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py index b50a0838d9f8..cd08409c0a7a 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/__init__.py +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/__init__.py @@ -3,4 +3,4 @@ # DeepSpeed Team -from .top_1_gating import RaggedTop1Gating +from .cuda_linear import * diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py new file mode 100644 index 000000000000..69aa9e8920e2 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/cuda_linear.py @@ -0,0 +1,207 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from ....inference_utils import DtypeEnum +from ....logging import inference_logger +from deepspeed.ops.op_builder import InferenceCoreBuilder +from ... import DSKernelBase + + +class CUDAWf6Af16Linear(DSKernelBase): + """ + Wrapper around the CUDA kernel of Wf6Af16 quantized linear. + + Performs z = x @ y + """ + supported_dtypes = [DtypeEnum.fp16] + + def __init__(self): + self.inf_module = InferenceCoreBuilder().load() + self.inf_module.create_handle() + self.kernel = self.inf_module.cuda_wf6af16_linear + # The split_k_map is profiled on A100-80G GPU for some common shapes. + # It is an array of dictionaries, where the array index is the tokens chunk id. + # The dictionary is the mapping from the output channel to the split-K size. 
+ self.split_k_map = [ + { # tokens: [1, 64] + 3072: 18, + 4096: 13, + 5120: 10, + 6144: 9, + 8192: 6, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 7 + }, + { # tokens: [65:128] + 3072: 9, + 4096: 6, + 5120: 5, + 6144: 9, + 8192: 3, + 10240: 5, + 14336: 7, + 28672: 7, + 57344: 6 + }, + { # tokens: [129:192] + 3072: 6, + 4096: 4, + 5120: 7, + 6144: 3, + 8192: 2, + 10240: 5, + 14336: 5, + 28672: 5, + 57344: 4 + }, + { # tokens: [193:256] + 3072: 9, + 4096: 3, + 5120: 5, + 6144: 2, + 8192: 5, + 10240: 4, + 14336: 8, + 28672: 6, + 57344: 4 + }, + { # tokens: [257:320] + 3072: 7, + 4096: 5, + 5120: 2, + 6144: 5, + 8192: 4, + 10240: 1, + 14336: 3, + 28672: 3, + 57344: 4 + }, + { # tokens: [321:384] + 3072: 3, + 4096: 2, + 5120: 5, + 6144: 3, + 8192: 1, + 10240: 8, + 14336: 3, + 28672: 4, + 57344: 3 + }, + { # tokens: [385:448] + 3072: 5, + 4096: 7, + 5120: 3, + 6144: 5, + 8192: 7, + 10240: 3, + 14336: 1, + 28672: 1, + 57344: 3 + }, + { # tokens: [449:512] + 3072: 2, + 4096: 5, + 5120: 4, + 6144: 1, + 8192: 5, + 10240: 2, + 14336: 6, + 28672: 4, + 57344: 1 + }, + { # tokens: [513:576] + 3072: 2, + 4096: 3, + 5120: 1, + 6144: 1, + 8192: 3, + 10240: 3, + 14336: 3, + 28672: 1, + 57344: 1 + }, + { # tokens: [577:640] + 3072: 5, + 4096: 4, + 5120: 1, + 6144: 4, + 8192: 2, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1 + }, + { # tokens: [641:704] + 3072: 3, + 4096: 1, + 5120: 2, + 6144: 2, + 8192: 1, + 10240: 2, + 14336: 1, + 28672: 1, + 57344: 1 + }, + { # tokens: [705:768] + 3072: 3, + 4096: 1, + 5120: 3, + 6144: 2, + 8192: 1, + 10240: 1, + 14336: 1, + 28672: 1, + 57344: 1 + } + ] + + def __call__(self, output: torch.Tensor, hidden_states: torch.Tensor, weights_2bit: torch.Tensor, + weights_4bit: torch.Tensor, scale: torch.Tensor, out_channels, tokens, in_channels) -> torch.Tensor: + """ + Matmul kernel of FP6 weight-only quantized linear. All inputs should be contiguous. + It does not support batched-matmul. + + Parameters: + output (torch.Tensor): Output tensor. Shape is of [token_number, out_features] + hidden_states (torch.Tensor): Input tensor. Shape is of [token_number, in_features] + weights_2bit (torch.Tensor): Input tensor of the 2-bit slice. Shape is of [out_features*2/8, in_features] + weights_4bit (torch.Tensor): Input tensor of the 4-bit slice. Shape is of [out_features*4/8, in_features] + scale (torch.Tensor): Input tensor. Shape is of [out_features], since the scale is per output channel + out_channels (int): The number of output channels + tokens (int): The number of tokens + in_channels (int): The number of input channels + """ + + if out_channels % 256 != 0 or in_channels % 64 != 0: + raise ValueError("The out and in channel should be multiple of 256 and 64 respectively.") + + # TODO: add a more general heuristic to determine the split-K. + split_k = -1 # not initialized + if tokens <= 768: + # Try to find the split-K from the pre-profiled map. 
+ tokens_chunk_id = (tokens - 1) // 64 + split_k = self.split_k_map[tokens_chunk_id].get(out_channels, -1) + if split_k == -1: + split_k = 1 + inference_logger().warning( + f"The split-K setting may be suboptimal for shape {tokens}x{in_channels}x{out_channels}...") + + workspace = self.get_workspace(out_channels, tokens, in_channels, split_k, torch.float, hidden_states.device) + self.kernel(output, hidden_states, weights_2bit, weights_4bit, scale, workspace, out_channels, tokens, + in_channels, split_k) + + def get_workspace(self, out_channels: int, tokens: int, in_channels: int, split_k: int, dtype, + device) -> torch.Tensor: + """ + Allocate workspace for the kernel. The workspace is used to store the intermediate results of the matmul before + split-K. The split-K size is determined by the size of the matmul. + """ + workspace = torch.empty((split_k, out_channels, tokens), dtype=dtype, device=device) + + return workspace diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h new file mode 100644 index 000000000000..76e8eda2d35e --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/configs.h @@ -0,0 +1,96 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef CONFIGS_H +#define CONFIGS_H + +// #define DEBUG_MODE +#define PIPELINE_LEVEL_GMEM 2 +#define PIPELINE_LEVEL_SMEM 2 // only support 2 + +/************************ Hardware Parameters ************************/ +#define WARP_SIZE 32 +#define REG_BIT_WIDTH 32 +// mma: M=16 K=16 N=8 +#define MMA_8 8 +#define MMA_16 16 +// for memory access +#define THREAD_OPT_ACCESS_BIT_WIDTH_128 128 // LDS.128, cp_async.128, ... 
+#define BIT_WIDTH_PER_HALF 16 // Half precision: FP16 + +/******************** Register Allocation For GEMM ********************/ +#define REG_PER_THREAD_C_TENSOR_16_16 8 // 8 for FP32 Accumulation +/********************** Memory Padding Parameters **********************/ +// Eliminating bank-conflict +#define PADDING_BYTES_16 16 // Padding 16 bytes each column +#define PADDING_SHARED_MEM_FOR_B_8 \ + 8 // Padding 8 half each column, during CopyFromGlobalToShared() for B +#define PADDING_SHARED_MEM_FOR_C_4 \ + 4 // Padding 4 float each column, during StoreToSharedMemoryFromRegister() for C +/************************* WARP Tiling part-1 *************************/ +#define WARP_ROW_MMA_TENSORS 4 +#define WARP_M (WARP_ROW_MMA_TENSORS * MMA_16) // 64 +#define WARP_K_MMA_TENSORS 4 +#define WARP_K (WARP_K_MMA_TENSORS * MMA_16) // 64 +template +struct TilingConfig { + // Depending on "n" dimension of the GEMM + static constexpr int BLOCK_ROW_WARPS = BLOCK_ROW_WARPS_; + static constexpr int BLOCK_COL_WARPS = BLOCK_COL_WARPS_; + static constexpr int WARP_COL_MMA_TENSORS = WARP_COL_MMA_TENSORS_; + /************************* WARP Tiling part-2 *************************/ + static constexpr int WARP_N = WARP_COL_MMA_TENSORS * MMA_8; + /*************************Thread Block Tiling *************************/ + static constexpr int TILE_M = WARP_M * BLOCK_ROW_WARPS; + static constexpr int TILE_N = MMA_8 * WARP_COL_MMA_TENSORS * BLOCK_COL_WARPS; + static constexpr int TILE_K = WARP_K; + /********************** #Thread per Thread Block **********************/ + static constexpr int BLOCK_WARPS = BLOCK_ROW_WARPS * BLOCK_COL_WARPS; + static constexpr int BLOCK_THREADS = BLOCK_WARPS * WARP_SIZE; + /******************************* Others *******************************/ + static constexpr int SMEM_SIZE_B_TILE = TILE_N * (TILE_K + PADDING_BYTES_16) * 2 * + PIPELINE_LEVEL_GMEM; // sizeof(half)=2, doubleBuffer=2 + static constexpr int SMEM_SIZE_C_TILE = + TILE_N * (TILE_M + PADDING_BYTES_16) * 4; // sizeof(float)=4 +}; + +/************************ General Config for Quant-LLM **********************/ +#define WEIGHT_FRAG1_BIT_WIDTH 2 +#define WEIGHT_FRAG2_BIT_WIDTH 4 +#define WEIGHT_BIT_WIDTH (WEIGHT_FRAG1_BIT_WIDTH + WEIGHT_FRAG2_BIT_WIDTH) // 6 +// #define QUANT_GROUP_SIZE_DIVIDED_BY_64 4 // +// QuantGroupSize: 4*64 = 256 +/*************************** 64*64 Weghts of A WARP *************************/ +#define WEIGHT_PER_UNIT (WARP_M * WARP_K) // 64*64 +#define SMEM_SIZE_IN_BYTES_PER_WARP_A1 \ + (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / \ + 8) // 1024 Bytes #doubleBuffer not takedn into consideration +#define SMEM_SIZE_IN_BYTES_PER_WARP_A2 \ + (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / \ + 8) // 2048 Bytes #doubleBuffer not takedn into consideration +#define SMEM_SIZE_A1_TILE \ + (SMEM_SIZE_IN_BYTES_PER_WARP_A1 * 4 * \ + PIPELINE_LEVEL_GMEM) // #WARP=4, #Trible-Buffer for 3-level pipeline for A = 12 KB; double + // buffer for 2-level pipeline A= 8 KB. +#define SMEM_SIZE_A2_TILE \ + (SMEM_SIZE_IN_BYTES_PER_WARP_A2 * 4 * \ + PIPELINE_LEVEL_GMEM) // #WARP=4, #Trible-Buffer for 3-level pipeline for A = 24 KB; double + // buffer for 2-level pipeline A= 16 KB. 
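(Back-of-envelope check, illustration only.) The shared-memory figures quoted in the comments above follow directly from these macros; a short Python recomputation for the 2-level GMEM pipeline (PIPELINE_LEVEL_GMEM = 2, 4 warps per block):

    WARP_M = WARP_K = 64                    # 4 MMA tensors of 16 in each dimension
    WEIGHT_PER_UNIT = WARP_M * WARP_K       # 4096 FP6 weights per warp tile
    A1_BYTES = WEIGHT_PER_UNIT * 2 // 8     # 2-bit fragment  -> 1024 bytes per warp
    A2_BYTES = WEIGHT_PER_UNIT * 4 // 8     # 4-bit fragment  -> 2048 bytes per warp
    SMEM_A1_TILE = A1_BYTES * 4 * 2         # 4 warps, double buffered -> 8 KB
    SMEM_A2_TILE = A2_BYTES * 4 * 2         # 4 warps, double buffered -> 16 KB
    assert (SMEM_A1_TILE, SMEM_A2_TILE) == (8192, 16384)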
+/******************** Global Memory Layout For QUANTIZED DATA ******************/ +#define NUM_INT4_PER_UNIT_2BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG1_BIT_WIDTH / 128) // 64 +#define NUM_INT4_PER_UNIT_4BIT_FRAG (WEIGHT_PER_UNIT * WEIGHT_FRAG2_BIT_WIDTH / 128) // 128 +/******************** Register Allocation For QUANTIZED DATA ******************/ +#define WEIGHT_PER_THREAD (WEIGHT_PER_UNIT / WARP_SIZE) // 128 +#define REG_PER_THREAD_2BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 2) // 8 +#define REG_PER_THREAD_4BIT_FRAG (WEIGHT_PER_THREAD / REG_BIT_WIDTH * 4) // 16 +/******************** Register Allocation For QUANT Scales ******************/ +#define WARP_REG_QUANT_SCALE 4 // 8 rows per thread -> 8 FP16 scales -> 4 registers +#define WARP_REG_QUANT_SCALE_DISTRIBUTED \ + 1 // T0-T3, T4-T7, ..., T28-T31 share the same scales, using shfl to get all the scales for + // each thread + +#endif // CONFIGS_H diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh new file mode 100644 index 000000000000..860f70b226cb --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_matmul.cuh @@ -0,0 +1,272 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_MATMUL_CUH + +#include "configs.h" +#include "utils_core.cuh" +#include "utils_gmem.cuh" + +/* + * C = A*B + * A: row major with ahead-of-time layout transformation, FP6 + * B: col major, FP16 + * C: col major, FP16 + */ +template +__global__ void QUANT_GEMM_Kernel(const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + OutputDataType* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + int Split_K) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 900 + +#ifdef DEBUG_MODE + assert(K_Global % TilingConfig::TILE_K == 0); + assert(M_Global % TilingConfig::TILE_M == 0); + assert(gridDim.y == Split_K * (M_Global / TilingConfig::TILE_M)); +#endif + extern __shared__ __align__(128) + half smem[]; // Dynamic shared memory for FP16 A tiles, 128 Bytes aligned + half(*smem_array)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + reinterpret_cast( + smem + + (SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE) / 2); // Dynamic shared memory for FP16 B tiles + __shared__ half QuantScales[64 * TilingConfig::BLOCK_WARPS]; // static shared memory for + // quantization scales, 64 row per + // warp * 4 warps = 512 Bytes + // Thread Block Mapping, considering SplitK + const size_t BatchID = blockIdx.y / (M_Global / TilingConfig::TILE_M); + const size_t x = blockIdx.x; // Output Block ID: (BlockID_Row = y; BlockID_Col = x ) + const size_t y = + blockIdx.y % + (M_Global / TilingConfig::TILE_M); // Output Block ID: (BlockID_Row = y; BlockID_Col = x ) + const size_t Tile_Start_M = y * TilingConfig::TILE_M; + const size_t Tile_Start_N = x * TilingConfig::TILE_N; + const size_t NumColumnToCopy = (N_Global - Tile_Start_N) < TilingConfig::TILE_N + ? 
(N_Global - Tile_Start_N) + : TilingConfig::TILE_N; + const size_t NumBlock_K = K_Global / TilingConfig::TILE_K; + const size_t AverageNumBlock_K = NumBlock_K / Split_K; + const size_t ExtraNumBlock_K = NumBlock_K - AverageNumBlock_K * Split_K; + size_t NumIter = AverageNumBlock_K; + if (BatchID < ExtraNumBlock_K) NumIter++; + size_t StartBlockID_K = AverageNumBlock_K * BatchID; + if (BatchID < ExtraNumBlock_K) + StartBlockID_K += BatchID; + else + StartBlockID_K += ExtraNumBlock_K; + // Warp ID. + const int warpId = threadIdx.x / WARP_SIZE; + int WARP_i = + warpId / TilingConfig::BLOCK_COL_WARPS; // WARP_i: row number; WARP_j: column number + // int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + // Global Memory Address for Matrix A (Weight) + // ///////////////////////////////////////////////////////////////////////// StartPTR for each + // ThreadBlock(TB) + const uint4* TB_StartGPTR_A1 = + Weight1 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + const uint4* TB_StartGPTR_A2 = + Weight2 + (y * TilingConfig::BLOCK_ROW_WARPS) * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // StartPTR for each WARP. + const uint4* WARP_StartGPTR_A1 = + TB_StartGPTR_A1 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + const uint4* WARP_StartGPTR_A2 = + TB_StartGPTR_A2 + WARP_i * NumBlock_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // StartPTR for each WARP, considering SplitK + const size_t WARP_Start_UnitID_K = StartBlockID_K; + WARP_StartGPTR_A1 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_2BIT_FRAG; + WARP_StartGPTR_A2 += WARP_Start_UnitID_K * NUM_INT4_PER_UNIT_4BIT_FRAG; + // Copying A tile from Global to Shared, using double-buffer + // ////////////////////////////////////////////////////////// StartSPTR for each ThreadBlock + uint32_t* AFrag_2BIT_SPTR = reinterpret_cast(smem); + uint32_t* AFrag_4BIT_SPTR = + AFrag_2BIT_SPTR + + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * TilingConfig::BLOCK_WARPS * + PIPELINE_LEVEL_GMEM; // 8 buffers including double buffers, 12 for trible buffers + // StartSPTR for each WARP + AFrag_2BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4; + AFrag_4BIT_SPTR += warpId * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4; + // Pre-fetch of A tile + for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) { + CopyFromGlobalToShared_A( + AFrag_2BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4, WARP_StartGPTR_A1); + CopyFromGlobalToShared_A( + AFrag_4BIT_SPTR + i * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4, WARP_StartGPTR_A2); + WARP_StartGPTR_A1 += SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; + WARP_StartGPTR_A2 += SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; + } + // Global Memory Address for Matrix A (QuantScale) + // ///////////////////////////////////////////////////////////////////// + const half* TB_StartGPTR_A_Scale = Scales + (y * TilingConfig::BLOCK_ROW_WARPS) * 64; + const half* WARP_StartGPTR_A_Scales = TB_StartGPTR_A_Scale + WARP_i * 64; + CopyFromGlobalToShared_Scales(QuantScales + WARP_i * 64, WARP_StartGPTR_A_Scales); + // Copying B tile from Global to Shared, considering SplitK + // ///////////////////////////////////////////////////////////// + const half* BTile_GPTR = B + Tile_Start_N * K_Global + StartBlockID_K * TilingConfig::TILE_K; + for (int i = 0; i < PIPELINE_LEVEL_GMEM - 1; i++) { + CopyFromGlobalToShared( + smem_array + i * TilingConfig::TILE_N, BTile_GPTR, K_Global, NumColumnToCopy); + BTile_GPTR += TilingConfig::TILE_K; + } + // Register Allocation for A,B, and C, Initilazed to Zeros + // 
///////////////////////////////////////////////////////////////////// + constexpr int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + constexpr int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block +#ifdef PIPELINE_LEVEL_SMEM + uint32_t a[NumRegSets_a * PIPELINE_LEVEL_SMEM] + [4]; // double/Trible buffer is used // Registers to store decompressed FP6 + uint32_t b[NumRegSets_b * PIPELINE_LEVEL_SMEM] + [4]; // double/Triple buffer is used // Register to store FP16 B matrix (a slice) +#endif + float c[NumRegSets_a * NumRegSets_b][REG_PER_THREAD_C_TENSOR_16_16]; + for (int i = 0; i < NumRegSets_a * NumRegSets_b; i++) + for (int j = 0; j < REG_PER_THREAD_C_TENSOR_16_16; j++) c[i][j] = 0.0f; + // + cp_async_wait_all(); + __syncthreads(); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + uint32_t Scales_RPTR[4]; // 4 Registers per thread for Quantization Scales + ExtractFromSharedToReg_Scales(Scales_RPTR, QuantScales + WARP_i * 64); +#ifdef PIPELINE_LEVEL_SMEM + // Initializing the Software Pipeline: writing registers. + // //////////////////////////////////////////////////////////////////////////////////////////////// + initialize_mma_slice( + a, b, AFrag_2BIT_SPTR, AFrag_4BIT_SPTR, smem_array, Scales_RPTR); +#endif +// The outer loop. +// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +#pragma unroll(1) + for (size_t tile_id_k = 0; tile_id_k < NumIter; tile_id_k++) { + // Trible-Buffer for A Tile + uint32_t* __restrict__ read_SPTR_Frag1 = + AFrag_2BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * + 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + uint32_t* __restrict__ read_SPTR_Frag2 = + AFrag_4BIT_SPTR + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * + 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 +#ifdef PIPELINE_LEVEL_SMEM + uint32_t* __restrict__ read2_SPTR_Frag1 = + AFrag_2BIT_SPTR + + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * 4; + uint32_t* __restrict__ read2_SPTR_Frag2 = + AFrag_4BIT_SPTR + + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * 4; +#endif + uint32_t* __restrict__ write_SPTR_Frag1 = + AFrag_2BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 4 * + 4; // 1024 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + uint32_t* __restrict__ write_SPTR_Frag2 = + AFrag_4BIT_SPTR + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 4 * + 4; // 2048 (1)*4: 4 WARPs; (2)/4: int*+1 = char*+16 + // Trible-Buffer for B Tile + half(*__restrict__ read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + ((tile_id_k + 0) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; +#ifdef PIPELINE_LEVEL_SMEM + half(*__restrict__ read2_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + ((tile_id_k + 1) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; +#endif + half(*__restrict__ write_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8] = + smem_array + + ((tile_id_k + (PIPELINE_LEVEL_GMEM - 1)) % PIPELINE_LEVEL_GMEM) * TilingConfig::TILE_N; + // + bool GlobalCopy = (tile_id_k + PIPELINE_LEVEL_GMEM - 1) < NumIter; + // 
Copying A tile from Global to Register, Bypassing L1, using double-buffer + CopyFromGlobalToShared_A( + write_SPTR_Frag1, WARP_StartGPTR_A1, GlobalCopy); + CopyFromGlobalToShared_A( + write_SPTR_Frag2, WARP_StartGPTR_A2, GlobalCopy); + // copying B tile from GlobalMemory to SharedMemory + CopyFromGlobalToShared( + write_SPTR, BTile_GPTR, K_Global, NumColumnToCopy, GlobalCopy); + cp_async_group_commit(); +#ifdef PIPELINE_LEVEL_SMEM + core_mma_slice(c, + a, + b, + read_SPTR_Frag1, + read_SPTR_Frag2, + read_SPTR, + Scales_RPTR, + 1); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each + // WARP; read_SPTR is shared among WARPs + core_mma_slice( + c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 2); + core_mma_slice( + c, a, b, read_SPTR_Frag1, read_SPTR_Frag2, read_SPTR, Scales_RPTR, 3); + // Barriers and Synchronizations + cp_async_wait_group(); + __syncthreads(); + core_mma_slice( + c, a, b, read2_SPTR_Frag1, read2_SPTR_Frag2, read2_SPTR, Scales_RPTR, 0); + // Updating global PTRs + WARP_StartGPTR_A1 += + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16 + WARP_StartGPTR_A2 += + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16 + BTile_GPTR += TilingConfig::TILE_K; +#else + PipelinedCoreLoop( + c, + read_SPTR, + read_SPTR_Frag1, + read_SPTR_Frag2, + Scales_RPTR); // read_SPTR_Frag1, read_SPTR_Frag2 are different for each WARP; + // read_SPTR is shared among WARPs + // Updating global PTRs + WARP_StartGPTR_A1 += + SMEM_SIZE_IN_BYTES_PER_WARP_A1 / 16; // 4KB/16=256 (1)/16: int4*+1 = char*+16 + WARP_StartGPTR_A2 += + SMEM_SIZE_IN_BYTES_PER_WARP_A2 / 16; // 8KB/16=512 (1)/16: int4*+1 = char*+16 + BTile_GPTR += TilingConfig::TILE_K; + // Barriers and Synchronizations + cp_async_wait_group(); + __syncthreads(); +#endif + } + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Store the C fragments to shared memory. + float(*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4] = + reinterpret_cast(smem); + StoreToSharedMemoryFromRegister(smem_CFrag, c); + __syncthreads(); + // Now that shared memory contains all the D tiles, stream them to global memory. + OutputDataType* BlockGlobalPTR = + C + BatchID * (M_Global * N_Global) + Tile_Start_M + Tile_Start_N * M_Global; + for (size_t i = warpId; i < NumColumnToCopy; i += TilingConfig::BLOCK_WARPS) // i-th column +#pragma unroll + for (size_t j = threadIdx.x % WARP_SIZE; j < TilingConfig::TILE_M; + j += WARP_SIZE) // j-th row + { + if constexpr (std::is_same::value) + BlockGlobalPTR[j + i * M_Global] = __float2half_rn(smem_CFrag[i][j]); + else + BlockGlobalPTR[j + i * M_Global] = smem_CFrag[i][j]; + } + +#else + assert(("The FP6 functions are only available on Ampere GPUs.", false)); +#endif +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh new file mode 100644 index 000000000000..c417e6a46a7c --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/kernel_reduction.cuh @@ -0,0 +1,43 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH +#define DEEPSPEED_CUDA_LINEAR_KERNEL_REDUCTION_CUH + +#include +#include +#include + +#define REDUCTION_ELEMENT_PER_THREADBLOCK 256 +#define HALF_PER_128BIT 8 + +__global__ void SplitK_Reduction(half* C, + float* Reduction_Workspace, + size_t M_Global, + size_t N_Global, + int Split_K) +{ + half* WARP_GPTR_C = C + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x; + float* WARP_GPTR_R = Reduction_Workspace + REDUCTION_ELEMENT_PER_THREADBLOCK * blockIdx.x; + half* THREAD_GPTR_C = WARP_GPTR_C + threadIdx.x * HALF_PER_128BIT; + float* THREAD_GPTR_R = WARP_GPTR_R + threadIdx.x * HALF_PER_128BIT; + // Initializing Thread-Local Results + float Results[HALF_PER_128BIT]; +#pragma unroll + for (int i = 0; i < HALF_PER_128BIT; i++) Results[i] = 0.0f; + // Reduction + for (int i = 0; i < Split_K; i++) { +#pragma unroll + for (int j = 0; j < HALF_PER_128BIT; j++) Results[j] += THREAD_GPTR_R[j]; + THREAD_GPTR_R += M_Global * N_Global; + } +// Writing to global memory +#pragma unroll + for (int i = 0; i < HALF_PER_128BIT; i++) THREAD_GPTR_C[i] = __float2half_rn(Results[i]); +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh new file mode 100644 index 000000000000..982d5a80010c --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_cp.async.cuh @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_CP_ASYNC_CUH + +#include +#include +#include + +template +__device__ __forceinline__ void cp_async(half* smem_ptr, + const half* global_ptr, + bool pred_guard = true) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + static_assert(SizeInBytes == 16, "Size is not supported"); + unsigned smem_int_ptr = __cvta_generic_to_shared(smem_ptr); + asm volatile( + "{ \n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), + "l"(global_ptr), + "n"(SizeInBytes)); +#else + assert( + ("The async copy functions are only supported on Ampere and newer architectures", false)); +#endif +} + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +__device__ __forceinline__ void cp_async_group_commit() +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.commit_group;\n" ::); +#else + assert( + ("The async copy functions are only supported on Ampere and newer architectures", false)); +#endif +} + +/// Blocks until all but previous cp.async.commit_group operations have committed. +template +__device__ __forceinline__ void cp_async_wait_group() +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); +#else + assert( + ("The async copy functions are only supported on Ampere and newer architectures", false)); +#endif +} + +/// Blocks until all previous cp.async.commit_group operations have committed. 
+// cp.async.wait_all is equivalent to : +// cp.async.commit_group; +// cp.async.wait_group 0; +__device__ __forceinline__ void cp_async_wait_all() +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cp.async.wait_all;\n" ::); +#else + assert( + ("The async copy functions are only supported on Ampere and newer architectures", false)); +#endif +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh new file mode 100644 index 000000000000..56f86a46f6b5 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/ptx_mma.cuh @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH +#define DEEPSPEED_CUDA_LINEAR_PTX_MMA_CUH + +#include +#include +#include + +#include +#include "configs.h" + +#ifdef PIPELINE_LEVEL_SMEM +template +__device__ __forceinline__ void B_FromSharedToReg( + uint32_t (*__restrict__ Reg)[4], + half (*__restrict__ read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + int slice_id) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#ifdef DEBUG_MODE + static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); +#endif + + const int warpId = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 * + WARP_j; // each warp may start from reading warp_start_col'th column of + // the B tile in shared memory +#ifdef DEBUG_MODE + assert(warp_start_col == 0); +#endif + + int col = (lane_id % 8) + (lane_id / 16) * 8; + int row = (lane_id % 16) / 8 * 8; + uint32_t smem_local_ptr = static_cast( + __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][slice_id * MMA_16 + row])); + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(Reg[0][0]), "=r"(Reg[0][1]) + : "r"(smem_local_ptr)); + } else { +#pragma unroll + for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) { + asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3]) + : "r"(smem_local_ptr)); + smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); + } + } +#else + assert( + ("The matrix load functions are only supported on Ampere and newer architectures", false)); +#endif +} +#else +// Debug: Whether ldmatrix.trans is required??? 
+// B is in column-major +template +__device__ __forceinline__ void B_FromSharedToReg( + uint32_t (*__restrict__ Reg)[4], + half (*__restrict__ read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + int k_offset) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#ifdef DEBUG_MODE + static_assert((TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == 0)); +#endif + + const int warpId = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int WARP_j = warpId % TilingConfig::BLOCK_COL_WARPS; + int warp_start_col = TilingConfig::WARP_COL_MMA_TENSORS * MMA_8 * + WARP_j; // each warp may start from reading warp_start_col'th column of + // the B tile in shared memory +#ifdef DEBUG_MODE + assert(warp_start_col == 0); +#endif + + int col = (lane_id % 8) + (lane_id / 16) * 8; + int row = (lane_id % 16) / 8 * 8; + uint32_t smem_local_ptr = static_cast( + __cvta_generic_to_shared(&read_SPTR[warp_start_col + col][k_offset + row])); + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + asm volatile("ldmatrix.sync.aligned.x2.m8n8.shared.b16 {%0, %1}, [%2];\n" + : "=r"(Reg[0][0]), "=r"(Reg[0][1]) + : "r"(smem_local_ptr)); + } else { +#pragma unroll + for (int i = 0; i < TilingConfig::WARP_COL_MMA_TENSORS / 2; i++) { + asm volatile("ldmatrix.sync.aligned.x4.m8n8.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(Reg[i][0]), "=r"(Reg[i][1]), "=r"(Reg[i][2]), "=r"(Reg[i][3]) + : "r"(smem_local_ptr)); + smem_local_ptr += 16 * (WARP_K + PADDING_SHARED_MEM_FOR_B_8) * sizeof(half); + } + } +#else + assert( + ("The matrix load functions are only supported on Ampere and newer architectures", false)); +#endif +} +#endif + +__device__ __forceinline__ void MMA_FP16_M16N8K16(uint32_t* __restrict__ c, + uint32_t* __restrict__ a, + uint32_t* __restrict__ b) +{ +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32" + "{ %0, %1, %2, %3}," + "{ %4, %5, %6, %7 }," + "{ %8, %9 }," + "{ %10, %11, %12, %13 };" + : "=r"(c[0]), "=r"(c[1]), "=r"(c[2]), "=r"(c[3]) + : "r"(a[0]), + "r"(a[1]), + "r"(a[2]), + "r"(a[3]), + "r"(b[0]), + "r"(b[1]), + "r"(c[0]), + "r"(c[1]), + "r"(c[2]), + "r"(c[3])); +#else + assert(("The mma functions are only implemented for Ampere and newer architectures", false)); +#endif +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh new file mode 100644 index 000000000000..bd8a009a02c6 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_core.cuh @@ -0,0 +1,246 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_CORE_CUH + +#include + +#include "configs.h" +#include "ptx_mma.cuh" +#include "utils_paralleldequant.cuh" + +#ifdef PIPELINE_LEVEL_SMEM +template +__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], + uint32_t* SPTR, + int slice_id) +{ + SPTR += slice_id * (NUM_INT_PER_THREAD * WARP_SIZE); + int lane_id = threadIdx.x % WARP_SIZE; +#pragma unroll + for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; } +} + +template +__device__ __forceinline__ void initialize_mma_slice( + uint32_t (*a)[4], + uint32_t (*b)[4], + uint32_t* __restrict__ A1_SPTR_read, + uint32_t* __restrict__ A2_SPTR_read, + half (*__restrict__ B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* RPTR_Scales) +{ + // Writing registers + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2]; // NO double buffer + uint32_t a_2[4]; // NO double buffer + CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, 0); + CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, 0); + Dequant_32FP6_4Way(a, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at + // register level, dequantizing a slice each time + B_FromSharedToReg(b, B_SPTR_read, 0); // Loading B from shared to registers +} + +template +__device__ __forceinline__ void core_mma_slice( + float c[][REG_PER_THREAD_C_TENSOR_16_16], + uint32_t (*a)[4], + uint32_t (*b)[4], + uint32_t* __restrict__ A1_SPTR_read, + uint32_t* __restrict__ A2_SPTR_read, + half (*__restrict__ B_SPTR_read)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* RPTR_Scales, + int slice_id) // writing slice[slice_id] to registers, k=0 -> slice_id=1 for prefetching +{ +#ifdef DEBUG_MODE + assert( + (TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == + 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block +#endif + const int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 
1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block + uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] = + reinterpret_cast( + c); // Registers for accumulated FP32 results + + // Setting RPTRs for double buffers + uint32_t(*a_read)[4] = a; + uint32_t(*a_write)[4] = a; + uint32_t(*b_read)[4] = b; + uint32_t(*b_write)[4] = b; + if (slice_id % 2 == 1) { + b_write += NumRegSets_b; + a_write += NumRegSets_a; + } else { + b_read += NumRegSets_b; + a_read += NumRegSets_a; + } + +// Reading registers and issuing core tensor core computations (a slice of A and B tile in shared +// memory) +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) { + MMA_FP16_M16N8K16(c_uint_ptr[i], a_read[i], b_read[0]); + } else { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) { + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a_read[i], b_read[j]); + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4, + a_read[i], + b_read[j] + 2); // c+4; b+2 + } + } + } + + // Writing registers + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2]; // NO double buffer + uint32_t a_2[4]; // NO double buffer + CopyFromSharedToRegister_AFrag<2>(a_1, A1_SPTR_read, slice_id); + CopyFromSharedToRegister_AFrag<4>(a_2, A2_SPTR_read, slice_id); + Dequant_32FP6_4Way( + a_write, a_1, a_2, RPTR_Scales); // SIMT Dequant: dequantizing FP6 to FP16 at register + // level, dequantizing a slice each time + B_FromSharedToReg( + b_write, B_SPTR_read, slice_id); // Loading B from shared to registers +} + +#else +// Old version with naive pipeline design +template +__device__ __forceinline__ void CopyFromSharedToRegister_AFrag(uint32_t Reg[], uint32_t* SPTR) +{ + int lane_id = threadIdx.x % WARP_SIZE; +#pragma unroll + for (int i = 0; i < NUM_INT_PER_THREAD; i++) { Reg[i] = SPTR[lane_id + i * WARP_SIZE]; } +} +template +__device__ __forceinline__ void PipelinedCoreLoop( + float c[][REG_PER_THREAD_C_TENSOR_16_16], + half __restrict__ (*read_SPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + uint32_t* __restrict__ read_SPTR_Frag1, + uint32_t* __restrict__ read_SPTR_Frag2, + uint32_t* RPTR_Scales) +{ +#ifdef DEBUG_MODE + assert( + (TilingConfig::WARP_COL_MMA_TENSORS == 1) || + (TilingConfig::WARP_COL_MMA_TENSORS % 2 == + 0)); // if WARP_COL_MMA_TENSORS == 1, B tile in registers is padded to a 16*16 MMA block +#endif + const int NumRegSets_a = + WARP_ROW_MMA_TENSORS; // 1 set = 4 registers, containing a 16*16 MMA block + const int NumRegSets_b = (TilingConfig::WARP_COL_MMA_TENSORS == 1) + ? 
1 + : TilingConfig::WARP_COL_MMA_TENSORS / + 2; // 1 set = 4 registers, containing a 16*16 MMA block + + // Registers to store FP32 results + uint32_t(*c_uint_ptr)[REG_PER_THREAD_C_TENSOR_16_16] = + reinterpret_cast(c); + // Registers to store FP6 fragments for a slice (64*16) of A matrix => 32 FP6 per thread => 6 + // register per thread; + uint32_t a_1[2 * 2]; // double buffer is used + uint32_t a_2[4 * 2]; // double buffer is used + // Registers to store decompressed FP6 + uint32_t a[NumRegSets_a * 1][4]; // No double buffer + // Register to store FP16 B matrix (a slice) + uint32_t b[NumRegSets_b * 2][4]; // double buffer is used + + // Overlapped Smem and TC pipeline: pre-loading from shared to registers + CopyFromSharedToRegister_AFrag<2>(a_1, read_SPTR_Frag1); + CopyFromSharedToRegister_AFrag<4>(a_2, read_SPTR_Frag2); + B_FromSharedToReg(b, read_SPTR, 0); + +#pragma unroll + for (int k = 0; k < WARP_K_MMA_TENSORS; k++) { + uint32_t(*b_read)[4] = b; + uint32_t(*b_write)[4] = b; + uint32_t* a_1_read = a_1; + uint32_t* a_1_write = a_1; + uint32_t* a_2_read = a_2; + uint32_t* a_2_write = a_2; + if (k % 2 == 0) { + b_write += NumRegSets_b; + a_1_write += 2; + a_2_write += 4; + } else { + b_read += NumRegSets_b; + a_1_read += 2; + a_2_read += 4; + } + // data loading + if (k + 1 < WARP_K_MMA_TENSORS) { + // updating SPTR for fragment1 and fragment2 + read_SPTR_Frag1 += 2 * WARP_SIZE; + read_SPTR_Frag2 += 4 * WARP_SIZE; + CopyFromSharedToRegister_AFrag<2>(a_1_write, read_SPTR_Frag1); + CopyFromSharedToRegister_AFrag<4>(a_2_write, read_SPTR_Frag2); + B_FromSharedToReg(b_write, read_SPTR, (k + 1) * MMA_16); + } + // SIMT Dequant + Tensor Core computations + Dequant_32FP6_4Way( + a, a_1_read, a_2_read, RPTR_Scales); // Dequantizing FP6 to FP16 at register level, + // dequantizing a slice each time +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { + if (TilingConfig::WARP_COL_MMA_TENSORS == 1) + MMA_FP16_M16N8K16(c_uint_ptr[i], a[i], b_read[0]); + else { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS / 2; j++) { + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS], a[i], b_read[j]); + MMA_FP16_M16N8K16(c_uint_ptr[i + j * WARP_ROW_MMA_TENSORS] + 4, + a[i], + b_read[j] + 2); // c+4; b+2 + } + } + } + } +} +#endif // #ifdef PIPELINE_LEVEL_SMEM + +template +__device__ __forceinline__ void StoreToSharedMemoryFromRegister( + float (*smem_CFrag)[TilingConfig::TILE_M + PADDING_SHARED_MEM_FOR_C_4], + float c[][REG_PER_THREAD_C_TENSOR_16_16]) +{ + const int lane_id = threadIdx.x % WARP_SIZE; + const int warpId = threadIdx.x / WARP_SIZE; + int warp_row_offset = warpId * (MMA_16 * WARP_ROW_MMA_TENSORS); +#pragma unroll + for (int i = 0; i < WARP_ROW_MMA_TENSORS; i++) { +#pragma unroll + for (int j = 0; j < TilingConfig::WARP_COL_MMA_TENSORS; + j++) { // Dealing with one 16*8 Tensor + int RegSetID = i + (j / 2) * WARP_ROW_MMA_TENSORS; + int RegOffset = (j % 2) * (REG_PER_THREAD_C_TENSOR_16_16 / 2); + int Tensor_row_offset = warp_row_offset + i * MMA_16; + int Tensor_col_offset = j * MMA_8; +#pragma unroll + for (int r = 0; r < REG_PER_THREAD_C_TENSOR_16_16 / 2; r++) { + int row_offset = lane_id / 4; + if (r >= 2) row_offset += 8; + int col_offset = (lane_id % 4) * 2; + if (r % 2 == 1) col_offset += 1; + smem_CFrag[Tensor_col_offset + col_offset][Tensor_row_offset + row_offset] = + c[RegSetID][r + RegOffset]; + } + } + } +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh 
b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh new file mode 100644 index 000000000000..3dd7e9e0104e --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_gmem.cuh @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_GMEM_CUH + +#include +#include "configs.h" +#include "ptx_cp.async.cuh" + +/* + * Copying A1/A2 from global memory to shared memory. + * Usually 1024 or 2048 Bytes + */ +template +__device__ __forceinline__ void CopyFromGlobalToShared_A(uint32_t* SPTR, + const uint4* GPTR, + bool pred_guard = true) +{ +#ifdef DEBUG_MODE + static_assert(SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE % 16 == 0); +#endif + int lane_id = threadIdx.x % WARP_SIZE; + half* SPTR_HALF = reinterpret_cast(SPTR); + const half* GPTR_HALF = reinterpret_cast(GPTR); + SPTR_HALF += lane_id * 8; + GPTR_HALF += lane_id * 8; +#pragma unroll + for (int i = 0; i < SMEM_SIZE_IN_BYTES_PER_WARP / WARP_SIZE / 16; i++) { + cp_async<16>(SPTR_HALF, GPTR_HALF, pred_guard); + SPTR_HALF += 256; // Forward 512 Bytes + GPTR_HALF += 256; // Forward 512 Bytes + } +} + +/* + * Copying 64 Quant Scales (FP16) from global memory to shared memory. + */ +__device__ __forceinline__ void CopyFromGlobalToShared_Scales(half* SPTR_QuantScales, + const half* GPTR_A_Scales) +{ + int lane_id = threadIdx.x % WARP_SIZE; + int Offset_Shared = lane_id * 2; + int Offset_Global = lane_id / 4 + (lane_id % 4) * 16; + for (int i = 0; i < 2; i++) + SPTR_QuantScales[Offset_Shared + i] = GPTR_A_Scales[Offset_Global + i * 8]; +} + +/* + * (1) Copying X rows * 64 columns of FP16 values, originally in row major + * (2) Copying 64 rows * X columns of FP16 values, originally in column major + * 16 Bytes per thread -> 512 Bytes per WARP = 4 line per WARP = 1 line per 8 Threads + */ +template +__device__ __forceinline__ void CopyFromGlobalToShared( + half (*__restrict__ SharedPTR)[WARP_K + PADDING_SHARED_MEM_FOR_B_8], + const half* GlobalPTR, + const int GlobalStride, + const int NumOfLinesLeft, // To support arbitrary N dimensions. + bool Pred = true) +{ + // static parameters: 1 Group (8 Threads) can copy 1 line (64 FP16) each time + const int NumOfThreads = BLOCK_WARPS * WARP_SIZE; + const int NumOfGroups = NumOfThreads / 8; + const int MaxIteration = (MaxNumOfLinesToCopy - 1) / NumOfGroups + 1; + // runtime variables + const int line_id = threadIdx.x / 8; + const int line_offset = (threadIdx.x % 8) * 8; + // PTR for source global memory and target shared memory + GlobalPTR += line_id * GlobalStride + line_offset; + SharedPTR += line_id; +#pragma unroll + for (int i = 0; i < MaxIteration; i++) { + bool AsyncCopyPred = (line_id + i * NumOfGroups) < NumOfLinesLeft && Pred; + cp_async<16>(&(*SharedPTR)[line_offset], GlobalPTR, AsyncCopyPred); + // + GlobalPTR += NumOfGroups * GlobalStride; + SharedPTR += NumOfGroups; + } +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh new file mode 100644 index 000000000000..11603fcc576c --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/utils_paralleldequant.cuh @@ -0,0 +1,127 @@ +// Copyright (c) Microsoft Corporation. 
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH +#define DEEPSPEED_CUDA_LINEAR_UTILS_PARALLELDEQUANT_CUH + +#include +#include +#include + +/* + * Input: R1 + * Outputs: R1, R2 + * Note: Simplified Exponent calculation is applied. + */ +__device__ __forceinline__ void FP6_FP16_Cast_4Way(uint32_t* R1, uint32_t* R2) +{ + *R2 = *R1 & 0x80808080; + *R1 = *R1 >> 2; + *R1 = *R1 & 0x1f1f1f1f; + *R2 = *R2 | *R1; + *R1 = *R2 & 0x9f009f00; + *R2 = *R2 & 0x009f009f; + *R2 = *R2 << 8; +} + +/* + * Input: R1 + * Outputs: R1, R2 + * Note: Simplified Exponent calculation is NOT applied. + */ +__device__ __forceinline__ void FP6_FP16_Cast_4Way_Naive(uint32_t* R1, uint32_t* R2) +{ + //*R2 = *R1 & 0x80808080; + *R2 = *R1 & 0xc0c0c0c0; + *R1 = *R1 >> 2; + //*R1 = *R1 & 0x1f1f1f1f; + *R1 = *R1 & 0x0f0f0f0f; + *R2 = *R2 | *R1; + // + //*R1 = *R2 & 0x9f009f00; + //*R2 = *R2 & 0x009f009f; + *R1 = *R2 & 0xcf00cf00; + if (!(*R1 & 0x40000000) && (*R1 & 0x0c000000)) *R1 = *R1 | 0x30000000; + if (!(*R1 & 0x00004000) && (*R1 & 0x00000c00)) *R1 = *R1 | 0x00003000; + *R2 = *R2 & 0x00cf00cf; + if (!(*R2 & 0x00400000) && (*R2 & 0x000c0000)) *R2 = *R2 | 0x00300000; + if (!(*R2 & 0x00000040) && (*R2 & 0x0000000c)) *R2 = *R2 | 0x00000030; + // + *R2 = *R2 << 8; + //*R1 = 0x3c003c00; + //*R2 = 0x3c003c00; +} + +__device__ __forceinline__ uint32_t MultScale(uint32_t PackedFP16Pair, half Scale) +{ + half* FP16_1 = reinterpret_cast(&PackedFP16Pair); + half* FP16_2 = FP16_1 + 1; + uint32_t output; + half* output_half_ptr = reinterpret_cast(&output); + output_half_ptr[0] = __hmul(__hmul(*FP16_1, __float2half(4096.0f)), Scale); + output_half_ptr[1] = __hmul(__hmul(*FP16_2, __float2half(4096.0f)), Scale); + return output; +} + +__device__ __forceinline__ void Dequant_32FP6_4Way(uint32_t (*__restrict__ Reg)[4], + uint32_t* __restrict__ read_RPTR_Frag1, + uint32_t* __restrict__ read_RPTR_Frag2, + uint32_t* Scales) +{ + uint32_t* OutputRegs = reinterpret_cast(Reg); + uint32_t* Frag1_PTR = read_RPTR_Frag1; + uint32_t* Frag2_PTR = read_RPTR_Frag2; + half* Scale_RPTR = reinterpret_cast(Scales); + uint32_t Packed_FP6 = 0; + uint32_t tmp = 0; +// Dequantizing 32 FP6, each Loop dequantizing 4 FP6 +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + // Frag1 + Packed_FP6 = (*Frag1_PTR) & 0xc0c0c0c0; + if (i % 4 == 3) + Frag1_PTR++; + else + (*Frag1_PTR) = (*Frag1_PTR) << 2; + // Frag2 + tmp = (*Frag2_PTR) & 0xf0f0f0f0; + tmp = tmp >> 2; + if (i % 2 == 1) + Frag2_PTR++; + else + (*Frag2_PTR) = (*Frag2_PTR) << 4; + // Packed_FP6 + Packed_FP6 = Packed_FP6 | tmp; + // + FP6_FP16_Cast_4Way(&Packed_FP6, &tmp); + // + *OutputRegs = MultScale(Packed_FP6, Scale_RPTR[0]); // Muliply FP16 scales + OutputRegs += 1; + *OutputRegs = MultScale(tmp, Scale_RPTR[1]); // Muliply FP16 scales + OutputRegs += 1; + // Updating offset for FP16 scales for every two iterations + if (i % 2 == 1) Scale_RPTR += 2; + } +} + +/* + * + */ +__device__ __forceinline__ void ExtractFromSharedToReg_Scales(uint32_t* Scales, + half* WARP_SPTR_Scales) +{ + int lane_id = threadIdx.x % WARP_SIZE; + uint32_t* SPTR_uint = reinterpret_cast(WARP_SPTR_Scales); + uint32_t tmpReg = SPTR_uint[lane_id]; +#pragma unroll + for (int i = 0; i < 4; i++) { + // T __shfl_sync(unsigned mask, T var, int srcLane, int width=warpSize); + Scales[i] = __shfl_sync(0xffffffff, tmpReg, i, 4); + } +} + +#endif diff --git 
a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h new file mode 100644 index 000000000000..98805e6c111c --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/include/weight_prepacking.h @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H +#define DEEPSPEED_CUDA_LINEAR_WEIGHT_PREPACKING_H + +#include +#include +#include + +using namespace std; + +void Padding_8_FP6_To_8_Bytes(unsigned char Padded_FP6[], + unsigned char* FP6_Array) // padding 0 to the lowerest bit location +{ + Padded_FP6[0] = FP6_Array[0] & 0xfc; + Padded_FP6[1] = (FP6_Array[0] << 6) | ((FP6_Array[1] >> 2) & 0xfc); + Padded_FP6[2] = (FP6_Array[1] << 4) | ((FP6_Array[2] >> 4) & 0xfc); + Padded_FP6[3] = FP6_Array[2] << 2; + Padded_FP6[4] = FP6_Array[3] & 0xfc; + Padded_FP6[5] = (FP6_Array[3] << 6) | ((FP6_Array[4] >> 2) & 0xfc); + Padded_FP6[6] = (FP6_Array[4] << 4) | ((FP6_Array[5] >> 4) & 0xfc); + Padded_FP6[7] = FP6_Array[5] << 2; +} + +unsigned char Extract_2_Bits_From_4_PaddedFP6(unsigned char B1, + unsigned char B2, + unsigned char B3, + unsigned char B4) +{ + unsigned char out; + out = (B1 & 0xc0) | ((B2 & 0xc0) >> 2) | ((B3 & 0xc0) >> 4) | ((B4 & 0xc0) >> 6); + return out; +} + +unsigned char Extract_4_Bits_From_2_PaddedFP6( + unsigned char B1, + unsigned char + B2) // The highest two bits are already extracted by Extract_2_Bits_From_4_PaddedFP6(); +{ + unsigned char out; + out = ((B1 << 2) & 0xf0) | ((B2 >> 2) & 0x0f); + return out; +} + +// dealing with 4 1*8 blocks of FP6 +void Assign_32_FP6_To_4_Thread(vector Seg_2bit[], + vector Seg_4bit[], + unsigned char* PTR_1, + unsigned char* PTR_2, + unsigned char* PTR_3, + unsigned char* PTR_4) +{ + unsigned char Padded_8_FP8[4][8]; + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[0], PTR_1); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[1], PTR_2); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[2], PTR_3); + Padding_8_FP6_To_8_Bytes(Padded_8_FP8[3], PTR_4); + // + unsigned char Seg1_Byte1_T[4]; + unsigned char Seg1_Byte2_T[4]; + unsigned char Seg2_Byte1_T[4]; + unsigned char Seg2_Byte2_T[4]; + unsigned char Seg2_Byte3_T[4]; + unsigned char Seg2_Byte4_T[4]; + for (int t = 0; t < 4; t++) { + Seg1_Byte1_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[0][0 + t * 2], + Padded_8_FP8[0][1 + t * 2], + Padded_8_FP8[1][0 + t * 2], + Padded_8_FP8[1][1 + t * 2]); + Seg1_Byte2_T[t] = Extract_2_Bits_From_4_PaddedFP6(Padded_8_FP8[2][0 + t * 2], + Padded_8_FP8[2][1 + t * 2], + Padded_8_FP8[3][0 + t * 2], + Padded_8_FP8[3][1 + t * 2]); + Seg2_Byte1_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[0][0 + t * 2], Padded_8_FP8[0][1 + t * 2]); + Seg2_Byte2_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[1][0 + t * 2], Padded_8_FP8[1][1 + t * 2]); + Seg2_Byte3_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[2][0 + t * 2], Padded_8_FP8[2][1 + t * 2]); + Seg2_Byte4_T[t] = + Extract_4_Bits_From_2_PaddedFP6(Padded_8_FP8[3][0 + t * 2], Padded_8_FP8[3][1 + t * 2]); + } + // + for (int t = 0; t < 4; t++) { + Seg_2bit[t].push_back(Seg1_Byte1_T[t]); + Seg_2bit[t].push_back(Seg1_Byte2_T[t]); + Seg_4bit[t].push_back(Seg2_Byte1_T[t]); + Seg_4bit[t].push_back(Seg2_Byte2_T[t]); + Seg_4bit[t].push_back(Seg2_Byte3_T[t]); + Seg_4bit[t].push_back(Seg2_Byte4_T[t]); + } + return; +} + 
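A note on the 2+4 split implemented by the helpers above: each padded byte carries one FP6 code in its top six bits; the top two of those (sign plus exponent MSB) go into the 2-bit stream and the remaining four into the 4-bit stream. The following is a minimal, self-contained host-side sketch of that round trip, illustrative only (the names padded_fp6, frag2, frag4, and rebuilt are not from the patch):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint8_t padded_fp6 = 0xB4;  // FP6 code 0b101101 left-aligned; low 2 bits are zero
        // Same bit selection that Extract_2_Bits_From_4_PaddedFP6 / Extract_4_Bits_From_2_PaddedFP6
        // apply to the first value of a group.
        const uint8_t frag2 = padded_fp6 & 0xc0;         // sign + exponent MSB, kept in bits 7..6
        const uint8_t frag4 = (padded_fp6 << 2) & 0xf0;  // remaining 4 bits, kept in bits 7..4
        // Recombining the two fragments restores the padded byte; this is the same OR that the
        // GPU-side dequantizer performs after loading the two streams separately.
        const uint8_t rebuilt = frag2 | (frag4 >> 2);
        assert(rebuilt == padded_fp6);
        return 0;
    }

Keeping each stream at a power-of-two bit width is what allows the aligned register and memory accesses used by the kernel when it later reassembles the FP6 values.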
+void BitInterleaving_2bit(unsigned char* PTR_4Bytes)
+{
+    unsigned int* PTR_UINT = reinterpret_cast<unsigned int*>(PTR_4Bytes);
+    unsigned int input = *PTR_UINT;
+    //
+    // int order_2bit[16] = {1,5,9,13,3,7,11,15,2,6,10,14,4,8,12,16};  // pre-defined order for
+    // bit-interleaving in QuantLLM
+    int order_2bit[16] = {
+        2, 6, 10, 14, 4, 8, 12, 16, 1, 5, 9, 13, 3, 7, 11, 15};  // pre-defined order for
+                                                                 // bit-interleaving in QuantLLM
+    unsigned int Frags_2bit[16];  // The highest 2 bits are used to store the extracted fragments.
+    for (int i = 0; i < 16; i++) Frags_2bit[i] = (input << 2 * (order_2bit[i] - 1)) & 0xc0000000;
+    //
+    unsigned int output = 0x00000000;
+    for (int i = 0; i < 16; i++) output |= (Frags_2bit[i] >> (i * 2));
+    //
+    *PTR_UINT = output;
+}
+
+void BitInterleaving_4bit(unsigned char* PTR_4Bytes)
+{
+    unsigned int* PTR_UINT = reinterpret_cast<unsigned int*>(PTR_4Bytes);
+    unsigned int input = *PTR_UINT;
+    //
+    // int order_4bit[8] = {1,5,3,7,2,6,4,8};  // pre-defined order for bit-interleaving in QuantLLM
+    int order_4bit[8] = {
+        2, 6, 4, 8, 1, 5, 3, 7};  // pre-defined order for bit-interleaving in QuantLLM
+    unsigned int Frags_4bit[8];  // The highest 4 bits are used to store the extracted fragments.
+    for (int i = 0; i < 8; i++) Frags_4bit[i] = (input << 4 * (order_4bit[i] - 1)) & 0xf0000000;
+    //
+    unsigned int output = 0x00000000;
+    for (int i = 0; i < 8; i++) output |= (Frags_4bit[i] >> (i * 4));
+    //
+    *PTR_UINT = output;
+}
+
+/*
+ * Inputs:
+ * (1) unsigned char Weight_6bit [M*K*6/8]
+ * Outputs:
+ * (1) unsigned char Weight_2bit [M*K*2/8]
+ * (2) unsigned char Weight_4bit [M*K*4/8]
+ *
+ * Assumption: Weight_6bit, Weight_2bit, Weight_4bit all stored continuously in row-major.
+ * 8 FP6 = 6 Bytes
+ * 8 FP4 = 4 Bytes
+ * 8 FP2 = 2 Bytes
+ */
+void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K)
+{
+    assert(M % 64 == 0);
+    assert(K % 64 == 0);
+    //
+    unsigned char* Weight_6bit = reinterpret_cast<unsigned char*>(FP6Weights);
+    unsigned char* Weight_2bit = Weight_6bit;
+    unsigned char* Weight_4bit = Weight_6bit + M * K * 2 / 8;
+    //
+    vector<unsigned char> A_Segment_2bit[32];
+    vector<unsigned char> A_Segment_4bit[32];
+    //
+    size_t BytesPerRow = K * 6 / 8;
+    // Pass-1: (1) 2+4 split; (2) assign weights to 32 threads.
+    for (size_t i = 0; i < M / 64; i++)  //
+    {
+        for (size_t j = 0; j < K / 16; j++) {
+            for (size_t k = 0; k < 64 / 16; k++) {
+                size_t row = i * 64 + k * 16;
+                size_t col = j * 16;
+                unsigned char* StartPTR_1 = Weight_6bit + row * BytesPerRow + col * 6 / 8;
+                unsigned char* StartPTR_2 = StartPTR_1 + 8 * BytesPerRow;
+                unsigned char* StartPTR_3 = StartPTR_1 + 8 * 6 / 8;
+                unsigned char* StartPTR_4 = StartPTR_2 + 8 * 6 / 8;
+                // Dealing with each 16*16 block then...
+ for (int l = 0; l < 8; l++) + Assign_32_FP6_To_4_Thread(&A_Segment_2bit[l * 4], + &A_Segment_4bit[l * 4], + StartPTR_1 + l * BytesPerRow, + StartPTR_2 + l * BytesPerRow, + StartPTR_3 + l * BytesPerRow, + StartPTR_4 + l * BytesPerRow); + } + } + } + // Verifying the length of 2_bit segments and 4_bit segments + size_t BytesPerThread_2bit = M * K * 2 / 8 / 32; + size_t BytesPerThread_4bit = M * K * 4 / 8 / 32; + for (int i = 0; i < 32; i++) { + assert(A_Segment_2bit[i].size() == BytesPerThread_2bit); + assert(A_Segment_4bit[i].size() == BytesPerThread_4bit); + } + // Pass-2: Optimizing coleasced global memory access + for (size_t i = 0; i < BytesPerThread_2bit / 4; i++) + for (int t = 0; t < 32; t++) + for (int b = 0; b < 4; b++) + Weight_2bit[i * 128 + t * 4 + (3 - b)] = + A_Segment_2bit[t] + [i * 4 + b]; // why (3-b): special byte order within a register + for (size_t i = 0; i < BytesPerThread_4bit / 4; i++) + for (int t = 0; t < 32; t++) + for (int b = 0; b < 4; b++) + Weight_4bit[i * 128 + t * 4 + (3 - b)] = + A_Segment_4bit[t][i * 4 + b]; // why (3-b):special byte order within a register + // Pass-3: Bit-level interleaving + for (size_t i = 0; i < BytesPerThread_2bit * 32 / 4; i++) + BitInterleaving_2bit(Weight_2bit + 4 * i); + for (size_t i = 0; i < BytesPerThread_4bit * 32 / 4; i++) + BitInterleaving_4bit(Weight_4bit + 4 * i); +} + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp new file mode 100644 index 000000000000..3b4966eb822b --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include + +#include "linear_kernels.h" + +namespace { + +// For bit-level debugging. +template +void print_bits(T num) +{ + char bits[sizeof(T) * 8 + 1] = {'\0'}; + for (int bit = 0; bit < (sizeof(T) * 8); bit++) { + bits[sizeof(T) * 8 - 1 - bit] = '0' + (num & 0x01); + num = num >> 1; + } + printf("%s\n", bits); +} + +void print_bits(half num) +{ + char bits[sizeof(half) * 8 + 1] = {'\0'}; + auto int_num = *reinterpret_cast(&num); + for (int bit = 0; bit < (sizeof(half) * 8); bit++) { + bits[sizeof(half) * 8 - 1 - bit] = '0' + (int_num & 0x01); + int_num = int_num >> 1; + } + printf("%s\n", bits); +} + +/* + * Function to pack 4 fake quantized FP16 value into continuously stored 4 FP6 values. + */ +void cast_fp16_fp6(uint16_t* FP16x4, uint8_t* FP6x4) +{ + // Constants for FP6 + constexpr int exponent_nbits_fp6 = 3; + constexpr int mantissa_nbits_fp6 = 2; + constexpr int exp_bias_fp6 = (1 << (exponent_nbits_fp6 - 1)) - 1; + // Constants for FP16 + constexpr int exponent_nbits_fp16 = 5; + constexpr int mantissa_nbits_fp16 = 10; + constexpr int exp_bias_fp16 = (1 << (exponent_nbits_fp16 - 1)) - 1; + + int fp6_temp[4]; + + float absmin_nonzero_fp6 = 0.0625; + // Note that we regard the exponent of '111' as a regular value rather than NaN or inf. This is + // the same with that in qtorch. + float absmax_fp6 = 28; + + for (int i = 0; i < 4; ++i) { + uint16_t source = FP16x4[i]; + float fp6_value_abs = std::abs(__half2float(*((half*)(&source)))); + if ((fp6_value_abs != 0 && fp6_value_abs < absmin_nonzero_fp6) || + fp6_value_abs > absmax_fp6) { + // TODO(zhen): a better way may be rounding it to the nearest FP6 value. 
+ throw std::invalid_argument("Input value out of range for FP6."); + } + + // It is not safe to do shift operation on uint16_t. So we promote it to int. + int source_promote = int(source); + + int sign_bit = (source_promote >> 15); + // Extracting exponent represented in FP16. The sign mask 0x7FFF is '0111 1111 1111 1111' + int exp_bit = (source_promote & 0x7FFF) >> mantissa_nbits_fp16; + // Extracting mantissa represented in FP16 + int mant_bit = source_promote & ((1 << mantissa_nbits_fp16) - 1); + + int new_exp_bit; + int new_mant_bit; + + if (exp_bit == 0) { + // Subnormal FP16 number. Too small for FP6. + new_exp_bit = 0; + new_mant_bit = 0; + } else { + new_mant_bit = mant_bit >> (mantissa_nbits_fp16 - mantissa_nbits_fp6); + new_exp_bit = exp_bit - exp_bias_fp16 + exp_bias_fp6; + + // Deal with subnormal FP6 values. + int target_exp_val = exp_bit - exp_bias_fp16; + int min_fp6_exp_val = -exp_bias_fp6 + 1; + bool subnormal_fp6 = target_exp_val < min_fp6_exp_val; + if (subnormal_fp6) { + // TODO(zhen): add the rounding logic. + new_exp_bit = 0; + // The implicit 1 in the mantissa of FP16 is not present in subnormal FP6. Thus we + // need to add it + new_mant_bit = (new_mant_bit | (1 << mantissa_nbits_fp6)) >> + (min_fp6_exp_val - target_exp_val); + } + } + + fp6_temp[i] = (sign_bit << (exponent_nbits_fp6 + mantissa_nbits_fp6)) | + (new_exp_bit << mantissa_nbits_fp6) | new_mant_bit; + } + // Pack the values + FP6x4[0] = fp6_temp[0] << 2 | (fp6_temp[1] >> 4); + FP6x4[1] = (fp6_temp[1] & 0x0F) << 4 | (fp6_temp[2] >> 2); + FP6x4[2] = (fp6_temp[2] & 0x03) << 6 | fp6_temp[3]; +} + +/* + * Function to prepack FP16 weights into continuous FP6 values. + * + * Parameters: + * weight_16bit: input weight in FP16, size M*K + * weight_6bit: output weight in packed FP6, continuously stored, size M*K*6/8 + * M, K: the shape of the weight + */ +void weight_prepacking_fp16_to_fp6(uint16_t* weight_16bit, + uint8_t* weight_6bit_packed, + size_t M, + size_t K) +{ + // Every four 16-bit elements are packed into three 6-bit values (4*6bit == 3*8bit). + if (K * 6 % 8 != 0) { throw std::invalid_argument("(K * 6 % 8) should be 0"); } + size_t K_fp6_packed = K * 6 / 8; + // #pragma omp parallel for + for (auto m = 0; m < M; m++) { + uint8_t* ptr_6bit = weight_6bit_packed + m * K_fp6_packed; + uint16_t* ptr_16bit = weight_16bit + m * K; + for (auto k = 0; k < K; k += 4) { + cast_fp16_fp6(ptr_16bit, ptr_6bit); + ptr_16bit += 4; + ptr_6bit += 3; + } + } +} + +} // namespace + +/* + * Function to execute the FP6 linear kernel. 
+ * + * Parameters: + * output: output tensor, size M*N + * hidden_states: input activation tensor, size N*K + * weights_2bit: packed 2bit weights, size M*K*2/8 + * weights_4bit: packed 4bit weights, size M*K*4/8 + * scales: scale tensor, size M + * workspace: workspace tensor, size M*N*split_k + * M: the output channel number of the weight + * N: the token number of the activation + * K: the input channel number of the weight + * split_k: the split size of the GEMM calculation + */ +void cuda_wf6af16_linear(torch::Tensor& output, + torch::Tensor& hidden_states, + torch::Tensor& weights_2bit, + torch::Tensor& weights_4bit, + torch::Tensor& scales, + torch::Tensor& workspace, + int M, + int N, + int K, + int split_k) +{ + TORCH_CHECK(weights_2bit.device().type() == torch::kCUDA, "weight_2bit must be on CUDA"); + TORCH_CHECK(weights_4bit.device().type() == torch::kCUDA, "weight_4bit must be on CUDA"); + TORCH_CHECK(hidden_states.device().type() == torch::kCUDA, "X must be on CUDA"); + TORCH_CHECK(scales.device().type() == torch::kCUDA, "scales must be on CUDA"); + + auto status = fp6_linear_kernel(at::cuda::getCurrentCUDAStream(), + (uint4*)(weights_2bit.data_ptr()), + (uint4*)(weights_4bit.data_ptr()), + (half*)(scales.data_ptr()), + (half*)(hidden_states.data_ptr()), + (half*)(output.data_ptr()), + M, + N, + K, + workspace.data_ptr(), + split_k); + if (status != cudaSuccess) { + AT_ERROR("fp6_linear_kernel failed with error: ", cudaGetErrorString(status)); + } +} + +/* + * Function to prepack the fake 6-bit-quantized FP16 weights into 2bit and 4bit. + * + * Parameters: + * weight: input weight in FP16 (containing the quantized FP6-ranged value), size M*K + * Returns: + * weight_2bit: output weight in 2bit, size M*K*2/8 + * weight_4bit: output weight in 4bit, size M*K*4/8 + */ +std::vector preprocess_weight(torch::Tensor& weight) +{ + TORCH_CHECK(weight.dim() == 2, "weight must be 2-dimensional"); + TORCH_CHECK(weight.scalar_type() == torch::kFloat16, "weight must be FP16"); + TORCH_CHECK(weight.is_contiguous(), "weight must be contiguous"); + TORCH_CHECK(weight.device().type() == torch::kCPU, "weight must be on CPU"); + auto M = weight.size(0); + auto K = weight.size(1); + TORCH_CHECK(K % 4 == 0, "K must be multiple of 4"); + + // Pack weight from FP16 to FP6. + uint16_t* weight_16bit_ptr = reinterpret_cast(weight.data_ptr()); + std::vector weight_6bit_packed(M * K * 6 / 8); + uint8_t* weight_6bit_ptr = weight_6bit_packed.data(); + weight_prepacking_fp16_to_fp6(weight_16bit_ptr, weight_6bit_ptr, M, K); + + // Split weight into 2bit and 4bit. + weight_matrix_prepacking(reinterpret_cast(weight_6bit_ptr), M, K); + uint8_t* weight_2bit_ptr = weight_6bit_ptr; + + // Make sure that the new split tensor does not share the underlying memory with the original + // one. Otherwise it will incur some problems when the original tensor is deleted. It also + // makes the memory flattern risky. 
+ auto weight_2bit = + torch::from_blob(weight_2bit_ptr, {M * K * 2 / 8}, torch::kUInt8).clone().detach(); + uint8_t* weight_4bit_ptr = weight_2bit_ptr + M * K * 2 / 8; + auto weight_4bit = + torch::from_blob(weight_4bit_ptr, {M * K * 4 / 8}, torch::kUInt8).clone().detach(); + + return {weight_2bit, weight_4bit}; +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h new file mode 100644 index 000000000000..01a6b7c18af8 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#ifndef DEEPSPEED_CUDA_LINEAR_KERNELS_H +#define DEEPSPEED_CUDA_LINEAR_KERNELS_H + +#include +#include +#include "ds_kernel_utils.h" + +#include "linear_kernels_cuda.h" + +void cuda_wf6af16_linear(torch::Tensor& output, + torch::Tensor& hidden_states, + torch::Tensor& weights_2bit, + torch::Tensor& weights_4bit, + torch::Tensor& scale, + torch::Tensor& workspace, + int M, + int N, + int K, + int split_k); + +std::vector preprocess_weight(torch::Tensor& Weight); + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu new file mode 100644 index 000000000000..ea0203c42f84 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu @@ -0,0 +1,318 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +// clang-format off +// Put the torch headers at the front to avoid conflict with other headers on +// `at::nullopt` and `at::optional`. 
+#include +#include +// clang-format on + +#include "include/kernel_matmul.cuh" +#include "include/kernel_reduction.cuh" +#include "include/weight_prepacking.h" + +#include +#include + +#include "linear_kernels_cuda.h" + +template +static void Kernel_Ex(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + OutputDataType* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + int Split_K) +{ +#ifdef DEBUG_MODE + printf("\n"); + printf("Launcher.cu->Kernel_Ex():\n"); + printf("M: %d, N: %d, K: %d, SplitK: %d\n", M_Global, N_Global, K_Global, Split_K); + printf("TILE_M: %d, TILE_K: %d, TILE_N: %d\n", + TilingConfig::TILE_M, + TilingConfig::TILE_K, + TilingConfig::TILE_N); +#endif + static size_t SHMEM_SZ = + max(TilingConfig::SMEM_SIZE_B_TILE + SMEM_SIZE_A1_TILE + SMEM_SIZE_A2_TILE, + TilingConfig::SMEM_SIZE_C_TILE); + cudaFuncSetAttribute(QUANT_GEMM_Kernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, + SHMEM_SZ); + size_t dimN = (N_Global - 1) / TilingConfig::TILE_N + 1; + size_t dimM = M_Global * Split_K / TilingConfig::TILE_M; + dim3 GridDim(dimN, dimM, 1); + dim3 BlockDim(WARP_SIZE * TilingConfig::BLOCK_WARPS, 1, 1); + +#ifdef DEBUG_MODE + printf( + "GridDim.x: %d, GridDim.y: %d, GridDim.z: %d, BlockDim.x: %d, BlockDim.y: %d, BlockDim.z: " + "%d SHMEM_SZ: %d\n", + GridDim.x, + GridDim.y, + GridDim.z, + BlockDim.x, + BlockDim.y, + BlockDim.z, + SHMEM_SZ); + printf("\n"); +#endif + + QUANT_GEMM_Kernel<<>>( + Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); +} + +/* + * + */ +cudaError_t fp6_linear_kernel(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + half* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K * + // M_Global * N_Global * sizeof(fp32) + int Split_K) +{ + assert(M_Global % 256 == 0); + assert(K_Global % 64 == 0); + assert(N_Global > 0); + + // Work around to support more N shapes: + size_t N_PowerOf2; + if (N_Global > 0 && N_Global <= 8) N_PowerOf2 = 8; + if (N_Global > 8 && N_Global <= 16) N_PowerOf2 = 16; + if (N_Global > 16 && N_Global <= 32) N_PowerOf2 = 32; + if (N_Global > 32 && N_Global <= 64) N_PowerOf2 = 64; + if (N_Global > 64 && N_Global <= 128) N_PowerOf2 = 128; + if (N_Global > 128) N_PowerOf2 = ((N_Global - 1) / 128 + 1) * 128; + + if (Split_K == 1) { + switch (N_PowerOf2) { + case 8: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 16: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 32: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 64: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + case 128: + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + default: + if (N_PowerOf2 % 128 != 0) { + printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2); + return cudaErrorUnknown; + } + Kernel_Ex, half>( + stream, Weight1, Weight2, Scales, B, C, M_Global, N_Global, K_Global, Split_K); + break; + } + } else { + switch (N_PowerOf2) { + case 8: + Kernel_Ex, float>(stream, + Weight1, + Weight2, + Scales, + B, + Reduction_Workspace, + M_Global, + N_Global, + K_Global, + 
                                                        Split_K);
+            break;
+        case 16:
+            Kernel_Ex<TilingConfig<4, 1, 2>, float>(stream,
+                                                    Weight1,
+                                                    Weight2,
+                                                    Scales,
+                                                    B,
+                                                    Reduction_Workspace,
+                                                    M_Global,
+                                                    N_Global,
+                                                    K_Global,
+                                                    Split_K);
+            break;
+        case 32:
+            Kernel_Ex<TilingConfig<4, 1, 4>, float>(stream,
+                                                    Weight1,
+                                                    Weight2,
+                                                    Scales,
+                                                    B,
+                                                    Reduction_Workspace,
+                                                    M_Global,
+                                                    N_Global,
+                                                    K_Global,
+                                                    Split_K);
+            break;
+        case 64:
+            Kernel_Ex<TilingConfig<4, 1, 8>, float>(stream,
+                                                    Weight1,
+                                                    Weight2,
+                                                    Scales,
+                                                    B,
+                                                    Reduction_Workspace,
+                                                    M_Global,
+                                                    N_Global,
+                                                    K_Global,
+                                                    Split_K);
+            break;
+        case 128:
+            Kernel_Ex<TilingConfig<4, 1, 8>, float>(stream,
+                                                    Weight1,
+                                                    Weight2,
+                                                    Scales,
+                                                    B,
+                                                    Reduction_Workspace,
+                                                    M_Global,
+                                                    N_Global,
+                                                    K_Global,
+                                                    Split_K);
+            break;
+        default:
+            if (N_PowerOf2 % 128 != 0) {
+                printf("QuantLLM_API Error: Unsupported N dimension %lu!\n", N_PowerOf2);
+                return cudaErrorUnknown;
+            }
+            Kernel_Ex<TilingConfig<4, 1, 8>, float>(stream,
+                                                    Weight1,
+                                                    Weight2,
+                                                    Scales,
+                                                    B,
+                                                    Reduction_Workspace,
+                                                    M_Global,
+                                                    N_Global,
+                                                    K_Global,
+                                                    Split_K);
+            break;
+        }
+        // Reduction for SplitK
+        dim3 GridDim((M_Global * N_Global) / REDUCTION_ELEMENT_PER_THREADBLOCK, 1, 1);
+        dim3 BlockDim(WARP_SIZE, 1, 1);
+        SplitK_Reduction<<<GridDim, BlockDim, 0, stream>>>(
+            C, Reduction_Workspace, M_Global, N_Global, Split_K);
+    }
+    return cudaGetLastError();
+}
+
+/*
+Computes FP6-FP16 GEMM (PyTorch interface).
+
+[Mathematical Formula]
+Standard definition of linear layer: Out = In * trans(W), where In, Out, and W are stored in
+row-major. After equivalent transformation: trans(Out) = W * trans(In). Note that we do not
+perform "transpose" during runtime, we instead interpret the In/Out as column-major matrices when
+calling our CUDA kernel.
+
+[Inputs]
+  _in_feats:  tensor of shape [B, IC];                 // half
+  _weights:   int tensor of shape [OC, IC // 16 * 3];  // 3 INT32 words contain 16 FP6 weights.
+  _scales:    tensor of shape [OC];                    // half
+  splitK:     splitting the MatMul problem along K dimension for higher GPU utilization, default 1.
+[Outputs]
+  _out_feats: tensor of shape [B, OC];                 // half
+*/
+torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats,
+                                      torch::Tensor _weights,
+                                      torch::Tensor _scales,
+                                      int splitK)
+{
+    int num_in_feats = _in_feats.size(0);
+    int num_in_channels = _in_feats.size(1);
+    int num_out_channels = _weights.size(0);
+    assert(num_in_channels % 64 == 0);
+    assert((num_in_channels / 16 * 3) ==
+           _weights.size(1));  // Making sure the K dimension is matched.
+    //
+    int M = num_out_channels;
+    int K = num_in_channels;
+    int N = num_in_feats;
+    // Input Tensors
+    auto weight1 = reinterpret_cast<const uint4*>(
+        _weights.data_ptr());  // weights is [OC, IC] but in FP6.
+    auto weight2 = weight1 + num_in_channels * num_out_channels * 2 / 128;
+    auto in_feats = reinterpret_cast<const half*>(_in_feats.data_ptr());
+    auto scales = reinterpret_cast<const half*>(_scales.data_ptr());
+    // Output Tensors
+    auto options = torch::TensorOptions().dtype(_in_feats.dtype()).device(_in_feats.device());
+    at::Tensor _out_feats = torch::empty({num_in_feats, num_out_channels}, options);
+    auto out_feats = reinterpret_cast<half*>(_out_feats.data_ptr());
+
+    // The reduction workspace must be a float buffer of size splitK * M * N that stays alive
+    // until the kernel has consumed it, so the backing tensor is declared in the enclosing scope.
+    float* Reduction_Workspace = nullptr;
+    at::Tensor _workspace;
+    if (splitK != 1) {
+        auto workspace_options =
+            torch::TensorOptions().dtype(torch::kFloat32).device(_in_feats.device());
+        _workspace = torch::empty({splitK, num_in_feats, num_out_channels}, workspace_options);
+        Reduction_Workspace = reinterpret_cast<float*>(
+            _workspace.data_ptr());  // Reduction_Workspace_Size = Split_K * M_Global *
+                                     // N_Global * sizeof(fp32)
+    }
+
+    fp6_linear_kernel(0,  // Using default stream here.
+ weight1, + weight2, + scales, + in_feats, + out_feats, + M, + N, + K, + Reduction_Workspace, + splitK); + + return _out_feats; +} + +/* + * Inputs: + * (1) unsigned char Weight_6bit [M*K*6/8] + * Outputs: + * (1) unsigned char Weight_2bit [M*K*2/8] + * (2) unsigned char Weight_4bit [M*K*4/8] + * + * Assumption: Weight_6bit, Weight_2bit, Weight_4bit all stored continuously in row-major. + * 8 FP6 = 6 Bytes + * 8 FP4 = 4 Bytes + * 8 FP2 = 2 Bytes + */ + +/* + * Weight prepacking (Pytorch interface). + * [Input & Output] + * fp6_tensor: int tensor of shape [OC, IC // 16 * 3]; // 3 INT32 words contains 16 FP6 weights. + * [Output] + * packed_tensor: int tensor of shape [OC, IC // 16 * 3]; + */ +torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t OC, size_t IC) +{ + assert((OC % 256 == 0) && (IC % 64 == 0)); + assert((fp6_tensor.size(0) == OC) && (fp6_tensor.size(1) == IC / 16 * 3)); + // auto packed_tensor = torch::empty_like(fp6_tensor); + // auto packed_tensor_ptr = reinterpret_cast(packed_tensor.data_ptr()); + auto fp6_tensor_ptr = reinterpret_cast(fp6_tensor.data_ptr()); + weight_matrix_prepacking(fp6_tensor_ptr, OC, IC); + return fp6_tensor; +} diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h new file mode 100644 index 000000000000..6a83290f0cb5 --- /dev/null +++ b/deepspeed/inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.h @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +// This is a copy of FP6-LLM kernel code: https://arxiv.org/abs/2401.14112 + +#ifndef DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH +#define DEEPSPEED_CUDA_LINEAR_FP6_LINEAR_CUH + +#include +#include +#include + +#include + +/* + * Computes FP6-FP16 GEMM (C++ interface). + */ +cudaError_t fp6_linear_kernel(cudaStream_t stream, + const uint4* Weight1, + const uint4* Weight2, + const half* Scales, + const half* B, + half* C, + const size_t M_Global, + const size_t N_Global, + const size_t K_Global, + float* Reduction_Workspace, // Reduction_Workspace_Size = Split_K * + // M_Global * N_Global * sizeof(fp32) + int Split_K); + +/* + * Computes FP6-FP16 GEMM (PyTorch interface). + */ +torch::Tensor fp6_linear_forward_cuda(torch::Tensor _in_feats, + torch::Tensor _weights, + torch::Tensor _scales, + int splitK = 1); + +/* + * In-place weight prepacking (C++ interface). + */ +void weight_matrix_prepacking(int* FP6Weights, size_t M, size_t K); + +/* + * Weight prepacking (Pytorch interface). 
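For reference, the op documented in the comment above is numerically a half-precision GEMM against per-output-channel-scaled weights, with the split-K variant reducing fp32 partial sums. Below is a minimal PyTorch sketch under the assumption that the FP6 weights have already been decoded to fp16 (the bit-level decode is not shown) and that the [OC] scales apply per output channel; the function name and signature are illustrative, not part of the DeepSpeed API.

import torch

def fp6_linear_reference(in_feats: torch.Tensor,         # [B, IC] fp16 activations
                         dequant_weights: torch.Tensor,   # [OC, IC] fp16 values decoded from FP6
                         scales: torch.Tensor,            # [OC] fp16 per-output-channel scales
                         split_k: int = 1) -> torch.Tensor:
    B, IC = in_feats.shape
    OC = dequant_weights.shape[0]
    assert OC % 256 == 0 and IC % 64 == 0    # same constraints the kernel asserts

    w = dequant_weights.float() * scales.float().unsqueeze(1)
    if split_k == 1:
        return (in_feats.float() @ w.t()).to(in_feats.dtype)       # [B, OC]

    # Split-K path: partial GEMMs over K chunks land in an fp32 workspace of
    # shape [split_k, B, OC] and are then summed (the SplitK_Reduction kernel).
    xs = in_feats.float().chunk(split_k, dim=1)
    ws = w.chunk(split_k, dim=1)
    workspace = torch.stack([x @ wk.t() for x, wk in zip(xs, ws)])
    return workspace.sum(dim=0).to(in_feats.dtype)

# Size arithmetic from the packing comments: 16 FP6 weights occupy 96 bits,
# i.e. 3 int32 words, so an [OC, IC] FP6 matrix packs into OC * IC * 6 / 8 bytes.
OC, IC = 256, 4096
assert OC * (IC // 16 * 3) * 4 == OC * IC * 6 // 8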
+ */ +torch::Tensor weight_matrix_prepacking_cpu(torch::Tensor fp6_tensor, size_t M, size_t K); + +#endif diff --git a/deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu b/deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu similarity index 100% rename from deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu rename to deepspeed/inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu diff --git a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu similarity index 96% rename from deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu rename to deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu index 84a9906cf037..fc14b1831361 100644 --- a/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu +++ b/deepspeed/inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu @@ -17,7 +17,7 @@ constexpr int access_size = 16; constexpr int threads = 1024; template -float gated_act_fn(float x, float y); +DS_D_INLINE float gated_act_fn(float x, float y); template <> DS_D_INLINE float gated_act_fn(float x, float y) @@ -119,7 +119,10 @@ void launch_gated_activation_impl(T* output, DISPATCH_UNROLL(5); } else if (unroll == 6) { DISPATCH_UNROLL(6); + } else if (unroll == 7) { + DISPATCH_UNROLL(7); } else { + // TODO: provide a kernel with an outer loop to handle larger columns. throw std::runtime_error( "Called with more columns than supported, please report this bug and this limit will " "be increased."); diff --git a/deepspeed/inference/v2/kernels/includes/ds_kernel_utils.h b/deepspeed/inference/v2/kernels/includes/ds_kernel_utils.h index 8e4888109fcd..f8b16ee6a315 100644 --- a/deepspeed/inference/v2/kernels/includes/ds_kernel_utils.h +++ b/deepspeed/inference/v2/kernels/includes/ds_kernel_utils.h @@ -23,7 +23,7 @@ used throughout the codebase. #ifdef __HIP_PLATFORM_AMD__ // constexpr variant of warpSize for templating -constexpr int hw_warp_size = 64; +constexpr int hw_warp_size = ROCM_WAVEFRONT_SIZE; #define HALF_PRECISION_AVAILABLE = 1 #include #include diff --git a/deepspeed/inference/v2/kernels/ragged_ops/__init__.py b/deepspeed/inference/v2/kernels/ragged_ops/__init__.py index 988152b2e7c0..38a4ebd6fba3 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/__init__.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/__init__.py @@ -10,4 +10,4 @@ from .logits_gather import * from .moe_gather import * from .moe_scatter import * -from .top_1_gating import * +from .top_k_gating import * diff --git a/deepspeed/inference/v2/kernels/ragged_ops/embed/embed.cu b/deepspeed/inference/v2/kernels/ragged_ops/embed/embed_cuda.cu similarity index 100% rename from deepspeed/inference/v2/kernels/ragged_ops/embed/embed.cu rename to deepspeed/inference/v2/kernels/ragged_ops/embed/embed_cuda.cu diff --git a/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h b/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h new file mode 100644 index 000000000000..f5104f899d9c --- /dev/null +++ b/deepspeed/inference/v2/kernels/ragged_ops/includes/top_k_utils.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#define TOP_K_SWITCH(N_TOP_K, ...) 
\ + [&] { \ + if (1 == N_TOP_K) { \ + constexpr int CONST_TOP_K = 1; \ + __VA_ARGS__(); \ + } else if (2 == N_TOP_K) { \ + constexpr int CONST_TOP_K = 2; \ + __VA_ARGS__(); \ + } else if (4 == N_TOP_K) { \ + constexpr int CONST_TOP_K = 4; \ + __VA_ARGS__(); \ + } else if (8 == N_TOP_K) { \ + constexpr int CONST_TOP_K = 8; \ + __VA_ARGS__(); \ + } \ + }() diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp index 8493bbf4b9af..634a63b81a31 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp @@ -13,6 +13,8 @@ (C_TYPE*)k.data_ptr(), \ (C_TYPE*)v.data_ptr(), \ (C_TYPE*)inv_freq_ptr, \ + rotary_dim, \ + theta_base, \ batch_wrapper, \ qkv_stride, \ kv_cache_stride, \ @@ -51,6 +53,9 @@ void kv_trained_rotary_embeddings(torch::Tensor& kv_cache, TORCH_CHECK(n_tokens == k.size(0)); TORCH_CHECK(n_tokens == v.size(0)); + const float theta_base = 0.f; + const int32_t rotary_dim = inv_freq.size(0) * 2; + // Dimensions const int32_t block_size = kv_cache.size(1); const int32_t n_kv_heads = kv_cache.size(3); @@ -91,6 +96,8 @@ void kv_rotary_embeddings(torch::Tensor& kv_cache, torch::Tensor& q, torch::Tensor& k, torch::Tensor& v, + const int32_t rotary_dim, + const float theta_base, torch::Tensor& batch_metadata, torch::Tensor& seq_metadata, torch::Tensor& tokens_to_seq, diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cuh b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cuh index be38ff30c46c..ff24b3f5bd80 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cuh +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cuh @@ -18,6 +18,8 @@ void launch_kv_rotary_kernel(T* kv_cache, T* k, T* v, T* inv_freq, + const int32_t rotary_dim, + const float theta_base, const BatchWrapperCPP batch_desc, const int qkv_stride, const int kv_cache_stride, diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.h b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.h index 0615825c0a21..c0700eda7147 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.h +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.h @@ -45,6 +45,8 @@ void kv_rotary_embeddings(torch::Tensor& kv_cache, torch::Tensor& q, torch::Tensor& k, torch::Tensor& v, + const int32_t rotary_dim, + const float theta_base, torch::Tensor& batch_metadata, torch::Tensor& seq_metadata, torch::Tensor& tokens_to_seq, diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py index 630d58d90a23..aacbec0bd3ae 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.py @@ -18,10 +18,11 @@ class BlockedRotaryEmbeddings(DSKernelBase): """ supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16] - supported_head_sizes = [64, 128] - supported_q_ratios = [1, 2, 4, 5, 8] + supported_head_sizes = [64, 80, 96, 128] + 
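The rotary changes in the hunks that follow generalize the embedding to a configurable rotary_dim and theta_base: only the first rotary_dim channels of each head are rotated (the rest pass through), the frequencies follow 1 / theta_base ** (2i / rotary_dim), and element i is paired with element i + rotary_dim / 2 (which is what the target_lane shuffle below realizes). A minimal PyTorch sketch of that math under those assumptions; the function name and tensor layout here are illustrative only.

import torch

def rotary_reference(x: torch.Tensor, positions: torch.Tensor,
                     rotary_dim: int, theta_base: float = 10000.0) -> torch.Tensor:
    """x: [n_tokens, n_heads, head_size]; positions: [n_tokens] global token indices."""
    head_size = x.shape[-1]
    assert rotary_dim % 2 == 0 and rotary_dim <= head_size

    half = rotary_dim // 2
    # inv_freq[i] = 1 / theta_base ** (2i / rotary_dim), matching the in-kernel fallback
    inv_freq = 1.0 / theta_base ** (torch.arange(half, dtype=torch.float32) * 2 / rotary_dim)
    angles = positions.float()[:, None] * inv_freq              # [n_tokens, rotary_dim // 2]
    cos, sin = angles.cos()[:, None, :], angles.sin()[:, None, :]

    x_rot, x_pass = x[..., :rotary_dim].float(), x[..., rotary_dim:]
    x1, x2 = x_rot[..., :half], x_rot[..., half:]
    rotated = torch.cat([x1 * cos - x2 * sin,                    # first half: rotary_sign = +1
                         x2 * cos + x1 * sin], dim=-1)           # second half: rotary_sign = -1
    return torch.cat([rotated.to(x.dtype), x_pass], dim=-1)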
supported_q_ratios = [1, 2, 4, 5, 6, 7, 8, 16, 29, 35, 36, 71] - def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None: + def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype, rotary_dim: int, + theta_base: float) -> None: """ Args: head_size: The size of the attention head. @@ -51,6 +52,8 @@ def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch self.head_size = head_size self.n_q_heads = n_q_heads self.n_kv_heads = n_kv_heads + self.rotary_dim = rotary_dim + self.theta_base = theta_base def __call__(self, kv_cache: torch.Tensor, qkv: torch.Tensor, ragged_batch: RaggedBatchWrapper) -> None: """ @@ -66,5 +69,5 @@ def __call__(self, kv_cache: torch.Tensor, qkv: torch.Tensor, ragged_batch: Ragg k = qkv[:, self.head_size * self.n_q_heads:self.head_size * (self.n_q_heads + self.n_kv_heads)] v = qkv[:, self.head_size * (self.n_q_heads + self.n_kv_heads):] - self.kernel(kv_cache, q, k, v, ragged_batch.batch_metadata_buffer(), ragged_batch.inflight_seq_descriptors(), - ragged_batch.tokens_to_seq(), ragged_batch.kv_ptrs()) + self.kernel(kv_cache, q, k, v, self.rotary_dim, self.theta_base, ragged_batch.batch_metadata_buffer(), + ragged_batch.inflight_seq_descriptors(), ragged_batch.tokens_to_seq(), ragged_batch.kv_ptrs()) diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu similarity index 54% rename from deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu rename to deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu index 63ea5bc88bab..f7bc693eefee 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu @@ -3,6 +3,7 @@ // DeepSpeed Team +#include #include "blocked_kv_rotary.cuh" #include "conversion_utils.h" #include "ds_kernel_utils.h" @@ -21,12 +22,14 @@ constexpr int threads = 256; Supports head size 32, 64, 128, 256 */ -template +template __global__ void kv_rotary_pos_kernel(T* kv_cache, T* q, T* k, T* v, const T* inv_freq, + const int32_t rotary_dim, + const float theta_base, const BatchWrapperCPP batch_desc, const int qkv_stride, const int kv_cache_stride, @@ -35,28 +38,31 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, { // Derived constexpr constexpr int vector_T = kv_rot::granularity / sizeof(T); - constexpr int threads_per_head = headSize / vector_T; - constexpr int half_head_size = headSize >> 1; + constexpr int real_threads_per_head = headSize / vector_T; + constexpr int threads_per_head = paddedHeadSize / vector_T; + constexpr int tokens_per_block = kv_rot::threads / threads_per_head; // CG helpers cg::thread_block tb = cg::this_thread_block(); cg::thread_block_tile warp = cg::tiled_partition(tb); - cg::thread_block_tile head_group = - cg::tiled_partition(warp); + cg::thread_block_tile head_group = cg::tiled_partition(tb); // Parallelize on the head dimension for X blocks const int head_idx = blockIdx.x; const int block_seq_idx = threadIdx.x / threads_per_head; - const int base_neuron_idx = (threadIdx.x * vector_T) % headSize; - const int half_idx = base_neuron_idx % half_head_size; - const int half_head_lanes = threads_per_head / 2; + const int base_neuron_idx = head_group.thread_rank() * vector_T; + const int 
half_rotary_size = rotary_dim / 2; + const int half_dim_lanes = half_rotary_size / vector_T; + const int half_idx = base_neuron_idx % half_rotary_size; // Multiple tokens processed by the same threadblock const int token_idx = blockIdx.y * tokens_per_block + block_seq_idx; const bool valid_token = token_idx < batch_desc.batch_metadata->n_tokens; - const bool load_inv_freq = (inv_freq != nullptr) && valid_token; + + const bool valid_thread = valid_token && (head_group.thread_rank() < real_threads_per_head); + const bool load_inv_freq = (inv_freq != nullptr) && valid_thread; // If we have GQA, then only one of the Q heads needs to do rotary + copy // for each of the heads in the group. @@ -67,9 +73,9 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, const int kv_head_idx = head_idx / qRatio; // Ensure we don't access invalid portions of the seq_metadata - const int32_t seq_id = (valid_token) ? batch_desc.tokens_to_seq[token_idx] : 0; + const int32_t seq_id = (valid_thread) ? batch_desc.tokens_to_seq[token_idx] : 0; const InflightSeqDescriptor seq_desc = batch_desc.seq_metadata[seq_id]; - // This will give an invalid index if valid_token is false, but should never affect memory. + // This will give an invalid index if valid_thread is false, but should never affect memory. const int32_t global_token_idx = seq_desc.seen_tokens + (token_idx - seq_desc.start_idx); T* q_row = q + token_idx * qkv_stride + head_idx * headSize; @@ -81,7 +87,7 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, const KVCacheDescriptor kv_desc = batch_desc.kv_desc; const int32_t seq_kv_block_idx = global_token_idx / kv_desc.block_size; const int32_t mapped_kv_block_idx = - (valid_token) ? kv_desc.block_lists[seq_id][seq_kv_block_idx] : 0; + (valid_thread) ? kv_desc.block_lists[seq_id][seq_kv_block_idx] : 0; const int32_t kv_block_offset = global_token_idx % kv_desc.block_size; const int32_t kv_offset = @@ -94,12 +100,11 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, T k_reg[vector_T], v_reg[vector_T], inv_freq_reg[vector_T]; - mem_access::load_global(q_reg, q_row + base_neuron_idx, valid_token); - mem_access::load_global(k_reg, k_row + base_neuron_idx, valid_token); - mem_access::load_global(v_reg, v_row + base_neuron_idx, valid_token); + mem_access::load_global(q_reg, q_row + base_neuron_idx, valid_thread); + mem_access::load_global(k_reg, k_row + base_neuron_idx, valid_thread); + mem_access::load_global(v_reg, v_row + base_neuron_idx, valid_thread); mem_access::load_global( inv_freq_reg, inv_freq + half_idx, load_inv_freq); - if constexpr (doRotary) { #pragma unroll for (int i = 0; i < vector_T; i++) { @@ -110,31 +115,37 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, inv_freq_flt = conversion::to(inv_freq_reg[i]) * (float)global_token_idx; } else { inv_freq_flt = - (float)((head_neuron_idx % half_head_size) * 2) / (float)headSize; + (float)((head_neuron_idx % half_rotary_size) * 2) / (float)rotary_dim; // Conversion to T and back means that both branches of this if statement // will produce the same results if using the same algo for producing the // freqs. - T trunc_freq = conversion::to(1.0 / powf(10000.0, inv_freq_flt)); + T trunc_freq = conversion::to(1.0 / powf(theta_base, inv_freq_flt)); inv_freq_flt = conversion::to(trunc_freq) * (float)global_token_idx; } - float rotary_sign = (head_neuron_idx >= half_head_size) ? -1.0f : 1.0f; + float rotary_sign = (head_neuron_idx >= half_rotary_size) ? 
-1.0f : 1.0f; float q_f = conversion::to(q_reg[i]); float k_f = conversion::to(k_reg[i]); float q_rot = q_f * rotary_sign; float k_rot = k_f * rotary_sign; - const float q_rot_temp = head_group.shfl_xor(q_rot, half_head_lanes); - const float k_rot_temp = head_group.shfl_xor(k_rot, half_head_lanes); + const int target_lane = (head_neuron_idx < half_rotary_size) + ? head_group.thread_rank() + half_dim_lanes + : head_group.thread_rank() - half_dim_lanes; + + const float q_rot_temp = head_group.shfl(q_rot, target_lane); + const float k_rot_temp = head_group.shfl(k_rot, target_lane); - q_reg[i] = - conversion::to(q_f * cosf(inv_freq_flt) + q_rot_temp * sinf(inv_freq_flt)); - k_reg[i] = - conversion::to(k_f * cosf(inv_freq_flt) + k_rot_temp * sinf(inv_freq_flt)); + if (base_neuron_idx < rotary_dim) { + q_reg[i] = conversion::to(q_f * cosf(inv_freq_flt) + + q_rot_temp * sinf(inv_freq_flt)); + k_reg[i] = conversion::to(k_f * cosf(inv_freq_flt) + + k_rot_temp * sinf(inv_freq_flt)); + } } } - if (valid_token) { + if (valid_thread) { mem_access::store_global(kv_cache + kv_offset + base_neuron_idx, k_reg); mem_access::store_global( @@ -143,7 +154,7 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, } else { T inv_freq_reg[vector_T]; - mem_access::load_global(q_reg, q_row + base_neuron_idx, valid_token); + mem_access::load_global(q_reg, q_row + base_neuron_idx, valid_thread); mem_access::load_global( inv_freq_reg, inv_freq + half_idx, load_inv_freq); @@ -157,47 +168,77 @@ __global__ void kv_rotary_pos_kernel(T* kv_cache, inv_freq_flt = conversion::to(inv_freq_reg[i]) * (float)global_token_idx; } else { inv_freq_flt = - (float)((head_neuron_idx % half_head_size) * 2) / (float)headSize; - inv_freq_flt = 1.0 / powf(10000.0, inv_freq_flt) * (float)global_token_idx; + (float)((head_neuron_idx % half_rotary_size) * 2) / (float)rotary_dim; + inv_freq_flt = 1.0 / powf(theta_base, inv_freq_flt) * (float)global_token_idx; } - float rotary_sign = (head_neuron_idx >= half_head_size) ? -1.0f : 1.0f; + float rotary_sign = (head_neuron_idx >= half_rotary_size) ? -1.0f : 1.0f; float q_f = conversion::to(q_reg[i]); float q_rot = q_f * rotary_sign; - const float q_rot_temp = head_group.shfl_xor(q_rot, half_head_lanes); + const int target_lane = (head_neuron_idx < half_rotary_size) + ? 
head_group.thread_rank() + half_dim_lanes + : head_group.thread_rank() - half_dim_lanes; - q_reg[i] = - conversion::to(q_f * cosf(inv_freq_flt) + q_rot_temp * sinf(inv_freq_flt)); + const float q_rot_temp = head_group.shfl(q_rot, target_lane); + if (base_neuron_idx < rotary_dim) + q_reg[i] = conversion::to(q_f * cosf(inv_freq_flt) + + q_rot_temp * sinf(inv_freq_flt)); } } } - if (valid_token && doRotary) { + if (valid_thread && doRotary) { mem_access::store_global(q_row + base_neuron_idx, q_reg); } } -#define DISPATCH_KV_ROTARY_IMPL(Q_RATIO, HEAD_SIZE) \ - if (q_ratio == Q_RATIO && head_size == HEAD_SIZE) \ - kv_rotary_pos_kernel \ - <<>>(kv_cache, \ - q, \ - k, \ - v, \ - inv_freq, \ - batch_desc, \ - qkv_stride, \ - kv_cache_stride, \ - v_offset, \ +#define DISPATCH_KV_ROTARY_IMPL(Q_RATIO, HEAD_SIZE, PADDED_HEAD_SIZE) \ + if (q_ratio == Q_RATIO && head_size == HEAD_SIZE) \ + kv_rotary_pos_kernel \ + <<>>(kv_cache, \ + q, \ + k, \ + v, \ + inv_freq, \ + rotary_dim, \ + theta_base, \ + batch_desc, \ + qkv_stride, \ + kv_cache_stride, \ + v_offset, \ inv_freq_stride); +#define LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, HEAD_SIZE) \ + if (padded_head_size == 64) { \ + DISPATCH_KV_ROTARY_IMPL(Q_RATIO, HEAD_SIZE, 64); \ + } else if (padded_head_size == 128) { \ + DISPATCH_KV_ROTARY_IMPL(Q_RATIO, HEAD_SIZE, 128); \ + } else { \ + assert(false); \ + } + +#define LAUNCH_KV_ROTARY_FOR_Q_RATIO(Q_RATIO) \ + if (head_size == 64) { \ + LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 64); \ + } else if (head_size == 80) { \ + LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 80); \ + } else if (head_size == 96) { \ + LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 96); \ + } else if (head_size == 128) { \ + LAUNCH_KV_ROTARY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 128); \ + } else { \ + assert(false); \ + } + template void launch_kv_rotary_kernel(T* kv_cache, T* q, T* k, T* v, T* inv_freq, + const int32_t rotary_dim, + const float theta_base, const BatchWrapperCPP batch_desc, const int qkv_stride, const int kv_cache_stride, @@ -210,23 +251,28 @@ void launch_kv_rotary_kernel(T* kv_cache, cudaStream_t stream) { constexpr int vector_T = kv_rot::granularity / sizeof(T); - const int threads_per_head = head_size / vector_T; + + const int padded_head_size = next_pow2(head_size); + const int threads_per_head = padded_head_size / vector_T; + const int tokens_per_block = kv_rot::threads / threads_per_head; const dim3 block(kv_rot::threads); const int token_blocks = (n_tokens + tokens_per_block - 1) / tokens_per_block; const dim3 grid(n_q_heads, token_blocks); - DISPATCH_KV_ROTARY_IMPL(1, 64) - DISPATCH_KV_ROTARY_IMPL(1, 128) - DISPATCH_KV_ROTARY_IMPL(2, 64) - DISPATCH_KV_ROTARY_IMPL(2, 128) - DISPATCH_KV_ROTARY_IMPL(4, 64) - DISPATCH_KV_ROTARY_IMPL(4, 128) - DISPATCH_KV_ROTARY_IMPL(5, 64) - DISPATCH_KV_ROTARY_IMPL(5, 128) - DISPATCH_KV_ROTARY_IMPL(8, 64) - DISPATCH_KV_ROTARY_IMPL(8, 128) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(1) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(2) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(4) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(5) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(6) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(7) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(8) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(16) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(29) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(35) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(36) + LAUNCH_KV_ROTARY_FOR_Q_RATIO(71) } #define INSTANTIATE_KV_ROTARY_KERNEL(TYPE) \ @@ -235,6 +281,8 @@ void launch_kv_rotary_kernel(T* kv_cache, TYPE * k, \ TYPE * v, \ TYPE * inv_freq, \ + const int32_t rotary_dim, \ + const float theta_base, \ const 
BatchWrapperCPP batch_desc, \ const int qkv_stride, \ const int kv_cache_stride, \ @@ -252,10 +300,43 @@ INSTANTIATE_KV_ROTARY_KERNEL(__half) INSTANTIATE_KV_ROTARY_KERNEL(__nv_bfloat16) #endif -#define DISPATCH_KV_COPY_IMPL(Q_RATIO, HEAD_SIZE) \ - if (q_ratio == Q_RATIO && head_size == HEAD_SIZE) \ - kv_rotary_pos_kernel<<>>( \ - kv_cache, q, k, v, nullptr, batch_desc, qkv_stride, kv_cache_stride, v_offset, 0); +#define DISPATCH_KV_COPY_IMPL(Q_RATIO, HEAD_SIZE, PADDED_HEAD_SIZE) \ + if (q_ratio == Q_RATIO && head_size == HEAD_SIZE) \ + kv_rotary_pos_kernel \ + <<>>(kv_cache, \ + q, \ + k, \ + v, \ + nullptr, \ + -1, \ + 0.f, \ + batch_desc, \ + qkv_stride, \ + kv_cache_stride, \ + v_offset, \ + 0); + +#define LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, HEAD_SIZE) \ + if (padded_head_size == 64) { \ + DISPATCH_KV_COPY_IMPL(Q_RATIO, HEAD_SIZE, 64); \ + } else if (padded_head_size == 128) { \ + DISPATCH_KV_COPY_IMPL(Q_RATIO, HEAD_SIZE, 128); \ + } else { \ + assert(false); \ + } + +#define LAUNCH_KV_COPY_FOR_Q_RATIO(Q_RATIO) \ + if (head_size == 64) { \ + LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 64); \ + } else if (head_size == 80) { \ + LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 80); \ + } else if (head_size == 96) { \ + LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 96); \ + } else if (head_size == 128) { \ + LAUNCH_KV_COPY_FOR_Q_RATIO_HEAD_SIZE(Q_RATIO, 128); \ + } else { \ + assert(false); \ + } template void launch_kv_copy_kernel(T* kv_cache, @@ -273,23 +354,19 @@ void launch_kv_copy_kernel(T* kv_cache, cudaStream_t stream) { constexpr int vector_T = kv_rot::granularity / sizeof(T); - const int threads_per_head = head_size / vector_T; + const int padded_head_size = next_pow2(head_size); + const int threads_per_head = padded_head_size / vector_T; const int tokens_per_block = kv_rot::threads / threads_per_head; const dim3 block(kv_rot::threads); const int token_blocks = (n_tokens + tokens_per_block - 1) / tokens_per_block; const dim3 grid(n_q_heads, token_blocks); - DISPATCH_KV_COPY_IMPL(1, 64) - DISPATCH_KV_COPY_IMPL(1, 128) - DISPATCH_KV_COPY_IMPL(2, 64) - DISPATCH_KV_COPY_IMPL(2, 128) - DISPATCH_KV_COPY_IMPL(4, 64) - DISPATCH_KV_COPY_IMPL(4, 128) - DISPATCH_KV_COPY_IMPL(5, 64) - DISPATCH_KV_COPY_IMPL(5, 128) - DISPATCH_KV_COPY_IMPL(8, 64) - DISPATCH_KV_COPY_IMPL(8, 128) + LAUNCH_KV_COPY_FOR_Q_RATIO(1) + LAUNCH_KV_COPY_FOR_Q_RATIO(2) + LAUNCH_KV_COPY_FOR_Q_RATIO(4) + LAUNCH_KV_COPY_FOR_Q_RATIO(5) + LAUNCH_KV_COPY_FOR_Q_RATIO(8) } #define INSTANTIATE_KV_COPY_KERNEL(TYPE) \ diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py index 59da1db0f5d6..f527be227ce1 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_trained_kv_rotary.py @@ -23,7 +23,7 @@ class BlockedTrainedRotaryEmbeddings(DSKernelBase): """ supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16] - supported_head_sizes = [64, 128] + supported_head_sizes = [64, 80, 96, 128] supported_q_ratios = [1, 2, 4, 5, 8] def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None: @@ -65,7 +65,7 @@ def __call__(self, kv_cache: torch.Tensor, qkv: torch.Tensor, ragged_batch: Ragg kv_cache (torch.Tensor): Pre-allocated KV cache of [num_blocks, block_size, 2, n_kv_heads, head_size] qkv: Input tensor of shape 
[num_tokens, head_size * (n_q_heads + 2 * n_kv_heads)] ragged_batch: Wrapper for the ragged batch. - inverse_freqs: Inverse frequencies for the rotary embeddings. Shape [max_seq_len, head_size // 2] + inverse_freqs: Inverse frequencies for the rotary embeddings. Shape [max_seq_len, rotary_dim // 2] """ q = qkv[:, :self.head_size * self.n_q_heads] diff --git a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py index c9f6ffd37b3e..4b2ad858a1bf 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/linear_blocked_kv_copy.py @@ -23,7 +23,7 @@ class LinearBlockedKVCopy(DSKernelBase): """ supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16] - supported_head_sizes = [64, 128] + supported_head_sizes = [64, 80, 96, 128] supported_q_ratios = [1, 2, 4, 5, 8] def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None: diff --git a/deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu b/deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu similarity index 100% rename from deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu rename to deepspeed/inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp index e55e1f48c125..506629406f0d 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp +++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp @@ -16,6 +16,8 @@ n_channels, \ n_experts, \ n_tokens, \ + n_top_k, \ + normalize_scales, \ at::cuda::getCurrentCUDAStream()); \ return; \ } @@ -27,17 +29,21 @@ void moe_gather(torch::Tensor& layer_output, const torch::Tensor& moe_output, const torch::Tensor& scores, const torch::Tensor& mapped_slots, - const torch::Tensor& expert_count) + const torch::Tensor& expert_count, + const bool normalize_scales) { const int32_t n_channels = layer_output.size(1); const int32_t n_experts = expert_count.size(0); const int32_t n_tokens = layer_output.size(0); + const int32_t n_top_k = mapped_slots.size(1); - TORCH_CHECK(moe_output.size(0) == n_tokens); + TORCH_CHECK(moe_output.size(0) == n_tokens * n_top_k); TORCH_CHECK(moe_output.size(1) == n_channels); TORCH_CHECK(scores.size(0) == n_tokens); TORCH_CHECK(mapped_slots.size(0) == n_tokens); + TORCH_CHECK(scores.size(1) == n_top_k); + TORCH_CHECK(layer_output.scalar_type() == moe_output.scalar_type()); TORCH_CHECK(scores.scalar_type() == torch::kFloat32); TORCH_CHECK(mapped_slots.scalar_type() == torch::kInt32); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cuh b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cuh index f98a727ead58..b348d0cfb330 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cuh +++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cuh @@ -17,4 +17,6 @@ void launch_moe_gather(T* layer_output, const int32_t n_channels, const int32_t n_experts, const int32_t n_tokens, + const int32_t n_top_k, + const bool normalize_scales, cudaStream_t stream); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.h 
b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.h index 7ffe9f8b4dc6..ec9e03057eb8 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.h +++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.h @@ -16,4 +16,5 @@ void moe_gather(torch::Tensor& layer_output, const torch::Tensor& moe_output, const torch::Tensor& scores, const torch::Tensor& mapped_slots, - const torch::Tensor& expert_counts); + const torch::Tensor& expert_counts, + const bool normalize_scales); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.py b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.py index c37683d03fbe..f03938171ba4 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.py @@ -18,7 +18,7 @@ class MoEGather(DSKernelBase): supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16] - def __init__(self, dtype: DtypeEnum, channels: int) -> None: + def __init__(self, dtype: DtypeEnum, channels: int, normalize_scores: bool = False) -> None: if not isinstance(dtype, DtypeEnum): dtype = DtypeEnum(dtype) @@ -31,6 +31,7 @@ def __init__(self, dtype: DtypeEnum, channels: int) -> None: inf_module = RaggedOpsBuilder().load() self.kernel = inf_module.moe_gather + self.normalize_scores = normalize_scores def __call__(self, layer_output: torch.Tensor, moe_output: torch.Tensor, scores: torch.Tensor, mapped_slots: torch.Tensor, expert_counts: torch.Tensor) -> torch.Tensor: @@ -40,13 +41,13 @@ def __call__(self, layer_output: torch.Tensor, moe_output: torch.Tensor, scores: Arguments: layer_output (torch.Tensor): The output of the layer of shape [n_tokens, hidden_size]. This has been scaled appropriately. - moe_output (torch.Tensor): The output of the MoE of shape [n_tokens, hidden_size]. + moe_output (torch.Tensor): The output of the MoE of shape [n_tokens * n_top_k, hidden_size]. scores (torch.Tensor): The gating scores of shape [n_tokens]. - mapped_slots (torch.Tensor): The index of the token in the expert's input of shape [n_tokens]. The index of token ``i`` in layer_output is ``mapped_slots[i]``. + mapped_slots (torch.Tensor): The index of the token in the expert's input of shape [n_tokens, n_top_k]. The indices of token ``i`` in layer_output is ``mapped_slots[i]``. expert_counts (torch.Tensor): The number of tokens assigned to each expert of shape [n_experts]. This is passed to fuse the clearing of this data structure into the gather. 
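The shapes documented here imply the gather is a score-weighted sum of each token's n_top_k expert rows, accumulated in fp32 and optionally renormalized when normalize_scales is set. A hedged PyTorch reference of that behavior (names are illustrative and unassigned slots are not handled):

import torch

def moe_gather_reference(moe_output: torch.Tensor,     # [n_tokens * n_top_k, hidden]
                         scores: torch.Tensor,         # [n_tokens, n_top_k], fp32
                         mapped_slots: torch.Tensor,   # [n_tokens, n_top_k], int32 row indices
                         normalize_scales: bool = False) -> torch.Tensor:
    if normalize_scales:
        scores = scores / scores.sum(dim=-1, keepdim=True).clamp_min(1e-20)

    # Pull each token's k expert rows and combine them in fp32, weighted by the
    # gating scores (the kernel likewise up-casts before scaling).
    expert_rows = moe_output[mapped_slots.long()].float()      # [n_tokens, n_top_k, hidden]
    layer_output = (expert_rows * scores.unsqueeze(-1)).sum(dim=1)
    return layer_output.to(moe_output.dtype)                   # [n_tokens, hidden]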
Returns: layer_output """ - self.kernel(layer_output, moe_output, scores, mapped_slots, expert_counts) + self.kernel(layer_output, moe_output, scores, mapped_slots, expert_counts, self.normalize_scores) return layer_output diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu similarity index 50% rename from deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu rename to deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu index c2fae24f5080..4153a2a3636f 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu +++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu @@ -7,7 +7,8 @@ #include "ds_kernel_utils.h" #include "moe_gather.cuh" #include "reduction_utils.h" -#include "top_1_gating.cuh" +#include "top_k_gating.cuh" +#include "top_k_utils.h" namespace gather { @@ -16,65 +17,105 @@ constexpr int threads = 256; } // namespace gather -template +template __global__ void moe_gather_kernel(T* layer_output, const T* moe_output, const float* scores, const int32_t* mapped_slots, int32_t* expert_counts, const int32_t n_channels, - const int32_t n_experts) + const int32_t n_experts, + const bool normalize_scales) { constexpr int32_t vector_size = gather::access_granularity / sizeof(T); constexpr int32_t stride = vector_size * gather::threads; const int32_t token_idx = blockIdx.x; - const int32_t mapped_slot = mapped_slots[token_idx]; + int32_t token_mapped_slots[N_TOP_K]; + + bool all_slots_invalid = true; + for (int i = 0; i < N_TOP_K; i++) { + token_mapped_slots[i] = mapped_slots[token_idx * N_TOP_K + i]; + all_slots_invalid &= (token_mapped_slots[i] == gating::unassigned); + } if (token_idx == 0) { // Reset expert counts for its next use. if (threadIdx.x < n_experts) { expert_counts[threadIdx.x] = 0; } } - if (mapped_slot == gating::unassigned) { - // This token was not assigned. + if (all_slots_invalid) { + // This token was not assigned to anything. // TODO(cmikeh2): It's possible we want different behavior here moving forward. return; } - const float score = scores[token_idx]; + float token_scores[N_TOP_K]; + for (int i = 0; i < N_TOP_K; i++) { token_scores[i] = scores[token_idx * N_TOP_K + i]; } + + if (normalize_scales) { + // Normalize the scores so that they sum to 1. 
+ float sum = 0.0f; + for (int i = 0; i < N_TOP_K; i++) { sum += token_scores[i]; } + + if (sum > 0.0f) { + for (int i = 0; i < N_TOP_K; i++) { token_scores[i] /= sum; } + } + } + const int32_t channel_offset = threadIdx.x * vector_size; - const T* moe_output_base = moe_output + mapped_slot * n_channels + channel_offset; + const T* moe_output_bases[N_TOP_K]; +#pragma unroll + for (int i = 0; i < N_TOP_K; i++) { + moe_output_bases[i] = moe_output + token_mapped_slots[i] * n_channels + channel_offset; + } + T* layer_output_base = layer_output + token_idx * n_channels + channel_offset; #pragma unroll for (int i = 0; i < copyUnroll; i++) { - T reg_buffer[vector_size]; - if (i * stride + channel_offset < n_channels) { - mem_access::load_global(reg_buffer, - moe_output_base + i * stride); + float accum_buffer[vector_size]; + for (int j = 0; j < vector_size; j++) { + accum_buffer[j] = reduce::init(); + } + +#pragma unroll + for (int j = 0; j < N_TOP_K; j++) { + T reg_buffer[vector_size]; + mem_access::load_global( + reg_buffer, moe_output_bases[j] + i * stride); +#pragma unroll + for (int k = 0; k < vector_size; k++) { + float up_cast = conversion::to(reg_buffer[k]); + accum_buffer[k] += up_cast * token_scores[j]; + } + } + + T store_buffer[vector_size]; #pragma unroll for (int j = 0; j < vector_size; j++) { - // There are accuracy implications of downcasting the score to a 16-bit - // data type, so we up-convert the input to 32-bit, multiply, and then - // down-convert back to 16-bit. - float up_cast = conversion::to(reg_buffer[j]); - reg_buffer[j] = conversion::to(up_cast * score); + store_buffer[j] = conversion::to(accum_buffer[j]); } mem_access::store_global(layer_output_base + i * stride, - reg_buffer); + store_buffer); } } } -#define LAUNCH_FOR_UNROLL(COUNT) \ - case COUNT: \ - moe_gather_kernel<<>>( \ - layer_output, moe_output, scores, mapped_slots, expert_counts, n_channels, n_experts); \ +#define LAUNCH_FOR_UNROLL(COUNT) \ + case COUNT: \ + moe_gather_kernel<<>>(layer_output, \ + moe_output, \ + scores, \ + mapped_slots, \ + expert_counts, \ + n_channels, \ + n_experts, \ + normalize_scales); \ break; template @@ -86,6 +127,8 @@ void launch_moe_gather(T* layer_output, const int32_t n_channels, const int32_t n_experts, const int32_t n_tokens, + const int32_t n_top_k, + const bool normalize_scales, cudaStream_t stream) { constexpr int vals_per_unroll = gather::threads * gather::access_granularity / sizeof(T); @@ -94,14 +137,16 @@ void launch_moe_gather(T* layer_output, const dim3 block(gather::threads); const dim3 grid(n_tokens); - switch (copy_unroll) { - LAUNCH_FOR_UNROLL(1) - LAUNCH_FOR_UNROLL(2) - LAUNCH_FOR_UNROLL(3) - LAUNCH_FOR_UNROLL(4) - LAUNCH_FOR_UNROLL(5) - LAUNCH_FOR_UNROLL(6) - } + TOP_K_SWITCH(n_top_k, [&] { + switch (copy_unroll) { + LAUNCH_FOR_UNROLL(1) + LAUNCH_FOR_UNROLL(2) + LAUNCH_FOR_UNROLL(3) + LAUNCH_FOR_UNROLL(4) + LAUNCH_FOR_UNROLL(5) + LAUNCH_FOR_UNROLL(6) + } + }); } #define INSTANTIATE_GATHER_FOR_TYPE(TYPE) \ @@ -113,6 +158,8 @@ void launch_moe_gather(T* layer_output, const int32_t n_channels, \ const int32_t n_experts, \ const int32_t n_tokens, \ + const int32_t n_top_k, \ + const bool normalize_scales, \ cudaStream_t stream); INSTANTIATE_GATHER_FOR_TYPE(__half) diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp index 902f1cc0ea15..8f7ecbd1a287 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp +++ 
b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp @@ -18,6 +18,7 @@ n_channels, \ n_tokens, \ n_experts, \ + n_top_k, \ at::cuda::getCurrentCUDAStream()); \ return; \ } @@ -36,13 +37,17 @@ void moe_scatter(torch::Tensor& moe_input, { const int32_t n_tokens = activations.size(0); const int32_t n_channels = activations.size(1); + const int32_t n_top_k = assignments.size(1); // Should have a lot of matching buffer sizes here. - TORCH_CHECK(n_tokens == moe_input.size(0)); TORCH_CHECK(n_tokens == assignments.size(0)); TORCH_CHECK(n_tokens == offsets.size(0)); TORCH_CHECK(n_channels == moe_input.size(1)); + TORCH_CHECK(n_top_k == offsets.size(1)); + TORCH_CHECK(n_top_k * n_tokens == moe_input.size(0)); + TORCH_CHECK(n_top_k == mapped_slots.size(1)); + const int32_t n_experts = expert_count_cumsums.size(0); TORCH_CHECK(moe_input.scalar_type() == activations.scalar_type()); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu deleted file mode 100644 index 0746cd7be645..000000000000 --- a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu +++ /dev/null @@ -1,208 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 - -// DeepSpeed Team - -#include "ds_kernel_utils.h" -#include "moe_scatter.cuh" -#include "reduction_utils.h" -#include "top_1_gating.cuh" - -using ROp = reduce::ROpType; - -namespace scatter { - -constexpr int access_granularity = 16; -constexpr int threads = 256; -constexpr int warps = threads / hw_warp_size; - -} // namespace scatter - -template -__global__ void moe_scatter_kernel(T* moe_input, - int64_t* expert_count_cumsums, - int32_t* mapped_slots, - const T* activations, - const int32_t* assignments, - const int32_t* expert_counts, - const int32_t* offsets, - const int32_t n_channels, - const int32_t n_experts) -{ - constexpr int32_t vector_size = scatter::access_granularity / sizeof(T); - constexpr int32_t load_stride = vector_size * scatter::threads; - - const int32_t token_idx = blockIdx.x; - const int32_t tidx = threadIdx.x; - const int32_t warp_rank = tidx / hw_warp_size; - - // Bank aligned and sufficient - __shared__ int32_t red_buffer[32]; - __shared__ int32_t token_0_row; - - // CG helpers - cg::thread_block tb = cg::this_thread_block(); - cg::thread_block_tile warp = cg::tiled_partition(tb); - - const int assigned_expert = assignments[token_idx]; - - // For the different codepaths, we'll converge on this variable for doing - // the token copy. - int32_t token_base_row; - - if (token_idx == 0) { - // Token 0 will perform a cumsum on the data - int32_t expert_vals; - if (tidx < n_experts) { - expert_vals = expert_counts[tidx]; - } else { - expert_vals = 0; - } - -#pragma unroll - for (int i = 1; i < hw_warp_size; i *= 2) { - int32_t maybe_add = warp.shfl_up(expert_vals, i); - expert_vals = (warp.thread_rank() < i) ? expert_vals : expert_vals + maybe_add; - } - - if (warp.thread_rank() == hw_warp_size - 1) { - mem_access::store_shared<4>(red_buffer + warp_rank, &expert_vals); - } - - tb.sync(); - - int32_t phase_2_val = 0; - if (warp.thread_rank() < scatter::warps) { - mem_access::load_shared<4>(&phase_2_val, red_buffer + warp.thread_rank()); - } - -#pragma unroll - for (int i = 1; i < hw_warp_size; i *= 2) { - int32_t maybe_add = warp.shfl_up(phase_2_val, i); - phase_2_val = (warp.thread_rank() < i) ? 
phase_2_val : phase_2_val + maybe_add; - } - - int warp_offset = 0; - if (warp_rank > 0) { warp_offset = warp.shfl(phase_2_val, warp_rank - 1); } - const int32_t expert_cumsum = warp_offset + expert_vals; - - if (tidx < n_experts) { - int64_t expert_cumsum_64 = (int64_t)expert_cumsum; - expert_count_cumsums[tidx] = expert_cumsum_64; - } - - if (assigned_expert == gating::unassigned) return; - if (assigned_expert - 1 == tidx) token_0_row = expert_cumsum; - - tb.sync(); - - if (assigned_expert != 0) { - token_base_row = token_0_row; - } else { - token_base_row = 0; - } - - } else if (assigned_expert == gating::unassigned) { - // For whatever reason, don't need to perform the copy, so we'll early return - // and signal this wasn't mapped with a negative 1. - if (tidx == 0) mapped_slots[token_idx] = gating::unassigned; - return; - } else { - // For all other valid tokens, we can just do a block-scoped sum. - if (tidx < assigned_expert) { - token_base_row = expert_counts[tidx]; - } else { - token_base_row = 0; - } - - warp.sync(); - - // TODO(cmikeh2): Shouldn't use the internal api. - reduce::_block(tb, warp, &token_base_row); - } - - // Data copy to appropriate location - const int32_t thread_offset = tidx * vector_size; - - const int32_t base_load_offset = token_idx * n_channels + thread_offset; - const T* load_base_ptr = activations + base_load_offset; - - const int32_t store_row = token_base_row + offsets[token_idx]; - const int32_t base_store_offset = store_row * n_channels + thread_offset; - T* store_base_ptr = moe_input + base_store_offset; - -#pragma unroll - for (int i = 0; i < copyUnroll; i++) { - T tmp_buf[vector_size]; - - if (i * load_stride + thread_offset < n_channels) { - mem_access::load_global(tmp_buf, - load_base_ptr + i * load_stride); - mem_access::store_global(store_base_ptr + i * load_stride, - tmp_buf); - } - } - - if (threadIdx.x == 0) { mapped_slots[token_idx] = store_row; } -} - -#define LAUNCH_FOR_UNROLL(COUNT) \ - case COUNT: \ - moe_scatter_kernel<<>>(moe_input, \ - expert_count_cumsums, \ - mapped_slots, \ - activations, \ - assignments, \ - expert_counts, \ - offsets, \ - n_channels, \ - n_experts); \ - break; - -template -void launch_moe_scatter(T* moe_input, - int64_t* expert_count_cumsums, - int32_t* mapped_slots, - const T* activations, - const int32_t* expert_counts, - const int32_t* assignments, - const int32_t* offsets, - const int32_t n_channels, - const int32_t n_tokens, - const int32_t n_experts, - cudaStream_t stream) -{ - constexpr int vals_per_unroll = scatter::threads * scatter::access_granularity / sizeof(T); - const int copy_unroll = (n_channels + vals_per_unroll - 1) / vals_per_unroll; - - const dim3 block(scatter::threads); - const dim3 grid(n_tokens); - - switch (copy_unroll) { - LAUNCH_FOR_UNROLL(1); - LAUNCH_FOR_UNROLL(2); - LAUNCH_FOR_UNROLL(3); - LAUNCH_FOR_UNROLL(4); - LAUNCH_FOR_UNROLL(5); - LAUNCH_FOR_UNROLL(6); - } -} - -#define INSTANTIATE_SCATTER_FOR_TYPE(TYPE) \ - template void launch_moe_scatter(TYPE*, \ - int64_t*, \ - int32_t*, \ - const TYPE*, \ - const int32_t*, \ - const int32_t*, \ - const int32_t*, \ - const int32_t, \ - const int32_t, \ - const int32_t, \ - cudaStream_t); - -INSTANTIATE_SCATTER_FOR_TYPE(__half); - -#ifdef BF16_AVAILABLE -INSTANTIATE_SCATTER_FOR_TYPE(__nv_bfloat16); -#endif diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cuh b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cuh index 5c94cb0ef734..d9756c80f05a 100644 --- 
a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cuh
+++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cuh
@@ -19,4 +19,5 @@ void launch_moe_scatter(T* moe_input,
                         const int32_t n_channels,
                         const int32_t n_tokens,
                         const int32_t n_experts,
+                        const int32_t n_top_k,
                         cudaStream_t stream);
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.py b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.py
index 5cd6ae5f0fe2..7efcedb4e880 100644
--- a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.py
+++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.py
@@ -40,13 +40,13 @@ def __call__(self, moe_input: torch.Tensor, expert_cumsum: torch.Tensor, mapped_
        Scatters the hidden states such that the token stride for each expert's input is contiguous.
        Arguments:
-            moe_input (torch.Tensor): The direct input for the MoE GEMM of shape [n_tokens, hidden_size].
+            moe_input (torch.Tensor): The direct input for the MoE GEMM of shape [n_tokens * n_top_k, hidden_size].
            expert_cumsum (torch.Tensor): The cumulative sum of the expert counts of shape [n_experts].
-            mapped_slots (torch.Tensor): The index of the token in the expert's input of shape [n_tokens].
+            mapped_slots (torch.Tensor): The index of the token in the expert's input of shape [n_tokens, n_top_k].
            hidden_states (torch.Tensor): The hidden states of shape [n_tokens, hidden_size].
            expert_counts (torch.Tensor): The number of tokens assigned to each expert of shape [n_experts].
-            assignments (torch.Tensor): The expert assignments of shape [n_tokens].
-            offsets (torch.Tensor): The offsets into the expert for a given token of shape [n_tokens].
+            assignments (torch.Tensor): The expert assignments of shape [n_tokens, n_top_k].
+            offsets (torch.Tensor): The offsets into the expert for a given token of shape [n_tokens, n_top_k].
        Returns:
            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: The MoE input (with scattered values), the cumsum of the offsets (for the MoE kernels themselves), and the assignments Tensor modified in place to show which row that token was mapped to in the input.
diff --git a/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu
new file mode 100644
index 000000000000..d3eb4f649e79
--- /dev/null
+++ b/deepspeed/inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu
@@ -0,0 +1,216 @@
+// Copyright (c) Microsoft Corporation.
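The new kernel below replicates each token's activation into n_top_k rows of moe_input, placing row (t, k) at the exclusive cumulative count of its expert plus offsets[t, k] so that every expert's tokens are contiguous, and records those destinations in mapped_slots. A small PyTorch reference of that bookkeeping (names are illustrative; tokens with unassigned experts are not handled):

import torch

def moe_scatter_reference(activations: torch.Tensor,     # [n_tokens, hidden]
                          assignments: torch.Tensor,     # [n_tokens, n_top_k] expert ids
                          offsets: torch.Tensor,         # [n_tokens, n_top_k] rank within expert
                          expert_counts: torch.Tensor):  # [n_experts]
    n_tokens, n_top_k = assignments.shape

    # Exclusive prefix sum of per-expert counts: the base row for expert e is the
    # number of tokens routed to all lower-numbered experts.
    excl_cumsum = torch.cumsum(expert_counts, dim=0) - expert_counts
    rows = excl_cumsum[assignments.long()] + offsets.long()    # [n_tokens, n_top_k]

    moe_input = activations.new_zeros(n_tokens * n_top_k, activations.shape[1])
    moe_input[rows.reshape(-1)] = activations.repeat_interleave(n_top_k, dim=0)

    expert_cumsum = torch.cumsum(expert_counts, dim=0).to(torch.int64)
    return moe_input, expert_cumsum, rows.to(torch.int32)      # rows == mapped_slots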
+// SPDX-License-Identifier: Apache-2.0 + +// DeepSpeed Team + +#include "ds_kernel_utils.h" +#include "reduction_utils.h" +#include "top_k_gating.cuh" +#include "top_k_utils.h" + +using ROp = reduce::ROpType; + +namespace scatter { + +constexpr int access_granularity = 16; +constexpr int threads = 256; +constexpr int warps = threads / hw_warp_size; +constexpr int max_experts = 1024; + +} // namespace scatter + +template +__global__ void moe_scatter_kernel(T* moe_input, + int64_t* expert_count_cumsums, + int32_t* mapped_slots, + const T* activations, + const int32_t* assignments, + const int32_t* expert_counts, + const int32_t* offsets, + const int32_t n_channels, + const int32_t n_experts) +{ + constexpr int32_t vector_size = scatter::access_granularity / sizeof(T); + constexpr int32_t load_stride = vector_size * scatter::threads; + + const int32_t token_idx = blockIdx.x; + const int32_t tidx = threadIdx.x; + const int32_t warp_rank = tidx / hw_warp_size; + + // Bank aligned and sufficient + __shared__ int32_t red_buffer[32]; + __shared__ int32_t expert_offsets[scatter::max_experts]; + + // CG helpers + cg::thread_block tb = cg::this_thread_block(); + cg::thread_block_tile warp = cg::tiled_partition(tb); + + // Fetch the assigned experts for this token. + int assigned_experts[N_TOP_K]; + for (int i = 0; i < N_TOP_K; i++) { + assigned_experts[i] = assignments[token_idx * N_TOP_K + i]; + } + + bool all_unassigned = true; + for (int i = 0; i < N_TOP_K; i++) { + if (assigned_experts[i] != gating::unassigned) { + all_unassigned = false; + } else { + mapped_slots[token_idx * N_TOP_K + i] = gating::unassigned; + } + } + if (all_unassigned && token_idx != 0) return; + + // Do a prefix scan on the expert counts to get the base offsets. Here we use the + // single up-sweep variant. + int32_t expert_vals; + if (tidx < n_experts) { + expert_vals = expert_counts[tidx]; + } else { + expert_vals = 0; + } + +#pragma unroll + for (int i = 1; i < hw_warp_size; i *= 2) { + int32_t maybe_add = warp.shfl_up(expert_vals, i); + expert_vals = (warp.thread_rank() < i) ? expert_vals : expert_vals + maybe_add; + } + + if (warp.thread_rank() == hw_warp_size - 1) { + mem_access::store_shared<4>(red_buffer + warp_rank, &expert_vals); + } + + tb.sync(); + + int32_t phase_2_val = 0; + if (warp.thread_rank() < scatter::warps) { + mem_access::load_shared<4>(&phase_2_val, red_buffer + warp.thread_rank()); + } + +#pragma unroll + for (int i = 1; i < hw_warp_size; i *= 2) { + int32_t maybe_add = warp.shfl_up(phase_2_val, i); + phase_2_val = (warp.thread_rank() < i) ? phase_2_val : phase_2_val + maybe_add; + } + + int warp_offset = 0; + if (warp_rank > 0) { warp_offset = warp.shfl(phase_2_val, warp_rank - 1); } + const int32_t expert_cumsum = warp_offset + expert_vals; + + // Token 0 will write the + if (token_idx == 0 && tidx < n_experts) { + int64_t expert_cumsum_64 = (int64_t)expert_cumsum; + expert_count_cumsums[tidx] = expert_cumsum_64; + } + + // Since token 0 has now written the expert cumsum to global memory, + // if it has no valid experts, we can early return. + if (token_idx == 0 && all_unassigned) return; + + if (tidx < n_experts) { expert_offsets[tidx] = expert_cumsum; } + + // Ensure all the expert offsets are written in shared memory. 
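The scan above is a standard two-level block prefix sum: each warp scans its own slice with shfl_up, the per-warp totals are scanned again through red_buffer, and each warp then adds the total of all preceding warps. A small NumPy sketch of the same computation, with warp_size standing in for hw_warp_size and a 256-thread block as in scatter::threads:

import numpy as np

def block_inclusive_scan(vals: np.ndarray, warp_size: int = 32) -> np.ndarray:
    """Two-level inclusive prefix sum mirroring the shfl_up pattern above."""
    num_warps = len(vals) // warp_size
    warps = vals.reshape(num_warps, warp_size)
    warp_scan = np.cumsum(warps, axis=1)                # per-warp inclusive scan (shfl_up loop)
    warp_totals = warp_scan[:, -1]                      # value the last lane stores to red_buffer
    warp_offsets = np.concatenate(([0], np.cumsum(warp_totals)[:-1]))  # phase-2 scan, shifted
    return (warp_scan + warp_offsets[:, None]).reshape(-1)

counts = np.zeros(256, dtype=np.int32)
counts[:8] = [3, 1, 4, 1, 5, 9, 2, 6]                   # example expert_counts (n_experts = 8)
expert_cumsum = block_inclusive_scan(counts)[:8]
assert (expert_cumsum == np.cumsum(counts[:8])).all()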
+ tb.sync(); + + // Data copy to appropriate location + const int32_t thread_offset = tidx * vector_size; + + const int32_t base_load_offset = token_idx * n_channels + thread_offset; + const T* load_base_ptr = activations + base_load_offset; + + int32_t store_rows[N_TOP_K]; + T* store_base_ptrs[N_TOP_K]; +#pragma unroll + for (int i = 0; i < N_TOP_K; i++) { + const int32_t cur_expert_offset = + (assigned_experts[i] > 0) ? expert_offsets[assigned_experts[i] - 1] : 0; + store_rows[i] = cur_expert_offset + offsets[token_idx * N_TOP_K + i]; + const int32_t base_store_offset = store_rows[i] * n_channels + thread_offset; + store_base_ptrs[i] = moe_input + base_store_offset; + } + +#pragma unroll + for (int i = 0; i < copyUnroll; i++) { + T tmp_buf[vector_size]; + + if (i * load_stride + thread_offset < n_channels) { + mem_access::load_global(tmp_buf, + load_base_ptr + i * load_stride); +#pragma unroll + for (int j = 0; j < N_TOP_K; j++) { + mem_access::store_global( + store_base_ptrs[j] + i * load_stride, tmp_buf); + } + } + } + + if (threadIdx.x == 0) { + for (int i = 0; i < N_TOP_K; i++) { mapped_slots[token_idx * N_TOP_K + i] = store_rows[i]; } + } +} + +#define LAUNCH_FOR_UNROLL(COUNT) \ + case COUNT: \ + moe_scatter_kernel \ + <<>>(moe_input, \ + expert_count_cumsums, \ + mapped_slots, \ + activations, \ + assignments, \ + expert_counts, \ + offsets, \ + n_channels, \ + n_experts); \ + break; + +template +void launch_moe_scatter(T* moe_input, + int64_t* expert_count_cumsums, + int32_t* mapped_slots, + const T* activations, + const int32_t* expert_counts, + const int32_t* assignments, + const int32_t* offsets, + const int32_t n_channels, + const int32_t n_tokens, + const int32_t n_experts, + const int32_t n_top_k, + cudaStream_t stream) +{ + constexpr int vals_per_unroll = scatter::threads * scatter::access_granularity / sizeof(T); + const int copy_unroll = (n_channels + vals_per_unroll - 1) / vals_per_unroll; + + const dim3 block(scatter::threads); + const dim3 grid(n_tokens); + + TOP_K_SWITCH(n_top_k, [&] { + switch (copy_unroll) { + LAUNCH_FOR_UNROLL(1); + LAUNCH_FOR_UNROLL(2); + LAUNCH_FOR_UNROLL(3); + LAUNCH_FOR_UNROLL(4); + LAUNCH_FOR_UNROLL(5); + LAUNCH_FOR_UNROLL(6); + } + }); +} + +#define INSTANTIATE_SCATTER_FOR_TYPE(TYPE) \ + template void launch_moe_scatter(TYPE*, \ + int64_t*, \ + int32_t*, \ + const TYPE*, \ + const int32_t*, \ + const int32_t*, \ + const int32_t*, \ + const int32_t, \ + const int32_t, \ + const int32_t, \ + const int32_t, \ + cudaStream_t); + +INSTANTIATE_SCATTER_FOR_TYPE(__half); + +#ifdef BF16_AVAILABLE +INSTANTIATE_SCATTER_FOR_TYPE(__nv_bfloat16); +#endif diff --git a/deepspeed/inference/v2/kernels/ragged_ops/ragged_ops.cpp b/deepspeed/inference/v2/kernels/ragged_ops/ragged_ops.cpp index 1c09fc52bbb1..f320f46e2620 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/ragged_ops.cpp +++ b/deepspeed/inference/v2/kernels/ragged_ops/ragged_ops.cpp @@ -12,7 +12,7 @@ #include "logits_gather.h" #include "moe_gather.h" #include "moe_scatter.h" -#include "top_1_gating.h" +#include "top_k_gating.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { @@ -43,6 +43,6 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) // moe_scatter.h m.def("moe_scatter", &moe_scatter, "MoE scatter for top-1-gating."); - // top_1_gating.h - m.def("top_1_gating", &top_1_gating, "Top-1 gating for MoE with ragged batch awareness."); + // top_k_gating.h + m.def("top_k_gating", &top_k_gating, "Top-1 gating for MoE with ragged batch awareness."); } diff --git 
a/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/__init__.py b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/__init__.py new file mode 100644 index 000000000000..487735b015b0 --- /dev/null +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .top_k_gating import RaggedTopKGating diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cpp b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp similarity index 67% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cpp rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp index 55c68454b228..5eec7e2b955f 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cpp +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp @@ -3,12 +3,12 @@ // DeepSpeed Team -#include "top_1_gating.h" +#include "top_k_gating.h" #include -#define DISPATCH_TOP_1_GATING(T_TYPE, C_TYPE) \ +#define DISPATCH_TOP_K_GATING(T_TYPE, C_TYPE) \ if (logits.options().dtype() == torch::T_TYPE) { \ - launch_top_1_gating((int32_t*)expert_counts.data_ptr(), \ + launch_top_k_gating((int32_t*)expert_counts.data_ptr(), \ (float*)scores.data_ptr(), \ (int32_t*)assignments.data_ptr(), \ (int32_t*)offsets.data_ptr(), \ @@ -16,14 +16,15 @@ batch_metadata_ptr, \ n_tokens, \ n_experts, \ + n_top_k, \ at::cuda::getCurrentCUDAStream()); \ return; \ } /* -Perform softmax plus atomics in order to do first pass of top_1_gating. +Perform softmax plus atomics in order to do first pass of top_k_gating. */ -void top_1_gating(torch::Tensor& expert_counts, +void top_k_gating(torch::Tensor& expert_counts, torch::Tensor& scores, torch::Tensor& assignments, torch::Tensor& offsets, @@ -31,10 +32,15 @@ void top_1_gating(torch::Tensor& expert_counts, torch::Tensor& batch_metadata) { const int32_t n_tokens = scores.size(0); + const int32_t n_top_k = scores.size(1); - // Should have the same buffer size for scores and offsets + // Should have the same buffer size for scores, offsets, and assignments TORCH_CHECK(n_tokens == offsets.size(0)); TORCH_CHECK(n_tokens == logits.size(0)); + TORCH_CHECK(n_tokens == assignments.size(0)); + + TORCH_CHECK(n_top_k == offsets.size(1)); + TORCH_CHECK(n_top_k == assignments.size(1)); TORCH_CHECK(expert_counts.scalar_type() == torch::kInt32); TORCH_CHECK(scores.scalar_type() == torch::kFloat); @@ -45,11 +51,11 @@ void top_1_gating(torch::Tensor& expert_counts, const RaggedBatchDescriptor* batch_metadata_ptr = reinterpret_cast(batch_metadata.data_ptr()); - DISPATCH_TOP_1_GATING(kFloat, float) - DISPATCH_TOP_1_GATING(kHalf, __half) + DISPATCH_TOP_K_GATING(kFloat, float) + DISPATCH_TOP_K_GATING(kHalf, __half) #ifdef BF16_AVAILABLE - DISPATCH_TOP_1_GATING(kBFloat16, __nv_bfloat16) + DISPATCH_TOP_K_GATING(kBFloat16, __nv_bfloat16) #endif - TORCH_CHECK(false, "Unsupported dtype for logits in top_1_gating"); + TORCH_CHECK(false, "Unsupported dtype for logits in top_k_gating"); } diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cuh b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cuh similarity index 87% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cuh rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cuh index c83ad56ff2f1..c525cc5f524e 100644 --- 
a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cuh +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cuh @@ -13,7 +13,7 @@ constexpr int unassigned = -1; } // namespace gating template -void launch_top_1_gating(int32_t* expert_counts, +void launch_top_k_gating(int32_t* expert_counts, float* scores, int32_t* assignments, int32_t* offsets, @@ -21,4 +21,5 @@ void launch_top_1_gating(int32_t* expert_counts, const RaggedBatchDescriptor* batch_metadata, const int32_t n_tokens, const int32_t n_experts, + const int32_t n_top_k, cudaStream_t stream); diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.h b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.h similarity index 86% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.h rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.h index b431f4cad30c..00840c3c93b5 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.h +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.h @@ -8,12 +8,12 @@ #include #include #include "ragged_dtypes.h" -#include "top_1_gating.cuh" +#include "top_k_gating.cuh" /* Perform softmax plus atomics to get token mapping. */ -void top_1_gating(torch::Tensor& expert_counts, +void top_k_gating(torch::Tensor& expert_counts, torch::Tensor& scores, torch::Tensor& assignments, torch::Tensor& offsets, diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.py b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.py similarity index 87% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.py rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.py index 1df97c2e9f8d..72ba2b6019bb 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.py +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.py @@ -13,7 +13,7 @@ from deepspeed.ops.op_builder import RaggedOpsBuilder -class RaggedTop1Gating(DSKernelBase): +class RaggedTopKGating(DSKernelBase): """ CUDA implementation of top-1 gating. This will perform a softmax on the logits, and return the scale as well as its idx within that expert's allocation. @@ -26,28 +26,28 @@ def __init__(self, logit_dtype: DtypeEnum) -> None: if not isinstance(logit_dtype, DtypeEnum): logit_dtype = DtypeEnum(logit_dtype) - if logit_dtype not in RaggedTop1Gating.supported_logit_dtypes: + if logit_dtype not in RaggedTopKGating.supported_logit_dtypes: raise RuntimeError(f"Unsupported logit dtype {logit_dtype}") inf_module = RaggedOpsBuilder().load() - self.kernel = inf_module.top_1_gating + self.kernel = inf_module.top_k_gating def __call__(self, expert_counts: torch.Tensor, scores: torch.Tensor, assignments: torch.Tensor, offsets: torch.Tensor, logits: torch.Tensor, batch: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - Perform the ragged top_1_gating. + Perform the ragged top_k_gating. Arguments: expert_counts (torch.Tensor): Tensor of 0s of shape [n_experts] to be filled with number of tokens assigned to each expert. This must be filled with 0s else the copy kernel will buffer overflow. In order to minimize the zero-fill cost, it is recommended to write to 0 during the MoE output remapping. 
- scores (torch.Tensor): Preallocated output of shape [n_tokens] to place expert scaling + scores (torch.Tensor): Preallocated output of shape [n_tokens, n_top_k] to place expert scaling value. - expert_assignment (torch.Tensor): Preallocated output of shape [n_tokens] to place + expert_assignment (torch.Tensor): Preallocated output of shape [n_tokens, n_top_k] to place which expert a token has been assigned to. - expert_offset (torch.Tensor): Preallocated output of shape [n_tokens] to place which + expert_offset (torch.Tensor): Preallocated output of shape [n_tokens, n_top_k] to place which offset within an experts group a token is. logits (torch.Tensor): Raw logits of gating function. batch (RaggedBatchWrapper): Batch information for ragged tensor. diff --git a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cu b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu similarity index 59% rename from deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cu rename to deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu index 02daee9f692e..58f95c045593 100644 --- a/deepspeed/inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cu +++ b/deepspeed/inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu @@ -6,12 +6,13 @@ #include "conversion_utils.h" #include "memory_access_utils.h" #include "reduction_utils.h" -#include "top_1_gating.cuh" +#include "top_k_gating.cuh" +#include "top_k_utils.h" using ROp = reduce::ROpType; -template -__global__ void top_1_gating_kernel(int32_t* expert_counts, +template +__global__ void top_k_gating_kernel(int32_t* expert_counts, float* scores, int32_t* assignments, int32_t* offsets, @@ -30,8 +31,11 @@ __global__ void top_1_gating_kernel(int32_t* expert_counts, // Padding tokens do not require if (token_idx >= batch_metadata->n_tokens) { if (threadIdx.x == 0) { - offsets[token_idx] = gating::unassigned; - assignments[token_idx] = gating::unassigned; +#pragma unroll + for (int i = 0; i < TOP_K; i++) { + assignments[token_idx * TOP_K + i] = gating::unassigned; + offsets[token_idx * TOP_K + i] = gating::unassigned; + } } return; } @@ -44,34 +48,46 @@ __global__ void top_1_gating_kernel(int32_t* expert_counts, } else { reduce::init(&logit_val); } + float reduce_val = logit_val; + + int32_t local_assigned_experts[TOP_K]; + float local_assigned_logits[TOP_K]; // Training code tends to use ``torch.argmax`` to select the expert, which // which has ties broken by the lower index. Since our fused comparison algorithm // breaks ties by the higher index (since it's the lower 32-bits of the 64-bit // comparison), we invert the expert index to break ties by the lower index. 
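// Editor's note: the following comment block is an illustrative sketch added for clarity and is
// not part of the original change; the expert indices are hypothetical. With n_experts = 8 and a
// tie in logit value between experts 2 and 5, torch.argmax semantics should select expert 2. The
// fused reduction keeps the higher index on ties, so we reduce over the inverted indices
// (8 - 2 - 1 = 5 for expert 2, 8 - 5 - 1 = 2 for expert 5); the tie is then won by inverted
// index 5, and n_experts - res.idx - 1 = 8 - 5 - 1 = 2 recovers the lower original expert index.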
int32_t inverted_expert = n_experts - expert_idx - 1; - // Perform softmax - const reduce::IdxReduceResult res = - reduce::idx_reduce(tb, warp, logit_val, inverted_expert); - // Recover the original expert index - const int32_t assigned_expert = n_experts - res.idx - 1; - const float max_logit = res.val; + // Find the top k logits + for (int i = 0; i < TOP_K; ++i) { + const reduce::IdxReduceResult res = + reduce::idx_reduce(tb, warp, reduce_val, inverted_expert); + local_assigned_experts[i] = n_experts - res.idx - 1; + local_assigned_logits[i] = res.val; + + // Set the max logit to -inf so that it is not selected again + if (threadIdx.x == n_experts - res.idx - 1) { reduce::init(&reduce_val); } + } + + const float max_logit = local_assigned_logits[0]; float softmax_sum = __expf(logit_val - max_logit); reduce::block(tb, warp, softmax_sum); - // Compute the score - const float score = __expf(max_logit - max_logit) / softmax_sum; + for (int i = 0; i < TOP_K; ++i) { + const float softmax = __expf(local_assigned_logits[i] - max_logit) / softmax_sum; - if (threadIdx.x == 0) { - scores[token_idx] = score; - assignments[token_idx] = assigned_expert; - offsets[token_idx] = atomicAdd(expert_counts + assigned_expert, 1); + if (threadIdx.x == 0) { + scores[token_idx * TOP_K + i] = softmax; + assignments[token_idx * TOP_K + i] = local_assigned_experts[i]; + offsets[token_idx * TOP_K + i] = + atomicAdd(expert_counts + local_assigned_experts[i], 1); + } } } template -void launch_top_1_gating(int32_t* expert_counts, +void launch_top_k_gating(int32_t* expert_counts, float* scores, int32_t* assignments, int32_t* offsets, @@ -79,17 +95,20 @@ void launch_top_1_gating(int32_t* expert_counts, const RaggedBatchDescriptor* batch_metadata, const int32_t n_tokens, const int32_t n_experts, + const int32_t n_top_k, cudaStream_t stream) { const dim3 grid(n_tokens); const dim3 block(((n_experts + hw_warp_size - 1) / hw_warp_size) * hw_warp_size); - top_1_gating_kernel<<>>( - expert_counts, scores, assignments, offsets, logits, batch_metadata, n_experts); + TOP_K_SWITCH(n_top_k, [&] { + top_k_gating_kernel<<>>( + expert_counts, scores, assignments, offsets, logits, batch_metadata, n_experts); + }); } -#define INSTANTIATE_TOP_1_KERNEL(T) \ - template void launch_top_1_gating(int32_t * expert_counts, \ +#define INSTANTIATE_top_k_KERNEL(T) \ + template void launch_top_k_gating(int32_t * expert_counts, \ float* scores, \ int32_t* assignments, \ int32_t* offsets, \ @@ -97,10 +116,10 @@ void launch_top_1_gating(int32_t* expert_counts, const RaggedBatchDescriptor* batch_metadata, \ const int32_t n_tokens, \ const int32_t n_experts, \ + const int32_t n_top_k, \ cudaStream_t stream); -INSTANTIATE_TOP_1_KERNEL(float) -INSTANTIATE_TOP_1_KERNEL(__half) +INSTANTIATE_top_k_KERNEL(float) INSTANTIATE_top_k_KERNEL(__half) #ifdef BF16_AVAILABLE -INSTANTIATE_TOP_1_KERNEL(__nv_bfloat16) + INSTANTIATE_top_k_KERNEL(__nv_bfloat16) #endif diff --git a/deepspeed/inference/v2/model_implementations/__init__.py b/deepspeed/inference/v2/model_implementations/__init__.py index dae406271245..3483d9348c55 100644 --- a/deepspeed/inference/v2/model_implementations/__init__.py +++ b/deepspeed/inference/v2/model_implementations/__init__.py @@ -12,3 +12,10 @@ from .llama_v2 import * from .opt import * from .mistral import * +from .mixtral import * +from .falcon import * +from .phi import * +from .phi3 import * +from .qwen import * +from .qwen_v2 import * +from .qwen_v2_moe import * diff --git 
a/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py b/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py index ddb8996e03a3..17def1fa021f 100644 --- a/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py +++ b/deepspeed/inference/v2/model_implementations/common_parameters/mlp_parameters.py @@ -66,6 +66,24 @@ def finalize(self) -> torch.Tensor: return self.inference_model.transform_mlp_1_param(fused_param) +class FusedGatedMLPParameter(ParameterBase): + """ + Gated MLP projection container. + """ + + params: torch.Tensor + """ + Weight parameter for the fused gating and non-gating weight parameters. + """ + + def finalize(self) -> torch.Tensor: + gate_params = self.params[:self.params.shape[0] // 2] + up_params = self.params[self.params.shape[0] // 2:] + total_neurons = gate_params.shape[0] + up_params.shape[0] + fused_param = torch.cat([gate_params, up_params], dim=-1).reshape(total_neurons, -1) + return self.inference_model.transform_mlp_1_param(fused_param) + + class MLP2Parameter(ParameterBase): """ Second MLP projection weight container. This performs a straight pass-through to the diff --git a/deepspeed/inference/v2/model_implementations/common_parameters/moe_parameters.py b/deepspeed/inference/v2/model_implementations/common_parameters/moe_parameters.py index df5f1427a5cf..8ababf567ba9 100644 --- a/deepspeed/inference/v2/model_implementations/common_parameters/moe_parameters.py +++ b/deepspeed/inference/v2/model_implementations/common_parameters/moe_parameters.py @@ -33,7 +33,7 @@ class UnfusedMoEMLP1Parameter(ParameterBase): and need to be joined into a single group. """ - experts: ParamList("num_experts") # noqa: F821 + experts: ParamList("n_experts") # noqa: F821 def finalize(self) -> torch.Tensor: stacked_experts = torch.stack([p for p in self.experts], dim=0) @@ -46,7 +46,7 @@ class UnfusedMoEMLP2Parameter(ParameterBase): and need to be joined into a single group. """ - experts: ParamList("num_experts") # noqa: F821 + experts: ParamList("n_experts") # noqa: F821 def finalize(self) -> torch.Tensor: stacked_experts = torch.stack([p for p in self.experts], dim=0) @@ -57,13 +57,22 @@ class UnfusedMoEGatedMLPParameter(ParameterBase): """ MoE Parameter for a gated activation function in which the gating matrix is not fused in the same parameter as the non-gating matrix. + + This is a stacked version of the ``GatedMLPParameter``. Please see that class for more + documentation on the layout of the parameters. """ - gating_experts: ParamList("num_experts") # noqa: F821 + gating_experts: ParamList("n_experts") # noqa: F821 - up_experts: ParamList("num_experts") # noqa: F821 + up_experts: ParamList("n_experts") # noqa: F821 def finalize(self) -> torch.Tensor: - fused_params = [torch.cat([gate, weight], dim=0) for gate, weight in zip(self.gating_experts, self.up_experts)] - stacked_params = torch.stack(fused_params, dim=0) - return self.inference_model.transform_moe_mlp_2_param(stacked_params) + transposed_experts = [] + for gate, up in zip(self.gating_experts, self.up_experts): + assert gate.shape[0] == up.shape[0], "Gated MLP parameters must have the same number of neurons." 
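# Editor's note: illustrative sketch only, not part of the original change; shapes assume
# HF-style Linear weights of shape [out_features, in_features]. For gate and up of shape
# [n_neurons, model_dim], torch.cat([gate, up], dim=-1) yields [n_neurons, 2 * model_dim] and the
# reshape to [2 * n_neurons, model_dim] interleaves rows as [gate_0, up_0, gate_1, up_1, ...].
# For example, with n_neurons = 2 and model_dim = 3:
#   gate = [[g00, g01, g02], [g10, g11, g12]], up = [[u00, u01, u02], [u10, u11, u12]]
#   fused = [[g00, g01, g02], [u00, u01, u02], [g10, g11, g12], [u10, u11, u12]]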
+ total_neurons = gate.shape[0] + up.shape[0] + fused_expert = torch.cat([gate, up], dim=-1).reshape(total_neurons, -1) + transposed_experts.append(fused_expert) + + stacked_experts = torch.stack(transposed_experts, dim=0) + return self.inference_model.transform_moe_mlp_1_param(stacked_experts) diff --git a/deepspeed/inference/v2/model_implementations/falcon/__init__.py b/deepspeed/inference/v2/model_implementations/falcon/__init__.py new file mode 100644 index 000000000000..20f37538274c --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/falcon/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import FalconPolicy diff --git a/deepspeed/inference/v2/model_implementations/falcon/container.py b/deepspeed/inference/v2/model_implementations/falcon/container.py new file mode 100644 index 000000000000..caccfe1ecb00 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/falcon/container.py @@ -0,0 +1,129 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. + +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Falcon 7b model looks like this: + +FalconForCausalLM( + (transformer): FalconModel( + (word_embeddings): Embedding(65024, 4544) + (h): ModuleList( + (0-31): 32 x FalconDecoderLayer( + (self_attention): FalconAttention( + (maybe_rotary): FalconRotaryEmbedding() + (query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False) + (dense): FalconLinear(in_features=4544, out_features=4544, bias=False) + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (mlp): FalconMLP( + (dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False) + (act): GELU(approximate='none') + (dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False) + ) + (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True) + ) + ) + (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): Linear(in_features=4544, out_features=65024, bias=False) +) +''' + + +class FalconTransformerContainer(LayerContainer): + """ + Transformer layer container for the Falcon model. + """ + qkv_w: FusedQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: MLP1Parameter + mlp_2_w: MLP2Parameter + ln_attn_gamma: NormParameter + ln_attn_beta: NormParameter + + PARAM_MAPPING = { + "self_attention.query_key_value.weight": "qkv_w.params", + "self_attention.dense.weight": "attn_out_w.params", + "mlp.dense_h_to_4h.weight": "mlp_1_w.params", + "mlp.dense_4h_to_h.weight": "mlp_2_w.params", + "input_layernorm.weight": "ln_attn_gamma.params", + "input_layernorm.bias": "ln_attn_beta.params", + } + + +class FalconNonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Falcon model. 
+ """ + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm_gamma: NormParameter + final_norm_beta: NormParameter + + PARAM_MAPPING = { + "transformer.word_embeddings.weight": "word_emb.params", + "transformer.ln_f.weight": "final_norm_gamma.params", + "transformer.ln_f.bias": "final_norm_beta.params", + "lm_head.weight": "word_unembed.params", + } + + +''' + # HF Falcon 40b model looks like this: + + FalconForCausalLM( + (transformer): FalconModel( + (word_embeddings): Embedding(65024, 8192) + (h): ModuleList( + (0-59): 60 x FalconDecoderLayer( + (self_attention): FalconAttention( + (maybe_rotary): FalconRotaryEmbedding() + (query_key_value): FalconLinear(in_features=8192, out_features=9216, bias=False) + (dense): FalconLinear(in_features=8192, out_features=8192, bias=False) + (attention_dropout): Dropout(p=0.0, inplace=False) + ) + (mlp): FalconMLP( + (dense_h_to_4h): FalconLinear(in_features=8192, out_features=32768, bias=False) + (act): GELU(approximate='none') + (dense_4h_to_h): FalconLinear(in_features=32768, out_features=8192, bias=False) + ) + (ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True) + (ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True) + ) + ) + (ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): Linear(in_features=8192, out_features=65024, bias=False) +) +''' + + +class FalconNewArchTransformerContainer(LayerContainer): + """ + Transformer layer container for the Falcon model. + """ + qkv_w: GQAMegatronQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: MLP1Parameter + mlp_2_w: MLP2Parameter + ln_attn_gamma: NormParameter + ln_attn_beta: NormParameter + ln_mlp_gamma: NormParameter + ln_mlp_beta: NormParameter + + PARAM_MAPPING = { + "self_attention.query_key_value.weight": "qkv_w.params", + "self_attention.dense.weight": "attn_out_w.params", + "mlp.dense_h_to_4h.weight": "mlp_1_w.params", + "mlp.dense_4h_to_h.weight": "mlp_2_w.params", + "ln_attn.weight": "ln_attn_gamma.params", + "ln_attn.bias": "ln_attn_beta.params", + "ln_mlp.weight": "ln_mlp_gamma.params", + "ln_mlp.bias": "ln_mlp_beta.params", + } diff --git a/deepspeed/inference/v2/model_implementations/falcon/model.py b/deepspeed/inference/v2/model_implementations/falcon/model.py new file mode 100644 index 000000000000..b2830c80b562 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/falcon/model.py @@ -0,0 +1,213 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...ragged import RaggedBatchWrapper + +from .container import FalconNonTransformerContainer, FalconTransformerContainer + + +class FalconInferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[FalconNonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[FalconTransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. 
+ """ + """ + Properties inherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties inherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return 4 * self._config.hidden_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_kv_heads if (self._config.new_decoder_architecture + or not self._config.multi_query) else 1 + + @property + def activation_dtype(self) -> DtypeEnum: + if self._config.torch_dtype == torch.float16: + return DtypeEnum.fp16 + elif self._config.torch_dtype == torch.bfloat16: + return DtypeEnum.bf16 + else: + raise NotImplementedError("Only fp16 and bf16 are supported") + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.GELU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.LayerNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> RotateHalfConfig: + """ + The positional embedding configuration for the model. + """ + return RotateHalfConfig() + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. 
+ """ + assert self.config.parallel_attn, "Only parallel attention implementation is supported" + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + attn_ln_out = hidden_states + attn_hidden_state = self.qkv(attn_ln_out, cur_params.qkv_w, b=None) + attn_hidden_state = self.attn(attn_hidden_state, kv_cache, ragged_batch_info) + attention_output = self.attn_out(attn_hidden_state, cur_params.attn_out_w, b=None) + + if self.config.new_decoder_architecture: + residual, mlp_ln_out = self.norm(residual, + None, + gamma=cur_params.ln_mlp_gamma, + beta=cur_params.ln_mlp_beta) + else: + mlp_ln_out = hidden_states + + mlp_hidden_state = self.mlp_1(mlp_ln_out, cur_params.mlp_1_w, b=None) + mlp_output = self.mlp_2(mlp_hidden_state, cur_params.mlp_2_w, b=None) + + mlp_output.add_(attention_output) + + if self.tp_size > 1: + dist.all_reduce(mlp_output, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, mlp_output = self.norm(residual, + mlp_output, + next_params.ln_attn_gamma, + beta=next_params.ln_attn_beta) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(mlp_output) + + return residual, mlp_output + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. + """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm_gamma, + beta=self._non_transformer.final_norm_beta) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, + None, + gamma=self._transformer[0].ln_attn_gamma, + beta=self._transformer[0].ln_attn_beta) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/falcon/policy.py b/deepspeed/inference/v2/model_implementations/falcon/policy.py new file mode 100644 index 000000000000..c6612090a0df --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/falcon/policy.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import FalconNonTransformerContainer, FalconTransformerContainer +from .container import FalconNewArchTransformerContainer +from .model import FalconInferenceModel + + +class FalconPolicy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> FalconInferenceModel: + return FalconInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + trans_container_cls = FalconNewArchTransformerContainer if self._model_config.new_decoder_architecture else FalconTransformerContainer + transformer_containers = [trans_container_cls(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['transformer.h'], transformer_containers) + + map.set_non_transformer_params(FalconNonTransformerContainer(self.model)) + + map.set_unmapped_params( + [f'model.layers.{i}.self_attn.rotary_emb.inv_freq' for i in range(self.model.num_layers)]) + + return map diff --git a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py index f9da7ac5d23e..c5e02adaffc4 100644 --- a/deepspeed/inference/v2/model_implementations/flat_model_helpers.py +++ b/deepspeed/inference/v2/model_implementations/flat_model_helpers.py @@ -27,9 +27,9 @@ class TensorMetadata(DeepSpeedConfigModel): """ A class to represent a tensor specification. """ - dtype: Optional[str] - shape: Optional[Tuple[int, ...]] - strides: Optional[Tuple[int, ...]] + dtype: Optional[str] = None + shape: Optional[Tuple[int, ...]] = None + strides: Optional[Tuple[int, ...]] = None offset: int @@ -37,7 +37,7 @@ class ParameterMetadata(DeepSpeedConfigModel): """ A class to represent a parameter specification. 
""" - core_param: TensorMetadata = None + core_param: Optional[TensorMetadata] = None aux_params: Dict[str, TensorMetadata] = {} @@ -164,7 +164,7 @@ def process_layer(layer_container: LayerContainer, l_name: str, cur_offset: int) strides=tensor.stride(), offset=cur_offset) - cur_offset += pad_to_aligned_offset(elem_size(param.dtype) * param.numel()) + cur_offset += pad_to_aligned_offset(elem_size(tensor.dtype) * tensor.numel()) layer_metadata.params[p_name] = param_metadata diff --git a/deepspeed/inference/v2/model_implementations/inference_model_base.py b/deepspeed/inference/v2/model_implementations/inference_model_base.py index a7ff699e9058..894a4137407e 100644 --- a/deepspeed/inference/v2/model_implementations/inference_model_base.py +++ b/deepspeed/inference/v2/model_implementations/inference_model_base.py @@ -199,6 +199,10 @@ def get_kv_requirements(self, sequence: DSSequenceDescriptor, max_new_tokens: in """ raise NotImplementedError() + @abstractmethod + def get_remaining_block_capacity(self, sequence: DSSequenceDescriptor) -> int: + raise NotImplementedError() + @abstractmethod def maybe_allocate_kv(self, sequence: DSSequenceDescriptor, n_new_tokens: int) -> None: """ diff --git a/deepspeed/inference/v2/model_implementations/inference_policy_base.py b/deepspeed/inference/v2/model_implementations/inference_policy_base.py index d5a326c03599..2f4266a8cb88 100644 --- a/deepspeed/inference/v2/model_implementations/inference_policy_base.py +++ b/deepspeed/inference/v2/model_implementations/inference_policy_base.py @@ -205,7 +205,7 @@ def populate_model_parameters(self) -> None: buffer_path = make_param_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size) metadata_path = make_metadata_filename(self._inf_checkpoint_path, self.model.tp_rank, self.model.tp_size) - buffer = torch.load(buffer_path) + buffer = torch.load(buffer_path, weights_only=False) metadata = json.load(open(metadata_path, "r")) metadata = ModelMetadata.parse_raw(metadata) diff --git a/deepspeed/inference/v2/model_implementations/inference_transformer_base.py b/deepspeed/inference/v2/model_implementations/inference_transformer_base.py index 8f6a0b7fa688..fae67dc8fc2a 100644 --- a/deepspeed/inference/v2/model_implementations/inference_transformer_base.py +++ b/deepspeed/inference/v2/model_implementations/inference_transformer_base.py @@ -22,6 +22,7 @@ DSUnembedConfig, NormTypeEnum, PositionalEmbeddingType, + RotateHalfConfig, ) from ..modules import heuristics from ..ragged import ( @@ -152,6 +153,14 @@ def norm_type(self) -> NormTypeEnum: """ ... + @property + @abstractmethod + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + """ + The positional embedding configuration for the model. + """ + ... + """ Derived helpers """ @@ -319,12 +328,13 @@ def make_attn_layer(self) -> None: scale_factor=softmax_scale, input_dtype=self.activation_dtype, output_dtype=self.activation_dtype, - positional_embedding_type=self.positional_embedding_type) + positional_embedding_type=self.positional_embedding_type, + positional_embedding_config=self.positional_embedding_config) self.attn = heuristics.instantiate_attention(attn_config, self._engine_config) def get_kv_requirements(self, sequence: DSSequenceDescriptor, max_new_tokens: int, - max_new_blocks: int) -> Tuple[int, torch.Tensor]: + max_new_blocks: int) -> Tuple[int, int]: """ See ``DSInferenceModelBase.get_kv_requirements`` for documentation. 
@@ -341,7 +351,10 @@ def get_kv_requirements(self, sequence: DSSequenceDescriptor, max_new_tokens: in token_capacity = (max_new_blocks + sequence.cur_allocated_blocks) * self.attn.kv_block_size - sequence.seen_tokens - return token_capacity, torch.tensor([max_new_blocks]) + return token_capacity, max_new_blocks + + def get_remaining_block_capacity(self, sequence: DSSequenceDescriptor) -> int: + return sequence.seen_tokens % self.attn.kv_block_size def maybe_allocate_kv(self, sequence: DSSequenceDescriptor, n_new_tokens: int) -> None: """ @@ -350,7 +363,8 @@ def maybe_allocate_kv(self, sequence: DSSequenceDescriptor, n_new_tokens: int) - This method assumes an autoregressive dense attention pattern. Override this method if this does not match the model's attention pattern. """ - _, n_needed_blocks = self.get_kv_requirements(sequence, n_new_tokens, self.state_manager.free_blocks) + free_block = self.state_manager.free_blocks[0] + _, n_needed_blocks = self.get_kv_requirements(sequence, n_new_tokens, free_block) if n_needed_blocks > 0: new_blocks = self.state_manager.allocate_blocks(n_needed_blocks) @@ -521,12 +535,26 @@ def transform_norm_param(self, param: torch.Tensor) -> InferenceParameter: class DSMoETransformerModelBase(DSTransformerModelBase): @property - def num_experts(self) -> int: + def n_experts(self) -> int: """ Return the number of experts in the model. """ raise NotImplementedError("Attempted to access an unimplemented number of experts") + @property + def n_top_k(self) -> int: + """ + Number of experts per token. + """ + raise NotImplementedError("Attempted to access an unimplemented number of experts per token") + + @property + def normalize_expert_scores(self) -> bool: + """ + Whether to normalize expert scores. If true, sum(expert_scores) = 1. + """ + raise NotImplementedError("Attempted to access an unimplemented normalization flag") + def make_moe_layer(self) -> None: """ Instantiates the MoE layer for the model. This sets the `self.moe` attribute. @@ -538,9 +566,11 @@ def make_moe_layer(self) -> None: model_dim=self.model_dim, intermediate_features=sharded_dim, activation=self.mlp_activation_fn, - n_experts=self.num_experts, + n_experts=self.n_experts, + top_k=self.n_top_k, input_dtype=self.activation_dtype, output_dtype=self.activation_dtype, + normalize_scores=self.normalize_expert_scores, ) self.moe = heuristics.instantiate_moe(moe_config, self._engine_config) diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py index f26c87556665..feb65b4a5f5d 100644 --- a/deepspeed/inference/v2/model_implementations/layer_container_base.py +++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py @@ -14,7 +14,7 @@ # Currently have dependency loops for the type hints. InferenceModel = Type["InferenceModel"] -LayerContainer = Type["LayerContainer"] +LayerContainer = Type["LayerContainer"] # noqa: F811 MAPPING_KEY = "PARAM_MAPPING" PLIST_HELPERS = "_ds_plist_strip_vals" @@ -161,7 +161,7 @@ def __call__(cls, *args, **kwargs): return instance -class LayerContainer(metaclass=LayerMetaclass): +class LayerContainer(metaclass=LayerMetaclass): # noqa: F811 """ Abstract base class for containing model parameters. 
diff --git a/deepspeed/inference/v2/model_implementations/llama_v2/__init__.py b/deepspeed/inference/v2/model_implementations/llama_v2/__init__.py index 5d2b5ae562ee..79605a76a4c2 100644 --- a/deepspeed/inference/v2/model_implementations/llama_v2/__init__.py +++ b/deepspeed/inference/v2/model_implementations/llama_v2/__init__.py @@ -3,4 +3,4 @@ # DeepSpeed Team -from .llama_v2_policy import Llama2Policy +from .policy import Llama2Policy diff --git a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_containers.py b/deepspeed/inference/v2/model_implementations/llama_v2/container.py similarity index 95% rename from deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_containers.py rename to deepspeed/inference/v2/model_implementations/llama_v2/container.py index e9c473ce512b..9de9bdb34574 100644 --- a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_containers.py +++ b/deepspeed/inference/v2/model_implementations/llama_v2/container.py @@ -5,8 +5,8 @@ # Create a container object to save model-specific tensors using the policy file above. -from ...model_implementations.common_parameters import * -from ...model_implementations.layer_container_base import LayerContainer +from ..common_parameters import * +from ..layer_container_base import LayerContainer ''' # HF Llama model looks like this: diff --git a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_model.py b/deepspeed/inference/v2/model_implementations/llama_v2/model.py similarity index 92% rename from deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_model.py rename to deepspeed/inference/v2/model_implementations/llama_v2/model.py index 9b628f77de01..a0c81f4d749e 100644 --- a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_model.py +++ b/deepspeed/inference/v2/model_implementations/llama_v2/model.py @@ -11,12 +11,12 @@ from ...allocator import empty_from from ...inference_utils import ActivationType, DtypeEnum -from ...model_implementations import * +from .. import * from ...modules.configs import * from ...modules.interfaces import * from ...ragged import RaggedBatchWrapper -from .llama_v2_containers import Llama2NonTransformerContainer, Llama2TransformerContainer +from .container import Llama2NonTransformerContainer, Llama2TransformerContainer class Llama2InferenceModel(DSTransformerModelBase): @@ -105,6 +105,10 @@ def norm_type(self) -> NormTypeEnum: def positional_embedding_type(self) -> PositionalEmbeddingType: return PositionalEmbeddingType.rotate_half + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + """ Forward implementations """ @@ -145,8 +149,7 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid kv_cache = self.state_manager.get_cache(layer_idx) hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=None) - hidden_states = self.attn(hidden_states, kv_cache, - ragged_batch_info) #, inv_freqs=None) #cur_params.rotary_emb) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) if self.tp_size > 1: @@ -176,8 +179,10 @@ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: Ragge Performs unembedding of the hidden states to logits. This will only sample the final token of each sequence. 
""" - logits = self.unembed(hidden_states, self._non_transformer.word_unembed, ragged_batch_info, - self._non_transformer.final_norm) + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) if self.tp_size > 1: comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) diff --git a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_policy.py b/deepspeed/inference/v2/model_implementations/llama_v2/policy.py similarity index 76% rename from deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_policy.py rename to deepspeed/inference/v2/model_implementations/llama_v2/policy.py index c8253be79fad..bb13ab6d5bf4 100644 --- a/deepspeed/inference/v2/model_implementations/llama_v2/llama_v2_policy.py +++ b/deepspeed/inference/v2/model_implementations/llama_v2/policy.py @@ -6,9 +6,9 @@ from typing import Any from ...config_v2 import RaggedInferenceEngineConfig -from ...model_implementations.inference_policy_base import ContainerMap, InferenceV2Policy -from ...model_implementations.llama_v2.llama_v2_containers import Llama2NonTransformerContainer, Llama2TransformerContainer -from ...model_implementations.llama_v2.llama_v2_model import Llama2InferenceModel +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import Llama2NonTransformerContainer, Llama2TransformerContainer +from .model import Llama2InferenceModel class Llama2Policy(InferenceV2Policy): diff --git a/deepspeed/inference/v2/model_implementations/mistral/model.py b/deepspeed/inference/v2/model_implementations/mistral/model.py index d9b06b91e308..318d362f1a64 100644 --- a/deepspeed/inference/v2/model_implementations/mistral/model.py +++ b/deepspeed/inference/v2/model_implementations/mistral/model.py @@ -104,6 +104,10 @@ def norm_type(self) -> NormTypeEnum: def positional_embedding_type(self) -> PositionalEmbeddingType: return PositionalEmbeddingType.rotate_half + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + """ Forward implementations """ @@ -144,8 +148,7 @@ def _forward_transformer(self, layer_idx: int, residual: torch.Tensor, hidden_st kv_cache = self.state_manager.get_cache(layer_idx) hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=None) - hidden_states = self.attn(hidden_states, kv_cache, - ragged_batch_info) #, inv_freqs=None) #cur_params.rotary_emb) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) if self.tp_size > 1: @@ -175,8 +178,10 @@ def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: Ragge Performs unembedding of the hidden states to logits. This will only sample the final token of each sequence. 
""" - logits = self.unembed(hidden_states, self._non_transformer.word_unembed, ragged_batch_info, - self._non_transformer.final_norm) + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) if self.tp_size > 1: comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) diff --git a/deepspeed/inference/v2/model_implementations/mistral/policy.py b/deepspeed/inference/v2/model_implementations/mistral/policy.py index f6d0a0fe5987..b67ec311c952 100644 --- a/deepspeed/inference/v2/model_implementations/mistral/policy.py +++ b/deepspeed/inference/v2/model_implementations/mistral/policy.py @@ -5,10 +5,10 @@ from typing import Any -from deepspeed.inference.v2.config_v2 import RaggedInferenceEngineConfig -from deepspeed.inference.v2.model_implementations.inference_policy_base import ContainerMap, InferenceV2Policy -from deepspeed.inference.v2.model_implementations.mistral.container import MistralNonTransformerContainer, MistralTransformerContainer -from deepspeed.inference.v2.model_implementations.mistral.model import MistralInferenceModel +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import MistralNonTransformerContainer, MistralTransformerContainer +from .model import MistralInferenceModel class MistralPolicy(InferenceV2Policy): diff --git a/deepspeed/inference/v2/model_implementations/mixtral/__init__.py b/deepspeed/inference/v2/model_implementations/mixtral/__init__.py new file mode 100644 index 000000000000..2cb1aa889291 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/mixtral/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import MixtralPolicy diff --git a/deepspeed/inference/v2/model_implementations/mixtral/container.py b/deepspeed/inference/v2/model_implementations/mixtral/container.py new file mode 100644 index 000000000000..6ec4a0552b8f --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/mixtral/container.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. 
+ +from deepspeed.inference.v2.model_implementations.common_parameters import * +from deepspeed.inference.v2.model_implementations.layer_container_base import LayerContainer + + +class MixtralTransformerContainer(LayerContainer): + + qkv_w: UnfusedQKVParameter + attn_out_w: AttentionOutputParameter + moe_gate: MoEGatingWeightParameter + moe_mlp_1: UnfusedMoEGatedMLPParameter + moe_mlp_2: UnfusedMoEMLP2Parameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "input_layernorm.weight": "attn_norm_gamma.params", + "post_attention_layernorm.weight": "mlp_norm_gamma.params", + "self_attn.q_proj.weight": "qkv_w.q_params", + "self_attn.k_proj.weight": "qkv_w.k_params", + "self_attn.v_proj.weight": "qkv_w.v_params", + "self_attn.o_proj.weight": "attn_out_w.params", + "block_sparse_moe.gate.weight": "moe_gate.params", + "block_sparse_moe.experts.*.w1.weight": "moe_mlp_1.gating_experts", + "block_sparse_moe.experts.*.w3.weight": "moe_mlp_1.up_experts", + "block_sparse_moe.experts.*.w2.weight": "moe_mlp_2.experts", + } + + +class MixtralNonTransformerContainer(LayerContainer): + + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "lm_head.weight": "word_unembed.params", + "model.norm.weight": "final_norm.params", + } diff --git a/deepspeed/inference/v2/model_implementations/mixtral/model.py b/deepspeed/inference/v2/model_implementations/mixtral/model.py new file mode 100644 index 000000000000..878cd8e31cec --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/mixtral/model.py @@ -0,0 +1,261 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...config_v2 import RaggedInferenceEngineConfig +from ...inference_utils import ActivationType, DtypeEnum +from ...model_implementations import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...ragged import RaggedBatchWrapper +from ..inference_model_base import ( + DSModelImplementationConfig, + MPType, +) + +from .container import MixtralNonTransformerContainer, MixtralTransformerContainer + + +class MixtralInferenceModel(DSMoETransformerModelBase): + """ + Inference model implementation for Mixtral models. + """ + + _non_transformer: Optional[MixtralNonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[MixtralTransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. 
+ """ + """ + Properties ineherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_position_embeddings + + """ + Properties ineherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + if self._config.torch_dtype == torch.float16: + return DtypeEnum.fp16 + elif self._config.torch_dtype == torch.bfloat16: + return DtypeEnum.bf16 + else: + raise NotImplementedError("Only fp16 and bf16 are supported") + + @property + def mlp_activation_fn(self) -> ActivationType: + activation = self._config.hidden_act.lower() + if activation == "gelu": + return ActivationType.GEGLU + elif activation == "relu": + return ActivationType.ReGLU + elif activation == "gegelu": + return ActivationType.GEGLU + elif activation == "silu": + return ActivationType.SiGLU + else: + raise NotImplementedError(f"Activation {activation} not supported") + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + """ + The positional embedding configuration for the model. + """ + return RotateHalfConfig(theta_base=self._config.rope_theta) + + """ + Inherited from `DSMoETransformerModelBase` + """ + + @property + def n_experts(self) -> int: + return self._config.num_local_experts + + @property + def n_top_k(self) -> int: + return self._config.num_experts_per_tok + + @property + def normalize_expert_scores(self) -> bool: + return True + + """ + Model implementation + """ + + def __init__(self, config: DSModelImplementationConfig, engine_config: RaggedInferenceEngineConfig, + base_mp_group: MPType) -> None: + """ + Base implementation for initialization. By default, this will initialize + the traditional components of a transformer model: + - Embedding + - QKV projection + - Self attention + - Attention output projection + - Feed forward network + - Normalization + - Unembedding + + Arguments: + config (DSModelImplementationConfig): Model-specific configuration. No assumptions + should be made about this config that are not closely tied to the specific + model implementation. + engine_config (RaggedInferenceEngineConfig): Engine configuration. + base_mp_group (MPType): Base communication group for Tensor-parallel inference. + """ + super().__init__(config, engine_config, base_mp_group) + + self.make_norm_layer() + self.make_qkv_layer() + self.make_attn_layer() + self.make_attn_out_layer() + self.make_moe_layer() + self.make_embedding_layer() + self.make_unembedding_layer() + self._kv_cache_config = None + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. 
+ + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peek-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. These are the + hidden states after pre-normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. + """ + # TODO(cmikeh2): Distribute ragged_batch_info to all modules + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma) + + hidden_states = self.moe(hidden_states, ragged_batch_info, cur_params.moe_gate, cur_params.moe_mlp_1, + cur_params.moe_mlp_2) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence.
+ """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer(layer_idx, residual, hidden_states, wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/mixtral/policy.py b/deepspeed/inference/v2/model_implementations/mixtral/policy.py new file mode 100644 index 000000000000..2f0087919720 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/mixtral/policy.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import MixtralTransformerContainer, MixtralNonTransformerContainer +from .model import MixtralInferenceModel + + +class MixtralPolicy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> MixtralInferenceModel: + return MixtralInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + + map = ContainerMap() + + transformer_containers = [MixtralTransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(MixtralNonTransformerContainer(self.model)) + + map.set_unmapped_params([]) + + return map diff --git a/deepspeed/inference/v2/model_implementations/opt/container.py b/deepspeed/inference/v2/model_implementations/opt/container.py index 5ddbbde3f141..e97599ef8e50 100644 --- a/deepspeed/inference/v2/model_implementations/opt/container.py +++ b/deepspeed/inference/v2/model_implementations/opt/container.py @@ -5,8 +5,8 @@ # Create a container object to save model-specific tensors using the policy file above. 
-from ...model_implementations.common_parameters import * -from ...model_implementations.layer_container_base import LayerContainer +from ..common_parameters import * +from ..layer_container_base import LayerContainer ''' # HF OPT model looks like this: diff --git a/deepspeed/inference/v2/model_implementations/opt/model.py b/deepspeed/inference/v2/model_implementations/opt/model.py index fa221e15a0b7..adf011d8f1a7 100644 --- a/deepspeed/inference/v2/model_implementations/opt/model.py +++ b/deepspeed/inference/v2/model_implementations/opt/model.py @@ -12,11 +12,7 @@ from ...allocator import empty_from from ...inference_utils import ActivationType, DtypeEnum from ...model_implementations import * -from ...modules.configs import ( - DSEmbeddingsConfig, - NormTypeEnum, - PositionalEmbeddingType, -) +from ...modules.configs import * from ...ragged import RaggedBatchWrapper from .container import OPTNonTransformerContainer, OPTTransformerContainer @@ -94,6 +90,10 @@ def norm_type(self) -> NormTypeEnum: def positional_embedding_type(self) -> PositionalEmbeddingType: return PositionalEmbeddingType.none + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return None + """ Overrides of ``DSTransformerModelBase`` methods """ @@ -131,8 +131,7 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid kv_cache = self.state_manager.get_cache(layer_idx) hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b) - hidden_states = self.attn(hidden_states, kv_cache, - ragged_batch_info) #, inv_freqs=None) #cur_params.rotary_emb) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=cur_params.attn_out_b) if self.tp_size > 1: @@ -164,8 +163,11 @@ def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hid return residual, hidden_states def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: - logits = self.unembed(hidden_states, self._non_transformer.word_unembed, ragged_batch_info, - self._non_transformer.final_norm_w, self._non_transformer.final_norm_b) + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm_w, + beta=self._non_transformer.final_norm_b) if self.tp_size > 1: comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) diff --git a/deepspeed/inference/v2/model_implementations/opt/policy.py b/deepspeed/inference/v2/model_implementations/opt/policy.py index af5750260ead..d57d5beb48d5 100644 --- a/deepspeed/inference/v2/model_implementations/opt/policy.py +++ b/deepspeed/inference/v2/model_implementations/opt/policy.py @@ -6,9 +6,9 @@ from typing import Any from ...config_v2 import RaggedInferenceEngineConfig -from ...model_implementations.inference_policy_base import ContainerMap, InferenceV2Policy -from ...model_implementations.opt.container import OPTNonTransformerContainer, OPTTransformerContainer -from ...model_implementations.opt.model import OPTInferenceModel +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import OPTNonTransformerContainer, OPTTransformerContainer +from .model import OPTInferenceModel class OPTPolicy(InferenceV2Policy): diff --git a/deepspeed/inference/v2/model_implementations/phi/__init__.py b/deepspeed/inference/v2/model_implementations/phi/__init__.py new file mode 100644 index 
000000000000..3ab107e75a91 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import PhiPolicy diff --git a/deepspeed/inference/v2/model_implementations/phi/containers.py b/deepspeed/inference/v2/model_implementations/phi/containers.py new file mode 100644 index 000000000000..21f07eb8c99a --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi/containers.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. + +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Phi-2 model looks like this: + +PhiForCausalLM( + (model): PhiModel( + (embed_tokens): Embedding(51200, 2560) + (embed_dropout): Dropout(p=0.0, inplace=False) + (layers): ModuleList( + (0-31): 32 x PhiDecoderLayer( + (self_attn): PhiAttention( + (q_proj): Linear(in_features=2560, out_features=2560, bias=True) + (k_proj): Linear(in_features=2560, out_features=2560, bias=True) + (v_proj): Linear(in_features=2560, out_features=2560, bias=True) + (dense): Linear(in_features=2560, out_features=2560, bias=True) + (rotary_emb): PhiRotaryEmbedding() + ) + (mlp): PhiMLP( + (activation_fn): NewGELUActivation() + (fc1): Linear(in_features=2560, out_features=10240, bias=True) + (fc2): Linear(in_features=10240, out_features=2560, bias=True) + ) + (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True) + (resid_dropout): Dropout(p=0.1, inplace=False) + ) + ) + (final_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True) + ) + (lm_head): Linear(in_features=2560, out_features=51200, bias=True) +) +''' + + +class PhiTransformerContainer(LayerContainer): + """ + Transformer layer container for the Phi model. + """ + qkv_w: UnfusedQKVParameter + qkv_b: UnfusedQKVParameter + attn_out_w: AttentionOutputParameter + attn_out_b: AttentionOutputParameter + mlp_1_w: MLP1Parameter + mlp_1_b: MLP1Parameter + mlp_2_w: MLP2Parameter + mlp_2_b: MLP2Parameter + ln_gamma: NormParameter + ln_beta: NormParameter + + PARAM_MAPPING = { + "self_attn.q_proj.weight": "qkv_w.q_params", + "self_attn.k_proj.weight": "qkv_w.k_params", + "self_attn.v_proj.weight": "qkv_w.v_params", + "self_attn.q_proj.bias": "qkv_b.q_params", + "self_attn.k_proj.bias": "qkv_b.k_params", + "self_attn.v_proj.bias": "qkv_b.v_params", + "self_attn.dense.weight": "attn_out_w.params", + "self_attn.dense.bias": "attn_out_b.params", + "mlp.fc1.weight": "mlp_1_w.params", + "mlp.fc1.bias": "mlp_1_b.params", + "mlp.fc2.weight": "mlp_2_w.params", + "mlp.fc2.bias": "mlp_2_b.params", + "input_layernorm.weight": "ln_gamma.params", + "input_layernorm.bias": "ln_beta.params", + } + + +class PhiNonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Phi model. 
+ """ + word_emb: EmbeddingParameter + word_unembed_w: UnembedParameter + word_unembed_b: UnembedParameter + final_norm_gamma: NormParameter + final_norm_beta: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "model.final_layernorm.weight": "final_norm_gamma.params", + "model.final_layernorm.bias": "final_norm_beta.params", + "lm_head.weight": "word_unembed_w.params", + "lm_head.bias": "word_unembed_b.params", + } diff --git a/deepspeed/inference/v2/model_implementations/phi/model.py b/deepspeed/inference/v2/model_implementations/phi/model.py new file mode 100644 index 000000000000..2d5826810cb5 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi/model.py @@ -0,0 +1,199 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...ragged import RaggedBatchWrapper + +from .containers import PhiNonTransformerContainer, PhiTransformerContainer + + +class PhiInferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[PhiNonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[PhiTransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. + """ + """ + Properties inherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties inherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + if self._config.torch_dtype == torch.float16: + return DtypeEnum.fp16 + elif self._config.torch_dtype == torch.bfloat16: + return DtypeEnum.bf16 + else: + raise NotImplementedError("Only fp16 and bf16 are supported") + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.GELU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.LayerNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + rotary_dim = int(self._config.partial_rotary_factor * self.head_size) + return RotateHalfConfig(rotate_dim=rotary_dim, theta_base=self._config.rope_theta) + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. 
+ + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. + """ + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + attn_ln_out = hidden_states + attn_hidden_state = self.qkv(attn_ln_out, cur_params.qkv_w, b=cur_params.qkv_b) + attn_hidden_state = self.attn(attn_hidden_state, kv_cache, ragged_batch_info) + attention_output = self.attn_out(attn_hidden_state, cur_params.attn_out_w, b=cur_params.attn_out_b) + + mlp_ln_out = hidden_states + mlp_hidden_state = self.mlp_1(mlp_ln_out, cur_params.mlp_1_w, b=cur_params.mlp_1_b) + mlp_output = self.mlp_2(mlp_hidden_state, cur_params.mlp_2_w, b=cur_params.mlp_2_b) + + mlp_output.add_(attention_output) + + if self.tp_size > 1: + dist.all_reduce(mlp_output, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, mlp_output = self.norm(residual, mlp_output, next_params.ln_gamma, beta=next_params.ln_beta) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(mlp_output) + + return residual, mlp_output + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. 
+ """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed_w, + ragged_batch_info, + bias=self._non_transformer.word_unembed_b, + gamma=self._non_transformer.final_norm_gamma, + beta=self._non_transformer.final_norm_beta) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, + None, + gamma=self._transformer[0].ln_gamma, + beta=self._transformer[0].ln_beta) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/phi/policy.py b/deepspeed/inference/v2/model_implementations/phi/policy.py new file mode 100644 index 000000000000..4b081a8e61bd --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi/policy.py @@ -0,0 +1,32 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .containers import PhiNonTransformerContainer, PhiTransformerContainer +from .model import PhiInferenceModel + + +class PhiPolicy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> PhiInferenceModel: + return PhiInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + trans_container_cls = PhiTransformerContainer + transformer_containers = [trans_container_cls(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(PhiNonTransformerContainer(self.model)) + + map.set_unmapped_params( + [f'model.layers.{i}.self_attn.rotary_emb.inv_freq' for i in range(self.model.num_layers)]) + + return map diff --git a/deepspeed/inference/v2/model_implementations/phi3/__init__.py b/deepspeed/inference/v2/model_implementations/phi3/__init__.py new file mode 100644 index 000000000000..1a4b756d210c --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi3/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import Phi3Policy diff --git a/deepspeed/inference/v2/model_implementations/phi3/containers.py b/deepspeed/inference/v2/model_implementations/phi3/containers.py new file mode 100644 index 000000000000..1cb52a75ae0b --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi3/containers.py @@ -0,0 +1,75 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. 
+ +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Phi-3 model looks like this: + +Phi3ForCausalLM( + (model): Phi3Model( + (embed_tokens): Embedding(32064, 3072) + (embed_dropout): Dropout(p=0.0, inplace=False) + (layers): ModuleList( + (0-31): 32 x Phi3DecoderLayer( + (self_attn): Phi3Attention( + (o_proj): Linear(in_features=3072, out_features=3072, bias=False) + (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False) + (rotary_emb): Phi3RotaryEmbedding() + ) + (mlp): PhiMLP( + (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False) + (down_proj): Linear(in_features=16384, out_features=3072, bias=False) + (activation_fn): SiLU() + ) + (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05) + (resid_attn_dropout): Dropout(p=0.0) + (resid_mlp_dropout): Dropout(p=0.0) + (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05) + ) + ) + (final_layernorm): Phi3RMSNorm((3072,), eps=1e-05) + ) + (lm_head): Linear(in_features=3072, out_features=32064, bias=False) +) +''' + + +class Phi3TransformerContainer(LayerContainer): + """ + Transformer layer container for the Phi model. + """ + qkv_w: FusedQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: FusedGatedMLPParameter + mlp_2_w: MLP2Parameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "self_attn.qkv_proj.weight": "qkv_w.params", + "self_attn.o_proj.weight": "attn_out_w.params", + "mlp.gate_up_proj.weight": "mlp_1_w.params", + "mlp.down_proj.weight": "mlp_2_w.params", + "input_layernorm.weight": "attn_norm_gamma.params", + "post_attention_layernorm.weight": "mlp_norm_gamma.params", + } + + +class Phi3NonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Phi model. + """ + word_emb: EmbeddingParameter + word_unembed_w: UnembedParameter + final_norm_gamma: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "model.norm.weight": "final_norm_gamma.params", + "lm_head.weight": "word_unembed_w.params", + } diff --git a/deepspeed/inference/v2/model_implementations/phi3/model.py b/deepspeed/inference/v2/model_implementations/phi3/model.py new file mode 100644 index 000000000000..507bb4fc9af1 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi3/model.py @@ -0,0 +1,204 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...ragged import RaggedBatchWrapper + +from .containers import Phi3NonTransformerContainer, Phi3TransformerContainer + + +class Phi3InferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[Phi3NonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[Phi3TransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. 
+ """ + """ + Properties inherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties inherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + if self._config.torch_dtype == torch.float16: + return DtypeEnum.fp16 + elif self._config.torch_dtype == torch.bfloat16: + return DtypeEnum.bf16 + else: + raise NotImplementedError("Only fp16 and bf16 are supported") + + @property + def mlp_activation_fn(self) -> ActivationType: + activation = self._config.hidden_act.lower() + if activation == "gelu": + return ActivationType.GEGLU + elif activation == "relu": + return ActivationType.ReGLU + elif activation == "gegelu": + return ActivationType.GEGLU + elif activation == "silu": + return ActivationType.SiGLU + else: + raise NotImplementedError(f"Activation {activation} not supported") + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. 
+ """ + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=None) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None) + + hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None) + hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. + """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed_w, + ragged_batch_info, + gamma=self._non_transformer.final_norm_gamma) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, gamma=self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/phi3/policy.py b/deepspeed/inference/v2/model_implementations/phi3/policy.py new file mode 100644 index 000000000000..a1b445929053 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/phi3/policy.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .containers import Phi3NonTransformerContainer, Phi3TransformerContainer +from .model import Phi3InferenceModel + + +class Phi3Policy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Phi3InferenceModel: + return Phi3InferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + transformer_containers = [Phi3TransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(Phi3NonTransformerContainer(self.model)) + + map.set_unmapped_params([]) + + return map diff --git a/deepspeed/inference/v2/model_implementations/qwen/__init__.py b/deepspeed/inference/v2/model_implementations/qwen/__init__.py new file mode 100644 index 000000000000..18206048fa29 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import QwenPolicy diff --git a/deepspeed/inference/v2/model_implementations/qwen/container.py b/deepspeed/inference/v2/model_implementations/qwen/container.py new file mode 100644 index 000000000000..313de68555b9 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen/container.py @@ -0,0 +1,77 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. + +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Qwen model looks like this: + +QWenLMHeadModel( + (transformer): QWenModel( + (wte): Embedding(151936, 4096) + (drop): Dropout(p=0.0, inplace=False) + (rotary_emb): RotaryEmbedding() + (h): ModuleList( + (0-31): 32 x QWenBlock( + (ln_1): RMSNorm() + (attn): QWenAttention( + (c_attn): Linear(in_features=4096, out_features=12288, bias=True) + (c_proj): Linear(in_features=4096, out_features=4096, bias=False) + (attn_dropout): Dropout(p=0.0, inplace=False) + ) + (ln_2): RMSNorm() + (mlp): QWenMLP( + (w1): Linear(in_features=4096, out_features=11008, bias=False) + (w2): Linear(in_features=4096, out_features=11008, bias=False) + (c_proj): Linear(in_features=11008, out_features=4096, bias=False) + ) + ) + ) + (ln_f): RMSNorm() + ) + (lm_head): Linear(in_features=4096, out_features=151936, bias=False) +) +''' + + +class QwenTransformerContainer(LayerContainer): + """ + Transformer layer container for the Qwen model. 
+ """ + qkv_w: FusedQKVParameter + qkv_b: FusedQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: GatedMLPParameter + mlp_2_w: MLP2Parameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "attn.c_attn.weight": "qkv_w.params", + "attn.c_attn.bias": "qkv_b.params", + "attn.c_proj.weight": "attn_out_w.params", + "mlp.w1.weight": "mlp_1_w.up_params", + "mlp.w2.weight": "mlp_1_w.gate_params", + "mlp.c_proj.weight": "mlp_2_w.params", + "ln_1.weight": "attn_norm_gamma.params", + "ln_2.weight": "mlp_norm_gamma.params", + } + + +class QwenNonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Qwen model. + """ + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm: NormParameter + + PARAM_MAPPING = { + "transformer.wte.weight": "word_emb.params", + "transformer.ln_f.weight": "final_norm.params", + "lm_head.weight": "word_unembed.params", + } diff --git a/deepspeed/inference/v2/model_implementations/qwen/model.py b/deepspeed/inference/v2/model_implementations/qwen/model.py new file mode 100644 index 000000000000..e867e4be6713 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen/model.py @@ -0,0 +1,223 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...modules import heuristics +from ...ragged import RaggedBatchWrapper + +from .container import QwenNonTransformerContainer, QwenTransformerContainer + + +class QwenInferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[QwenNonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[QwenTransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. 
+ """ + """ + Properties ineherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties ineherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size // 2 + + @property + def n_heads_kv(self) -> int: + return self._config.hidden_size // self._config.kv_channels + + @property + def activation_dtype(self) -> DtypeEnum: + autoset_precision = self._config.bf16 + self._config.fp16 == 0 + if autoset_precision: + return DtypeEnum.fp16 + if self._config.fp16: + return DtypeEnum.fp16 + elif self._config.bf16: + # TODO(ZonePG): bf16 inference results may be different from huggingface bf16, + # because in rms_norm, Qwen still use float() instead of bf16 + return DtypeEnum.bf16 + else: + raise NotImplementedError("Only fp16 and bf16 are supported") + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.SiGLU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rotary_emb_base) + + def make_norm_layer(self) -> None: + """ + Instantiates the normalization layer for the model. This sets the `self.norm` attribute. + + TODO(cmikeh2): In the future we'll distinguish between the different norm objects, + but for now we'll just use the same one for all of them. + """ + norm_config = DSNormConfig( + max_tokens=self._engine_config.state_manager.max_ragged_batch_size, + type=self.norm_type, + channels=self.model_dim, + residual_dtype=self.activation_dtype, + input_dtype=self.activation_dtype, + output_dtype=self.activation_dtype, + eps=self._config.layer_norm_epsilon, + ) + + self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config) + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. 
+ hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. + """ + # TODO(cmikeh2): Distribute ragged_batch_info to all modules + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None) + + # Should be configurable in the future + hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None) + hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. + """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/qwen/policy.py b/deepspeed/inference/v2/model_implementations/qwen/policy.py new file mode 100644 index 000000000000..a9263f553621 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen/policy.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import QwenNonTransformerContainer, QwenTransformerContainer +from .model import QwenInferenceModel + + +class QwenPolicy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> QwenInferenceModel: + return QwenInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + transformer_containers = [QwenTransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['transformer.h'], transformer_containers) + + map.set_non_transformer_params(QwenNonTransformerContainer(self.model)) + + map.set_unmapped_params(['transformer.rotary_emb.inv_freq']) + + return map diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py new file mode 100644 index 000000000000..80b09757c74d --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import Qwen2Policy diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/container.py b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py new file mode 100644 index 000000000000..6556d87d6afb --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/container.py @@ -0,0 +1,82 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. + +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Qwen2 model looks like this: + +Qwen2ForCausalLM( + (model): Qwen2Model( + (embed_tokens): Embedding(151936, 1024) + (layers): ModuleList( + (0-23): 24 x Qwen2DecoderLayer( + (self_attn): Qwen2SdpaAttention( + (q_proj): Linear(in_features=1024, out_features=1024, bias=True) + (k_proj): Linear(in_features=1024, out_features=1024, bias=True) + (v_proj): Linear(in_features=1024, out_features=1024, bias=True) + (o_proj): Linear(in_features=1024, out_features=1024, bias=False) + (rotary_emb): Qwen2RotaryEmbedding() + ) + (mlp): Qwen2MLP( + (gate_proj): Linear(in_features=1024, out_features=2816, bias=False) + (up_proj): Linear(in_features=1024, out_features=2816, bias=False) + (down_proj): Linear(in_features=2816, out_features=1024, bias=False) + (act_fn): SiLU() + ) + (input_layernorm): Qwen2RMSNorm() + (post_attention_layernorm): Qwen2RMSNorm() + ) + ) + (norm): Qwen2RMSNorm() + ) + (lm_head): Linear(in_features=1024, out_features=151936, bias=False) +) +''' + + +class Qwen2TransformerContainer(LayerContainer): + """ + Transformer layer container for the Qwen2 model. 
+ """ + qkv_w: UnfusedQKVParameter + qkv_b: UnfusedQKVParameter + attn_out_w: AttentionOutputParameter + mlp_1_w: GatedMLPParameter + mlp_2_w: MLP2Parameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "self_attn.q_proj.weight": "qkv_w.q_params", + "self_attn.k_proj.weight": "qkv_w.k_params", + "self_attn.v_proj.weight": "qkv_w.v_params", + "self_attn.q_proj.bias": "qkv_b.q_params", + "self_attn.k_proj.bias": "qkv_b.k_params", + "self_attn.v_proj.bias": "qkv_b.v_params", + "self_attn.o_proj.weight": "attn_out_w.params", + "mlp.gate_proj.weight": "mlp_1_w.gate_params", + "mlp.up_proj.weight": "mlp_1_w.up_params", + "mlp.down_proj.weight": "mlp_2_w.params", + "input_layernorm.weight": "attn_norm_gamma.params", + "post_attention_layernorm.weight": "mlp_norm_gamma.params", + } + + +class Qwen2NonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Qwen2 model. + """ + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "model.norm.weight": "final_norm.params", + "lm_head.weight": "word_unembed.params", + } diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/model.py b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py new file mode 100644 index 000000000000..d535462a954d --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/model.py @@ -0,0 +1,221 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...inference_utils import ActivationType, DtypeEnum +from .. import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...modules import heuristics +from ...ragged import RaggedBatchWrapper + +from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer + + +class Qwen2InferenceModel(DSTransformerModelBase): + """ + Inference model implementation for ragged batching for Llama-2 models. + """ + + _non_transformer: Optional[Qwen2NonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[Qwen2TransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. 
+ """ + """ + Properties ineherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_seq_length + + """ + Properties ineherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + # TODO(ZonePG): bf16 inference results may be different from huggingface bf16, + # because in rms_norm, Qwen still use float() instead of bf16 + # if self._config.torch_dtype == torch.float16: + # return DtypeEnum.fp16 + # elif self._config.torch_dtype == torch.bfloat16: + # return DtypeEnum.bf16 + # else: + # raise NotImplementedError("Only fp16 and bf16 are supported") + return DtypeEnum.fp16 + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.SiGLU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + + def make_norm_layer(self) -> None: + """ + Instantiates the normalization layer for the model. This sets the `self.norm` attribute. + + TODO(cmikeh2): In the future we'll distinguish between the different norm objects, + but for now we'll just use the same one for all of them. + """ + norm_config = DSNormConfig( + max_tokens=self._engine_config.state_manager.max_ragged_batch_size, + type=self.norm_type, + channels=self.model_dim, + residual_dtype=self.activation_dtype, + input_dtype=self.activation_dtype, + output_dtype=self.activation_dtype, + eps=self._config.rms_norm_eps, + ) + + self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config) + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer_layer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. 
This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. + """ + # TODO(cmikeh2): Distribute ragged_batch_info to all modules + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None) + + # Should be configurable in the future + hidden_states = self.mlp_1(hidden_states, cur_params.mlp_1_w, b=None) + hidden_states = self.mlp_2(hidden_states, cur_params.mlp_2_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. + """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer_layer(layer_idx, residual, hidden_states, + wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py new file mode 100644 index 000000000000..9c5db2ba0065 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2/policy.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import Qwen2NonTransformerContainer, Qwen2TransformerContainer +from .model import Qwen2InferenceModel + + +class Qwen2Policy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2InferenceModel: + return Qwen2InferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + transformer_containers = [Qwen2TransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(Qwen2NonTransformerContainer(self.model)) + + map.set_unmapped_params( + [f'model.layers.{i}.self_attn.rotary_emb.inv_freq' for i in range(self.model.num_layers)]) + + return map diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py new file mode 100644 index 000000000000..23e06a770023 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .policy import Qwen2MoePolicy diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py new file mode 100644 index 000000000000..e499379da7e3 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/container.py @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# Create a container object to save model-specific tensors using the policy file above. 
+ +from ..common_parameters import * +from ..layer_container_base import LayerContainer +''' + # HF Qwen2-57B-A14B model looks like this: + +Qwen2MoeForCausalLM( + (model): Qwen2MoeModel( + (embed_tokens): Embedding(151936, 3584) + (layers): ModuleList( + (0-27): 28 x Qwen2MoeDecoderLayer( + (self_attn): Qwen2MoeSdpaAttention( + (q_proj): Linear(in_features=3584, out_features=3584, bias=True) + (k_proj): Linear(in_features=3584, out_features=512, bias=True) + (v_proj): Linear(in_features=3584, out_features=512, bias=True) + (o_proj): Linear(in_features=3584, out_features=3584, bias=False) + (rotary_emb): Qwen2MoeRotaryEmbedding() + ) + (mlp): Qwen2MoeSparseMoeBlock( + (gate): Linear(in_features=3584, out_features=64, bias=False) + (experts): ModuleList( + (0-63): 64 x Qwen2MoeMLP( + (gate_proj): Linear(in_features=3584, out_features=2560, bias=False) + (up_proj): Linear(in_features=3584, out_features=2560, bias=False) + (down_proj): Linear(in_features=2560, out_features=3584, bias=False) + (act_fn): SiLU() + ) + ) + (shared_expert): Qwen2MoeMLP( + (gate_proj): Linear(in_features=3584, out_features=20480, bias=False) + (up_proj): Linear(in_features=3584, out_features=20480, bias=False) + (down_proj): Linear(in_features=20480, out_features=3584, bias=False) + (act_fn): SiLU() + ) + (shared_expert_gate): Linear(in_features=3584, out_features=1, bias=False) + ) + (input_layernorm): Qwen2MoeRMSNorm((3584,), eps=1e-06) + (post_attention_layernorm): Qwen2MoeRMSNorm((3584,), eps=1e-06) + ) + ) + (norm): Qwen2MoeRMSNorm((3584,), eps=1e-06) + ) + (lm_head): Linear(in_features=3584, out_features=151936, bias=False) +) +''' + + +class Qwen2MoeTransformerContainer(LayerContainer): + """ + Transformer layer container for the Qwen2Moe model. + """ + qkv_w: UnfusedQKVParameter + qkv_b: UnfusedQKVParameter + attn_out_w: AttentionOutputParameter + moe_gate: MoEGatingWeightParameter + moe_mlp_1: UnfusedMoEGatedMLPParameter + moe_mlp_2: UnfusedMoEMLP2Parameter + shared_moe_mlp_1: GatedMLPParameter + shared_moe_mlp_2: MLP2Parameter + shared_moe_gate: MoEGatingWeightParameter + attn_norm_gamma: NormParameter + mlp_norm_gamma: NormParameter + + PARAM_MAPPING = { + "self_attn.q_proj.weight": "qkv_w.q_params", + "self_attn.k_proj.weight": "qkv_w.k_params", + "self_attn.v_proj.weight": "qkv_w.v_params", + "self_attn.q_proj.bias": "qkv_b.q_params", + "self_attn.k_proj.bias": "qkv_b.k_params", + "self_attn.v_proj.bias": "qkv_b.v_params", + "self_attn.o_proj.weight": "attn_out_w.params", + "mlp.gate.weight": "moe_gate.params", + "mlp.experts.*.gate_proj.weight": "moe_mlp_1.gating_experts", + "mlp.experts.*.up_proj.weight": "moe_mlp_1.up_experts", + "mlp.experts.*.down_proj.weight": "moe_mlp_2.experts", + "mlp.shared_expert.gate_proj.weight": "shared_moe_mlp_1.gate_params", + "mlp.shared_expert.up_proj.weight": "shared_moe_mlp_1.up_params", + "mlp.shared_expert.down_proj.weight": "shared_moe_mlp_2.params", + "mlp.shared_expert_gate.weight": "shared_moe_gate.params", + "input_layernorm.weight": "attn_norm_gamma.params", + "post_attention_layernorm.weight": "mlp_norm_gamma.params", + } + + +class Qwen2MoeNonTransformerContainer(LayerContainer): + """ + Non-Transformer layer container for the Qwen2Moe model. 
+ """ + word_emb: EmbeddingParameter + word_unembed: UnembedParameter + final_norm: NormParameter + + PARAM_MAPPING = { + "model.embed_tokens.weight": "word_emb.params", + "model.norm.weight": "final_norm.params", + "lm_head.weight": "word_unembed.params", + } diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py new file mode 100644 index 000000000000..c7841b24e5fc --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/model.py @@ -0,0 +1,359 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Iterable, Optional, Tuple + +import torch + +import deepspeed.comm as dist + +from ...allocator import empty_from +from ...config_v2 import RaggedInferenceEngineConfig +from ...inference_utils import ActivationType, DtypeEnum +from ...model_implementations import * +from ...modules.configs import * +from ...modules.interfaces import * +from ...modules import heuristics +from ...ragged import RaggedBatchWrapper +from ..inference_model_base import ( + DSModelImplementationConfig, + MPType, +) + +from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer + + +class Qwen2MoeInferenceModel(DSMoETransformerModelBase): + """ + Inference model implementation for Qwen2MoE models. + """ + + _non_transformer: Optional[Qwen2MoeNonTransformerContainer] + """ + Embed + unembed container. Specializing the type annotation. + """ + + _transformer: Optional[Iterable[Qwen2MoeTransformerContainer]] + """ + Per-layer transformer container. Specializing the type annotation. + """ + """ + Properties ineherited from `DSInferenceModelBase` + """ + + @property + def max_sequence_length(self) -> int: + return self._config.max_position_embeddings + + """ + Properties ineherited from `DSTransformerModelBase` + """ + + @property + def num_layers(self) -> int: + return self._config.num_hidden_layers + + @property + def model_dim(self) -> int: + return self._config.hidden_size + + @property + def vocab_size(self) -> int: + return self._config.vocab_size + + @property + def head_size(self) -> int: + return self.model_dim // self.n_heads + + @property + def n_heads(self) -> int: + return self._config.num_attention_heads + + @property + def intermediate_dim(self) -> int: + return self._config.shared_expert_intermediate_size + + @property + def n_heads_kv(self) -> int: + return self._config.num_key_value_heads + + @property + def activation_dtype(self) -> DtypeEnum: + # TODO(ZonePG): bf16 inference results may be different from huggingface bf16, + # because in rms_norm, Qwen still use float() instead of bf16 + # if self._config.torch_dtype == torch.float16: + # return DtypeEnum.fp16 + # elif self._config.torch_dtype == torch.bfloat16: + # return DtypeEnum.bf16 + # else: + # raise NotImplementedError("Only fp16 and bf16 are supported") + return DtypeEnum.fp16 + + @property + def mlp_activation_fn(self) -> ActivationType: + return ActivationType.SiGLU + + @property + def norm_type(self) -> NormTypeEnum: + return NormTypeEnum.RMSNorm + + @property + def positional_embedding_type(self) -> PositionalEmbeddingType: + return PositionalEmbeddingType.rotate_half + + @property + def positional_embedding_config(self) -> Optional[RotateHalfConfig]: + return RotateHalfConfig(theta_base=self._config.rope_theta) + + """ + Inherited from `DSMoETransformerModelBase` + """ + + @property + def n_experts(self) -> int: + return 
self._config.num_experts
+
+    @property
+    def n_top_k(self) -> int:
+        return self._config.num_experts_per_tok
+
+    @property
+    def normalize_expert_scores(self) -> bool:
+        return self._config.norm_topk_prob
+
+    def make_moe_layer(self) -> None:
+        """
+        Instantiates the MoE layer for the model. This sets the `self.moe` attribute.
+        """
+        sharded_dim = sharded_intermediate_dim(self.intermediate_dim // self.n_top_k, self.tp_size, self.tp_rank)
+
+        moe_config = DSMoEConfig(
+            max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+            model_dim=self.model_dim,
+            intermediate_features=sharded_dim,
+            activation=self.mlp_activation_fn,
+            n_experts=self.n_experts,
+            top_k=self.n_top_k,
+            input_dtype=self.activation_dtype,
+            output_dtype=self.activation_dtype,
+            normalize_scores=self.normalize_expert_scores,
+        )
+
+        self.moe = heuristics.instantiate_moe(moe_config, self._engine_config)
+
+    ######### MLP 1 #########
+    def make_shared_expert_mlp_1_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the first MLP in the feedforward network.
+        This sets the `self.mlp_1` attribute.
+        """
+        shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+        linear_config = DSLinearConfig(
+            max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+            in_channels=self.model_dim,
+            out_channels=shard_size,
+            activation=self.mlp_activation_fn,
+            input_dtype=self.activation_dtype,
+            output_dtype=self.activation_dtype,
+        )
+
+        self.shared_expert_mlp_1 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+    ######### MLP 2 #########
+    def make_shared_expert_mlp_2_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the second MLP in the feedforward network.
+        This sets the `self.mlp_2` attribute.
+        """
+        shard_size = sharded_intermediate_dim(self.intermediate_dim, self.tp_size, self.tp_rank)
+
+        linear_config = DSLinearConfig(
+            max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+            in_channels=shard_size,
+            out_channels=self.model_dim,
+            input_dtype=self.activation_dtype,
+            output_dtype=self.activation_dtype,
+        )
+
+        self.shared_expert_mlp_2 = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+    ######### Shared expert gate #########
+    def make_shared_expert_gate_layer(self) -> None:
+        """
+        Instantiates the linear projection layer for the shared expert gate.
+        This sets the `self.shared_expert_gate` attribute.
+        """
+        shard_size = sharded_intermediate_dim(self.model_dim, self.tp_size, self.tp_rank)

+        linear_config = DSLinearConfig(
+            max_tokens=self._engine_config.state_manager.max_ragged_batch_size,
+            in_channels=shard_size,
+            out_channels=8,
+            input_dtype=self.activation_dtype,
+            output_dtype=self.activation_dtype,
+        )
+
+        self.shared_expert_gate = heuristics.instantiate_linear(linear_config, self._engine_config)
+
+    def make_norm_layer(self) -> None:
+        """
+        Instantiates the normalization layer for the model. This sets the `self.norm` attribute.
+
+        TODO(cmikeh2): In the future we'll distinguish between the different norm objects,
+        but for now we'll just use the same one for all of them.
+ """ + norm_config = DSNormConfig( + max_tokens=self._engine_config.state_manager.max_ragged_batch_size, + type=self.norm_type, + channels=self.model_dim, + residual_dtype=self.activation_dtype, + input_dtype=self.activation_dtype, + output_dtype=self.activation_dtype, + eps=self._config.rms_norm_eps, + ) + + self.norm = heuristics.instantiate_pre_norm(norm_config, self._engine_config) + + """ + Model implementation + """ + + def __init__(self, config: DSModelImplementationConfig, engine_config: RaggedInferenceEngineConfig, + base_mp_group: MPType) -> None: + """ + Base implementation for initialization. By default, this will initialize + the traditional components of a transformer model: + - Embedding + - QKV projection + - Self attention + - Attention output projection + - Feed forward network + - Normalization + - Unembedding + + Arguments: + config (DSModelImplementationConfig): Model-specific configuration. No assumptions + should be made about this config that are not closely tied to the specific + model implementation. + engine_config (RaggedInferenceEngineConfig): Engine configuration. + base_mp_group (MPType): Base communication group for Tensor-parallel inference. + """ + super().__init__(config, engine_config, base_mp_group) + + self.make_norm_layer() + self.make_qkv_layer() + self.make_attn_layer() + self.make_attn_out_layer() + self.make_moe_layer() + self.make_shared_expert_mlp_1_layer() + self.make_shared_expert_mlp_2_layer() + self.make_shared_expert_gate_layer() + self.make_embedding_layer() + self.make_unembedding_layer() + self._kv_cache_config = None + + """ + Forward implementations + """ + + def _forward_embed(self, ragged_batch: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs the embedding lookup prior to running the transformer of the model. + + Arguments: + ragged_batch (RaggedBatchWrapper): The batch to embed. + + Returns: + torch.Tensor: The embedded batch. + """ + embed = self.embed(ragged_batch, self._non_transformer.word_emb) + + if embed.shape[-1] != self.model_dim: + raise ValueError(f"Embedding output shape {embed.shape} does not match model_dim {self.model_dim}") + + return embed + + def _forward_transformer(self, layer_idx: int, residual: torch.Tensor, hidden_states: torch.Tensor, + ragged_batch_info: RaggedBatchWrapper) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Executes one (slightly offset) layer of the transformer. This implementation does a peak-ahead + optimization to fuse the layer norm of the next layer into the current layer. + + Arguments: + layer_idx (int): The index of the layer to execute. + residual (torch.Tensor): The residual tensor from the previous layer. + hidden_states (torch.Tensor): The hidden states from the previous layer. This is the + hidden states after pre normalization. + ragged_batch_info (RaggedBatchWrapper): The batch metadata. 
+ """ + # TODO(cmikeh2): Distribute ragged_batch_info to all modules + + cur_params = self._transformer[layer_idx] + kv_cache = self.state_manager.get_cache(layer_idx) + + hidden_states = self.qkv(hidden_states, cur_params.qkv_w, b=cur_params.qkv_b) + hidden_states = self.attn(hidden_states, kv_cache, ragged_batch_info) + hidden_states = self.attn_out(hidden_states, cur_params.attn_out_w, b=None) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + residual, hidden_states = self.norm(residual, hidden_states, cur_params.mlp_norm_gamma, beta=None) + + shared_expert_output = self.shared_expert_mlp_1(hidden_states, cur_params.shared_moe_mlp_1, b=None) + shared_expert_output = self.shared_expert_mlp_2(shared_expert_output, cur_params.shared_moe_mlp_2, b=None) + shared_expert_gate_output = self.shared_expert_gate(hidden_states, cur_params.shared_moe_gate, b=None)[..., :1] + # shared_expert_gate_output shape[-1] is 1 + shared_expert_output.mul_(torch.sigmoid(shared_expert_gate_output)) + hidden_states = self.moe(hidden_states, ragged_batch_info, cur_params.moe_gate, cur_params.moe_mlp_1, + cur_params.moe_mlp_2) + hidden_states.add_(shared_expert_output) + + if self.tp_size > 1: + dist.all_reduce(hidden_states, group=self._base_mp_group) + + if layer_idx != self.num_layers - 1: + next_params = self._transformer[layer_idx + 1] + residual, hidden_states = self.norm(residual, hidden_states, next_params.attn_norm_gamma, beta=None) + else: + # On last layer, we just need to perform the residual add. Adding into the residual + # here is safe. + residual.add_(hidden_states) + + return residual, hidden_states + + def _forward_unembed(self, hidden_states: torch.Tensor, ragged_batch_info: RaggedBatchWrapper) -> torch.Tensor: + """ + Performs unembedding of the hidden states to logits. This will only sample the final + token of each sequence. + """ + logits = self.unembed(hidden_states, + self._non_transformer.word_unembed, + ragged_batch_info, + gamma=self._non_transformer.final_norm) + + if self.tp_size > 1: + comm_buffer = empty_from(self._comm_logits, (self.tp_size, logits.shape[0], logits.shape[1])) + full_logits = empty_from(self._return_logits, (logits.shape[0], self.vocab_size)) + + dist.all_gather_into_tensor(comm_buffer, logits, group=self._base_mp_group) + + full_logits.copy_(comm_buffer.permute(1, 0, 2).reshape(logits.shape[0], self.vocab_size)) + + return full_logits + else: + return logits + + def forward(self, wrapped_batch: RaggedBatchWrapper) -> torch.Tensor: + + residual = self._forward_embed(wrapped_batch) + + residual, hidden_states = self.norm(residual, None, self._transformer[0].attn_norm_gamma, beta=None) + + for layer_idx in range(self.num_layers): + residual, hidden_states = self._forward_transformer(layer_idx, residual, hidden_states, wrapped_batch) + + return self._forward_unembed(residual, wrapped_batch) diff --git a/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py new file mode 100644 index 000000000000..630bafe993a8 --- /dev/null +++ b/deepspeed/inference/v2/model_implementations/qwen_v2_moe/policy.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Any + +from ...config_v2 import RaggedInferenceEngineConfig +from ..inference_policy_base import ContainerMap, InferenceV2Policy +from .container import Qwen2MoeNonTransformerContainer, Qwen2MoeTransformerContainer +from .model import Qwen2MoeInferenceModel + + +class Qwen2MoePolicy(InferenceV2Policy): + + def instantiate_model(self, engine_config: RaggedInferenceEngineConfig, mp_group: Any) -> Qwen2MoeInferenceModel: + return Qwen2MoeInferenceModel(config=self._model_config, engine_config=engine_config, base_mp_group=mp_group) + + def build_container_map(self) -> ContainerMap: + map = ContainerMap() + + transformer_containers = [Qwen2MoeTransformerContainer(self.model) for _ in range(self.model.num_layers)] + + map.set_transformer_params(['model.layers'], transformer_containers) + + map.set_non_transformer_params(Qwen2MoeNonTransformerContainer(self.model)) + + map.set_unmapped_params([]) + + return map diff --git a/deepspeed/inference/v2/modules/configs/__init__.py b/deepspeed/inference/v2/modules/configs/__init__.py index 19b9fb99ddea..3429e69b47de 100644 --- a/deepspeed/inference/v2/modules/configs/__init__.py +++ b/deepspeed/inference/v2/modules/configs/__init__.py @@ -3,7 +3,12 @@ # DeepSpeed Team -from .attention_configs import (DSSelfAttentionConfig, PositionalEmbeddingType, MaskingType) +from .attention_configs import ( + DSSelfAttentionConfig, + PositionalEmbeddingType, + MaskingType, + RotateHalfConfig, +) from .embedding_config import DSEmbeddingsConfig from .linear_config import DSLinearConfig from .moe_config import DSMoEConfig diff --git a/deepspeed/inference/v2/modules/configs/attention_configs.py b/deepspeed/inference/v2/modules/configs/attention_configs.py index bcdc3d2613d5..be6a3535024c 100644 --- a/deepspeed/inference/v2/modules/configs/attention_configs.py +++ b/deepspeed/inference/v2/modules/configs/attention_configs.py @@ -4,10 +4,11 @@ # DeepSpeed Team from enum import Enum -from typing import Dict +from typing import Dict, Optional from ...inference_utils import DtypeEnum from ...modules.ds_module import DSModuleConfig +from deepspeed.runtime.config_utils import DeepSpeedConfigModel class PositionalEmbeddingType(Enum): @@ -25,6 +26,28 @@ class PositionalEmbeddingType(Enum): alibi = "alibi" +class RotateHalfConfig(DeepSpeedConfigModel): + + use_trained_freqs: bool = False + """ + Whether to use a passed `trained_freqs` tensor for the attention implementation + or to use default synthesized frequencies. + """ + + theta_base: float = 10_000.0 + """ + Base for theta. This will only be used if `use_trained_freqs` is False. + """ + + rotate_dim: Optional[int] = None + """ + How many neurons to rotate. If None, then all neurons will be rotated. Many external configs + will set this number to half the head dimension and then internally multiply by 2. To make it + more clear to understand what is happening (rotate_dim < head_dim -> then only partial rotation), + we do not do this multiplication internally. 
+ """ + + class MaskingType(Enum): # No masking @@ -79,4 +102,9 @@ class DSSelfAttentionConfig(DSModuleConfig): positional_embedding_type: PositionalEmbeddingType = PositionalEmbeddingType.none # Positional embedding args - positional_embedding_args: Dict = {} + positional_embedding_config: Optional[RotateHalfConfig] = None + """ + To extend this for the other positional embedding types, we would need to add + new configs for each type (as necessary) and annotate this with the + Union[RotateHalfConfig, OtherConfig, ...] type. + """ diff --git a/deepspeed/inference/v2/modules/configs/moe_config.py b/deepspeed/inference/v2/modules/configs/moe_config.py index 1a88d54af19f..7bc944f55e17 100644 --- a/deepspeed/inference/v2/modules/configs/moe_config.py +++ b/deepspeed/inference/v2/modules/configs/moe_config.py @@ -48,3 +48,9 @@ class DSMoEConfig(DSModuleConfig): """ Activation function of the first MLP1 """ + + normalize_scores: bool = False + """ + Whether normalization is applied to the selected scores. If true, the module + should rescale the scores such that their sum is 1.0. + """ diff --git a/deepspeed/inference/v2/modules/heuristics.py b/deepspeed/inference/v2/modules/heuristics.py index b89e95c0d834..f719e299a4b2 100644 --- a/deepspeed/inference/v2/modules/heuristics.py +++ b/deepspeed/inference/v2/modules/heuristics.py @@ -86,8 +86,24 @@ def instantiate_linear(linear_config: DSLinearConfig, engine_config: RaggedInfer A linear module implementing the given configuration. """ - # Currently, we only have one implementation, so we just return it. - config = ConfigBundle(name="blas_fp_linear", config=linear_config) + quantization_mode = engine_config.quantization.quantization_mode + if quantization_mode is None: + config = ConfigBundle(name="blas_fp_linear", config=linear_config) + else: + # Currently, we only support ``quantized_wf6af16_linear`` on NVIDIA Ampere GPUs. + if quantization_mode == "wf6af16": + import torch + if not torch.cuda.is_available(): #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on CUDA") + else: + is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None + if is_rocm_pytorch: + raise ValueError("WF6AF16 quantization is only supported on NVIDIA GPUs") + elif torch.cuda.get_device_properties(0).major != 8: #ignore-cuda + raise ValueError("WF6AF16 quantization is only supported on Ampere architectures") + config = ConfigBundle(name="quantized_wf6af16_linear", config=linear_config) + else: + raise ValueError(f"Unsupported quantization mode: {quantization_mode}") return DSLinearRegistry.instantiate_config(config) diff --git a/deepspeed/inference/v2/modules/implementations/attention/dense_blocked_attention.py b/deepspeed/inference/v2/modules/implementations/attention/dense_blocked_attention.py index bb482f0c58d6..3515b3c2b690 100644 --- a/deepspeed/inference/v2/modules/implementations/attention/dense_blocked_attention.py +++ b/deepspeed/inference/v2/modules/implementations/attention/dense_blocked_attention.py @@ -68,9 +68,16 @@ def __init__(self, config: DSSelfAttentionConfig, implementation_config: Dict[st Args: config (DSSelfAttentionConfig): The self attention config for all attention DSModules. - implementation_config (Dict[str, Any]): The implementation config for this DSModule may - contain a `trained_freqs` key. If passed, the implementation will expect a `trained_freqs` - tensor in the `forward` method and will not synthesize the frequencies internally. 
+            implementation_config (Dict[str, Any]):
+                There are two (dependent) potential components in the implementation config.
+
+                1. `trained_freqs` - If the embedding weights for RoPE are trained, the implementation
+                config should contain {'trained_freqs': True}. This will mean the implementation will
+                expect a `trained_freqs` tensor in the `forward` method and will not synthesize the
+                values internally.
+
+                2. `theta_base` - The base value for synthesized frequencies in the rotary embeddings.
+                This will only be used if `trained_freqs` is False or not present in the `implementation_config`. If this is not included, the default value of 10000.0 will be used.
         """
         super().__init__(config, implementation_config)
 
@@ -79,14 +86,20 @@ def __init__(self, config: DSSelfAttentionConfig, implementation_config: Dict[st
             self._kv_copy = LinearBlockedKVCopy(self._config.head_size, self._config.n_heads_q,
                                                 self._config.n_heads_kv, self._config.input_dtype)
         elif embed_type == PositionalEmbeddingType.rotate_half:
-            use_trained_freqs = "trained_freqs" in self._config.positional_embedding_args and self._config.positional_embedding_args[
-                "trained_freqs"]
-            if use_trained_freqs:
+            rotary_config = config.positional_embedding_config
+            assert rotary_config is not None, "Rotary config must be provided if using rotate_half as Positional Embedding Type."
+
+            if rotary_config.use_trained_freqs:
+                # Theta and rotary dim are effectively embedded into either the values (theta) or the shape (rotary_dim)
+                # of the trained_freqs tensor.
                 self._kv_copy = BlockedTrainedRotaryEmbeddings(self._config.head_size, self._config.n_heads_q,
                                                                self._config.n_heads_kv, self._config.input_dtype)
             else:
+                theta_base = rotary_config.theta_base
+                rotary_dim = rotary_config.rotate_dim if rotary_config.rotate_dim is not None else self._config.head_size
                 self._kv_copy = BlockedRotaryEmbeddings(self._config.head_size, self._config.n_heads_q,
-                                                        self._config.n_heads_kv, self._config.input_dtype)
+                                                        self._config.n_heads_kv, self._config.input_dtype, rotary_dim,
+                                                        theta_base)
 
         self._softmax_scale = self._config.scale_factor
diff --git a/deepspeed/inference/v2/modules/implementations/embedding/ragged_embedding.py b/deepspeed/inference/v2/modules/implementations/embedding/ragged_embedding.py
index 6782bcae81c8..90cdd39d1be7 100644
--- a/deepspeed/inference/v2/modules/implementations/embedding/ragged_embedding.py
+++ b/deepspeed/inference/v2/modules/implementations/embedding/ragged_embedding.py
@@ -32,7 +32,7 @@ def supports_config(config: DSEmbeddingsConfig) -> bool:
         if config.use_token_type:
             return False
 
-        if config.output_normalization != None:
+        if config.output_normalization is not None:
             return False
 
         try:
diff --git a/deepspeed/inference/v2/modules/implementations/linear/__init__.py b/deepspeed/inference/v2/modules/implementations/linear/__init__.py
index e76aab71c4cf..0501af54c4e6 100644
--- a/deepspeed/inference/v2/modules/implementations/linear/__init__.py
+++ b/deepspeed/inference/v2/modules/implementations/linear/__init__.py
@@ -4,3 +4,4 @@
 # DeepSpeed Team
 
 from .blas_fp_linear import BlasFPLinear
+from .quantized_linear import QuantizedWf6Af16Linear, fp_quantize
diff --git a/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py
new file mode 100644
index 000000000000..933cf55b2391
--- /dev/null
+++ b/deepspeed/inference/v2/modules/implementations/linear/quantized_linear.py
@@ -0,0 +1,205 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from typing import Any, Dict, Optional
+
+import torch
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.ops.op_builder import InferenceCoreBuilder
+from ....allocator import empty_from
+from ....inference_utils import is_gated
+from ....kernels.core_ops import (
+    CUDAWf6Af16Linear,
+    CUDABiasActivation,
+    CUDAGatedActivation,
+)
+
+from ...interfaces import DSLinearBase, DSLinearRegistry
+from ...configs import DSLinearConfig
+from ....inference_parameter import InferenceParameter
+
+
+def fp_quantize(input: torch.FloatTensor,
+                num_bits: int = 6,
+                exp_bits: int = 3,
+                min_value: torch.FloatTensor = None,
+                max_value: torch.FloatTensor = None,
+                group_size: int = -1):
+    """
+    Args:
+        input (`torch.FloatTensor`)
+            The input which needs to be quantized
+        num_bits (int, >=4)
+            Number of bits to use for quantization
+        exp_bits:
+            fp exp_bits
+        min_value/max_value (torch.FloatTensor)
+            Used for static activation quantization
+        group_size (int) N
+            The quantization block size; each group of N numbers has its own scaling
+            factor and offset. -1 means use the last dim as the group_size
+    Returns:
+        quantized_fake_fp6
+            The quantized weights, stored in fp16 format and containing fp6 values.
+        scales
+            Quantization scales
+    """
+
+    try:
+        from qtorch.quant import float_quantize
+    except ImportError:
+        raise ImportError("Please install qtorch to use this function")
+
+    assert (min_value is None and max_value is None) or (min_value is not None and max_value is not None)
+
+    assert input.dtype == torch.float16
+
+    orig_device = input.device
+    input = input.to(torch.float32).to(get_accelerator().current_device())
+    if num_bits == 6 and exp_bits == 3:  # this is default
+        q_range = 28
+    else:
+        raise NotImplementedError
+
+    man_bits = num_bits - exp_bits - 1
+    input_shape = input.shape
+
+    if group_size == -1:
+        group_size = input_shape[-1]
+    else:
+        # Only support per-channel quantization
+        raise NotImplementedError
+    num_groups = input.numel() // group_size
+    input = input.reshape(num_groups, -1)
+
+    if min_value is None:
+        max_input = torch.amax(torch.abs(input), dim=-1).view(num_groups, -1)
+    else:
+        max_input = torch.max(min_value.abs(), max_value)  # .view(-1)
+    scales = max_input / q_range  # q_range + 1
+    scales[scales == 0] = 1  # avoid zero scales
+    scaled_input = input / scales
+
+    quantized_fake_fp6 = float_quantize(scaled_input, exp_bits, man_bits, rounding="nearest")
+
+    quantized_fake_fp6 = quantized_fake_fp6.reshape(input_shape).contiguous().to(torch.float16).to(orig_device)
+    scales = scales.to(torch.float16).to(orig_device)
+    # Now the dequantized value is quantized_fake_fp6 * scales
+
+    return quantized_fake_fp6, scales
+
+
+@DSLinearRegistry.register_module
+class QuantizedWf6Af16Linear(DSLinearBase):
+    """
+    Linear DSModule for FP6 weight-only quantization kernel, where weight is FP6
+    and activation is FP16.
+    """
+
+    @staticmethod
+    def name():
+        return 'quantized_wf6af16_linear'
+
+    @staticmethod
+    def supports_config(config: DSLinearConfig) -> bool:
+        if config.input_dtype != config.output_dtype:
+            return False
+
+        # As for fp6 data items, they are packed and stored in a set of fp16
+        # tensors. E.g., 8 fp6 data items are stored in 3 fp16 values.
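For reference, the packing arithmetic mentioned in the comment above works out as 8 * 6 = 48 bits, i.e. exactly three 16-bit storage elements. Below is a minimal sketch of how the `fp_quantize` helper defined earlier in this file could be exercised; it is illustrative only (not part of the patch) and assumes `qtorch` is installed, an accelerator device is available, and the module is importable from the path this patch adds.

```python
import torch
from deepspeed.inference.v2.modules.implementations.linear.quantized_linear import fp_quantize

# Packing arithmetic from the comment above: 8 FP6 values need 8 * 6 = 48 bits,
# i.e. exactly 3 fp16 (16-bit) storage elements.
assert 8 * 6 == 3 * 16

# Hypothetical per-output-channel quantization of a small fp16 weight matrix.
# group_size=-1 (the default) groups along the last dimension, so each of the
# 8 rows gets its own scale.
w = torch.randn(8, 64, dtype=torch.float16)
fake_fp6, scales = fp_quantize(w, num_bits=6, exp_bits=3)

print(fake_fp6.shape, fake_fp6.dtype)  # torch.Size([8, 64]) torch.float16 (FP6 values held in fp16)
print(scales.shape)                    # torch.Size([8, 1]), one scale per output channel
# Dequantization is approximately fake_fp6 * scales, as noted in fp_quantize().
```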
+ if config.input_dtype != torch.float16: + return False + + if is_gated(config.activation): + try: + _ = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation) + except ValueError: + return False + else: + try: + _ = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation) + except ValueError: + return False + + return True + + def __init__(self, config: DSLinearConfig, implementation_config: Dict[str, Any]) -> None: + super().__init__(config, implementation_config) + + self._linear_impl = CUDAWf6Af16Linear() + + if is_gated(config.activation): + # In the FP6 kernel implementation, the MatMul is W * A, where W is + # the weight and A is activation. M is the output channel size. + self.out_channels = self._config.out_channels * 2 + self.in_channels = self._config.in_channels + self._is_gated = True + self._act_fn = CUDAGatedActivation(config.out_channels, config.output_dtype, config.activation) + self._double_buffer = torch.empty((config.max_tokens, config.out_channels * 2), + dtype=config.output_dtype, + device=get_accelerator().current_device()) + else: + self.out_channels = self._config.out_channels + self.in_channels = self._config.in_channels + self._is_gated = False + self._act_fn = CUDABiasActivation(config.out_channels, config.output_dtype, config.activation) + + self._output = torch.empty((config.max_tokens, config.out_channels), + dtype=config.output_dtype, + device=get_accelerator().current_device()) + + self.inf_module = InferenceCoreBuilder().load() + self.inf_module.create_handle() + self.preprocess_weight = self.inf_module.preprocess_weight + + self.quantizer = fp_quantize + + def transform_param(self, param: torch.Tensor) -> InferenceParameter: + """ + Converts param to same data type as input and output. + + Parameters: + param (torch.Tensor): Weight or bias tensor. + """ + # It expects that the quantization scales are store in the attribute `scales`. + + if param.ndim == 1: # bias, do nothing + return InferenceParameter.initialize(param) + + quantized_fake_fp6, scales = self.quantizer(param, num_bits=6, exp_bits=3) + + # This is for debugging, will delete before release. + assert (quantized_fake_fp6.dtype == torch.float16) + assert quantized_fake_fp6.shape[0] == self.out_channels + assert scales.numel() == self.out_channels + + weights_2bit, weights_4bit = self.preprocess_weight(quantized_fake_fp6) + + return InferenceParameter.initialize(weights_2bit, weights_4bit=weights_4bit, scales=scales) + + def forward(self, hidden_states: torch.Tensor, w: torch.Tensor, b: Optional[torch.Tensor] = None) -> torch.Tensor: + weights_2bit = w + weights_4bit = w.weights_4bit + scales = w.scales + output = empty_from(self._output, (hidden_states.shape[0], self._config.out_channels)) + if self._is_gated: + staging_output = empty_from(self._double_buffer, (hidden_states.shape[0], self.out_channels)) + self._linear_impl(staging_output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels, + hidden_states.shape[0], self.in_channels) + self._act_fn(output, staging_output, b) + else: + self._linear_impl(output, hidden_states, weights_2bit, weights_4bit, scales, self.out_channels, + hidden_states.shape[0], self.in_channels) + self._act_fn(output, b) + + return output + + @property + def output(self) -> torch.Tensor: + """ + Return the padded, pre-allocated output Tensor. 
+ """ + return self._output diff --git a/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py b/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py index e43a737515ed..a9b01d1233cd 100644 --- a/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py +++ b/deepspeed/inference/v2/modules/implementations/moe/cutlass_multi_gemm.py @@ -9,12 +9,12 @@ from deepspeed.accelerator import get_accelerator from ....allocator import empty_from -from ....inference_utils import ActivationType -from ....kernels.core_ops import BlasLibLinear +from ....inference_utils import ActivationType, is_gated +from ....kernels.core_ops import BlasLibLinear, CUDAGatedActivation from ....kernels.ragged_ops import ( MoEGather, MoEScatter, - RaggedTop1Gating, + RaggedTopKGating, ) from ....ragged import RaggedBatchWrapper @@ -42,11 +42,7 @@ def supports_config(config: DSMoEConfig) -> bool: if config.input_dtype != torch.float16 and config.input_dtype != torch.bfloat16: return False - if config.top_k != 1: - return False - - if config.activation in [ActivationType.GEGLU, ActivationType.ReGLU, ActivationType.SiGLU]: - # Currently not supporting gated activations in MoE + if config.top_k != 1 and config.top_k != 2 and config.top_k != 4 and config.top_k != 8: return False return True @@ -57,15 +53,24 @@ def __init__(self, config: DSMoEConfig, implementation_config: Dict[str, Any]) - # Convenience variables for frequently accessed items. self.max_tokens = self._config.max_tokens self.n_experts = self._config.n_experts + self.n_top_k = self._config.top_k self.intermediate_dim = self._config.intermediate_features - self._mlp_1 = MoEGEMM(fp_dtype=implementation_config['weight_dtype'], act_fn=config.activation) + moe_op_act_fn = ActivationType.IDENTITY if is_gated(self._config.activation) else self._config.activation + + self._mlp_1 = MoEGEMM(fp_dtype=implementation_config['weight_dtype'], act_fn=moe_op_act_fn) self._mlp_2 = MoEGEMM(fp_dtype=implementation_config['weight_dtype'], act_fn=ActivationType.IDENTITY) + if is_gated(self._config.activation): + self._activation = CUDAGatedActivation(self._config.model_dim, self._config.input_dtype, + self._config.activation) + else: + self._activation = None + self._gate_proj = BlasLibLinear(self._config.input_dtype) - self._top_1_gate = RaggedTop1Gating(config.input_dtype) + self._top_1_gate = RaggedTopKGating(config.input_dtype) self._moe_scatter = MoEScatter(config.input_dtype, config.model_dim) - self._moe_gather = MoEGather(config.input_dtype, config.model_dim) + self._moe_gather = MoEGather(config.input_dtype, config.model_dim, config.normalize_scores) self._create_buffers() @@ -78,32 +83,38 @@ def _create_buffers(self): self._expert_counts = torch.empty((self.n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - self._scores = torch.empty((self._config.max_tokens, ), + self._scores = torch.empty((self._config.max_tokens, self.n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) - self._assignments = torch.empty((self._config.max_tokens, ), + self._assignments = torch.empty((self._config.max_tokens, self.n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) - self._offsets = torch.empty((self._config.max_tokens, ), + self._offsets = torch.empty((self._config.max_tokens, self.n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) # Scatter buffers - self._moe_input = torch.empty((self._config.max_tokens, self._config.model_dim), + self._moe_input = 
torch.empty((self._config.max_tokens * self.n_top_k, self._config.model_dim), dtype=self._config.input_dtype, device=get_accelerator().current_device()) self._expert_cumsum = torch.empty((self._config.n_experts, ), dtype=torch.int64, device=get_accelerator().current_device()) - self._mapped_slots = torch.empty((self._config.max_tokens, ), + self._mapped_slots = torch.empty((self._config.max_tokens, self.n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) # GEMM Buffers - self._intermediate = torch.empty((self._config.max_tokens, self._config.intermediate_features), + self._intermediate = torch.empty((self._config.max_tokens * self.n_top_k, self._config.intermediate_features), dtype=self._config.output_dtype, device=get_accelerator().current_device()) - self._output_unordered = torch.empty((self._config.max_tokens, self._config.model_dim), + if self._activation is not None: + self._gated_intermediate = torch.empty( + (self._config.max_tokens * self.n_top_k, self._config.intermediate_features * 2), + dtype=self._config.output_dtype, + device=get_accelerator().current_device()) + + self._output_unordered = torch.empty((self._config.max_tokens * self.n_top_k, self._config.model_dim), dtype=self._config.output_dtype, device=get_accelerator().current_device()) @@ -167,11 +178,11 @@ def _gate(self, hidden_states: torch.Tensor, batch_metadata: RaggedBatchWrapper, # Get views on the buffers for gating logits = empty_from(self._logits, (hidden_states.shape[0], self._logits.shape[-1])) - scores = empty_from(self._scores, (hidden_states.shape[0], )) - assignments = empty_from(self._assignments, (hidden_states.shape[0], )) - offsets = empty_from(self._offsets, (hidden_states.shape[0], )) - mapped_slots = empty_from(self._mapped_slots, (hidden_states.shape[0], )) - moe_input = empty_from(self._moe_input, (hidden_states.shape[0], self._moe_input.shape[-1])) + scores = empty_from(self._scores, (hidden_states.shape[0], self.n_top_k)) + assignments = empty_from(self._assignments, (hidden_states.shape[0], self.n_top_k)) + offsets = empty_from(self._offsets, (hidden_states.shape[0], self.n_top_k)) + mapped_slots = empty_from(self._mapped_slots, (hidden_states.shape[0], self.n_top_k)) + moe_input = empty_from(self._moe_input, (hidden_states.shape[0] * self.n_top_k, self._moe_input.shape[-1])) self._gate_proj(logits, hidden_states, gate_w) self._expert_counts.zero_() @@ -200,18 +211,31 @@ def forward(self, moe_input, expert_cumsum, scores, mapped_slots = self._gate(hidden_states, batch_metadata, gate_w) # Get views on the buffers for GEMM - intermediate = empty_from(self._intermediate, (hidden_states.shape[0], self._intermediate.shape[-1])) + intermediate = empty_from(self._intermediate, + (hidden_states.shape[0] * self.n_top_k, self._intermediate.shape[-1])) output_unordered = empty_from(self._output_unordered, - (hidden_states.shape[0], self._output_unordered.shape[-1])) + (hidden_states.shape[0] * self.n_top_k, self._output_unordered.shape[-1])) output = empty_from(self._output, (hidden_states.shape[0], self._output.shape[-1])) - self._mlp_1( - intermediate, - moe_input, - mlp_1_w, - expert_cumsum, - mlp_1_b, - ) + if self._activation is not None: + gated_intermediate = empty_from( + self._gated_intermediate, (hidden_states.shape[0] * self.n_top_k, self._gated_intermediate.shape[-1])) + self._mlp_1( + gated_intermediate, + moe_input, + mlp_1_w, + expert_cumsum, + mlp_1_b, + ) + self._activation(intermediate, gated_intermediate) + else: + self._mlp_1( + intermediate, + moe_input, + 
mlp_1_w, + expert_cumsum, + mlp_1_b, + ) self._mlp_2( output_unordered, diff --git a/deepspeed/inference/v2/modules/implementations/unembed/ragged_unembed.py b/deepspeed/inference/v2/modules/implementations/unembed/ragged_unembed.py index 40d70cbd4df7..36130902c665 100644 --- a/deepspeed/inference/v2/modules/implementations/unembed/ragged_unembed.py +++ b/deepspeed/inference/v2/modules/implementations/unembed/ragged_unembed.py @@ -9,8 +9,8 @@ from deepspeed.accelerator import get_accelerator from ....allocator import empty_from -from ....inference_utils import DtypeEnum -from ....kernels.core_ops import CUDAFPLN, BlasLibLinear, CUDARMSNorm +from ....inference_utils import DtypeEnum, ActivationType +from ....kernels.core_ops import CUDAFPLN, BlasLibLinear, CUDARMSNorm, CUDABiasActivation from ....kernels.ragged_ops import RaggedLogitsGather from ....ragged import RaggedBatchWrapper from ...interfaces import DSUnembedBase, DSUnembedRegistry @@ -65,6 +65,8 @@ def __init__(self, config: DSUnembedConfig, implementation_config: Dict[str, Any self._norm = None self._linear = BlasLibLinear(self._config.dtype) + # Here the activation kernel is being used to apply bias, hence the identity activation type! + self._act_fn = CUDABiasActivation(self._config.vocab_size, self._config.dtype, ActivationType.IDENTITY) self._intermediate = torch.empty((self._config.max_sequences, self._config.model_dim), dtype=self._config.dtype, @@ -82,6 +84,7 @@ def forward(self, hidden_states: torch.Tensor, vocab_embedding: torch.Tensor, ragged_metadata: RaggedBatchWrapper, + bias: Optional[torch.Tensor] = None, gamma: Optional[torch.Tensor] = None, beta: Optional[torch.Tensor] = None) -> torch.Tensor: """ @@ -111,5 +114,7 @@ def forward(self, output = empty_from(self._output, (ragged_metadata.current_sequences, self._config.vocab_size)) self._linear(output, cut_down_hidden_states, vocab_embedding) + if bias is not None: + self._act_fn(output, bias) return output diff --git a/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp b/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp index 8a29dd2d5945..ce115f993c3c 100644 --- a/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp +++ b/deepspeed/inference/v2/ragged/csrc/ragged_ops.cpp @@ -23,7 +23,7 @@ copies. 
*/ torch::Tensor allocate_fast_host_buffer(torch::Tensor device_mirror) { -#ifdef __HIP_PLATFORM_HCC__ +#ifdef __HIP_PLATFORM_AMD__ auto options = torch::TensorOptions().device(torch::kCPU).pinned_memory(true).dtype(device_mirror.dtype()); auto buffer = torch::empty(device_mirror.sizes(), options); diff --git a/deepspeed/inference/v2/ragged/kv_cache.py b/deepspeed/inference/v2/ragged/kv_cache.py index 50da350b6506..ceba3190b93c 100644 --- a/deepspeed/inference/v2/ragged/kv_cache.py +++ b/deepspeed/inference/v2/ragged/kv_cache.py @@ -140,9 +140,6 @@ def __init__(self, self._caches = tuple(caches) self._allocators = tuple(allocators) - self._free_blocks = torch.empty(len(self._allocators), dtype=torch.int32, device="cpu") - for i, allocator in enumerate(self._allocators): - self._free_blocks[i] = allocator.free_blocks def reserve(self, num_blocks: int, cache_group: int = 0) -> torch.Tensor: """ @@ -201,9 +198,7 @@ def free_blocks(self) -> torch.Tensor: """ Return the number of free blocks in each cache """ - for i, allocator in enumerate(self._allocators): - self._free_blocks[i] = allocator.free_blocks - return self._free_blocks + return [allocator.free_blocks for allocator in self._allocators] @property def num_caches(self) -> int: diff --git a/deepspeed/inference/v2/ragged/manager_configs.py b/deepspeed/inference/v2/ragged/manager_configs.py index a5e98e5bcef1..17283b8bc0c4 100644 --- a/deepspeed/inference/v2/ragged/manager_configs.py +++ b/deepspeed/inference/v2/ragged/manager_configs.py @@ -6,7 +6,7 @@ from enum import Enum from typing import Tuple -from deepspeed.pydantic_v1 import PositiveInt, validator +from pydantic import PositiveInt, model_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel from ..inference_utils import DtypeEnum @@ -173,11 +173,9 @@ class DSStateManagerConfig(DeepSpeedConfigModel): Enable tracking for offloading KV-cache to host memory. Currently unsupported. """ - @validator("max_ragged_sequence_count") - def max_ragged_sequence_count_validator(cls, v: int, values: dict): + @model_validator(mode="after") + def max_ragged_sequence_count_validator(self): # If the attributes below failed their validation they won't appear in the values dict. - if "max_tracked_sequences" in values and v > values["max_tracked_sequences"]: - raise ValueError("max_ragged_sequence_count must be less than max_tracked_sequences") - if "max_ragged_batch_size" in values and v > values["max_ragged_batch_size"]: - raise ValueError("max_ragged_sequence_count must be less than max_ragged_batch_size") - return v + assert self.max_ragged_sequence_count <= self.max_tracked_sequences, "max_ragged_sequence_count must be less than max_tracked_sequences" + assert self.max_ragged_sequence_count <= self.max_ragged_batch_size, "max_ragged_sequence_count must be less than max_ragged_batch_size" + return self diff --git a/deepspeed/inference/v2/ragged/ragged_manager.py b/deepspeed/inference/v2/ragged/ragged_manager.py index 8eff4560b4d0..ecc3c52a5834 100644 --- a/deepspeed/inference/v2/ragged/ragged_manager.py +++ b/deepspeed/inference/v2/ragged/ragged_manager.py @@ -127,10 +127,7 @@ def get_sequence(self, uid: int) -> Optional[DSSequenceDescriptor]: Get the sequence descriptor for the given sequence id. If the sequence does not exist, then None is returned. 
""" - if uid not in self._seqs: - return None - - return self._seqs[uid] + return self._seqs.get(uid, None) def get_or_create_sequence(self, uid: int) -> DSSequenceDescriptor: """ @@ -139,8 +136,9 @@ def get_or_create_sequence(self, uid: int) -> DSSequenceDescriptor: if one may be allocated and should not be used from APIs that are attempting to test the schedulability of a hypothetical batch. """ - if uid in self._seqs: - return self._seqs[uid] + seq = self.get_sequence(uid) + if seq is not None: + return seq else: return self._create_sequence(uid) diff --git a/deepspeed/inference/v2/ragged/ragged_wrapper.py b/deepspeed/inference/v2/ragged/ragged_wrapper.py index 15abd1987aa1..056ecfa2ac40 100644 --- a/deepspeed/inference/v2/ragged/ragged_wrapper.py +++ b/deepspeed/inference/v2/ragged/ragged_wrapper.py @@ -113,12 +113,23 @@ def __init__(self, config: DSStateManagerConfig) -> None: # Default behavior should be no padding self._is_padded = False + self._current_tokens = 0 + self._current_sequences = 0 + self._batch_tokens = [] + self._inflight_seq_descriptors_shadow_buf = [] + self._kv_blocks_ptr_buf = [] + self._token_to_seq_storage_shadow_buf = [] + def clear(self) -> None: """ Clear the ragged batch. This will reset the number of tokens and sequences to 0. """ - self._batch_metadata_storage_shadow[0] = 0 - self._batch_metadata_storage_shadow[1] = 0 + self._current_tokens = 0 + self._current_sequences = 0 + self._batch_tokens = [] + self._inflight_seq_descriptors_shadow_buf = [] + self._kv_blocks_ptr_buf = [] + self._token_to_seq_storage_shadow_buf = [] def insert_sequence(self, seq_descriptor: DSSequenceDescriptor, tokens: torch.Tensor, do_checks=True) -> None: """ @@ -140,18 +151,23 @@ def insert_sequence(self, seq_descriptor: DSSequenceDescriptor, tokens: torch.Te if do_checks and self.current_tokens + seq_tokens > self._config.max_ragged_batch_size: raise RuntimeError(f"Ragged batch is full due to capacity limit: {self._config.max_ragged_batch_size})") - self._input_ids_shadow[self.current_tokens:self.current_tokens + seq_tokens].copy_(tokens) - self._token_to_seq_storage_shadow[self.current_tokens:self.current_tokens + seq_tokens].fill_( - self.current_sequences) + # The values in _inflight_seq_descriptors_shadow_buf, _token_to_seq_storage_shadow_buf, _kv_blocks_ptr_buf, etc., + # are ultimately stored in PyTorch tensors: _inflight_seq_descriptors_shadow, _token_to_seq_storage_shadow, _kv_ptrs_shadow, etc. + # However, we found it inefficient to iterate over and substitute values into tensor slices or to use copy/fill calls for this purpose. + # Therefore, we initially store the values in Python lists or primitive data types and then copy them collectively in the finalize() method, + # instead of updating the tensors directly in each iteration. 
+ self._batch_tokens.append(tokens) + self._inflight_seq_descriptors_shadow_buf.append(self.current_tokens) + self._inflight_seq_descriptors_shadow_buf.append(seq_tokens) + self._inflight_seq_descriptors_shadow_buf.append(seq_descriptor.seen_tokens) + self._inflight_seq_descriptors_shadow_buf.append(0) # alignment - self._inflight_seq_descriptors_shadow[self.current_sequences][0] = self.current_tokens - self._inflight_seq_descriptors_shadow[self.current_sequences][1] = seq_tokens - self._inflight_seq_descriptors_shadow[self.current_sequences][2] = seq_descriptor.seen_tokens + self._token_to_seq_storage_shadow_buf.extend([self.current_sequences] * seq_tokens) - self._kv_ptrs_shadow[self.current_sequences] = seq_descriptor.kv_blocks_ptr + self._kv_blocks_ptr_buf.append(seq_descriptor.kv_blocks_ptr) - self._batch_metadata_storage_shadow[0] += seq_tokens - self._batch_metadata_storage_shadow[1] += 1 + self._current_tokens += seq_tokens + self._current_sequences += 1 @property def tensor_toks(self) -> torch.Tensor: @@ -171,6 +187,15 @@ def finalize(self, padding: Optional[bool] = False) -> None: """ cur_toks = self.current_tokens + # Batch-copy the values recorded in insert_sequence() into PyTorch tensors to enhance efficiency. + self._inflight_seq_descriptors_shadow.flatten()[:len(self._inflight_seq_descriptors_shadow_buf)].copy_( + torch.tensor(self._inflight_seq_descriptors_shadow_buf)) + self._input_ids_shadow[:self.current_tokens].copy_(torch.cat(self._batch_tokens, dim=0)) + self._token_to_seq_storage_shadow[:len(self._token_to_seq_storage_shadow_buf)].copy_( + torch.tensor(self._token_to_seq_storage_shadow_buf)) + self._kv_ptrs_shadow[:len(self._kv_blocks_ptr_buf)].copy_(torch.tensor(self._kv_blocks_ptr_buf)) + self._batch_metadata_storage_shadow.copy_(torch.tensor([cur_toks, self.current_sequences])) + if padding: padded_toks = to_padded(cur_toks) self._input_ids_shadow[cur_toks:padded_toks].fill_(-1) @@ -256,7 +281,7 @@ def current_tokens(self) -> int: The number of tokens in the in-flight ragged batch. This will not trigger synchronization with the device. """ - return self._batch_metadata_storage_shadow[0].item() + return self._current_tokens @property def current_sequences(self) -> int: @@ -264,4 +289,4 @@ def current_sequences(self) -> int: The number of sequences in the in-flight ragged batch. This will not trigger synchronization with the device. """ - return self._batch_metadata_storage_shadow[1].item() + return self._current_sequences diff --git a/deepspeed/inference/v2/ragged/sequence_descriptor.py b/deepspeed/inference/v2/ragged/sequence_descriptor.py index c8a0c20764f6..6b9f65255eec 100644 --- a/deepspeed/inference/v2/ragged/sequence_descriptor.py +++ b/deepspeed/inference/v2/ragged/sequence_descriptor.py @@ -168,7 +168,11 @@ def cur_allocated_blocks(self, cache_group: int = 0) -> int: Arguments: cache_group (int): The cache group to query. """ - return self._blocks_per_allocation_group[cache_group].sum() + # Currently, there is only one allocation group. + # A shortcut is used here to bypass the overhead of sum(). 
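To make the shortcut in the comment above concrete, here is a toy comparison (illustrative only; the real descriptor keeps one int32 tensor per allocation group):

```python
import torch

blocks_per_allocation_group = [torch.tensor([7], dtype=torch.int32)]  # single group

# General path: reduce over the group, then read the result back as a Python int.
general = blocks_per_allocation_group[0].sum().item()

# Single-group fast path: the tensor holds one element, so .item() alone suffices
# and the reduction is skipped entirely.
fast = blocks_per_allocation_group[0].item()

assert general == fast == 7
```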
+ if len(self._blocks_per_allocation_group) == 1: + return self._blocks_per_allocation_group[0].item() + return self._blocks_per_allocation_group[cache_group].sum().item() def kv_cache_ids(self, cache_group: int = 0, on_device: bool = False) -> torch.Tensor: """ diff --git a/deepspeed/launcher/constants.py b/deepspeed/launcher/constants.py index 366ae0b236f8..5bd6ee489def 100644 --- a/deepspeed/launcher/constants.py +++ b/deepspeed/launcher/constants.py @@ -7,6 +7,7 @@ PDSH_MAX_FAN_OUT = 1024 OPENMPI_LAUNCHER = 'openmpi' +JSRUN_LAUNCHER = 'jsrun' MPICH_LAUNCHER = 'mpich' IMPI_LAUNCHER = 'impi' SLURM_LAUNCHER = 'slurm' diff --git a/deepspeed/launcher/launch.py b/deepspeed/launcher/launch.py index b3cb9d1205a9..079182a3b45b 100755 --- a/deepspeed/launcher/launch.py +++ b/deepspeed/launcher/launch.py @@ -22,6 +22,7 @@ from collections import defaultdict from typing import Dict from argparse import ArgumentParser, REMAINDER +from deepspeed.accelerator import get_accelerator from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import DLTS_POD_ENV_PATH from ..utils import logger, get_numactl_cmd @@ -146,8 +147,8 @@ def main(): node_list = list(world_info.keys()) args.nnodes = len(node_list) local_node = node_list[args.node_rank] - local_gpu_ids = world_info[local_node] - num_local_procs = len(local_gpu_ids) + local_accelerator_ids = world_info[local_node] + num_local_procs = len(local_accelerator_ids) logger.info(f"nnodes={args.nnodes}, num_local_procs={num_local_procs}, node_rank={args.node_rank}") global_rank_mapping = defaultdict(list) @@ -161,8 +162,10 @@ def main(): curr_global_rank += 1 logger.info(f"global_rank_mapping={global_rank_mapping}") logger.info(f"dist_world_size={dist_world_size}") - current_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, local_gpu_ids)) - logger.info(f"Setting CUDA_VISIBLE_DEVICES={current_env['CUDA_VISIBLE_DEVICES']}") + + get_accelerator().set_visible_devices_envs(current_env, local_accelerator_ids) + for env in get_accelerator().visible_devices_envs(): + logger.info(f"Setting {env}={current_env[env]}") # set PyTorch distributed related environmental variables current_env["MASTER_ADDR"] = args.master_addr @@ -249,7 +252,8 @@ def main(): process = subprocess.Popen(cmd, env=current_env, stdout=log_fd, stderr=log_fd) else: process = subprocess.Popen(cmd, env=current_env) - + # logs the command from processes + logger.info(f"process {process.pid} spawned with command: {cmd}") processes.append(process) else: from ..elasticity import DSElasticAgent diff --git a/deepspeed/launcher/launcher_helper.py b/deepspeed/launcher/launcher_helper.py new file mode 100644 index 000000000000..05ce14bcc52e --- /dev/null +++ b/deepspeed/launcher/launcher_helper.py @@ -0,0 +1,108 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import sys +import argparse +import subprocess +from deepspeed.utils import logger +from deepspeed.launcher.constants import MPICH_LAUNCHER + + +def parse_args(args=None): + parser = argparse.ArgumentParser(description="DeepSpeed launcher helper to map environment variables for" + "multi-node/multi-gpu training jobs.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("--launcher", + default=MPICH_LAUNCHER, + type=str, + help="(optional) choose launcher backend for multi-node " + "training. 
Options currently include MPICH.") + + parser.add_argument("--module", + action="store_true", + help="Change each process to interpret the launch " + "script as a Python module, executing with the same " + "behavior as 'python -m'.") + + parser.add_argument("--no_python", + action="store_true", + help="Skip prepending the training script with " + "'python' - just execute it directly.") + + parser.add_argument("user_script", type=str, help="User script to launch, followed by any required " + "arguments.") + + parser.add_argument('user_args', nargs=argparse.REMAINDER) + + parser.add_argument("--bind_cores_to_rank", + action="store_true", + help="Bind each rank to different cores of the host") + + parser.add_argument("--bind_core_list", + type=str, + default=None, + help="List of cores to bind to with comma separated list of " + "numbers and range. i.e. 1,3-5,7 => [1,3,4,5,7]. When not " + "specified, all cores on system would be used rank binding") + + return parser.parse_args(args=args) + + +def env_mapping(env, rank_name_list=None, local_rank_name_list=None): + rank = None + for rank_name in rank_name_list: + if rank_name in env: + if rank == None: + rank = env.get(rank_name) + elif rank != env.get(rank_name): + raise EnvironmentError(f"rank number doesn't match!") + if rank == None: + raise EnvironmentError(f"rank number is not in current env!") + env['RANK'] = rank + + local_rank = None + for local_rank_name in local_rank_name_list: + if local_rank_name in env: + if local_rank == None: + local_rank = env.get(local_rank_name) + elif local_rank != env.get(local_rank_name): + raise EnvironmentError(f"local_rank number doesn't match!") + if local_rank == None: + raise EnvironmentError(f"rank number is not in current env!") + env['LOCAL_RANK'] = local_rank + + return env + + +def main(args=None): + args = parse_args(args) + + env = os.environ.copy() + + args.launcher = args.launcher.lower() + if args.launcher == MPICH_LAUNCHER: + rank_name_list = ["PMIX_RANK"] + ["PMI_RANK"] + local_rank_name_list = ["PALS_LOCAL_RANKID"] + ["MPI_LOCALRANKID"] + env = env_mapping(env, rank_name_list=rank_name_list, local_rank_name_list=local_rank_name_list) + else: + raise NotImplementedError(f"Unknown launcher {args.launcher}") + + python_exec = [] + if not args.no_python: + python_exec += [sys.executable, "-u"] + if args.module: + python_exec.append("-m") + cmd = python_exec + [args.user_script] + args.user_args + + logger.info(f"launcher_helper cmd = {' '.join(cmd)}") + + result = subprocess.Popen(cmd, env=env, close_fds=False) + result.wait() + + +if __name__ == "__main__": + main() diff --git a/deepspeed/launcher/multinode_runner.py b/deepspeed/launcher/multinode_runner.py index 730146f5bcd2..e72d47f904ac 100644 --- a/deepspeed/launcher/multinode_runner.py +++ b/deepspeed/launcher/multinode_runner.py @@ -3,6 +3,7 @@ # DeepSpeed Team +import json import os import sys import shutil @@ -34,7 +35,7 @@ def get_cmd(self, environment, active_resources): """Return the command to execute on node""" def add_export(self, key, var): - self.exports[key.strip()] = var.strip() + self.exports[key.strip()] = f"\"{var.strip()}\"" def parse_user_args(self): return self.args.user_args @@ -56,17 +57,26 @@ def __init__(self, args, world_info_base64): def backend_exists(self): return shutil.which('pdsh') + def parse_user_args(self): + processed_args = [] + for arg in self.args.user_args: + # With pdsh, if we are passing a string as an argument, it will get + # split on whitespace. 
To avoid this and support strings that + # contain '"', we do this extra processing step: + if " " in arg: + arg = '"{}"'.format(arg.replace('"', '\\"')) + processed_args.append(arg) + return processed_args + @property def name(self): return "pdsh" - def parse_user_args(self): - return list(map(lambda x: x if x.startswith("-") else f"'{x}'", self.args.user_args)) - def get_cmd(self, environment, active_resources): environment['PDSH_RCMD_TYPE'] = 'ssh' if self.args.ssh_port is not None: # only specify ssh port if it is specified - environment["PDSH_SSH_ARGS_APPEND"] += f" -p {self.args.ssh_port}" + environment["PDSH_SSH_ARGS_APPEND"] = f"{environment.get('PDSH_SSH_ARGS_APPEND', '')} \ + -p {self.args.ssh_port}" active_workers = ",".join(active_resources.keys()) logger.info("Running on the following workers: %s" % active_workers) @@ -95,6 +105,8 @@ def get_cmd(self, environment, active_resources): deepspeed_launch.append("--no_local_rank") if self.args.save_pid: deepspeed_launch += ["--save_pid", f"{os.getpid()}"] + if self.args.enable_each_rank_log: + deepspeed_launch.append(f"--enable_each_rank_log={self.args.enable_each_rank_log}") if self.args.elastic_training: deepspeed_launch.append("--enable_elastic_training") deepspeed_launch.append(f"--max_elastic_nodes={self.args.max_elastic_nodes}") @@ -123,28 +135,77 @@ def name(self): def validate_args(self): super().validate_args() + + # Validate and set MPI environment variables + # Quentin suggestion + # self._setup_mpi_environment() + #TODO: Allow for include/exclude at node-level but not gpu-level if self.args.include != "" or self.args.exclude != "": raise ValueError(f"{self.name} backend does not support worker include/exclusion") if self.args.num_nodes != -1 or self.args.num_gpus != -1: raise ValueError(f"{self.name} backend does not support limiting num nodes/gpus") + def _setup_mpi_environment(self): + """Sets up MPI-related environment variables or raises an error if they're missing.""" + + required_vars = ['OMPI_COMM_WORLD_LOCAL_RANK', 'OMPI_COMM_WORLD_RANK', 'OMPI_COMM_WORLD_SIZE'] + + # Check if all these are present + if not all(var in os.environ for var in required_vars): + raise EnvironmentError("MPI environment variables are not set. " + "Ensure you are running the script with an MPI-compatible launcher.") + + # Now safe to read all + os.environ['LOCAL_RANK'] = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] + os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] + def get_cmd(self, environment, active_resources): total_process_count = sum(self.resource_pool.values()) + launcher_args = split(self.args.launcher_args) + + # If btl_tcp_if_include option is provided through launcher_args, we use it. Otherwise, we add + # `--mca btl_tcp_if_include eth0` option as a default value for compatibility. 
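The hunk that follows implements this default handling; restated as a standalone function with example inputs (an illustrative sketch, assuming the module's usual `from shlex import split`):

```python
from shlex import split

DEFAULT_BTL_TCP_OPT = ['--mca', 'btl_tcp_if_include', 'eth0']


def effective_btl_tcp_opt(launcher_args_str: str):
    # Drop the default only when the user already supplied their own
    # `--mca btl_tcp_if_include ...` through --launcher_args.
    args = split(launcher_args_str)
    for i in range(len(args) - 1):
        if args[i] in ('-mca', '--mca') and args[i + 1] == 'btl_tcp_if_include':
            return []
    return DEFAULT_BTL_TCP_OPT


print(effective_btl_tcp_opt(''))                              # default eth0 option is kept
print(effective_btl_tcp_opt('--mca btl_tcp_if_include ib0'))  # [] -> the user's choice wins
```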
+ btl_tcp_opt = ['--mca', 'btl_tcp_if_include', 'eth0'] + if len(launcher_args) >= 2: + for i in range(len(launcher_args) - 1): + if launcher_args[i] in ['-mca', '--mca'] and launcher_args[i + 1] == 'btl_tcp_if_include': + btl_tcp_opt = [] + break + + # TODO: mpirun --allow-run-as-root -np 32 -x PATH -x LD_LIBRARY_PATH --bind-to none -mca btl tcp,self -mca coll_hcoll_enable 0 -x NCCL_IB_AR_THRESHOLD=0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_IB_SPLIT_DATA_ON_QPS=0 -x NCCL_IB_QPS_PER_CONNECTION=2 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -mca plm_rsh_args "-p 2222" --hostfile /root/hostfile ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1 mpirun_cmd = [ 'mpirun', - '-n', + '--allow-run-as-root', + '-np', f'{total_process_count}', '-hostfile', f'{self.args.hostfile}', - '--mca', - 'btl', - '^openib', - '--mca', - 'btl_tcp_if_include', - 'eth0', - ] + split(self.args.launcher_args) + '-mca', + 'btl tcp,self', + '-mca', + 'coll_hcoll_enable 0', + '-mca', + 'plm_rsh_args "-p 2222"', + '-x', + 'PATH', + '-x', + 'LD_LIBRARY_PATH', + '-x', + 'NCCL_IB_AR_THRESHOLD=0', + '-x', + 'NCCL_IB_PCI_RELAXED_ORDERING=1', + '-x', + 'NCCL_IB_SPLIT_DATA_ON_QPS=0', + '-x', + 'NCCL_IB_QPS_PER_CONNECTION=2', + '-x', + 'CUDA_DEVICE_ORDER=PCI_BUS_ID', + '--bind-to', + 'none', + ] + btl_tcp_opt + launcher_args export_cmd = [] for k, v in self.exports.items(): @@ -159,6 +220,61 @@ def get_cmd(self, environment, active_resources): return mpirun_cmd + export_cmd + python_exec + [self.user_script] + self.user_arguments +class JSRunner(MultiNodeRunner): + def __init__(self, args, world_info_base64, resource_pool): + super().__init__(args, world_info_base64) + self.resource_pool = resource_pool + self.add_export('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5') + + def backend_exists(self): + #TODO: if IB is available we should suggestion mvapich + #This ompi check will still work for jsrun since spectrum-mpi is based on ompi + return shutil.which('ompi_info') + + @property + def name(self): + return "jsrun" + + def validate_args(self): + super().validate_args() + #TODO: Allow for include/exclude at node-level but not gpu-level + if self.args.include != "" or self.args.exclude != "": + raise ValueError( + f"{self.name} backend does not support worker include/exclusion") + if self.args.num_nodes != -1 or self.args.num_gpus != -1: + raise ValueError( + f"{self.name} backend does not support limiting num nodes/gpus") + + def get_cmd(self, environment, active_resources): + total_process_count = sum(self.resource_pool.values()) + + jsrun_cmd = [ + 'jsrun', + '-n', + f'{total_process_count}', + '-c', + f'{7}', + '-g', + f'{1}', + '-a', + f'{1}', + + ] + split(self.args.launcher_args) + + export_cmd = [] + for k, v in self.exports.items(): + export_cmd += ['-E', "{}={}".format(k, v)] + + python_exec = [] + if not self.args.no_python: + python_exec = [sys.executable, "-u"] + if self.args.module: + python_exec.append("-m") + + return jsrun_cmd + export_cmd + python_exec + [self.user_script + ] + self.user_arguments + + class MPICHRunner(MultiNodeRunner): def __init__(self, args, world_info_base64, resource_pool): @@ -191,6 +307,10 @@ def get_cmd(self, environment, active_resources): mpirun_cmd = [ 'mpirun', + '-n', + f'{total_process_count}', + '-ppn', + f'{process_per_node}', ] + split(self.args.launcher_args) export_cmd = [] @@ -202,32 +322,29 @@ def get_cmd(self, environment, active_resources): export_cmd += ['-genv', 'WORLD_SIZE', str(total_process_count)] export_cmd += ['-genv', 'LOCAL_SIZE', str(process_per_node)] - hosts = 
list(self.resource_pool.keys()) - - per_host_cmd = [] - host_id = 0 - host_count = 0 - for i in range(total_process_count): - local_rank = i % process_per_node - python_exec = [] - if not self.args.no_python: - python_exec += [sys.executable, "-u"] - if self.args.module: - python_exec.append("-m") - env_mapping = ['-env', 'RANK', str(i)] - env_mapping += ['-env', 'LOCAL_RANK', str(local_rank)] + export_cmd += ['-hosts'] + hosts = "" + for i, host in enumerate(self.resource_pool.keys()): if i == 0: - per_host_cmd = ['-n', '1', '-host', hosts[host_id] - ] + env_mapping + python_exec + [self.user_script] + self.user_arguments + hosts = f"{host}" else: - per_host_cmd = per_host_cmd + [':', '-n', '1', '-host', hosts[host_id] - ] + env_mapping + python_exec + [self.user_script] + self.user_arguments - host_count = host_count + 1 - if host_count == process_per_node: - host_id = host_id + 1 - host_count = 0 + hosts += f",{host}" + export_cmd += [hosts] - return mpirun_cmd + export_cmd + per_host_cmd + helper_args = ["--launcher"] + [self.args.launcher] + python_exec = [] + if not self.args.no_python: + python_exec += [sys.executable, "-u"] + if self.args.module: + python_exec.append("-m") + helper_args.append("--module") + else: + helper_args.append("--no_python") + + helper_cmd = str(os.path.dirname(os.path.realpath(__file__))) + '/launcher_helper.py' + helper_cmd = [helper_cmd] + helper_args + [self.user_script] + self.user_arguments + + return mpirun_cmd + export_cmd + python_exec + helper_cmd class IMPIRunner(MultiNodeRunner): @@ -324,6 +441,35 @@ def __init__(self, args, world_info_base64, resource_pool): def backend_exists(self): return shutil.which('sinfo') + def parse_user_args(self): + user_args = [] + for arg in self.args.user_args: + if arg.startswith('{') and arg.endswith('}'): + try: + arg_dict = json.loads(arg) + if 'config_files' in arg_dict: + config_files = {} + for k, v in arg_dict.get('config_files', {}).items(): + config_files[k] = json.loads(v) + arg_dict['config_files'] = config_files + except json.JSONDecodeError as jde: + raise ValueError( + 'SLURM is picky and needs you to use plain json for your configs. 
Check for comments and lowercase trues' + ) from jde + arg = json.dumps(arg_dict, separators=(',', ':')) + user_args.append(arg) + return user_args + + @staticmethod + def _pdsh_include_to_nodelist(include_string: str): + """If an `--include` string of the form `node1@node2` has been passed in, transforms it to a format SLURM will accept.""" + NODE_SEP = '@' + SLOT_LIST_START = ':' + if NODE_SEP not in include_string: + return include_string + if SLOT_LIST_START in include_string: + raise NotImplementedError('Currently only allocating whole nodes is supported while using the SLURM launcher.') + return include_string.replace(NODE_SEP, ',') @property def name(self): return 'slurm' @@ -338,15 +484,16 @@ def get_cmd(self, environment, active_resources): f'{total_process_count}', ] + split(self.args.launcher_args) - if getattr(self.args, 'slurm_comment', ''): - srun_cmd += ['--comment', self.args.slurm_comment] + if getattr(self.args, 'comment', ''): + srun_cmd += ['--comment', self.args.comment] + + if getattr(self.args, 'account', ''): + srun_cmd += ['--account', self.args.account] if self.args.include != "": - srun_cmd.append('--include') - srun_cmd.append(f'{self.args.include}') - if self.args.exclude != "": - srun_cmd.append('--exclude') - srun_cmd.append(f'{self.args.exclude}') + srun_cmd.append('--nodelist') + srun_cmd.append(self._pdsh_include_to_nodelist(self.args.include)) + if self.args.num_nodes > 0: srun_cmd.append('--nodes') srun_cmd.append(f'{self.args.num_nodes}') @@ -396,7 +543,7 @@ def backend_exists(self): if not mpiname_exists: warnings.warn("mpiname does not exist, mvapich is not installed properly") else: - results = subprocess.check_output('mpiname', shell=True) + results = subprocess.check_output(['mpiname']) mpiname_results = results.decode('utf-8').strip() if "MVAPICH2-GDR" in mpiname_results: exists = True diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py index a7fa2b5053e5..b75098e6489f 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py @@ -12,7 +12,6 @@ import os import re import sys -import shlex import json import base64 import argparse @@ -21,9 +20,12 @@ from copy import deepcopy import signal import time +from typing import Tuple, List, Dict +from collections import defaultdict +import shlex -from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, IMPIRunner -from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER, IMPI_LAUNCHER +from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, JSRunner, IMPIRunner +from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER, JSRUN_LAUNCHER, IMPI_LAUNCHER from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT from ..nebula.constants import NEBULA_EXPORT_ENVS from ..utils import logger @@ -32,7 +34,7 @@ from deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" -EXPORT_ENVS = ['MLFLOW', 'NCCL', 'PYTHON', 'MV2', 'UCX'] +EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] @@ -118,6 +120,12 @@ def parse_args(args=None): help="(optional) IP address of node 0, will be " "inferred via 'hostname -I' if not specified.") + parser.add_argument("--node_rank", + default=-1, + type=int, + help="ID of each node in the range [0:N). 
" + "Only required when --no_ssh is set.") + parser.add_argument("--launcher", default=PDSH_LAUNCHER, type=str, @@ -146,6 +154,10 @@ def parse_args(args=None): help="Do not pass local_rank as an argument when calling " "the user's training script.") + parser.add_argument("--no_ssh", + action="store_true", + help="Launch training independently on each node without ssh setup.") + parser.add_argument("--no_ssh_check", action="store_true", help="Do not perform ssh check in multi-node launcher model") @@ -173,6 +185,20 @@ def parse_args(args=None): help="Run DeepSpeed autotuner to discover optimal configuration parameters " "before running job.") + parser.add_argument( + "--comment", + default="", + type=str, + help="A comment that can be used for metadata. Used to pass --comment argument to srun in Slurm launcher" + ) + + parser.add_argument( + "--account", + default="", + type=str, + help="Used to pass --account argument to srun in Slurm launcher" + ) + parser.add_argument("--elastic_training", action="store_true", help="Enable elastic training support in DeepSpeed.") @@ -193,7 +219,7 @@ def parse_args(args=None): "numbers and range. i.e. 1,3-5,7 => [1,3,4,5,7]. When not " "specified, all cores on system would be used rank binding") - parser.add_argument("--ssh_port", type=int, default=None, help="SSH port to use for remote connections") + parser.add_argument("--ssh_port", type=int, default=2222, help="SSH port to use for remote connections") return parser.parse_args(args=args) @@ -253,6 +279,31 @@ def _stable_remove_duplicates(data): return new_list +def parse_node_config(node_config: str) -> Tuple[str, List[int]]: + SLOT_LIST_START = ':' + SLOT_SEP = ',' + + if SLOT_LIST_START not in node_config: + return node_config, [] + + hostname, slots = node_config.split(SLOT_LIST_START) + slots = [int(x) for x in slots.split(SLOT_SEP)] + + return hostname, slots + + +def parse_node_config_list(node_config_list: List[str]) -> Dict[str, List[int]]: + NODE_SEP = '@' + + node_configs = defaultdict(list) + + for node_config in node_config_list.split(NODE_SEP): + hostname, slots = parse_node_config(node_config) + node_configs[hostname] += slots + + return {k: sorted(list(set(v))) for k, v in node_configs.items()} + + def parse_resource_filter(host_info, include_str="", exclude_str=""): '''Parse an inclusion or exclusion string and filter a hostfile dictionary. @@ -267,11 +318,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): slot 0 on worker-1. 
''' - # Constants that define our syntax - NODE_SEP = '@' - SLOT_LIST_START = ':' - SLOT_SEP = ',' - # Ensure include/exclude are mutually exclusive if (include_str != "") and (exclude_str != ""): raise ValueError('include_str and exclude_str are mutually exclusive.') @@ -289,12 +335,9 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): parse_str = exclude_str # foreach node in the list - for node_config in parse_str.split(NODE_SEP): + for hostname, slots in parse_node_config_list(parse_str).items(): # Node can either be alone or node:slot,slot,slot - if SLOT_LIST_START in node_config: - hostname, slots = node_config.split(SLOT_LIST_START) - slots = [int(x) for x in slots.split(SLOT_SEP)] - + if len(slots) > 0: # sanity checks if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -312,7 +355,6 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): # User just specified the whole node else: - hostname = node_config # sanity check hostname if hostname not in host_info: raise ValueError(f"Hostname '{hostname}' not found in hostfile") @@ -345,8 +387,10 @@ def parse_resource_filter(host_info, include_str="", exclude_str=""): def parse_inclusion_exclusion(resource_pool, inclusion, exclusion): active_resources = collections.OrderedDict() + node_configs = parse_node_config_list(inclusion) + for hostname, slots in resource_pool.items(): - active_resources[hostname] = list(range(slots)) + active_resources[hostname] = node_configs[hostname] if hostname in node_configs else list(range(slots)) return parse_resource_filter(active_resources, include_str=inclusion, exclude_str=exclusion) @@ -389,26 +433,24 @@ def parse_num_nodes(str_num_nodes: str, elastic_training: bool): def main(args=None): args = parse_args(args) - # For when argparse interprets remaining args as a single string - args.user_args = shlex.split(" ".join(list(map(lambda x: x if x.startswith("-") else f'"{x}"', args.user_args)))) - if args.elastic_training: assert args.master_addr != "", "Master Addr is required when elastic training is enabled" resource_pool = fetch_hostfile(args.hostfile) - # respect CUDA_VISIBLE_DEVICES for a single node and no explicit resource filters - cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "") - if not resource_pool and len(cuda_visible_devices): - detected_str = f"Detected CUDA_VISIBLE_DEVICES={cuda_visible_devices}" + # respect VISIBLE_DEVICES for a single node and no explicit resource filters + visible_devices_env = get_accelerator().visible_devices_envs()[0] + visible_devices = os.environ.get(visible_devices_env, "") + if not resource_pool and len(visible_devices): + detected_str = f"Detected VISIBLE_DEVICES={visible_devices}" if len(args.include) or len(args.exclude) or args.num_nodes > 1 or args.num_gpus > 0: print( f"{detected_str} but ignoring it because one or several of --include/--exclude/--num_gpus/--num_nodes cl args were used. If you want to use CUDA_VISIBLE_DEVICES don't pass any of these arguments to deepspeed." 
) else: - args.include = f"localhost:{cuda_visible_devices}" + args.include = f"localhost:{visible_devices}" print(f"{detected_str}: setting --include={args.include}") - del os.environ["CUDA_VISIBLE_DEVICES"] + del os.environ[visible_devices_env] if args.num_nodes >= 0 or args.num_gpus >= 0: if args.include != "" or args.exclude != "": @@ -431,14 +473,15 @@ def main(args=None): env = os.environ.copy() # validate that passwordless-ssh is workly properly with this hostfile - if multi_node_exec and not args.no_ssh_check: + if multi_node_exec and not args.no_ssh_check and not args.no_ssh: first_host = list(active_resources.keys())[0] try: ssh_check_cmd = "ssh -o PasswordAuthentication=no " if args.ssh_port is not None: ssh_check_cmd += f"-p {args.ssh_port} " ssh_check_cmd += f"{first_host} hostname" - subprocess.check_call(ssh_check_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL, shell=True) + safe_ssh_cmd = shlex.split(ssh_check_cmd) + # subprocess.check_call(safe_ssh_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) except subprocess.CalledProcessError: raise RuntimeError( f"Using hostfile at {args.hostfile} but host={first_host} was not reachable via ssh. If you are running with a single node please remove {args.hostfile} or setup passwordless ssh." @@ -447,9 +490,13 @@ def main(args=None): if not args.master_addr: assert multi_node_exec first_host = list(active_resources.keys())[0] - hostname_cmd = [f"ssh {first_host} hostname -I"] + ssh_check_cmd = "ssh " + if args.ssh_port is not None: + ssh_check_cmd += f" -p {args.ssh_port}" + ssh_check_cmd += f" {first_host} hostname -I" + hostname_cmd = shlex.split(ssh_check_cmd) try: - result = subprocess.check_output(hostname_cmd, shell=True) + result = subprocess.check_output(hostname_cmd) except subprocess.CalledProcessError as err: logger.error( "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr" @@ -458,7 +505,7 @@ def main(args=None): args.master_addr = result.decode('utf-8').split()[0] if not args.master_addr: raise RuntimeError( - f"Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr" + "Unable to detect suitable master address via `hostname -I`, please manually specify one via --master_addr" ) logger.info(f"Using IP address of {args.master_addr} for node {first_host}") @@ -483,16 +530,22 @@ def main(args=None): if args.elastic_training: assert not args.no_local_rank, "--no_local_rank argument is not supported in Elastic training" + if args.no_ssh: + assert (0 <= args.node_rank < + len(active_resources)), "Launching training without ssh, but --node_rank is not set correctly." 
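The master-address lookup now builds its ssh command as an argument list via shlex.split instead of passing a string with shell=True. A minimal standalone sketch of that pattern, with an illustrative helper name and hostname:

import shlex
import subprocess

def probe_master_addr(first_host: str, ssh_port=None) -> str:
    """Return the first IP reported by `hostname -I` on first_host (sketch only)."""
    ssh_cmd = "ssh "
    if ssh_port is not None:
        ssh_cmd += f"-p {ssh_port} "
    ssh_cmd += f"{first_host} hostname -I"
    # shlex.split yields an argv list, avoiding shell=True and its quoting pitfalls
    result = subprocess.check_output(shlex.split(ssh_cmd))
    return result.decode("utf-8").split()[0]

# e.g. master_addr = probe_master_addr("worker-0", ssh_port=2222)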
+ # encode world info as base64 to make it easier to pass via command line world_info_base64 = encode_world_info(active_resources) - multi_node_exec = args.force_multi or len(active_resources) > 1 + multi_node_exec = (args.force_multi or len(active_resources) > 1) and not args.no_ssh if not multi_node_exec: deepspeed_launch = [ sys.executable, "-u", "-m", "deepspeed.launcher.launch", f"--world_info={world_info_base64}", f"--master_addr={args.master_addr}", f"--master_port={args.master_port}" ] + if args.no_ssh: + deepspeed_launch.append(f"--node_rank={args.node_rank}") if args.no_python: deepspeed_launch.append("--no_python") if args.module: @@ -518,6 +571,8 @@ def main(args=None): runner = PDSHRunner(args, world_info_base64) elif args.launcher == OPENMPI_LAUNCHER: runner = OpenMPIRunner(args, world_info_base64, resource_pool) + elif args.launcher == JSRUN_LAUNCHER: + runner = JSRunner(args, world_info_base64, resource_pool) elif args.launcher == MPICH_LAUNCHER: runner = MPICHRunner(args, world_info_base64, resource_pool) elif args.launcher == IMPI_LAUNCHER: @@ -544,17 +599,15 @@ def main(args=None): # key exists in launcher env -> var list should be used excluded_vars += var_list - exports = "" + # load envs from accelerator + exports = EXPORT_ENVS + get_accelerator().export_envs() for var in env.keys(): - if any([var.startswith(name) for name in EXPORT_ENVS]): + if any([var.startswith(name) for name in exports]): if not any([var == name for name in excluded_vars]): runner.add_export(var, env[var]) for environ_path in DEEPSPEED_ENVIRONMENT_PATHS: - environ_file = DEEPSPEED_ENVIRONMENT_NAME - # handle if users to enter path for `DS_ENV_FILE` - if not os.path.isfile(environ_file): - environ_file = os.path.join(environ_path, DEEPSPEED_ENVIRONMENT_NAME) + environ_file = os.path.join(environ_path, DEEPSPEED_ENVIRONMENT_NAME) if os.path.isfile(environ_file): logger.info(f"deepspeed_env file = {environ_file}") with open(environ_file, 'r') as fd: @@ -568,6 +621,7 @@ def main(args=None): cmd = runner.get_cmd(env, active_resources) logger.info(f"cmd = {' '.join(cmd)}") + result = subprocess.Popen(cmd, env=env) def sigkill_handler(signum, frame): diff --git a/deepspeed/linear/__init__.py b/deepspeed/linear/__init__.py new file mode 100644 index 000000000000..9931a95a0a40 --- /dev/null +++ b/deepspeed/linear/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .optimized_linear import OptimizedLinear +from .config import LoRAConfig, QuantizationConfig +from .context_manager import Init, init_lora diff --git a/deepspeed/linear/config.py b/deepspeed/linear/config.py new file mode 100644 index 000000000000..1459704a32c5 --- /dev/null +++ b/deepspeed/linear/config.py @@ -0,0 +1,54 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from dataclasses import dataclass, field +from typing import List + +import torch + + +@dataclass +class LoRAConfig: + """ + Configuration settings for LoRAOptimizedLinear. + + Attributes: + lora_r (int): LoRA attention dimension, also known as the rank. Defaults is 64. + lora_alpha (float): LoRA scaling factor, default is 16. + base_weight_sharding (int): The degree to which the base weights are sharded, + should typically be set to the data-parallel world size to maximize the memory + reduction benefits. Defaults to 1, which means this feature is disabled. 
+ offload (bool): offload frozen parameters to cpu when not in use + offload_ratio (float): ratio of parameters to offload to cpu when not in use + delay_lora_init (bool): initialize lora parameters at time of model init or allow manual init later + target_mods (str): target module names to apply LoRA to, defaults to llama-3.1 arch + """ + lora_r: int = 64 + lora_alpha: float = 16. + base_weight_sharding: int = 1 + offload: bool = False + offload_ratio: float = 0.0 + delay_lora_init: bool = False + target_mods: List[str] = field( + default_factory=lambda: ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']) + + +@dataclass +class QuantizationConfig: + """ + Configuration settings for quantization for LoRAOptimizedLinear, QuantizedLinear, + and QuantizedParameter + + Attributes: + q_bits (int): The number of bits used for quantization. Default is 8. + mantissa_bits (int): The number of bits reserved for the mantissa in fixed-point quantization. Default is 3. + group_size (int): The number of elements used for quantization. Default is 512. + q_dtype (torch.dtype): The data type to quantize to. Default is uint8. (in CUDA, buffers are allocated as + uint8, but inside the kernels the quantization is done to fp8) + """ + q_bits: int = 8 + mantissa_bits: int = 3 + group_size: int = 512 + q_dtype: torch.dtype = torch.uint8 diff --git a/deepspeed/linear/context_manager.py b/deepspeed/linear/context_manager.py new file mode 100644 index 000000000000..204fa0fe9c1d --- /dev/null +++ b/deepspeed/linear/context_manager.py @@ -0,0 +1,90 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .optimized_linear import LoRAOptimizedLinear, OptimizedLinear + +import torch + +try: + import transformers +except ImportError: + transformers = None + + +def init_lora(model): + model.requires_grad_(False) + for m in model.modules(): + if isinstance(m, LoRAOptimizedLinear): + m.init_lora() + + +class Init(object): + """ + Init context wrapper similar in style to zero.Init. Allows for injecting OptimizedLinear during model + construction which will shard base weights and reduce overall memory usage during model init. Primarily + useful when initializing a model via transformers.AutoModelForCausalLM. + + Example usage: + lora_config = deepspeed.linear.LoRAConfig(..) + quant_config = deepspeed.linear.QuantizationConfig(..) 
+ with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config): + model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B") + + """ + + def __init__(self, lora_config=None, quant_config=None): + self._orig_nn_linear = torch.nn.Linear + self._orig_causallm_pretrained = None + if transformers != None: + self._orig_causallm_pretrained = transformers.AutoModelForCausalLM.from_pretrained + self._orig_causallm_config = transformers.AutoModelForCausalLM.from_config + self.lora_config = lora_config + self.quant_config = quant_config + self._post_init_complete = False + + def __enter__(self): + + class OptLinearWrapper: + _orig_nn_linear = self._orig_nn_linear + _lora_config = self.lora_config + _quant_config = self.quant_config + + def __new__(self, *args, **kwargs): + self._lora_config.delay_lora_init = True + kwargs['lora_config'] = self._lora_config + kwargs['quantization_config'] = self._quant_config + kwargs['linear_cls'] = self._orig_nn_linear + return OptimizedLinear(*args, **kwargs) + + def _model_init(model): + if self.lora_config != None: + init_lora(model) + self._post_init_complete = True + return model + + # ensures non-lora params are frozen and lora weights are initialized + def from_pretrained(*args, **kwargs): + model = self._orig_causallm_pretrained(*args, **kwargs) + return _model_init(model) + + def from_config(*args, **kwargs): + model = self._orig_causallm_config(*args, **kwargs) + return _model_init(model) + + torch.nn.Linear = OptLinearWrapper + if transformers != None: + transformers.AutoModelForCausalLM.from_pretrained = from_pretrained + transformers.AutoModelForCausalLM.from_config = from_config + + def __exit__(self, *args, **kwargs): + torch.nn.Linear = self._orig_nn_linear + if not self._post_init_complete: + print('WARNING: For some reason LoRA modules are not initialized, this is usually done automatically ' + 'if using transformers via (AutoModelForCausalLM from_pretrained/from_config). ' + 'You must call `init_lora` on each module in order to use DeepSpeed LoRA, otherwise ' + 'you will error out during runtime.') + else: + transformers.AutoModelForCausalLM.from_pretrained = self._orig_causallm_pretrained + transformers.AutoModelForCausalLM.from_config = self._orig_causallm_config diff --git a/deepspeed/linear/optimized_linear.py b/deepspeed/linear/optimized_linear.py new file mode 100644 index 000000000000..3720196aa255 --- /dev/null +++ b/deepspeed/linear/optimized_linear.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import math +import torch.nn as nn +import torch.nn.functional as F +from dataclasses import is_dataclass +from deepspeed.accelerator import get_accelerator +import deepspeed.comm as dist + +from .config import LoRAConfig, QuantizationConfig +from .quantization import QuantizedParameter, QuantizedLinear + + +class OptimizedLinear(nn.Module): + """ + Optimized version of nn.Linear that adds features such as: + * LoRA w. base weight sharding + * FP [6,8,12] quantization + + Arguments: + input_dim: Required: size of each input sample + output_dim: Required: size of each output sample + bias: Optional: If set to False, the layer will not learn an additive bias. 
Default: False + lora_config: Optional: LoRAConfig defining lora features and base-weight-sharding degree + quantization_config: Optional: QuantizationConfig defining quantization features + dtype: Optional: parameter dtype, only supports bfloat16 currently + + Returns: + Returns a new nn.Module depending on the input config. Either native + torch.nn.Linear, QuantizedLinear, or the full-featured DSOptimizedLinear. + """ + + def __new__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + device=None, + dtype=torch.bfloat16, + linear_cls=nn.Linear): + + if quantization_config is not None and not is_dataclass(quantization_config): + raise ValueError(f"Expecting QuantizationConfig but received {type(quantization_config)}") + if lora_config is not None and not is_dataclass(lora_config): + raise ValueError(f"Expecting LoRAConfig but received {type(lora_config)}") + if lora_config is None and quantization_config is None: + # Everything disabled, fall back to normal nn.Linear + self = linear_cls(input_dim, output_dim, bias=bias, dtype=dtype, device=device) + + elif lora_config: + # lora enabled, quantization may or may not be + self = LoRAOptimizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=dtype, + device=device, + linear_cls=linear_cls) + + elif quantization_config: + # only quantization enabled, no lora + self = QuantizedLinear(input_dim=input_dim, + output_dim=output_dim, + bias=bias, + quantization_config=quantization_config, + dtype=dtype) + return self + + +class LoRAOptimizedLinear(nn.Module): + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + lora_config: LoRAConfig = None, + quantization_config: QuantizationConfig = None, + device=None, + dtype=torch.bfloat16, + linear_cls=nn.Linear): + super().__init__() + self.input_dim = input_dim + self.output_dim = output_dim + self.bias = bias + self.lora_config = lora_config + self.quantization_config = quantization_config + self.device = get_accelerator().current_device_name() if device is None else device + self.linear_cls = linear_cls + self.dtype = dtype + assert self.lora_config is not None, "DSOptimizedLinear requires a LoRA config" + assert not self.bias, "bias=True is not supported by LoRAOptimizedLinear" + self.zero_shards = self.lora_config.base_weight_sharding + self.sharded_weight_size = int(float(self.input_dim) // self.zero_shards) + if self.zero_shards > 1: + assert self.zero_shards == dist.get_world_size( + ), "base weight sharding is only supported across world size" + w = torch.nn.Parameter(torch.empty(self.output_dim * self.sharded_weight_size, dtype=dtype), + requires_grad=False) + else: + w = torch.nn.Parameter(torch.empty((self.output_dim, self.input_dim), dtype=dtype), requires_grad=False) + torch.nn.init.xavier_uniform_(w.reshape(self.sharded_weight_size, self.output_dim)) + + if self.quantization_config is not None: + assert dtype == torch.bfloat16, "only bfloat16 is supported when using quantization" + self.weight = QuantizedParameter(w, quantization_config=quantization_config) + else: + self.weight = w + + self.disabled = False + self._initialized = False + if not self.lora_config.delay_lora_init: + self.init_lora() + + def disable(self): + self.disabled = True + self.weight = torch.nn.Parameter(torch.empty((self.output_dim, self.input_dim), dtype=self.dtype), + requires_grad=False) + + def 
init_lora(self): + if self.disabled: + return + + if self.quantization_config is not None: + # ensure quant-param wasn't stripped, in some cases transformers will do this during model init + if not isinstance(self.weight, QuantizedParameter): + self.weight = QuantizedParameter(self.weight, quantization_config=self.quantization_config) + + self._initialized = True + self.weight.requires_grad = False + + # Mark base weight to prevent broadcast and ensure proper offload behavior + self.weight.ds_optim_param = True + + self.lora_scaling_factor = self.lora_config.lora_alpha / self.lora_config.lora_r + + # Keeping lora weights in bf16 precision for ease of training. + self.lora_weight_1 = self.linear_cls(self.input_dim, + self.lora_config.lora_r, + bias=self.bias, + device=self.device, + dtype=self.dtype) + self.lora_weight_2 = self.linear_cls(self.lora_config.lora_r, + self.output_dim, + bias=self.bias, + device=self.device, + dtype=self.dtype) + + # initialize "A" with kaiming uniform and "B" with zeros following this + # https://github.com/huggingface/peft/blob/62122b5add8d6892f70c82eaef2147a6ba33b90b/src/peft/tuners/lora/layer.py#L155 + nn.init.kaiming_uniform_(self.lora_weight_1.weight, a=math.sqrt(5)) + nn.init.zeros_(self.lora_weight_2.weight) + self.lora_weight_1.weight.requires_grad = True + self.lora_weight_2.weight.requires_grad = True + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, + error_msgs): + if not any([target in prefix for target in self.lora_config.target_mods]): + # module does not match any target_mods, we must revert to normal nn.Linear via disable + self.disable() + return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, + unexpected_keys, error_msgs) + + if self.zero_shards > 1: + if not dist.is_initialized(): + raise RuntimeError( + "attempting to use optimized linear base weight sharding but torch-distributed is not initialized, please init first." 
+ ) + rank = dist.get_rank() + shape_local = self.output_dim * self.sharded_weight_size + base_weight_name = f"{prefix}weight" + incoming_param = state_dict[base_weight_name] + state_dict[base_weight_name] = incoming_param.flatten().narrow(0, rank * shape_local, shape_local) + + return super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, + error_msgs) + + def full_weight(self): + base_weight = self.weight + if getattr(base_weight, 'ds_offload', False): + # move to gpu so we can dequant and all-gather + assert base_weight.device == torch.device('cpu'), \ + f"expected base weight on cpu but found {base_weight.device}" + base_weight.offload(revert=True) + local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight + base_weight.offload() + else: + local_weight = base_weight.dequantized() if isinstance(base_weight, QuantizedParameter) else base_weight + + tensor_out = torch.empty(self.output_dim * self.input_dim, + dtype=local_weight.dtype, + device=local_weight.device) + dist.all_gather_into_tensor(tensor_out, local_weight) + return tensor_out.reshape(self.output_dim, self.input_dim) + + def linear_without_F_linear(self, input, weight): + output = torch.mm(input.reshape(-1, input.shape[-1]), weight) + output = output.view(*input.shape[:-1], weight.shape[1]) + return output + + def forward(self, input_tensor): + if self.disabled: + return F.linear(input_tensor, self.weight) + assert self._initialized, "init_lora was never called, please initialize before proceeding" + + # Gather the sharded base weight + if self.zero_shards > 1: + with torch.no_grad(): + base_weight = self.full_weight() + elif self.quantization_config: + base_weight = self.weight.dequantized() + else: + base_weight = self.weight + + base_weight_output = F.linear(input_tensor, base_weight) + lora_output = self.lora_weight_2(self.lora_weight_1(input_tensor)) + return base_weight_output + self.lora_scaling_factor * lora_output diff --git a/deepspeed/linear/quantization.py b/deepspeed/linear/quantization.py new file mode 100644 index 000000000000..2023601be281 --- /dev/null +++ b/deepspeed/linear/quantization.py @@ -0,0 +1,147 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import copy +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional + +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.fp_quantizer import Quantizer, FP_Quantize +from .config import QuantizationConfig + + +class QuantizedParameter(nn.Parameter): + """ + Quantized parameter class that implements weight quantization. Weights + are stored in quantized form on GPUs, and can be dequantized on-the-fly when + needed by the model. The weights are actually quantized during any `.to(device)`. + + Arguments: + data (Tensor): parameter tensor. + requires_grad (bool, optional): if the parameter requires gradient. Defaults + to False and is not supported to be True. Argument provided only for interface + compatibility with torch.nn.Parameter. + quantization_config (QuantizationConfig, optional): + quantizer (Quantizer, optional): Defaults to FP_Quantize but can be any quantizer + that implements deepspeed.ops.fp_quantizer.Quantizer. This argument is also + required since the quantizer is stashed in the Parameter itself, some models + may clone the Parameter by passing an attribute __dict__. 
For an example, see + tests/unit/linear/test_quant_param.py::TestQuantParam::test_hf_clone + """ + + def __new__( + cls, + data: Optional[torch.Tensor] = None, + requires_grad: bool = False, # quantized weights must be frozen + quantization_config: QuantizationConfig = None, + quantizer: Quantizer = None, + ): + if requires_grad: + raise ValueError(f"requires_grad=True is not supported with QuantizedParameter") + if data is None: + data = torch.empty(0) + self = torch.Tensor._make_subclass(cls, data, requires_grad) + self.quantization_config = QuantizationConfig() if quantization_config is None else quantization_config + if quantizer is not None: + self.quantizer = quantizer + else: + # if FPQuantizerBuilder is not compatible in this env this init will fail + self.quantizer = FP_Quantize(quantization_config=self.quantization_config) + self._ensure_quantized(self) + return self + + def _ensure_quantized(self, tensor: torch.Tensor): + # If the tensor is on the accelerator and is not quantized, then quantize it in-place. + if get_accelerator().on_accelerator(tensor) and tensor.dtype != self.quantization_config.q_dtype: + with get_accelerator().stream(get_accelerator().current_stream(tensor.device)): + tensor.data = self.quantizer.quantize(tensor.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + assert tensor.dtype == self.quantization_config.q_dtype + + def dequantized(self) -> torch.Tensor: + """ + Return a tensor containing the dequantized weights of this parameter. + """ + if get_accelerator().on_accelerator(self.data) and self.data.dtype == self.quantization_config.q_dtype: + with get_accelerator().stream(get_accelerator().current_stream(self.data.device)): + return self.quantizer.dequantize(self.data, + q_bits=self.quantization_config.q_bits, + q_mantisa_bits=self.quantization_config.mantissa_bits) + return self.data + + def offload(self, revert=False): + if getattr(self, 'ds_offload', False): + if revert: + self.data = self.to(get_accelerator().current_device_name()) + else: + self.data = self.to('cpu') + + def __getstate__(self): + state = self.__dict__ + state["data"] = self.data + state["quantization_config"] = self.quantization_config + state["requires_grad"] = self.requires_grad + return state + + def __setstate__(self, state): + self.quantizer = state["quantizer"] + self.quantization_config = state["quantization_config"] + self.data = state["data"] + self.requires_grad = state["requires_grad"] + + def __deepcopy__(self, memo): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + new_instance.quantizer = copy.deepcopy(state["quantizer"]) + new_instance.quantization_config = copy.deepcopy(state["quantization_config"]) + new_instance.data = copy.deepcopy(state["data"]) + return new_instance + + def __copy__(self): + new_instance = type(self).__new__(type(self)) + state = self.__getstate__() + new_instance.__setstate__(state) + return new_instance + + def cuda(self, device=None, non_blocking=False): + device = "cuda" if device is None else device + self.quantizer.to(device, non_blocking=non_blocking) + return self.to(device, non_blocking=non_blocking) + + def to(self, *args, **kwargs): + """ + Move the parameter to the given device. Then, if the device is a cuda device, + quantize it. 
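As a usage-level sketch of the class above: placing a QuantizedParameter on the accelerator quantizes it in place, and dequantized() recovers a bfloat16 tensor for compute. This is a minimal sketch, assuming the FP quantizer extension builds on the current accelerator; shapes and variable names are illustrative.

import torch
from deepspeed.accelerator import get_accelerator
from deepspeed.linear import QuantizationConfig
from deepspeed.linear.quantization import QuantizedParameter

device = get_accelerator().current_device_name()
w = torch.randn(1024, 1024, dtype=torch.bfloat16)

# Wrap the weight; it is stored quantized once it lands on the accelerator.
qw = QuantizedParameter(w, quantization_config=QuantizationConfig(q_bits=8))
qw.data = qw.to(device)  # same pattern offload()/cuda() use above

# Dequantize on the fly for compute, as QuantizedLinear does in its forward pass.
x = torch.randn(8, 1024, dtype=torch.bfloat16, device=device)
y = torch.nn.functional.linear(x, qw.dequantized())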
+ """ + tensor = super().to(*args, **kwargs) + self.quantizer.to(*args, **kwargs) + self._ensure_quantized(tensor) + return tensor + + +class QuantizedLinear(nn.Linear): + """ + Linear layer that implements weight quantization. Parameters + are stored via `QuantizedParameter` and are dequantized on-the-fly during any + forward pass. + """ + + def __init__(self, + input_dim: int, + output_dim: int, + bias: bool = False, + quantization_config: QuantizationConfig = None, + dtype=torch.bfloat16): + super().__init__(input_dim, output_dim, bias=bias, dtype=dtype) + assert dtype == torch.bfloat16, "currently only supports bfloat16 dtype" + self.weight = QuantizedParameter(self.weight.data, quantization_config=quantization_config) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.linear(input, self.weight.dequantized(), self.bias) diff --git a/deepspeed/model_implementations/diffusers/unet.py b/deepspeed/model_implementations/diffusers/unet.py index 27627914cfcb..8d5ddd95437a 100644 --- a/deepspeed/model_implementations/diffusers/unet.py +++ b/deepspeed/model_implementations/diffusers/unet.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +from deepspeed.accelerator import get_accelerator from ..features.cuda_graph import CUDAGraph @@ -29,7 +30,7 @@ def _graph_replay(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_kwargs[k].copy_(kwargs[k]) - self._cuda_graphs.replay() + get_accelerator().replay_graph(self._cuda_graphs) return self.static_output def forward(self, *inputs, **kwargs): @@ -53,11 +54,11 @@ def _create_cuda_graph(self, *inputs, **kwargs): torch.cuda.current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._cuda_graphs = torch.cuda.CUDAGraph() + self._cuda_graphs = get_accelerator().create_graph() self.static_inputs = inputs self.static_kwargs = kwargs - with torch.cuda.graph(self._cuda_graphs): + with get_accelerator().capture_to_graph(self._cuda_graphs): self.static_output = self._forward(*self.static_inputs, **self.static_kwargs) self.cuda_graph_created = True @@ -68,7 +69,8 @@ def _forward(self, encoder_hidden_states, return_dict=True, cross_attention_kwargs=None, - timestep_cond=None): + timestep_cond=None, + added_cond_kwargs=None): if cross_attention_kwargs: return self.unet(sample, timestamp, diff --git a/deepspeed/model_implementations/diffusers/vae.py b/deepspeed/model_implementations/diffusers/vae.py index 05084f1b985a..ce50ade647a8 100644 --- a/deepspeed/model_implementations/diffusers/vae.py +++ b/deepspeed/model_implementations/diffusers/vae.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +from deepspeed.accelerator import get_accelerator from ..features.cuda_graph import CUDAGraph @@ -27,7 +28,7 @@ def _graph_replay_decoder(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_decoder_kwargs[k].copy_(kwargs[k]) - self._decoder_cuda_graph.replay() + get_accelerator().replay_graph(self._decoder_cuda_graph) return self.static_decoder_output def _decode(self, x, return_dict=True, generator=None): @@ -43,11 +44,11 @@ def _create_cuda_graph_decoder(self, *inputs, **kwargs): torch.cuda.current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._decoder_cuda_graph = torch.cuda.CUDAGraph() + self._decoder_cuda_graph = get_accelerator().create_graph() self.static_decoder_inputs = inputs self.static_decoder_kwargs = kwargs - with torch.cuda.graph(self._decoder_cuda_graph): + with 
get_accelerator().capture_to_graph(self._decoder_cuda_graph): self.static_decoder_output = self._decode(*self.static_decoder_inputs, **self.static_decoder_kwargs) self.decoder_cuda_graph_created = True @@ -70,7 +71,7 @@ def _graph_replay_encoder(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_encoder_kwargs[k].copy_(kwargs[k]) - self._encoder_cuda_graph.replay() + get_accelerator().replay_graph(self._encoder_cuda_graph) return self.static_encoder_output def _encode(self, x, return_dict=True): @@ -86,11 +87,11 @@ def _create_cuda_graph_encoder(self, *inputs, **kwargs): torch.cuda.current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._encoder_cuda_graph = torch.cuda.CUDAGraph() + self._encoder_cuda_graph = get_accelerator().create_graph() self.static_encoder_inputs = inputs self.static_encoder_kwargs = kwargs - with torch.cuda.graph(self._encoder_cuda_graph): + with get_accelerator().capture_to_graph(self._encoder_cuda_graph): self.static_encoder_output = self._encode(*self.static_encoder_inputs, **self.static_encoder_kwargs) self.encoder_cuda_graph_created = True @@ -113,7 +114,7 @@ def _graph_replay(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_kwargs[k].copy_(kwargs[k]) - self._all_cuda_graph.replay() + get_accelerator().replay_graph(self._all_cuda_graph) return self.static_output def forward(self, *inputs, **kwargs): @@ -137,11 +138,11 @@ def _create_cuda_graph(self, *inputs, **kwargs): torch.cuda.current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._all_cuda_graph = torch.cuda.CUDAGraph() + self._all_cuda_graph = get_accelerator().create_graph() self.static_inputs = inputs self.static_kwargs = kwargs - with torch.cuda.graph(self._all_cuda_graph): + with get_accelerator().capture_to_graph(self._all_cuda_graph): self.static_output = self._forward(*self.static_inputs, **self.static_kwargs) self.all_cuda_graph_created = True diff --git a/deepspeed/model_implementations/transformers/clip_encoder.py b/deepspeed/model_implementations/transformers/clip_encoder.py index 8d9291896986..848a5b48dcf1 100644 --- a/deepspeed/model_implementations/transformers/clip_encoder.py +++ b/deepspeed/model_implementations/transformers/clip_encoder.py @@ -38,7 +38,7 @@ def _graph_replay(self, *inputs, **kwargs): for k in kwargs: if torch.is_tensor(kwargs[k]): self.static_kwargs[self.iter][k].copy_(kwargs[k]) - self._cuda_graphs[self.iter].replay() + get_accelerator().replay_graph(self._cuda_graphs[self.iter]) return self.static_output[self.iter] def forward(self, *inputs, **kwargs): @@ -63,11 +63,11 @@ def _create_cuda_graph(self, *inputs, **kwargs): torch.cuda.current_stream().wait_stream(cuda_stream) # create cuda_graph and assign static_inputs and static_outputs - self._cuda_graphs[self.iter] = torch.cuda.CUDAGraph() + self._cuda_graphs[self.iter] = get_accelerator().create_graph() self.static_inputs[self.iter] = inputs self.static_kwargs[self.iter] = kwargs - with torch.cuda.graph(self._cuda_graphs[self.iter]): + with get_accelerator().capture_to_graph(self._cuda_graphs[self.iter]): self.static_output[self.iter] = self._forward(*self.static_inputs[self.iter], **self.static_kwargs[self.iter]) diff --git a/deepspeed/model_implementations/transformers/ds_llama2.py b/deepspeed/model_implementations/transformers/ds_llama2.py index 7d9eb4113a8a..325bfb4f7e18 100644 --- 
a/deepspeed/model_implementations/transformers/ds_llama2.py +++ b/deepspeed/model_implementations/transformers/ds_llama2.py @@ -4,11 +4,8 @@ # DeepSpeed Team import torch -from deepspeed import comm as dist from deepspeed.model_implementations.transformers.ds_transformer import DeepSpeedTransformerInference -inference_module = None - class DeepSpeedLlama2Inference(DeepSpeedTransformerInference): """Initialize the DeepSpeed OPT Transformer Layer. @@ -27,18 +24,10 @@ def forward(self, *args, **kwargs): input = args[0] input_mask = None - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False - get_present = True + self.allocate_workspace(input.size()) + # We set the prev key/value to None when there is a prompt if input.shape[1] > 1: self.layer_past = None diff --git a/deepspeed/model_implementations/transformers/ds_transformer.py b/deepspeed/model_implementations/transformers/ds_transformer.py index a41df58ad059..7e3c81b714c0 100644 --- a/deepspeed/model_implementations/transformers/ds_transformer.py +++ b/deepspeed/model_implementations/transformers/ds_transformer.py @@ -6,19 +6,18 @@ import torch import torch.nn as nn from deepspeed import comm as dist +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.utils.logging import log_dist from deepspeed.ops.transformer.inference.ds_mlp import DeepSpeedMLP from deepspeed.ops.transformer.inference.ds_attention import DeepSpeedSelfAttention, BloomSelfAttention +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed -if deepspeed.HAS_TRITON: +if deepspeed.HAS_TRITON and get_accelerator().is_triton_supported(): from deepspeed.ops.transformer.inference.triton.mlp import TritonMLP from deepspeed.ops.transformer.inference.triton.attention import TritonSelfAttention -inference_module = None - class DeepSpeedTransformerInference(nn.Module): """Initialize the DeepSpeed Transformer Layer. @@ -37,6 +36,7 @@ class DeepSpeedTransformerInference(nn.Module): for specific downstream tasks. 
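The graph changes above all follow the same capture-once, replay-many pattern, now routed through the accelerator abstraction instead of torch.cuda directly. A condensed sketch of that pattern on a toy module, assuming an accelerator that supports graph capture:

import torch
from deepspeed.accelerator import get_accelerator

device = get_accelerator().current_device_name()
module = torch.nn.Linear(16, 16).to(device).half()
static_input = torch.randn(4, 16, dtype=torch.half, device=device)

# Warm up on a side stream, as the replaced modules above do before capturing.
side_stream = get_accelerator().Stream()
side_stream.wait_stream(get_accelerator().current_stream())
with get_accelerator().stream(side_stream):
    module(static_input)
get_accelerator().current_stream().wait_stream(side_stream)

# Capture once ...
graph = get_accelerator().create_graph()
with get_accelerator().capture_to_graph(graph):
    static_output = module(static_input)

# ... then replay: refresh the captured input buffer and re-run the recorded kernels.
static_input.copy_(torch.randn(4, 16, dtype=torch.half, device=device))
get_accelerator().replay_graph(graph)
result = static_output.clone()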
""" layer_id = 0 + workspace = None def __init__(self, config, @@ -52,10 +52,6 @@ def __init__(self, DeepSpeedTransformerInference.layer_id += 1 data_type = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedTransformerInference.layer_id == 1: log_dist(f"DeepSpeed-Inference config: {self.config.__dict__}", [0]) @@ -88,22 +84,25 @@ def __init__(self, self.norm_b = nn.Parameter(torch.empty(self.config.hidden_size, dtype=data_type, device=device), requires_grad=False) self.layer_past = None - try: - if config.dtype == torch.float32: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - elif config.dtype == torch.bfloat16: - self.allocate_workspace = inference_module.allocate_workspace_bf16 - else: - self.allocate_workspace = inference_module.allocate_workspace_fp32 - self._alloc_workspace = True - except AttributeError: - self.allocate_workspace = None - self._alloc_workspace = False + self.layer_norm = LayerNormOp() + if DeepSpeedTransformerInference.workspace is None: + DeepSpeedTransformerInference.workspace = WorkspaceOp(self.config) + self._should_allocate_workspace = True + + def allocate_workspace(self, size): + # Allocate memory only on first layer forward + if self.config.layer_id == 0 and self._should_allocate_workspace: + DeepSpeedTransformerInference.workspace.allocate_workspace( + self.config.hidden_size, self.config.heads, size[1], size[0], DeepSpeedTransformerInference.layer_id, + self.config.mp_size, self.config.bigscience_bloom, + dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, + self.config.min_out_tokens) + self._should_allocate_workspace = False @classmethod def reset_cache(cls): - if inference_module is not None: - inference_module.reset_cache() + if cls.workspace is not None: + cls.workspace.reset_cache() def forward( self, @@ -136,15 +135,7 @@ def forward( input_mask = (input_mask if attn_mask is None else attn_mask) if attention_mask is None else attention_mask - # Allocate memory only on first layer forward - if self.config.layer_id == 0 and self._alloc_workspace: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedTransformerInference.layer_id, self.config.mp_size, - self.config.bigscience_bloom, - dist.get_rank() if dist.is_initialized() else 0, self.config.max_out_tokens, - self.config.min_out_tokens) - self._alloc_workspace = False + self.allocate_workspace(input.size()) get_present = (get_present or get_key_value or use_cache) input_mask = input_mask if attention_mask is None else attention_mask @@ -163,7 +154,7 @@ def forward( if (self.config.dtype in [torch.float16, torch.bfloat16, torch.int8]) \ and input.dtype == torch.float: - target_dtype = torch.half if self.dtype == torch.int8 else self.dtype + target_dtype = torch.half if self.config.dtype == torch.int8 else self.config.dtype input = input.to(target_dtype) with torch.no_grad(): @@ -178,14 +169,15 @@ def forward( output_attentions, self.norm_w, self.norm_b, - alibi) + alibi, + **kwargs) presents = (key, value) self.layer_past = presents if layer_past is None else None output = self.mlp(attention_output, input, inp_norm, self.attention.attn_ob) if not self.config.pre_layer_norm: - output = inference_module.layer_norm(output, self.norm_w, self.norm_b, self.config.epsilon) + output = self.layer_norm(output, self.norm_w, self.norm_b, 
self.config.epsilon) output = output.to(input_type) if get_present: diff --git a/deepspeed/module_inject/__init__.py b/deepspeed/module_inject/__init__.py index 4bdabf383b26..9fc2f979a04b 100755 --- a/deepspeed/module_inject/__init__.py +++ b/deepspeed/module_inject/__init__.py @@ -6,5 +6,5 @@ from .replace_module import replace_transformer_layer, revert_transformer_layer, ReplaceWithTensorSlicing, GroupQuantizer, generic_injection from .module_quantize import quantize_transformer_layer from .replace_policy import HFBertLayerPolicy -from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize +from .layers import LinearAllreduce, LinearLayer, EmbeddingLayer, Normalize, set_autotp_mode from .policy import DSPolicy diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py old mode 100644 new mode 100755 index c235cc766209..b089ec420d47 --- a/deepspeed/module_inject/auto_tp.py +++ b/deepspeed/module_inject/auto_tp.py @@ -11,10 +11,22 @@ from typing import Optional import torch from deepspeed import comm as dist -from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce +from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce, Yuan_LinearAllreduce, Yuan_LinearLayer, GateUpPack_LinearLayer, Conv_LinearALlreduce, fused_LinearLayer, conv_LinearLayer from deepspeed.accelerator import get_accelerator -from .fusedqkv_utils import require_tp_fused_qkvw, prepare_tp_fused_qkvw +from .fusedqkv_utils import require_tp_fused_qkvw from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list +from deepspeed.utils import groups +from deepspeed.module_inject.layers import is_autotp_training_mode + + +def move(tensor, device, copy=True): + if tensor.is_meta: + return torch.empty_like(tensor, device=device) + else: + # Using new tensors help in freeing memory (after split for example) was done before by calling clone(). + # Using copy=True instead of clone() will help in case of cpu --> cpu. + # Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced. 
+ return tensor.to(device, copy=copy) class ReplaceWithTensorSlicing: @@ -121,7 +133,13 @@ class Loading(): def is_load_module(module): load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm] - load_layer_names = ["LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm"] + load_layer_names = [ + "LPLayerNorm", "SharedEmbedding", "OPTLearnedPositionalEmbedding", "LlamaRMSNorm", "FalconLinear", + "MistralRMSNorm", "T5LayerNorm", "MixtralRMSNorm", "Phi3RotaryEmbedding", "Phi3SuScaledRotaryEmbedding", + "Phi3RMSNorm", "YuanRMSNorm", "YuanRotaryEmbedding", "Phi3LongRoPEScaledRotaryEmbedding", "Qwen2RMSNorm", + "DeepseekV2RMSNorm", "DeepseekV3RMSNorm", "DeepseekV2YarnRotaryEmbedding", "DeepseekV3YarnRotaryEmbedding", + "MoEGate" + ] return module.__class__ in load_layers or module._get_name() in load_layer_names def load_buffer(module, state_dict, prefix): @@ -174,7 +192,14 @@ def load(module, state_dict, prefix, mp_group=None): class AutoTP(): - def __init__(self, module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl): + def __init__(self, + module, + all_reduce_linears, + prefix, + state_dict, + linear_layer_setting, + orig_layer_impl, + keep_module_on_host=False): self.module = module self.all_reduce_linears = all_reduce_linears self.prefix = prefix @@ -186,6 +211,7 @@ def __init__(self, module, all_reduce_linears, prefix, state_dict, linear_layer_ self.orig_layer_impl = orig_layer_impl self.linear_policies = None self.conv_linear_layer = False + self.keep_module_on_host = keep_module_on_host def in_module_list(module, module_list): for item in module_list: @@ -265,11 +291,13 @@ def tp_parser(model): module_list = AutoTP.get_module_list(model) assert AutoTP.supported(model), "AutoTP not supported for model. Please use kernel injection since container policy for model exists." \ if AutoTP.kernel_supported(module_list) else "AutoTP not supported for model. Please provide policy." + norm_layer_name_list = ['LayerNorm', 'layer_norm', 'ln_1', 'ln_2'] + #ln_1 , ln_2 for Qwen for module in module_list: for key, submodule in module._modules.items(): if isinstance(submodule, nn.Linear): layer_list = layer_list + ["." + key] - elif isinstance(submodule, nn.LayerNorm) or key == 'LayerNorm' or key == 'layer_norm': + elif isinstance(submodule, nn.LayerNorm) or key in norm_layer_name_list: layer_list = layer_list + ["ln"] else: layer_list = layer_list + AutoTP.get_layers(key, submodule) @@ -288,6 +316,15 @@ def tp_parser(model): elif 'self_attention.dense' in layer and 'falcon' in str( type(module)): # this is a hack to get the right linear layer for this model! gem_list = gem_list + [layer] + # Mixtral-7x8b used w2*act(w1*w3) linear. need to replace w2 to linearallreduce. 
+ elif 'w2' in layer and 'Mixtral' in str(type(module)): + gem_list = gem_list + [layer] + elif 'self_attn.dense' in layer and 'Phi' in str(type(module)): + gem_list = gem_list + [layer] + elif 'self_attention.dense' in layer and 'ChatGLM' in str(model): + gem_list = gem_list + [layer] + elif 'dense_4h_to_h' in layer and 'ChatGLM' in str(model): + gem_list = gem_list + [layer] layer_list = [] if gem_list != []: @@ -299,69 +336,68 @@ def tp_parser(model): return policy_list def set_tensor_parallel_config(self, mp_size, mp_group): + + if is_autotp_training_mode(): + self.mp_group = groups.get_tensor_model_parallel_group() + self.mp_size = groups.get_tensor_model_parallel_world_size() + return + self.mp_size = mp_size self.mp_group = mp_group def _replace(self, child, name, conv_linear_layer): + # This function should clearly define the routing rules for specific layers + # and avoid any complex shard-related logic. if getattr(child, "replaced", False) == True: return + device_name = 'cpu' if self.keep_module_on_host else get_accelerator().current_device_name() + # keep_module_on_host is used to keep the module on the host. Checkpoints are loaded to the host first (in some + # cases it can be done from the disk even to prevent filling host's memory), thus no need to create a new copy. + return_new_copy = not self.keep_module_on_host weight_shape = child.weight.shape mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group) - if name in self.all_reduce_linears: - # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] - # else [weight_shape[0], weight_shape[1] // mp_size] + # For TP layer skip, e.g., MoE gate, deepseek low rank layer skip + if "q_a_proj" in name or "kv_a_proj_with_mqa" in name or name == "block_sparse_moe.gate" or ( + ('mlp.shared_expert_gate' == name or 'mlp.gate' == name) and 'qwen2_moe' in str(type(self.module))): + return child + # For Yuan model + if 'Yuan' in str(self.module): + if 'v_proj' in name: + return Yuan_LinearLayer(child, self.mp_group) + + elif 'o_proj' in name: + return Yuan_LinearAllreduce(child, self.mp_group) + + # For MLP including chunk layer. 
+ if 'gate_up_proj' in name or ('dense_h_to_4h' in name and 'GLM' in str(self.module)): + return GateUpPack_LinearLayer(child, self.mp_group) + # For Arctic model, bypass to all_reduce replacement for w2 weights + arctic_w2_all_reduce_linear = False + if 'Arctic' in str(self.module) and 'w2' in name: + arctic_w2_all_reduce_linear = True + # For MoE MLP model, e.g., deepseek and jamba + down_proj = False + if 'down_proj' in name: + down_proj = True + if name in self.all_reduce_linears or arctic_w2_all_reduce_linear or down_proj: + setattr(child, "replaced", True) if self.conv_linear_layer: - child.weight.data = child.weight.data.transpose(-1, -2).contiguous() - data = child.weight.data.split(get_shard_size_list( - weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size), - dim=1) - data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() - del data + return Conv_LinearALlreduce(child, self.mp_group, name=name) + elif name == "lm_head" or name == 'embed_out': + return LmHeadLinearAllreduce(child, self.mp_group) - setattr(child, "replaced", True) - if name == "lm_head" or name == 'embed_out': - return LmHeadLinearAllreduce( - torch.nn.parameter.Parameter(data_dc, requires_grad=False), dist.get_rank(), dist.get_world_size(), - child.bias if child.bias is None else torch.nn.parameter.Parameter( - child.bias.to(get_accelerator().current_device_name())), self.mp_group) - return LinearAllreduce(torch.nn.parameter.Parameter(data_dc, requires_grad=False), child.bias if child.bias is None else \ - torch.nn.parameter.Parameter(child.bias.to(get_accelerator().current_device_name())), self.mp_group) + return LinearAllreduce(child, self.mp_group, name=name) else: - # if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size] - # else [weight_shape[0] // mp_size, weight_shape[1]] + setattr(child, "replaced", True) if self.conv_linear_layer: - child.weight.data = child.weight.data.transpose(-1, -2).contiguous() - - if require_tp_fused_qkvw(name, self.mp_size): - #for detecting fused type - module_str = str(self.module).strip() - #The copy is a regular copy, The shape of dst and src is the same - data_dc = prepare_tp_fused_qkvw(module_str, child.weight.data, self.mp_size, mp_replace.gpu_index) + conv_LinearLayer(child, self.mp_group) + elif require_tp_fused_qkvw(name, self.mp_size): + #Check and handle fused qkv for TP + return fused_LinearLayer(child, self.mp_group, fused_module=self.module) - bias_data_dc = None if child.bias is None else prepare_tp_fused_qkvw( - module_str, child.bias.data, self.mp_size, mp_replace.gpu_index).to( - get_accelerator().current_device_name()) - else: - data = child.weight.data.split(get_shard_size_list(weight_shape[0], self.mp_size), - dim=1 if self.conv_linear_layer else 0) - data_dc = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()).clone().detach() - del data - - if child.bias is not None: - bias_data = child.bias.data.split(get_shard_size_list( - weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size), - dim=0) - bias_data = bias_data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) - bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False) - del bias_data - else: - bias_data_dc = None - - setattr(child, "replaced", True) - return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc.to(get_accelerator().current_device_name()), requires_grad=False), \ - bias=bias_data_dc) + return LinearLayer(child, self.mp_group, 
name=name) def _slice_embedding(self, child, name, conv_linear_layer): if getattr(child, "replaced", False) == True: @@ -371,11 +407,11 @@ def _slice_embedding(self, child, name, conv_linear_layer): if hasattr(child.weight, 'ds_tensor'): data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) else: - data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1) + data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size, name), dim=1) data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name()) data = torch.nn.parameter.Parameter(data, requires_grad=False) - new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size)) + new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size, name)) new_embedding.weight.data.copy_(data) setattr(child, "replaced", True) return new_embedding @@ -383,10 +419,14 @@ def _slice_embedding(self, child, name, conv_linear_layer): def update_mp_params(self, child): if getattr(child, "replaced", False) == True: return - for param in [ - "n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads", - "all_head_size", "embed_dim", "hidden_size", "num_key_value_heads", "num_kv_heads" - ]: + param_list = [ + "n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads", "all_head_size", + "embed_dim", "hidden_size", "num_key_value_heads", "num_kv_heads", "kv_n_heads", "d_model", + "num_attention_heads_per_partition", "num_multi_query_groups_per_partition", "hidden_size_per_partition" + ] + for param in param_list: + if "Yuan" in str(child) and 'embed_dim' in param_list: + param_list.remove('embed_dim') if hasattr(child, param): param_val = getattr(child, param) setattr(child, param, get_shard_size(param_val, self.mp_size)) @@ -446,11 +486,15 @@ def _replace_module(self, r_module, prev_name='', prev_class_name=''): def get_model_num_kv_heads(self, config): num_kv_heads = None - kv_head_names = ['num_key_value_heads', 'num_attention_heads', 'n_heads'] + # multi_query_group_num is for chatglm2 & chatglm3 + kv_head_names = [ + 'multi_query_group_num', 'num_kv_heads', 'num_key_value_heads', 'num_attention_heads', 'n_heads', + 'attention_heads' + ] for name in kv_head_names: if hasattr(config, name): num_kv_heads = getattr(config, name) - if num_kv_heads != None: + if num_kv_heads is not None: break return num_kv_heads diff --git a/deepspeed/module_inject/auto_tp_model_utils.py b/deepspeed/module_inject/auto_tp_model_utils.py index 51e52e3258dd..a71b1a54d6f6 100644 --- a/deepspeed/module_inject/auto_tp_model_utils.py +++ b/deepspeed/module_inject/auto_tp_model_utils.py @@ -61,6 +61,16 @@ def build_bloom_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) +def get_alibi_mask(self, tensor, seq_length_with_past): + mask = self.get_alibi_mask_orig(tensor, seq_length_with_past) + if not self.training and dist.is_initialized(): + num_heads_per_rank = get_shard_size(self.n_head, dist.get_world_size()) + offset = sum(get_shard_size_list(self.n_head, dist.get_world_size())[0:dist.get_rank()]) + mask = mask[offset:num_heads_per_rank + offset, :seq_length_with_past, :seq_length_with_past] + + return mask + + def build_mpt_atten_bias_tensor(self, device, dtype, diff --git a/deepspeed/module_inject/containers/bloom.py 
b/deepspeed/module_inject/containers/bloom.py index 05f30eec8d85..7a9b9ca2065b 100644 --- a/deepspeed/module_inject/containers/bloom.py +++ b/deepspeed/module_inject/containers/bloom.py @@ -19,16 +19,30 @@ class DS_BloomContainer(MetaTensorContainer, HybridEngineContainer, BaseTransformerContainer): def __init__(self, **kwargs): + # Check transformers version, error if > 4.43.4 (breaks at 4.44.0) + from importlib.metadata import version + v_transformers = version('transformers') + vers = v_transformers.split('.') + major = int(vers[0]) + minor = int(vers[1]) + if major > 4 or (major == 4 and minor > 43): + import sys + sys.exit( + f"Transformers version {v_transformers} exceeds version 4.43.4! After transformers version 4.43.4, BLOOM inference with DeepSpeed is no longer supported." + ) + super().__init__(**kwargs) # All model specific things should be defined here instead of the base class. self.bigscience_bloom = True + self.triangular_masking = False def create_module(self, config=None): _config = config if config is not None else self.ds_model_config self.module = DeepSpeedBloomInference(_config, mp_group=self.mp_group) self.module.config.scale_attention = self.scale_attention + self.module.config.invert_mask = False return self.module def attention_qkv_mp(self, mp_replace, reversed_dim=False): diff --git a/deepspeed/module_inject/containers/features/meta_tensor.py b/deepspeed/module_inject/containers/features/meta_tensor.py index 5fb55bc74339..57b136663be3 100644 --- a/deepspeed/module_inject/containers/features/meta_tensor.py +++ b/deepspeed/module_inject/containers/features/meta_tensor.py @@ -60,7 +60,7 @@ def load_params(self, module, sd, weight_quantizer, mp_replace, prefix): layer of the model for searching the parameter's name in a checkpoint file. For more information of how this is used please see - https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py + https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/load_checkpoint.py 2. `split_qkv` (Default: True): we use this flag when splitting the qkv parameter into heads. 
If it is False, it means the heads of q, k, and v are stored together and needs to split in the diff --git a/deepspeed/module_inject/containers/llama.py b/deepspeed/module_inject/containers/llama.py index f6157e5cdfed..7af333dc1ee4 100644 --- a/deepspeed/module_inject/containers/llama.py +++ b/deepspeed/module_inject/containers/llama.py @@ -129,12 +129,10 @@ def __init__(self, client_module, inference=True): def get_hidden_heads(self): hidden_heads = ( - getattr(self.client_module.self_attn.q_proj.weight, "ds_shape", - self.client_module.self_attn.q_proj.weight.shape)[1], + self.client_module.self_attn.q_proj.in_features, self.client_module.self_attn.num_heads, self.client_module.input_layernorm.variance_epsilon, - getattr(self.client_module.mlp.gate_proj.weight, "ds_shape", - self.client_module.mlp.gate_proj.weight.shape)[0], + self.client_module.mlp.gate_proj.out_features, ) return hidden_heads diff --git a/deepspeed/module_inject/containers/unet.py b/deepspeed/module_inject/containers/unet.py index 4e15699dc5a1..481792655531 100644 --- a/deepspeed/module_inject/containers/unet.py +++ b/deepspeed/module_inject/containers/unet.py @@ -17,6 +17,8 @@ def __init__(self): try: import diffusers self._orig_layer_class = diffusers.models.unet_2d_condition.UNet2DConditionModel + except AttributeError: + self._orig_layer_class = diffusers.models.unets.unet_2d_condition.UNet2DConditionModel except ImportError: self._orig_layer_class = None diff --git a/deepspeed/module_inject/containers/vae.py b/deepspeed/module_inject/containers/vae.py index d7c9642768cc..d26d0ef77ca9 100644 --- a/deepspeed/module_inject/containers/vae.py +++ b/deepspeed/module_inject/containers/vae.py @@ -13,10 +13,16 @@ def __init__(self): super().__init__() try: import diffusers - if hasattr(diffusers.models.vae, "AutoencoderKL"): + if hasattr(diffusers.models, "autoencoders"): + # Diffusers >= 0.25.0 + # Changes location to 'autoencoders' directory + self._orig_layer_class = diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL + elif hasattr(diffusers.models.vae, "AutoencoderKL"): + # Diffusers < 0.12.0 self._orig_layer_class = diffusers.models.vae.AutoencoderKL else: - # Diffusers >= 0.12.0 changes location of AutoencoderKL + # Diffusers >= 0.12.0 & < 0.25.0 + # Changes location of AutoencoderKL self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL except ImportError: self._orig_layer_class = None diff --git a/deepspeed/module_inject/fusedqkv_utils.py b/deepspeed/module_inject/fusedqkv_utils.py index 2e8f6b5917ed..0609c6001dd2 100644 --- a/deepspeed/module_inject/fusedqkv_utils.py +++ b/deepspeed/module_inject/fusedqkv_utils.py @@ -4,8 +4,7 @@ # DeepSpeed Team import torch from deepspeed.utils.logging import warning_once -from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads -import re +from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list, get_num_kv_heads, get_n_embd, get_num_attention_heads def split_by_qkvlist_and_refuse(qkv_list, split_size, split_dim=0, cat_dim=0): @@ -17,7 +16,7 @@ def split_by_qkvlist_and_refuse(qkv_list, split_size, split_dim=0, cat_dim=0): def require_tp_fused_qkvw(name, mp_size): - fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv'] + fused_qkvw_name_list = ['qkv_proj', 'query_key_value', 'attn.Wqkv', 'self_attn.W_pack', 'c_attn'] if mp_size == 1: return False @@ -27,8 +26,10 @@ def require_tp_fused_qkvw(name, mp_size): return False -def prepare_tp_fused_qkvw(module_str, src, mp_size, 
gpu_index): - if src == None: +def prepare_tp_fused_qkvw(module, src, mp_size, gpu_index): + + module_str = str(module).strip() + if src is None: return fused_type_dict = { 'CodeGenBlock': 'codegentype', @@ -36,6 +37,12 @@ def prepare_tp_fused_qkvw(module_str, src, mp_size, gpu_index): 'GLMBlock': 'glmtype', "MPTBlock": 'glmtype', "MptBlock": 'glmtype', + "BaichuanLayer": 'glmtype', + "QWenBlock": 'qwentype', + "FalconDecoderLayer": 'bloomtype', + "GPTBigCodeBlock": 'bigcodetype', + "DecoderLayer": 'glmtype', + "Phi3DecoderLayer": "phi3type" } def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): @@ -60,11 +67,24 @@ def _codegen_type_transpose(input, mp_size, codegen_mp_num=4): def _glm_type_transpose(input, mp_size): #input : [3*hidden_dim, hidden_dim](weight) or [3*hidden_dim](bias) - shape = input.shape - src_split = torch.split(input, shape[0] // 3, dim=0) - - split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size)) - return split_fusedqkv[gpu_index] + # For chatglm2 & chatglm3(kv_heads=2), need to special handle. + if get_num_kv_heads() == 2: + shape = input.shape + hidden_dim = get_n_embd() + kv_dim = (shape[0] - hidden_dim) // get_num_kv_heads() + q = input[:hidden_dim] + k = input[hidden_dim:hidden_dim + kv_dim] + v = input[hidden_dim + kv_dim:] + q_split = q.split(get_shard_size_list(q.shape[0], mp_size), dim=0) + k_split = k.split(get_shard_size_list(k.shape[0], mp_size), dim=0) + v_split = v.split(get_shard_size_list(v.shape[0], mp_size), dim=0) + return torch.cat((q_split[gpu_index], k_split[gpu_index], v_split[gpu_index]), dim=0) + else: + shape = input.shape + src_split = torch.split(input, shape[0] // 3, dim=0) + + split_fusedqkv = split_by_qkvlist_and_refuse(src_split, get_shard_size_list(shape[0] // 3, mp_size)) + return split_fusedqkv[gpu_index] def _bloom_type_transpose(input, mp_size): shape = input.shape @@ -72,7 +92,36 @@ def _bloom_type_transpose(input, mp_size): split_fusedqkv = input.split(get_shard_size_list(shape[0], mp_size), dim=0) return split_fusedqkv[gpu_index] - def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None): + def _qwen_type_transpose(input, mp_size, module): + if not hasattr(module, "_ds_fusedqkv_entered"): + # Adjust splitting absolute value variables + setattr(module, "_ds_fusedqkv_entered", True) + module.attn.split_size = get_shard_size(module.attn.split_size, mp_size) + return _glm_type_transpose(input, mp_size) + + def _bigcode_type_transpose(input, mp_size): + n_embd = get_n_embd() + q = input[:n_embd] + kv = input[n_embd:] + shape = q.shape + split_q = q.split(get_shard_size_list(shape[0], mp_size), dim=0) + return torch.cat((split_q[gpu_index], kv), dim=0) + + def _phi3_type_transpose(input, mp_size): + num_kv_heads = get_num_kv_heads() + num_heads = get_num_attention_heads() + hidden_size = input.shape[1] + head_dim = hidden_size // num_heads + q_pos = input.shape[0] - 2 * num_kv_heads * head_dim + q = input[:q_pos] + k = input[q_pos:q_pos + num_kv_heads * head_dim] + v = input[q_pos + num_kv_heads * head_dim:] + split_q = q.split(get_shard_size_list(q.shape[0], mp_size), dim=0) + split_k = k.split(get_shard_size_list(k.shape[0], mp_size), dim=0) + split_v = v.split(get_shard_size_list(v.shape[0], mp_size), dim=0) + return torch.cat((split_q[gpu_index], split_k[gpu_index], split_v[gpu_index]), dim=0) + + def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None, module=None): # suppose num_heads=n, q(n)_w means the n-th q head linear weight, the weight format are as following # 
bloomtype: [q(1)_w,k(1)_w,v(1)_w,q(2)_w,k(2)_w,v(2)_w,...,q(n)_w,k(n)_w,v(n)_w] @@ -85,12 +134,98 @@ def _transpose_fused_qkvw(src, mp_size, fused_qkv_type=None): return _codegen_type_transpose(src, mp_size) elif fused_qkv_type == 'glmtype': return _glm_type_transpose(src, mp_size) + elif fused_qkv_type == 'qwentype': + return _qwen_type_transpose(src, mp_size, module) + elif fused_qkv_type == 'bigcodetype': + return _bigcode_type_transpose(src, mp_size) + elif fused_qkv_type == 'phi3type': + return _phi3_type_transpose(src, mp_size) raise ValueError("unknown fused_qkv_type") - for module_name, fused_type in fused_type_dict.items(): - if re.search(module_name, module_str): - return _transpose_fused_qkvw(src, mp_size, fused_type) + module_name_matches = [k for k in fused_type_dict.keys() if k in module_str] + if module_name_matches: + # There can be overlap with matches (e.g., "DecoderLayer" and "FalconDecoderLayer"). + # We take the longest matching module_name + module_name = max(module_name_matches, key=len) + fused_type = fused_type_dict[module_name] + return _transpose_fused_qkvw(src, mp_size, fused_type, module) warning_once(f"Unrecognized fusedkqv weight type, default to using bloom type," f"please check in prepare_tp_fused_qkvw() to avoid potential calculation errors") return _bloom_type_transpose(src, mp_size) + + +# For share qk type: +# q = [q1,...,q_{n/4}, q_{n/2+1},...,q_{3n/4}, k1,...,k_{n/4}, k_{n/2+1},...,k_{3n/4}] +# k = [q_{n/4+1},...,q_{n/2}, q_{3n/4+1},...,qn, k_{n/4+1},...,k_{n/2}, k{3n/4+1},...,kn] +# Avoid modifying the modeling code. We adjust the value and oproj weight to fit this qk type. +def shard_value_with_share_qk( + weight, + bias, + rank, + world_size, + shard_value=True # True -> shard_value; False -> shard_oproj +): + if shard_value: + total_size = weight.shape[0] + weight_cat_dim = 0 + else: + total_size = weight.shape[1] + weight_cat_dim = 1 + num_heads = get_num_kv_heads() + head_dim = total_size // num_heads + assert (num_heads % world_size == 0) + if world_size > num_heads // 2: + RuntimeError(f"world_size {world_size} is larger than half of num_heads {num_heads}") + head_per_rank = num_heads // world_size + q_head_start = rank * head_per_rank + # mapping q_head to v_head + v_head_ids = [] + i = 0 + # mapping neighbor q_head to v_head + while i < head_per_rank: + v_head_ids.append(q_head_start // 2) + q_head_start += 2 + i = i + 2 + + # mapping neighbor k_head to v_head + v_head_ids.extend([i + num_heads // 2 for i in v_head_ids]) + sharded_weight = [] + sharded_bias = [] + for head_id in v_head_ids: + if shard_value: + sharded_weight.append(weight[head_id * head_dim:(head_id + 1) * head_dim]) + if bias is not None: + sharded_bias.append(bias.data[head_id * head_dim:(head_id + 1) * head_dim]) + else: + sharded_weight.append(weight[:, head_id * head_dim:(head_id + 1) * head_dim]) + sharded_weight = torch.cat(sharded_weight, dim=weight_cat_dim) + if bias is not None: + if shard_value: + sharded_bias = torch.cat(sharded_bias, dim=0) + else: + bias = bias / float(world_size) + return torch.nn.Parameter(sharded_weight), torch.nn.Parameter(sharded_bias) + else: + return torch.nn.Parameter(sharded_weight), None + + +# For phi3 with chunk mlp, adjust the weight order. 
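# Editor's note: an illustrative sketch, not part of this patch, of the chunked
# gate/up sharding that shard_chunk_mlp() below performs. For simplicity this uses
# an even split; the real code uses get_shard_size_list() for possibly uneven shards.
import torch

def shard_gate_up_even(weight, rank, world_size):
    gate, up = weight.chunk(2, dim=0)                  # fused [gate; up] layout on dim 0
    gate_shard = gate.chunk(world_size, dim=0)[rank]
    up_shard = up.chunk(world_size, dim=0)[rank]
    return torch.cat((gate_shard, up_shard), dim=0)    # keep the [gate; up] layout per rank

if __name__ == "__main__":
    w = torch.arange(16.0).reshape(8, 2)               # 2*intermediate=8 rows, hidden=2 cols
    print(shard_gate_up_even(w, rank=0, world_size=2).shape)  # torch.Size([4, 2])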
+def shard_chunk_mlp( + weight, + bias, + rank, + world_size, +): + weight_gate, weight_states = weight.chunk(2, dim=0) + total_size = weight_gate.shape[0] + split_weight_gate = weight_gate.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0) + split_weight_states = weight_states.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0) + shard_weight = torch.cat((split_weight_gate[rank], split_weight_states[rank]), dim=0) + if bias is not None: + bias_gate, bias_states = bias.chunk(2, dim=0) + split_bias_gate = bias_gate.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0) + split_bias_states = bias_states.split(get_shard_size_list(total_size, world_size, "mlp"), dim=0) + return shard_weight, torch.cat((split_bias_gate[rank], split_bias_states[rank]), dim=0) + + return shard_weight, None diff --git a/deepspeed/module_inject/layers.py b/deepspeed/module_inject/layers.py index 969826ad0289..0c673225a732 100644 --- a/deepspeed/module_inject/layers.py +++ b/deepspeed/module_inject/layers.py @@ -7,49 +7,573 @@ from deepspeed import comm as dist from torch import nn from torch.nn import functional as F - from torch.nn.parameter import Parameter from deepspeed.accelerator import get_accelerator from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list +from abc import ABC, abstractmethod +from typing import Iterable, Any, Optional, List, Tuple +from .fusedqkv_utils import shard_value_with_share_qk, shard_chunk_mlp, prepare_tp_fused_qkvw +from deepspeed.runtime.tensor_parallel import AUTOTP_MODE +from copy import deepcopy +from typing import Union + +DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.INFERENCE +DS_IS_REPLACED_MODULE = 'ds_is_replaced_module' +DS_TENSOR_MODEL_PARALLEL = 'tensor_model_parallel' + + +def get_auto_tp_mode(): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE + + +def is_autotp_training_mode(): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING + + +def set_autotp_mode(training=False): + """ + Set the DEEPSPEED_AUTOTP_MODE based on the training flag + """ + global DEEPSPEED_AUTOTP_MODE + if training: + DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.TRAINING + else: + DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.INFERENCE + + +def move(tensor, device): + # TODO: consider the timing of deletion + # to save host resources when DP > 1。 + + if tensor.is_meta: + # Keep tensor in meta device if tensor is meta. + return tensor + else: + # Using new tensors help in freeing memory (after split for example) was done before by calling clone(). + # Using copy=True instead of clone() will help in case of cpu --> cpu. + # Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced. + cloned_tensor = tensor.to(device, copy=True) + + # free the memory of the original tensor to reduce memory peak + # Equivalent to directly deleting the tensor reference outside the function. + # see https://github.com/microsoft/DeepSpeed/pull/4353 + tensor.data = torch.empty(0, device=tensor.device) + return cloned_tensor + + +class RowParallel(torch.autograd.Function): + """ + A custom autograd function for performing row-wise parallelism. + """ + + @staticmethod + def symbolic(graph, input): + """Symbolic function for tracing.""" + return input + + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: torch.Tensor, is_inference_mode: bool) -> torch.Tensor: + """ + Forward pass. 
+ """ + ctx.group = group + if group == None: + return input + if is_inference_mode: + dist.inference_all_reduce(input, group=group) + else: + dist.all_reduce(input.contiguous(), group=group) + return input + + @staticmethod + def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor, None]: + """ + Backward pass. + """ + return None, grad_output, None + + +class ColumnParallel(torch.autograd.Function): + """ + Custom autograd function for column-wise parallelism. + """ + + @staticmethod + def symbolic(graph, input): + """Symbolic function for tracing.""" + return dist.all_reduce(input.contiguous(), dist.get_tensor_model_parallel_group()) + @staticmethod + def forward(ctx: Any, group: dist.ProcessGroup, input: torch.Tensor) -> torch.Tensor: + """ + Forward pass. + """ + ctx.group = group + return input + + @staticmethod + def backward(ctx: Any, grad_output: torch.Tensor) -> Tuple[None, torch.Tensor]: + """ + Backward pass. + """ + if ctx.group == None: + return None, grad_output + + dist.all_reduce(grad_output.contiguous(), group=ctx.group) + return None, grad_output + + +class TensorParallel_Layer(nn.Module, ABC): + """ + A base class for model layers with tensor parallelism support. + This class is designed to be extended by specific layers that require distributed + operations and parameter gather/partitioning during inference or training. + + Attributes: + mode (str): The mode of operation[INFERENCE or TRAINING], default is "INFERENCE". + mp_group (Optional[dist.ProcessGroup]): The process group used for model parallelism. + tp_world_size (int): The world size of tensor parallelism, i.e., the number of parallel workers. + tp_index (int): The rank (ID) of the current worker in tensor parallelism. + support_training (bool): Flag indicating whether the layer supports training (default: False). + name (Optional[str]): The name of the layer, if provided. + """ -class LinearAllreduce(nn.Module): + def __init__(self, mp_group: Optional[dist.ProcessGroup], **kwargs: Any): + """ + Initializes the TensorParallel_Layer with optional model parallelism group and layer name. + + Args: + mp_group (Optional[dist.ProcessGroup]): The process group for model parallelism. + If None, no model parallelism is set. + """ + super().__init__() + self.support_training: bool = False + if mp_group is not None: + self.mp_group = mp_group + self.tp_world_size: int = dist.get_world_size(self.mp_group) + self.tp_index: int = dist.get_rank(mp_group) + + # backward compatibility + self.world_size = self.tp_world_size + self.rank = self.tp_index + + self.name = getattr(self, 'name', None) + if kwargs.get('name') is not None: + self.name = kwargs.get('name') # Set the layer name if provided. + + @abstractmethod + def forward(self, input): + """ + Forward pass method. Must be implemented by subclasses to define layer-specific operations. + """ + pass + + @abstractmethod + def gather_params(self, params_list): + """ + Gathers parameters across devices for distributed training. Must be implemented by subclasses in "TRAINING" mode. + """ + pass + + @abstractmethod + def _tp_partition(self, params_list: List[torch.Tensor]): + """ + Partitions the parameters for tensor parallelism. + It is necessary to ensure that this function only involves the logic of params partitioning. + """ + pass + + def config_tp_params(self, weight): + """ + Configures the weight tensor for training with tensor parallelism. 
This includes enabling gradients + and associating necessary methods for parameter gathering and partitioning. + + Args: + weight (Optional[torch.Tensor]): The weight tensor to configure for tensor parallelism. + If None, no action is taken. + """ + # # The RNG states have already been synchronized in init_inference. + if self.is_training_mode(): + assert self.support_training, "No implementation of backward." + if weight is not None: + if self.is_training_mode(): + if weight.requires_grad is None: + weight.requires_grad = True + else: + weight.requires_grad = False + setattr(weight, DS_TENSOR_MODEL_PARALLEL, True) + setattr(weight, DS_IS_REPLACED_MODULE, True) + weight.gather_params = self.gather_params + weight._tp_partition = self._tp_partition + + def is_training_mode(self): + global DEEPSPEED_AUTOTP_MODE + return DEEPSPEED_AUTOTP_MODE == AUTOTP_MODE.TRAINING + + def __deepcopy__(self, memo): + # This function is designed for + # 'mp_group' (a 'ProcessGroup') cannot be pickled during deepcopy in some usage. + cls = self.__class__ + new_obj = cls.__new__(cls) + + for key, value in vars(self).items(): + if key == 'mp_group': + new_obj.mp_group = self.mp_group + else: + setattr(new_obj, key, deepcopy(value, memo)) + + memo[id(self)] = new_obj + return new_obj + + def extra_repr(self): + if self.weight is not None: + out_features, in_features = self.weight.shape[-2:] if self.weight is not None else (None, None) + dtype = self.weight.dtype if self.weight is not None else None + extra_repr_str = "in_features={}, out_features={}, bias={}, dtype={}".format( + in_features, out_features, self.bias is not None, dtype) + return extra_repr_str + + +class GatherReplacedLayerParams: + """ + A context manager for gathering parameters of a replaced layer, enabling partitioning and gathering functionality + based on the configuration of the model. + """ - def __init__(self, weight, bias=None, mp_group=None): - super(LinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group + def __init__(self, + params: Union[Iterable[torch.Tensor], torch.Tensor], + module: torch.nn.Module, + enabled: bool = True): + """ + Initialize the context manager to handle parameter gathering and partitioning for a replaced layer. + + Args: + params (Iterable or torch.Tensor): A collection or single parameter to manage. + module (torch.nn.Module): The module that these parameters belong to. + enabled (bool): Flag indicating whether the parameter management is enabled (default: True). + """ + self.enabled = enabled + self.module = module + if not enabled: + return + + # Ensure params is a list, whether it's a single param or iterable (e.g., model.parameters()) + if isinstance(params, Iterable) and not isinstance(params, torch.Tensor): + self.params: List[torch.Tensor] = list(params) # Convert generators to a list for multiple iterations + else: + self.params: List[torch.Tensor] = [params] # Wrap single parameter in a list for uniform processing + + # Check if the parameters belong to a replaced layer (indicated by a specific attribute) + if not any(self._is_replaced_module_weight(p) for p in params): + self.enabled = False + return + + def _is_replaced_module_weight(self, param: torch.Tensor) -> bool: + """ + Helper function to determine if a parameter belongs to a replaced module. + + Args: + param (torch.Tensor): The parameter to check. + + Returns: + bool: True if the parameter belongs to a replaced module, False otherwise. 
+ """ + return getattr(param, DS_IS_REPLACED_MODULE, False) + + def __enter__(self) -> None: + """ + Enter the context manager. If enabled, gather parameters for the replaced module. + """ + if self.enabled: + self.params[0].gather_params(self.params) + + def __exit__(self, exc_type, exc_value, traceback) -> None: + """ + Exit the context manager. If enabled, partition the parameters for the replaced module. + """ + #TODO : Check whether there are any missing attributes. + if self.enabled: + self.params[0]._tp_partition(self.params) + + +class LinearAllreduce(TensorParallel_Layer): + + def __init__(self, module, mp_group, **kwargs): + super(LinearAllreduce, self).__init__(mp_group, **kwargs) + self.weight = module.weight + self.bias = module.bias + + self._tp_partition([self.weight, self.bias]) + self.support_training = True + self.config_tp_params(self.weight) + if self.bias is not None: + self.config_tp_params(self.bias) def forward(self, input): output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.mp_group is not None: - dist.inference_all_reduce(output, group=self.mp_group) + output = RowParallel.apply(self.mp_group, output, not self.is_training_mode()) if self.bias is not None: output += self.bias return output + @torch.no_grad() + def gather_params(self, params_list): + + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't gather bias + return + params_list[idx].data_partition = param.data + param = param.transpose(0, 1).contiguous() + output_param = torch.empty(self.tp_world_size * param.shape[0], + param.shape[1], + dtype=param.dtype, + device=param.device) + dist.all_gather_into_tensor(output_param, param, group=self.mp_group) + params_list[idx].data = output_param.transpose(0, 1).contiguous() + return + + @torch.no_grad() + def _tp_partition(self, params_list): + + if not self.is_training_mode(): + self.uneven_partition(params_list) + return -class LmHeadLinearAllreduce(nn.Module): - - def __init__( - self, - weight, - rank, - world_size, - bias=None, - mp_group=None, - ): - super(LmHeadLinearAllreduce, self).__init__() - self.weight = weight - self.bias = bias - self.mp_group = mp_group - self.rank = rank - self.world_size = world_size + else: + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't slipt bias + return + _partition = torch.chunk(param, self.tp_world_size, dim=-1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + def uneven_partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None or idx > 0: + # don't slipt bias + return + assert self.name is not None, "The module name must be provided in the initialization." + _partition = params_list[idx].split(get_shard_size_list(params_list[idx].shape[1], self.tp_world_size, + self.name), + dim=1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + params_list[idx].data = _partition + + +#remove kwargs from partition. 
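# Editor's note: an illustrative sketch, not part of this patch, of the row-parallel
# math implemented by LinearAllreduce above: the weight is sharded along the
# input-feature dimension and per-rank partial products are summed. The plain Python
# sum() here stands in for the all-reduce that RowParallel.apply performs across ranks.
import torch

def row_parallel_reference(x, weight, world_size):
    x_shards = x.chunk(world_size, dim=-1)         # input features split across ranks
    w_shards = weight.chunk(world_size, dim=-1)    # weight [out, in] split along 'in'
    partials = [xs @ ws.transpose(-1, -2) for xs, ws in zip(x_shards, w_shards)]
    return sum(partials)                           # == all-reduce of per-rank partials

if __name__ == "__main__":
    x, w = torch.randn(3, 8), torch.randn(5, 8)
    full = x @ w.transpose(-1, -2)
    assert torch.allclose(row_parallel_reference(x, w, world_size=2), full, atol=1e-5)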
+class LinearLayer(TensorParallel_Layer): + + def __init__(self, module, mp_group=None, skip_partition=False, **kwargs): + super(LinearLayer, self).__init__(mp_group, **kwargs) + self.weight = module.weight + self.bias = module.bias + if not skip_partition: + self._tp_partition([self.weight, self.bias]) + self.support_training = True + self.config_tp_params(self.weight) + if self.bias is not None: + self.config_tp_params(self.bias) def forward(self, input): - input_shard_size = get_shard_size(input.shape[-1], self.world_size) - input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.world_size)[0:self.rank]) + if getattr(self, 'mp_group', None) is not None: + input = ColumnParallel.apply(self.mp_group, input) + output = torch.matmul(input, self.weight.transpose(-1, -2)) + if self.bias is not None: + output += self.bias + return output + + @torch.no_grad() + def gather_params(self, params_list): + # Does not support uneven shard. + for idx, param in enumerate(params_list): + + params_list[idx].data_partition = param.data + output_param = torch.empty(self.tp_world_size * param.shape[0], + param.shape[1], + dtype=param.dtype, + device=param.device) + dist.all_gather_into_tensor(output_param, param, group=self.mp_group) + params_list[idx].data = output_param.contiguous() + + @torch.no_grad() + def _tp_partition(self, params_list): + + if not self.is_training_mode(): + self.uneven_partition(params_list) + return + for idx, param in enumerate(params_list): + if param is None: + return + #split bias if provide + _partition = torch.chunk(param, self.tp_world_size, dim=0)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + def uneven_partition(self, params_list): + + for idx, param in enumerate(params_list): + if param is None: + #split bias if provide + return + assert self.name is not None, "The module name must be provided in the initialization." + _partition = params_list[idx].split(get_shard_size_list(params_list[idx].shape[0], self.tp_world_size, + self.name), + dim=0)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + # for bwc + @classmethod + def from_weights(cls, weight_shape=None, dtype=torch.half, weight=None, bias=None): + if weight is not None: + in_features = weight.shape[1] + out_features = weight.shape[0] + linear = nn.Linear(in_features, out_features, bias=(bias is not None)) + linear.weight.data = weight + if bias is not None: + linear.bias.data = bias + else: + in_features = weight_shape[1] + out_features = weight_shape[0] + linear = nn.Linear(in_features, out_features, bias=(bias is not None)) + return cls(linear, skip_partition=True) + + +class FusedModuleWrapper: + + def __init__(self, fused_module: nn.Module): + self.fused_module = fused_module + + def __getattr__(self, module): + return self.fused_module + + +class fused_LinearLayer(LinearLayer): + + def __init__(self, module, mp_group, skip_partition=False, **kwargs): + assert kwargs.get('fused_module') is not None, "'fused_module' is required but not provided" + # Use the warp class to avoid module circular references. 
+ self.fused_module = FusedModuleWrapper(kwargs.get('fused_module')) + super().__init__(module, mp_group, skip_partition, **kwargs) + + @torch.no_grad() + def _tp_partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None: + return + + _partition = prepare_tp_fused_qkvw(self.fused_module.module, param, self.tp_world_size, self.tp_index) + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + +class conv_LinearLayer(LinearLayer): + + @torch.no_grad() + def _tp_partition(self, params_list): + weight = None + bias = None + if len(params_list) == 1: + weight = params_list[0] + elif len(params_list) == 2: + weight, bias = params_list[0], params_list[1] + _partition = weight.data.split(get_shard_size_list(weight.shape[0], self.tp_world_size, self.name), + dim=1)[self.tp_index] + _partition = move(_partition, get_accelerator().current_device_name()).detach() + weight.data = _partition + + if bias is not None: + _partition = bias.data.split(get_shard_size_list(weight.shape[1], self.tp_world_size, self.name), + dim=0)[self.tp_index] + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + bias.data = _partition + + +#override the subclasses related to weight splitting. +class Yuan_LinearAllreduce(LinearAllreduce): + + #Yuan2 + @torch.no_grad() + def _tp_partition(self, params_list): + weight, bias = shard_value_with_share_qk(params_list[0].data, params_list[1], self.tp_index, + self.tp_world_size, False) + params_list[0].data = weight + if bias is not None: + params_list[1].data = bias + + +class Yuan_LinearLayer(LinearLayer): + #Yuan2 + @torch.no_grad() + def _tp_partition(self, params_list): + weight, bias = shard_value_with_share_qk(params_list[0].data, params_list[1], self.tp_index, + self.tp_world_size, True) + params_list[0].data = move(weight, get_accelerator().current_device_name()).detach() + if bias is not None: + params_list[1].data = move(bias, get_accelerator().current_device_name()).detach() + + +class GateUpPack_LinearLayer(LinearLayer): + # chatGLM2, chatGLM2 + @torch.no_grad() + def _tp_partition(self, params_list): + weight, bias = shard_chunk_mlp(params_list[0].data, params_list[1], self.tp_index, self.tp_world_size) + params_list[0].data = move(weight, device=get_accelerator().current_device_name()).detach() + if bias is not None: + params_list[1].data = move(bias, device=get_accelerator().current_device_name()).detach() + + +class Conv_LinearALlreduce(LinearAllreduce): + + @torch.no_grad() + def _tp_partition(self, params_list): + for idx, param in enumerate(params_list): + if param is None: + return + param.data = param.data.transpose(-1, -2).contiguous() + + _partition = param.split(get_shard_size_list(param.shape[0], self.tp_world_size, self.name), + dim=1)[self.tp_index] + + _partition = move(_partition, get_accelerator().current_device_name()).detach() + + params_list[idx].data = _partition + + +#override the subclasses related to fwd/bwd. +class LmHeadLinearAllreduce(LinearAllreduce): + + def __init__(self, module, mp_group, **kwargs): + # set the fixed name before partition + self.name = "lm_head" + + # In some tied_embedding cases, only the lm head is sharded, while the word embedding is not. + # Reinitialization is used to decouple them and prevent the word embedding from being sharded. + # This should also be effective for cases where both are sharded in tied_embedding scenarios. 
+ + # TODO: Training scenario-related tests, is it necessary to re-implement the vocab parallel module? + module.weight = nn.Parameter(module.weight.clone().detach()) + if hasattr(module, 'bias') and module.bias is not None: + module.bias = nn.Parameter(module.bias.clone().detach()) + super().__init__(module, mp_group, **kwargs) + + def forward(self, input): + input_shard_size = get_shard_size(input.shape[-1], self.tp_world_size, "lm_head") + input_shard_offset = sum(get_shard_size_list(input.shape[-1], self.tp_world_size, "lm_head")[0:self.tp_index]) output = torch.matmul(input[:, :, input_shard_offset:input_shard_offset + input_shard_size], self.weight.transpose(-1, -2)) if self.mp_group is not None: @@ -59,28 +583,66 @@ def forward(self, input): return output -class LinearLayer(nn.Module): +class TensorParallelConv2d(nn.Module): - def __init__(self, weight_shape=None, dtype=torch.half, weight=None, bias=None): - super(LinearLayer, self).__init__() - if weight is not None: - self.weight = weight - self.bias = bias + def __init__(self, conv, rank, world_size, shard_by_oc): + super().__init__() + self.rank = rank + self.world_size = world_size + self.shard_by_oc = shard_by_oc + self.shard_weights(conv) + + # Split along the input/output channel depending on whether it is the last conv layer. + def shard_weights(self, conv): + if self.shard_by_oc: + total_size = conv.weight.shape[0] else: - self.weight = Parameter( - torch.empty(weight_shape, dtype=dtype, device=get_accelerator().current_device_name())) + total_size = conv.weight.shape[1] + bias_data = None + cols_per_rank = [0] + for i in range(self.world_size - 1, -1, -1): + cols = total_size // self.world_size + if i < total_size % self.world_size: + cols += 1 + cols_per_rank.append(cols_per_rank[-1] + cols) + weight_data = conv.weight.data + if self.shard_by_oc: + # not last conv layer, split output channel + weight_data = weight_data[cols_per_rank[self.rank]:cols_per_rank[self.rank + 1]] + if conv.bias is not None: + bias_data = conv.bias.data[cols_per_rank[self.rank]:cols_per_rank[self.rank + 1]] + else: + # last conv layer, split input channel + weight_data = weight_data[:, cols_per_rank[self.rank]:cols_per_rank[self.rank + 1]] + if conv.bias is not None: + bias_data = conv.bias.data / float(self.world_size) + self.conv = nn.Conv2d(weight_data.shape[1], weight_data.shape[0], conv.kernel_size, conv.stride, conv.padding, + conv.dilation, conv.groups, conv.bias is not None, conv.padding_mode) + self.conv.weight = torch.nn.Parameter(weight_data) + if conv.bias is not None: + self.conv.bias = torch.nn.Parameter(bias_data) + del conv - self.bias = Parameter( - torch.empty(weight_shape[0], - dtype=dtype, - device=get_accelerator().current_device_name())) \ - if bias is not None else None + def forward(self, input: torch.Tensor) -> torch.Tensor: + return self.conv(input) - def forward(self, input): - output = torch.matmul(input, self.weight.transpose(-1, -2)) - if self.bias is not None: - output += self.bias - return output + +class TensorParallelOcShardConv2d(TensorParallelConv2d): + + def __init__(self, conv, rank, world_size): + super().__init__(conv, rank, world_size, True) + + +class TensorParallelIcShardConv2d(TensorParallelConv2d): + + def __init__(self, conv, rank, world_size): + super().__init__(conv, rank, world_size, False) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + out = self.conv(input) + if self.world_size > 1: + dist.inference_all_reduce(out) + return out class Normalize(nn.Module): @@ -129,7 +691,7 @@ def 
__init__(self, weight_shape=None, weight=None, bias=None): self.offset = 2 super().__init__(weight_shape, weight=weight) - def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): + def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0, position_ids: int = 0): """`input_ids_shape` is expected to be [bsz x seqlen].""" attention_mask = attention_mask.long() diff --git a/deepspeed/module_inject/load_checkpoint.py b/deepspeed/module_inject/load_checkpoint.py index a24d36344cdc..862628fa7b4b 100644 --- a/deepspeed/module_inject/load_checkpoint.py +++ b/deepspeed/module_inject/load_checkpoint.py @@ -236,7 +236,7 @@ def load_module_recursive(module, prefix='', level=0): child.weight.ds_id in all_ds_ids): prefix1 = all_ds_ids[child.weight.ds_id] if child.__class__ is nn.Linear: - child = LinearLayer(weight=all_ds_ids[child.weight.ds_id]) + child = LinearLayer.from_weights(weight=all_ds_ids[child.weight.ds_id]) setattr(module, name, child) continue child_params = list(child.parameters()) @@ -249,7 +249,9 @@ def load_module_recursive(module, prefix='', level=0): child = Normalize(dim=ds_shape[-1], dtype=child.weight.dtype, eps=child.eps) setattr(module, name, child) elif child.__class__ in [nn.Linear, ColumnParallelLinear, RowParallelLinear]: - child = LinearLayer(weight_shape=child.weight.shape, bias=child.bias) + child = LinearLayer.from_weights(weight_shape=child.weight.shape, + dtype=child.weight.dtype, + bias=child.bias) setattr(module, name, child) elif child.__class__ is OPTLearnedPositionalEmbedding: child = OPTEmbedding(weight_shape=ds_shape) diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py index fe32378613c9..ed94a5021fee 100644 --- a/deepspeed/module_inject/replace_module.py +++ b/deepspeed/module_inject/replace_module.py @@ -14,9 +14,10 @@ from deepspeed.accelerator import get_accelerator from .replace_policy import replace_policies, generic_policies from .auto_tp import AutoTP, ReplaceWithTensorSlicing, Loading - +from .layers import TensorParallelOcShardConv2d, TensorParallelIcShardConv2d +from deepspeed.module_inject.layers import is_autotp_training_mode from deepspeed import comm as dist -from deepspeed.module_inject.tp_shard import set_num_kv_heads +from deepspeed.module_inject.tp_shard import set_num_kv_heads, set_n_embd, set_num_attention_heads, set_tp_grain_size from .load_checkpoint import load_model_with_checkpoint import time @@ -267,17 +268,45 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None): #mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group) # 1. Create AutoTP object - _autotp = AutoTP(module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl) + _autotp = AutoTP(module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl, + config.keep_module_on_host) # 2. Set the tensor parallelism config _autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group) # 3. 
Try to get num_key_heads from model_config.num_key_value_heads - num_kv_heads = _autotp.get_model_num_kv_heads(model_config) + if hasattr(model_config, "vision_config"): + if "MllamaVisionEncoderLayer" in str(module): + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config) + elif hasattr(model_config, "text_config"): + num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) + else: + num_kv_heads = _autotp.get_model_num_kv_heads(model_config) # 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division set_num_kv_heads(num_kv_heads) + # 4.1 Get n_embd + n_embd = None + multi_query_n_embd_names = ['n_embd', 'hidden_size'] + for name in multi_query_n_embd_names: + if hasattr(model_config, name): + n_embd = getattr(model_config, name) + if n_embd != None: + break + + # 4.2 set n_embd + set_n_embd(n_embd) + + # 4.3 set attention_heads + if hasattr(model_config, 'num_attention_heads'): + set_num_attention_heads(getattr(model_config, 'num_attention_heads')) + + # 4.4 set tp_grain_size + set_tp_grain_size(config.tensor_parallel.tp_grain_size) + # 5. Set linear policies _autotp.update_linear_policies() @@ -294,7 +323,7 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): else: # copy relevant state from child -> new module - if config.replace_with_kernel_inject: + if not is_autotp_training_mode() and config.replace_with_kernel_inject: new_module = replace_with_policy(child, _policy, config.triangular_masking, @@ -306,6 +335,10 @@ def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None): return new_module def set_lm_head(module): + if is_autotp_training_mode(): + # we need to handle autoTP training mode separately. + return + embedding_weight = None for n, p in module.named_parameters(): if "word_embeddings." in n or "embed_tokens." in n or "wte." 
in n: @@ -314,13 +347,38 @@ def set_lm_head(module): module.lm_head, "weight") and module.lm_head.weight.is_meta: module.lm_head.weight = embedding_weight # enable tensor parallel for the last linear - if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and not module.lm_head.weight.is_meta: + if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and isinstance( + module.lm_head, torch.nn.Linear): module = replace_wo_policy(module, ("lm_head", ), 0, "lm_head") - elif hasattr(module, "embed_out") and hasattr(module.embed_out, - "weight") and not module.embed_out.weight.is_meta: + elif hasattr(module, "embed_out") and hasattr(module.embed_out, "weight") and isinstance( + module.embed_out, torch.nn.Linear): module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out") + elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"): + module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head") return module + def conv2d_parallel_shard_weights(model, rank, world_size): + # add conv policy + shard_oc_name = ["conv1"] + shard_ic_name = ["conv2"] + for name, sub_m in model.named_children(): + for l_name, l_sub_m in sub_m.named_children(): + if l_name in shard_oc_name: + TPConv2d = TensorParallelOcShardConv2d( + l_sub_m, + rank, + world_size, + ) + setattr(sub_m, l_name, TPConv2d) + if l_name in shard_ic_name: + TPConv2d = TensorParallelIcShardConv2d( + l_sub_m, + rank, + world_size, + ) + setattr(sub_m, l_name, TPConv2d) + conv2d_parallel_shard_weights(sub_m, rank, world_size) + if checkpoint_dict is not None and not config.replace_with_kernel_inject: # AutoTP shard loading checkpoint = checkpoint_dict["checkpoints"] @@ -334,12 +392,18 @@ def set_lm_head(module): checkpoint=checkpoint_file) pbar.update(1) gc.collect() - replaced_module = set_lm_head(replaced_module) + # conv2d tp module replace + # Now is for yuan model. Add model list and conv policy to decide whether to replace conv. 
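# Editor's note: an illustrative sketch, not part of this patch, of the two conv
# sharding modes used by TensorParallelOcShardConv2d / TensorParallelIcShardConv2d
# above, shown here with even splits and without the distributed all-reduce.
import torch
from torch import nn

def shard_conv_weight(conv, rank, world_size, shard_by_oc):
    if shard_by_oc:
        # earlier conv: split output channels, each rank produces a slice of channels
        return conv.weight.data.chunk(world_size, dim=0)[rank]
    # last conv: split input channels; rank outputs are partial sums to be all-reduced
    return conv.weight.data.chunk(world_size, dim=1)[rank]

if __name__ == "__main__":
    conv = nn.Conv2d(8, 16, kernel_size=3)
    print(shard_conv_weight(conv, 0, 2, shard_by_oc=True).shape)   # (8, 8, 3, 3)
    print(shard_conv_weight(conv, 0, 2, shard_by_oc=False).shape)  # (16, 4, 3, 3)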
+ if 'Yuan' in str(replaced_module): + conv2d_parallel_shard_weights(replaced_module, dist.get_rank(), dist.get_world_size()) else: replaced_module = replace_module(model=model, orig_class=orig_layer_impl, replace_fn=replace_fn, _replace_policy=config.injection_policy_tuple) + # AutoTP default set lm_head tp + if not config.replace_with_kernel_inject: + replaced_module = set_lm_head(replaced_module) quantizer = GroupQuantizer(q_int8=quantize) world_size = dist.get_world_size() if dist.is_initialized() else 1 @@ -359,7 +423,7 @@ def set_lm_head(module): pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards") for i in range(len(checkpoint)): - sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu')] + sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location='cpu', weights_only=False)] load_model_with_checkpoint(replaced_module, sd, mp_replace, @@ -381,7 +445,7 @@ def set_lm_head(module): os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j] for j in range(sd_count) ] - sds = [torch.load(ckpt_file, map_location='cpu') for ckpt_file in ckpt_files] + sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False) for ckpt_file in ckpt_files] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -401,7 +465,7 @@ def set_lm_head(module): pbar.update(1) ckpt_file = os.path.join(base_dir1, checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i] - sds = [torch.load(ckpt_file, map_location='cpu')] + sds = [torch.load(ckpt_file, map_location='cpu', weights_only=False)] load_model_with_checkpoint(replaced_module, sds, mp_replace, @@ -415,7 +479,7 @@ def set_lm_head(module): set_lm_head(replaced_module) print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec") - if config.save_mp_checkpoint_path is not None: + if not is_autotp_training_mode() and config.save_mp_checkpoint_path is not None: from collections import OrderedDict import json num_partitions = 8 @@ -440,9 +504,10 @@ def set_lm_head(module): if not dist.is_initialized() or dist.get_rank() == 0: print("Saving tp-sharded checkpoints") torch.save( - OrderedDict({k: v - for k, v in dict(replaced_module.state_dict()).items() - if transformer_name not in k}), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') + OrderedDict({ + k: v + for k, v in dict(replaced_module.state_dict()).items() if transformer_name not in k + }), f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}') dtype_reprs = { torch.float32: 'float32', @@ -563,7 +628,12 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No """ sd = None if checkpoint is not None: - sd = torch.load(checkpoint, map_location='cpu') + if checkpoint.endswith(".safetensors"): + from safetensors.torch import load_file + sd = load_file(checkpoint) + else: + sd = torch.load(checkpoint, map_location='cpu', weights_only=False) + policy = {} if orig_class is not None: policy.update({orig_class: (replace_fn, _replace_policy)}) @@ -578,7 +648,7 @@ def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=No policy.update({plcy._orig_layer_class: (replace_fn, plcy)}) assert len(policy.items()) > 0,\ "No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})." 
+\ - "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" + "You can find some samples here: https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py" replaced_module, _ = _replace_module(model, policy, state_dict=sd) return replaced_module @@ -597,7 +667,7 @@ def skip_level_0_prefix(model, state_dict): if key is None: key = re.match(r"(.*?)Model", model) # if keys start with 'model.', don't skip level 0 prefix - if state_dict != None: + if state_dict is not None: for item in state_dict.keys(): if re.match("^model[.]", item): return False diff --git a/deepspeed/module_inject/tp_shard.py b/deepspeed/module_inject/tp_shard.py index 8e2fa78d883f..ded262edcf61 100644 --- a/deepspeed/module_inject/tp_shard.py +++ b/deepspeed/module_inject/tp_shard.py @@ -12,28 +12,63 @@ def set_num_kv_heads(num): num_kv_heads = num +def set_num_attention_heads(num): + global num_attention_heads + num_attention_heads = num + + +def set_n_embd(num): + global n_embd + n_embd = num + + +def set_tp_grain_size(num): + global tp_grain_size + tp_grain_size = num + + def get_num_kv_heads(): global num_kv_heads - return num_kv_heads + if 'num_kv_heads' in globals(): + return num_kv_heads + return None + + +def get_num_attention_heads(): + global num_attention_heads + return num_attention_heads -def get_shard_size(total_size, mp_size, rank=None): +def get_shard_size(total_size, mp_size, name=None, rank=None): global num_kv_heads - # When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division - if num_kv_heads != None: - if (rank == None): - rank = dist.get_rank() + last_linear = ["lm_head", "embed_out"] + # MoE MLP layer use near even division will get better perf. 
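# Editor's note: an illustrative sketch, not part of this patch, of the uneven-shard
# arithmetic in get_shard_size() below, reduced to the kv-head branch and the
# near-even fallback (the tp_grain_size branch is omitted for brevity).
from typing import Optional

def shard_size(total_size: int, mp_size: int, rank: int, num_kv_heads: Optional[int]) -> int:
    if num_kv_heads and total_size % num_kv_heads == 0:
        # the first (num_kv_heads % mp_size) ranks receive one extra head
        my_heads = num_kv_heads // mp_size + (1 if rank < num_kv_heads % mp_size else 0)
        return total_size * my_heads // num_kv_heads
    # near-even fallback: spread the remainder over the first ranks
    return total_size // mp_size + (1 if rank < total_size % mp_size else 0)

if __name__ == "__main__":
    # 4 kv heads across 3 ranks for a 4096-wide projection -> [2048, 1024, 1024]
    print([shard_size(4096, 3, r, num_kv_heads=4) for r in range(3)])
    # near-even fallback: 10 rows across 3 ranks -> [4, 3, 3]
    print([shard_size(10, 3, r, num_kv_heads=None) for r in range(3)])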
+ moe_mlp_layer = ["gate_proj", "up_proj", "down_proj", "w1", "w2", "w3"] + not_moe_mlp_layer = True + if name != None and any(s in str(name) for s in moe_mlp_layer): + not_moe_mlp_layer = False + # When we have num_kv_heads defined, uneven division is possible, otherwise enforce near even division + if rank == None: + rank = dist.get_rank() + if num_kv_heads != None and total_size % num_kv_heads == 0 and "mlp" not in str(name) and str( + name) not in last_linear and not_moe_mlp_layer: my_slices = (num_kv_heads // mp_size) + (1 if rank < (num_kv_heads % mp_size) else 0) return total_size * my_slices // num_kv_heads else: - if total_size % mp_size == 0: - return total_size // mp_size + if total_size >= tp_grain_size: + grain_size = total_size // tp_grain_size + return (grain_size // mp_size + (1 if rank < (grain_size % mp_size) else 0)) * tp_grain_size else: - assert False, f"Number of attention heads ({total_size}) must be divisible by mp_size ({mp_size})" + return total_size // mp_size + (1 if rank < (total_size % mp_size) else 0) + + +def get_n_embd(): + global n_embd + return n_embd -def get_shard_size_list(total_size, mp_size): +def get_shard_size_list(total_size, mp_size, name=None): shard_sizes = [] for i in range(mp_size): - shard_sizes.append(get_shard_size(total_size, mp_size, i)) + shard_sizes.append(get_shard_size(total_size, mp_size, name, i)) return shard_sizes diff --git a/deepspeed/moe/experts.py b/deepspeed/moe/experts.py index 8cadb0c387fa..0863221d7edf 100644 --- a/deepspeed/moe/experts.py +++ b/deepspeed/moe/experts.py @@ -3,33 +3,36 @@ # DeepSpeed Team -import torch import copy +from typing import List, Optional + +import torch +from torch import nn -class Experts(torch.nn.Module): +class Experts(nn.Module): - def __init__(self, expert, num_local_experts=1, expert_group_name=None): + def __init__(self, expert: nn.Module, num_local_experts: int = 1, expert_group_name: Optional[str] = None) -> None: super(Experts, self).__init__() - self.deepspeed_experts = torch.nn.ModuleList([copy.deepcopy(expert) for i in range(num_local_experts)]) + self.deepspeed_experts = nn.ModuleList([copy.deepcopy(expert) for _ in range(num_local_experts)]) self.num_local_experts = num_local_experts # TODO: revisit allreduce for moe.gate... for expert in self.deepspeed_experts: # TODO: Create param groups to handle expert + data case (e.g. 
param.group = moe_group) - for name, param in expert.named_parameters(): + for param in expert.parameters(): param.allreduce = False param.group_name = expert_group_name - def forward(self, inputs): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: chunks = inputs.chunk(self.num_local_experts, dim=1) - expert_outputs = [] + expert_outputs: List[torch.Tensor] = [] + for chunk, expert in zip(chunks, self.deepspeed_experts): out = expert(chunk) - if type(out) is tuple: + if isinstance(out, tuple): out = out[0] # Ignore the bias term for now expert_outputs += [out] - expert_output = torch.cat(expert_outputs, dim=1) - return expert_output + return torch.cat(expert_outputs, dim=1) diff --git a/deepspeed/moe/layer.py b/deepspeed/moe/layer.py index 7dd0c6bcb67d..6777788ab885 100644 --- a/deepspeed/moe/layer.py +++ b/deepspeed/moe/layer.py @@ -3,22 +3,23 @@ # DeepSpeed Team -import torch +from typing import Optional, Tuple -from deepspeed.utils import log_dist +import torch +from torch import nn +from torch.nn import functional as F -from deepspeed.utils import groups -from .sharded_moe import MOELayer, TopKGate +from deepspeed.utils import groups, log_dist from .experts import Experts -import typing +from .sharded_moe import MOELayer, TopKGate -class MoE(torch.nn.Module): +class MoE(nn.Module): """Initialize an MoE layer. Arguments: hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension. - expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear). + expert (nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear). num_experts (int, optional): default=1, the total number of experts per layer. ep_size (int, optional): default=1, number of ranks in the expert parallel world or group. k (int, optional): default=1, top-k gating value, only supports k=1 or k=2. @@ -31,23 +32,25 @@ class MoE(torch.nn.Module): use_rts (bool, optional): default=True, whether to use Random Token Selection. use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed). 
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts + top2_2nd_expert_sampling (bool, optional): default=True, whether to perform sampling for 2nd expert """ def __init__(self, - hidden_size, - expert, - num_experts=1, - ep_size=1, - k=1, - capacity_factor=1., - eval_capacity_factor=1., - min_capacity=4, - use_residual=False, - noisy_gate_policy: typing.Optional[str] = None, + hidden_size: int, + expert: nn.Module, + num_experts: int = 1, + ep_size: int = 1, + k: int = 1, + capacity_factor: float = 1.0, + eval_capacity_factor: float = 1.0, + min_capacity: int = 4, + use_residual: bool = False, + noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, - use_rts=True, + use_rts: bool = True, use_tutel: bool = False, - enable_expert_tensor_parallelism: bool = False): + enable_expert_tensor_parallelism: bool = False, + top2_2nd_expert_sampling: bool = True) -> None: super(MoE, self).__init__() @@ -68,7 +71,8 @@ def __init__(self, experts = Experts(expert, self.num_local_experts, self.expert_group_name) self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k, capacity_factor, eval_capacity_factor, - min_capacity, noisy_gate_policy, drop_tokens, use_rts), + min_capacity, noisy_gate_policy, drop_tokens, use_rts, None, + top2_2nd_expert_sampling), experts, self.expert_group_name, self.ep_size, @@ -77,12 +81,12 @@ def __init__(self, if self.use_residual: self.mlp = expert # coefficient is used for weighted sum of the output of expert and mlp - self.coefficient = torch.nn.Linear(hidden_size, 2) + self.coefficient = nn.Linear(hidden_size, 2) - def set_deepspeed_parallelism(self, use_data_before_expert_parallel_=False): + def set_deepspeed_parallelism(self, use_data_before_expert_parallel_: bool = False) -> None: self._create_process_groups(use_data_before_expert_parallel_=use_data_before_expert_parallel_) - def _create_process_groups(self, use_data_before_expert_parallel_=False): + def _create_process_groups(self, use_data_before_expert_parallel_: bool = False) -> None: # Create process group for a layer if needed if self.expert_group_name not in groups._get_expert_parallel_group_dict(): print(f"No existing process group found, creating a new group named: {self.expert_group_name}") @@ -98,7 +102,9 @@ def _create_process_groups(self, use_data_before_expert_parallel_=False): # Set the group handle for the MOELayer (deepspeed_moe) object self.deepspeed_moe._set_ep_group(groups._get_expert_parallel_group(self.expert_group_name)) - def forward(self, hidden_states, used_token=None): + def forward(self, + hidden_states: torch.Tensor, + used_token: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ MoE forward Arguments: @@ -112,15 +118,15 @@ def forward(self, hidden_states, used_token=None): * l_aux (Tensor): gate loss value - * exp_counts (int): expert count + * exp_counts (Tensor): expert count """ output = self.deepspeed_moe(hidden_states, used_token) if self.use_residual: # Residual MoE output_mlp = self.mlp(hidden_states) - if type(output_mlp) is tuple: + if isinstance(output_mlp, tuple): output_mlp = output_mlp[0] # Ignore the bias term for now coef = self.coefficient(hidden_states) - coef = torch.nn.functional.softmax(coef, dim=-1) + coef = F.softmax(coef, dim=-1) output = output * coef[..., 0:1] + output_mlp * coef[..., 1:] return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts diff --git a/deepspeed/moe/mappings.py b/deepspeed/moe/mappings.py index 
6c501ea6503a..e57f66b85193 100644 --- a/deepspeed/moe/mappings.py +++ b/deepspeed/moe/mappings.py @@ -23,6 +23,8 @@ import torch import deepspeed +from deepspeed.utils.bwc import (bwc_tensor_model_parallel_world_size, bwc_tensor_model_parallel_rank, + bwc_tensor_model_parallel_group) def _gather_tokens(input_, dim=0): @@ -30,15 +32,23 @@ def _gather_tokens(input_, dim=0): mpu = deepspeed.utils.groups.mpu input_ = input_.contiguous() - # Size and dimension. - rank = mpu.get_tensor_model_parallel_rank() - - tensor_list = [torch.empty_like(input_) for _ in range(mpu.get_tensor_model_parallel_world_size())] - tensor_list[rank] = input_ - deepspeed.comm.all_gather(tensor_list, input_, group=mpu.get_tensor_model_parallel_group()) + world_size = bwc_tensor_model_parallel_world_size(mpu) + if world_size == 1: + return input_ - # Note: torch.cat already creates a contiguous tensor. - output = torch.cat(tensor_list, dim=dim).contiguous() + gather_buffer = torch.empty(world_size * input_.numel(), dtype=input_.dtype, device=input_.device) + deepspeed.comm.all_gather_into_tensor(gather_buffer, input_, group=bwc_tensor_model_parallel_group(mpu)) + if dim == 0: + shape = list(input_.size()) + shape[0] = shape[0] * world_size + output = gather_buffer.view(shape) + else: + tensor_list = [ + gather_buffer.narrow(0, + input_.numel() * i, input_.numel()).view_as(input_) for i in range(world_size) + ] + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=dim).contiguous() return output @@ -47,8 +57,10 @@ def _drop_tokens(input_, dim=0): """Divide a tensor among the tensor parallel ranks""" mpu = deepspeed.utils.groups.mpu - total_chunks = mpu.get_tensor_model_parallel_world_size() - this_chunk = mpu.get_tensor_model_parallel_rank() + total_chunks = bwc_tensor_model_parallel_world_size(mpu) + if total_chunks == 1: + return input_ + this_chunk = bwc_tensor_model_parallel_rank(mpu) assert input_.shape[ dim] % total_chunks == 0, f"input dimension {dim} ({input_.shape[dim]}) is not divisible by tensor parallel world size ({total_chunks})" chunk_size = input_.shape[dim] // total_chunks @@ -92,7 +104,7 @@ def backward(ctx, input_): def gather_tokens(input_, dim=0): mpu = deepspeed.utils.groups.mpu - if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + if mpu is None or bwc_tensor_model_parallel_world_size(mpu) == 1: # no tensor parallelism for non-experts return input_ return _GatherTokens.apply(input_, dim) @@ -100,7 +112,7 @@ def gather_tokens(input_, dim=0): def drop_tokens(input_, dim=0): mpu = deepspeed.utils.groups.mpu - if mpu is None or mpu.get_tensor_model_parallel_world_size() == 1: + if mpu is None or bwc_tensor_model_parallel_world_size(mpu) == 1: # no tensor parallelism for non-experts return input_ return _DropTokens.apply(input_, dim) diff --git a/deepspeed/moe/sharded_moe.py b/deepspeed/moe/sharded_moe.py index d92211b9d220..a00d694fbc14 100644 --- a/deepspeed/moe/sharded_moe.py +++ b/deepspeed/moe/sharded_moe.py @@ -17,7 +17,8 @@ from deepspeed.utils.timer import SynchronizedWallClockTimer from deepspeed.utils import logger -from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple +from deepspeed.utils.bwc import bwc_tensor_model_parallel_world_size +from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple, Union import torch from torch import Tensor @@ -42,7 +43,7 @@ try: # To enable Tutel MoE optimizations: - # python3 -m pip install --user --upgrade git+https://github.com/microsoft/tutel@v0.1.x + # python3 -m pip 
install --user --upgrade git+https://github.com/deepspeedai/tutel@v0.1.x from tutel import moe as tutel_moe TUTEL_INSTALLED = True except: @@ -95,11 +96,7 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor: class _AllToAll(torch.autograd.Function): @staticmethod - def forward( - ctx: Any, - # TODO: replace with DS process group - group: torch.distributed.ProcessGroup, - input: Tensor) -> Tensor: # type: ignore + def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor) -> Tensor: # type: ignore ctx.group = group input = input.contiguous() output = torch.empty_like(input) @@ -127,6 +124,8 @@ def einsum(rule, a, b): return a.unsqueeze(2) * b.unsqueeze(1) elif rule == 'se,se->s': return torch.bmm(a.unsqueeze(1), b.unsqueeze(2)).reshape(-1) + elif rule == 'se,sec->sec': + return a.unsqueeze(2) * b elif rule == 'sec,sm->ecm': s = a.shape[0] e = a.shape[1] @@ -188,13 +187,14 @@ def top1gating(logits: Tensor, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, use_rts: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, use_tutel: bool = False) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top1Gating on logits.""" if noisy_gate_policy == 'RSample': logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device) # everything is in fp32 in this function - gates = F.softmax(logits, dim=1) + gates = F.softmax(logits, dim=1) capacity = _capacity(gates, torch.tensor(capacity_factor), torch.tensor(min_capacity)) # Create a mask for 1st's expert per token @@ -208,13 +208,21 @@ def top1gating(logits: Tensor, mask1 = einsum("s,se->se", used_token, mask1) # gating decisions - exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') + exp_counts = torch.sum(mask1, dim=0).detach().to(logits.device) # if we don't want to drop any tokens if not drop_tokens: new_capacity = torch.max(exp_counts).to(logits.device) - dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group()) - capacity = new_capacity + # Communicate across expert processes to pick the maximum capacity. + if ep_group is not None: + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group) + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. + # This is since we are going to activate drop_tokens() to drop duplicate tokens. + tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu) + new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) + # Make sure the capacity value does not exceed the number of tokens. 
+ capacity = min(new_capacity, torch.tensor(mask1.size(0)).to(new_capacity.device)) # Compute l_aux me = torch.mean(gates, dim=0) @@ -279,23 +287,28 @@ def top1gating(logits: Tensor, return l_aux, combine_weights, dispatch_mask, exp_counts -def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tuple[Tensor, Tensor, Tensor, Tensor]: +def top2gating(logits: Tensor, + capacity_factor: float, + min_capacity: int, + drop_tokens: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, + top2_2nd_expert_sampling: bool = True) -> Tuple[Tensor, Tensor, Tensor, Tensor]: """Implements Top2Gating on logits.""" # everything is in fp32 in this function gates = F.softmax(logits, dim=1) - capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) - # Create a mask for 1st's expert per token indices1_s = torch.argmax(gates, dim=1) num_experts = int(gates.shape[1]) mask1 = F.one_hot(indices1_s, num_classes=num_experts) - # Create a mask for 2nd's expert per token using Gumbel-max trick - # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ - logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device) + if top2_2nd_expert_sampling: + # Create a mask for 2nd's expert per token using Gumbel-max trick + # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/ + logits += gumbel_rsample(logits.shape, device=logits.device) + # Replace top-expert with min value - logits_except1 = logits_w_noise.masked_fill(mask1.bool(), float("-inf")) + logits_except1 = logits.masked_fill(mask1.bool(), float("-inf")) indices2_s = torch.argmax(logits_except1, dim=1) mask2 = F.one_hot(indices2_s, num_classes=num_experts) @@ -305,17 +318,30 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup # Update 2nd's location by accounting for locations of 1st locations2 += torch.sum(mask1, dim=0, keepdim=True) - # gating decisions - exp_counts = torch.sum(mask1, dim=0).detach().to('cpu') - # Compute l_aux me = torch.mean(gates, dim=0) ce = torch.mean(mask1.float(), dim=0) l_aux = torch.mean(me * ce) * num_experts * num_experts - # Remove locations outside capacity from mask - mask1 *= torch.lt(locations1, capacity) - mask2 *= torch.lt(locations2, capacity) + # gating decisions + exp_counts = torch.sum(mask1 + mask2, dim=0).detach().to(logits.device) + + if drop_tokens: + # Calculate configured capacity and remove locations outside capacity from mask + capacity = _capacity(gates, torch.tensor(capacity_factor * 2), torch.tensor(min_capacity)) + mask1 *= torch.lt(locations1, capacity) + mask2 *= torch.lt(locations2, capacity) + else: + # Do not drop tokens - set capacity according to current expert assignments + new_capacity = torch.max(exp_counts) + if ep_group is not None: + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group) + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. + # This is since we are going to activate drop_tokens() to drop duplicate tokens. 
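# For example (hypothetical values, added for illustration only): if new_capacity is 13 tokens
# and the tensor-parallel width tp is 4, the padding below computes torch.ceil(13 / 4).mul(4) = 16,
# so every tensor-parallel rank works with the same, evenly divisible capacity once duplicate
# tokens are dropped.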
+ tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu) + new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) + capacity = new_capacity # Store the capacity location for each token locations1_s = torch.sum(locations1 * mask1, dim=1) @@ -345,6 +371,81 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup return l_aux, combine_weights, dispatch_mask, exp_counts +def topkgating( + logits: Tensor, + k: int, + capacity_factor: float, + min_capacity: int, + drop_tokens: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, + drop_policy: str = "probs", +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + """Implements TopKGating on logits.""" + + # everything is in fp32 in this function + # get topk gates + top_gate, top_idx = torch.topk(logits, k=k, dim=1) + # gating decisions + gates = F.softmax(logits, dim=1) + num_experts = int(gates.shape[1]) + + # get topk mask + topk_masked_gates = torch.zeros_like(logits).scatter(1, top_idx, top_gate) + + mask = torch.zeros_like(gates, dtype=torch.bool).scatter_(1, top_idx, 1) + + exp_counts = torch.sum(mask, dim=0).detach().to(logits.device) + + # Compute l_aux + me = torch.mean(gates, dim=0) + ce = torch.mean(mask.float(), dim=0) + l_aux = torch.mean(me * ce) * num_experts * num_experts / k + + if drop_tokens: + # Calculate configured capacity and remove locations outside capacity from mask + capacity = _capacity(gates, torch.tensor(capacity_factor * k), torch.tensor(min_capacity)) + # update mask and locations by capacity + + if drop_policy == 'probs': + capacity_probs, capacity_indices = torch.topk(topk_masked_gates, k=capacity, dim=0, sorted=False) + capacity_mask = torch.zeros_like(logits).scatter(0, capacity_indices, 1) + mask = torch.logical_and(mask, capacity_mask) + locations = torch.cumsum(mask, dim=0) - 1 + + elif drop_policy == "position": + locations = torch.cumsum(mask, dim=0) - 1 + mask *= torch.lt(locations, capacity) + else: + raise ValueError(f"Invalid drop_policy: {drop_policy}") + + else: + # Do not drop tokens - set capacity according to current expert assignments + new_capacity = torch.max(exp_counts) + if ep_group is not None: + dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=ep_group) + if groups._get_expert_model_parallel_world_size() == 1: + # If the non-expert is tensor-parallel, we need to pad the capacity to 'tp'. + # This is since we are going to activate drop_tokens() to drop duplicate tokens. + tp = 1 if groups.mpu is None else bwc_tensor_model_parallel_world_size(mpu=groups.mpu) + new_capacity = torch.ceil(new_capacity / tp).mul(tp).to(new_capacity.dtype) + capacity = new_capacity + + # normalize gates + gates_masked = gates * mask + gates_s = torch.sum(gates_masked, dim=-1, keepdim=True) + denom_s = torch.clamp(gates_s, min=torch.finfo(gates_masked.dtype).eps) + gates_masked = gates_masked / denom_s + + # dispatch_mask + locations_sc = _one_hot_to_float((locations * mask), capacity) + + combine_weights = torch.einsum("se,sec->sec", gates_masked, locations_sc) + + dispatch_mask = combine_weights.bool() + + return l_aux, combine_weights, dispatch_mask, exp_counts + + class TopKGate(Module): """Gate module which implements Top2Gating as described in Gshard_. 
:: @@ -357,7 +458,7 @@ class TopKGate(Module): Args: model_dim (int): size of model embedding dimension - num_experts (ints): + num_experts (int): number of experts in model """ @@ -372,13 +473,13 @@ def __init__(self, min_capacity: int = 8, noisy_gate_policy: Optional[str] = None, drop_tokens: bool = True, - use_rts: bool = True) -> None: + use_rts: bool = True, + ep_group: Union[torch.distributed.ProcessGroup, None] = None, + top2_2nd_expert_sampling: bool = True) -> None: super().__init__() - # Only top-1 and top-2 are supported at the moment. - if k != 1 and k != 2: - raise ValueError('Only top-1 and top-2 gatings are supported.') - self.wg = torch.nn.Linear(model_dim, num_experts, bias=False).float() + self.wg = torch.nn.Linear(model_dim, num_experts, bias=False) + self.ep_group = ep_group self.k = k self.capacity_factor = capacity_factor self.eval_capacity_factor = eval_capacity_factor @@ -389,6 +490,11 @@ def __init__(self, self.gate_time = 0.0 self.drop_tokens = drop_tokens self.use_rts = use_rts + self.top2_2nd_expert_sampling = top2_2nd_expert_sampling + + def _set_ep_group(self, ep_group): + assert self.ep_group is None, f'Attempting to override an existing ep_group' + self.ep_group = ep_group def forward(self, input: torch.Tensor, @@ -398,22 +504,24 @@ def forward(self, if self.wall_clock_breakdown: self.timers(TOPK_GATE_TIMER).start() - if self.wg.weight.dtype != torch.float32: - self.wg = self.wg.float() input_fp32 = input.float() # input jittering if self.noisy_gate_policy == 'Jitter' and self.training: input_fp32 = multiplicative_jitter(input_fp32, device=input.device) - logits = self.wg(input_fp32) + logits = torch.nn.functional.linear(input_fp32, weight=self.wg.weight.float(), bias=None) if self.k == 1: gate_output = top1gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, self.min_capacity, used_token, self.noisy_gate_policy if self.training else None, - self.drop_tokens, self.use_rts, use_tutel) + self.drop_tokens, self.use_rts, self.ep_group, use_tutel) - else: + elif self.k == 2: gate_output = top2gating(logits, self.capacity_factor if self.training else self.eval_capacity_factor, - self.min_capacity) + self.min_capacity, self.drop_tokens, self.ep_group, self.top2_2nd_expert_sampling) + else: + gate_output = topkgating(logits, self.k, + self.capacity_factor if self.training else self.eval_capacity_factor, + self.min_capacity, self.drop_tokens, self.ep_group) if self.wall_clock_breakdown: self.timers(TOPK_GATE_TIMER).stop() @@ -473,6 +581,7 @@ def __init__(self, def _set_ep_group(self, ep_group): self.ep_group = ep_group + self.gate._set_ep_group(ep_group) def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: @@ -502,13 +611,18 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: if self.wall_clock_breakdown: self.timers(FIRST_ALLTOALL_TIMER).start() - if groups._get_expert_model_parallel_world_size() == 1: - # If the non-expert is tensor-parallel, it will create + tensor_model_world_size = bwc_tensor_model_parallel_world_size(groups.mpu) + if tensor_model_world_size > 1: + # If the non-expert is tensor-parallel, + # Whether expert is tensor-parallel or not , it will create # duplicate tokens on the tensor-parallel ranks. - # Since our experts are not tensor-parallel, these duplicates - # need to be dropped to ensure correctness. - # this also doubles up as a communication optimization as we are - # reducing the all-to-all communication volume. 
+ # Dropping duplicate tokens also doubles up as a communication + # optimization, since it reduces the all-to-all communication volume. + # 1: if the expert is not tensor-parallel, drop duplicate tokens to ensure + # both correctness and reduced all-to-all communication. + # 2: if the expert is tensor-parallel, drop duplicate tokens to reduce the all-to-all + # communication volume; before expert execution, an allgather is then needed + # to ensure correctness. dispatched_input = drop_tokens(dispatched_input, dim=1) dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input) @@ -517,10 +631,22 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: self.timers(FIRST_ALLTOALL_TIMER).stop() self.time_falltoall = self.timers(FIRST_ALLTOALL_TIMER).elapsed(reset=False) + if tensor_model_world_size > 1 and groups._get_expert_model_parallel_world_size() > 1: + # if both expert and non-expert are tensor-parallel + # the dropped duplicate tokens need to be gathered on each + # tensor parallel rank again to ensure correctness + dispatched_input = gather_tokens(dispatched_input, dim=1) + # Re-shape after all-to-all: ecm -> gecm dispatched_input = dispatched_input.reshape(self.ep_size, self.num_local_experts, -1, d_model) - expert_output = self.experts(dispatched_input) + # Re-shape before drop_tokens: gecm -> ecm + expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model) + if tensor_model_world_size > 1 and groups._get_expert_model_parallel_world_size() > 1: + # if both expert and non-expert are tensor-parallel + # drop duplicate tokens to ensure both correctness + # and reduce all-to-all communication. + expert_output = drop_tokens(expert_output, dim=1) if self.wall_clock_breakdown: self.timers(SECOND_ALLTOALL_TIMER).start() @@ -531,10 +657,7 @@ def forward(self, *input: Tensor, **kwargs: Any) -> Tensor: self.timers(SECOND_ALLTOALL_TIMER).stop() self.time_salltoall = self.timers(SECOND_ALLTOALL_TIMER).elapsed(reset=False) - # Re-shape back: gecm -> ecm - expert_output = expert_output.reshape(self.ep_size * self.num_local_experts, -1, d_model) - - if groups._get_expert_model_parallel_world_size() == 1: + if tensor_model_world_size > 1: # the dropped duplicate tokens need to be gathered on each # tensor parallel rank again for the tensor-parallel # non-expert of the next layer.
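The tensor-parallel path above relies on drop_tokens() and gather_tokens() being exact inverses along the token dimension: duplicates are dropped before the all-to-all to cut communication volume, then re-gathered wherever a tensor-parallel consumer needs the full sequence again. Below is a minimal single-process sketch of that shape bookkeeping; it is plain PyTorch with illustrative helper names (drop_tokens_local, gather_tokens_local) and sizes, not the DeepSpeed communication APIs, which perform the same split and re-assembly across the tensor-parallel group via an all-gather.

import torch

def drop_tokens_local(x: torch.Tensor, rank: int, tp_world_size: int, dim: int = 1) -> torch.Tensor:
    # Each tensor-parallel rank keeps only its own 1/tp_world_size slice of the duplicated tokens.
    assert x.shape[dim] % tp_world_size == 0
    chunk = x.shape[dim] // tp_world_size
    return x.narrow(dim, rank * chunk, chunk)

def gather_tokens_local(chunks, dim: int = 1) -> torch.Tensor:
    # Single-process stand-in for the all-gather across the tensor-parallel group.
    return torch.cat(chunks, dim=dim)

e, c, m = 4, 8, 16                    # experts, capacity, model dim (illustrative sizes)
tp = 2                                # pretend tensor-parallel world size
dispatched = torch.randn(e, c, m)     # tensor that every tensor-parallel rank holds a duplicate of
slices = [drop_tokens_local(dispatched, r, tp, dim=1) for r in range(tp)]
assert slices[0].shape == (e, c // tp, m)
restored = gather_tokens_local(slices, dim=1)
assert torch.equal(restored, dispatched)  # drop followed by gather is the identity

On real ranks each process holds only its own slice, and gather_tokens() issues the all-gather so that the tensor-parallel non-expert of the next layer sees the same full sequence on every rank.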
diff --git a/deepspeed/moe/utils.py b/deepspeed/moe/utils.py index da31f550aabc..20866378efac 100644 --- a/deepspeed/moe/utils.py +++ b/deepspeed/moe/utils.py @@ -3,16 +3,20 @@ # DeepSpeed Team -from typing import List, Tuple, Dict +from collections import defaultdict +from typing import Any, Dict, List, Set, Tuple, Union, cast + import torch +from torch import nn + from .layer import MoE -def has_moe_layers(m): +def has_moe_layers(m: nn.Module) -> Tuple[bool, int]: has_moe = False num_experts = 0 - for _, module in m.named_modules(): + for module in m.modules(): if isinstance(module, MoE): has_moe = True num_experts = module.num_experts @@ -27,8 +31,10 @@ def is_moe_param(param: torch.Tensor) -> bool: def split_params_into_shared_and_expert_params( - params: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: - shared_params, expert_params = [], [] + params: List[torch.nn.Parameter]) -> Tuple[List[torch.nn.Parameter], List[torch.nn.Parameter]]: + shared_params: List[nn.Parameter] = [] + expert_params: List[nn.Parameter] = [] + for p in params: if is_moe_param(p): expert_params.append(p) @@ -38,7 +44,7 @@ def split_params_into_shared_and_expert_params( def split_params_grads_into_shared_and_expert_params( - group: List[torch.nn.Parameter]) -> Tuple[torch.nn.Parameter, torch.nn.Parameter]: + group: List[torch.nn.Parameter]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: """Split grad of parameters into grads of non-expert params and grads of expert params. This is useful while computing grad-norms for clipping and overflow detection @@ -48,11 +54,12 @@ def split_params_grads_into_shared_and_expert_params( The group of parameters to split Returns: - Tuple[List[torch.nn.Parameter], List[torch.nn.Parameter]]: + Tuple[List[torch.Tensor], List[torch.Tensor]]: list of gradients for non MoE params, list of gradients of MoE params """ - expert_grads = [] - shared_grads = [] + expert_grads: List[torch.Tensor] = [] + shared_grads: List[torch.Tensor] = [] + for p in group: if p.grad is not None: if is_moe_param(p): @@ -62,16 +69,17 @@ def split_params_grads_into_shared_and_expert_params( return shared_grads, expert_grads -def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict], - max_group_size=178956971) -> Tuple[Dict]: +def split_params_into_different_moe_groups_for_optimizer( + param_groups: Union[Dict[str, Any], Tuple[Dict[str, Any], ...], List[Dict[str, Any]]], + max_group_size: Union[int, float] = 178956971) -> List[Dict[str, Any]]: """Split parameters into different MoE groups for optimizer Args: - param_groups (Tuple[Dict]): + param_groups (Union[Dict[str, Any], Tuple[Dict[str, Any], ...], List[Dict[str, Any]]]) The list of parameter groups to split Returns: - Tuple[Dict]: + List[Dict[str, Any]]: list of MoE/non-MoE groups for optimizer """ if isinstance(param_groups, tuple): @@ -82,45 +90,43 @@ def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dic raise ValueError(f"Unknown param group type of {type(param_groups)}") # gather all data parallel group names - data_parallel_group_names = set() + data_parallel_group_names: Set[str] = set() for param_group in param_groups: - for param in param_group["params"]: + for param in cast(List[nn.Parameter], param_group["params"]): if is_moe_param(param): data_parallel_group_names.add(param.group_name) - data_parallel_group_names = list(data_parallel_group_names) - group_moe = {} + # Create the param MoE groups, leave param assign to next step + group_moe: Dict[str, Dict[str, 
Dict[str, Any]]] = defaultdict(lambda: defaultdict(dict)) for param_group in param_groups: - group_moe[param_group['name']] = {} for key in data_parallel_group_names: - group_moe[param_group['name']][key] = {} - group_moe[param_group['name']][key]['name'] = key - group_moe[param_group['name']][key]['moe'] = True - for ori_key in param_group.keys(): - if ori_key != 'name': - if ori_key == 'params': - group_moe[param_group['name']][key][ori_key] = [] - else: - group_moe[param_group['name']][key][ori_key] = param_group[ori_key] + group_moe[param_group['name']][key] = { + **param_group, + 'name': key, + 'moe': True, + 'params': [], + } + # Assign param for param_group in param_groups: - new_params = [] - for param in param_group['params']: + new_params: List[nn.Parameter] = [] + + for param in cast(List[nn.Parameter], param_group['params']): if is_moe_param(param): group_moe[param_group['name']][param.group_name]['params'].append(param) - # param_group['params'].remove(param) else: new_params.append(param) param_group['params'] = new_params # Flatten the moe groups if max_group_size is not None: - for k, v in group_moe.items(): - for k1, v1 in v.items(): - cur_group = [] - all_groups = [] + for moe_group in group_moe.values(): + for param_group in moe_group.values(): + cur_group: List[nn.Parameter] = [] + all_groups: List[List[nn.Parameter]] = [] size_of_cur_group = 0 - for param in v1['params']: + + for param in cast(List[nn.Parameter], param_group['params']): if size_of_cur_group + param.numel() <= max_group_size: cur_group.append(param) size_of_cur_group += param.numel() @@ -128,18 +134,49 @@ def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dic all_groups.append(cur_group) cur_group = [param] size_of_cur_group = param.numel() + if cur_group: all_groups.append(cur_group) + for group in all_groups: - new_dict = {} - for key, val in v1.items(): - if key != 'params': - new_dict[key] = val - new_dict['params'] = group - param_groups.append(new_dict) + param_groups.append({**param_group, 'params': group}) else: - for k, v in group_moe.items(): - for k1, v1 in v.items(): - param_groups.append(v1) + for moe_group in group_moe.values(): + for param_group in moe_group.values(): + param_groups.append(param_group) + + return param_groups + + +def is_moe_param_group(param_group): + return param_group.get('moe', False) + - return tuple(param_groups) +def configure_moe_param_groups(model_parameters: List): + assert isinstance(model_parameters, list), "model_parameters must be a list" + + for p in model_parameters: + # match torch.optim.Optimizer expectations, + # see: https://github.com/pytorch/pytorch/blob/2ffab6e663b9c6951048b8c8ba82d2cc5ca5c2fc/torch/optim/optimizer.py#L270-L272 + if not isinstance(p, (torch.Tensor, dict)): + raise TypeError("param argument that would be given to the optimizer should be " + f"an iterable of Tensors or dicts, but got {type(p)}") + + # peak at the first element to determine how to proceed + first = model_parameters[0] + + # Case 1: model_parameters is a list of torch.nn.Parameter + # -> need to create moe compatible param groups + if isinstance(first, torch.nn.Parameter): + param_group = {'params': model_parameters, 'name': 'dense-params'} + return split_params_into_different_moe_groups_for_optimizer(param_group) + + # Case 2: model_parameters is a list of param groups List[dict] + # -> moe compatible param groups might already exist, if not create them + elif isinstance(first, dict): + #there are no moe groups created + if not any(['moe' in 
param_group for param_group in model_parameters]): + return split_params_into_different_moe_groups_for_optimizer(model_parameters) + else: + # moe groups exist, nothing to do + return model_parameters diff --git a/deepspeed/monitor/comet.py b/deepspeed/monitor/comet.py new file mode 100644 index 000000000000..d8bc4017800f --- /dev/null +++ b/deepspeed/monitor/comet.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import TYPE_CHECKING, Any, Tuple, List, Dict, Optional + +from .utils import check_comet_availability +from .monitor import Monitor + +import deepspeed.comm as dist + +if TYPE_CHECKING: + import comet_ml + from .config import CometConfig + +Name = str +Value = Any +GlobalSamples = int +Event = Tuple[Name, Value, GlobalSamples] + + +class CometMonitor(Monitor): + + def __init__(self, comet_config: "CometConfig"): + super().__init__(comet_config) + check_comet_availability() + import comet_ml + + self.enabled = comet_config.enabled + self._samples_log_interval = comet_config.samples_log_interval + self._experiment: Optional["comet_ml.ExperimentBase"] = None + + if self.enabled and dist.get_rank() == 0: + self._experiment = comet_ml.start( + api_key=comet_config.api_key, + project=comet_config.project, + workspace=comet_config.workspace, + experiment_key=comet_config.experiment_key, + mode=comet_config.mode, + online=comet_config.online, + ) + + if comet_config.experiment_name is not None: + self._experiment.set_name(comet_config.experiment_name) + + self._events_log_scheduler = EventsLogScheduler(comet_config.samples_log_interval) + + @property + def experiment(self) -> Optional["comet_ml.ExperimentBase"]: + return self._experiment + + @property + def samples_log_interval(self) -> int: + return self._samples_log_interval + + def write_events(self, event_list: List[Event]) -> None: + if not self.enabled or dist.get_rank() != 0: + return None + + for event in event_list: + name = event[0] + value = event[1] + engine_global_samples = event[2] + + if self._events_log_scheduler.needs_logging(name, engine_global_samples): + self._experiment.__internal_api__log_metric__( + name=name, + value=value, + step=engine_global_samples, + ) + + +class EventsLogScheduler: + + def __init__(self, samples_log_interval: int): + self._samples_log_interval = samples_log_interval + self._last_logged_events_samples: Dict[str, int] = {} + + def needs_logging(self, name: str, current_sample: int) -> bool: + if name not in self._last_logged_events_samples: + self._last_logged_events_samples[name] = current_sample + return True + + last_logged_sample = self._last_logged_events_samples[name] + samples_delta = current_sample - last_logged_sample + + if samples_delta >= self._samples_log_interval: + self._last_logged_events_samples[name] = current_sample + return True + + return False diff --git a/deepspeed/monitor/config.py b/deepspeed/monitor/config.py index 5a8ca6ecf5cd..960ce1ba997a 100644 --- a/deepspeed/monitor/config.py +++ b/deepspeed/monitor/config.py @@ -3,12 +3,14 @@ # DeepSpeed Team -from deepspeed.pydantic_v1 import root_validator +from typing import Optional + +from pydantic import model_validator from deepspeed.runtime.config_utils import DeepSpeedConfigModel def get_monitor_config(param_dict): - monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor")} + monitor_dict = {key: param_dict.get(key, {}) for key in ("tensorboard", "wandb", "csv_monitor", "comet")} return 
DeepSpeedMonitorConfig(**monitor_dict) @@ -34,10 +36,10 @@ class WandbConfig(DeepSpeedConfigModel): enabled: bool = False """ Whether logging to WandB is enabled. Requires `wandb` package is installed. """ - group: str = None + group: Optional[str] = None """ Name for the WandB group. This can be used to group together runs. """ - team: str = None + team: Optional[str] = None """ Name for the WandB team. """ project: str = "deepspeed" @@ -60,20 +62,83 @@ class CSVConfig(DeepSpeedConfigModel): """ Name for the current job. This will become a new directory inside `output_path`. """ +class CometConfig(DeepSpeedConfigModel): + """ + Sets parameters for Comet monitor. For logging data, Comet uses an + experiment object. + https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment/ + """ + + enabled: bool = False + """ Whether logging to Comet is enabled. Requires `comet_ml` package is installed. """ + + samples_log_interval: int = 100 + """ Metrics will be submitted to Comet after processing every `samples_log_interval` samples. """ + + project: Optional[str] = None + """ + Comet project name. Can be set through .comet.config file or environment variable COMET_PROJECT_NAME + https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options + """ + + workspace: Optional[str] = None + """ + Comet workspace name. Can be set through .comet.config file or environment variable COMET_WORKSPACE + https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options + """ + + api_key: Optional[str] = None + """ + Comet API key. Can be set through .comet.config file or environment variable COMET_API_KEY + https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options + """ + + experiment_name: Optional[str] = None + """ + The name for the comet experiment to be used for logging. + Can be set through .comet.config file or environment variable COMET_EXPERIMENT_NAME + https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options + """ + + experiment_key: Optional[str] = None + """ + The key for the comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters. + Can be set through .comet.config or environment variable COMET_EXPERIMENT_KEY + https://www.comet.com/docs/v2/guides/experiment-management/configure-sdk/#explore-comet-configuration-options + """ + + online: Optional[bool] = None + """ + If True, the data will be logged to the Comet server, otherwise it will be stored locally in an offline experiment. + Defaults to True. + """ + + mode: Optional[str] = None + """ + Control how the Comet experiment is started; 3 options are possible: + - "get": Continue logging to an existing experiment identified by the `experiment_key` value. + - "create": Always creates a new experiment, useful for HPO sweeps. + - "get_or_create" (default): Starts a fresh experiment if required, or persists logging to an existing one. + """ + + class DeepSpeedMonitorConfig(DeepSpeedConfigModel): """Sets parameters for various monitoring methods.""" tensorboard: TensorBoardConfig = {} """ TensorBoard monitor, requires `tensorboard` package is installed. """ + comet: CometConfig = {} + """ Comet monitor, requires `comet_ml` package is installed. """ + wandb: WandbConfig = {} """ WandB monitor, requires `wandb` package is installed. """ csv_monitor: CSVConfig = {} """ Local CSV output of monitoring data.
""" - @root_validator - def check_enabled(cls, values): - values["enabled"] = values.get("tensorboard").enabled or values.get("wandb").enabled or values.get( - "csv_monitor").enabled - return values + @model_validator(mode="after") + def check_enabled(self): + enabled = self.tensorboard.enabled or self.wandb.enabled or self.csv_monitor.enabled or self.comet.enabled + self.__dict__["enabled"] = enabled + return self diff --git a/deepspeed/monitor/monitor.py b/deepspeed/monitor/monitor.py index 5a32b8bbcadd..e7e26dc483d9 100644 --- a/deepspeed/monitor/monitor.py +++ b/deepspeed/monitor/monitor.py @@ -24,6 +24,7 @@ def write_events(self, event_list): from .wandb import WandbMonitor from .tensorboard import TensorBoardMonitor from .csv_monitor import csvMonitor +from .comet import CometMonitor class MonitorMaster(Monitor): @@ -33,6 +34,7 @@ def __init__(self, monitor_config): self.tb_monitor = None self.wandb_monitor = None self.csv_monitor = None + self.comet_monitor = None self.enabled = monitor_config.enabled if dist.get_rank() == 0: @@ -42,6 +44,8 @@ def __init__(self, monitor_config): self.wandb_monitor = WandbMonitor(monitor_config.wandb) if monitor_config.csv_monitor.enabled: self.csv_monitor = csvMonitor(monitor_config.csv_monitor) + if monitor_config.comet.enabled: + self.comet_monitor = CometMonitor(monitor_config.comet) def write_events(self, event_list): if dist.get_rank() == 0: @@ -51,3 +55,5 @@ def write_events(self, event_list): self.wandb_monitor.write_events(event_list) if self.csv_monitor is not None: self.csv_monitor.write_events(event_list) + if self.comet_monitor is not None: + self.comet_monitor.write_events(event_list) diff --git a/deepspeed/monitor/utils.py b/deepspeed/monitor/utils.py index 265fc9811553..f5530e8532e1 100644 --- a/deepspeed/monitor/utils.py +++ b/deepspeed/monitor/utils.py @@ -3,6 +3,8 @@ # DeepSpeed Team +from packaging import version as pkg_version + def check_tb_availability(): try: @@ -22,3 +24,14 @@ def check_wandb_availability(): 'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart' ) raise + + +def check_comet_availability(): + try: + import comet_ml + comet_version = pkg_version.parse(comet_ml.__version__) + if comet_version < pkg_version.Version("3.41.0"): + raise ImportError("`comet_ml` must have at least version 3.41.0") + except ImportError: + print('If you want to use comet logging, please `pip install "comet_ml>=3.41.0"`') + raise diff --git a/deepspeed/nvme/__init__.py b/deepspeed/nvme/__init__.py new file mode 100644 index 000000000000..6d0de857cbd3 --- /dev/null +++ b/deepspeed/nvme/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .perf_run_sweep import sweep_main, parse_sweep_arguments +from .perf_generate_param import generate_main +from .test_ds_aio import ds_io_main diff --git a/deepspeed/nvme/ds_aio_args.py b/deepspeed/nvme/ds_aio_args.py new file mode 100644 index 000000000000..9ed71c34a74d --- /dev/null +++ b/deepspeed/nvme/ds_aio_args.py @@ -0,0 +1,175 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import argparse +import os +from .test_ds_aio_utils import refine_integer_value +from deepspeed.accelerator import get_accelerator + +MAPPING_DELIMITER = ':' + + +def refine_args(args): + if args.io_size and type(args.io_size) == str: + args.io_size = refine_integer_value(args.io_size) + + if args.block_size and type(args.block_size) == str: + args.block_size = refine_integer_value(args.block_size) + + return args + + +def _get_mapping_dict(args): + if args.folder is not None: + d = {i: args.folder for i in range(args.multi_process)} + else: + d = {} + for m in args.folder_to_device_mapping: + fields = m.split(MAPPING_DELIMITER) + d[fields[1]] = fields[0] + + return d + + +def _validate_folder_mapping(args): + no_error = True + error_messages = [] + invalid_mappings = [m for m in args.folder_to_device_mapping if MAPPING_DELIMITER not in m] + if len(invalid_mappings) > 0: + error_messages.append( + f'Missing delimiter ({MAPPING_DELIMITER}) in folder_to_device_mapping {invalid_mappings}') + no_error = False + + folder_list = [m.split(MAPPING_DELIMITER)[0] for m in args.folder_to_device_mapping] + invalid_folders = [d for d in folder_list if not os.path.exists(d)] + if len(invalid_folders) > 0: + error_messages.append(f'Invalid folders in folder_to_device_mapping: {invalid_folders}') + no_error = False + + if args.gpu: + device_list = [int(m.split(MAPPING_DELIMITER)[1]) for m in args.folder_to_device_mapping] + invalid_device_list = [dev_id for dev_id in device_list if not dev_id < get_accelerator().device_count()] + if len(invalid_device_list) > 0: + error_messages.append(f'Invalid device ids in folder_to_device_mapping: {invalid_device_list}') + no_error = False + + return no_error, error_messages + + +def validate_args(args): + no_error = True + error_messages = [] + + if args.folder is not None and len(args.folder_to_device_mapping) > 0: + error_messages.append(f'--folder and --folder_to_device_mapping cannot be specified together.') + no_error = False + elif args.folder is None and len(args.folder_to_device_mapping) == 0: + error_messages.append(f'At least one of --folder or --folder_to_device_mapping must be specified.') + no_error = False + + # Validate --folder + if args.folder is not None and not os.path.exists(args.folder): + no_error = False + error_messages.append(f'Invalid folder in --folder: {args.folder} ') + + # Validate --folder_mapping_to_device + if len(args.folder_to_device_mapping) > 0: + no_mapping_error, mapping_error_messages = _validate_folder_mapping(args) + no_error = no_error and no_mapping_error + error_messages += mapping_error_messages + + # Validate --gpu, --use_gds + if args.use_gds and not args.gpu: + error_messages.append(f'--gpu must be set to transfer with --use_gds') + no_error = False + + if not no_error: + print(f'Found {len(error_messages)} validation errors') + for i, msg in enumerate(error_messages): + print(f'{i+1}: {msg}') + + return no_error + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--folder', default=None, type=str, help='Folder to use for I/O.') + + parser.add_argument('--folder_to_device_mapping', + default=[], + nargs='+', + help='Specification of mapping of folder to (gpu) device id, (ignored for cpu accesses).' + 'Can be specified multiple times for multi-process runs,' + 'e.g. 
--folder_to_device_mapping /mnt/nvme0:0 --folder_to_device_mapping /mnt/nvme1:15 --gpu' + 'means access /mnt/nvme0 with gpu 0 and /mnt/nvme1 with gpu 15') + + parser.add_argument('--io_size', type=str, default=None, required=True, help='Number of bytes to read or write.') + + parser.add_argument('--read', action='store_true', help='Perform read I/O (default is write)') + + parser.add_argument('--multi_process', + type=int, + default=1, + help='Number of parallel processes doing I/O (default 1).') + + parser.add_argument('--block_size', + type=str, + default='1M', + help='I/O block size. Can use K, M, or G suffix (default 1M for 1 megabytes).') + + parser.add_argument('--queue_depth', type=int, default=32, help='I/O queue depth (default 32).') + + parser.add_argument('--single_submit', + action='store_true', + help='Submit I/O requests in singles (default is submit queue_depth amount at once.).') + + parser.add_argument( + '--sequential_requests', + action='store_true', + help= + 'Delay I/O request submission until completion of prior requests (default is overlap I/O submission and completion requests.).' + ) + + parser.add_argument('--validate', action='store_true', help='Perform validation of I/O transfer in library.') + + parser.add_argument('--handle', action='store_true', help='Use AIO handle.') + + parser.add_argument('--loops', type=int, default=3, help='Count of operation repetitions') + + parser.add_argument('--io_parallel', type=int, default=None, help='Per iop parallelism') + + parser.add_argument('--gpu', action='store_true', help='Use GPU memory') + + parser.add_argument('--use_gds', action='store_true', help='Enable GDS AIO') + + parser.add_argument('--slow_bounce_buffer', + action='store_true', + help='For GPU memory transfers, measure impact of bounce buffer pinning on critical path.') + + args = parser.parse_args() + print(f'args = {args}') + return args + + +def get_validated_args(): + args = parse_arguments() + args = refine_args(args) + if not validate_args(args): + quit() + print(f'Successful validation of command line arguments') + + peer_tag = 'gpu' if args.gpu else 'process' + args.mapping_dict = _get_mapping_dict(args) + args.mapping_list = [(device_id, folder) for device_id, folder in args.mapping_dict.items()] + assert len(args.mapping_dict) == len(args.mapping_list) + print(f'Configuring {len(args.mapping_list)} {peer_tag} to folder mapping') + for i, (device_id, folder) in enumerate(args.mapping_list): + print(f'[{i}]: {peer_tag} {device_id} <----> {folder}') + + return args diff --git a/deepspeed/nvme/ds_aio_basic.py b/deepspeed/nvme/ds_aio_basic.py new file mode 100755 index 000000000000..b346fe9bbfeb --- /dev/null +++ b/deepspeed/nvme/ds_aio_basic.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. 
+""" + +import torch +import os +import time +from deepspeed.ops.aio import AsyncIOBuilder +from multiprocessing import Pool, Barrier +from .test_ds_aio_utils import report_results, task_log, task_barrier + + +def pre_basic(args, tid, read_op): + io_string = "Read" if read_op else "Write" + num_bytes = os.path.getsize(args.read_file) if read_op else args.write_size + file = args.read_file if read_op else f'{args.write_file}.{tid}' + + task_log(tid, f'Allocate tensor of size {num_bytes} bytes') + buffer = torch.empty(num_bytes, dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, f'{io_string} file {file} of size {num_bytes} bytes from buffer on device {buffer.device}') + + ctxt = {} + ctxt['file'] = file + ctxt['num_bytes'] = num_bytes + ctxt['buffer'] = buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_basic_read(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, True) + return ctxt + + +def pre_basic_write(pool_params): + args, tid = pool_params + ctxt = pre_basic(args, tid, False) + return ctxt + + +def post_basic(pool_params): + _, _, ctxt = pool_params + ctxt["buffer"].detach() + ctxt["buffer"] = None + return ctxt + + +def main_basic_read(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_read(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_basic_write(pool_params): + args, tid, ctxt = pool_params + start_time = time.time() + AsyncIOBuilder().load().aio_write(ctxt['buffer'], ctxt['file'], args.block_size, args.queue_depth, + args.single_submit, not args.sequential_requests, args.validate) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_basic_read + schedule['post'] = post_basic + schedule['main'] = main_basic_read + else: + schedule['pre'] = pre_basic_write + schedule['post'] = post_basic + schedule['main'] = main_basic_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_basic_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + 
report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_handle.py b/deepspeed/nvme/ds_aio_handle.py new file mode 100755 index 000000000000..47c0cd709ec5 --- /dev/null +++ b/deepspeed/nvme/ds_aio_handle.py @@ -0,0 +1,222 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import torch +import os +import time +from multiprocessing import Pool, Barrier +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from deepspeed.accelerator import get_accelerator +from .test_ds_aio_utils import report_results, task_log, task_barrier, create_filename, create_file + +BUFFER = 'buffer' +BOUNCE_BUFFER = 'bounce_buffer' + + +def pre_handle(args, tid, read_op): + io_string = "Read" if read_op else "Write" + gds = True if args.use_gds else False + device_id, folder = args.mapping_list[tid] + filename = create_filename(folder, args.read, args.io_size, tid) + if args.read and not (os.path.isfile(filename) and os.path.getsize(filename) == args.io_size): + create_file(filename, args.io_size) + + task_log(tid, f'Allocate tensor of size {args.io_size} bytes') + bounce_buffer = None + if args.gpu: + device_name = get_accelerator().device_name(device_id) + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device=device_name) + if not (args.slow_bounce_buffer or gds): + bounce_buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, + device='cpu').pin_memory() + else: + buffer = torch.randint(high=128, size=(args.io_size, ), dtype=torch.uint8, device='cpu').pin_memory() + task_log(tid, + f'{io_string} file {filename} of size {args.io_size} bytes from buffer on device {buffer.device}', + force=True) + + io_parallel = args.io_parallel if args.io_parallel else 1 + if gds: + handle = GDSBuilder().load().gds_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + handle.pin_device_tensor(buffer) + else: + handle = AsyncIOBuilder().load().aio_handle(args.block_size, args.queue_depth, args.single_submit, + not args.sequential_requests, io_parallel) + task_log(tid, f'created deepspeed aio handle') + + ctxt = {} + ctxt['file'] = filename + ctxt['num_bytes'] = args.io_size + ctxt['handle'] = handle + ctxt['gds'] = gds + ctxt[BUFFER] = buffer + ctxt[BOUNCE_BUFFER] = bounce_buffer + ctxt['elapsed_sec'] = 0 + + return ctxt + + +def pre_handle_read(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, True) + return ctxt + + +def pre_handle_write(pool_params): + args, tid = pool_params + ctxt = pre_handle(args, tid, False) + return ctxt + + +def post_handle(pool_params): + _, _, ctxt = pool_params + for buf in [BUFFER, BOUNCE_BUFFER]: + if ctxt[buf] is not None: + if ctxt['gds']: + ctxt['handle'].unpin_device_tensor(ctxt[buf]) + ctxt[buf].detach() + ctxt[buf] = None + return ctxt + + +def main_parallel_read(pool_params): + args, tid, ctxt = pool_params + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.pread(ctxt[dest_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + return ctxt + + +def main_parallel_write(pool_params): + args, tid, 
ctxt = pool_params + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.pwrite(ctxt[source_buffer], ctxt['file'], args.validate, True) + assert ret != -1 + handle.wait() + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_read(pool_parms): + args, tid, ctxt = pool_parms + handle = ctxt['handle'] + + start_time = time.time() + dest_buffer = BOUNCE_BUFFER if ctxt[BOUNCE_BUFFER] is not None else BUFFER + ret = handle.read(ctxt[dest_buffer], ctxt['file'], args.validate) + assert ret != -1 + if dest_buffer == BOUNCE_BUFFER: + ctxt[BUFFER].data.copy_(ctxt[BOUNCE_BUFFER].data) + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def main_handle_write(pool_parms): + args, tid, ctxt = pool_parms + # Avoid overwriting existing files as it could be artificially faster + if os.path.isfile(ctxt['file']): + os.remove(ctxt['file']) + + handle = ctxt['handle'] + start_time = time.time() + if ctxt[BOUNCE_BUFFER] is not None: + source_buffer = BOUNCE_BUFFER + ctxt[BOUNCE_BUFFER].data.copy_(ctxt[BUFFER].data) + else: + source_buffer = BUFFER + ret = handle.write(ctxt[source_buffer], ctxt['file'], args.validate) + assert ret != -1 + end_time = time.time() + ctxt['elapsed_sec'] += end_time - start_time + + return ctxt + + +def get_schedule(args, read_op): + schedule = {} + if read_op: + schedule['pre'] = pre_handle_read + schedule['post'] = post_handle + schedule['main'] = main_parallel_read + else: + schedule['pre'] = pre_handle_write + schedule['post'] = post_handle + schedule['main'] = main_parallel_write + + return schedule + + +def _aio_handle_tasklet(pool_params): + args, tid, read_op = pool_params + num_processes = len(args.mapping_dict) + + # Create schedule + schedule = get_schedule(args, read_op) + task_log(tid, f'schedule = {schedule}') + task_barrier(aio_barrier, num_processes) + + # Run pre task + task_log(tid, f'running pre-task') + ctxt = schedule["pre"]((args, tid)) + task_barrier(aio_barrier, num_processes) + + # Run main tasks in a loop + ctxt["main_task_sec"] = 0 + for i in range(args.loops): + task_log(tid, f'running main task {i}') + start_time = time.time() + ctxt = schedule["main"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + stop_time = time.time() + ctxt["main_task_sec"] += stop_time - start_time + + # Run post task + task_log(tid, f'running post-task') + ctxt = schedule["post"]((args, tid, ctxt)) + task_barrier(aio_barrier, num_processes) + + return ctxt["main_task_sec"], ctxt["elapsed_sec"], ctxt["num_bytes"] * args.loops + + +def _init_tasklet(b): + global aio_barrier + aio_barrier = b + + +def aio_handle_multiprocessing(args, read_op): + num_processes = len(args.mapping_dict) + b = Barrier(num_processes) + pool_params = [(args, p, read_op) for p in range(num_processes)] + with Pool(processes=num_processes, initializer=_init_tasklet, initargs=(b, )) as p: + pool_results = p.map(_aio_handle_tasklet, pool_params) + + report_results(args, read_op, pool_results) diff --git a/deepspeed/nvme/ds_aio_job.py b/deepspeed/nvme/ds_aio_job.py new file mode 100644 index 000000000000..0f9c8b5f1bcc --- /dev/null +++ b/deepspeed/nvme/ds_aio_job.py @@ -0,0 +1,50 @@ +# Copyright (c) 
Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping tensors to/from (NVMe) storage devices. +""" +import subprocess +import shlex + + +class Job(object): + + def __init__(self, cmd_line, output_file=None, work_dir=None): + self.cmd_line = cmd_line + self.output_file = output_file + self.work_dir = work_dir + self.output_fd = None + + def cmd(self): + return self.cmd_line + + def get_stdout(self): + return self.output_fd + + def get_stderr(self): + return self.output_fd + + def get_cwd(self): + return self.work_dir + + def open_output_file(self): + if self.output_file is not None: + self.output_fd = open(self.output_file, 'w') + + def close_output_file(self): + if self.output_fd is not None: + self.output_fd.close() + self.output_fd = None + + +def run_job(job, verbose=False): + args = shlex.split(' '.join(job.cmd())) + if verbose: + print(f'args = {args}') + job.open_output_file() + proc = subprocess.run(args=args, stdout=job.get_stdout(), stderr=job.get_stderr(), cwd=job.get_cwd()) + job.close_output_file() + assert proc.returncode == 0, \ + f"This command failed: {job.cmd()}" diff --git a/deepspeed/nvme/parse_nvme_stats.py b/deepspeed/nvme/parse_nvme_stats.py new file mode 100755 index 000000000000..09c79ada5b36 --- /dev/null +++ b/deepspeed/nvme/parse_nvme_stats.py @@ -0,0 +1,148 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +import argparse + +READ_SPEED = 'read_speed' +WRITE_SPEED = 'write_speed' + +PERF_METRICS = [READ_SPEED, WRITE_SPEED] + +METRIC_SEARCH = {READ_SPEED: 'E2E Read Speed', WRITE_SPEED: 'E2E Write Speed'} + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', type=str, required=True, help='Folder of statistics logs') + + parser.add_argument('--metric', + type=str, + required=True, + help='Performance metric to report: [read_speed|write_speed]') + + args = parser.parse_args() + print(f'args = {args}') + + return args + + +def extract_value(key, file): + INVALID_PREFIXES = ["ds"] + for p in INVALID_PREFIXES: + if key.startswith(p): + return key + try: + if key[0] in ['t', 'd', 'p']: + return int(key[1:]) + if key.startswith("bs"): + if key.endswith('K'): + v = key[2:].split('K') + return int(v[0]) * 1024 + elif key.endswith('M'): + v = key[2:].split('M') + return int(v[0]) * 1024 * 1024 + else: + return int(key[2:]) + except: + print(f"{file}: extract_value fails on {key}") + return None + + return key + + +def get_file_key(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + values = [extract_value(k, file) for k in fields] + return tuple(values) + + +def get_thread_count(file): + f, _ = os.path.splitext(os.path.basename(file)) + fields = f.split('_') + for key in fields: + if key[0] == 't': + return int(key[1:]) + return 1 + + +""" +Extract performance metric from log file. 
+Sample file lines are: +Task Read Latency = 0.031647682189941406 sec +Task Read Speed = 12.342926020792527 GB/sec +E2E Read Latency = 0.031697988510131836 sec +E2E Read Speed = 12.323337169333062 GB/sec + +For the above sample, -metric = "read_speed" corresponds to "E2E Read Speed", and 12.32 will be returned +""" + + +def get_metric(file, metric): + thread_count = get_thread_count(file) + with open(file) as f: + for line in f.readlines(): + if line.startswith(METRIC_SEARCH[metric]): + if metric in [READ_SPEED, WRITE_SPEED]: + fields = line.split() + return float(fields[-2]) + else: + fields = line.split('=') + return float(fields[-1]) + + return None + + +def validate_args(args): + if not args.metric in PERF_METRICS: + print(f'{args.metric} is not a valid performance metrics') + return False + + if not os.path.isdir(args.log_dir): + print(f'{args.log_dir} folder is not existent') + return False + + return True + + +def get_results(log_files, metric): + results = {} + for f in log_files: + file_key = get_file_key(f) + value = get_metric(f, metric) + results[file_key] = value + + return results + + +def get_sorted_results(log_dir, metric): + log_files = [f for f in os.listdir(log_dir) if os.path.isfile(os.path.join(log_dir, f))] + + log_files_path = [os.path.join(log_dir, f) for f in log_files] + results = get_results(log_files_path, metric) + result_keys = list(results.keys()) + sorted_keys = sorted(result_keys) + return sorted_keys, results + + +def main(): + print("Parsing aio statistics") + args = parse_arguments() + + if not validate_args(args): + quit() + + sorted_keys, results = get_sorted_results(args.log_dir, args.metric) + for k in sorted_keys: + print(f'{k} = {results[k]}') + + +if __name__ == "__main__": + main() diff --git a/deepspeed/nvme/perf_generate_param.py b/deepspeed/nvme/perf_generate_param.py new file mode 100644 index 000000000000..d0313d728ad5 --- /dev/null +++ b/deepspeed/nvme/perf_generate_param.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +import os +import argparse +import json +from .parse_nvme_stats import READ_SPEED, WRITE_SPEED, get_sorted_results +from .perf_sweep_utils import BENCH_LOG_DIR, READ_LOG_DIR, WRITE_LOG_DIR + + +def parse_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Folder of performance sweep logs. 
Default is {os.path.join(".", BENCH_LOG_DIR)}')
+    parser.add_argument('--verbose', action='store_true', help='Print debugging information.')
+
+    args = parser.parse_args()
+    if args.verbose:
+        print(f'args = {args}')
+
+    return args
+
+
+def validate_args(args):
+    for d in [READ_LOG_DIR, WRITE_LOG_DIR]:
+        log_dir = os.path.join(args.log_dir, d)
+        if not os.path.isdir(log_dir):
+            print(f'{log_dir} folder does not exist')
+            return False
+
+    return True
+
+
+def convert_to_param(key):
+    assert len(key) == 6
+    return {
+        "single_submit": "true" if key[0] == "single" else "false",
+        "overlap_events": "true" if key[1] == "overlap" else "false",
+        "num_threads": int(key[5]),
+        "queue_depth": int(key[3]),
+        "block_size": int(key[4])
+    }
+
+
+def generate_aio_param(read_log_dir, write_log_dir):
+    _, read_results = get_sorted_results(read_log_dir, READ_SPEED)
+    _, write_results = get_sorted_results(write_log_dir, WRITE_SPEED)
+    combined_perf = {key[1:]: value for key, value in read_results.items()}
+
+    for key, value in write_results.items():
+        new_key = key[1:]
+        if new_key in combined_perf:
+            combined_perf[new_key] += value
+        else:
+            combined_perf[new_key] = 0
+
+    optimal_key = None
+    optimal_perf = 0.0
+    for key, value in combined_perf.items():
+        if value > optimal_perf:
+            optimal_perf = value
+            optimal_key = key
+
+    aio_param = {"aio": convert_to_param(optimal_key)}
+
+    read_perf_keys = {key[1:]: key for key in read_results.keys()}
+    write_perf_keys = {key[1:]: key for key in write_results.keys()}
+    optimal_config_read = read_results.get(read_perf_keys[optimal_key], None)
+    optimal_config_write = write_results.get(write_perf_keys[optimal_key], None)
+
+    print(f'Best performance (GB/sec): read = {optimal_config_read:5.2f}, write = {optimal_config_write:5.2f}')
+    print(json.dumps(aio_param, indent=3))
+
+
+def generate_main(log_dir):
+    read_log_dir = os.path.join(log_dir, READ_LOG_DIR)
+    write_log_dir = os.path.join(log_dir, WRITE_LOG_DIR)
+    generate_aio_param(read_log_dir, write_log_dir)
+
+
+def main():
+    args = parse_arguments()
+    if not validate_args(args):
+        quit()
+    print(f'Generate DeepNVMe configuration from {args.log_dir} logs')
+    generate_main(args.log_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/deepspeed/nvme/perf_run_sweep.py b/deepspeed/nvme/perf_run_sweep.py
new file mode 100644
index 000000000000..0155a4d46cae
--- /dev/null
+++ b/deepspeed/nvme/perf_run_sweep.py
@@ -0,0 +1,320 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
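+The sweep launches the ds_io benchmark over the cross product of the configured block sizes, queue depths, submission modes, and I/O parallelism settings, writing one log file per configuration into separate read and write log folders.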
+""" +import os +import sys +import argparse +import json +import itertools +import shutil + +from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder +from .ds_aio_job import Job, run_job +from .perf_sweep_utils import READ_OP_DESC, WRITE_OP_DESC, BENCH_LOG_DIR, \ + READ_LOG_DIR, WRITE_LOG_DIR + +OTHER_OPTIONS = '--handle' +PERF_SCRIPT = 'ds_io' +DEFAULT_SWEEP_CONFIG = { + "block_size": ["1M", "8M"], + "queue_depth": [32, 128], + "sequential_requests": [False], + "single_submit": [False], + "io_parallel": [1, 8], +} + + +class SweepConfig(object): + + def __init__(self, args): + self.folder_to_device_mapping = get_ftd_map(args.nvme_dir) + self.search_space = get_sweep_config_dict(args.sweep_config) + self.search_space.update(self.folder_to_device_mapping) + self.read = not args.no_read + self.write = not args.no_write + self.flush_cache = args.flush_page_cache + self.log_dir = args.log_dir + self.verbose = args.verbose + self.other_options = f'{OTHER_OPTIONS} --loops {args.loops} --io_size {args.io_size}' + if args.gpu: + self.other_options += ' --gpu' + if args.gds: + self.other_options += ' --use_gds' + + +def validate_arguments(args): + if not async_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed async i/o module. + Possible fix: apt install libaio-dev. + """ + print(error_msg) + quit() + + if args.gds and not gds_io_setup(): + error_msg = """ + Failing because environment is not properly configured for deepspeed GDS I/O operator. + """ + print(error_msg) + quit() + + +def parse_sweep_arguments(): + parser = argparse.ArgumentParser() + + parser.add_argument('--nvme_dir', + nargs='+', + required=True, + help='Directory in which to perform I/O tests. A writeable directory on a NVMe device.') + + parser.add_argument('--sweep_config', type=str, default=None, help='Performance sweep configuration json file.') + + parser.add_argument('--no_read', action='store_true', help='Disable read performance measurements.') + + parser.add_argument('--no_write', action='store_true', help='Disable write performance measurements.') + + parser.add_argument('--io_size', + type=str, + default="400M", + help='Number of I/O bytes to read/write for performance measurements.') + + parser.add_argument('--gpu', action='store_true', help='Test tensor transfers between GPU device and NVME device.') + + parser.add_argument('--gds', action='store_true', help='Run the sweep over NVIDIA GPUDirectStorage operator') + + parser.add_argument( + '--flush_page_cache', + action='store_true', + help= + 'Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***.' + ) + + parser.add_argument( + '--log_dir', + type=str, + default=BENCH_LOG_DIR, + help=f'Output directory for performance log files. 
Default is {os.path.join(".", BENCH_LOG_DIR)}') + + parser.add_argument('--loops', type=int, default=1, help='Count of operation repetitions') + + parser.add_argument('--verbose', action='store_true', help='Print debugging information.') + + args = parser.parse_args() + if args.verbose: + print(f'args = {args}') + validate_arguments(args) + + return args + + +def dump_cmd_lines(cmd_lines): + print(f'cmd line count = {len(cmd_lines)}') + for i, cmd in enumerate(cmd_lines): + print(f'{i}: {cmd}') + + +def get_ftd_map(nvme_dir_list): + ftd_list = [f'{dir}:{dev}' for dev, dir in enumerate(nvme_dir_list)] + ftd_arg = [' '.join(ftd for ftd in ftd_list)] + return {'folder_to_device_mapping': ftd_arg} + + +def get_sweep_config_dict(sweep_config_json): + if sweep_config_json is None: + return DEFAULT_SWEEP_CONFIG + + with open(sweep_config_json) as fp: + sweep_config = json.load(fp) + return sweep_config + + +def get_sweep_cmd_lines(sweep_config_dict): + + def flatten_options(key, value_list): + flat_list = [] + for v in value_list: + if not type(v) is bool: + flat_list.append(f'--{key} {v}') + elif v: + flat_list.append(f'--{key}') + else: + flat_list.append(' ') + + return flat_list + + flat_list = [flatten_options(key, value) for key, value in sweep_config_dict.items()] + cmd_list = list(itertools.product(*flat_list)) + cmd_list = [list(cmd) for cmd in cmd_list] + #dump_cmd_lines(cmd_list) + return cmd_list + + +def launch_sweep(sweep_jobs, sync_job, flush_cache_job, verbose): + for perf_job in sweep_jobs: + if flush_cache_job is not None: + run_job(sync_job, verbose) + run_job(flush_cache_job, verbose) + + run_job(perf_job, verbose) + + run_job(sync_job, verbose) + + +def create_cmd_tags(cmd_line): + tags = {} + for param_value in cmd_line: + fields = param_value.split() + if len(fields) == 1: + tags[fields[0]] = None + elif len(fields) == 2: + if fields[0] == '--folder_to_device_mapping': + tags[fields[0]] = len(fields[1:]) + else: + tags[fields[0]] = fields[1] + elif len(fields) > 2: + tags[fields[0]] = len(fields[1:]) + return tags + + +def get_log_file(io_op_desc, cmd_line): + QUEUE_DEPTH = "--queue_depth" + BLOCK_SIZE = "--block_size" + SINGLE_SUBMIT = "--single_submit" + SEQUENTIAL_REQUESTS = "--sequential_requests" + FTD_MAP = "--folder_to_device_mapping" + IO_PARALLEL = "--io_parallel" + + tag_map = { + QUEUE_DEPTH: "d", + BLOCK_SIZE: "bs", + SINGLE_SUBMIT: "single", + SEQUENTIAL_REQUESTS: "sequential", + FTD_MAP: "ftd", + IO_PARALLEL: "p" + } + + tag_default = { + QUEUE_DEPTH: 1, + BLOCK_SIZE: "1M", + SINGLE_SUBMIT: "block", + SEQUENTIAL_REQUESTS: "overlap", + FTD_MAP: 1, + IO_PARALLEL: 1 + } + + def get_default_value(tag): + value = tag_default[tag] + if tag in [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS]: + return value + return f'{tag_map[tag]}{value}' + + def get_config_value(tag, value): + tag_key = tag_map[tag] + if value is None: + return tag_key + return f'{tag_key}{value}' + + tag_list = [SINGLE_SUBMIT, SEQUENTIAL_REQUESTS, FTD_MAP, QUEUE_DEPTH, BLOCK_SIZE, IO_PARALLEL] + log_tags = [io_op_desc] + cmd_tags = create_cmd_tags(cmd_line) + for tag in tag_list: + if tag in cmd_tags: + log_tags.append(get_config_value(tag, cmd_tags[tag])) + else: + log_tags.append(get_default_value(tag)) + + log_file = '_'.join(log_tags) + log_file += '.txt' + return log_file + + +def create_perf_jobs(io_op_desc, log_dir, cmd_lines): + py_cmd = [os.path.join(script_path(), PERF_SCRIPT)] + + perf_jobs = [] + for cmd in cmd_lines: + log_file = os.path.join(log_dir, get_log_file(io_op_desc, cmd)) + job = 
Job(cmd_line=py_cmd + cmd, output_file=log_file)
+        perf_jobs.append(job)
+
+    return perf_jobs
+
+
+def script_path():
+    return os.path.dirname(os.path.realpath(sys.argv[0]))
+
+
+def async_io_setup():
+    return AsyncIOBuilder().is_compatible()
+
+
+def gds_io_setup():
+    return GDSBuilder().is_compatible()
+
+
+def remove_folder(folder):
+    assert os.path.isdir(folder), f"Error: cannot remove {folder} - folder not found"
+    shutil.rmtree(folder)
+
+
+def run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
+    read_cmd_lines = [[f'--read {sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+    # dump_cmd_lines(cmd_lines)
+
+    log_folder = os.path.join(sweep_config.log_dir, f'{READ_LOG_DIR}')
+    os.makedirs(log_folder, exist_ok=True)
+
+    perf_jobs = create_perf_jobs(io_op_desc=READ_OP_DESC, log_dir=log_folder, cmd_lines=read_cmd_lines)
+
+    launch_sweep(sweep_jobs=perf_jobs,
+                 sync_job=sync_job,
+                 flush_cache_job=flush_cache_job,
+                 verbose=sweep_config.verbose)
+
+
+def run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines):
+    write_cmd_lines = [[f'{sweep_config.other_options}'] + cmd for cmd in cmd_lines]
+    # dump_cmd_lines(write_cmd_lines)
+
+    log_folder = os.path.join(sweep_config.log_dir, f'{WRITE_LOG_DIR}')
+    os.makedirs(log_folder, exist_ok=True)
+
+    perf_jobs = create_perf_jobs(io_op_desc=WRITE_OP_DESC, log_dir=log_folder, cmd_lines=write_cmd_lines)
+
+    launch_sweep(sweep_jobs=perf_jobs,
+                 sync_job=sync_job,
+                 flush_cache_job=flush_cache_job,
+                 verbose=sweep_config.verbose)
+
+
+def sweep_main(args):
+    sweep_config = SweepConfig(args)
+    cmd_lines = get_sweep_cmd_lines(sweep_config.search_space)
+
+    if sweep_config.flush_cache:
+        flush_cache_job = Job(cmd_line=['sudo', 'bash -c', "'echo 1 > /proc/sys/vm/drop_caches'"])
+    else:
+        flush_cache_job = None
+
+    sync_job = Job(cmd_line=['sync'])
+
+    if sweep_config.read:
+        run_read_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)
+
+    if sweep_config.write:
+        run_write_sweep(sweep_config, flush_cache_job, sync_job, cmd_lines)
+
+
+def main():
+    args = parse_sweep_arguments()
+    print(f"Running DeepNVMe performance sweep on {args.nvme_dir}")
+    sweep_main(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/deepspeed/nvme/perf_sweep_utils.py b/deepspeed/nvme/perf_sweep_utils.py
new file mode 100644
index 000000000000..e6832c1baa49
--- /dev/null
+++ b/deepspeed/nvme/perf_sweep_utils.py
@@ -0,0 +1,13 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+SCRIPT_PREFIX = '_aio_bench'
+WRITE_OP_DESC = 'write'
+READ_OP_DESC = 'read'
+READ_IO_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_io'
+WRITE_IO_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_io'
+BENCH_LOG_DIR = f'{SCRIPT_PREFIX}_logs'
+READ_LOG_DIR = f'{SCRIPT_PREFIX}_{READ_OP_DESC}_logs'
+WRITE_LOG_DIR = f'{SCRIPT_PREFIX}_{WRITE_OP_DESC}_logs'
diff --git a/deepspeed/nvme/test_ds_aio.py b/deepspeed/nvme/test_ds_aio.py
new file mode 100755
index 000000000000..a17350414739
--- /dev/null
+++ b/deepspeed/nvme/test_ds_aio.py
@@ -0,0 +1,25 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""
+Functionality of swapping optimizer tensors to/from (NVMe) storage devices.
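+Command-line front end for the ds_io benchmark; dispatches to the handle-based or basic I/O path depending on the --handle flag.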
+""" + +import multiprocessing as mp +from .ds_aio_basic import aio_basic_multiprocessing +from .ds_aio_handle import aio_handle_multiprocessing +from .ds_aio_args import get_validated_args + + +def ds_io_main(): + print(f'Testing deepspeed_aio python frontend') + + args = get_validated_args() + mp.set_start_method('spawn') + multiprocess_function = aio_handle_multiprocessing if args.handle else aio_basic_multiprocessing + multiprocess_function(args, args.read) + + +if __name__ == "__main__": + ds_io_main() diff --git a/deepspeed/nvme/test_ds_aio_utils.py b/deepspeed/nvme/test_ds_aio_utils.py new file mode 100755 index 000000000000..cf167f647460 --- /dev/null +++ b/deepspeed/nvme/test_ds_aio_utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" + +import os +from .ds_aio_job import Job, run_job + +BYTES_PER_GB = 1024**3 +BYTES_PER_MB = 1024**2 +BYTES_PER_KB = 1024 +LOG_TIDS = [0] + + +def task_log(tid, msg, force=False): + if force or tid in LOG_TIDS: + print(f'tid {tid}: {msg}') + + +def task_barrier(barrier, num_parties): + assert barrier.parties == num_parties + barrier.wait() + assert barrier.broken == False + + +def report_results(args, read_op, pool_results): + #print(f'pool_results = {pool_results}') + io_string = 'Read' if read_op else 'Write' + if None in pool_results: + print(f'Failure in one of {args.threads} {io_string} processes') + return + + total_bytes = sum([num_bytes for _, _, num_bytes in pool_results]) + + task_latency_sec = max([sec for _, sec, _ in pool_results]) + task_speed_GB = 0 if task_latency_sec == 0 else total_bytes / task_latency_sec / BYTES_PER_GB + print(f'Task {io_string} Latency = {task_latency_sec} sec') + print(f'Task {io_string} Speed = {task_speed_GB} GB/sec') + + e2e_latency_sec = max([sec for sec, _, _ in pool_results]) + e2e_speed_GB = 0 if e2e_latency_sec == 0 else total_bytes / e2e_latency_sec / BYTES_PER_GB + print(f'E2E {io_string} Latency = {e2e_latency_sec} sec') + print(f'E2E {io_string} Speed = {e2e_speed_GB} GB/sec') + + +def get_block_size_and_count(io_bytes): + if io_bytes > BYTES_PER_MB and io_bytes % BYTES_PER_MB == 0: + block_size = BYTES_PER_MB + block_size_string = '1M' + else: + assert io_bytes % BYTES_PER_KB == 0 + block_size = BYTES_PER_KB + block_size_string = '1K' + block_count = io_bytes / block_size + + return block_size_string, int(block_count) + + +def refine_integer_value(value): + unit_dict = {'K': 1024, 'M': 1024**2, 'G': 1024**3} + + if value[-1] in list(unit_dict.keys()): + int_value = int(value[:-1]) * unit_dict[value[-1]] + return int_value + return int(value) + + +def create_filename(folder, read_op, size, tid): + io_string = "read" if read_op else "write" + return os.path.join(folder, f'_aio_{io_string}_{size}.pt.{tid}') + + +def create_file(filename, num_bytes): + block_size, block_count = get_block_size_and_count(num_bytes) + dd_job = Job(cmd_line=[f'dd if=/dev/urandom of={filename} bs={block_size} count={block_count}']) + print(f'[Start] Create {filename} of {num_bytes} bytes by running {dd_job.cmd()} ....') + run_job(dd_job) + print(f'[Done] Create read file of {num_bytes} bytes by running {dd_job.cmd()} ....') diff --git a/deepspeed/nvme/validate_async_io.py b/deepspeed/nvme/validate_async_io.py new file mode 100644 index 000000000000..10fb638347bc --- /dev/null +++ b/deepspeed/nvme/validate_async_io.py @@ -0,0 +1,10 @@ +# Copyright (c) Microsoft 
Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +""" +Functionality of swapping optimizer tensors to/from (NVMe) storage devices. +""" +from deepspeed.ops.op_builder import AsyncIOBuilder +assert AsyncIOBuilder().is_compatible() +assert AsyncIOBuilder().load() diff --git a/deepspeed/ops/__init__.py b/deepspeed/ops/__init__.py index ba1c9c1fd9f0..15179984173c 100755 --- a/deepspeed/ops/__init__.py +++ b/deepspeed/ops/__init__.py @@ -7,11 +7,9 @@ from . import adagrad from . import lamb from . import lion -#from ..git_version_info_installed import installed_ops as __installed_ops__ -#if __installed_ops__['sparse_attn']: from . import sparse_attention from . import transformer - +from . import fp_quantizer from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from ..git_version_info import compatible_ops as __compatible_ops__ diff --git a/deepspeed/ops/adagrad/cpu_adagrad.py b/deepspeed/ops/adagrad/cpu_adagrad.py index c356a52777f2..dbde6d95f652 100755 --- a/deepspeed/ops/adagrad/cpu_adagrad.py +++ b/deepspeed/ops/adagrad/cpu_adagrad.py @@ -34,7 +34,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. note:: @@ -46,8 +46,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. @@ -94,16 +92,7 @@ def step(self, closure=None, fp16_param_groups=None): sparse_exp_avg_sq.values()) p[sparse_param.indices()] = sparse_param.values() state['exp_avg_sq'][sparse_exp_avg_sq.indices()] = sparse_exp_avg_sq.values() - if fp16_param_groups is not None: - fp16_param_groups[group_id][param_id][sparse_param.indices()] = sparse_param.values() else: - if fp16_param_groups is not None: - self.ds_opt_adagrad.adagrad_update_copy(self.opt_id, state['step'], group['lr'], group['eps'], - group['weight_decay'], p.data, p.grad.data, - state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], - group['weight_decay'], p.data, p.grad.data, - state['exp_avg_sq']) + self.ds_opt_adagrad.adagrad_update(self.opt_id, state['step'], group['lr'], group['eps'], + group['weight_decay'], p.data, p.grad.data, state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/adam/cpu_adam.py b/deepspeed/ops/adam/cpu_adam.py index 1ffaf873f4e9..e0a72a494257 100755 --- a/deepspeed/ops/adam/cpu_adam.py +++ b/deepspeed/ops/adam/cpu_adam.py @@ -63,7 +63,7 @@ def __init__(self, algorithm from the paper `On the Convergence of Adam and Beyond`_ (default: False) NOT SUPPORTED in DeepSpeed CPUAdam! adamw_mode: select between Adam and AdamW implementations (default: AdamW) - full_precision_optimizer_states: creates momentum and variance in full precision regardless of + fp32_optimizer_states: creates momentum and variance in full precision regardless of the precision of the parameters (default: True) """ @@ -107,7 +107,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. 
note:: @@ -119,8 +119,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. @@ -134,13 +132,6 @@ def step(self, closure=None, fp16_param_groups=None): # intended device for step device = torch.device('cpu') - # converting the fp16 params to a group of parameter - if type(fp16_param_groups) is list: - if type(fp16_param_groups[0]) is not list: - fp16_param_groups = [fp16_param_groups] - elif fp16_param_groups is not None: - fp16_param_groups = [[fp16_param_groups]] - for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): @@ -169,13 +160,7 @@ def step(self, closure=None, fp16_param_groups=None): state['step'] += 1 beta1, beta2 = group['betas'] - if fp16_param_groups is not None: - self.ds_opt_adam.adam_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['eps'], group['weight_decay'], group['bias_correction'], - p.data, p.grad.data, state['exp_avg'], state['exp_avg_sq'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], - group['weight_decay'], group['bias_correction'], p.data, p.grad.data, - state['exp_avg'], state['exp_avg_sq']) + self.ds_opt_adam.adam_update(self.opt_id, state['step'], group['lr'], beta1, beta2, group['eps'], + group['weight_decay'], group['bias_correction'], p.data, p.grad.data, + state['exp_avg'], state['exp_avg_sq']) return loss diff --git a/deepspeed/ops/fp_quantizer/__init__.py b/deepspeed/ops/fp_quantizer/__init__.py new file mode 100644 index 000000000000..f9cf23373c26 --- /dev/null +++ b/deepspeed/ops/fp_quantizer/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .quantize import FP_Quantize, Quantizer +from .fp8_gemm import matmul_fp8 diff --git a/deepspeed/ops/fp_quantizer/fp8_gemm.py b/deepspeed/ops/fp_quantizer/fp8_gemm.py new file mode 100644 index 000000000000..db4fa5ae2c92 --- /dev/null +++ b/deepspeed/ops/fp_quantizer/fp8_gemm.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +######## Fused MoE kernel ######### +# These kernels are implemented for +# fusing GeMM with dequantization of +# fp8 weight data when using bit-16 +# activation. +################################### + +import torch + + +def matmul_fp8(inp, weight, scale, quantization_group_size, quantizer): + from deepspeed import get_accelerator + + if not get_accelerator().is_triton_supported(): + return matmul_fp8_fallback(inp, weight, scale, quantization_group_size, quantizer) + else: + # Import dynamically to prevent failures on systems without triton. 
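+        # The Triton path fuses dequantization into the GEMM kernel, while the non-Triton fallback dequantizes the full weight and then calls torch.matmul.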
+ from .fp8_gemm_triton import matmul_fp8_triton + return matmul_fp8_triton(inp, weight, scale, quantization_group_size) + + +def matmul_fp8_fallback(inp, weight, scale, quantization_group_size, quantizer): + return torch.matmul(inp, quantizer.dequantize(weight, scale=scale)) diff --git a/deepspeed/ops/fp_quantizer/fp8_gemm_triton.py b/deepspeed/ops/fp_quantizer/fp8_gemm_triton.py new file mode 100644 index 000000000000..746e217d4194 --- /dev/null +++ b/deepspeed/ops/fp_quantizer/fp8_gemm_triton.py @@ -0,0 +1,171 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +######## Fused MoE kernel ######### +# These kernels are implemented for +# fusing GeMM with dequantization of +# fp8 weight data when using bit-16 +# activation. +################################### + +import torch +import triton +import triton.language as tl + + +@triton.jit +def matmul_kernel_fp8_bf16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk, + stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, + quantization_group_size: tl.constexpr): + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + + inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + ( + (pid_n * BLOCK_SIZE_N) // quantization_group_size) + + weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0) + scale = tl.load(scale_ptr + weight_ptrs_offset) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + # Dequantize weight (fp8 -> bf16) + w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 4)).to(tl.uint16) + w = (w + 0x3C00).to(tl.uint16) + w = (w.to(tl.bfloat16, bitcast=True) * scale).to(tl.bfloat16) + + inp_data += BLOCK_SIZE_K * stride_ak + weight_data += BLOCK_SIZE_K * stride_bk + weight_mask = offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K + weight = tl.load(weight_data, mask=weight_mask, other=0.0) + scale = tl.load(scale_ptr + (weight_ptrs_offset + + (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size)), + mask=weight_mask, + other=0.0) + + accumulator += tl.dot(inp, w) + + out = accumulator.to(tl.bfloat16) + + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N)) + + +@triton.jit +def matmul_kernel_fp8_fp16(inp_ptr, weight_ptr, out_ptr, scale_ptr, M, N, K, stride_am, stride_ak, stride_bk, + stride_bn, stride_cm, stride_cn, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: 
tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr, + quantization_group_size: tl.constexpr): + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + + inp_data = inp_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + weight_data = weight_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + weight_ptrs_offset = offs_k[:, None] * (stride_bk // quantization_group_size) + ( + (pid_n * BLOCK_SIZE_N) // quantization_group_size) + + weight = tl.load(weight_data, mask=offs_k[:, None] < K, other=0.0) + scale = tl.load(scale_ptr + weight_ptrs_offset) + + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + inp = tl.load(inp_data, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + # Dequantize weight (fp8 -> fp16) + w = (((weight & 0x80) << 8) | ((weight & 0x7f) << 7)).to(tl.uint16) + w = (w + 0x2000).to(tl.uint16) + w = (w.to(tl.float16, bitcast=True) * scale).to(tl.float16) + + inp_data += BLOCK_SIZE_K * stride_ak + weight_data += BLOCK_SIZE_K * stride_bk + + weight = tl.load(weight_data, mask=offs_k[:, None] < K - (k + 1) * BLOCK_SIZE_K, other=0.0) + scale = tl.load(scale_ptr + (weight_ptrs_offset + + (((k + 1) * BLOCK_SIZE_K * stride_bk) // quantization_group_size))) + + accumulator += tl.dot(inp, w) + + out = accumulator.to(tl.float16) + + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + out_data = out_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + tl.store(out_data, out, mask=(offs_cm[:, None] < M) & (offs_cn[None, :] < N)) + + +def matmul_fp8_triton(inp, weight, scale, quantization_group_size): + + assert inp.shape[1] == weight.shape[0], \ + f"Incompatible dimensions (input: {inp.shape}, weight: {weight.shape})" + + M, K = inp.shape + K, N = weight.shape + + out = torch.empty((M, N), device=inp.device, dtype=inp.dtype) + + # GEMM tuning parameters! 
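+    # Tile sizes are picked heuristically from M; BLOCK_SIZE_K is kept at or above quantization_group_size so K tiles align with scale groups (assuming the usual power-of-two group sizes).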
+ # TODO: Add a more configurable tuning for selecting the best GeMM + BLOCK_SIZE_M = 16 if M <= 16 else 32 if M <= 32 else 64 if M <= 64 else 128 + BLOCK_SIZE_N = 64 + BLOCK_SIZE_K = max(64, quantization_group_size) + GROUP_SIZE_M = 8 + num_stages = 4 + num_warps = 4 + if M >= 256: + BLOCK_SIZE_M = 256 + BLOCK_SIZE_N = 128 + BLOCK_SIZE_K = max(128, quantization_group_size) + num_stages = 3 + num_warps = 8 + + grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) + kernel = matmul_kernel_fp8_bf16 if inp.dtype == torch.bfloat16 else matmul_kernel_fp8_fp16 + kernel[grid](inp, + weight, + out, + scale, + M, + N, + K, + inp.stride(0), + inp.stride(1), + weight.stride(0), + weight.stride(1), + out.stride(0), + out.stride(1), + quantization_group_size=quantization_group_size, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K, + GROUP_SIZE_M=GROUP_SIZE_M, + num_stages=num_stages, + num_warps=num_warps) + return out diff --git a/deepspeed/ops/fp_quantizer/quantize.py b/deepspeed/ops/fp_quantizer/quantize.py new file mode 100644 index 000000000000..69c21eaf693b --- /dev/null +++ b/deepspeed/ops/fp_quantizer/quantize.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import abc +from abc import ABC + +import gc +from deepspeed.ops.op_builder import FPQuantizerBuilder +from deepspeed.accelerator import get_accelerator + +fp_quant_module = None + + +class Quantizer(ABC): + """ + Abstract Quantizer class that implements quantize/dequantize methods. + + Arguments: + group_size (int, optional): number of values or elements that are grouped + together for the quantization process. + """ + + def __init__(self, group_size=512) -> None: + self.group_size = group_size + + @abc.abstractmethod + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + ... + + @abc.abstractmethod + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + ... + + +class FP_Quantize(Quantizer): + + def __init__(self, quantization_config) -> None: + global fp_quant_module + super().__init__(group_size=quantization_config.group_size) + if fp_quant_module is None: + fp_quant_module = FPQuantizerBuilder().load() + self.cuda_impl = getattr(fp_quant_module, "CUDA_IMPL", True) + self.q_config = quantization_config + + self.orig_dtype = None + self.num_groups = None + self.input_q = None + self.scale = None + + def quantize(self, + input, + q_bits=8, + q_mantisa_bits=3, + stochastic_mode=False, + return_meta_tensor=False) -> torch.Tensor: + assert input.dtype == torch.bfloat16, "only support bf16 for now" + if return_meta_tensor: + assert q_bits == 8, "meta tensor is only supported with q_bit=8" + + self.orig_dtype = input.dtype + self.orig_shape = input.shape + + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-quantization, please add the template arguments for the kernel to support this precision!" + + # Adding (group_size - 1) is for padding + self.num_groups = (input.numel() + self.q_config.group_size - 1) // self.q_config.group_size + # group_size should be the minimal number between the defined group size and number of elements in tensor. 
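+        # For example (illustrative): group_size=512 elements at q_bits=8 packs into 512 bytes per group, plus 4 bytes for the inline fp32 scale when the CUDA kernel stores scales alongside the data.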
+ group_size = int(min(self.q_config.group_size, input.numel()) * q_bits) // 8 + # CUDA quantization kernel saves the scale as (fp32) inside the quantized tensor for each group + if self.cuda_impl: + group_size += 4 + # CUDA quantization kernel allocates tensors as uint8, but handles them as fp8 inside the kernel. + self.input_q = torch.ones(self.num_groups, group_size, dtype=self.q_config.q_dtype, device=input.device) + # CUDA quantization kernel attaches scales to quantized result, in python implementation it can't be done + # because they are of different types. + self.scale = torch.ones(self.num_groups, 1, device=input.device) + out = fp_quant_module.quantize(self.input_q, input, self.scale, group_size, stochastic_mode, q_bits, + q_mantisa_bits) + if return_meta_tensor: + if self.cuda_impl: + data, self.scale = out.split(group_size, dim=-1) + data = data.contiguous().reshape(input.shape) + else: + data = out.contiguous().reshape(input.shape) + self.scale = self.scale.contiguous() + del self.input_q + del out + gc.collect() + get_accelerator().empty_cache() + return data, self.scale + + return out + + def to(self, *args, **kwargs): + # Intermediate tensors may need to be moved to different devices + if hasattr(self, 'input_q') and self.input_q is not None: + self.input_q = self.input_q.to(*args, **kwargs) + if hasattr(self, 'scale') and self.scale is not None: + self.scale = self.scale.to(*args, **kwargs) + + def get_scales(self): + return fp_quant_module.get_scales(self.scale, self.num_groups) + + def dequantize(self, input_q, fp_out=None, q_bits=8, q_mantisa_bits=3, scale=None) -> torch.Tensor: + assert (self.orig_dtype is not None), \ + "[De-quantization Error]: you need to call quantize before dequantizing!" + fp_out = torch.empty(self.orig_shape, dtype=self.orig_dtype, + device=input_q.device) if fp_out is None else fp_out + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-dequantization, please add the template arguments for the kernel to support this precision!" + + if scale is not None and self.cuda_impl: + assert input_q.numel() == fp_out.numel(), \ + f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' + input_q = torch.cat([input_q.reshape(-1, self.q_config.group_size), scale], dim=-1).contiguous() + elif scale is not None and not self.cuda_impl: + group_size = int(min(self.q_config.group_size, input_q.numel()) * q_bits) // 8 + input_q = input_q.reshape(-1, group_size) + + fp_quant_module.dequantize(fp_out, input_q, self.scale, self.q_config.group_size, q_mantisa_bits, + q_bits - q_mantisa_bits - 1) + return fp_out + + def selective_dequantize(self, + input_q, + indexes, + fp_out=None, + q_bits=8, + q_mantisa_bits=3, + scale=None) -> torch.Tensor: + assert (not hasattr(self, 'orig_shape') or len(self.orig_shape) == 3), \ + "Selective-Dequantization works on 3d tensor only! Please reshape the tensor before calling dequantize function." + assert (self.orig_dtype is not None), \ + "[De-quantization Error]: you need to call quantize before dequantizing!" 
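+        # Allocate output only for the entries selected by indexes along the first dimension, keeping the remaining dimensions of the original 3D shape.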
+ fp_out = torch.empty( + (indexes.shape[0], + *self.orig_shape[1:]), dtype=self.orig_dtype, device=input_q.device) if fp_out is None else fp_out + if q_bits == 8: + pass + elif q_bits == 12: + q_mantisa_bits = 4 + elif q_bits == 6: + q_mantisa_bits = 2 + elif q_bits == 4: + q_mantisa_bits = 1 + else: + assert (0), \ + f"Missing {q_bits}-dequantization, please add the template arguments for the kernel to support this precision!" + + if scale is not None and self.cuda_impl: + assert input_q.numel() == fp_out.numel(), \ + f'[De-quantization Error]: quantized data should have the same size as original tensor when scale is not None!' + input_q = torch.cat([input_q.reshape(-1, self.q_config.group_size), scale], dim=-1).contiguous() + + fp_quant_module.selective_dequantize(fp_out, input_q, indexes, self.q_config.group_size, q_mantisa_bits, + q_bits - q_mantisa_bits - 1) + return fp_out diff --git a/deepspeed/ops/gds/__init__.py b/deepspeed/ops/gds/__init__.py new file mode 100755 index 000000000000..3c0762c81076 --- /dev/null +++ b/deepspeed/ops/gds/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..op_builder import GDSBuilder diff --git a/deepspeed/ops/lion/cpu_lion.py b/deepspeed/ops/lion/cpu_lion.py index a91a00643873..03342a3fcd34 100755 --- a/deepspeed/ops/lion/cpu_lion.py +++ b/deepspeed/ops/lion/cpu_lion.py @@ -69,7 +69,7 @@ def __setstate__(self, state): group.setdefault('amsgrad', False) @torch.no_grad() - def step(self, closure=None, fp16_param_groups=None): + def step(self, closure=None): """Update the model parameters. .. note:: @@ -81,8 +81,6 @@ def step(self, closure=None, fp16_param_groups=None): Args: closure (callable, optional): closure to compute the loss. Defaults to ``None``. - fp16_param_groups: FP16 GPU parameters to update. Performing the - copy here reduces communication time. Defaults to ``None``. Returns: loss: if ``closure`` is provided. Otherwise ``None``. 
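With the fp16_param_groups argument removed from the CPU Adagrad, Adam, and Lion optimizers in this patch, step() no longer copies updated fp32 master parameters into fp16 GPU copies. A minimal caller-side sketch of that copy, assuming a hypothetical fp16_param_groups list of parameter lists mirroring optimizer.param_groups:

    # Run the CPU optimizer step on the fp32 master parameters, then mirror the
    # results back into the fp16 parameter copies (names are illustrative).
    optimizer.step()
    for fp16_group, fp32_group in zip(fp16_param_groups, optimizer.param_groups):
        for fp16_param, fp32_param in zip(fp16_group, fp32_group['params']):
            fp16_param.data.copy_(fp32_param.data)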
@@ -96,13 +94,6 @@ def step(self, closure=None, fp16_param_groups=None): # intended device for step device = torch.device('cpu') - # converting the fp16 params to a group of parameter - if type(fp16_param_groups) is list: - if type(fp16_param_groups[0]) is not list: - fp16_param_groups = [fp16_param_groups] - elif fp16_param_groups is not None: - fp16_param_groups = [[fp16_param_groups]] - for group_id, group in enumerate(self.param_groups): for param_id, p in enumerate(group['params']): @@ -131,11 +122,6 @@ def step(self, closure=None, fp16_param_groups=None): state['step'] += 1 beta1, beta2 = group['betas'] - if fp16_param_groups is not None: - self.ds_opt_lion.lion_update_copy(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['weight_decay'], p.data, p.grad.data, state['exp_avg'], - fp16_param_groups[group_id][param_id].data) - else: - self.ds_opt_lion.lion_update(self.opt_id, state['step'], group['lr'], beta1, beta2, - group['weight_decay'], p.data, p.grad.data, state['exp_avg']) + self.ds_opt_lion.lion_update(self.opt_id, state['step'], group['lr'], beta1, beta2, + group['weight_decay'], p.data, p.grad.data, state['exp_avg']) return loss diff --git a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py index e25621bd0977..37f065e48631 100755 --- a/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py +++ b/deepspeed/ops/sparse_attention/bert_sparse_self_attention.py @@ -8,7 +8,7 @@ class BertSparseSelfAttention(nn.Module): - """Implements Sparse Self Attention layer of Bert model based on https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373 + """Implements Sparse Self Attention layer of Bert model based on https://github.com/deepspeedai/DeepSpeedExamples/blob/master/bing_bert/nvidia/modelingpreln.py#L373 For more information please see, TODO DeepSpeed Sparse Transformer. diff --git a/deepspeed/ops/transformer/inference/config.py b/deepspeed/ops/transformer/inference/config.py index d5aff4f541f7..c0dd29f4f962 100644 --- a/deepspeed/ops/transformer/inference/config.py +++ b/deepspeed/ops/transformer/inference/config.py @@ -43,6 +43,7 @@ class DeepSpeedInferenceConfig(TransformerConfig): return_tuple: if True, returns the transformer output as a tuple, otherwise returns as a tensor bigscience_bloom: This flag is added temporarily for supporting the BLOOM-176B model architecture. use_triton: This flag is to enable triton kernels in inference or not. + invert_mask: If True, the attention mask is inverted when passed to attention block. 
""" def __init__(self, @@ -80,7 +81,8 @@ def __init__(self, use_triton=False, triton_autotune=False, num_kv=-1, - rope_theta=10000): + rope_theta=10000, + invert_mask=True): super(DeepSpeedInferenceConfig, self).__init__(hidden_size, (intermediate_size if intermediate_size > 0 else 4 * hidden_size), heads, num_hidden_layers) @@ -101,7 +103,6 @@ def __init__(self, self.return_tuple = return_tuple self.mlp_after_attn = mlp_after_attn self.mlp_act_func_type = mlp_act_func_type - self.specialized_mode = False self.training_mp_size = training_mp_size self.bigscience_bloom = bigscience_bloom self.max_out_tokens = max_out_tokens @@ -116,6 +117,7 @@ def __init__(self, self.triton_autotune = triton_autotune self.num_kv = num_kv self.rope_theta = rope_theta + self.invert_mask = invert_mask @classmethod def from_dict(cls, json_object): diff --git a/deepspeed/ops/transformer/inference/diffusers_attention.py b/deepspeed/ops/transformer/inference/diffusers_attention.py index 5efc560db75e..3c2340ccfc6f 100644 --- a/deepspeed/ops/transformer/inference/diffusers_attention.py +++ b/deepspeed/ops/transformer/inference/diffusers_attention.py @@ -10,10 +10,11 @@ from packaging import version as pkg_version from deepspeed.utils.logging import log_dist from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.workspace import WorkspaceOp +from deepspeed.ops.transformer.inference.op_binding.softmax_context import SoftmaxContextOp +from deepspeed.ops.transformer.inference.op_binding import LinearOp +from deepspeed.ops.transformer.inference.op_binding.pad_transform import PadTransformOp -# Cuda modules will be imported if needed -inference_module = None minus_inf = -10000.0 triton_flash_attn = None @@ -36,7 +37,8 @@ class DeepSpeedDiffusersAttentionFunction(Function): @staticmethod def forward(ctx, input, context, input_mask, config, attn_qkvw, attn_qw, attn_kw, attn_vw, attn_qkvb, num_attention_heads_per_partition, norm_factor, hidden_size_per_partition, attn_ow, attn_ob, - do_out_bias, score_context_func, linear_func, triton_flash_attn_kernel, rope_theta): + do_out_bias, score_context_func, linear_func, pad_transform_func, triton_flash_attn_kernel, + rope_theta): def _transpose_for_context(x): x = x.permute(0, 2, 1, 3) @@ -77,7 +79,7 @@ def selfAttention_fp(input, context, input_mask): query = query.contiguous() key = key.contiguous() value = value.contiguous() - query, key, value = inference_module.pad_transform_fp16(query, key, value, config.heads, do_flash_attn) + query, key, value = pad_transform_func(query, key, value, config.heads, do_flash_attn) attention_scores = (torch.matmul(query, key.transpose(-1, -2)) * scale).softmax(dim=-1) context_layer = _transpose_for_context(torch.matmul(attention_scores, value)) @@ -117,10 +119,6 @@ def __init__( data_type = self.config.dtype data_type_fp = torch.half if self.config.dtype == torch.int8 else self.config.dtype - global inference_module - if inference_module is None: - builder = InferenceBuilder() - inference_module = builder.load() if DeepSpeedDiffusersAttention.layer_id == 1: log_dist(f"DeepSpeed-Attention config: {self.config.__dict__}", [0]) @@ -171,26 +169,24 @@ def __init__( self.norm_factor *= math.sqrt(self.config.layer_id + 1) # https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/models/gpt2/modeling_gpt2.py#L191 - if self.config.dtype in [torch.float16, torch.int8]: - self.score_context_func = inference_module.softmax_context_fp16 - 
self.linear_func = inference_module.linear_layer_fp16 - self.allocate_workspace = inference_module.allocate_workspace_fp16 - else: - self.score_context_func = inference_module.softmax_context_fp32 - self.linear_func = inference_module.linear_layer_fp32 - self.allocate_workspace = inference_module.allocate_workspace_fp32 + self.workspace = WorkspaceOp(self.config) + self.score_context_func = SoftmaxContextOp(self.config) + self.linear_func = LinearOp(self.config) + self.pad_transform_func = PadTransformOp(self.config) - def forward(self, input, context=None, input_mask=None): + def allocate_workspace(self, size): + # Allocate memory only on first layer forward if self.config.layer_id == 0: - self.allocate_workspace(self.config.hidden_size, self.config.heads, - input.size()[1], - input.size()[0], DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, - 0, self.config.max_out_tokens, self.config.min_out_tokens) - output = DeepSpeedDiffusersAttentionFunction.apply(input, context, input_mask, self.config, self.attn_qkvw, - self.attn_qw, self.attn_kw, self.attn_vw, self.attn_qkvb, - self.num_attention_heads_per_partition, self.norm_factor, - self.hidden_size_per_partition, self.attn_ow, self.attn_ob, - self.do_out_bias, self.score_context_func, self.linear_func, - self.triton_flash_attn_kernel, self.config.rope_theta) + self.workspace.allocate_workspace(self.config.hidden_size, self.config.heads, size[1], size[0], + DeepSpeedDiffusersAttention.layer_id, self.config.mp_size, False, 0, + self.config.max_out_tokens, self.config.min_out_tokens) + + def forward(self, input, context=None, input_mask=None): + self.allocate_workspace(input.size()) + output = DeepSpeedDiffusersAttentionFunction.apply( + input, context, input_mask, self.config, self.attn_qkvw, self.attn_qw, self.attn_kw, self.attn_vw, + self.attn_qkvb, self.num_attention_heads_per_partition, self.norm_factor, self.hidden_size_per_partition, + self.attn_ow, self.attn_ob, self.do_out_bias, self.score_context_func, self.linear_func, + self.pad_transform_func, self.triton_flash_attn_kernel, self.config.rope_theta) return output diff --git a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py index b0156f905a06..d01638f36e40 100644 --- a/deepspeed/ops/transformer/inference/diffusers_transformer_block.py +++ b/deepspeed/ops/transformer/inference/diffusers_transformer_block.py @@ -10,26 +10,9 @@ from .diffusers_attention import DeepSpeedDiffusersAttention from .bias_add import nhwc_bias_add from .diffusers_2d_transformer import Diffusers2DTransformerConfig -from deepspeed.ops.op_builder import InferenceBuilder, SpatialInferenceBuilder from deepspeed.utils.types import ActivationFuncType - -# Ops will be loaded on demand -transformer_cuda_module = None -spatial_cuda_module = None - - -def load_transformer_module(): - global transformer_cuda_module - if transformer_cuda_module is None: - transformer_cuda_module = InferenceBuilder().load() - return transformer_cuda_module - - -def load_spatial_module(): - global spatial_cuda_module - if spatial_cuda_module is None: - spatial_cuda_module = SpatialInferenceBuilder().load() - return spatial_cuda_module +from .op_binding.gated_activation import GatedActivationOp +from .op_binding.layer_norm import LayerNormOp class DeepSpeedDiffusersTransformerBlock(nn.Module): @@ -76,8 +59,8 @@ def __init__(self, equivalent_module: nn.Module, config: Diffusers2DTransformerC else: self.attn_2_bias = 
nn.Paramaeter(torch.zeros_like(self.norm3_g), requires_grad=False) - self.transformer_cuda_module = load_transformer_module() - load_spatial_module() + self.gated_activation = GatedActivationOp() + self.layer_norm = LayerNormOp() def forward(self, hidden_states, context=None, timestep=None, **kwargs): # In v0.12.0 of diffuser, several new kwargs were added. Capturing @@ -88,17 +71,17 @@ def forward(self, hidden_states, context=None, timestep=None, **kwargs): if "encoder_hidden_states" in kwargs and kwargs["encoder_hidden_states"] is not None: context = kwargs["encoder_hidden_states"] - out_norm_1 = self.transformer_cuda_module.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) + out_norm_1 = self.layer_norm(hidden_states, self.norm1_g, self.norm1_b, self.norm1_eps) out_attn_1 = self.attn_1(out_norm_1) - out_norm_2, out_attn_1 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_2, out_attn_1 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_1, self.attn_1_bias, hidden_states, self.norm2_g, self.norm2_b, self.norm2_eps) out_attn_2 = self.attn_2(out_norm_2, context=context) - out_norm_3, out_attn_2 = self.transformer_cuda_module.layer_norm_residual_store_pre_ln_res( + out_norm_3, out_attn_2 = self.layer_norm.layer_norm_residual_store_pre_ln_res( out_attn_2, self.attn_2_bias, out_attn_1, self.norm3_g, self.norm3_b, self.norm3_eps) out_ff1 = nn.functional.linear(out_norm_3, self.ff1_w) - out_geglu = self.transformer_cuda_module.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) + out_geglu = self.gated_activation(out_ff1, self.ff1_b, ActivationFuncType.GATED_GELU) out_ff2 = nn.functional.linear(out_geglu, self.ff2_w) return nhwc_bias_add(out_ff2, self.ff2_b, other=out_attn_2) diff --git a/deepspeed/ops/transformer/inference/ds_attention.py b/deepspeed/ops/transformer/inference/ds_attention.py index eb6ce2f75c69..24f710d22494 100644 --- a/deepspeed/ops/transformer/inference/ds_attention.py +++ b/deepspeed/ops/transformer/inference/ds_attention.py @@ -89,11 +89,11 @@ def __init__(self, config, mp_group=None, q_scales=None, q_groups=1, merge_count torch.empty(self.hidden_size_per_partition * 3, dtype=data_type_fp, device=device) ] - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] - no_masking = input_mask is None + no_masking = input_mask is None or input_mask is False if no_masking: input_mask = torch.empty(1) @@ -108,7 +108,10 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): no_masking=no_masking, layer_id=self.config.layer_id, num_layers=DeepSpeedSelfAttention.num_layers, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) context_layer, key_layer, value_layer = attn_key_value return context_layer, key_layer, value_layer @@ -136,7 +139,8 @@ def forward(self, output_attentions=False, norm_w=None, norm_b=None, - alibi=None): + alibi=None, + **kwargs): if self.attn_qkvw is None: self._attn_qkvw, self._attn_qkvb = self._merge_qkv() else: @@ -157,10 +161,17 @@ def forward(self, gamma=norm_w, beta=norm_b) + is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1) + token_idx = kwargs.get("token_idx", None) + position_ids = kwargs.get("position_ids", None) + context_layer, key_layer, value_layer = self.compute_attention(qkv_out=qkv_out, 
input_mask=input_mask, layer_past=layer_past, - alibi=alibi) + alibi=alibi, + is_prompt=is_prompt, + token_idx=token_idx, + position_ids=position_ids) output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow) inp_norm = qkv_out[-1] @@ -210,7 +221,7 @@ def _split_tensor_along_last_dim(self, tensor, num_partitions, contiguous_split_ return tensor_list - def compute_attention(self, qkv_out, input_mask, layer_past, alibi): + def compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids): if isinstance(qkv_out, list) or isinstance(qkv_out, tuple): qkv_out = qkv_out[0] @@ -254,8 +265,12 @@ def compute_attention(self, qkv_out, input_mask, layer_past, alibi): if input_mask.dtype == torch.bool: input_mask = input_mask.long() + # Invert input_mask per transformer implementation (eg, in BLOOM, it's already inverted) + if self.config.invert_mask: + input_mask = 1 - input_mask + attention_probs = self.softmax_func(attn_scores=attention_scores, - attn_mask=((1 - input_mask).to(target_dtype) * minus_inf), + attn_mask=input_mask.to(target_dtype) * minus_inf, alibi=alibi, triangular=(self.config.triangular_masking and (attention_scores.shape[-2] > 1)), diff --git a/deepspeed/ops/transformer/inference/moe_inference.py b/deepspeed/ops/transformer/inference/moe_inference.py index f985a9a4ab6b..3a9785985d19 100644 --- a/deepspeed/ops/transformer/inference/moe_inference.py +++ b/deepspeed/ops/transformer/inference/moe_inference.py @@ -7,16 +7,16 @@ import math import torch from torch.autograd import Function -# accelerator modules will be imported if needed -inference_module = None -specialized_mode = None import torch.nn as nn from .ds_attention import DeepSpeedSelfAttention from .config import DeepSpeedInferenceConfig +from .op_binding import SoftmaxOp, VectorMatMulOp, GELUGemmOp +from .op_binding.bias_residual import BiasResidualOp +from .op_binding.einsum_sec_sm_ecm import EinsumSecSmEcmOp +from .op_binding.layer_norm import LayerNormOp from ....moe.sharded_moe import TopKGate from deepspeed import comm as dist -from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import InferenceBuilder +from .op_binding.moe_res_matmul import MoEResMatmulOp class DeepSpeedMoEInferenceConfig(DeepSpeedInferenceConfig): @@ -110,16 +110,13 @@ class DeepSpeedMLPFunction(Function): @staticmethod def forward(ctx, input, inter_w, inter_b, config, output_b, output_w, q_scales, q_groups, merge_count, mp_group, - async_op): + async_op, gelu_gemm_func, vector_matmul_func): if config.q_int8: - intermediate = inference_module.fused_gemm_gelu_int8(input, inter_w, inter_b, config.epsilon, q_scales[2], - (q_groups * (2**merge_count)), config.pre_layer_norm) - output = inference_module.vector_matmul_int8(intermediate, output_w, q_scales[3], q_groups, (merge_count)) + intermediate = gelu_gemm_func(input, inter_w, inter_b, config.epsilon, q_scales[2], + (q_groups * (2**merge_count)), config.pre_layer_norm) + output = vector_matmul_func(intermediate, output_w, q_scales[3], q_groups, (merge_count)) else: - mlp_gemm_func = inference_module.fused_gemm_gelu_fp16 if config.fp16 else \ - inference_module.fused_gemm_gelu_fp32 - - output = mlp_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) + output = gelu_gemm_func(input, inter_w, inter_b, output_w, config.epsilon, config.pre_layer_norm, async_op) if mp_group is not None and dist.get_world_size(group=mp_group) > 1: dist.all_reduce(output, group=mp_group, 
async_op=async_op) @@ -150,10 +147,13 @@ def __init__(self, config, q_scales=None, q_groups=1, merge_count=1, mlp_extra_g self.q_groups = q_groups * 2 if mlp_extra_grouping else q_groups self.merge_count = int(math.log2(merge_count)) self.mp_group = mp_group + self.gelu_gemm_func = GELUGemmOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) def forward(self, input, async_op=False): return DeepSpeedMLPFunction.apply(input, self.inter_w, self.inter_b, self.config, self.output_b, self.output_w, - self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op) + self.q_scales, self.q_groups, self.merge_count, self.mp_group, async_op, + self.gelu_gemm_func, self.vector_matmul_func) class DeepSpeedMoEInference(nn.Module): @@ -187,18 +187,7 @@ def __init__(self, self.config = config self.config.layer_id = DeepSpeedMoEInference.layer_id - global inference_module - global specialized_mode - if inference_module is None: - specialized_mode = False - # InferenceSpecializedBuilder is not among DeepSpeed provided builder yet, so we infer by builder name string - builder = get_accelerator().create_op_builder("InferenceSpecializedBuilder") - if builder is not None and builder.is_compatible(): - inference_module = builder.load() - specialized_mode = True - else: - inference_module = InferenceBuilder().load() - self.config.specialized_mode = specialized_mode + assert self.config.dtype != torch.bfloat16, "DeepSpeed MoE Transformer Inference not yet tested for bfloat support" DeepSpeedMoEInference.layer_id += 1 @@ -213,10 +202,8 @@ def __init__(self, self.res_mlp = DeepSpeedMoEMLP(config, quantize_scales, quantize_groups, merge_count, mlp_extra_grouping, mp_group) self.res_coef = nn.Parameter(torch.Tensor(self.config.hidden_size, 2)) - self.coef_func = inference_module.softmax_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.softmax_fp32 - self.vector_matmul_func = inference_module.vector_matmul_fp16 if self.config.dtype == torch.float16 else \ - inference_module.vector_matmul_fp32 + self.coef_func = SoftmaxOp(self.config) + self.vector_matmul_func = VectorMatMulOp(self.config) config.mp_size = 1 self.mlp = nn.ModuleList( @@ -226,7 +213,7 @@ def __init__(self, self.moe_gate = TopKGate(self.config.hidden_size, self.config.global_experts, self.config.k, self.config.capacity_factor, self.config.eval_capacity_factor, self.config.min_capacity, self.config.noisy_gate_policy, self.config.drop_tokens, - self.config.use_rts) + self.config.use_rts, self.ep_group) self.ep_group = ep_group self.mp_group = mp_group @@ -234,12 +221,10 @@ def __init__(self, print("DeepSpeed MoE Transformer Inference config is ", self.config.__dict__) - self.bias_residual_func = inference_module.bias_residual_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.bias_residual_fp32 - self.ds_layernorm = inference_module.layer_norm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.layer_norm_fp32 - self.einsum_sec_sm_ecm = inference_module.einsum_sec_sm_ecm_fp16 if self.config.dtype in [torch.float16, torch.int8] else \ - inference_module.einsum_sec_sm_ecm_fp32 + self.bias_residual_func = BiasResidualOp(self.config) + self.ds_layernorm = LayerNormOp(self.config) + self.einsum_sec_sm_ecm = EinsumSecSmEcmOp(self.config) + self.moe_res_matmul = MoEResMatmulOp(self.config) def res_coef_func(self, inp, async_op): inp = self.vector_matmul_func(inp, self.res_coef, async_op) @@ -326,12 +311,12 @@ def forward(self, res_coef_out = 
self.res_coef_func(attention_output, async_op=True) if self.expert_mp_group is not None: - tensor_list = [ - torch.empty_like(attention_output) for _ in range(dist.get_world_size(group=self.expert_mp_group)) - ] - tensor_list[dist.get_rank(group=self.expert_mp_group)] = attention_output - dist.all_gather(tensor_list, attention_output, group=self.expert_mp_group) - attention_output = torch.cat(tensor_list).contiguous() + world_size = dist.get_world_size(group=self.expert_mp_group) + gather_buffer = torch.empty(world_size * attention_output.numel(), + dtype=attention_output.dtype, + device=attention_output.device) + dist.all_gather_into_tensor(gather_buffer, attention_output, group=self.expert_mp_group) + attention_output = gather_buffer.view(-1, *attention_output.size()[1:]) ############## MoE Gating + Experts ############### dispatched_attention, combined_weights = self.moe_gate_einsum(attention_output) @@ -346,7 +331,7 @@ def forward(self, dim=0)[dist.get_rank(group=self.expert_mp_group)] if self.config.mlp_type == 'residual': - inference_module.moe_res_matmul(res_mlp_out, res_coef_out, output) + self.moe_res_matmul(res_mlp_out, res_coef_out, output) output = self.bias_residual_func(output, residual_add, torch.empty(1)) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_add.py b/deepspeed/ops/transformer/inference/op_binding/bias_add.py new file mode 100644 index 000000000000..d2ae38f546eb --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_add.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasAddOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_add_func = self.inference_module.bias_add_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_add_func = self.inference_module.bias_add_bf16 + else: + self.bias_add_func = self.inference_module.bias_add_fp32 + except AttributeError: + self.bias_add_func = self.bias_add_fallback + + @classmethod + def bias_add_fallback(cls, input, bias): + return torch.add(input, bias) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_add_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py new file mode 100644 index 000000000000..f0fee0b0d06e --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_gelu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasGeluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasGeluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_gelu_func = self.inference_module.bias_gelu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_gelu_func = self.inference_module.bias_gelu_bf16 + else: + self.bias_gelu_func = self.inference_module.bias_gelu_fp32 + except AttributeError: + self.bias_gelu_func = self.bias_gelu_fallback + + @classmethod + def bias_gelu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally and using the tanh approximation + return F.gelu(activations.to(torch.float32) + bias.to(torch.float32), approximate='tanh').to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_gelu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_relu.py b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py new file mode 100644 index 000000000000..ccfade1d9524 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_relu.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasReluOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasReluOp, self).__init__(config) + + try: + if self.config.dtype == torch.float16: + self.bias_relu_func = self.inference_module.bias_relu_fp16 + elif self.config.dtype == torch.bfloat16: + self.bias_relu_func = self.inference_module.bias_relu_bf16 + else: + self.bias_relu_func = self.inference_module.bias_relu_fp32 + except AttributeError: + self.bias_relu_func = self.bias_relu_fallback + + @classmethod + def bias_relu_fallback(cls, activations, bias): + # Expected behavior is that of casting to float32 internally + return F.relu(activations.to(torch.float32) + bias.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor): + return self.bias_relu_func(activation, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/bias_residual.py b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py new file mode 100644 index 000000000000..ecad50e10ffe --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/bias_residual.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class BiasResidualOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(BiasResidualOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.bias_residual_func = self.inference_module.bias_residual_fp16 + else: + self.bias_residual_func = self.inference_module.bias_residual_fp32 + except AttributeError: + self.bias_residual_func = self.bias_residual_fallback + + @classmethod + def bias_residual_fallback(cls, output, residual, bias): + raise NotImplementedError("bias residual fallback isn't implemented") + + def forward(self, output, residual, bias): + return self.bias_residual_func(output, residual, bias) diff --git a/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py new file mode 100644 index 000000000000..f34b10f786d1 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/einsum_sec_sm_ecm.py @@ -0,0 +1,29 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class EinsumSecSmEcmOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig): + super(EinsumSecSmEcmOp, self).__init__(config) + + try: + if self.config.dtype in [torch.float16, torch.int8]: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp16 + else: + self.einsum_sec_sm_ecm_func = self.inference_module.einsum_sec_sm_ecm_fp32 + except AttributeError: + self.einsum_sec_sm_ecm_func = self.einsum_sec_sm_ecm_fallback + + @classmethod + def einsum_sec_sm_ecm_fallback(cls, Q, W): + raise NotImplementedError("einsum sec sm ecm fallback isn't implemented") + + def forward(self, Q, W): + return self.einsum_sec_sm_ecm_func(Q, W) diff --git a/deepspeed/ops/transformer/inference/op_binding/gated_activation.py b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py new file mode 100644 index 000000000000..d28d818ce4b3 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/gated_activation.py @@ -0,0 +1,40 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from deepspeed.utils.types import ActivationFuncType +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class GatedActivationOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(GatedActivationOp, self).__init__(config) + try: + self.gated_activation_func = self.inference_module.gated_activation + except AttributeError: + self.gated_activation_func = self.gated_activation_fallback + + @classmethod + def gated_activation_fallback(cls, activation, bias, activation_func_type): + # Expected behavior is that of casting to float32 internally + # Explicitly using the default GeLU + activation_func = None + activations = activation + bias.reshape(1, 1, -1) + hidden_states, gate = activations.chunk(2, dim=-1) + + if activation_func_type == ActivationFuncType.GATED_SILU: + activation_func = F.silu + elif activation_func_type == ActivationFuncType.GATED_GELU: + activation_func = F.gelu + + return hidden_states * activation_func(gate.to(torch.float32)).to(activations.dtype) + + def forward(self, activation: torch.Tensor, bias: torch.Tensor, activation_func_type: ActivationFuncType): + return self.gated_activation_func(activation, bias, activation_func_type) diff --git a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py index 63323c150752..60bbb4b48bdb 100644 --- a/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/gelu_gemm.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp import deepspeed @@ -14,7 +15,9 @@ class GELUGemmOp(BaseOp): def __init__(self, config: DeepSpeedInferenceConfig): super(GELUGemmOp, self).__init__(config) try: - if self.config.dtype in [torch.float16, torch.int8]: + if self.config.dtype == torch.int8: + self.fused_gemm_gelu = self.inference_module.fused_gemm_gelu_int8 + elif self.config.dtype == torch.float16: if deepspeed.HAS_TRITON and self.config.use_triton and self.config.dtype == torch.float16: from deepspeed.ops.transformer.inference.triton.ops import fused_gemm_gelu as _triton_fused_gemm_gelu self.fused_gemm_gelu = _triton_fused_gemm_gelu # type: ignore @@ -28,7 +31,11 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.fused_gemm_gelu = self.gelu_gemm_fallback def gelu_gemm_fallback(self, input, weight, scale, bias, out, out_scale, dtype, transpose): - raise NotImplementedError + tmp = torch.matmul(input, weight) + tmp = F.gelu(tmp.to(torch.float32) + bias.to(torch.float32), approximate="tanh").to(tmp.dtype) + output = torch.matmul(tmp, out) + + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, weight_out: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/layer_norm.py b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py new file mode 100644 index 000000000000..31219a58ac3c --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/layer_norm.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn.functional as F +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class LayerNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + super(LayerNormOp, self).__init__(config) + try: + if config is None: + self.layer_norm_func = self.inference_module.layer_norm + elif self.config.dtype in [torch.float16, torch.int8]: + self.layer_norm_func = self.inference_module.layer_norm_fp16 + else: + self.layer_norm_func = self.inference_module.layer_norm_fp32 + except AttributeError: + self.layer_norm_func = self.layer_norm_fallback + + @classmethod + def layer_norm_residual(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f + bias_f + res_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + @classmethod + def layer_norm_residual_store_pre_ln_res(cls, vals, bias, res, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + bias_f = bias.to(torch.float32).reshape(1, 1, -1) + res_f = res.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + res_output = vals_f + bias_f + res_f + norm_output = F.layer_norm(res_output, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + return norm_output, res_output.to(dtype) + + @classmethod + def layer_norm_fallback(cls, vals, gamma, beta, epsilon): + channels = gamma.shape[0] + dtype = gamma.dtype + vals_f = vals.to(torch.float32) + gamma_f = gamma.to(torch.float32) + beta_f = beta.to(torch.float32) + return F.layer_norm(vals_f, (channels, ), weight=gamma_f, bias=beta_f, eps=epsilon).to(dtype) + + def forward(self, vals, gamma, beta, epsilon): + return self.layer_norm_func(vals, gamma, beta, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py index 3064c00d1755..5f1f915ec021 100644 --- a/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/mlp_gemm.py @@ -5,12 +5,12 @@ from typing import Optional -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp from deepspeed.utils.types import NormType +from .pre_rms_norm import PreRMSNormOp class MLPGemmOp(BaseOp): @@ -39,23 +39,45 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.mlp_gemm_func = self.mlp_gemm_fallback elif self.config.norm_type == NormType.RMSNorm: self.mlp_gemm_func = self.rms_mlp_gemm_fallback + self.pre_rms_norm = PreRMSNormOp() def mlp_gemm_fallback(self, input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and mlp_after_attn and not transpose: - residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, - self.config.epsilon) - tmp = torch.matmul(residual_add, weight_interm) + if mlp_after_attn: + residual_add = F.layer_norm(input + residual + input_bias, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(residual_add, weight_interm.t() if 
transpose else weight_interm)
             tmp = F.gelu(tmp + bias)
-            output = torch.matmul(tmp, weight_out)
-            return (output, residual_add)
+            output = torch.matmul(tmp, weight_out.t() if transpose else weight_out)
+
+            return output, residual_add
         else:
             raise NotImplementedError
 
     def rms_mlp_gemm_fallback(self, input, residual, weight_interm, weight_out, gamma, eps, interm_scale, out_scale,
                               dtype, mlp_act_func_type, transpose):
-        raise NotImplementedError
+        inp_norm, residual = self.pre_rms_norm(input, residual, gamma, eps)
+        tmp = torch.matmul(inp_norm.view([-1, inp_norm.size(2)]), weight_interm.t() if transpose else weight_interm)
+        up_proj, gate_proj = tmp.chunk(2, dim=1)
+
+        from deepspeed.utils.types import ActivationFuncType
+        if mlp_act_func_type == ActivationFuncType.GELU:
+            intermediate = F.gelu(gate_proj)
+        elif mlp_act_func_type == ActivationFuncType.ReLU:
+            intermediate = F.relu(gate_proj)
+        elif mlp_act_func_type == ActivationFuncType.GATED_GELU:
+            intermediate = F.gelu(gate_proj)
+        elif mlp_act_func_type == ActivationFuncType.GATED_SILU:
+            intermediate = F.silu(gate_proj)
+        else:
+            raise NotImplementedError(f"rms_mlp_gemm_fallback not implemented for activation type {mlp_act_func_type}")
+
+        intermediate = intermediate * up_proj
+
+        output = torch.matmul(intermediate, weight_out.t() if transpose else weight_out)
+        output = output.view([input.size(0), input.size(1), -1])
+
+        return [output, residual]
 
     def forward(self,
                 input: torch.Tensor,
diff --git a/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py
new file mode 100644
index 000000000000..ef3558c8bc88
--- /dev/null
+++ b/deepspeed/ops/transformer/inference/op_binding/moe_res_matmul.py
@@ -0,0 +1,29 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import torch
+from ..config import DeepSpeedInferenceConfig
+from .base import BaseOp
+
+
+class MoEResMatmulOp(BaseOp):
+
+    def __init__(self, config: DeepSpeedInferenceConfig = None):
+        if config is None:
+            config = DeepSpeedInferenceConfig()
+        super(MoEResMatmulOp, self).__init__(config)
+        try:
+            self.moe_res_matmul_func = self.inference_module.moe_res_matmul
+        except AttributeError:
+            self.moe_res_matmul_func = self.moe_res_matmul_fallback
+
+    @classmethod
+    def moe_res_matmul_fallback(cls, residual, coef, output):
+        coef_t = coef.transpose(1, 2).contiguous()
+        coef1, coef2 = torch.split(coef_t, split_size_or_sections=coef_t.shape[len(coef_t.shape) - 1] // 2, dim=-1)
+        return residual * coef1 + output * coef2
+
+    def forward(self, residual, coef, output):
+        return self.moe_res_matmul_func(residual, coef, output)
diff --git a/deepspeed/ops/transformer/inference/op_binding/pad_transform.py b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py
new file mode 100644
index 000000000000..876fefc3bcfb
--- /dev/null
+++ b/deepspeed/ops/transformer/inference/op_binding/pad_transform.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class PadTransformOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PadTransformOp, self).__init__(config) + try: + self.pad_transform_func = self.inference_module.pad_transform_fp16 + except AttributeError: + self.pad_transform_func = self.pad_transform_fallback + + @staticmethod + def pad_transform_fallback(query, key, value, heads, do_flash_attn): + raise NotImplementedError("pad_transform fallback is not implemented.") + + def forward(self, query, key, value, heads, do_flash_attn): + return self.pad_transform_func(query, key, value, heads, do_flash_attn) diff --git a/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py new file mode 100644 index 000000000000..7969d20f0527 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/pre_rms_norm.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp +from .rms_norm import RMSNormOp + + +class PreRMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(PreRMSNormOp, self).__init__(config) + try: + self.pre_rms_norm_func = self.inference_module.pre_rms_norm + except AttributeError: + self.pre_rms_norm_func = self.pre_rms_norm_fallback + + @staticmethod + def pre_rms_norm_fallback(vals, residual, gamma, epsilon): + residual = vals.to(torch.float32) + residual.to(torch.float32) + vals = residual + + return RMSNormOp.rms_norm_fallback(vals, gamma, epsilon), residual.to(gamma.dtype) + + def forward(self, vals, residual, gamma, epsilon): + return self.pre_rms_norm_func(vals, residual, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py index 250bf9864e1e..9ff5366fae5d 100644 --- a/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py +++ b/deepspeed/ops/transformer/inference/op_binding/qkv_gemm.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .rms_norm import RMSNormOp import deepspeed from deepspeed.utils.types import NormType @@ -56,19 +56,23 @@ def _triton_autotune(min_seqlen, max_seqlen, hidden_size, dtype=torch.float16): matmul(A, B) Fp16Matmul._update_autotune_table() - def qkv_gemm_fallback(self, input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) - tmp = torch.matmul(inp_norm, weight) - if add_bias: - tmp += bias - output = [tmp, inp_norm] - return output - else: - raise NotImplementedError + @staticmethod + def qkv_gemm_fallback(input, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + inp_norm = F.layer_norm(input, (input.shape[2], ), gamma, beta, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + if add_bias: + tmp += bias + output = [tmp, inp_norm] + + return output + + @staticmethod + def rms_qkv_gemm_fallback(input, weight, q_scale, gamma, eps, q_int8, 
transpose): + inp_norm = RMSNormOp.rms_norm_fallback(input, gamma, eps) + tmp = torch.matmul(inp_norm, weight.t() if transpose else weight) + output = [tmp, inp_norm] - def rms_qkv_gemm_fallback(self, input, weight, q_scale, gamma, eps, q_int8, transpose): - raise NotImplementedError + return output def forward(self, input: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, gamma: torch.Tensor, beta: torch.Tensor): diff --git a/deepspeed/ops/transformer/inference/op_binding/residual_add.py b/deepspeed/ops/transformer/inference/op_binding/residual_add.py index 6f9b35cbc05d..93b229c5d1ac 100644 --- a/deepspeed/ops/transformer/inference/op_binding/residual_add.py +++ b/deepspeed/ops/transformer/inference/op_binding/residual_add.py @@ -3,9 +3,10 @@ # DeepSpeed Team -import os import torch from typing import Optional + +from .vector_add import VectorAddOp from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -22,11 +23,32 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.residual_add_func = self.inference_module.residual_add_bias_fp32 except AttributeError: - self.residual_add_func = None - try: - self._vector_add = self.inference_module._vector_add - except AttributeError: - self._vector_add = None + self.residual_add_func = self.residual_add_fallback + self.vector_add = VectorAddOp() + + @staticmethod + def res_add_bias(hidden_state, residual, attn_output, attn_bias, final_bias, add_attn_bias, mp_size): + hidden_state += attn_output + (residual + final_bias) / mp_size + if add_attn_bias: + hidden_state += attn_bias / mp_size + + return hidden_state + + @staticmethod + def residual_add_fallback(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + if mlp_after_attn: + if pre_layer_norm: + tmp = (residual.float() + attention_output.float() + attention_bias.float() + + final_bias.float()) / mp_size + hidden_state.float() + else: + tmp = residual.float() + hidden_state.float() + final_bias.float() + else: + tmp = ResidualAddOp.res_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + add_bias, mp_size) + residual.copy_(tmp.to(hidden_state.dtype)) + + return residual def forward(self, hidden_state: torch.Tensor, @@ -37,28 +59,15 @@ def forward(self, attention_bias: Optional[torch.Tensor] = None, final_bias: Optional[torch.Tensor] = None): - if self.residual_add_func is not None: - if final_bias is None: - residual = self._vector_add(residual, hidden_state, 1.0 / self.config.mp_size) - else: - if not self.config.pre_layer_norm and residual_add is not None: - # only use residual add if its set and we are not pre layer norm - residual = residual_add - - self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, - self.config.mp_size, self.config.mlp_after_attn, add_bias, - self.config.pre_layer_norm) + if final_bias is None and attention_bias is None: + residual = self.vector_add(residual + attention_output, hidden_state, 1.0 / self.config.mp_size) else: - # fallback - if os.environ.get('DS_KI_FALLBACK') == 'True' and self.config.mlp_after_attn: - if self.config.pre_layer_norm: - tmp = (residual.float() + attention_output.float() + attention_bias.float() + - final_bias.float()) / self.config.mp_size + hidden_state.float() - else: - tmp = residual.float() + hidden_state.float() + final_bias.float() + if not self.config.pre_layer_norm and residual_add is not None: + # only use residual add if its set and we are not pre layer norm 
+ residual = residual_add + + self.residual_add_func(hidden_state, residual, attention_output, attention_bias, final_bias, + self.config.mp_size, self.config.mlp_after_attn, add_bias, + self.config.pre_layer_norm) - input_dtype = hidden_state.dtype - residual = tmp.to(input_dtype) - else: - raise NotImplementedError return residual diff --git a/deepspeed/ops/transformer/inference/op_binding/rms_norm.py b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py new file mode 100644 index 000000000000..128883ce5d43 --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/rms_norm.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class RMSNormOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(RMSNormOp, self).__init__(config) + try: + self.rms_norm_func = self.inference_module.rms_norm + except AttributeError: + self.rms_norm_func = self.rms_norm_fallback + + @staticmethod + def rms_norm_fallback(vals, gamma, epsilon): + variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) + vals = vals * torch.rsqrt(variance + epsilon) + + if gamma.dtype in [torch.float16, torch.bfloat16]: + vals = vals.to(gamma.dtype) + + return gamma * vals + + def forward(self, vals, gamma, epsilon): + return self.rms_norm_func(vals, gamma, epsilon) diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax.py b/deepspeed/ops/transformer/inference/op_binding/softmax.py index bc309d94df14..2e08541596fa 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax.py @@ -3,11 +3,11 @@ # DeepSpeed Team -import os import torch import torch.nn.functional as F from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxOp(BaseOp): @@ -25,24 +25,42 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_func = self.softmax_fallback - def softmax_fallback(self, attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, - async_op, layer_scale, head_offset, mp_size): - if os.environ.get('DS_KI_FALLBACK') == 'True': - alibi = alibi[head_offset:head_offset + self.num_attention_heads_per_partition] - input_dtype = attn_scores.dtype - if (triangular): - tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) - attn_scores = torch.masked_fill(attn_scores * layer_scale, tri, torch.finfo(input_dtype).min) - if alibi is not None: - attn_scores += alibi - if attn_mask is not None: - # expand atten_mask from two dim into 4 dim, insert two dims in the middle + @staticmethod + def softmax_fallback(attn_scores, attn_mask, alibi, triangular, recompute, local_attention, window_size, async_op, + layer_scale, head_offset, mp_size): + scores_len = len(attn_scores.size()) + heads = 1 + if scores_len > 1: + heads = attn_scores.size()[1] + num_attention_heads_per_partition = heads // mp_size + + if alibi is not None: + if len(alibi.shape) == 1: + alibi = None + else: + alibi = alibi[head_offset:head_offset + num_attention_heads_per_partition] + if attn_mask is not None and len(attn_mask.shape) == 1: + attn_mask = None + input_dtype = attn_scores.dtype + attn_scores *= layer_scale + + if alibi is not 
None: + attn_scores += alibi + if attn_mask is not None: + # expand atten_mask from two dim into 4 dim, insert two dims in the middle + if len(attn_mask.shape) == 2: attn_mask = attn_mask[:, None, None, :] - attn_scores += attn_mask - output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) - return output - else: - raise NotImplementedError + attn_scores += attn_mask + if triangular: + if attn_scores.shape[2] == 1: # query using kv cache + token_idx = InferenceContext.Instance().current_tokens() + tri = torch.arange(attn_scores.shape[2], device=attn_scores.device).ge(token_idx) + else: + tri = ~torch.tril(torch.ones(attn_scores.size(), device=attn_scores.device)).to(bool) + attn_scores = torch.masked_fill(attn_scores, tri, float('-inf')) + output = F.softmax(attn_scores, dim=-1, dtype=torch.float32).to(input_dtype) + + return output def forward(self, attn_scores: torch.Tensor, attn_mask: torch.Tensor, alibi: torch.Tensor, triangular: bool, recompute: bool, local_attention: bool, window_size: int, async_op: bool, layer_scale: float, diff --git a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py index 0dc4e08a3633..d745df678e93 100644 --- a/deepspeed/ops/transformer/inference/op_binding/softmax_context.py +++ b/deepspeed/ops/transformer/inference/op_binding/softmax_context.py @@ -7,6 +7,8 @@ from deepspeed import comm as dist from ..config import DeepSpeedInferenceConfig from .base import BaseOp +from .softmax import SoftmaxOp +from deepspeed.ops.transformer.inference.op_binding.workspace import InferenceContext class SoftmaxContextOp(BaseOp): @@ -23,13 +25,108 @@ def __init__(self, config: DeepSpeedInferenceConfig): except AttributeError: self.softmax_context_func = self.softmax_context_fallback + @staticmethod + def transform4d_0213(x, seq_length): + assert x.dim() == 3, F"Dim {x.dim()} is not supported" + batch_size, num_heads, seq_length_head_dim = x.shape + head_dim = seq_length_head_dim // seq_length + x = x.view(batch_size, num_heads, seq_length, head_dim) + x = x.permute(0, 2, 1, 3) + + return x + + @staticmethod + def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep <= 1 or num_key_value_heads == 1: + return hidden_states + + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + @staticmethod + def bias_add_transform_0213(input, bias, num_heads, trans_count, perform_bias=False): + assert trans_count == 1 or trans_count == 3, F"Trans count {trans_count} is not supported" + assert input.dim() == 3, F"Dim {input.dim()} is not supported" + input_biased = torch.add(input, bias) if perform_bias else input + batch_size, seq_length, value_size = input_biased.shape + hid_dim = value_size // trans_count + head_dim = hid_dim // num_heads + + if trans_count == 1: + query_layer = input.view(batch_size, seq_length, num_heads, head_dim) + query_layer = query_layer.permute(0, 2, 1, 3) + key_layer = torch.zeros_like(query_layer) + value_layer = torch.zeros_like(query_layer) + return query_layer, key_layer, value_layer + + qkv_layers = input.view(batch_size, seq_length, 3, num_heads, head_dim) + query_layer, key_layer, value_layer = qkv_layers[..., 0, :, :], qkv_layers[..., 1, :, :], qkv_layers[..., + 2, :, :] + query_layer = query_layer.transpose(1, 2) + 
key_layer = key_layer.transpose(1, 2)
+        value_layer = value_layer.transpose(1, 2)
+
+        return query_layer, key_layer, value_layer
+
     def softmax_context_fallback(self, query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads,
                                  num_kv, norm_factor, triangular_masking, local_attention, window_size, no_masking,
-                                 layer_id, num_layers, alibi, rope_theta):
-        raise NotImplementedError
+                                 layer_id, num_layers, alibi, rope_theta, is_prompt, token_idx, position_ids):
+        bat_0213_query, bat_0213_key, bat_0213_value = self.bias_add_transform_0213(
+            query_key_value, None, heads, 3, False)
+
+        if rotary_dim > 0 and rotate_half:
+            from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
+            rotary = InferenceContext.Instance().get_rotary(rotary_dim, rope_theta, bat_0213_value.device)
+            cos, sin = rotary(bat_0213_value, InferenceContext.Instance().get_max_tokens_num())
+            bat_0213_query, bat_0213_key = apply_rotary_pos_emb(bat_0213_query, bat_0213_key, cos, sin, position_ids)
+
+        bat_0213_key, bat_0213_value = InferenceContext.Instance().update_cache(layer_id, token_idx, is_prompt,
+                                                                                bat_0213_key, bat_0213_value)
+
+        bat_0213_key = self.repeat_kv(bat_0213_key, num_kv)
+        bat_0213_value = self.repeat_kv(bat_0213_value, num_kv)
+
+        bsz = query_key_value.shape[0]
+        head_dim = query_key_value.shape[2] // (heads * 3)
+
+        bmm_output = torch.bmm(bat_0213_query.reshape(bsz * heads, bat_0213_query.shape[2], head_dim),
+                               bat_0213_key.reshape(bsz * heads, bat_0213_key.shape[2], head_dim).transpose(1, 2))
+
+        layer_scale = 1.0
+        if alibi is not None and len(alibi.shape) > 1:
+            layer_scale = float(max(1, layer_id))
+
+        alpha = norm_factor * norm_factor / layer_scale
+        bmm_output *= alpha
+        bmm_output_reshape = bmm_output.reshape(bsz, heads, bmm_output.shape[1], bmm_output.shape[2])
+
+        recompute = is_prompt
+        if attn_mask is not None and len(attn_mask.shape) > 1 and attn_mask.shape[-1] < bmm_output_reshape.shape[3]:
+            attn_mask = torch.nn.functional.pad(attn_mask, (0, bmm_output_reshape.shape[3] - attn_mask.shape[-1]),
+                                                value=torch.finfo(attn_mask.dtype).min)
+        softmax_output = SoftmaxOp.softmax_fallback(bmm_output_reshape, attn_mask, alibi, triangular_masking,
+                                                    recompute, local_attention, window_size, None, layer_scale, 0, 1)
+
+        output = torch.bmm(softmax_output.reshape(bsz * heads, softmax_output.shape[2], softmax_output.shape[3]),
+                           bat_0213_value.reshape(bsz * heads, bat_0213_value.shape[2], head_dim))
+
+        output = output.reshape(bsz, heads, output.shape[1], head_dim)
+        output = output.reshape(bsz, heads, output.shape[2] * head_dim)
+        input_seq_len = query_key_value.shape[1]
+        t4d_0123_output = self.transform4d_0213(output, input_seq_len)
+        t4d_0123_output = t4d_0123_output.reshape(bsz, t4d_0123_output.shape[1], heads * head_dim)
+
+        if layer_id == num_layers - 1:
+            InferenceContext.Instance().advance_tokens()
+
+        return t4d_0123_output, bat_0213_key, bat_0213_value
 
     def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads: int, num_kv: int,
-                norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor):
+                norm_factor: float, no_masking: bool, layer_id: int, num_layers: int, alibi: torch.Tensor,
+                is_prompt: bool, token_idx: torch.Tensor, position_ids: torch.Tensor):
 
         if alibi is not None:
             batch_heads = query_key_value.shape[0] * heads
@@ -42,6 +139,6 @@ def forward(self, query_key_value: torch.Tensor, attn_mask: torch.Tensor, heads:
                                                  self.config.rotate_every_two, heads, num_kv, norm_factor,
                                                  self.config.triangular_masking,
self.config.local_attention, self.config.window_size, no_masking, layer_id, num_layers, alibi, - self.config.rope_theta) + self.config.rope_theta, is_prompt, token_idx, position_ids) return output diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_add.py b/deepspeed/ops/transformer/inference/op_binding/vector_add.py new file mode 100644 index 000000000000..015340a1084b --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/vector_add.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + + +class VectorAddOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + super(VectorAddOp, self).__init__(config) + try: + self.vector_add_func = self.inference_module._vector_add + except AttributeError: + self.vector_add_func = self.vector_add_fallback + + @classmethod + def vector_add_fallback(cls, a, b, gamma): + """Based on csrc/transformer/inference/csrc/pt_binding.cpp code of _vector_add""" + dtype = a.dtype + return (gamma * a.float() + b.float()).to(dtype) + + def forward(self, a, b, gamma): + return self.vector_add_func(a, b, gamma) diff --git a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py index 011be859634d..cabab8d8c4ab 100644 --- a/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py +++ b/deepspeed/ops/transformer/inference/op_binding/vector_matmul.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os import torch from ..config import DeepSpeedInferenceConfig from .base import BaseOp @@ -25,7 +24,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): else: self.vector_matmul_func = self.inference_module.vector_matmul_fp16 elif self.config.dtype == torch.int8: - self.vector_matmul_func = self.inference_module.vector_matmul_fp16 + self.vector_matmul_func = self.inference_module.vector_matmul_int8 elif self.config.dtype == torch.bfloat16: self.vector_matmul_func = self.inference_module.vector_matmul_bf16 else: @@ -34,10 +33,7 @@ def __init__(self, config: DeepSpeedInferenceConfig): self.vector_matmul_func = self.vector_matmul_fallback def vector_matmul_fallback(self, input, weight, async_op, q_scale, q_int8, transpose): - if os.environ.get('DS_KI_FALLBACK') == 'True' and not transpose: - return torch.matmul(input, weight) - else: - raise NotImplementedError + return torch.matmul(input, weight.t() if transpose else weight) def forward(self, input: torch.Tensor, weight: torch.Tensor, async_op: bool = False): q_scale = weight.scale if hasattr(weight, 'scale') else torch.empty(1) diff --git a/deepspeed/ops/transformer/inference/op_binding/workspace.py b/deepspeed/ops/transformer/inference/op_binding/workspace.py new file mode 100644 index 000000000000..19de7d9576af --- /dev/null +++ b/deepspeed/ops/transformer/inference/op_binding/workspace.py @@ -0,0 +1,222 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from ..config import DeepSpeedInferenceConfig +from .base import BaseOp + +minus_inf = -10000.0 +key_idx = 0 +value_idx = 1 + + +class InferenceContext: + + __instance = None + + def __init__(self): + self.kv_cache = None + self.kv_cache_elem_dtype = None + self.num_tokens = 1 + self.kv_cache_num_layers = None + self.kv_cache_size = None + self.max_out_tokens = None + self.rotary = None + self.allocate_called = False + self.static_shapes = True + + @classmethod + def Instance(cls): + if InferenceContext.__instance is None: + InferenceContext.__instance = InferenceContext() + return InferenceContext.__instance + + def gen_workspace(self, num_layers, num_heads, batch_size, prompt_len, hidden_dim, mp_size, external_cache, + elem_dtype, rank, max_out_tokens, min_out_tokens): + self.allocate_called = True + self.kv_cache = None + if not external_cache: + self.kv_cache_num_layers = num_layers + self.max_out_tokens = max_out_tokens + head_size = hidden_dim // num_heads + self.kv_cache_size = torch.Size([batch_size, (num_heads // mp_size), max_out_tokens, head_size]) + self.kv_cache_elem_dtype = elem_dtype + self.num_tokens = 0 + self.static_shapes = True + return True + + def retake_workspace(self): + return True + + def _retake_workspace(self): + assert self.allocate_called, "retake workspace called before allocate workspace" + + import deepspeed.accelerator as accelerator + if self.kv_cache is None: + self.kv_cache = [] + for layer in range(self.kv_cache_num_layers): + self.kv_cache.append((torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()), + torch.zeros(self.kv_cache_size, + dtype=self.kv_cache_elem_dtype, + device=accelerator.get_accelerator().device_name()))) + + return True + + def update_cache(self, layer_id, token_idx, is_prompt, bat_0213_key, bat_0213_value): + has_workspace = self._retake_workspace() + assert has_workspace, "Could not allocate workspace" + + # Update current token + if is_prompt: + self.static_shapes = True + if token_idx is None: + self.static_shapes = False + InferenceContext.Instance().reset_tokens(bat_0213_key.shape[2]) + else: + InferenceContext.Instance().reset_tokens(token_idx) + + if token_idx is None: + token_idx = InferenceContext.Instance().current_tokens() + + bsz = bat_0213_key.shape[0] + + # Update cache content + if is_prompt: + cache_max_seq = self.kv_cache_size[2] + cache_max_head_dim = self.kv_cache_size[3] + seq = bat_0213_key.shape[2] + + mask = torch.arange(cache_max_seq, device=bat_0213_key.device) + mask = mask.ge(token_idx) + mask = mask.unsqueeze(-1) + mask = mask.expand([cache_max_seq, cache_max_head_dim]) + + self.kv_cache[layer_id][key_idx][:bsz, :, :seq, :].copy_(bat_0213_key) + self.kv_cache[layer_id][key_idx][:bsz, :].masked_fill_(mask, 0) + self.kv_cache[layer_id][value_idx][:bsz, :, :seq, :].copy_(bat_0213_value) + self.kv_cache[layer_id][value_idx][:bsz, :].masked_fill_(mask, 0) + else: + if self.static_shapes: + assert type(token_idx) == torch.Tensor, "token_idx is expected to be torch.Tensor" + self.kv_cache[layer_id][key_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_key) + self.kv_cache[layer_id][value_idx][:bsz].index_copy_(2, token_idx - 1, bat_0213_value) + else: + assert type(token_idx) == int, "token_idx is expected to be int" + self.kv_cache[layer_id][key_idx][:bsz, :, token_idx - 1:token_idx, :] = bat_0213_key + self.kv_cache[layer_id][value_idx][:bsz, :, token_idx - 1:token_idx, 
:] = bat_0213_value + + bat_0213_key = self.kv_cache[layer_id][key_idx][:bsz] + bat_0213_value = self.kv_cache[layer_id][value_idx][:bsz] + + if not self.static_shapes: + bat_0213_key = bat_0213_key[:, :, :token_idx, :] + bat_0213_value = bat_0213_value[:, :, :token_idx, :] + + return bat_0213_key, bat_0213_value + + def release_workspace(self): + self.kv_cache = None + self.rotary = None + + def reset_tokens(self, initial_tokens=1): + self.num_tokens = initial_tokens + + def current_tokens(self): + return self.num_tokens + + def advance_tokens(self): + self.num_tokens = self.num_tokens + 1 + + def get_kv_cache(self): + return self.kv_cache + + def get_rotary(self, rotary_dim, rope_theta, device=None): + if self.rotary is None: + from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding + + self.rotary = LlamaRotaryEmbedding(rotary_dim, base=rope_theta, device=device) + + return self.rotary + + def get_max_tokens_num(self): + return self.max_out_tokens + + +class WorkspaceOp(BaseOp): + + def __init__(self, config: DeepSpeedInferenceConfig = None): + if config is None: + config = DeepSpeedInferenceConfig() + self.inference_context = InferenceContext.Instance() + self._is_allocated = False + try: + super(WorkspaceOp, self).__init__(config) + if config.dtype == torch.float32: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp32 + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.inference_module.allocate_workspace_bf16 + else: + self.allocate_workspace_func = self.inference_module.allocate_workspace_fp16 + self.release_workspace_func = self.inference_module.release_workspace + self.retake_workspace_func = self.inference_module.retake_workspace + self.reset_cache_func = self.inference_module.reset_cache + except (ValueError, AttributeError) as e: + print(f"Using fallback functions in workspace because of {e}") + if config.dtype == torch.float32: + self.allocate_workspace_func = self.allocate_workspace_fp32_fallback + elif config.dtype == torch.bfloat16: + self.allocate_workspace_func = self.allocate_workspace_bf16_fallback + else: + self.allocate_workspace_func = self.allocate_workspace_fp16_fallback + self.release_workspace_func = self.release_workspace_fallback + self.retake_workspace_func = self.retake_workspace_fallback + self.reset_cache_func = self.reset_cache_fallback + + def allocate_workspace(self, *args, **kwargs): + self._is_allocated = True + return self.allocate_workspace_func(*args, **kwargs) + + def release_workspace(self): + self._is_allocated = False + return self.release_workspace_func() + + def reset_cache(self): + return self.reset_cache_func() if self.reset_cache_func else None + + def retake_workspace(self): + return self.retake_workspace_func() if self.retake_workspace_func else None + + def allocate_workspace_fp32_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.float, rank, max_out_tokens, + min_out_tokens) + + def allocate_workspace_bf16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size, + external_cache, rank, max_out_tokens, min_out_tokens): + return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim, + mp_size, external_cache, torch.bfloat16, rank, max_out_tokens, + min_out_tokens) + + 
def allocate_workspace_fp16_fallback(self, hidden_dim, num_heads, prompt_length, batch_size, num_layers, mp_size,
+                                         external_cache, rank, max_out_tokens, min_out_tokens):
+        return self.inference_context.gen_workspace(num_layers, num_heads, batch_size, prompt_length, hidden_dim,
+                                                    mp_size, external_cache, torch.half, rank, max_out_tokens,
+                                                    min_out_tokens)
+
+    def reset_cache_fallback(self):
+        return self.inference_context.reset_tokens()
+
+    def release_workspace_fallback(self):
+        return self.inference_context.release_workspace()
+
+    def retake_workspace_fallback(self):
+        return self.inference_context.retake_workspace()
+
+    def is_allocated(self):
+        return self._is_allocated
diff --git a/deepspeed/ops/transformer/inference/triton/attention.py b/deepspeed/ops/transformer/inference/triton/attention.py
index c05370ec74e5..6845d91b06be 100644
--- a/deepspeed/ops/transformer/inference/triton/attention.py
+++ b/deepspeed/ops/transformer/inference/triton/attention.py
@@ -125,7 +125,7 @@ def _triton_autotune(min_seqlen,
             context_4d_matmul(output, qkv, head_size)
         Fp16Matmul._update_autotune_table()
 
-    def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi):
+    def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi, is_prompt, token_idx, position_ids):
         if isinstance(qkv_out, list):
             qkv_out = qkv_out[0]
 
@@ -143,7 +143,10 @@ def ds_compute_attention(self, qkv_out, input_mask, layer_past, alibi):
                                                           no_masking=no_masking,
                                                           layer_id=self.config.layer_id,
                                                           num_layers=TritonSelfAttention.num_layers,
-                                                          alibi=alibi)
+                                                          alibi=alibi,
+                                                          is_prompt=is_prompt,
+                                                          token_idx=token_idx,
+                                                          position_ids=position_ids)
 
         context_layer, key_layer, value_layer = attn_key_value
         return context_layer, key_layer, value_layer
@@ -161,7 +164,8 @@ def forward(
             norm_w=None,
             norm_b=None,
             alibi=None,
-            use_triton_attention=True):
+            use_triton_attention=True,
+            **kwargs):
 
         if not self.config.pre_layer_norm:
             qkv_out = self.linear_func(input=input,
@@ -192,10 +196,16 @@ def forward(
                                                         triangular=self.triangular_masking)
             key_layer, value_layer = qkv[:, :, self.hidden_size:2 * self.hidden_size], qkv[:, :, 2 * self.hidden_size:]
         else:
+            is_prompt = kwargs.get("first_token", qkv_out[0].shape[1] > 1)
+            token_idx = kwargs.get("token_idx", None)
+            position_ids = kwargs.get("position_ids", None)
             context_layer, key_layer, value_layer = self.ds_compute_attention(qkv_out=qkv_out,
                                                                               input_mask=input_mask,
                                                                               layer_past=layer_past,
-                                                                              alibi=alibi)
+                                                                              alibi=alibi,
+                                                                              is_prompt=is_prompt,
+                                                                              token_idx=token_idx,
+                                                                              position_ids=position_ids)
         output = self.vector_matmul_func(input=context_layer, weight=self.attn_ow)
         inp_norm = qkv_out[-1]
diff --git a/deepspeed/ops/transformer/inference/triton/matmul_ext.py b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
index d6f72b4efb0b..9be4b0098c37 100644
--- a/deepspeed/ops/transformer/inference/triton/matmul_ext.py
+++ b/deepspeed/ops/transformer/inference/triton/matmul_ext.py
@@ -13,12 +13,48 @@
 import deepspeed
 from pathlib import Path
 import atexit
+import subprocess
 
 
 # -----------------------------------------------------------------------------
 # util class/functions for triton
-def _default_cache_dir():
-    return os.path.join(Path.home(), ".triton", "autotune")
+def is_nfs_path(path):
+    if os.name == 'nt':
+        return False
+
+    # Normalize the path to get the absolute path
+    path = os.path.abspath(path)
+
+    # Use the 'df' command to find the file system type for the given path
+    try:
+        output = subprocess.check_output(['df', '-T', path], encoding='utf-8')
+    except subprocess.CalledProcessError:
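+        # 'df -T' may fail (unsupported platform or missing path); conservatively treat the path as non-NFS.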
+ return False # Command failed + + # Process the output of 'df -T' to check for 'nfs' in the filesystem type column + lines = output.strip().split('\n') + if len(lines) > 1: # The first line is headers + fs_type = lines[1].split()[1].lower() # File system type is the second column + return 'nfs' in fs_type + return False + + +class TritonCacheDir: + _warning_printed = False + + @staticmethod + def warn_if_nfs(cache_dir): + if is_nfs_path(cache_dir) and not TritonCacheDir._warning_printed: + print( + f"Warning: The cache directory for DeepSpeed Triton autotune, {cache_dir}, appears to be on an NFS system. While this is generally acceptable, if you experience slowdowns or hanging when DeepSpeed exits, it is recommended to set the TRITON_CACHE_DIR environment variable to a non-NFS path." + ) + TritonCacheDir._warning_printed = True + return + + @staticmethod + def default_cache_dir(): + tmp_path = os.path.join(Path.home(), ".triton", "autotune") + return tmp_path def bias_add_activation(C, bias=None, activation=""): @@ -50,10 +86,10 @@ def __init__(self, key): self.file_path = None self.lock_path = None # if caching is enabled, get the lock and bin path - self.cache_dir = os.environ.get('TRITON_CACHE_DIR', _default_cache_dir()) + self.cache_dir = os.environ.get('TRITON_CACHE_DIR', TritonCacheDir.default_cache_dir()) + TritonCacheDir.warn_if_nfs(self.cache_dir) if self.cache_dir: os.makedirs(self.cache_dir, exist_ok=True) - if self.cache_dir: self.file_path = os.path.join(self.cache_dir, self.key + ".pickle") self.lock_path = self.file_path + ".lock" @@ -66,7 +102,7 @@ def put(self, table): with FileLock(self.lock_path): with open(self.file_path + ".tmp", 'wb') as handle: pickle.dump(table, handle) - os.rename(self.file_path + ".tmp", self.file_path) + os.replace(self.file_path + ".tmp", self.file_path) def load(self): if os.path.exists(self.file_path): diff --git a/deepspeed/ops/transformer/inference/triton/ops.py b/deepspeed/ops/transformer/inference/triton/ops.py index dd87d08d4d2c..dbed45313780 100644 --- a/deepspeed/ops/transformer/inference/triton/ops.py +++ b/deepspeed/ops/transformer/inference/triton/ops.py @@ -3,12 +3,10 @@ # DeepSpeed Team -import deepspeed -from deepspeed.ops.op_builder import InferenceBuilder import deepspeed.ops.transformer.inference.triton.matmul_ext as matmul_ext +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp from deepspeed.ops.transformer.inference.triton.layer_norm import layer_norm, layer_norm_residual - -inference_module = None +from deepspeed.utils.types import ActivationFuncType def vector_matmul_func(input, weight, async_op, q_scale, q_int8, transposed_mode): @@ -76,15 +74,12 @@ def mlp_gemm_func(input, if use_triton_ln: mlp_input = layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - mlp_input = inference_module._layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) + mlp_input = LayerNormOp.layer_norm_residual(input, input_bias, residual, gamma, beta, epsilon) # activation - if deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.GELU: + if ActivationFuncType(mlp_act_func_type) == ActivationFuncType.GELU: activation = "gelu" - elif deepspeed.utils.types.ActivationFuncType(mlp_act_func_type) == deepspeed.utils.types.ActivationFuncType.ReLU: + elif ActivationFuncType(mlp_act_func_type) == ActivationFuncType.ReLU: activation 
= "relu" else: activation = "" @@ -121,10 +116,7 @@ def qkv_gemm_func( if use_triton_ln: qkv_input = layer_norm(input, gamma, beta, epsilon) else: - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - qkv_input = inference_module.layer_norm(input, gamma, beta, epsilon) + qkv_input = LayerNormOp()(input, gamma, beta, epsilon) qkv_out = matmul_ext.matmul(qkv_input, weight, bias=(bias if add_bias else None), activation="", use_triton=True) diff --git a/deepspeed/profiling/flops_profiler/README.md b/deepspeed/profiling/flops_profiler/README.md index 82011eb68568..68ac3dc285c7 100644 --- a/deepspeed/profiling/flops_profiler/README.md +++ b/deepspeed/profiling/flops_profiler/README.md @@ -178,7 +178,7 @@ When using DeepSpeed for model training, the profiler can be configured in the d #### Example: Megatron-LM -For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). +For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM). An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attention_heads = 32, batch_size = 1024, seq_length = 1024`) is shown below. diff --git a/deepspeed/profiling/flops_profiler/profiler.py b/deepspeed/profiling/flops_profiler/profiler.py index 79e682a73b90..c6b607aad813 100644 --- a/deepspeed/profiling/flops_profiler/profiler.py +++ b/deepspeed/profiling/flops_profiler/profiler.py @@ -15,6 +15,8 @@ from deepspeed.utils import logger from deepspeed.moe.layer import MoE from deepspeed.utils.timer import FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, STEP_GLOBAL_TIMER +from deepspeed.utils.torch import required_torch_version +import einops Tensor = torch.Tensor @@ -81,6 +83,7 @@ def start_profile(self, ignore_list=None): self.reset_profile() _patch_functionals() _patch_tensor_methods() + _patch_miscellaneous_operations() def register_module_hooks(module, ignore_list): if ignore_list and type(module) in ignore_list: @@ -114,7 +117,7 @@ def start_time_hook(module, input): get_accelerator().synchronize() module.__start_time__ = time.time() - if not hasattr(module, "__start_time_hook_handle"): + if not hasattr(module, "__start_time_hook_handle__"): module.__start_time_hook_handle__ = module.register_forward_pre_hook(start_time_hook) def end_time_hook(module, input, output): @@ -136,6 +139,7 @@ def stop_profile(self): if self.started and self.func_patched: _reload_functionals() _reload_tensor_methods() + _reload_miscellaneous_operations() self.func_patched = False def remove_profile_attrs(module): @@ -786,6 +790,29 @@ def _einsum_flops_compute(equation, *operands): raise NotImplementedError("Unsupported einsum operation.") +def _einops_einsum_flops_compute(*args): + """ + Count flops for the einops.einsum operation. + """ + *operands, equation = args + input_shapes = [o.shape for o in operands] + + # Re-map equation so that same equation with different alphabet + # representations will look the same. 
+ letter_order = OrderedDict((k, 0) for k in equation if k.isalpha()).keys() + mapping = {ord(x): 97 + i for i, x in enumerate(letter_order)} + equation = equation.translate(mapping) + + np_arrs = [np.zeros(s) for s in input_shapes] + optim = np.einsum_path(equation, *np_arrs, optimize="optimal")[1] + for line in optim.split("\n"): + if "optimized flop" in line.lower(): + flop = int(float(line.split(":")[-1])) + return flop, 0 + + raise NotImplementedError("Unsupported einops.einsum operation.") + + def _tensor_addmm_flops_compute(self, mat1, mat2, *, beta=1, alpha=1, out=None): """ Count flops for the tensor addmm operation. @@ -827,6 +854,15 @@ def _elementwise_flops_compute(input, other): return flops, 0 +def _attn_flops_compute(q, k, v, *args, **kwargs): + """ + Count flops for the scaled_dot_product_attention operation. + """ + macs = _prod(q.shape) * k.shape[-2] + macs += _prod(q.shape[:-1]) * k.shape[-2] * v.shape[-1] + return 2 * macs, macs + + def wrapFunc(func, funcFlopCompute): oldFunc = func name = func.__str__ @@ -899,10 +935,15 @@ def _patch_functionals(): # embedding F.embedding = wrapFunc(F.embedding, _embedding_flops_compute) + # attn - scaled_dot_product_attention added in torch 2.0+ + if required_torch_version(min_version=2.0): + F.scaled_dot_product_attention = wrapFunc(F.scaled_dot_product_attention, _attn_flops_compute) + def _patch_tensor_methods(): torch.matmul = wrapFunc(torch.matmul, _matmul_flops_compute) torch.Tensor.matmul = wrapFunc(torch.Tensor.matmul, _matmul_flops_compute) + torch.Tensor.__matmul__ = wrapFunc(torch.Tensor.__matmul__, _matmul_flops_compute) torch.mm = wrapFunc(torch.mm, _matmul_flops_compute) torch.Tensor.mm = wrapFunc(torch.Tensor.mm, _matmul_flops_compute) torch.bmm = wrapFunc(torch.bmm, _matmul_flops_compute) @@ -922,6 +963,10 @@ def _patch_tensor_methods(): torch.baddbmm = wrapFunc(torch.baddbmm, _tensor_addmm_flops_compute) +def _patch_miscellaneous_operations(): + einops.einsum = wrapFunc(einops.einsum, _einops_einsum_flops_compute) + + def _reload_functionals(): # torch.nn.functional does not support importlib.reload() F.linear = old_functions[F.linear.__str__] @@ -980,6 +1025,10 @@ def _reload_tensor_methods(): torch.baddbmm = old_functions[torch.baddbmm.__str__] +def _reload_miscellaneous_operations(): + einops.einsum = old_functions[einops.einsum.__str__] + + def _rnn_flops(flops, rnn_module, w_ih, w_hh, input_size): gates_size = w_ih.shape[0] # matrix matrix mult ih state and internal state diff --git a/deepspeed/pydantic_v1.py b/deepspeed/pydantic_v1.py deleted file mode 100644 index 6aba072ad929..000000000000 --- a/deepspeed/pydantic_v1.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# SPDX-License-Identifier: Apache-2.0 - -# DeepSpeed Team -"""Pydantic v1 compatibility module. - -Pydantic v2 introduced breaking changes that hinder its adoption: -https://docs.pydantic.dev/latest/migration/. To provide deepspeed users the option to -migrate to pydantic v2 on their own timeline, deepspeed uses this compatibility module -as a pydantic-version-agnostic alias for pydantic's v1 API. 
-""" - -try: - from pydantic.v1 import * # noqa: F401 -except ImportError: - from pydantic import * # noqa: F401 diff --git a/deepspeed/runtime/__init__.py b/deepspeed/runtime/__init__.py index 347ff7993d82..208299fb8c50 100644 --- a/deepspeed/runtime/__init__.py +++ b/deepspeed/runtime/__init__.py @@ -2,11 +2,3 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team - - -class DeepSpeedOptimizer(object): - pass - - -class ZeROOptimizer(DeepSpeedOptimizer): - pass diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py index 72a7bc0516ba..08c4b81937f9 100644 --- a/deepspeed/runtime/activation_checkpointing/checkpointing.py +++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py @@ -25,17 +25,21 @@ from deepspeed.runtime.config import DeepSpeedConfig from deepspeed.utils import logger -from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage, bwc_tensor_model_parallel_rank +from deepspeed.runtime.utils import copy_to_device, move_to_device, see_memory_usage from deepspeed.utils.timer import SynchronizedWallClockTimer as Timers, FORWARD_GLOBAL_TIMER +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from deepspeed.accelerator import get_accelerator +from deepspeed.runtime import compiler # DeepSpeed Checkpointing Enabled or Disabled deepspeed_checkpointing_enabled = False # MP parameters mpu = None -mp_rank = None -mp_size = None + +#set default values +mp_rank = 0 +mp_size = 1 mp_group = None # Model Parameters @@ -59,8 +63,6 @@ # Default name for the model parallel rng tracker. _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' -transport_stream = None -cuda_device = None def detach_variable(inputs, device=None): @@ -289,13 +291,9 @@ def gather_partitioned_activations(tensors, device=None): flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=device) else: flat_tensor = torch.zeros([tensor_size], dtype=item.dtype, device=item.device) - partitions = [] - for i in range(mp_size): - part_i = flat_tensor.narrow(0, partition_size * i, partition_size) - if i == mp_rank: - part_i.copy_(item) - partitions.append(part_i) - dist.all_gather(partitions, partitions[mp_rank], group=mp_group) + part = flat_tensor.narrow(0, partition_size * mp_rank, partition_size) + part.copy_(item) + dist.all_gather_into_tensor(flat_tensor, part, group=mp_group) input_tensor = flat_tensor.view(list(size.numpy())) item.data = input_tensor.data @@ -371,7 +369,9 @@ def is_activation_to_checkpoint(item): Is an activation to be checkpointed """ global mp_size - return torch.is_tensor(item) and item.is_floating_point() and item.numel() >= mp_size + extra_flag = (not hasattr(item, 'no_checkpointing')) or (hasattr(item, 'no_checkpointing') + and item.no_checkpointing == False) + return torch.is_tensor(item) and item.is_floating_point() and item.numel() >= mp_size and extra_flag def partition_activations(args, cpu_checkpoint, contiguous_checkpoint): @@ -443,7 +443,9 @@ def get_partitioned_activations_for_backward(args, inputs, contiguous_checkpoint num_non_fp_tensors += 1 continue - arg.data = inp.data + arg.data = torch.empty([], device=arg.device).data + arg.saved_data = inp.data + new_args.append(arg) i = arg_index - num_non_fp_tensors @@ -476,7 +478,8 @@ def get_cpu_activations_for_backward(args, inputs): new_args.append(arg) continue - arg.data = inp.data + arg.data = torch.empty([], device=arg.device).data + arg.saved_data = inp.data new_args.append(arg) return new_args @@ 
-517,35 +520,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -630,7 +608,14 @@ def backward(ctx, *grads): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) + # Rebuild deepspeed_saved_tensors + for t in ctx.deepspeed_saved_tensors: + if t is not None and hasattr(t, 'saved_data') and t.saved_data is not None: + t.data = t.saved_data.to(t.device) + t.saved_data = None if PARTITION_ACTIVATIONS: # with get_accelerator().stream(transport_stream): @@ -757,35 +742,10 @@ def save_args_for_backward(*all_args): global mp_rank, mp_size, mp_group global contiguous_data_buffers, contiguous_size_buffers global data_offsets, size_offsets - if mp_rank is None: - if mpu is not None: - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - mp_rank = mpu.get_tensor_model_parallel_rank() - mp_size = mpu.get_tensor_model_parallel_world_size() - mp_group = mpu.get_tensor_model_parallel_group() - else: - mp_rank = mpu.get_model_parallel_rank() - mp_size = mpu.get_model_parallel_world_size() - mp_group = mpu.get_model_parallel_group() - else: - mp_rank = 0 - mp_size = 1 - mp_group = None - - global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - - if cuda_device is None: - see_memory_usage("First Forward Beginning", force=False) - if dist.get_rank() == 0: - logger.info(f"Activation Checkpointing Information") - logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") - logger.info( - f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") - logger.info(f"----Synchronization {SYNCHRONIZE}") - 
logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + global PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset - cuda_device = get_accelerator().current_device_name() - transport_stream = get_accelerator().Stream(device=cuda_device) + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) if PARTITION_ACTIVATIONS: inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING) @@ -892,7 +852,9 @@ def replay_unpack(none_value): raise RuntimeError("Checkpointing is not compatible with .grad(), " "please use .backward() if possible") - global cuda_device, transport_stream, PARTITION_ACTIVATIONS + global PARTITION_ACTIVATIONS + cuda_device = get_accelerator().current_device_name() + transport_stream = get_accelerator().Stream(device=cuda_device) # gather inputs which is partitioned or checkpointed before first forward if PARTITION_ACTIVATIONS: @@ -958,8 +920,9 @@ def after_backward_hook(_nonuse_grads): with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack): outputs = function(*inputs_cuda) - for leaf_tensor in leaf_tensors: - leaf_tensor.register_hook(after_backward_hook) + if PROFILE_TIME or SYNCHRONIZE: + for leaf_tensor in leaf_tensors: + leaf_tensor.register_hook(after_backward_hook) see_memory_usage("After running forward on the layer", force=False) @@ -981,6 +944,7 @@ def after_backward_hook(_nonuse_grads): return tuple(all_outputs) +@compiler.disable # WA from Pytorch repo for compile + zero 3 accuracy issue def checkpoint(function, *args): """Checkpoint a model or part of the model. This has been directly copied from torch.utils.checkpoint. """ @@ -1143,6 +1107,27 @@ def configure( if CONTIGUOUS_CHECKPOINTING: assert num_layers is not None, "Must specify the number of layers with contiguous memory checkpointing" + global mp_rank, mp_size, mp_group + + if mpu is not None: + if hasattr(mpu, 'get_tensor_model_parallel_rank'): + mp_rank = mpu.get_tensor_model_parallel_rank() + mp_size = mpu.get_tensor_model_parallel_world_size() + mp_group = mpu.get_tensor_model_parallel_group() + else: + mp_rank = mpu.get_model_parallel_rank() + mp_size = mpu.get_model_parallel_world_size() + mp_group = mpu.get_model_parallel_group() + + #print configuration only once + see_memory_usage("After configuration", force=False) + if dist.get_rank() == 0: + logger.info(f"Activation Checkpointing Information") + logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}") + logger.info(f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers") + logger.info(f"----Synchronization {SYNCHRONIZE}") + logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}") + def is_configured(): """True if deepspeed activation checkpointing has been configured diff --git a/deepspeed/runtime/base_optimizer.py b/deepspeed/runtime/base_optimizer.py new file mode 100644 index 000000000000..d2c54155da89 --- /dev/null +++ b/deepspeed/runtime/base_optimizer.py @@ -0,0 +1,63 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import torch + +from deepspeed.utils import logger +from deepspeed.utils.tensor_fragment import map_to_flat_opt_states +from deepspeed.runtime.utils import bwc_tensor_model_parallel_rank + + +class DeepSpeedOptimizer(object): + pass + + +class ZeROOptimizer(DeepSpeedOptimizer): + + def load_hp_checkpoint_state_from_checkpoint_dir(self, lp_groups_name: str, checkpoint_dir: str) -> None: + checkpoint_dir = os.path.join(checkpoint_dir, "zero") + optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") + assert os.path.isfile( + optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' + optim_sd = torch.load(optim_state_path, weights_only=False) + + self._load_global_state(optim_sd) + + tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + if self.mpu is None: + logger.warning("MPU is not provided, setting tp size to 1 in checkpoint loading.") + tp_world_size = 1 + else: + tp_world_size = self.mpu.get_slice_parallel_world_size() if hasattr(self.mpu, "get_slice_parallel_world_size") \ + else self.mpu.get_tensor_model_parallel_world_size() + + for i, (param_group, + loaded_param_group) in enumerate(zip(self.optimizer.param_groups, optim_sd['param_groups'])): + # We have an assumption that all params in the same param_group have the same keys + opt_keys = set() + steps = [] + + lp_groups = getattr(self, lp_groups_name) + for lp in lp_groups[i]: + if lp._hp_mapping is not None: + #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") + step = lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, + tp_world_size) + for key in lp._hp_mapping.get_optim_state_keys(): + opt_keys.add(key) + steps.append(step) + + hp_param = param_group['params'][0] + assert all(step == steps[0] for step in steps), f"Steps {steps} are not equal" + if steps[0] is not None: + self.optimizer.state[hp_param]['step'] = steps[0] + + map_to_flat_opt_states(hp_param, lp_groups[i], self.optimizer.state, opt_keys) + + for key, value in loaded_param_group.items(): + if key == 'params': + continue + param_group[key] = value diff --git a/deepspeed/runtime/bf16_optimizer.py b/deepspeed/runtime/bf16_optimizer.py index 494816e6a846..78895e70df03 100644 --- a/deepspeed/runtime/bf16_optimizer.py +++ b/deepspeed/runtime/bf16_optimizer.py @@ -6,19 +6,18 @@ from collections import OrderedDict import torch import sys -import os from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed import comm as dist from deepspeed.runtime.constants import PIPE_REPLICATED -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from packaging import version as pkg_version - from deepspeed.git_version_info import version from deepspeed.runtime.utils import (get_global_norm_of_tensors, clip_tensors_by_global_norm, DummyOptim, - align_dense_tensors, all_gather_dp_groups, bwc_tensor_model_parallel_rank, - is_model_parallel_parameter, see_memory_usage) - -from deepspeed.utils import link_hp_params, fragment_address + align_dense_tensors, all_gather_dp_groups, is_model_parallel_parameter, + see_memory_usage, graph_process, get_norm_with_moe_layers) +from deepspeed.utils import link_hp_params, lazy_init_hp_params_optimizer_state, fragment_address, groups +from deepspeed.moe.utils import is_moe_param, is_moe_param_group +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from 
deepspeed.checkpoint import enable_universal_checkpoint from deepspeed.checkpoint.constants import (DS_VERSION, PARTITION_COUNT, BASE_OPTIMIZER_STATE, SINGLE_PARTITION_OF_FP32_GROUPS, CLIP_GRAD, GROUP_PADDINGS, @@ -27,6 +26,11 @@ setattr(sys.modules[__name__], 'fragment_address', fragment_address) +def print_rank_0(message, debug=False, force=False): + if dist.get_rank() == 0 and (debug or force): + print(message) + + class BF16_Optimizer(ZeROOptimizer): def __init__(self, @@ -37,7 +41,11 @@ def __init__(self, norm_type=2, allgather_bucket_size=5000000000, dp_process_group=None, - timers=None): + timers=None, + grad_acc_dtype=None, + graph_harvesting=False, + immediate_grad_update=False, + has_moe_layers=False): super().__init__() see_memory_usage('begin bf16_optimizer', force=True) self.timers = timers @@ -45,13 +53,22 @@ def __init__(self, self.param_names = param_names self.using_real_optimizer = not isinstance(self.optimizer, DummyOptim) + assert grad_acc_dtype in [torch.float32, torch.bfloat16 + ], f"BF16Optimizer: Unsupported gradient accumulation data type: {grad_acc_dtype}" + self.grad_acc_dtype = grad_acc_dtype + self.immediate_grad_update = immediate_grad_update + self.clip_grad = clip_grad self.norm_type = norm_type self.mpu = mpu self.allgather_bucket_size = int(allgather_bucket_size) self.dp_process_group = dp_process_group self.dp_rank = dist.get_rank(group=self.dp_process_group) + self.has_moe_layers = has_moe_layers + self.non_expert_gradients = [] self.real_dp_process_group = [dp_process_group for i in range(len(self.optimizer.param_groups))] + if self.has_moe_layers: + self._configure_moe_settings() # Use torch (un)flatten ops self.flatten = _flatten_dense_tensors @@ -76,17 +93,41 @@ def __init__(self, self.fp32_groups_has_gradients = [] self.group_paddings = [] - + self.graph_harvesting = graph_harvesting if self.using_real_optimizer: self._setup_for_real_optimizer() - see_memory_usage('end bf16_optimizer', force=True) + see_memory_usage('end bf16_ optimizer', force=True) + + def destroy(self): + for i, _ in enumerate(self.optimizer.param_groups): + for p in self.bf16_groups[i]: + if getattr(p, '_hp_mapping', None): + p._hp_mapping = None + for hook in self._grad_acc_hooks: + hook.remove() + print_rank_0("Removed grad acc hooks") + + def _configure_moe_settings(self): + assert any( + [is_moe_param_group(group) for group in self.optimizer.param_groups] + ), "The model has moe layers, but None of the param groups are marked as MoE. 
Create a param group with 'moe' key set to True before creating optimizer" + + for i, group in enumerate(self.optimizer.param_groups): + if is_moe_param_group(group): + assert all([is_moe_param(param) + for param in group['params']]), "All params in MoE group must be MoE params" + self.real_dp_process_group[i] = groups._get_expert_data_parallel_group(group['name']) + self.expert_gradients = {} + if self.has_moe_layers: + for key in groups._get_expert_data_parallel_group_dict().keys(): + self.expert_gradients[key] = [] def _setup_for_real_optimizer(self): - dp_world_size = dist.get_world_size(group=self.dp_process_group) - self.partition_count = [dp_world_size for i in range(len(self.optimizer.param_groups))] + self.partition_count = [dist.get_world_size(group=pg) for pg in self.real_dp_process_group] for i, param_group in enumerate(self.optimizer.param_groups): + real_dp_world_size = dist.get_world_size(group=self.real_dp_process_group[i]) see_memory_usage(f'before initializing group {i}', force=True) partition_id = dist.get_rank(group=self.real_dp_process_group[i]) @@ -98,17 +139,16 @@ def _setup_for_real_optimizer(self): # create flat bf16 params self.bf16_groups_flat.append( self._flatten_dense_tensors_aligned(self.bf16_groups[i], - self.nccl_start_alignment_factor * dp_world_size)) - + self.nccl_start_alignment_factor * real_dp_world_size)) # Make bf16 params point to flat tensor storage self._update_storage_to_flattened_tensor(tensor_list=self.bf16_groups[i], flat_tensor=self.bf16_groups_flat[i]) # divide flat weights into equal sized partitions - partition_size = self.bf16_groups_flat[i].numel() // dp_world_size + partition_size = self.bf16_groups_flat[i].numel() // real_dp_world_size bf16_dp_partitions = [ self.bf16_groups_flat[i].narrow(0, dp_index * partition_size, partition_size) - for dp_index in range(dp_world_size) + for dp_index in range(real_dp_world_size) ] self.bf16_partitioned_groups.append(bf16_dp_partitions) @@ -119,7 +159,12 @@ def _setup_for_real_optimizer(self): num_elem_list = [t.numel() for t in self.bf16_groups[i]] # create fp32 gradients - self.fp32_groups_gradients_flat.append(torch.zeros_like(self.bf16_groups_flat[i], dtype=torch.float32)) + fp32_flat_buffer = torch.zeros_like(self.bf16_groups_flat[i], dtype=self.grad_acc_dtype) + self.fp32_groups_gradients_flat.append(fp32_flat_buffer) + if self.has_moe_layers and is_moe_param_group(param_group): + self.expert_gradients[param_group['name']].append(fp32_flat_buffer) + else: + self.non_expert_gradients.append(fp32_flat_buffer) # track individual fp32 gradients for entire model fp32_gradients = self._split_flat_tensor(flat_tensor=self.fp32_groups_gradients_flat[i], @@ -152,12 +197,13 @@ def _setup_for_real_optimizer(self): see_memory_usage(f'after initializing group {i}', force=True) - see_memory_usage('before initialize_optimizer', force=True) - self.initialize_optimizer_states() - see_memory_usage('end initialize_optimizer', force=True) + self._grad_acc_hooks = [] + if self.immediate_grad_update: + self.create_grad_acc_hooks() # Need optimizer states initialized before linking lp to optimizer state self._link_all_hp_params() + self._hp_optimizer_states_linked = False self._enable_universal_checkpoint() self._param_slice_mappings = self._create_param_mapping() @@ -178,11 +224,12 @@ def _create_param_mapping(self): return param_mapping def _link_all_hp_params(self): - dp_world_size = dist.get_world_size(group=self.dp_process_group) for i, _ in enumerate(self.optimizer.param_groups): + real_dp_world_size = 
dist.get_world_size(group=self.real_dp_process_group[i]) + # Link bf16 and fp32 params in partition partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - partition_size = self.bf16_groups_flat[i].numel() // dp_world_size + partition_size = self.bf16_groups_flat[i].numel() // real_dp_world_size flat_hp_partition = self.fp32_groups_flat_partition[i] link_hp_params(lp_param_list=self.bf16_groups[i], flat_hp_partition=flat_hp_partition, @@ -192,23 +239,14 @@ def _link_all_hp_params(self): param_group_index=i, partition_start=partition_id * partition_size, partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], dp_group=self.real_dp_process_group[i]) - def initialize_optimizer_states(self): - """Take an optimizer step with zero-valued gradients to allocate internal - optimizer state. - - This helps prevent memory fragmentation by allocating optimizer state at the - beginning of training instead of after activations have been allocated. - """ - for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, - self.fp32_groups_gradient_flat_partition): - param_partition.grad = grad_partition - - self.optimizer.step() - - self.clear_hp_grads() + def _lazy_init_hp_params_optimizer_state(self): + if not self._hp_optimizer_states_linked: + for i, _ in enumerate(self.optimizer.param_groups): + lazy_init_hp_params_optimizer_state(self.bf16_groups[i], self.fp32_groups_flat_partition[i], + self.optimizer.state) + self._hp_optimizer_states_linked = True def _split_flat_tensor(self, flat_tensor, num_elem_list): assert sum(num_elem_list) <= flat_tensor.numel() @@ -234,9 +272,18 @@ def step(self, closure=None): if closure is not None: raise NotImplementedError(f'{self.__class__} does not support closure.') - all_groups_norm = get_global_norm_of_tensors(input_tensors=self.get_grads_for_norm(), - mpu=self.mpu, - norm_type=self.norm_type) + non_expert_grads_for_norm, expert_grads_for_norm = self.get_grads_for_norm() + non_expert_groups_norm = get_global_norm_of_tensors(input_tensors=non_expert_grads_for_norm, + mpu=self.mpu, + norm_type=self.norm_type, + use_graph=self.graph_harvesting) + all_groups_norm = non_expert_groups_norm + if self.has_moe_layers: + all_groups_norm = get_norm_with_moe_layers(non_expert_groups_norm, + mpu=self.mpu, + expert_tensors=expert_grads_for_norm, + norm_type=self.norm_type) + self._global_grad_norm = all_groups_norm assert all_groups_norm > 0. @@ -244,10 +291,24 @@ def step(self, closure=None): clip_tensors_by_global_norm(input_tensors=self.get_grads_for_norm(for_clipping=True), max_norm=self.clip_grad, global_norm=all_groups_norm, - mpu=self.mpu) + mpu=self.mpu, + use_graph=self.graph_harvesting) + + for param_partition, grad_partition in zip(self.fp32_groups_flat_partition, + self.fp32_groups_gradient_flat_partition): + # In case of grad acc dtype different than FP32, need to cast to high precision. 
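+            # (the wrapped optimizer steps on the fp32 master partitions, so a lower-precision
+            #  grad buffer is upcast here before being attached as .grad)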
+ param_partition.grad = grad_partition.to( + param_partition.dtype) if grad_partition.dtype != param_partition.dtype else grad_partition self.optimizer.step() + if self.grad_acc_dtype is not torch.float32: + for param_partition in self.fp32_groups_flat_partition: + param_partition.grad = None + + # We need to link optimizer state after the first step() call + self._lazy_init_hp_params_optimizer_state() + self.update_lp_params() self.clear_hp_grads() @@ -267,48 +328,96 @@ def backward(self, loss, update_hp_grads=True, clear_lp_grads=False, **bwd_kwarg if update_hp_grads: self.update_hp_grads(clear_lp_grads=clear_lp_grads) + @torch.no_grad() + def _update_hp_grad(self, lp, group_idx, param_idx, clear_lp_grads): + if lp.grad is None: + return + + hp_grad = self.fp32_groups_gradients[group_idx][param_idx] + assert hp_grad is not None, \ + f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{group_idx}][{param_idx}]' + + hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape)) + lp._hp_grad = hp_grad + self.fp32_groups_has_gradients[group_idx][param_idx] = True + + # clear gradients + if clear_lp_grads: + lp.grad.zero_() + + @torch.no_grad() + def _update_hp_grads_func(self, clear_lp_grads=False): + for i, group in enumerate(self.bf16_groups): + for j, lp in enumerate(group): + self._update_hp_grad(lp, i, j, clear_lp_grads) + @torch.no_grad() def update_hp_grads(self, clear_lp_grads=False): + if self.immediate_grad_update: + return + + if self.graph_harvesting: + graph_process(False, self._update_hp_grads_func, clear_lp_grads) + else: + self._update_hp_grads_func(clear_lp_grads) + #cpu op for i, group in enumerate(self.bf16_groups): for j, lp in enumerate(group): if lp.grad is None: continue - - hp_grad = self.fp32_groups_gradients[i][j] - assert hp_grad is not None, \ - f'high precision param has no gradient, lp param_id = {id(lp)} group_info = [{i}][{j}]' - - hp_grad.data.add_(lp.grad.data.to(hp_grad.dtype).view(hp_grad.shape)) - lp._hp_grad = hp_grad self.fp32_groups_has_gradients[i][j] = True - # clear gradients - if clear_lp_grads: - lp.grad = None - @torch.no_grad() def get_grads_for_reduction(self): - return self.fp32_groups_gradients_flat + if self.has_moe_layers: + return self.non_expert_gradients, self.expert_gradients + return self.non_expert_gradients, {} @torch.no_grad() def get_grads_for_norm(self, for_clipping=False): - grads = [] + """ + Returns: + tuple[list[Tensor], dict[ep_name, List[Tensor]] | list: + If for_clipping, return all gradients. + Otherwise, separate and return dict of expert_grad and list of non_expert_grad + """ + # (grads, expert_group_name) + expert_grads_for_norm = {} + + # grads + non_expert_grads_for_norm = [] + all_grads_for_clip = [] + tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) + assert len(self.bf16_groups) == len(self.optimizer.param_groups) for i, group in enumerate(self.bf16_groups): for j, lp in enumerate(group): if not for_clipping: if hasattr(lp, PIPE_REPLICATED) and lp.ds_pipe_replicated: continue - if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp)): + # skip duplicated parameters. perform norm only on cards with tp_rank=0. + # non-duplicated parameters include: + # - Parameters with tp: Use allreducesum of mp_group. + # - Moe Parameters with ep: Use allreducesum of ep_group. 
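+                    # Everything else on tp_rank != 0 is a replica and would be double-counted in the norm.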
+ if not (tensor_mp_rank == 0 or is_model_parallel_parameter(lp) or is_moe_param(lp)): continue if not self.fp32_groups_has_gradients[i][j]: continue - - grads.append(self.fp32_groups_gradients[i][j]) - - return grads + if not for_clipping: + param_group = self.optimizer.param_groups[i] + if self.has_moe_layers and is_moe_param_group(param_group): + if param_group['name'] not in expert_grads_for_norm: + expert_grads_for_norm[param_group['name']] = [] + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_gradients[i][j]) + else: + non_expert_grads_for_norm.append(self.fp32_groups_gradients[i][j]) + else: + all_grads_for_clip.append(self.fp32_groups_gradients[i][j]) + if not for_clipping: + return non_expert_grads_for_norm, expert_grads_for_norm + return all_grads_for_clip @torch.no_grad() def update_lp_params(self): @@ -320,7 +429,8 @@ def update_lp_params(self): # if i == 0: # print_rank_0(f'{fp32_partition[:10]=}', force=True) - all_gather_dp_groups(partitioned_param_groups=self.bf16_partitioned_groups, + all_gather_dp_groups(groups_flat=self.bf16_groups_flat, + partitioned_param_groups=self.bf16_partitioned_groups, dp_process_group=self.real_dp_process_group, start_alignment_factor=self.nccl_start_alignment_factor, allgather_bucket_size=self.allgather_bucket_size) @@ -333,9 +443,20 @@ def clear_hp_grads(self): self.fp32_groups_has_gradients[i] = [False] * len(group) def clear_lp_grads(self): + + # using zero_() fixed memory address for graph replay + set_to_none = False if self.graph_harvesting else True + zero_grads_list = [] for group in self.bf16_groups: for param in group: - param.grad = None + if set_to_none: + param.grad = None + elif param.grad is not None: + if param.grad.grad_fn is not None: + param.grad.detach_() + zero_grads_list.append(param.grad) + if not set_to_none and len(zero_grads_list) > 0: + torch._foreach_zero_(zero_grads_list) def state_dict(self): state_dict = {} @@ -351,20 +472,21 @@ def state_dict(self): # Restore base optimizer fp32 weights bfloat16 weights def _restore_from_bit16_weights(self): - for i, group in enumerate(self.bf16_groups): + for i, (bf16_partitions, + fp32_partition) in enumerate(zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition)): partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - for bf16_partitions, fp32_partition in zip(self.bf16_partitioned_groups, self.fp32_groups_flat_partition): - fp32_partition.data.copy_(bf16_partitions[partition_id].data) + fp32_partition.data.copy_(bf16_partitions[partition_id].data) def refresh_fp32_params(self): self._restore_from_bit16_weights() def load_state_dict(self, state_dict_list, - checkpoint_folder, + checkpoint_folder=None, load_optimizer_states=True, load_from_fp32_weights=False, - load_serial=None): + load_serial=None, + param_shapes=None): if checkpoint_folder: self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: @@ -382,6 +504,7 @@ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, l self.clip_grad = current_rank_sd.get(CLIP_GRAD, self.clip_grad) if load_optimizer_states: + print(f"_load_legacy_checkpoint current_rank_sd[BASE_OPTIMIZER_STATE]") self.optimizer.load_state_dict(current_rank_sd[BASE_OPTIMIZER_STATE]) if load_from_fp32_weights: @@ -394,24 +517,42 @@ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, l self._link_all_hp_params() def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): - 
self._load_hp_checkpoint_state(checkpoint_folder) + self.load_hp_checkpoint_state_from_checkpoint_dir("bf16_groups", checkpoint_folder) + + def _load_global_state(self, sd): + pass @property def param_groups(self): """Forward the wrapped optimizer's parameters.""" return self.optimizer.param_groups - def _load_hp_checkpoint_state(self, checkpoint_dir): - checkpoint_dir = os.path.join(checkpoint_dir, "zero") - tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) - tp_world_size = self.mpu.get_slice_parallel_world_size() + @property + def state(self): + """Forward the wrapped optimizer's states.""" + return self.optimizer.state - for i, _ in enumerate(self.optimizer.param_groups): - for lp in self.bf16_groups[i]: - if lp._hp_mapping is not None: - #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, - tp_world_size) + def accumulate_hp_grads_and_remove_lp(self, lp_param, group_idx, param_idx): + assert self.immediate_grad_update + self._update_hp_grad(lp_param, group_idx, param_idx, clear_lp_grads=False) + + def create_grad_acc_hooks(self): + self.grad_accs = [] + for i, param_group in enumerate(self.bf16_groups): + for j, param in enumerate(param_group): + if param.requires_grad: + + def wrapper(param, i, j): + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + + def accumulate_hp_grads_and_remove_lp(*notneeded): + self.accumulate_hp_grads_and_remove_lp(param, i, j) + + self._grad_acc_hooks.append(grad_acc.register_hook(accumulate_hp_grads_and_remove_lp)) + self.grad_accs.append(grad_acc) + + wrapper(param, i, j) def _get_padded_tensor(src_tensor, size): diff --git a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py index e26e3243c4b5..e834bf0d22d7 100644 --- a/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/nebula_checkpoint_engine.py @@ -58,7 +58,7 @@ def load(self, path: str, map_location=None): if not self.enable_nebula_load and first_load_flag: self.tag_flag = tag logger.info(f"[Nebula] Disable nebula load. Loading checkpoint from {path} ...") - partition = torch.load(path, map_location=map_location) + partition = torch.load(path, map_location=map_location, weights_only=False) logger.info(f"[Nebula] Disable nebula load. 
Loaded checkpoint from {path} .") return partition diff --git a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py index 5cd44864bb2e..076c638532ad 100644 --- a/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py +++ b/deepspeed/runtime/checkpoint_engine/torch_checkpoint_engine.py @@ -25,7 +25,7 @@ def save(self, state_dict, path: str): def load(self, path: str, map_location=None): logger.info(f"[Torch] Loading checkpoint from {path}...") - partition = torch.load(path, map_location=map_location) + partition = torch.load(path, map_location=map_location, weights_only=False) logger.info(f"[Torch] Loaded checkpoint from {path}.") return partition diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py index b8134b453e39..2fadce52222c 100644 --- a/deepspeed/runtime/comm/coalesced_collectives.py +++ b/deepspeed/runtime/comm/coalesced_collectives.py @@ -8,15 +8,15 @@ """ import math -from typing import List +from typing import List, Any import torch from torch import Tensor from deepspeed import comm as dist -# NOTE: Use torch.distributed's ProcessGroup class until we have our own. -from torch.distributed import ProcessGroup, all_to_all_single +from deepspeed.comm import ProcessGroup, all_to_all_single from deepspeed.accelerator import get_accelerator from deepspeed.utils import instrument_w_nvtx from deepspeed.ops import op_builder +from deepspeed.utils import logger def _torch_reduce_scatter_fn(input_tensor: Tensor, output_tensor: Tensor, group=None, async_op=False, prof=False): @@ -41,9 +41,15 @@ def all_to_all_quant_reduce(tensors: List[Tensor], groups: {}) -> List[Tensor]: output_lst: List[Tensor] = [None] * len(tensors) for idx, tensor in enumerate(tensors): if tensor.dim() == 1: - intra_quant_group = global_world_size output_lst[idx] = reduce_scatter_coalesced([tensor])[0] - continue + elif tensor.numel() % (2 * global_world_size) != 0: + # Due to the constraint of 2-stage all-to-all, the input tensor must be divisible by 2 * global_world_size + # Otherwise, all-to-all cannot be performed because of shape mismatch. + # See more at https://github.com/deepspeedai/DeepSpeed/pull/5056 + logger.warning( + f"qgZ falls back to reduce_scatter because tensor size = {tensor.numel()} is not divisible by (2 * global_world_size) = {2 * global_world_size}. 
Please consider allocating a new world to enable qgZ" + ) + output_lst[idx] = reduce_scatter_coalesced([tensor])[0] else: intra_quant_group = max(tensor.shape[0], tensor.shape[1], global_world_size) @@ -64,7 +70,86 @@ def all_to_all_quant_reduce(tensors: List[Tensor], groups: {}) -> List[Tensor]: all_to_all_single(global_scale_output, global_scales, group=groups[f'global_{inter_idx}']) final_output = quantizer_module.dequantize(global_output, global_scale_output, global_scale_output.numel(), 4, quantizer_module.Symmetric) + assert final_output.numel( + ) % num_nodes == 0, f"final_output.numel()={final_output.numel()} is not divisible by num_nodes={num_nodes}" + output_lst[idx] = (sum(list(final_output.chunk(num_nodes))) / num_nodes).view(-1) + return output_lst + + +@instrument_w_nvtx +@torch.no_grad() +def all_to_all_loco_quant_reduce( + params: List[Tensor], + groups: {}, + loco_param: Any = None, +) -> List[Tensor]: + global quantizer_module + global loco_idx + if quantizer_module is None: + quantizer_module = op_builder.QuantizerBuilder().load() + local_world_size = get_accelerator().device_count() + global_world_size = dist.get_world_size() + num_nodes = global_world_size // local_world_size + this_rank = dist.get_rank() + intra_idx = int(this_rank / local_world_size) + inter_idx = this_rank % local_world_size + output_lst: List[Tensor] = [None] * len(params) + for idx, p in enumerate(params): + tensor = p.grad + if tensor.dim() == 1: + output_lst[idx] = reduce_scatter_coalesced([tensor])[0] + elif tensor.numel() % (2 * global_world_size) != 0: + # Due to the constraint of 2-stage all-to-all, the input tensor must be divisible by 2 * global_world_size + # Otherwise, all-to-all cannot be performed because of shape mismatch. + # See more at https://github.com/deepspeedai/DeepSpeed/pull/5056 + logger.warning( + f"qgZ falls back to reduce_scatter because tensor size = {tensor.numel()} is not divisible by (2 * global_world_size) = {2 * global_world_size}. 
Please consider allocating a new world to enable qgZ" + ) + output_lst[idx] = reduce_scatter_coalesced([tensor])[0] + else: + err_beta = loco_param['err_beta'] + reset_T = loco_param['reset_T'] + if not hasattr(p, 'intra_ef_buf') or loco_idx > reset_T: + loco_idx = 0 + intra_err = torch.zeros_like(p.grad) + inter_err = torch.zeros(tensor.numel() // local_world_size, device=tensor.device, dtype=tensor.dtype) + else: + intra_err = quantizer_module.dequantize(p.intra_ef_buf[0], p.intra_ef_buf[1], + p.intra_ef_buf[1].numel(), 8, quantizer_module.Symmetric) + inter_err = quantizer_module.dequantize(p.inter_ef_buf[0], p.inter_ef_buf[1], + p.inter_ef_buf[1].numel(), 8, quantizer_module.Symmetric) + + intra_quant_group = max(tensor.shape[0], tensor.shape[1], global_world_size) + inter_quant_group = intra_quant_group // local_world_size + intra_quant_int4, intra_q_scales = quantizer_module.loco_swizzle_quant(tensor, intra_err, err_beta, + intra_quant_group, 4, + quantizer_module.Symmetric, 1, + num_nodes, local_world_size) + local_output = torch.empty_like(intra_quant_int4) + scale_output = torch.empty_like(intra_q_scales) + all_to_all_single(local_output, intra_quant_int4, group=groups[f'local_{intra_idx}']) + all_to_all_single(scale_output, intra_q_scales, group=groups[f'local_{intra_idx}']) + + p.intra_ef_buf = quantizer_module.quantize(intra_err, intra_quant_group, 8, quantizer_module.Symmetric) + + global_input_tensor, global_scales = quantizer_module.loco_quantized_reduction( + local_output, scale_output, inter_err, err_beta, intra_quant_group, inter_quant_group, 4, + quantizer_module.Symmetric, local_world_size) + + global_output = torch.empty_like(global_input_tensor) + global_scale_output = torch.empty_like(global_scales) + all_to_all_single(global_output, global_input_tensor, group=groups[f'global_{inter_idx}']) + all_to_all_single(global_scale_output, global_scales, group=groups[f'global_{inter_idx}']) + + p.inter_ef_buf = quantizer_module.quantize(inter_err, inter_quant_group, 8, quantizer_module.Symmetric) + + final_output = quantizer_module.dequantize(global_output, global_scale_output, global_scale_output.numel(), + 4, quantizer_module.Symmetric) + assert final_output.numel( + ) % num_nodes == 0, f"final_output.numel()={final_output.numel()} is not divisible by num_nodes={num_nodes}" output_lst[idx] = (sum(list(final_output.chunk(num_nodes))) / num_nodes).view(-1) + loco_idx += 1 + return output_lst diff --git a/deepspeed/runtime/comm/compressed.py b/deepspeed/runtime/comm/compressed.py new file mode 100644 index 000000000000..2c5482eb1ad7 --- /dev/null +++ b/deepspeed/runtime/comm/compressed.py @@ -0,0 +1,137 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import numpy as np +import torch +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import PackbitsBuilder + + +class CompressedBackend(object): + + def __init__(self, mpu=None): + if mpu is None: + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + else: + self.mpu = mpu + self.world_group = self.mpu.get_data_parallel_group() + self.size = dist.get_world_size(group=self.world_group) + self.rank = dist.get_rank(group=self.world_group) + self.packer = PackbitsBuilder().load() + + def my_igather(self, rank, size, group, sendbuf, recvbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) + return req + + def my_gather(self, rank, size, group, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=group) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=group, dst=root) + + def pack(self, buffer, size): + # pack float tensor into uint8 tensor + packed = self.packer.packbits(buffer.float(), buffer.numel(), self.rank) + return packed.reshape(size, -1) + + def unpack(self, buffer, size, dtype): + # unpack uint8 to float tensor + unpacked = self.packer.unpackbits(buffer, buffer.numel(), self.rank) + return unpacked.reshape(size, -1).to(dtype) + + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): + original_shape = buffer_m.size() + if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) + + # align size of original_buffer and error + original_size = buffer_m.numel() + worker_error_size = worker_error.numel() + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.linalg.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + sign_list_packed_tmp = self.pack(buffer_m, self.size).type(torch.int8) + + recvbuf_sign = torch.zeros([self.size, len(sign_list_packed_tmp[self.rank])], + dtype=sign_list_packed_tmp[0].dtype, + device=sign_list_packed_tmp.device) + + sign_list_packed = [sign_list_packed_tmp[idx] for idx in range(self.size)] + + recvbuf_scale = [ + torch.zeros(1, dtype=worker_scale.dtype, device=get_accelerator().current_device_name()) + for _ in range(self.size) + ] + + # communication phase 1 + # all to all for sign + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group) + # all gather for scale + dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) + + flattened_recvbuf_sign = recvbuf_sign.type(torch.uint8).flatten() + compensated_server_m = self.unpack(flattened_recvbuf_sign, self.size, torch.float32) \ + .mul_(torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) + + compensated_server_m.add_(server_error) + + server_scale = torch.linalg.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel()) + + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + server_sign_packed = 
self.pack(compensated_server_m, 1).type(torch.int8) + + # recvbuf_sign_server + recvbuf_sign_server_tmp = torch.zeros([self.size, len(server_sign_packed[0])], + dtype=recvbuf_sign.dtype, + device=server_sign_packed.device) + + recvbuf_sign_server = [recvbuf_sign_server_tmp[idx] for idx in range(self.size)] + + # recvbuf_scale_server + recvbuf_scale_server_tmp = torch.zeros([self.size, 1], + dtype=worker_scale.dtype, + device=server_sign_packed.device) + + recvbuf_scale_server = [recvbuf_scale_server_tmp[idx] for idx in range(self.size)] + + # communication Phase 2 + dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group) + dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) + + recvbuf_sign_server = torch.stack(recvbuf_sign_server) + + flattened_recvbuf_sign_server = recvbuf_sign_server.type(torch.uint8).flatten() + + buffer_m.data.copy_( + self.unpack(flattened_recvbuf_sign_server, self.size, + torch.float32).mul_(recvbuf_scale_server_tmp).flatten().data) + + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) + + return buffer_m diff --git a/deepspeed/runtime/comm/hccl.py b/deepspeed/runtime/comm/hccl.py new file mode 100644 index 000000000000..b8639c7da4c9 --- /dev/null +++ b/deepspeed/runtime/comm/hccl.py @@ -0,0 +1,124 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import numpy as np +import torch +import torch_npu +import deepspeed.comm as dist + + +class HcclBackend(object): + + def __init__(self, mpu=None): + if mpu is None: + self.world_group = dist.new_group(ranks=range(dist.get_world_size())) + else: + self.mpu = mpu + self.world_group = self.mpu.get_data_parallel_group() + self.size = dist.get_world_size(group=self.world_group) + self.rank = dist.get_rank(group=self.world_group) + + def my_igather(self, rank, size, group, sendbuf, recvbuf, root): + req = [] + if rank == root: + for idx in range(size): + if idx != rank: + req.append(dist.irecv(recvbuf[idx], src=idx, group=group)) + else: + recvbuf[rank] = sendbuf + else: + req.append(dist.isend(sendbuf, group=group, dst=root)) + return req + + def my_gather(self, rank, size, group, sendbuf, recvbuf, root): + if rank == root: + for idx in range(size): + if idx != rank: + dist.recv(recvbuf[idx], src=idx, group=group) + else: + recvbuf[rank] = sendbuf + else: + dist.send(sendbuf, group=group, dst=root) + + def compressed_allreduce(self, buffer_m: torch.tensor, worker_error, server_error, local_rank): + original_shape = buffer_m.size() + if len(original_shape) > 1: + buffer_m = torch.flatten(buffer_m) + + # align size of original_buffer and error + original_size = buffer_m.numel() + worker_error_size = worker_error.numel() + if original_size != worker_error_size: + empty_tensor = torch.zeros(worker_error_size - original_size, device=buffer_m.device) + buffer_m = torch.cat([buffer_m, empty_tensor]) + + buffer_m.add_(worker_error) + worker_scale = torch.linalg.norm(buffer_m) / np.sqrt(torch.numel(buffer_m)) + + worker_error.set_(buffer_m - worker_scale * buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + sign_list_packed_tmp = torch_npu.npu_sign_bits_pack(buffer_m, self.size).type(torch.int8) + + recvbuf_sign = torch.zeros([self.size, len(sign_list_packed_tmp[self.rank])], + dtype=sign_list_packed_tmp[0].dtype, + device=sign_list_packed_tmp.device) + + sign_list_packed = [sign_list_packed_tmp[idx] for idx in 
range(self.size)] + + recvbuf_scale = [ + torch.zeros(1, dtype=worker_scale.dtype, device=torch.device(local_rank)) for _ in range(self.size) + ] + + # communication phase 1 + # all to all for sign + dist.all_to_all_single(recvbuf_sign, torch.stack(sign_list_packed), group=self.world_group) + # all gather for scale + dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group) + + flattened_recvbuf_sign = recvbuf_sign.type(torch.uint8).flatten() + compensated_server_m = torch_npu.npu_sign_bits_unpack(flattened_recvbuf_sign, self.size, torch.float32) \ + .mul_(torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0) + + compensated_server_m.add_(server_error) + + server_scale = torch.linalg.norm(compensated_server_m) / np.sqrt(compensated_server_m.numel()) + + server_error.set_(compensated_server_m - + server_scale * compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)) + + server_sign_packed = torch_npu.npu_sign_bits_pack(compensated_server_m, 1).type(torch.int8) + + # recvbuf_sign_server + recvbuf_sign_server_tmp = torch.zeros([self.size, len(server_sign_packed[0])], + dtype=recvbuf_sign.dtype, + device=server_sign_packed.device) + + recvbuf_sign_server = [recvbuf_sign_server_tmp[idx] for idx in range(self.size)] + + # recvbuf_scale_server + recvbuf_scale_server_tmp = torch.zeros([self.size, 1], + dtype=worker_scale.dtype, + device=server_sign_packed.device) + + recvbuf_scale_server = [recvbuf_scale_server_tmp[idx] for idx in range(self.size)] + + # communication Phase 2 + dist.all_gather(recvbuf_sign_server, server_sign_packed[0], group=self.world_group) + dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group) + + recvbuf_sign_server = torch.stack(recvbuf_sign_server) + + flattened_recvbuf_sign_server = recvbuf_sign_server.type(torch.uint8).flatten() + + buffer_m.data.copy_( + torch_npu.npu_sign_bits_unpack(flattened_recvbuf_sign_server, self.size, + torch.float32).mul_(recvbuf_scale_server_tmp).flatten().data) + + if original_size != worker_error_size: + buffer_m = buffer_m[0:original_size] + if len(original_shape) > 1: + buffer_m = buffer_m.reshape(original_shape) + + return buffer_m diff --git a/deepspeed/runtime/comm/nccl.py b/deepspeed/runtime/comm/nccl.py index 231f841ae8b7..a57b7519a295 100644 --- a/deepspeed/runtime/comm/nccl.py +++ b/deepspeed/runtime/comm/nccl.py @@ -9,7 +9,7 @@ import numpy as np from deepspeed.runtime.compression.cupy import CupyBackend -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator diff --git a/deepspeed/runtime/compiler.py b/deepspeed/runtime/compiler.py new file mode 100644 index 000000000000..be778b83f8bb --- /dev/null +++ b/deepspeed/runtime/compiler.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from deepspeed.utils.torch import required_torch_version + +try: + from torch.compiler import is_compiling as torch_is_compiling +except ImportError: + try: + from torch._dynamo.external_utils import is_compiling as torch_is_compiling + except ImportError: + # Torch does not have compiler support + torch_is_compiling = lambda: False + + +def is_compile_supported(): + return required_torch_version(min_version=2.1) + + +def disable(func): + if is_compile_supported(): + return torch.compiler.disable(func) + return func + + +def is_compiling(): + return torch_is_compiling() diff --git a/deepspeed/runtime/compression/cupy.py b/deepspeed/runtime/compression/cupy.py index b959a9c20372..7133ac04ed2b 100644 --- a/deepspeed/runtime/compression/cupy.py +++ b/deepspeed/runtime/compression/cupy.py @@ -14,10 +14,10 @@ def __init__(self): pass def torch2cupy(self, tensor): - return cupy.fromDlpack(to_dlpack(tensor)) + return cupy.from_dlpack(to_dlpack(tensor)) def cupy2torch(self, cupy_tensor): - return from_dlpack(cupy_tensor.toDlpack()) + return from_dlpack(cupy_tensor) def compress_by_chunk(self, cupy_bool_tensor, num_chunks): packed_sign = cupy.packbits(cupy_bool_tensor) diff --git a/deepspeed/runtime/config.py b/deepspeed/runtime/config.py index b49469b94f11..b6dabc161e8c 100755 --- a/deepspeed/runtime/config.py +++ b/deepspeed/runtime/config.py @@ -62,9 +62,12 @@ from ..compression.constants import * from .swap_tensor.aio_config import get_aio_config +from .tensor_parallel import get_tensor_parallel_config from .data_pipeline.config import get_data_efficiency_enabled, get_data_efficiency_config, get_curriculum_enabled_legacy, get_curriculum_params_legacy from .data_pipeline.constants import * +from ..utils.config import get_timers_config + TENSOR_CORE_ALIGN_SIZE = 8 ADAGRAD_OPTIMIZER = 'adagrad' @@ -168,6 +171,14 @@ def get_bfloat16_enabled(param_dict): return False +def get_bfloat16_immediate_grad_update(param_dict): + for key in [BFLOAT16, BFLOAT16_OLD]: + if key in param_dict.keys(): + return get_scalar_param(param_dict[key], BFLOAT16_IMMEDIATE_GRAD_UPDATE, + BFLOAT16_IMMEDIATE_GRAD_UPDATE_DEFAULT) + return False + + def get_fp16_master_weights_and_grads_enabled(param_dict): if get_fp16_enabled(param_dict): return get_scalar_param(param_dict[FP16], FP16_MASTER_WEIGHTS_AND_GRADS, FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT) @@ -249,10 +260,10 @@ def get_communication_data_type(param_dict, return torch.float32 elif val == "fp16": return torch.float16 - elif val == "bfp16": + elif val == "bf16": return torch.bfloat16 - raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bfp16', 'fp32']. Got: {val}") + raise ValueError(f"Invalid communication_data_type. Supported data types: ['fp16', 'bf16', 'fp32']. 
Got: {val}") def get_prescale_gradients(param_dict): @@ -279,6 +290,10 @@ def get_gradient_clipping(param_dict): return get_scalar_param(param_dict, GRADIENT_CLIPPING, GRADIENT_CLIPPING_DEFAULT) +def get_graph_harvesting(param_dict): + return get_scalar_param(param_dict, GRAPH_HARVESTING, GRAPH_HARVESTING_DEFAULT) + + def get_sparse_attention(param_dict): if SPARSE_ATTENTION in param_dict.keys(): sparsity = param_dict[SPARSE_ATTENTION] @@ -691,7 +706,7 @@ def write_config(self, filename): class DeepSpeedConfig(object): - def __init__(self, config: Union[str, dict], mpu=None): + def __init__(self, config: Union[str, dict], mpu=None, mesh_device=None): super(DeepSpeedConfig, self).__init__() if isinstance(config, dict): self._param_dict = config @@ -707,14 +722,16 @@ def __init__(self, config: Union[str, dict], mpu=None): ) try: self.global_rank = dist.get_rank() - if mpu is None: - self.world_size = dist.get_world_size() - else: + if mpu is not None: self.world_size = mpu.get_data_parallel_world_size() + elif mesh_device is not None: + self.world_size = dist.get_world_size(mesh_device.get_group(mesh_dim="data_parallel")) + else: + self.world_size = dist.get_world_size() except: self.global_rank = 0 self.world_size = 1 - + logger.info(f"Config mesh_device {mesh_device} world_size = {self.world_size}") # If elastic-mode enabled, update compute + update _param_dict self.elasticity_enabled = elasticity_enabled(self._param_dict) if self.elasticity_enabled: @@ -813,6 +830,7 @@ def _initialize_params(self, param_dict): self.fp16_enabled = get_fp16_enabled(param_dict) self.fp16_auto_cast = get_fp16_auto_cast(param_dict) self.bfloat16_enabled = get_bfloat16_enabled(param_dict) + self.bfloat16_immediate_grad_update = get_bfloat16_immediate_grad_update(param_dict) assert not (self.fp16_enabled and self.bfloat16_enabled), 'bfloat16 and fp16 modes cannot be simultaneously enabled' self.fp16_master_weights_and_gradients = get_fp16_master_weights_and_grads_enabled(param_dict) @@ -823,6 +841,7 @@ def _initialize_params(self, param_dict): self.dynamic_loss_scale_args = get_dynamic_loss_scale_args(param_dict) self.compression_config = get_compression_config(param_dict) + self.graph_harvesting = get_graph_harvesting(param_dict) self.optimizer_name = get_optimizer_name(param_dict) if (self.optimizer_name is not None and self.optimizer_name.lower() in DEEPSPEED_OPTIMIZERS): @@ -894,6 +913,9 @@ def _initialize_params(self, param_dict): self.weight_quantization_config = WeightQuantConfig( **param_dict['weight_quantization']) if 'weight_quantization' in param_dict else None + self.timers_config = get_timers_config(param_dict) + self.tensor_parallel_config = get_tensor_parallel_config(param_dict) + def _batch_assertion(self): train_batch = self.train_batch_size @@ -992,8 +1014,8 @@ def _do_error_check(self): self.gradient_accumulation_steps), "DeepSpeedConfig: {} is not defined".format(GRADIENT_ACCUMULATION_STEPS) if self.zero_enabled: - assert (self.zero_optimization_stage <= - ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( + assert (self.zero_optimization_stage + <= ZeroStageEnum.max_stage), "DeepSpeedConfig: Maximum supported ZeRO stage is {}".format( ZeroStageEnum.max_stage) if self.fp16_master_weights_and_gradients: diff --git a/deepspeed/runtime/config_utils.py b/deepspeed/runtime/config_utils.py index 5522a8e79d69..d5c3a1548360 100755 --- a/deepspeed/runtime/config_utils.py +++ b/deepspeed/runtime/config_utils.py @@ -5,11 +5,12 @@ """ Collection of DeepSpeed 
configuration utilities """ -import json import collections -import collections.abc +import json +import torch from functools import reduce -from deepspeed.pydantic_v1 import BaseModel +from pydantic import BaseModel, ConfigDict, field_serializer + from deepspeed.utils import logger @@ -54,67 +55,73 @@ def __init__(self, strict=False, **data): if (not strict): # This is temporary until we refactor all DS configs, allows HF to load models data = {k: v for k, v in data.items() if (v != "auto" or k == "replace_method")} super().__init__(**data) - self._deprecated_fields_check(self) + self._deprecated_fields_check() - def _process_deprecated_field(self, pydantic_config, field): + def _process_deprecated_field(self, dep_field): # Get information about the deprecated field - fields_set = pydantic_config.__fields_set__ - dep_param = field.name - kwargs = field.field_info.extra + pydantic_config = self + fields_set = pydantic_config.model_fields_set + kwargs = pydantic_config.model_fields[dep_field].json_schema_extra new_param_fn = kwargs.get("new_param_fn", lambda x: x) - param_value = new_param_fn(getattr(pydantic_config, dep_param)) - new_param = kwargs.get("new_param", "") + param_value = new_param_fn(getattr(pydantic_config, dep_field)) + new_field = kwargs.get("new_param", "") dep_msg = kwargs.get("deprecated_msg", "") - if dep_param in fields_set: - logger.warning(f"Config parameter {dep_param} is deprecated" + - (f" use {new_param} instead" if new_param else "") + (f". {dep_msg}" if dep_msg else "")) + if dep_field in fields_set: + logger.warning(f"Config parameter {dep_field} is deprecated" + + (f" use {new_field} instead" if new_field else "") + (f". {dep_msg}" if dep_msg else "")) # Check if there is a new param and if it should be set with a value - if new_param and kwargs.get("set_new_param", True): + if new_field and kwargs.get("set_new_param", True): # Remove the deprecate field if there is a replacing field try: - delattr(pydantic_config, dep_param) + delattr(pydantic_config, dep_field) except Exception as e: - logger.error(f"Tried removing deprecated '{dep_param}' from config") + logger.error(f"Tried removing deprecated '{dep_field}' from config") raise e # Set new param value - new_param_nested = new_param.split(".") + new_param_nested = new_field.split(".") if len(new_param_nested) > 1: # If the new param exists in a subconfig, we need to get # the fields set for that subconfig pydantic_config = reduce(getattr, new_param_nested[:-1], pydantic_config) - fields_set = pydantic_config.__fields_set__ + fields_set = pydantic_config.model_fields_set new_param_name = new_param_nested[-1] assert ( new_param_name not in fields_set - ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together" + ), f"Cannot provide deprecated parameter '{dep_field}' and replacing parameter '{new_field}' together" # A custom function for converting the old param value to new param value can be provided try: setattr(pydantic_config, new_param_name, param_value) except Exception as e: - logger.error(f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'") + logger.error(f"Tried setting value for '{new_field}' with value from deprecated '{dep_field}'") raise e - def _deprecated_fields_check(self, pydantic_config): - fields = pydantic_config.__fields__ - for field in fields.values(): - if field.field_info.extra.get("deprecated", False): - self._process_deprecated_field(pydantic_config, field) + def _deprecated_fields_check(self): + fields = 
self.model_fields + for field_name, field_info in fields.items(): + if field_info.json_schema_extra and field_info.json_schema_extra.get("deprecated", False): + self._process_deprecated_field(field_name) + + model_config = ConfigDict( + validate_default=True, + validate_assignment=True, + use_enum_values=True, + populate_by_name=True, + extra="forbid", + arbitrary_types_allowed=True, + protected_namespaces=(), + ) - class Config: - validate_all = True - validate_assignment = True - use_enum_values = True - allow_population_by_field_name = True - extra = "forbid" - arbitrary_types_allowed = True + @field_serializer("dtype", check_fields=False) + def serialize_torch_dtype(dtype: torch.dtype) -> str: + return str(dtype) def get_config_default(config, field_name): - assert field_name in config.__fields__, f"'{field_name}' is not a field in {config}" - assert not config.__fields__.get( - field_name).required, f"'{field_name}' is a required field and does not have a default value" - return config.__fields__.get(field_name).default + assert field_name in config.model_fields, f"'{field_name}' is not a field in {config}" + assert not config.model_fields.get( + field_name).is_required(), f"'{field_name}' is a required field and does not have a default value" + return config.model_fields.get(field_name).get_default() class pp_int(int): diff --git a/deepspeed/runtime/constants.py b/deepspeed/runtime/constants.py index 96f2a38bd05c..679230ca7d4c 100755 --- a/deepspeed/runtime/constants.py +++ b/deepspeed/runtime/constants.py @@ -126,6 +126,10 @@ BFLOAT16_ENABLED = "enabled" BFLOAT16_ENABLED_DEFAULT = False +# BFLOAT16 optimizer immediate gradient update +BFLOAT16_IMMEDIATE_GRAD_UPDATE = "immediate_grad_update" +BFLOAT16_IMMEDIATE_GRAD_UPDATE_DEFAULT = False + ######################################### # FP16 support ######################################### @@ -210,6 +214,18 @@ GRADIENT_CLIPPING = 'gradient_clipping' GRADIENT_CLIPPING_DEFAULT = 0. +######################################### +# Capture graph for short kernels sequences +######################################### +# Graph harvesting. By default, this feature is not enabled. 
+# Users can configure in ds_config.json as below example: +GRAPH_HARVESTING_FORMAT = ''' +Graph harvesting should be enabled as: +"graph_harvesting": true +''' +GRAPH_HARVESTING = 'graph_harvesting' +GRAPH_HARVESTING_DEFAULT = False + ######################################### # Communication data type ######################################### diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py index cb0d366ce798..93d351169834 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_analyzer.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import sys from collections import defaultdict import csv import time @@ -12,9 +13,10 @@ import torch from torch.utils.data import BatchSampler, SequentialSampler, DataLoader, Subset +import deepspeed.comm as dist from deepspeed.utils import logger -from .indexed_dataset import MMapIndexedDataset -from .utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype +from deepspeed.runtime.data_pipeline.data_sampling.indexed_dataset import MMapIndexedDataset, valid_dtypes +from deepspeed.runtime.data_pipeline.data_sampling.utils import split_dataset, split_index, create_mmap_dataset_builder, close_mmap_dataset_builder, find_fit_int_dtype class DataAnalyzer(object): @@ -36,7 +38,8 @@ def __init__(self, custom_map_init=None, custom_map_update=None, custom_map_finalize=None, - custom_reduce=None): + custom_reduce=None, + sample_indices=None): super().__init__() self.dataset = dataset self.num_workers = num_workers @@ -55,15 +58,14 @@ def __init__(self, self.custom_map_update = custom_map_update self.custom_map_finalize = custom_map_finalize self.custom_reduce = custom_reduce + self.sample_indices = sample_indices def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtypes, save_path, worker_id): metric_results = [] for m_idx in range(len(metric_names)): metric_name, metric_type, metric_dtype = metric_names[m_idx], \ metric_types[m_idx], metric_dtypes[m_idx] - assert metric_dtype not in [ - np.float64, np.double - ], "Currently floating point metric values are not supported. Please change your metric into integer values (and potentially multiply a larger coefficient to keep the precision)." + assert metric_dtype in valid_dtypes, f"metric_dtype {metric_dtype} not supported. 
Supported dtypes {valid_dtypes}" metric_save_path = f"{save_path}/{metric_name}/worker{worker_id}_thread{thread_id}/" os.makedirs(metric_save_path, exist_ok=True) if metric_type == 'single_value_per_sample': @@ -84,16 +86,34 @@ def init_metric_results(self, thread_id, metric_names, metric_types, metric_dtyp metric_results.append({"metric_value": metric_value, "metric_value_fname": metric_value_fname}) return metric_results - def update_metric_results(self, data, metric_types, metric_functions, metric_results): + def update_metric_results(self, + data, + metric_types, + metric_dtypes, + metric_functions, + metric_results, + batch_start_idx=0): for m_idx in range(len(metric_types)): - metric_type, metric_function, metric_result = metric_types[m_idx], \ - metric_functions[m_idx], metric_results[m_idx] + metric_type, metric_dtype, metric_function, metric_result = metric_types[m_idx], \ + metric_dtypes[m_idx], metric_functions[m_idx], metric_results[m_idx] + metric_values = metric_function(data) + + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + assert metric_values.dtype == metric_dtype, \ + f"metric_function result dtype {metric_values.dtype} does not match metric_dtype {metric_dtype}" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + if metric_type == 'single_value_per_sample': - metric_values = metric_function(data) for row in range(metric_values.size()[0]): + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case, idx provided in 'index' field + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] metric_result["sample_to_metric_builder"].add_item(metric_values[row].reshape(-1)) - metric_result["metric_to_sample_dict"][metric_values[row].item()].append( - data['index'][row][0].item()) + metric_result["metric_to_sample_dict"][metric_values[row].item()].append(sample_idx) for m_value in metric_result["metric_to_sample_dict"]: if len(metric_result["metric_to_sample_dict"][m_value]) > 100: metric_fname = metric_result["metric_to_sample_fname"] @@ -102,7 +122,6 @@ def update_metric_results(self, data, metric_types, metric_functions, metric_res writer.writerows([metric_result["metric_to_sample_dict"][m_value]]) metric_result["metric_to_sample_dict"][m_value] = [] elif metric_type == 'accumulate_value_over_samples': - metric_values = metric_function(data) if metric_result["metric_value"] is None: metric_result["metric_value"] = metric_values else: @@ -136,15 +155,12 @@ def run_map_helper(self, thread_id): f"on data subset {start_idx} to {end_idx}") thread_dataset = Subset(self.dataset, list(range(start_idx, end_idx))) sampler = BatchSampler(SequentialSampler(thread_dataset), batch_size=self.batch_size, drop_last=False) - if self.collate_fn is None: - iterator = iter(DataLoader(thread_dataset, batch_sampler=sampler, num_workers=0, pin_memory=False)) - else: - iterator = iter( - DataLoader(thread_dataset, - batch_sampler=sampler, - num_workers=0, - collate_fn=self.collate_fn, - pin_memory=False)) + iterator = iter( + DataLoader(thread_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False)) if self.custom_map_init is None: metric_results = self.init_metric_results(thread_id, self.metric_names, self.metric_types, 
self.metric_dtypes, self.save_path, self.worker_id) @@ -157,11 +173,14 @@ def run_map_helper(self, thread_id): while True: try: data = next(iterator) + batch_start_idx = start_idx + processed_sample if self.custom_map_update is None: - self.update_metric_results(data, self.metric_types, self.metric_functions, metric_results) + self.update_metric_results(data, self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) else: - self.custom_map_update(data, self.metric_types, self.metric_functions, metric_results) - processed_sample += self.batch_size + self.custom_map_update(data, self.metric_types, self.metric_dtypes, self.metric_functions, + metric_results, batch_start_idx) + processed_sample += len(data) duration = (time.time() - start) / 3600.0 remain_duration = duration * total_sample / processed_sample - duration logger.info( @@ -367,26 +386,10 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ index_to_metric_builder.merge_file_(chunk_im_fname) close_mmap_dataset_builder(index_to_sample_builder, index_to_sample_fname) close_mmap_dataset_builder(index_to_metric_builder, index_to_metric_fname) - num_sample_per_value = {} - index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) - index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) - index_to_sample_merged_fname = f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" - index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, - sample_idx_dtype) - for v_idx in range(len(index_to_sample)): - if v_idx > 0: - assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] - num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) - assert sum(num_sample_per_value.values()) == total_num_samples - merge_step = max(1, len(index_to_sample) // 100) - for v_idx in range(0, len(index_to_sample), merge_step): - merged_samples = np.copy( - np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], - axis=None)) - index_to_sample_merged_builder.add_item( - torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) - logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") - close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + + num_sample_per_value = DataAnalyzer.output_index_to_sample_percentile( + index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) self.get_metric_value_percentiles(metric_name, num_sample_per_value, total_num_samples) elif metric_type == 'accumulate_value_over_samples': metric_save_path = f"{save_path}/{metric_name}/" @@ -408,6 +411,29 @@ def merge_map_results(self, dataset, metric_names, metric_types, save_path, num_ metric_value_builder.add_item(torch.tensor(metric_value.astype(np.int64), dtype=torch.long)) close_mmap_dataset_builder(metric_value_builder, metric_value_fname) + @staticmethod + def output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, metric_name, metric_save_path, + total_num_samples, sample_idx_dtype): + """ read index_to_metric and index_to_sample files and write distribution to index_to_sample_percentage_merged """ + num_sample_per_value = {} + index_to_sample = MMapIndexedDataset(index_to_sample_fname, skip_warmup=True) + index_to_metric = MMapIndexedDataset(index_to_metric_fname, skip_warmup=True) + index_to_sample_merged_fname = 
f"{metric_save_path}/{metric_name}_index_to_sample_percentile_merged" + index_to_sample_merged_builder = create_mmap_dataset_builder(index_to_sample_merged_fname, sample_idx_dtype) + for v_idx in range(len(index_to_sample)): + if v_idx > 0: + assert index_to_metric[v_idx] > index_to_metric[v_idx - 1] + num_sample_per_value[index_to_metric[v_idx][0]] = len(index_to_sample[v_idx]) + assert sum(list(num_sample_per_value.values())) == total_num_samples + merge_step = max(1, len(index_to_sample) // 100) + for v_idx in range(0, len(index_to_sample), merge_step): + merged_samples = np.copy( + np.concatenate(index_to_sample[v_idx:min(len(index_to_sample), (v_idx + merge_step))], axis=None)) + index_to_sample_merged_builder.add_item(torch.tensor(merged_samples.astype(np.int64), dtype=torch.long)) + logger.info(f"Finished merging index_to_sample {v_idx} to {v_idx+merge_step}.") + close_mmap_dataset_builder(index_to_sample_merged_builder, index_to_sample_merged_fname) + return num_sample_per_value + def run_reduce(self): if self.custom_reduce is None: self.merge_map_results(self.dataset, self.metric_names, self.metric_types, self.save_path, @@ -415,3 +441,440 @@ def run_reduce(self): else: self.custom_reduce(self.dataset, self.metric_names, self.metric_types, self.save_path, self.num_workers, self.num_threads, self.num_threads_reduce) + + def run_map_reduce(self, comm_group=None): + self.run_map() + # wait for the mapping operation, where all nodes outputs their own (partial) result files + dist.barrier(group=comm_group) + if self.worker_id == 0: + self.run_reduce() + # wait for the reduce, where rank 0 merges all (partial) files. Dataset can then be used by all nodes. + dist.barrier(group=comm_group) + + +class DistributedDataAnalyzer(object): + + def __init__( + self, + dataset, + num_workers=1, + num_threads=1, + worker_id=0, + batch_size=1, + metric_names=[], + metric_functions=[], + metric_types=[], + save_path="./", + collate_fn=None, + device='cuda', + comm_group=None, + sample_indices=None, + ) -> None: + self.dataset = dataset + self.batch_size = batch_size + self.metric_names = metric_names + self.metric_functions = metric_functions + self.metric_types = metric_types + self.save_path = save_path + self.collate_fn = collate_fn + self.device = device + self.sample_indices = sample_indices + self.num_threads = num_threads + self.worker_id = worker_id + + if not dist.is_initialized(): + dist.init_distributed() + + # comm_group and worker_id+num_workers are mutually exclusive + self.comm_group = comm_group + if self.comm_group is None: + # self.comm_group = deepspeed.utils.groups._clone_world_group() + self.num_workers = num_workers + self.worker_id = worker_id + else: + self.num_workers = self.comm_group.size() + self.worker_id = self.comm_group.rank() + + if self.worker_id == 0: + logger.info(f"Distributed data analyzer initialized with {self.num_workers} workers.") + + def run_map_helper(self, thread_id=0, metric_queues=None): + thread_start_idx, thread_end_idx = self.thread_splits[thread_id][0], self.thread_splits[thread_id][1] + worker_dataset = Subset(self.dataset, list(range(thread_start_idx, thread_end_idx))) + sampler = BatchSampler(SequentialSampler(worker_dataset), batch_size=self.batch_size, drop_last=False) + dataloader = DataLoader(dataset=worker_dataset, + batch_sampler=sampler, + num_workers=0, + collate_fn=self.collate_fn, + pin_memory=False) + + # set initial results list + metric_results = [] + for metric_type in self.metric_types: + assert metric_type in 
['single_value_per_sample', 'accumulate_value_over_samples'], \ + f"metric_type {metric_type} not implemented." + metric_results.append([] if metric_type == 'single_value_per_sample' else None) + + # iterate dataloader and store metric results + batch_start_idx = thread_start_idx + for data in dataloader: + for m_idx in range(len(self.metric_names)): + metric_type, metric_function = self.metric_types[m_idx], self.metric_functions[m_idx] + metric_values = metric_function(data) + assert torch.is_tensor(metric_values) or isinstance(metric_values, np.ndarray), \ + "metric_function must return a tensor or array" + if isinstance(metric_values, np.ndarray): + metric_values = torch.from_numpy(metric_values) + assert metric_values.dtype in valid_dtypes, \ + f"metric_function result dtype {metric_values.dtype} not supported. Supported dtypes {valid_dtypes}" + + if metric_type == 'single_value_per_sample': + for row in range(metric_values.size()[0]): + value = metric_values[row].item() + sample_idx = batch_start_idx + row # sample idx following dataset iteration order + if isinstance(data, dict) and 'index' in data: # Megatron use case + sample_idx = data['index'][row][0].item() + elif self.sample_indices is not None: # user defined shuffling of indices + sample_idx = self.sample_indices[sample_idx] + metric_results[m_idx].append((value, sample_idx)) + elif metric_type == 'accumulate_value_over_samples': + if metric_results[m_idx] is None: + metric_results[m_idx] = metric_values + else: + metric_results[m_idx].add_(metric_values) + batch_start_idx += len(data) + + if self.num_threads == 1: + return metric_results + + # copy metric_results to the shared queue + assert metric_queues + for m_idx in range(len(self.metric_names)): + results = metric_results[m_idx] + if torch.is_tensor(results): + results = results.item() if results.dim() == 0 else results.tolist() + try: + metric_queues[m_idx].put((thread_id, results)) + except Exception as e: + logger.error(f"Error putting metric results to queue: {e}") + sys.exit(1) + + def run_map_reduce(self): + + # setup individual dataloaders + self.worker_splits, self.thread_splits = split_dataset(self.dataset, + self.num_workers, + self.worker_id, + num_threads=self.num_threads) + node_start_idx, node_end_idx = self.worker_splits[self.worker_id] + logger.info(f"worker {self.worker_id} working on data subset {node_start_idx} to {node_end_idx}.") + + if self.num_threads in [0, 1, None]: + metric_results = self.run_map_helper() + metric_results = [torch.tensor(m).to(self.device) for m in metric_results] + else: + + # create a shared queue of results per metric to be populated by individual threads + with Manager() as manager: + metric_queues = [manager.Queue() for _ in self.metric_names] + threads = [ + Process(target=self.run_map_helper, args=(t, metric_queues)) for t in range(self.num_threads) + ] + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + # gather results from shared queues into metric_results + metric_results = [None for _ in self.metric_names] + for m_idx, (queue, metric_type) in enumerate(zip(metric_queues, self.metric_types)): + while not queue.empty(): + t_idx, t_results = queue.get() + t_start_idx, t_end_idx = self.thread_splits[t_idx] + if t_start_idx >= t_end_idx: # no results from this thread + continue #corner case for small datasets and high thread count + t_results = torch.tensor(t_results) + if metric_type == 'single_value_per_sample': + # add thread results to the metric_results list, ordered by thread idx 
+ if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(node_end_idx - node_start_idx, + t_results.size(1)).to(self.device) + metric_results[m_idx][t_start_idx - node_start_idx:t_end_idx - node_start_idx] = t_results + else: + if metric_results[m_idx] is None: # initialize if needed + metric_results[m_idx] = torch.zeros(t_results.size()).to(self.device) + metric_results[m_idx].add_(t_results) + + # compute dtype for sample ids + total_num_samples = len(self.dataset) + sample_idx_dtype = find_fit_int_dtype(0, total_num_samples - 1) + logger.info(f"Total number of data samples: {total_num_samples}.") + logger.info(f"Will use {sample_idx_dtype} to store the sample indexes.") + + for m_idx in range(len(self.metric_names)): + metric_values, metric_name, metric_type = \ + metric_results[m_idx], self.metric_names[m_idx], self.metric_types[m_idx] + metric_save_path = f"{self.save_path}/{metric_name}/" + os.makedirs(metric_save_path, exist_ok=True) + + if metric_type == 'single_value_per_sample': + + # Compute sample and metric value dtypes based on range + values, samples = metric_values[:, 0], metric_values[:, 1] + value_min, value_max = Dist.min_max(values, self.comm_group) + sample_min, sample_max = Dist.min_max(samples, self.comm_group) + metric_value_dtype = find_fit_int_dtype(value_min, value_max) + sample_value_dtype = find_fit_int_dtype(sample_min, sample_max) + + # sample_to_metric maps sample ids to metric values, as a list of metric values + sample_to_metric_fname = f"{metric_save_path}/{metric_name}_sample_to_metric" + values = [torch.tensor([x]) for x in metric_values[:, 0]] + self.file_write_ordered(values, sample_to_metric_fname, metric_value_dtype) + + # distributed sorting by values, gives an ordered disjoint subset of keys on nodes + metric_values = Dist.sample_sort(metric_values, self.comm_group, self.num_workers) + metric_to_samples_dict = {} + if len(metric_values) > 0: + for value, sample in metric_values: + if value.item() not in metric_to_samples_dict: + metric_to_samples_dict[value.item()] = [] + metric_to_samples_dict[value.item()].append(sample.item()) + + # index_to_metric and index_to_sample serialize a dicitonary from metric to samples + # index_to_metric stores a key per row, index_to_sample stores the values per row + values = [torch.tensor([x]) for x in metric_to_samples_dict.keys()] + samples = [torch.tensor(metric_to_samples_dict[x]) for x in metric_to_samples_dict.keys()] + index_to_metric_fname = f"{metric_save_path}/{metric_name}_index_to_metric" #dict keys + index_to_sample_fname = f"{metric_save_path}/{metric_name}_index_to_sample" #dict values + self.file_write_ordered(values, index_to_metric_fname, metric_value_dtype) + self.file_write_ordered(samples, index_to_sample_fname, sample_value_dtype) + + if self.worker_id == 0: + DataAnalyzer.output_index_to_sample_percentile(index_to_sample_fname, index_to_metric_fname, + metric_name, metric_save_path, total_num_samples, + sample_idx_dtype) + dist.barrier(self.comm_group) + + elif metric_type == 'accumulate_value_over_samples': + metric_value_fname = f"{metric_save_path}/{metric_name}_metric_value" + dist.reduce(metric_values, dst=0, op=dist.ReduceOp.SUM, group=self.comm_group) + metric_value_dtype = find_fit_int_dtype(metric_values.min(), metric_values.max()) + + if self.worker_id == 0: + builder = create_mmap_dataset_builder(metric_value_fname, metric_value_dtype) + builder.add_item(metric_values.cpu()) + close_mmap_dataset_builder(builder, metric_value_fname) + 
dist.barrier(self.comm_group) + + def file_write_ordered(self, tensor_list, fname, numpy_dtype): + """ MPI_file_write_ordered extended to write a list of tensors, by one rank, iteratively """ + + # each node has a list of rows (tensors) to be written to the file. + # we will serialize it in order to communicate it in one comm step. + + tkwargs = dict(dtype=torch.int64, device=self.device) + + # 1. gather on rank 0 the number of rows to be sent/recv + row_count = torch.tensor([len(tensor_list)], **tkwargs) + row_counts = torch.zeros(self.num_workers, **tkwargs) + dist.all_gather_into_tensor(row_counts, row_count, group=self.comm_group) + assert row_counts[self.worker_id] == row_count == len(tensor_list), "all_gather failed" + + # 2. gather on rank 0 the sizes of the rows to be sent/recv + row_len = torch.tensor([len(l) for l in tensor_list], **tkwargs) + row_lens = Dist.gather_v(row_len, 0, self.comm_group, self.num_workers, self.worker_id) + + # 4. gather on rank 0 of the total size (sum of all row lengths) to be received + size = torch.tensor([sum(row_len).item()], **tkwargs) + sizes = torch.zeros(self.num_workers, **tkwargs) + dist.all_gather_into_tensor(sizes, size, group=self.comm_group) + assert sizes[self.worker_id] == size.item(), "all_gather did not return the same sizes" #sanity check + + # method to deserializes a buffer into rows of different lengths and write them to file + def write_buffer_to_file(buff, src, builder): + assert self.worker_id == 0, "only rank 0 can write to file" + + # collect all buffers and write them at once + buff = buff.cpu().detach().numpy() + row_offsets = np.cumsum([0] + row_lens[src].tolist()) + arr_list = [] + for i in range(len(row_lens[src])): + arr_list.append(buff[row_offsets[i]:row_offsets[i + 1]]) + builder.add_items(arr_list) + + # 5. 
rank 0 prepares output folder and file + if self.worker_id == 0: + os.makedirs(os.path.dirname(fname), exist_ok=True) + builder = create_mmap_dataset_builder(fname, numpy_dtype) + + # iterate through ranks that have data to be sent/recv/written + for src in [rank for rank, count in enumerate(row_counts) if count > 0]: + + dist.barrier(group=self.comm_group) + if self.worker_id == 0 and src == 0: # rank 0's write its own data + buffer = torch.cat(tensor_list, dim=0).to(self.device) + write_buffer_to_file(buffer, 0, builder) + elif self.worker_id == 0 and src > 0: # rank 0 receives other rank's data and writes it + buffer = torch.empty(sizes[src].item(), dtype=buffer.dtype, device=buffer.device) + err = dist.recv(buffer, src=src, group=self.comm_group, tag=src) + assert err == src and len(buffer) > 0, "recv failed" + write_buffer_to_file(buffer, src, builder) + elif self.worker_id == src: # current rank sends data to rank 0 + buffer = torch.cat(tensor_list, dim=0).to(self.device) + dist.send(buffer, 0, group=self.comm_group, tag=src) + + # rank 0 closes the file + if self.worker_id == 0: + close_mmap_dataset_builder(builder, fname) # close file + dist.barrier(self.comm_group) + + +class Dist: + """ auxiliary class to perform distributed operations on tensors""" + + @staticmethod + def min_max(tensor, comm_group): + """ given a distributed tensor, return the min/max values across all ranks""" + + value_min, value_max = tensor.min(), tensor.max() + dist.reduce(value_min, 0, op=dist.ReduceOp.MIN, group=comm_group) + dist.reduce(value_max, 0, op=dist.ReduceOp.MAX, group=comm_group) + return value_min.item(), value_max.item() + + @staticmethod + def gather_v(tensor, dst, comm_group, num_workers, worker_id): + """ MPI_Gatherv. gather tensors of variable sizes in a single rank """ + + # gather the number of rows to be sent/recv + size = torch.tensor([len(tensor)], dtype=torch.int64, device=tensor.device) + sizes = torch.zeros(num_workers, dtype=torch.int64, device=tensor.device) + dist.all_gather_into_tensor(sizes, size, group=comm_group) + assert sizes[worker_id] == size, "all_gather failed" + + # all_gather requires all tensors to be of same size so we need to pad them + max_size = max(sizes).item() + buffer = torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) + buffer[0:size] = tensor.data + buffer_list = None + if worker_id == 0: # create padded recv buffers + buffer_list = [torch.empty(max_size, dtype=tensor.dtype, device=tensor.device) for _ in range(num_workers)] + dist.gather(buffer, buffer_list, dst=dst, group=comm_group) + + # revert padding and return value + if worker_id == 0: + buffer_list = [r[:s.item()] for r, s in zip(buffer_list, sizes)] + return buffer_list + + @staticmethod + def sample_sort(tensor, comm_group, num_workers, n_samples=100): + """ perform a distributed random sort of a tensor, and returns the sorted partial tensor""" + device, dims = tensor.device, tensor.size()[1] + + # 1 - sort rows by first column, then second column, then third, etc... 
+ tensor = torch.tensor(sorted(tensor.tolist()), dtype=tensor.dtype, device=tensor.device) + + # 2 - collect few samples per rank + idx = torch.round(torch.linspace(0, len(tensor) - 1, n_samples)).to(int) + samples = tensor[idx][:, 0].contiguous().to(device) #only first column, all but last row + + # 2 - Allgather samples + all_samples = [torch.zeros(n_samples, dtype=samples.dtype, device=device) for _ in range(num_workers)] + dist.all_gather(all_samples, samples, group=comm_group) + all_samples = torch.cat(all_samples, dim=0).to(device) + + # 3 - Sort all samples and collect the ranges of each rank as equidistant + all_samples = all_samples.sort()[0] + idx = torch.round(torch.linspace(0, len(all_samples) - 1, num_workers + 1)).to(int) + ranges = all_samples[idx] # range of each rank r as ranges[r] <= x < ranges[r+1] + ranges[-1] += 1 # increase upper limit of last rank so that x < ranges[r+1]. + + # 4 - collect elements to send to each rank, based on the rank ranges + send = [] + for rank in range(num_workers): + mask = (tensor[:, 0] >= ranges[rank]) & (tensor[:, 0] < ranges[rank + 1]) + send.append(tensor[mask]) + + # 5. all to all to communicate the sizes to be sent/recv + send_count = [torch.tensor([len(s) * dims], dtype=torch.int64, device=device) for s in send] + recv_count = list(torch.empty([num_workers], dtype=torch.int64, device=device).chunk(num_workers)) + dist.all_to_all(recv_count, send_count, group=comm_group) + + # 6. all-to-all-v to communicate the elements to be sent/recv as a single tensor + send = torch.cat(send, dim=0).flatten().to(device) + recv = torch.zeros(sum(recv_count), dtype=send.dtype).to(device) + send_count = [s.item() for s in send_count] # convert to list of ints + recv_count = [r.item() for r in recv_count] + dist.all_to_all_single(recv, send, recv_count, send_count, group=comm_group) + del send + + # 7. the received tensor is the 1D disjoint subset of the distributed tensor. + # We will recover the original dimensionality and sort it by columns again. 
+ recv = recv.view(-1, dims) + recv = torch.tensor(sorted(recv.tolist()), dtype=recv.dtype, device=recv.device) + return recv + + +def test_compare_both_data_analyzers(dataset): + """ given a dataset, compare file and memory based data analyser""" + + id = lambda t: t.to(torch.int64) # identity + batch_sum = lambda t: id(t).sum() #sum batch + num_threads = 4 + kwargs = dict( + dataset=dataset, + batch_size=2**10, + worker_id=int(os.environ['RANK']), + num_workers=int(os.environ['WORLD_SIZE']), + metric_names=["mod", "batch_sum"], + metric_functions=[id, batch_sum], + metric_types=['single_value_per_sample', 'accumulate_value_over_samples'], + num_threads=num_threads, + ) + + dda = DistributedDataAnalyzer( + save_path="./output_dist", + device=f"cuda:{int(os.environ['LOCAL_RANK'])}", + **kwargs, + ) + start_time = time.time() + dda.run_map_reduce() + if dda.worker_id == 0: + print("DistributedDataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + da = DataAnalyzer(num_threads_reduce=num_threads, + save_path="./output_disk", + metric_dtypes=[torch.int64, torch.int64], + **kwargs) + start_time = time.time() + da.run_map_reduce() + if da.worker_id == 0: + print("DataAnalyzer runtime: %s seconds " % (time.time() - start_time)) + + output_paths = [ + "batch_sum/batch_sum_metric_value.bin", "batch_sum/batch_sum_metric_value.idx", \ + "mod/mod_index_to_metric.bin", "mod/mod_index_to_metric.idx", \ + "mod/mod_index_to_sample.bin", "mod/mod_index_to_sample.idx", \ + "mod/mod_index_to_sample_percentile_merged.bin", "mod/mod_index_to_sample_percentile_merged.idx", \ + "mod/mod_sample_to_metric.bin", "mod/mod_sample_to_metric.idx" + ] + + if dda.worker_id == 0: + for path in output_paths: + with open(os.path.join(da.save_path, path), 'rb') as f1, \ + open(os.path.join(dda.save_path, path), 'rb') as f2: + if f1.read() != f2.read(): + print(f"files {path} are not identical.") + + +if __name__ == "__main__": + + class TestDataset(torch.utils.data.Dataset): + + def __init__(self, size=10_000_000): + self.values = [(x + 7) % 10_000 for x in range(size)] + self.size = size + + __len__ = lambda self: self.size + __getitem__ = lambda self, idx: self.values[idx] + + test_compare_both_data_analyzers(TestDataset()) diff --git a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py index ef845e4bc490..100bef3f7946 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/data_sampler.py @@ -119,9 +119,15 @@ def set_custom_curriculum_learning_schedule(self, schedule_func_dict): if metric in schedule_func_dict: self.curriculum_schedulers[metric].set_custom_get_difficulty(schedule_func_dict[metric]) - def get_start_end_idx(self): - start_idx = self.data_parallel_rank * self.micro_batch_size - end_idx = start_idx + self.micro_batch_size + def get_start_end_idx(self, batch_len=None): + """ + given the length of a minibatch (defaults to micro-batch size * data_parallel_size), + return the start and end indices of the current data parallel rank + """ + batch_len = batch_len or self.micro_batch_times_data_parallel_size + start_idx_fn = lambda r: round(r * batch_len / self.data_parallel_group.size()) + start_idx = start_idx_fn(self.data_parallel_rank) + end_idx = start_idx_fn(self.data_parallel_rank + 1) return start_idx, end_idx def get_sample_based_on_metric_value(self, metric, value_start, value_end): @@ -281,12 +287,17 @@ def get_next_global_batch(self): for cidx in 
range(len(samples_per_cluster)): batch += self.get_sample_from_cluster(cidx, samples_per_cluster[cidx]) self.np_rng.shuffle(batch) + + # broadcast tensor must have same shape across participants. So we fill batch with -1s when not full + assert len(batch) <= self.global_batch_size + batch += [-1] * (self.global_batch_size - len(batch)) batch = torch.tensor(batch, device=get_accelerator().current_device_name(), dtype=torch.long).view(-1) else: batch = torch.empty(self.global_batch_size, device=get_accelerator().current_device_name(), dtype=torch.long) dist.broadcast(batch, 0, group=self.data_parallel_group) + batch = batch[batch != -1] # remove trailing -1s used to fill incomplete batch tensor self.batch = batch.tolist() def __iter__(self): @@ -297,7 +308,7 @@ def __iter__(self): self.batch = self.batch[self.micro_batch_times_data_parallel_size:] if len(current_batch) == self.micro_batch_times_data_parallel_size or \ (len(current_batch) > 0 and not self.drop_last): - start_idx, end_idx = self.get_start_end_idx() + start_idx, end_idx = self.get_start_end_idx(len(current_batch)) yield current_batch[start_idx:end_idx] self.consumed_samples += len(current_batch) current_batch = [] diff --git a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py index 60115fa6efef..872d05de0145 100644 --- a/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py +++ b/deepspeed/runtime/data_pipeline/data_sampling/indexed_dataset.py @@ -98,25 +98,26 @@ def write_longs(f, a): f.write(np.array(a, dtype=np.int64)) +# valid metric_dtypes as numpy and torch types dtypes = { - 1: np.uint8, - 2: np.int8, - 3: np.int16, - 4: np.int32, - 5: np.int64, - 6: np.float64, - 7: np.double, - 8: np.uint16, - 9: np.uint32, - 10: np.uint64 + 1: (np.uint8, torch.uint8), + 2: (np.int8, torch.int8), + 3: (np.int16, torch.int16), + 4: (np.int32, torch.int32), + 5: (np.int64, torch.int64), + 6: (np.uint16, None), + 7: (np.uint32, None), + 8: (np.uint64, None), } +valid_dtypes = set([dt[0] for dt in dtypes.values()] + [dt[1] for dt in dtypes.values() if dt[1] is not None]) + def code(dtype): - for k in dtypes.keys(): - if dtypes[k] == dtype: - return k - raise ValueError(dtype) + for c, (np_dt, torch_dt) in dtypes.items(): + if dtype in [np_dt, torch_dt]: + return c + raise ValueError(f"{dtype} not supported. Supported types: {valid_dtypes}") def index_file_path(prefix_path): @@ -153,7 +154,7 @@ def read_index(self, path): version = f.read(8) assert struct.unpack(' [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + return context_layer + + +class ShardedAttention(DominoModule): + """Sharded self-attention layer class. 
+ Only support self attention and causal attention mask + """ + + def __init__(self, + config, + layer_number, + mpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + attention_type=AttnType.self_attn, + attn_mask_type=AttnMaskType.causal): + super(ShardedAttention, self).__init__() + self.layer_number = max(1, layer_number) + self.attention_type = attention_type + self.attn_mask_type = attn_mask_type + self.params_dtype = config.params_dtype + self.apply_rotary_pos_emb = apply_rotary_pos_emb + + query_projection_size = config.kv_channels * config.num_attention_heads + kv_projection_size = config.kv_channels * config.num_attention_heads + + # Per attention head and per partition values. + world_size = mpu.get_tensor_model_parallel_world_size() + self.hidden_size_per_attention_head = query_projection_size // config.num_attention_heads + self.num_attention_heads_per_partition = config.num_attention_heads // world_size + + self.query_key_value = ColumnParallelLinear(config.hidden_size, + query_projection_size + 2 * kv_projection_size, + config=config, + init_method=config.init_method, + bias=config.add_bias_linear, + gather_output=False) + + self.core_attention = CoreAttention(config, self.layer_number, mpu, self.attn_mask_type) + + self.dense = RowParallelLinearNoComm(query_projection_size, + config.hidden_size, + config=config, + init_method=config.output_layer_init_method, + bias=config.add_bias_linear, + input_is_parallel=True, + skip_bias_add=True) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + + # Query, Key, and Value + # Attention heads [s, b, h] --> [s, b, np * 3 * hn)] + mixed_x_layer, _ = self.query_key_value(hidden_states) + + # [s, b, np * 3 * hn] --> [s, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + ( + self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head, + ) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [s, b, np, 3 * hn] -> [b, np, s, 3*hn] + mixed_x_layer = mixed_x_layer.permute(1, 2, 0, 3).contiguous() + + # [s, b, np, 3 * hn] --> [s, b, np, hn], [s, b, np, hn], [s, b, np, hn] + (query_layer, key_layer, value_layer) = torch.split(mixed_x_layer, [ + self.hidden_size_per_attention_head, self.hidden_size_per_attention_head, + self.hidden_size_per_attention_head + ], + dim=3) + # [s, b, np, np * hn] -> [s, b, np, hn] + query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, + self.hidden_size_per_attention_head) + + # apply rotary embedding + if rotary_pos_emb is not None: + if isinstance(rotary_pos_emb, tuple): + rotary_pos_emb = rotary_pos_emb + else: + rotary_pos_emb = ((rotary_pos_emb, ) * 2) + q_pos_emb, k_pos_emb = rotary_pos_emb + query_layer = self.apply_rotary_pos_emb(query_layer, q_pos_emb) + key_layer = self.apply_rotary_pos_emb(key_layer, k_pos_emb) + + context_layer = self.core_attention(query_layer, key_layer, value_layer, attention_mask) + + # Output. [s, b, h] + output, bias = self.dense(context_layer) + + return output, bias + + +class DominoTransformerLayer(DominoModule): + """A domino single transformer layer. 
+ [s, b, h] -> [s, b, h] + """ + + def __init__(self, + config, + layer_number, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + skip_bias_add=True, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.causal, + drop_path_rate=0., + output_bias=None): + super(DominoTransformerLayer, self).__init__() + + if not dist.is_initialized(): + dist.init_distributed() + assert dist.is_initialized(), "deepspeed.comm is not initialized!" + + self.llama_model = config.llama_model + self.layer_number = layer_number + self.layer_type = layer_type + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.bias_dropout_add_fused_train = bias_dropout_add_fused_train + self.bias_dropout_add_fused_inference = bias_dropout_add_fused_inference + self.mpu = mpu + self.output_bias = output_bias + + # Layernorm on the input data. + self.input_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + # Self attention. + self.self_attention = ShardedAttention(config, + layer_number, + mpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + attention_type=AttnType.self_attn, + attn_mask_type=self_attn_mask_type) + + self.hidden_dropout = config.hidden_dropout + + # Layernorm on the attention output + self.post_attention_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + # ------------ init mlp start ------------ + init_method = config.init_method + output_layer_init_method = config.output_layer_init_method + self.add_bias = config.add_bias_linear + self.skip_bias_add = skip_bias_add + + ffn_hidden_size = config.ffn_hidden_size + if config.gated_linear_unit: + ffn_hidden_size *= 2 + self.output_size_c = config.ffn_hidden_size + self.input_size_c = config.hidden_size + self.input_size_r = config.ffn_hidden_size + self.output_size_r = self.input_size_c + + world_size = mpu.get_tensor_model_parallel_world_size() + self.output_size_per_partition = self.output_size_c // world_size + self.input_size_per_partition = self.input_size_r // world_size + + # Initialize weight. 
+ self.weight_c = Parameter( + torch.empty(self.output_size_per_partition, + self.input_size_c, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + self.weight_r = Parameter( + torch.empty(self.output_size_r, + self.input_size_per_partition, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + + if config.perform_initialization: + _initialize_affine_weight_gpu(self.weight_c, init_method, partition_dim=0, stride=1) + + _initialize_affine_weight_gpu(self.weight_r, output_layer_init_method, partition_dim=1, stride=1) + + if self.add_bias: + self.bias_c = Parameter( + torch.empty(self.output_size_per_partition, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + self.bias_r = Parameter( + torch.empty(self.output_size_r, + device=get_accelerator().current_device_name(), + dtype=config.params_dtype)) + if config.perform_initialization: + with torch.no_grad(): + self.bias_c.zero_() + self.bias_r.zero_() + else: + self.register_parameter('bias_c', None) + self.register_parameter('bias_r', None) + + if config.swiglu: + + def swiglu(x): + x = torch.chunk(x, 2, dim=-1) + return F.silu(x[0]) * x[1] + + self.mlp_activation_func = swiglu + else: + self.mlp_activation_func = F.gelu + # ------------ init mlp end ------------ + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + hidden_states0, hidden_states1 = hidden_states + + layernorm_output0 = self.input_layernorm(hidden_states0) + layernorm_output1 = self.input_layernorm(hidden_states1) + + if not self.llama_model: + rotary_pos_emb = None + + attention_output0, attention_bias0 = \ + self.self_attention( + layernorm_output0, + attention_mask, + rotary_pos_emb=rotary_pos_emb) + handle0 = dist.all_reduce(attention_output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True) + + attention_output1, attention_bias1 = \ + self.self_attention( + layernorm_output1, + attention_mask, + rotary_pos_emb=rotary_pos_emb) + handle1 = dist.all_reduce(attention_output1, group=self.mpu.get_tensor_model_parallel_group(), async_op=True) + handle0.wait() + + # Residual0 connection. + if self.apply_residual_connection_post_layernorm: + residual0 = layernorm_output0 + else: + residual0 = hidden_states0 + + if self.training: + bias_dropout_add_func = self.bias_dropout_add_fused_train + else: + bias_dropout_add_func = self.bias_dropout_add_fused_inference + if attention_bias0 is not None: + attention_bias0 = attention_bias0.expand_as(residual0) + layernorm_input0 = bias_dropout_add_func(attention_output0, attention_bias0, residual0, self.hidden_dropout) + + layernorm_output0 = self.post_attention_layernorm(layernorm_input0) + layernorm_output0 = no_oper(layernorm_output0, handle_dic, f'{self.layer_number}_0') + + # Residual1 connection. 
+ if self.apply_residual_connection_post_layernorm: + residual1 = layernorm_output1 + else: + residual1 = hidden_states1 + + if attention_bias1 is not None: + attention_bias1 = attention_bias1.expand_as(residual1) + layernorm_input1 = bias_dropout_add_func(attention_output1, attention_bias1, residual1, self.hidden_dropout) + + layernorm_output1 = self.post_attention_layernorm(layernorm_input1) + layernorm_output1 = no_oper(layernorm_output1, handle_dic, f'{self.layer_number}_1') + + # ------------ explicit mlp start ------------ + bias_c = self.bias_c if not self.skip_bias_add else None + + input0 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output0, handle_dic, + f'{self.layer_number}_0') + # Batch0 Matrix multiply. + output0 = torch.matmul(input0, self.weight_c.t()) + if bias_c is not None: + output0 = output0 + bias_c + output0 = self.mlp_activation_func(output0) + output0 = torch.matmul(output0, self.weight_r.t()) + handle2 = dist.all_reduce(output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True) + + handle1.wait() + + input1 = copy_to_tensor_model_parallel_region_a(self.mpu, layernorm_output1, handle_dic, + f'{self.layer_number}_1') + # Batch1 Matrix multiply. + output1 = torch.matmul(input1, self.weight_c.t()) + output1 = self.mlp_activation_func(output1) + if bias_c is not None: + output1 = output1 + bias_c + output1 = torch.matmul(output1, self.weight_r.t()) + dist.all_reduce(output1, group=self.mpu.get_tensor_model_parallel_group()) + + handle2.wait() + + output0 = output0 + self.bias_r if self.bias_r is not None else output0 + output1 = output1 + self.bias_r if self.bias_r is not None else output1 + output_bias = self.output_bias + mlp_output0, mlp_output1, mlp_bias0, mlp_bias1 = output0, output1, output_bias, output_bias + # ------------ explicit mlp end ------------ + + if self.apply_residual_connection_post_layernorm: + residual0 = layernorm_output0 + residual1 = layernorm_output1 + else: + residual0 = layernorm_input0 + residual1 = layernorm_input1 + + if mlp_bias0 is not None: + mlp_bias0 = mlp_bias0.expand_as(residual0) + mlp_bias1 = mlp_bias1.expand_as(residual1) + output0 = bias_dropout_add_func(mlp_output0, mlp_bias0, residual0, self.hidden_dropout) + output1 = bias_dropout_add_func(mlp_output1, mlp_bias1, residual1, self.hidden_dropout) + + return output0, output1 + + +class DominoTransformer(DominoModule): + """Transformer class.""" + + def __init__(self, + config, + model_type, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + layer_type=LayerType.encoder, + self_attn_mask_type=AttnMaskType.causal, + pre_process=True, + post_process=True, + post_layer_norm=True, + drop_path_rate=0.0): + super(DominoTransformer, self).__init__() + + self.layer_type = layer_type + self.model_type = model_type + self.post_process = post_process + self.post_layer_norm = post_layer_norm + self.num_layers = config.num_layers + self.drop_path_rate = drop_path_rate + self.drop_path_rates = [rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)] + + def build_layer(layer_number): + return DominoTransformerLayer(config, + layer_number, + mpu, + fused_layer_norm, + _initialize_affine_weight_gpu, + ColumnParallelLinear, + RowParallelLinearNoComm, + apply_rotary_pos_emb, + bias_dropout_add_fused_train, + bias_dropout_add_fused_inference, + layer_type=layer_type, + 
self_attn_mask_type=self_attn_mask_type, + drop_path_rate=self.drop_path_rates[layer_number - 1]) + + self.layers = torch.nn.ModuleList([build_layer(i + 1) for i in range(self.num_layers)]) + + if self.post_process and self.post_layer_norm: + self.final_layernorm = fused_layer_norm(config.hidden_size, + eps=config.layernorm_epsilon, + no_persist_layer_norm=config.no_persist_layer_norm) + + def forward(self, hidden_states, attention_mask, rotary_pos_emb=None): + # hidden_states: [s, b, h] + + for index in range(self.num_layers): + layer = self.layers[index] + hidden_states = layer(hidden_states, attention_mask, rotary_pos_emb) + + hidden_states0, hidden_states1 = hidden_states + if self.post_process and self.post_layer_norm: + hidden_states0 = self.final_layernorm(hidden_states0) + hidden_states1 = self.final_layernorm(hidden_states1) + + return hidden_states0, hidden_states1 diff --git a/deepspeed/runtime/eigenvalue.py b/deepspeed/runtime/eigenvalue.py index df63854dd1ca..a82d8b1d5c7a 100755 --- a/deepspeed/runtime/eigenvalue.py +++ b/deepspeed/runtime/eigenvalue.py @@ -7,6 +7,7 @@ from deepspeed.utils import log_dist import numpy as np import logging +from deepspeed.utils.torch import required_torch_version class Eigenvalue(object): @@ -36,12 +37,15 @@ def __init__(self, ranks=[0]) # Replace all nan/pos-inf/neg-inf to zero - # TODO: Pytorch new version may add this function, replace this one by then. def nan_to_num(self, x): - device = x.device - x = x.cpu().numpy() - x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0) - return torch.from_numpy(x).to(device) + if required_torch_version(min_version=1.8): + return torch.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0) + else: + # Fallback to numpy based implementation for backwards-compatibility with PyTorch 1.7 or older versions. + device = x.device + x = x.cpu().numpy() + x = np.nan_to_num(x=x, copy=False, nan=0.0, posinf=0.0, neginf=0.0) + return torch.from_numpy(x).to(device) def normalize(self, v): norm_squared = self.inner_product(v, v) @@ -110,8 +114,8 @@ def compute_eigenvalue(self, module, device=None, scale=1.0): eigenvalue_current, eigenvalue_previous = 1., 0. 
while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs( - (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= - self.tol): # test convergence criteria + (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) + >= self.tol): # test convergence criteria eigenvalue_previous = eigenvalue_current Hv = torch.autograd.grad(grads, params, grad_outputs=v, only_inputs=True, retain_graph=True) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py old mode 100644 new mode 100755 index 8f62f36f328e..bcbe0f68b04f --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -17,14 +17,15 @@ from torch.optim import Optimizer from torch.optim.lr_scheduler import _LRScheduler from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors +from contextlib import contextmanager -from typing import Callable, Dict, Union, Iterable +from typing import Callable, Dict, Union, Iterable, Container import deepspeed from deepspeed import comm as dist from deepspeed.runtime.utils import see_memory_usage, DummyOptim -from .zero.offload_config import OffloadDeviceEnum +from .zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus from deepspeed.runtime.zero.utils import is_zero_supported_optimizer, ZeRORuntimeException @@ -35,6 +36,9 @@ from deepspeed.runtime.fp16.unfused_optimizer import FP16_UnfusedOptimizer from deepspeed.runtime.bf16_optimizer import BF16_Optimizer +from deepspeed.linear.optimized_linear import LoRAOptimizedLinear +from deepspeed.module_inject.layers import GatherReplacedLayerParams + from deepspeed.runtime.config import DEEPSPEED_OPTIMIZERS, \ ADAGRAD_OPTIMIZER, ADAM_OPTIMIZER, ADAMW_OPTIMIZER, LAMB_OPTIMIZER, ONEBIT_ADAM_OPTIMIZER, ONEBIT_LAMB_OPTIMIZER, \ TORCH_ADAM_PARAM, ADAM_W_MODE, ADAM_W_MODE_DEFAULT, ZERO_ONE_ADAM_OPTIMIZER, MUADAM_OPTIMIZER, MUADAMW_OPTIMIZER, \ @@ -69,10 +73,10 @@ STEP_MICRO_TIMER, \ FORWARD_GLOBAL_TIMER, BACKWARD_GLOBAL_TIMER, BACKWARD_INNER_GLOBAL_TIMER, BACKWARD_REDUCE_GLOBAL_TIMER, \ STEP_GLOBAL_TIMER -from deepspeed.utils.debug import debug_extract_module_and_param_names +from deepspeed.utils.debug import debug_extract_module_and_param_names, debug_clear_module_and_param_names from deepspeed.monitor.monitor import MonitorMaster from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop -from deepspeed.runtime.utils import clip_grad_norm_ +from deepspeed.runtime.utils import clip_grad_norm_, compare_tensors_in_structures from deepspeed.runtime.eigenvalue import Eigenvalue from deepspeed.runtime.data_pipeline.constants import DATA_SAMPLING, \ DATA_ROUTING, DATA_SAMPLING_ENABLED, CURRICULUM_LEARNING, \ @@ -90,10 +94,11 @@ from .pipe.module import PipelineModule from .utils import get_ma_status +from .compiler import is_compile_supported from ..ops.adam import FusedAdam from ..moe.sharded_moe import TopKGate, MOELayer from ..moe.layer import MoE -from ..moe.utils import is_moe_param +from ..moe.utils import is_moe_param, configure_moe_param_groups from ..git_version_info import version from deepspeed.profiling.flops_profiler.profiler import FlopsProfiler @@ -120,21 +125,25 @@ def split_half_float_double_sparse(tensors): device_type = get_accelerator().device_name() - supported_types = [ - "torch.{}.HalfTensor".format(device_type), "torch.{}.FloatTensor".format(device_type), - "torch.{}.DoubleTensor".format(device_type), 
"torch.{}.BFloat16Tensor".format(device_type), - SparseTensor.type() - ] + supported_types = get_accelerator().supported_dtypes() for t in tensors: - assert t.type() in supported_types, f"attempting to reduce an unsupported grad type: {t.type()}" + assert t.dtype in supported_types, f"attempting to reduce an unsupported grad type: {t.dtype}" - buckets = [] + sparse_tensor_buckets, dense_tensor_buckets = [], [] for i, dtype in enumerate(supported_types): - bucket = [t for t in tensors if t.type() == dtype] - if bucket: - buckets.append((dtype, bucket)) - return buckets + sparse_bucket, dense_bucket = [], [] + for t in tensors: + if t.dtype == dtype: + if isinstance(t, SparseTensor): + sparse_bucket.append(t) + else: + dense_bucket.append(t) + if sparse_bucket: + sparse_tensor_buckets.append((dtype, sparse_bucket)) + if dense_bucket: + dense_tensor_buckets.append((dtype, dense_bucket)) + return sparse_tensor_buckets, dense_tensor_buckets class EngineTimers(object): @@ -175,21 +184,20 @@ def __init__(self, enable_micro_timers, enable_global_timers): class DeepSpeedEngine(Module): r"""DeepSpeed engine for training.""" - def __init__( - self, - args, - model, - optimizer=None, - model_parameters=None, - training_data=None, - lr_scheduler=None, - mpu=None, - dist_init_required=None, - collate_fn=None, - config=None, - config_class=None, - dont_change_device=False, - ): + def __init__(self, + args, + model, + optimizer=None, + model_parameters=None, + training_data=None, + lr_scheduler=None, + mpu=None, + dist_init_required=None, + collate_fn=None, + config=None, + config_class=None, + mesh_device=None, + dont_change_device=False): super(DeepSpeedEngine, self).__init__() self.dont_change_device = dont_change_device self.client_optimizer = optimizer @@ -210,6 +218,7 @@ def __init__( self.loaded_checkpoint_mp_world_size = None self.loaded_checkpoint_dp_world_size = None self.enable_backward_allreduce = True + self.inside_no_sync_ctxt = False self.progressive_layer_drop = None self.eigenvalue = None self.block_eigenvalue = None @@ -222,22 +231,24 @@ def __init__( self._step_applied = False self._global_grad_norm = None self.use_ds_comm = False # False --> Use torch.dist, True --> Use ds.comm backend. 
- self.checkpoint_engine = None self._is_gradient_accumulation_boundary = None self.scale_wrt_gas = None - self.losses = 0.0 + self.losses = None + self.mesh_device = mesh_device # for debug purposes - can then debug print: debug_get_module_name(module) debug_extract_module_and_param_names(model) - # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict - self.param_names = {param: name for name, param in model.named_parameters()} + if self.mesh_device: + groups.mesh_device = self.mesh_device self._do_args_sanity_check(args) self._configure_with_arguments(args, mpu) self._do_sanity_check() + if self.autotp_size() > 1: + self._configure_tensor_parallel_states(model) see_memory_usage(f"DeepSpeed Engine: After args sanity test", force=self.memory_breakdown()) if mpu is not None: if self.elasticity_enabled(): @@ -261,6 +272,9 @@ def __init__( # Configure distributed model self._configure_distributed_model(model) + # needed for zero_to_fp32 weights reconstruction to remap nameless data to state_dict + self.param_names = {param: name for name, param in model.named_parameters()} + self._get_model_parameters() see_memory_usage(f"DeepSpeed Engine: After configure distributed model") @@ -268,11 +282,10 @@ def __init__( # Configure wall clock timers self.timers = SynchronizedWallClockTimer() # Throughput timer - self.tput_timer = ThroughputTimer( - batch_size=self.train_batch_size(), - steps_per_output=self.steps_per_print(), - monitor_memory=False, - ) + self.tput_timer = ThroughputTimer(self._config.timers_config, + batch_size=self.train_batch_size(), + steps_per_output=self.steps_per_print(), + monitor_memory=False) log_dist(f"DeepSpeed Flops Profiler Enabled: {self.flops_profiler_enabled()}", ranks=[0]) @@ -302,7 +315,7 @@ def __init__( if has_optimizer: self._configure_optimizer(optimizer, model_parameters) - self._configure_lr_scheduler(lr_scheduler) + self._configure_lr_scheduler() self._report_progress(0) elif self.zero_optimization(): # no optim selected but zero is enabled @@ -324,6 +337,8 @@ def __init__( self.sparse_tensor_module_names.add(name + ".weight") logger.info("Will convert {} to sparse tensor during training".format(name)) + self._optimized_linear_offload_setup() + self.save_non_zero_checkpoint = False self.save_zero_checkpoint = False if not isinstance(self.optimizer, DeepSpeedZeRoOffload): @@ -359,9 +374,114 @@ def __init__( self.flatten = _flatten_dense_tensors self.unflatten = _unflatten_dense_tensors + self._is_compiled = False + + def _optimized_linear_offload_setup(self): + self.optimized_linear_base_weight_sharding = False + self.optimized_linear_lora_enabled = False + offload_ratio = None + for _, module in self.module.named_modules(): + if isinstance(module, LoRAOptimizedLinear): + self.optimized_linear_lora_enabled = True + offload_ratio = None + if offload_ratio is not None: + assert offload_ratio == module.lora_config.offload_ratio, \ + "all lora_config offload ratios should be the same across the model" + offload_ratio = module.lora_config.offload_ratio + if module.zero_shards > 1: + # set attr so checkpoint saving can handle BWS properly + self.optimized_linear_base_weight_sharding = True + + if offload_ratio is None: + # Nothing enabled, do nothing + return + + total_params = 0 + for _, p in self.module.named_parameters(): + if hasattr(p, 'ds_optim_param'): + total_params += p.numel() + + offload_limit = total_params * offload_ratio + logger.info(f'offloading {offload_ratio*100}% of eligible params, specifically {offload_limit} 
params') + total_offloaded = 0 + for _, p in self.module.named_parameters(): + if hasattr(p, 'ds_optim_param'): + if total_offloaded < offload_limit: + total_offloaded += p.numel() + p.ds_offload = True + p.offload() + else: + p.ds_offload = False + + def _configure_tensor_parallel_states(self, model): + """ + Configures the tensor parallel states for the model. + This includes setting up the tensor parallel groups, initializing the TP mesh, + and registering a pre-hook to ensure that the Dataloader inputs are consistent across ranks. + """ + self._set_client_model(model) + + # sanity check + # currently, the compatibility between 'autotp' and 'zero > 1' has not been validated + assert self.zero_optimization_stage( + ) <= 2, "Currently, the compatibility between 'autotp' and 'zero_stage = 3' has not been validated" + + self.mpu = groups + self.mpu._init_tp_mesh_device(tensor_model_parallel_size=self.autotp_size()) + + self.first_dataloader_check = None + + def check_dataloader_inputs_same_across_ranks(module, args, kwargs): + + def broadcast_and_check(args, bcast_rank, bcast_group): + if isinstance(args, tuple): + args = list(args) + if len(args) > 0: + if self.mpu.get_tensor_model_parallel_rank() == 0: + _src_args = [args] + dist.broadcast_object_list(object_list=_src_args, + src=bcast_rank, + group=bcast_group, + device=get_accelerator().current_device()) + # Rank 0 does not need to compare with itself + is_equal = True + else: + _src_args = [None] + dist.broadcast_object_list(object_list=_src_args, + src=bcast_rank, + group=bcast_group, + device=get_accelerator().current_device()) + + is_equal = compare_tensors_in_structures(args, _src_args[0]) + + equal_tensor = torch.tensor(is_equal, + dtype=self.communication_data_type, + device=get_accelerator().current_device()) + dist.all_reduce(equal_tensor, group=bcast_group) + assert torch.equal( + equal_tensor, + torch.tensor(groups.get_tensor_model_parallel_world_size(), + dtype=self.communication_data_type, + device=get_accelerator().current_device()) + ), "Data inconsistency within the TP group. Please check the Dataloader implementation to ensure consistency." 
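The pre-hook above broadcasts the rank-0 inputs with dist.broadcast_object_list and then all-reduces an equality flag across the tensor-parallel group. A condensed, hedged sketch of the same pattern against the default process group (the helper name, the MIN reduction, and the CPU/gloo assumption are illustrative, not DeepSpeed API):

# Sketch: returns True only if every rank holds the same batch tensor.
# Assumes dist.init_process_group() has already been called (gloo shown; with NCCL,
# move `same` to the current CUDA device before the all_reduce).
import torch
import torch.distributed as dist

def inputs_match_across_ranks(batch: torch.Tensor) -> bool:
    payload = [batch] if dist.get_rank() == 0 else [None]
    dist.broadcast_object_list(payload, src=0)        # rank 0 shares its batch
    same = torch.tensor(1 if torch.equal(batch.cpu(), payload[0].cpu()) else 0)
    dist.all_reduce(same, op=dist.ReduceOp.MIN)       # stays 1 only if all ranks agree
    return bool(same.item())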
+ + bcast_rank = self.mpu.get_tensor_model_parallel_src_rank() + bcast_group = self.mpu.get_tensor_model_parallel_group() + + broadcast_and_check(args, bcast_rank, bcast_group) + broadcast_and_check(kwargs, bcast_rank, bcast_group) + + logger.info(f":The Dataloader has passed the TP group consistency check.") + self.first_dataloader_check.remove() + + self.first_dataloader_check = self.module.register_forward_pre_hook(check_dataloader_inputs_same_across_ranks, + prepend=True, + with_kwargs=True) + def destroy(self): if self.optimizer is not None and hasattr(self.optimizer, 'destroy'): self.optimizer.destroy() + debug_clear_module_and_param_names() def _get_model_parameters(self): if self.autotuning_profile_model_info(): @@ -569,6 +689,9 @@ def random_ltd_initialize(self): raise ValueError(f'not yet support') #self.lr_scheduler = lr_schedules.WarmupLayerTokenDecayLR(self.optimizer, self.random_ltd_scheduler) + def get_sequence_parallel_group(self): + return self.seq_parallel_group + def wall_clock_breakdown(self): return self._config.wall_clock_breakdown @@ -743,6 +866,9 @@ def zero_load_from_fp32_weights(self): def zero_elastic_checkpoint(self): return self._config.zero_config.elastic_checkpoint + def zero_nvme_offload_optimizer(self): + return getattr(self.optimizer, "swap_optimizer", False) + def zero_max_live_parameters(self): return self._config.zero_config.max_live_parameters @@ -752,6 +878,9 @@ def zero_max_reuse_distance(self): def zero_prefetch_bucket_size(self): return self._config.zero_config.prefetch_bucket_size + def zero_module_granularity_threshold(self): + return self._config.zero_config.module_granularity_threshold + def zero_param_persistence_threshold(self): return self._config.zero_config.param_persistence_threshold @@ -770,6 +899,12 @@ def zero_legacy_stage1(self): def zero_ignore_unused_parameters(self): return self._config.zero_config.ignore_unused_parameters + def autotp_size(self): + return self._config.tensor_parallel_config.autotp_size + + def graph_harvesting(self): + return self._config.graph_harvesting + def fp16_enabled(self): return self._config.fp16_enabled @@ -845,6 +980,12 @@ def zero_quantized_nontrainable_weights(self): def zero_quantized_gradients(self): return self._config.zero_config.zero_quantized_gradients + def zeropp_loco_param(self): + return self._config.zero_config.zeropp_loco_param + + def zero_log_trace_cache_warnings(self): + return self._config.zero_config.log_trace_cache_warnings + def dump_state(self): return self._config.dump_state @@ -889,19 +1030,19 @@ def _optimizer_has_ckpt_event_prologue(self): def _optimizer_has_ckpt_event_epilogue(self): return self.optimizer is not None and hasattr(self.optimizer, 'checkpoint_event_epilogue') - def _configure_lr_scheduler(self, client_lr_scheduler): - # First check for scheduler in json configuration - lr_scheduler = self._scheduler_from_config(self.optimizer) - if lr_scheduler: - log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0]) - self.lr_scheduler = lr_scheduler - else: - if isinstance(client_lr_scheduler, Callable): + def _configure_lr_scheduler(self): + if self.client_lr_scheduler: + if isinstance(self.client_lr_scheduler, Callable): log_dist('DeepSpeed using client callable to create LR scheduler', ranks=[0]) - self.lr_scheduler = client_lr_scheduler(self.basic_optimizer) + self.lr_scheduler = self.client_lr_scheduler(self.basic_optimizer) else: log_dist('DeepSpeed using client LR scheduler', ranks=[0]) - self.lr_scheduler = client_lr_scheduler + 
self.lr_scheduler = self.client_lr_scheduler + else: + # load lr scheduler from json configuration if lr scheduler is not defined and passed in + lr_scheduler = self._scheduler_from_config(self.optimizer) + log_dist(f"DeepSpeed using configured LR scheduler = {self.scheduler_name()}", ranks=[0]) + self.lr_scheduler = lr_scheduler log_dist(f'DeepSpeed LR Scheduler = {self.lr_scheduler}', ranks=[0]) @@ -955,13 +1096,13 @@ def _set_distributed_vars(self, args): device_rank = args.device_rank if args is not None and hasattr(args, 'device_rank') else self.local_rank if device_rank >= 0: get_accelerator().set_device(device_rank) - self.device = torch.device(get_accelerator().device_name(), device_rank) + self.device = torch.device(get_accelerator().device_name(device_rank)) self.world_size = dist.get_world_size() self.global_rank = dist.get_rank() else: self.world_size = 1 self.global_rank = 0 - self.device = torch.device(get_accelerator().device_name()) + self.device = get_accelerator().device() # Configure based on command line arguments def _configure_with_arguments(self, args, mpu): @@ -1014,6 +1155,12 @@ def _supported_optims(self): # Validate configuration based on command line arguments def _do_sanity_check(self): + if self.fp16_enabled() and not get_accelerator().is_fp16_supported(): + raise ValueError("Type fp16 is not supported on your device.") + + if self.bfloat16_enabled() and not get_accelerator().is_bf16_supported(): + raise ValueError("Type bf16 is not supported on your device.") + expected_optim_types = self._supported_optims() expected_optim_types += [type(None), Callable] assert isinstance(self.client_optimizer, tuple(expected_optim_types)), \ @@ -1038,18 +1185,21 @@ def _broadcast_model(self): def is_replicated(p): if hasattr(p, "ds_status") and p.ds_status is not ZeroParamStatus.AVAILABLE: return False + elif hasattr(p, 'ds_optim_param'): + # do not broadcast OptimizedLinear parameters, they are unique per base weight shard + return False return True - for p in self.module.parameters(): + for n, p in self.module.named_parameters(): # Broadcast the model for different parameters if is_moe_param(p): if torch.is_tensor(p) and is_replicated(p): - dist.broadcast(p, + dist.broadcast(p.data, groups._get_expert_broadcast_src_rank(p.group_name), group=self.expert_data_parallel_group[p.group_name]) else: if torch.is_tensor(p) and is_replicated(p): - dist.broadcast(p, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group) + dist.broadcast(p.data, groups._get_broadcast_src_rank(), group=self.seq_data_parallel_group) @staticmethod def __check_params(model: Module, dtype: torch.dtype) -> None: @@ -1074,11 +1224,25 @@ def _configure_distributed_model(self, model): if self.fp16_enabled(): if is_zero_init_model: self.__check_params(self.module, torch.half) - self.module.half() + # selectively avoid casting specially + # marked parameters to 16-bit + self.module._apply( + lambda t: t.half() if ( + t.is_floating_point() and + not getattr(t, "_deepspeed_no_cast", False) + ) else t + ) elif self.bfloat16_enabled(): if is_zero_init_model: self.__check_params(self.module, torch.bfloat16) - self.module.bfloat16() + # selectively avoid casting specially + # marked parameters to 16-bit + self.module._apply( + lambda t: t.bfloat16() if ( + t.is_floating_point() and + not getattr(t, "_deepspeed_no_cast", False) + ) else t + ) else: self.__check_params(self.module, torch.float) @@ -1115,7 +1279,8 @@ def _configure_distributed_model(self, model): # Query the groups module to get 
information about various parallel groups self.local_all_to_all_group = None if self.zero_quantized_gradients(): - log_dist("Using quantized gradients", ranks=[0]) + message = "Using LoCo quantized gradients" if self.zeropp_loco_param() else "Using quantized gradients" + log_dist(message, ranks=[0]) self.local_all_to_all_group = groups._get_local_all_to_all_group() self.data_parallel_group = groups._get_data_parallel_group() self.dp_world_size = groups._get_data_parallel_world_size() @@ -1127,6 +1292,7 @@ def _configure_distributed_model(self, model): self.sequence_parallel_size = groups._get_sequence_parallel_world_size() if self.sequence_parallel_size > 1: self.communication_data_type = self._config.seq_parallel_communication_data_type + self.seq_parallel_group = groups._get_sequence_parallel_group() if not (self.amp_enabled() or is_zero_init_model): self._broadcast_model() @@ -1180,9 +1346,15 @@ def _do_optimizer_sanity_check(self, basic_optimizer): # data type checks elif model_dtype == grad_accum_dtype: if model_dtype == torch.bfloat16: - raise NotImplementedError( - "Bfloat16 wrapper must use a gradient accumulation type of fp32, enable ZeRO to use Bfloat16 gradient accumulation" - ) + if self.pipeline_parallelism: + logger.warning( + "**** BF16 gradient accumulation is not safe numerically with large number of accumulation steps, proceed with caution *****" + ) + return BFLOAT16 + else: + raise NotImplementedError( + "Bfloat16 wrapper must use a gradient accumulation type of fp32, enable ZeRO to use Bfloat16 gradient accumulation" + ) if model_dtype == torch.float16: return FP16 # else optimizer_wrapper = None @@ -1196,6 +1368,8 @@ def _do_optimizer_sanity_check(self, basic_optimizer): # Configure optimizer def _configure_optimizer(self, client_optimizer, model_parameters): if client_optimizer is None: + if self.has_moe_layers: + model_parameters = configure_moe_param_groups(model_parameters) basic_optimizer = self._configure_basic_optimizer(model_parameters) log_dist(f"Using DeepSpeed Optimizer param name {self.optimizer_name()} as basic optimizer", ranks=[0]) else: @@ -1237,7 +1411,7 @@ def _configure_optimizer(self, client_optimizer, model_parameters): else: self.optimizer = basic_optimizer - log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer_name()), ranks=[0]) + log_dist("DeepSpeed Final Optimizer = {}".format(self.optimizer.__class__.__name__), ranks=[0]) self.compression_scheduler = self._configure_compression_scheduler() self.quantizer = self._configure_quantization() @@ -1444,7 +1618,11 @@ def _configure_bf16_optimizer(self, optimizer): clip_grad=clip_grad, allgather_bucket_size=self.zero_allgather_bucket_size(), dp_process_group=self.seq_data_parallel_group, - timers=timers) + timers=timers, + grad_acc_dtype=self.get_data_types()[1], + graph_harvesting=self.graph_harvesting(), + immediate_grad_update=self._config.bfloat16_immediate_grad_update, + has_moe_layers=self.has_moe_layers) return optimizer @@ -1471,13 +1649,6 @@ def _configure_zero_optimizer(self, optimizer): assert not isinstance(optimizer, DummyOptim), "zero stage {} requires an optimizer".format(zero_stage) log_dist(f'Creating {model_dtype} ZeRO stage {zero_stage} optimizer', ranks=[0]) - # Overlap and contiguous grads are meaningless in stage 1 and are ignored - if zero_stage == ZeroStageEnum.optimizer_states: - overlap_comm = False - round_robin_gradients = False - # Non-MoE requires contiguous grads to be disabled w. 
stage 1 - if not self.has_moe_layers: - contiguous_gradients = False if isinstance(self.module, PipelineModule): if overlap_comm: @@ -1537,6 +1708,8 @@ def _configure_zero_optimizer(self, optimizer): zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=self.zero_quantized_weights(), zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), + zero_module_granularity_threshold=self.zero_module_granularity_threshold(), + log_trace_cache_warnings=self.zero_log_trace_cache_warnings(), ) else: log_dist( @@ -1583,6 +1756,9 @@ def _configure_zero_optimizer(self, optimizer): zero_hpz_partition_size=self.zero_hpz_partition_size(), zero_quantized_weights=self.zero_quantized_weights(), zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights(), + zero_module_granularity_threshold=self.zero_module_granularity_threshold(), + zeropp_loco_param=self.zeropp_loco_param(), + log_trace_cache_warnings=self.zero_log_trace_cache_warnings(), ) else: @@ -1739,14 +1915,17 @@ def eval(self): self.warn_unscaled_loss = True self.module.train(False) - def _scale_loss_by_gas(self, prescaled_loss): + def _scale_loss_by_gas(self, prescaled_loss, eval_micro_batches=None): + # In pipeline evaluation, there is an option to use different micro-bs, which creates different number of + # micro batches, thus the training gas, is not valid in this case. need to use the number of eval_micro_batches + scaling_factor = self.gradient_accumulation_steps() if eval_micro_batches is None else eval_micro_batches if isinstance(prescaled_loss, torch.Tensor): - scaled_loss = prescaled_loss / self.gradient_accumulation_steps() + scaled_loss = prescaled_loss / scaling_factor elif isinstance(prescaled_loss, tuple) or isinstance(prescaled_loss, list): scaled_loss = [] for l in prescaled_loss: if isinstance(l, torch.Tensor): - scaled_loss.append(l / self.gradient_accumulation_steps()) + scaled_loss.append(l / scaling_factor) else: scaled_loss.append(l) else: @@ -1851,7 +2030,7 @@ def _cast_inputs_half(self, inputs): for k, v in inputs.items(): new_inputs[k] = self._cast_inputs_half(v) return new_inputs - elif hasattr(inputs, 'half'): + elif hasattr(inputs, 'half') and inputs.is_floating_point(): return inputs.half() else: return inputs @@ -1881,9 +2060,6 @@ def print_forward_breakdown(self, fwd_time): @instrument_w_nvtx def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): - assert not (self.bfloat16_enabled() and self.pipeline_parallelism), \ - f'allreduce_gradients() is not valid when bfloat+pipeline_parallelism is enabled' - # Pass (PP) gas boundary flag to optimizer (required for zero) self.optimizer.is_gradient_accumulation_boundary = self.is_gradient_accumulation_boundary() # ZeRO stage >= 2 communicates during non gradient accumulation boundaries as well @@ -1896,14 +2072,34 @@ def allreduce_gradients(self, bucket_size=MEMORY_OPT_ALLREDUCE_SIZE): self.optimizer, 'reduce_gradients'): self.optimizer.reduce_gradients(pipeline_parallel=self.pipeline_parallelism) else: - self.buffered_allreduce_fallback(elements_per_buffer=bucket_size) + grads = None + self.buffered_allreduce_fallback(grads=grads, elements_per_buffer=bucket_size) + + @contextmanager + def no_sync(self): + r""" + Context manager to disable gradient reduction during backward pass. + This context manager has the following effects on other DeepSpeed features. + 1. Incompatible with ZeRO stage 2/3 which rely on reduction for gradient partitioning. + 2. 
It is illegal to call engine.step() within the context manager. + 3. Tracking of gradient accumulation steps is disabled. + """ + assert not self.zero_optimization_partition_gradients(), \ + f"no_sync context manager is incompatible with gradient partitioning logic of ZeRO stage {self.zero_optimization_stage()}" + + assert not self.inside_no_sync_ctxt, f"no_sync context manager reentry is unsupported" + + self.inside_no_sync_ctxt = True + try: + yield + finally: + self.inside_no_sync_ctxt = False @instrument_w_nvtx - def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_graph=False, scale_wrt_gas=True): + def backward(self, loss, release_loss=False, retain_graph=False, scale_wrt_gas=True): r"""Execute backward pass on the loss Arguments: loss: Torch tensor on which to execute backward propagation - allreduce_gradients: is deprecated, ignored, and will soon be removed' retain_graph: bool, default: false forward on user defined choice of retain_graph """ @@ -1913,21 +2109,21 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr if self.scale_wrt_gas is not None: scale_wrt_gas = self.scale_wrt_gas - if not allreduce_gradients: - logger.warning(f"Argument `allreduce_gradients` is deprecated, ignored, and will soon be removed") + do_gradient_reduction = self.enable_backward_allreduce and not self.inside_no_sync_ctxt - # scale loss w.r.t. gradient accumulation if needed - if self.gradient_accumulation_steps() > 1 and scale_wrt_gas: + # scale loss w.r.t. gradient accumulation if reduction is not disabled + if do_gradient_reduction and self.gradient_accumulation_steps() > 1 and scale_wrt_gas: loss = self._scale_loss_by_gas(loss.float()) # Log training loss - self.losses += loss.mean().item() + mean_loss = loss.mean().detach() + self.losses = mean_loss if self.losses is None else self.losses + mean_loss if self.monitor.enabled: if self.is_gradient_accumulation_boundary(): if self.global_rank == 0: self.summary_events = [( f"Train/Samples/train_loss", - self.losses, + self.losses.item(), self.global_samples, )] self.monitor.write_events(self.summary_events) @@ -1965,7 +2161,7 @@ def backward(self, loss, allreduce_gradients=True, release_loss=False, retain_gr self._start_timers(self.engine_timers.backward_reduce_timers) - if allreduce_gradients and self.enable_backward_allreduce: + if do_gradient_reduction: # Traditional code path that allreduces the module parameter grads self.allreduce_gradients() @@ -2069,8 +2265,6 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): else: self.zero_grad() - report_progress = self.global_rank == 0 if self.global_rank else True - # Check overflow here since in DS fp16 optimizer, the overflow is updated in above step() function. 
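The no_sync() context manager introduced above mirrors the familiar DDP idiom: gradient reduction is skipped for micro-steps executed inside the context, performed by the final backward() outside it, and step() must not be called inside. A usage sketch in which model_engine, data_loader and accum_steps are placeholders rather than names from this codebase:

# Sketch: skip gradient reduction on accumulation micro-steps, reduce on the last one.
for step, batch in enumerate(data_loader):
    if (step + 1) % accum_steps != 0:
        with model_engine.no_sync():          # no allreduce for this micro-step
            loss = model_engine(batch)
            model_engine.backward(loss)
    else:
        loss = model_engine(batch)
        model_engine.backward(loss)           # gradients are reduced here
        model_engine.step()                   # step() must be called outside no_sync()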
overflow = False if hasattr(self.optimizer, "overflow"): @@ -2090,10 +2284,12 @@ def _take_model_step(self, lr_kwargs, block_eigenvalue={}): # pipe_engine.train_batch() self.lr_scheduler.step(self.train_batch_size()) - if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: - self._report_progress(self.global_steps + 1) + if self.steps_per_print() is not None: + report_progress = self.global_rank == 0 if self.global_rank else True + if report_progress and (self.global_steps + 1) % self.steps_per_print() == 0: + self._report_progress(self.global_steps + 1) - self.losses = 0.0 + self.losses = None self.global_steps += 1 self.global_samples += self.train_batch_size() @@ -2101,6 +2297,9 @@ def step(self, lr_kwargs=None): r"""Execute the weight update step after forward and backward propagation on effective_train_batch. """ + assert not self.inside_no_sync_ctxt, \ + "It is illegal to call Engine.step() inside no_sync context manager" + see_memory_usage("Engine before step", force=self.memory_breakdown()) # Check early because self.global_steps is incremented at some point here. @@ -2303,7 +2502,7 @@ def _report_progress(self, step): mom = self.get_mom() log_dist(f"step={step}, skipped={self.skipped_steps}, lr={lr}, mom={mom}", ranks=[0]) - def allreduce_bucket(self, bucket, dp_group): + def allreduce_bucket(self, bucket, dp_group, dp_world_size=None): tensor = self.flatten(bucket) tensor_to_allreduce = tensor @@ -2311,16 +2510,18 @@ def allreduce_bucket(self, bucket, dp_group): if self.communication_data_type != tensor.dtype: tensor_to_allreduce = tensor.to(self.communication_data_type) + if dp_world_size is None: + dp_world_size = dist.get_world_size(group=dp_group) if self.postscale_gradients(): if self.gradient_predivide_factor() != 1.0: tensor_to_allreduce.mul_(1.0 / self.gradient_predivide_factor()) dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.gradient_average: - if self.gradient_predivide_factor() != dist.get_world_size(group=dp_group): - tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dist.get_world_size(group=dp_group)) + if self.gradient_predivide_factor() != dp_world_size: + tensor_to_allreduce.mul_(self.gradient_predivide_factor() / dp_world_size) else: - tensor_to_allreduce.mul_(1. / dist.get_world_size(group=dp_group)) + tensor_to_allreduce.mul_(1. 
/ dp_world_size) dist.all_reduce(tensor_to_allreduce, group=dp_group) if self.communication_data_type != tensor.dtype and tensor is not tensor_to_allreduce: @@ -2328,23 +2529,23 @@ def allreduce_bucket(self, bucket, dp_group): return tensor - def allreduce_and_copy(self, small_bucket, dp_group): - allreduced = self.allreduce_bucket(small_bucket, dp_group) + def allreduce_and_copy(self, small_bucket, dp_group, dp_world_size=None): + allreduced = self.allreduce_bucket(small_bucket, dp_group, dp_world_size) for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)): buf.copy_(synced) - def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=500000000): + def allreduce_no_retain(self, bucket, dp_group, numel_per_bucket=500000000, dp_world_size=None): small_bucket = [] numel = 0 for tensor in bucket: small_bucket.append(tensor) numel = numel + tensor.numel() if numel > numel_per_bucket: - self.allreduce_and_copy(small_bucket, dp_group) + self.allreduce_and_copy(small_bucket, dp_group, dp_world_size) small_bucket = [] numel = 0 if len(small_bucket) > 0: - self.allreduce_and_copy(small_bucket, dp_group) + self.allreduce_and_copy(small_bucket, dp_group, dp_world_size) def _get_gradients_for_reduction(self): non_expert_grads = [] @@ -2378,36 +2579,56 @@ def _get_gradients_for_reduction(self): return non_expert_grads, expert_grads def _reduce_non_expert_gradients(self, grads, elements_per_buffer): - split_buckets = split_half_float_double_sparse(grads) - for _, bucket_tuple in enumerate(split_buckets): - bucket_type, bucket = bucket_tuple - - if self.pipeline_parallelism: - dp_group = self.mpu.get_data_parallel_group() - else: - dp_group = groups._get_sequence_data_parallel_group() - - if bucket_type == SparseTensor.type(): - self.sparse_allreduce_no_retain(bucket, dp_group=dp_group) - else: - self.allreduce_no_retain(bucket, dp_group=dp_group, numel_per_bucket=elements_per_buffer) + split_sparse_tensor_buckets, split_dense_tensor_buckets = split_half_float_double_sparse(grads) + if self.pipeline_parallelism: + dp_group = self.mpu.get_data_parallel_group() + dp_world_size = dist.get_world_size(dp_group) + else: + dp_group = groups._get_sequence_data_parallel_group() + dp_world_size = dist.get_world_size(dp_group) / float(self.sequence_parallel_size) + for _, sparse_bucket_tuple in enumerate(split_sparse_tensor_buckets): + if sparse_bucket_tuple: + bucket_type, sparse_bucket = sparse_bucket_tuple + self.sparse_allreduce_no_retain(sparse_bucket, dp_group=dp_group, dp_world_size=dp_world_size) + + for _, dense_bucket_tuple in enumerate(split_dense_tensor_buckets): + if dense_bucket_tuple: + bucket_type, dense_bucket = dense_bucket_tuple + self.allreduce_no_retain(dense_bucket, + dp_group=dp_group, + numel_per_bucket=elements_per_buffer, + dp_world_size=dp_world_size) def _reduce_expert_gradients(self, expert_grads, elements_per_buffer): + # to maintain the gradients value unaffected by ep_size setting, + # utilize dp_world_size for allreduce average + dp_world_size = dist.get_world_size(groups._get_data_parallel_group()) for ep_name, expert_grads_group in expert_grads.items(): - expert_split_buckets = split_half_float_double_sparse(expert_grads_group) - for i, bucket_tuple in enumerate(expert_split_buckets): - bucket_type, bucket = bucket_tuple - if bucket_type == SparseTensor.type(): - self.sparse_allreduce_no_retain(bucket, groups._get_expert_data_parallel_group(ep_name)) - else: + ep_dp_group = groups._get_expert_data_parallel_group(ep_name) + 
split_sparse_tensor_buckets, split_dense_tensor_buckets = split_half_float_double_sparse( + expert_grads_group) + + for _, sparse_bucket_tuple in enumerate(split_sparse_tensor_buckets): + if sparse_bucket_tuple: + bucket_type, sparse_bucket = sparse_bucket_tuple + self.sparse_allreduce_no_retain(sparse_bucket, dp_group=ep_dp_group, dp_world_size=dp_world_size) + + for _, dense_bucket_tuple in enumerate(split_dense_tensor_buckets): + if dense_bucket_tuple: + bucket_type, dense_bucket = dense_bucket_tuple # Separate between diff groups - self.allreduce_no_retain(bucket, - dp_group=groups._get_expert_data_parallel_group(ep_name), - numel_per_bucket=elements_per_buffer) + self.allreduce_no_retain(dense_bucket, + dp_group=ep_dp_group, + numel_per_bucket=elements_per_buffer, + dp_world_size=dp_world_size) def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000): if grads is None: - non_expert_grads, expert_grads = self._get_gradients_for_reduction() + if hasattr(self.optimizer, "get_grads_for_reduction"): + # This is currently for BF16 optimizer + non_expert_grads, expert_grads = self.optimizer.get_grads_for_reduction() + else: + non_expert_grads, expert_grads = self._get_gradients_for_reduction() else: assert not self.has_moe_layers, "attempting to reduce grads in unsupported way w.r.t. MoE" non_expert_grads = grads @@ -2417,8 +2638,8 @@ def buffered_allreduce_fallback(self, grads=None, elements_per_buffer=500000000) if self.has_moe_layers: self._reduce_expert_gradients(expert_grads, elements_per_buffer) - def sparse_allreduce_no_retain(self, bucket, dp_group): - allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group) + def sparse_allreduce_no_retain(self, bucket, dp_group, dp_world_size=None): + allreduced_sparses = self.sparse_allreduce_bucket(bucket, dp_group, dp_world_size) # Densify sparse tensor and copy back to original location for tensor in allreduced_sparses: if tensor.is_sparse: @@ -2426,13 +2647,13 @@ def sparse_allreduce_no_retain(self, bucket, dp_group): else: tensor.orig_dense_tensor.copy_(tensor.to_dense()) - def sparse_allreduce_bucket(self, bucket, dp_group): + def sparse_allreduce_bucket(self, bucket, dp_group, dp_world_size=None): sparse_list = [] for sparse in bucket: - sparse_list.append(self.sparse_allreduce(sparse, dp_group)) + sparse_list.append(self.sparse_allreduce(sparse, dp_group, dp_world_size)) return sparse_list - def sparse_allreduce(self, sparse, dp_group): + def sparse_allreduce(self, sparse, dp_group, dp_world_size=None): original_data_type = sparse.values.dtype if self.communication_data_type != sparse.values.dtype: if self.communication_data_type in (torch.float16, torch.bfloat16): @@ -2444,12 +2665,13 @@ def sparse_allreduce(self, sparse, dp_group): indices = sparse.indices values = sparse.values + if dp_world_size is None: + dp_world_size = dist.get_world_size(group=dp_group) if self.postscale_gradients(): if self.gradient_average: - values.mul_(self.gradient_predivide_factor() / - (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size))) + values.mul_(self.gradient_predivide_factor() / (dp_world_size)) else: - values.mul_(1. / (dist.get_world_size(group=dp_group) / float(self.sequence_parallel_size))) + values.mul_(1. 
/ (dp_world_size)) indices_device_list = self.sparse_all_gather(indices, dp_group) values_device_list = self.sparse_all_gather(values, dp_group) @@ -2491,7 +2713,7 @@ def all_gather_scalar(self, value, dp_group): return tensor_list def module_state_dict(self, destination=None, prefix="", keep_vars=False, exclude_frozen_parameters=False): - sd = self.module.state_dict(destination, prefix, keep_vars) + sd = self.module.state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars) # Remove frozen parameter weights from state_dict if specified if exclude_frozen_parameters: @@ -2616,7 +2838,10 @@ def _get_ckpt_name(self, checkpoints_path, tag, mp_placeholder=None): mp_rank_str = f"{mp_rank:02d}" if self.zero_optimization_partition_weights(): - filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.dp_process_group)) + if self.load_universal_checkpoint(): + filename = "zero_pp_rank_0" + else: + filename = "zero_pp_rank_{}".format(dist.get_rank(group=self.optimizer.dp_process_group)) ckpt_name = os.path.join( checkpoints_path, str(tag), @@ -2718,20 +2943,33 @@ def load_checkpoint(self, load_module_only=load_module_only, custom_load_fn=custom_load_fn) - load_zero_checkpoint = load_optimizer_states and load_path is not None and (self.zero_optimization() - or self.bfloat16_enabled()) + load_zero_checkpoint = load_path is not None and (self.zero_optimization() or self.bfloat16_enabled()) if load_zero_checkpoint: - success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states) + if (load_optimizer_states and not load_module_only) or self.load_universal_checkpoint(): + success = self._load_zero_checkpoint(load_dir, tag, load_optimizer_states=load_optimizer_states) + else: + success = False if not success: self.optimizer._restore_from_bit16_weights() + if self.zero_nvme_offload_optimizer(): + from shutil import copytree, disk_usage + offload_dir = self.optimizer.optimizer_swapper.swap_folder + offload_ckpt_dir = os.path.join(load_dir, tag, "offloaded_tensors") + _, _, free = disk_usage(offload_dir) + logger.info( + f"Copying NVMe offload checkpoint from {offload_ckpt_dir} to {offload_dir}, {free / 1e9:,.2f} GB free on target filesystem..." + ) + copytree(offload_ckpt_dir, offload_dir, dirs_exist_ok=True) + _, _, free = disk_usage(offload_dir) + logger.info(f"Copying complete! 
{free / 1e9:,.2f} GB free on target filesystem") + self.optimizer.reset_swap_buffers() + if self._optimizer_has_ckpt_event_epilogue(): self.optimizer.checkpoint_event_epilogue() - if self.load_universal_checkpoint(): + if self.load_universal_checkpoint() and not self.zero_optimization_partition_weights(): self.optimizer.update_lp_params() - if load_zero_checkpoint: - self.update_optimizer_step(step=client_states['iteration'] + 1) return load_path, client_states @@ -2790,8 +3028,10 @@ def _load_checkpoint(self, optim_checkpoint = None if load_module_only: deepspeed_states = ['module'] - if self.optimizer is not None and self.fp16_enabled(): + if self.optimizer is not None: self.optimizer.refresh_fp32_params() + if load_lr_scheduler_states and self.lr_scheduler is not None: + self.lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) else: has_zero_optimizer_state = self.zero_optimization() or self.bfloat16_enabled() if load_optimizer_states and self.optimizer is not None and not has_zero_optimizer_state: @@ -2897,11 +3137,13 @@ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): if zero_sd_list is None: return False + param_shapes = self._get_zero_param_shapes() self.optimizer.load_state_dict(state_dict_list=zero_sd_list, load_optimizer_states=load_optimizer_states, load_from_fp32_weights=self.zero_load_from_fp32_weights(), checkpoint_folder=checkpoint_folder, - load_serial=load_serial) + load_serial=load_serial, + param_shapes=param_shapes) if self.load_universal_checkpoint(): logger.info(f'loaded universal zero checkpoints from {checkpoint_folder} for rank {self.global_rank}') @@ -2909,24 +3151,6 @@ def _load_zero_checkpoint(self, load_dir, tag, load_optimizer_states=True): logger.info(f"loading {len(zero_sd_list)} zero partition checkpoints for rank {self.global_rank}") return True - def update_optimizer_step(self, step): - - def set_step(d): - if isinstance(d['step'], torch.Tensor): - d['step'] = torch.tensor(step, dtype=d['step'].dtype, device=d['step'].device) - else: - d['step'] = step - - optimizer = self.optimizer - base_optimizer = optimizer.optimizer - state = base_optimizer.state - for group in optimizer.param_groups: - if 'step' in group: - set_step(group) - for p in group['params']: - if p in state and len(state[p]) > 0 and 'step' in state[p]: - set_step(state[p]) - def _get_mp_rank_zero_checkpoint_names(self, load_dir, tag, mp_rank, dp_world_size, bf16_mode): zero_ckpt_names = [] for dp_rank in range(dp_world_size): @@ -2985,7 +3209,7 @@ def _get_all_zero_checkpoints(self, load_dir, tag): if bf16_mode is not self.bfloat16_enabled(): checkpoint_bit16 = BFLOAT16 if bf16_mode else FP16 engine_bit16 = BFLOAT16 if self.bfloat16_enabled() else FP16 - logger.warn(f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine') + logger.warning(f'Loading {checkpoint_bit16} zero checkpoints into {engine_bit16} training engine') return self._get_all_zero_checkpoint_state_dicts(zero_ckpt_names) return None @@ -3070,6 +3294,21 @@ def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True, self._create_zero_checkpoint_files(save_dir, tag) self._save_zero_checkpoint(save_dir, tag) + if self.zero_nvme_offload_optimizer(): + from shutil import copytree, disk_usage + offload_dir = self.optimizer.optimizer_swapper.swap_folder + offload_ckpt_dir = os.path.join(save_dir, tag, "offloaded_tensors") + _, _, free = disk_usage(save_dir) + logger.info( + f"Copying NVMe offload files from {offload_dir} to {offload_ckpt_dir}, {free / 
1e9:,.2f} GB free on target filesystem..." + ) + copytree(offload_dir, + offload_ckpt_dir, + ignore=lambda _, dir_list: list(filter(lambda x: 'gradient' in x, dir_list)), + dirs_exist_ok=False) + _, _, free = disk_usage(save_dir) + logger.info(f"Copying complete! {free / 1e9:,.2f} GB free on target filesystem") + if self._optimizer_has_ckpt_event_epilogue(): self.optimizer.checkpoint_event_epilogue() @@ -3126,7 +3365,7 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa local_expert_id = None if not m: - logger.warn(f'No expert found in key {key}.') + logger.warning(f'No expert found in key {key}.') else: local_expert_id = m.group(1) @@ -3156,22 +3395,24 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa # In the case of E + D parallelism, only the # first expert parallel group should save the expert weights # since each expert parallel group is a copy of the model's experts - if exp_dp_rank != 0: - return - - # Save optimizer states. They are different across each exp parallel rank. - optimizer_state = { - 'optimizer': self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None - } - # TODO: why use BufferedWriter not the path - file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank) - self.checkpoint_engine.save(optimizer_state, file_path) - - # get non-moe parameters - model_state_dict = self._get_non_moe_state_dict( - self.module_state_dict(exclude_frozen_parameters=exclude_frozen_parameters)) + if exp_dp_rank == 0: + # Save optimizer states. They are different across each exp parallel rank. + optimizer_state = { + 'optimizer': self.optimizer.state_dict() if self.optimizer and not self.zero_optimization() else None + } + # TODO: why use BufferedWriter not the path + file_path = self._get_optimizer_ckpt_name(save_dir, tag, expp_rank) + self.checkpoint_engine.save(optimizer_state, file_path) + + # Load flow uses below saved file for model parameters, RNG and more + if groups._get_data_parallel_rank() == 0: + # Get non-moe parameters + # Classes DeepSpeedEngine and PipelineEngine have different behavior for method module_state_dict. + # DeepSpeedEngine returns the state dict, where PipelineEngine saves the state dict and returns None. + # We need to get the state dict, therefore, call to DeepSpeedEngine (base class for PipelineEngine) + model_state_dict = self._get_non_moe_state_dict( + DeepSpeedEngine.module_state_dict(self, exclude_frozen_parameters=exclude_frozen_parameters)) - if expp_rank == 0: # TODO: update num experts info,.. in checkpoint state = { 'module': @@ -3201,7 +3442,6 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa state.update(client_state) logger.info(f'Saving model checkpoint: {save_path}') self.checkpoint_engine.save(state, save_path) - self._curr_save_path = None def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint): name_function = (self._get_zero_ckpt_name if zero_checkpoint else self._get_ckpt_name) @@ -3420,7 +3660,53 @@ def _save_zero_checkpoint(self, save_path, tag): ckpt_type = 'zero' if self.zero_optimization() else 'bf16_zero' logger.info(f'{ckpt_type} checkpoint saved {zero_checkpoint_name}') - def _zero3_consolidated_16bit_state_dict(self): + def _replace_module_consolidated_state_dict(self): + """ + Get a full non-partitioned state_dict with fp16 weights on cpu. + Important: this function must be called on all ranks and not just rank 0. 
+ This is similar to nn.Module.state_dict (modelled after _save_to_state_dict) + This method is used for tensor parallel training. + + Returns: + OrderedDict: The consolidated state dictionary if the current process rank is 0, otherwise None. + """ + #TODO: If we use both Zero3 and tensor parallel simultaneously + # we need to consolidate the gather mechanisms of both. + state_dict = OrderedDict() if dist.get_rank() == 0 else None + + def get_layer_state_dict(module, prefix=""): + with GatherReplacedLayerParams(list(module.parameters(recurse=False)), module, enabled=True): + for name, param in module.named_parameters(recurse=False): + if param is None: + continue + key = prefix + name + if (dist.get_rank() == 0): + state_dict[key] = param.detach().cpu() + # print(key,module, param.detach().cpu().shape) + + for name, child in module.named_children(): + if child is not None: + get_layer_state_dict(child, prefix + name + ".") + + get_layer_state_dict(self.module, prefix="") + + # ensure that all GPU communication tasks are completed before the process exits + get_accelerator().synchronize() + return state_dict + + def _consolidated_16bit_state_dict(self, exclude_frozen_parameters=False): + """ + Consolidate the 16-bit state dictionary. + """ + if self.zero_optimization_stage() == ZeroStageEnum.weights: + return self._zero3_consolidated_16bit_state_dict(exclude_frozen_parameters) + elif self.autotp_size() > 1: + return self._replace_module_consolidated_state_dict() + + raise ValueError("consolidated_16bit_state_dict is only applicable to cases where weights are partitioned, " + "including Zero Stage 3 and tensor parallelism.") + + def _zero3_consolidated_16bit_state_dict(self, exclude_frozen_parameters=False): """ Get a full non-partitioned state_dict with fp16 weights on cpu. Important: this function must be called on all ranks and not just rank 0. @@ -3446,7 +3732,7 @@ def get_layer_state_dict(module, prefix=""): if dist.get_rank() == 0: # handle params for name, param in module.named_parameters(recurse=False): - if param is None: + if param is None or (exclude_frozen_parameters and not param.requires_grad): continue key = prefix + name # can't rely on param.data_ptr() as it will be reused as weights gets @@ -3489,7 +3775,7 @@ def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"): compatibility""" return self.save_16bit_model(save_dir, save_filename) - def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"): + def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin", exclude_frozen_parameters=False): """ Save 16bit model weights @@ -3498,6 +3784,7 @@ def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"): Arguments: save_dir: Required. Directory for saving the model save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin`` + exclude_frozen_parameters: Optional. Exclude frozen parameters from checkpointed state. Returns: ``True`` when a model has been saved, ``False`` otherwise. 
It will not be saved if @@ -3514,14 +3801,15 @@ def save_16bit_model(self, save_dir, save_filename="pytorch_model.bin"): if self.zero_optimization_partition_weights(): if self.zero_gather_16bit_weights_on_model_save(): # consolidation is expensive in time and memory and therefore isn't a default - state_dict = self._zero3_consolidated_16bit_state_dict() + state_dict = self._zero3_consolidated_16bit_state_dict( + exclude_frozen_parameters=exclude_frozen_parameters) else: # the model will be bogus if not consolidated so don't confuse the user by saving it logger.info( - f"Did not save the model {path} because `stage3_gather_16bit_weights_on_model_save` is False") + f"Did not save the model {path} because stage3_gather_16bit_weights_on_model_save is False") return False else: - state_dict = self.module.state_dict() + state_dict = self.module_state_dict(exclude_frozen_parameters=exclude_frozen_parameters) tag = f"global_step{self.global_steps}" tag = str(tag) @@ -3544,3 +3832,69 @@ def empty_partition_cache(self): self.optimizer.empty_partition_cache() gc.collect() get_accelerator().empty_cache() + + def compile(self, backend=get_accelerator().get_compile_backend(), compile_kwargs={}) -> None: + """Compile the module using the specified backend and kwargs. + If a compiler_fn is set, it will be used instead of torch.compile(). + """ + # Avoid graph breaks + deepspeed.utils.nvtx.enable_nvtx = False + + if not is_compile_supported(): + raise RuntimeError("compile is not supported in your version of PyTorch.") + + if self.is_compiled: + return + + if 'backend' in compile_kwargs: + logger.warning("The `backend` in `compile_kwargs` will be overridden. Use the `backend` argument instead.") + + # create new dict to avoid modifying original dict + self.module.compile(**{**compile_kwargs, 'backend': backend}) + self._is_compiled = True + + @property + def is_compiled(self) -> bool: + return self._is_compiled + + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False) -> None: + """Offload the engine's states to the specified device. + + Arguments: + include: Optional. The set of states to offload. If not provided, all states are offloaded. + device: Optional. The device to move the ZeRO optimizer buffers to. Currently only `OffloadDeviceEnum.cpu` is supported. + pin_memory: Optional. Whether to pin the memory of the offloaded states. + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers across devices is supported only for ZeRO stage 3." + + opt_offload_config = self.zero_offload_optimizer() + assert opt_offload_config is None or opt_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded optimizer states." + param_offload_config = self.zero_offload_param() + assert param_offload_config is None or param_offload_config.device == OffloadDeviceEnum.none, "Moving states across devices is not supported for offloaded parameters." + + assert not self.zero_offload_param(), "Moving states across devices is not supported for offloaded parameters." 
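compile(), offload_states() and reload_states() added here are engine-level entry points. A hedged usage sketch, where engine stands for an already-initialized ZeRO stage 3 DeepSpeedEngine whose optimizer and parameters are not configured for device offload:

# Usage sketch (placeholder engine; arguments shown match the signatures above).
engine.compile()                                            # optional: torch.compile the wrapped module
engine.offload_states(pin_memory=True, non_blocking=True)   # defaults: all eligible states -> CPU
# ... temporarily reclaim accelerator memory for other work (e.g. generation) ...
engine.reload_states(non_blocking=True)                     # move states back to the accelerator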
+ + if device == OffloadDeviceEnum.none: + logger.warning("No device specified for offloading states.") + return + + if device == OffloadDeviceEnum.nvme: + raise ValueError("NVMe offload is not supported for offloading states.") + + self.optimizer.offload_states(include=include, device=device, pin_memory=pin_memory, non_blocking=non_blocking) + + def reload_states(self, non_blocking: bool = False) -> None: + """Reload the engine states to the original device. + + Arguments: + non_blocking: Optional. Whether to offload the states asynchronously. + """ + assert self.zero_optimization_stage( + ) == ZeroStageEnum.weights, "Moving buffers back is supported only for ZeRO stage 3." + self.optimizer.reload_states(non_blocking=non_blocking) diff --git a/deepspeed/runtime/fp16/fused_optimizer.py b/deepspeed/runtime/fp16/fused_optimizer.py index 182f806c839c..49093bb73c8f 100755 --- a/deepspeed/runtime/fp16/fused_optimizer.py +++ b/deepspeed/runtime/fp16/fused_optimizer.py @@ -9,14 +9,16 @@ import torch from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors - -from deepspeed.runtime import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, get_grad_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer +from deepspeed.runtime.utils import get_global_norm, get_flattened_grad_norm, CheckOverflow, get_weight_norm, get_norm_with_moe_layers, is_model_parallel_parameter from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE -from deepspeed.utils import groups, logger, log_dist -from deepspeed import comm as dist +from deepspeed.utils import logger, log_dist +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, CLIP_GRAD from deepspeed.accelerator import get_accelerator +from deepspeed.moe.utils import is_moe_param_group +from deepspeed.runtime.constants import PIPE_REPLICATED +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank OVERFLOW_CHECK_TIMER = 'overflow_check' COMPUTE_NORM_TIMER = 'compute_norm' @@ -63,6 +65,8 @@ def __init__(self, self.fp16_groups_flat = [] self.fp32_groups_flat = [] + self.flatten_grad_norm_mask_list = [] + self.has_executed_step = False self._global_grad_norm = 0. # loop to deal with groups @@ -205,6 +209,40 @@ def override_loss_scale(self, loss_scale): self.custom_loss_scaler = True self.external_loss_scale = loss_scale + def _require_avoid_recompute_norm(self, p, tensor_model_parallel_rank): + # for filtering replicated tensors from tensor + if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: + return True + if (tensor_model_parallel_rank > 0) and not is_model_parallel_parameter(p): + return True + + def _get_norm_mask_idx(self, group): + """The function preserves the parallel information for norm + from unflattened gradients. + + Args: + group (Iterable[Tensor] ): params group + + Returns: + torch.Tensor: A 2D tensor containing index ranges for each group, + where each row represents a [start index, end index]. 
+ """ + group_mask_idx_list = [] + grad_flat_st_idx = 0 + grad_flat_en_idx = 0 + + for p in group: + grad_flat_en_idx = grad_flat_st_idx + p.numel() + if p.grad is not None and self._require_avoid_recompute_norm(p, bwc_tensor_model_parallel_rank(self.mpu)): + # merge range + if len(group_mask_idx_list) > 0 and grad_flat_st_idx == group_mask_idx_list[-1][-1]: + group_mask_idx_list[-1][-1] = grad_flat_en_idx + else: + group_mask_idx_list.append([grad_flat_st_idx, grad_flat_en_idx]) + grad_flat_st_idx = grad_flat_en_idx + + return torch.tensor(group_mask_idx_list, device=get_accelerator().current_device_name()) + def step(self, closure=None): """ Not supporting closure. @@ -237,6 +275,10 @@ def step(self, closure=None): return self.overflow grads_groups_flat = [] + non_experts_grads_for_norm = [] + expert_grads_for_norm = {} + assert len(self.fp16_groups) == len(self.optimizer.param_groups) + for i, group in enumerate(self.fp16_groups): data_type = self.fp32_groups_flat[i].dtype @@ -246,21 +288,41 @@ def step(self, closure=None): for p in group ])) + self.fp32_groups_flat[i].grad = grads_groups_flat[i] + param_group = self.optimizer.param_groups[i] + + # split expert and non_expert grads for norm + if self.has_moe_layers and is_moe_param_group(param_group): + if param_group['name'] not in expert_grads_for_norm: + expert_grads_for_norm[param_group['name']] = [] + + expert_grads_for_norm[param_group['name']].append(self.fp32_groups_flat[i]) + else: + # retrieves the required mask for calculating the norm of flat_grad + # perform this collect operation only once + if not self.has_executed_step: + cur_flat_grad_norm_mask = self._get_norm_mask_idx(group) + self.flatten_grad_norm_mask_list.append(cur_flat_grad_norm_mask) + + non_experts_grads_for_norm.append(self.fp32_groups_flat[i]) + for p in group: p.grad = None - self.fp32_groups_flat[i].grad = grads_groups_flat[i] - self.timers(COMPUTE_NORM_TIMER).start() - all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu) - - self.timers(COMPUTE_NORM_TIMER).stop() + all_groups_norm = get_flattened_grad_norm(non_experts_grads_for_norm, + mpu=self.mpu, + grad_norm_mask=self.flatten_grad_norm_mask_list) if self.has_moe_layers: - all_groups_norm = self._get_norm_with_moe_layers(all_groups_norm) + all_groups_norm = get_norm_with_moe_layers(all_groups_norm, + mpu=self.mpu, + expert_tensors=expert_grads_for_norm, + norm_type=self.norm_type) scaled_global_grad_norm = get_global_norm(norm_list=[all_groups_norm]) + self.timers(COMPUTE_NORM_TIMER).stop() # Stash unscaled gradient norm self._global_grad_norm = scaled_global_grad_norm / self.cur_scale @@ -283,27 +345,13 @@ def step(self, closure=None): updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i], self.fp16_groups[i]) for p, q in zip(self.fp16_groups[i], updated_params): p.data.copy_(q.data) - + self.has_executed_step = True self.timers(UPDATE_FP16_TIMER).stop() self.timers.log(STEP_TIMERS) return self.overflow - def _get_norm_with_moe_layers(self, all_groups_norm): - #all_groups_norm_old = all_groups_norm - # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce - if self.using_pipeline: - pg = self.deepspeed.mpu.get_data_parallel_group() - else: - pg = groups._get_data_parallel_group() - scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg)) - scaled_norm_tensor = torch.tensor(scaled_norm, device=self.fp32_groups_flat[0].device, dtype=torch.float) - dist.all_reduce(scaled_norm_tensor, group=pg) - 
all_groups_norm = scaled_norm_tensor.item() - #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") - return all_groups_norm - def unscale_and_clip_grads(self, grad_groups_flat, total_norm, apply_scale=True): # compute combined scale factor for this group combined_scale = self.cur_scale diff --git a/deepspeed/runtime/fp16/onebit/adam.py b/deepspeed/runtime/fp16/onebit/adam.py index 236eea8cadc5..fa817573f734 100644 --- a/deepspeed/runtime/fp16/onebit/adam.py +++ b/deepspeed/runtime/fp16/onebit/adam.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist @@ -70,8 +70,6 @@ def __init__(self, super(OnebitAdam, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 - assert (dist.is_initialized()) - self.comm_time = 0.0 self.step_time = 0.0 self.ave_step = 1 @@ -86,22 +84,27 @@ def __init__(self, self.comm_backend_name = comm_backend_name + assert dist.is_initialized(), "Please initialize the torch distributed backend." # Empty initializer. Set handle based on the comm backend as follows. self.comm_backend_handle = None - if self.comm_backend_name == 'nccl': assert ( required_torch_version(min_version=1.8) ), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" - assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) - elif self.comm_backend_name == 'mpi': from deepspeed.runtime.comm.mpi import MpiBackend self.comm_backend_handle = MpiBackend(cuda_aware) - + elif self.comm_backend_name == 'hccl': + from deepspeed.runtime.comm.hccl import HcclBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = HcclBackend(self.deepspeed.mpu) + elif self.comm_backend_name == 'compressed': + from deepspeed.runtime.comm.compressed import CompressedBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = CompressedBackend(self.deepspeed.mpu) self.size = self.comm_backend_handle.size self.divider = int(self.size * 8 / np.gcd(self.size, 8)) diff --git a/deepspeed/runtime/fp16/onebit/lamb.py b/deepspeed/runtime/fp16/onebit/lamb.py index 0662fabeeee1..9e7bae816ecd 100644 --- a/deepspeed/runtime/fp16/onebit/lamb.py +++ b/deepspeed/runtime/fp16/onebit/lamb.py @@ -7,7 +7,7 @@ import torch import numpy as np from deepspeed import comm as dist -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors from deepspeed.accelerator import get_accelerator @@ -93,8 +93,6 @@ def __init__(self, super(OnebitLamb, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 - assert (dist.is_initialized()) - self.deepspeed = deepspeed self.lamb_freeze_key = False self.initialize = False @@ -108,21 +106,27 @@ def __init__(self, self.comm_backend_name = comm_backend_name + assert dist.is_initialized(), "Please initialize the torch 
distributed backend." # Empty initializer. Set handle based on the comm backend as follows. self.comm_backend_handle = None - if self.comm_backend_name == 'nccl': assert ( required_torch_version(min_version=1.8) ), "Please use torch 1.8 or greater to enable NCCL backend in 1-bit Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" - assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) - elif self.comm_backend_name == 'mpi': from deepspeed.runtime.comm.mpi import MpiBackend self.comm_backend_handle = MpiBackend(cuda_aware) + elif self.comm_backend_name == 'hccl': + from deepspeed.runtime.comm.hccl import HcclBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = HcclBackend(self.deepspeed.mpu) + elif self.comm_backend_name == 'compressed': + from deepspeed.runtime.comm.compressed import CompressedBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = CompressedBackend(self.deepspeed.mpu) self.size = self.comm_backend_handle.size @@ -161,7 +165,7 @@ def step(self, closure=None, grads=None): else: grads_group = grads - #remove the previous stats + # remove the previous stats del self.lamb_coeffs[:] if self.lamb_freeze_key: @@ -173,7 +177,7 @@ def step(self, closure=None, grads=None): # This is used to reduce compression error during compression stage. momentum_scales = [] for group in self.param_groups: - momentum_scales.append([(torch.linalg.norm(self.state[p]['exp_avg']) / + momentum_scales.append([(torch.linalg.vector_norm(self.state[p]['exp_avg']) / np.sqrt(torch.numel(self.state[p]['exp_avg']))).item() for p in group['params']]) united_scale = sum([sum(x) for x in momentum_scales]) / sum([len(x) for x in momentum_scales]) diff --git a/deepspeed/runtime/fp16/onebit/zoadam.py b/deepspeed/runtime/fp16/onebit/zoadam.py index 922263ad6a76..70282ec41714 100644 --- a/deepspeed/runtime/fp16/onebit/zoadam.py +++ b/deepspeed/runtime/fp16/onebit/zoadam.py @@ -7,14 +7,16 @@ import torch import numpy as np from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed import comm as dist class ZeroOneAdam(torch.optim.Optimizer): - """Implements the 0/1 Adam algorithm. Currently GPU-only. + """ + Implements the 0/1 Adam algorithm. Currently GPU-only. For usage example please see https://www.deepspeed.ai/tutorials/zero-one-adam/ For technical details please read https://arxiv.org/abs/2202.06009 + Arguments: params (iterable): iterable of parameters to optimize or dicts defining parameter groups. @@ -83,8 +85,6 @@ def __init__(self, super(ZeroOneAdam, self).__init__(params, defaults) self.eps_mode = 0 if eps_inside_sqrt else 1 - assert (dist.is_initialized()) - self.deepspeed = deepspeed self.initialize = False self.cuda_aware = cuda_aware @@ -99,22 +99,27 @@ def __init__(self, self.comm_backend_name = comm_backend_name + assert dist.is_initialized(), "Please initialize the torch distributed backend." # Empty initializer. Set handle based on the comm backend as follows. 
self.comm_backend_handle = None - if self.comm_backend_name == 'nccl': assert ( required_torch_version(min_version=1.8) ), "Please use torch 1.8 or greater to enable NCCL backend in 0/1 Adam. Alternatively, please specify 'mpi' as the 'comm_backend_name' in config file to proceed with the MPI backend" - assert dist.is_initialized() == True, "Please initialize the torch distributed backend." from deepspeed.runtime.comm.nccl import NcclBackend self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') self.comm_backend_handle = NcclBackend(self.deepspeed.mpu) - elif self.comm_backend_name == 'mpi': from deepspeed.runtime.comm.mpi import MpiBackend self.comm_backend_handle = MpiBackend(cuda_aware) - + elif self.comm_backend_name == 'hccl': + from deepspeed.runtime.comm.hccl import HcclBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = HcclBackend(self.deepspeed.mpu) + elif self.comm_backend_name == 'compressed': + from deepspeed.runtime.comm.compressed import CompressedBackend + self.using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') + self.comm_backend_handle = CompressedBackend(self.deepspeed.mpu) self.size = self.comm_backend_handle.size self.divider = int(self.size * 8 / np.gcd(self.size, 8)) diff --git a/deepspeed/runtime/fp16/unfused_optimizer.py b/deepspeed/runtime/fp16/unfused_optimizer.py index 14271255df2e..530355f846e2 100755 --- a/deepspeed/runtime/fp16/unfused_optimizer.py +++ b/deepspeed/runtime/fp16/unfused_optimizer.py @@ -11,10 +11,11 @@ import torch from torch._utils import _flatten_dense_tensors -from deepspeed.runtime import DeepSpeedOptimizer -from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm, required_torch_version +from deepspeed.runtime.base_optimizer import DeepSpeedOptimizer +from deepspeed.runtime.utils import get_global_norm, CheckOverflow, get_weight_norm from deepspeed.runtime.fp16.loss_scaler import INITIAL_LOSS_SCALE, SCALE_WINDOW, MIN_LOSS_SCALE from deepspeed.utils import logger +from deepspeed.utils.torch import required_torch_version from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT from deepspeed.accelerator import get_accelerator from deepspeed import comm as dist diff --git a/deepspeed/runtime/hybrid_engine.py b/deepspeed/runtime/hybrid_engine.py index da6f7a9be54e..b6e417fd4764 100644 --- a/deepspeed/runtime/hybrid_engine.py +++ b/deepspeed/runtime/hybrid_engine.py @@ -17,16 +17,14 @@ from deepspeed.accelerator import get_accelerator from torch import nn from deepspeed.utils import logger - -from deepspeed.ops.op_builder import InferenceBuilder - from deepspeed.module_inject.layers import LinearLayer, Normalize, EmbeddingLayer, OPTEmbedding +from ..ops.transformer.inference.op_binding.workspace import WorkspaceOp + try: import transformers OPTLearnedPositionalEmbedding = transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding except: OPTLearnedPositionalEmbedding = None -inference_cuda_module = None class DeepSpeedHybridEngine(DeepSpeedEngine): @@ -61,12 +59,8 @@ def __init__(self, args, model, **kwargs): self._total_batch_size = None self._gather_latency = 0 - global inference_cuda_module - if inference_cuda_module is None: - builder = InferenceBuilder() - inference_cuda_module = builder.load() - self.is_lora_fused = False + self.workspace = WorkspaceOp() def convert_to_linear_transposed(self, model): @@ -160,13 +154,13 @@ def unfuse_lora_weight_non_pinned(self): def 
retake_inference_cache(self): if self._config.hybrid_engine.release_inference_cache: - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: logger.warning("Unable to acquire workspace on first attempt, emptying cache and retrying.") gc.collect() get_accelerator().empty_cache() - retake_success = inference_cuda_module.retake_workspace() + retake_success = self.workspace.retake_workspace() if not retake_success: raise RuntimeError("Unable to retake inference workspace.") @@ -269,7 +263,7 @@ def generate(self, *inputs, **kwargs): self.is_lora_fused = False if self._config.hybrid_engine.release_inference_cache: - inference_cuda_module.release_workspace() + self.workspace.release_workspace() gc.collect() get_accelerator().empty_cache() @@ -296,8 +290,13 @@ def create_inference_containers(self, module, layer_id=0): layer_id += 1 else: - self._other_layers.append(self.inference_policies[child.__class__][0]( - weight=child.weight, bias=child.bias if hasattr(child, 'bias') else None)) + if self.inference_policies[child.__class__][0] == LinearLayer: + self._other_layers.append(self.inference_policies[child.__class__][0](module=child, + mp_group=None, + skip_partition=True)) + else: + self._other_layers.append(self.inference_policies[child.__class__][0]( + weight=child.weight, bias=child.bias if hasattr(child, 'bias') else None)) self._orig_modules_others.append(child) self._orig_fwds_others.append(child.forward) else: @@ -385,14 +384,20 @@ def eval(self): self._total_latency = self._total_latency + latency self._iters = self._iters + 1 if not dist.is_initialized() or dist.get_rank() == 0: + if self._total_batch_size is not None: + cur_samples_p_sec = f'|CurSamplesPerSec={(1 / latency * self._total_batch_size):.2f} ' + avg_samples_p_sec = f'|AvgSamplesPerSec={(1 / (self._total_latency / self._iters) * self._total_batch_size):.2f}' + else: + cur_samples_p_sec = '' + avg_samples_p_sec = '' others = latency - (self._generate_latency + self._training_latency) print(f'|E2E latency={(latency):.2f}s ' + \ f'|Gather latency={self._gather_latency:.2f}s ({(self._gather_latency / latency * 100):.2f}%) ' f'|Generate time={(self._generate_latency):.2f}s ({(self._generate_latency / latency * 100):.2f}%) ' + \ f'|Training time={(self._training_latency):.2f}s ({(self._training_latency / latency * 100):.2f}%) ' + \ - f'|Others={others:.2f} ({(others / latency * 100):.2f}%)' - f'|CurSamplesPerSec={(1 / latency * self._total_batch_size):.2f} ' + \ - f'|AvgSamplesPerSec={(1 / (self._total_latency / self._iters) * self._total_batch_size):.2f}') + f'|Others={others:.2f} ({(others / latency * 100):.2f}%)' + \ + cur_samples_p_sec + \ + avg_samples_p_sec) self._t_start = time.time() self._training_latency = 0 super().eval() diff --git a/deepspeed/runtime/lr_schedules.py b/deepspeed/runtime/lr_schedules.py index d7f7e15a4dbd..2ffd0bf9f036 100755 --- a/deepspeed/runtime/lr_schedules.py +++ b/deepspeed/runtime/lr_schedules.py @@ -247,6 +247,12 @@ def get_lr_from_config(config): return lr_params[WARMUP_MAX_LR], '' +def update_lr(param_groups, lrs): + for param_group, lr in zip(param_groups, lrs): + param_group['lr'] = lr + return [group['lr'] for group in param_groups] + + """ Only optimizers that are subclass of torch.optim.Optimizer are supported. So check the passed optimizer and wrapped optimizer to see if requirement is satisfied. 
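The new `update_lr` helper gives every scheduler in lr_schedules.py a single place that writes one learning rate per parameter group and returns the list stored as `_last_lr`. A minimal usage sketch, assuming DeepSpeed is installed so the helper can be imported from `deepspeed.runtime.lr_schedules`:

```python
import torch
from deepspeed.runtime.lr_schedules import update_lr  # helper added in this diff

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# One learning rate per param group, e.g. the output of a scheduler's get_lr().
new_lrs = [0.01]
last_lr = update_lr(optimizer.param_groups, new_lrs)
assert last_lr == [0.01] and optimizer.param_groups[0]['lr'] == 0.01
```

Centralizing this also lets schedulers seed the optimizer's learning rates at construction time (when `last_batch_iteration == -1`) instead of waiting for the first `step()`.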
@@ -268,7 +274,7 @@ class LRRangeTest(object): """Sets the learning rate of each parameter group according to learning rate range test (LRRT) policy. The policy increases learning rate starting from a base value with a constant frequency, as detailed in - the paper `A disciplined approach to neural network hyper-parameters: Part1`_. + the paper `A disciplined approach to neural network hyper-parameters: Part 1 `_ LRRT policy is used for finding maximum LR that trains a model without divergence, and can be used to configure the LR boundaries for Cyclic LR schedules. @@ -328,7 +334,7 @@ def __init__(self, self.interval_fn = self._staircase_interval if lr_range_test_staircase else self._continuous_interval if last_batch_iteration == -1: - self._update_optimizer(self.min_lr) + self._last_lr = update_lr(self.optimizer.param_groups, self.min_lr) def _staircase_interval(self): return math.floor(float(self.last_batch_iteration + 1) / self.step_size) @@ -349,16 +355,11 @@ def get_last_lr(self): assert getattr(self, '_last_lr', None) is not None, "need to call step() first" return self._last_lr - def _update_optimizer(self, group_lrs): - for param_group, lr in zip(self.optimizer.param_groups, group_lrs): - param_group['lr'] = lr - def step(self, batch_iteration=None): if batch_iteration is None: batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - self._update_optimizer(self.get_lr()) - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -378,7 +379,7 @@ class OneCycle(object): 1CLR policy changes the learning rate after every batch. `step` should be called after a batch has been used for training. - This implementation was adapted from the github repo: `pytorch/pytorch`_ + This implementation was adapted from the github repo: `PyTorch `_. Args: optimizer (Optimizer): Wrapped optimizer. 
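LRRangeTest now initializes the wrapped optimizer with `min_lr` at construction, so `get_last_lr()` is valid before the first `step()`. A hedged instantiation sketch, assuming the usual constructor keywords (`lr_range_test_min_lr`, `lr_range_test_step_size`); the toy model and the empty step loop are placeholders:

```python
import torch
from deepspeed.runtime.lr_schedules import LRRangeTest

optimizer = torch.optim.SGD(torch.nn.Linear(4, 4).parameters(), lr=0.1)
# Sweep the learning rate upward from a small base value to find the divergence point.
scheduler = LRRangeTest(optimizer, lr_range_test_min_lr=1e-5, lr_range_test_step_size=200)

for _ in range(5):
    # ... forward/backward/optimizer.step() would go here ...
    scheduler.step()
print(scheduler.get_last_lr())  # per-group learning rates after the last step
```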
@@ -507,7 +508,7 @@ def _initialize_lr(self, optimizer, cycle_min_lr, cycle_max_lr, decay_lr_rate, l def _initialize_momentum(self, optimizer, cycle_min_mom, cycle_max_mom, decay_mom_rate, last_batch_iteration): if 'betas' not in optimizer.defaults: optimizer_name = type(optimizer).__name__ - logger.warn( + logger.warning( f"cycle_momentum is disabled because optimizer {optimizer_name} does not support momentum, no betas attribute in defaults" ) self.cycle_momentum = False @@ -615,9 +616,7 @@ def step(self, batch_iteration=None): batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) if self.cycle_momentum: momentums = self.get_mom() @@ -675,11 +674,14 @@ def __init__(self, self.warmup_type = warmup_type self.inverse_log_warm_up = 1.0 / math.log(self.warmup_num_steps) self.last_batch_iteration = last_batch_iteration + # Initialize lr in optimizer + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") - return [0.0] + return self.min_lrs gamma = self._get_gamma() return [min_lr + (delta_lr * gamma) for min_lr, delta_lr in zip(self.min_lrs, self.delta_lrs)] @@ -693,9 +695,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def state_dict(self): return {'last_batch_iteration': self.last_batch_iteration} @@ -819,6 +819,10 @@ def __init__(self, total_num_steps, warmup_num_steps)) self.org_lrs = [group['lr'] for group in self.optimizer.param_groups] + # Initialize lrs in optimizer groups + if last_batch_iteration == -1: + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) + def get_lr_ratio(self): if self.last_batch_iteration < 0: logger.warning("Attempting to get learning rate from scheduler before it has started") @@ -844,11 +848,7 @@ def step(self, last_batch_iteration=None): if last_batch_iteration is None: last_batch_iteration = self.last_batch_iteration + 1 self.last_batch_iteration = last_batch_iteration - - lrs = self.get_lr() - for param_group, lr in zip(self.optimizer.param_groups, lrs): - param_group['lr'] = lr - self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + self._last_lr = update_lr(self.optimizer.param_groups, self.get_lr()) def get_lr(self): if self.last_batch_iteration < 0: diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index b0fc3c920ea4..deb44c2e71eb 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -4,6 +4,9 @@ # DeepSpeed Team from types import MethodType +from collections import OrderedDict +from functools import reduce +from operator import mul import torch from deepspeed import comm as dist @@ -39,6 +42,9 @@ PIPE_RECV_INPUT_TIMER = 'pipe_recv_input' PIPE_RECV_GRAD_TIMER = 'pipe_recv_grad' +# The buffer size to store the meta data 
for each tensor. +TENSOR_META_SIZE = 256 + def is_even(number): return number % 2 == 0 @@ -116,7 +122,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self._force_grad_boundary = False - self.batch_timer = ThroughputTimer(batch_size=self.train_batch_size(), + self.batch_timer = ThroughputTimer(self._config.timers_config, + batch_size=self.train_batch_size(), logging_fn=self.tput_log, monitor_memory=False, steps_per_output=self.steps_per_print()) @@ -136,7 +143,7 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): assert isinstance(self._config.pipeline['grad_partitioned'], bool) self.is_pipe_partitioned = self.is_model_parallel and self._config.pipeline['pipe_partitioned'] self.is_grad_partitioned = self.is_model_parallel and self._config.pipeline['grad_partitioned'] - logger.info(f'is_pipe_partitioned= {self.is_pipe_partitioned}', + logger.info(f'is_pipe_partitioned= {self.is_pipe_partitioned} ' f'is_grad_partitioned= {self.is_grad_partitioned}') model_parameters = filter(lambda p: p.requires_grad, self.module.parameters()) @@ -177,20 +184,31 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): } self.pipe_recv_buf = None self.grad_layer = None + self._grad_layer_buf = [] self.meta_buffer = None self.first_output_send = True self.first_gradient_send = True + self.pipe_partition_input_meta_cache = None + self.pipe_partition_output_meta_cache = None + self.pipe_partition_grad_meta_cache = None + self.grad_partition_grad_layer_meta_cache = None #stores the loss for the current micro batch being processed self.loss = torch.tensor(0.0).to(self.device) #stores the loss for the entire batch self.total_loss = None + self.total_additional_losses = None self.agg_loss = torch.tensor(0.0, requires_grad=False).to(self.device) self.dp_group_loss = torch.tensor(0.0, requires_grad=False).to(self.device) + # stores aggregated-DP train final loss and aggregated-DP additional losses, if any + # additional losses are stored as dict: {loss-name: agg-loss} + self.agg_train_loss = None + self.agg_additional_losses = None + if self._config.pipeline['activation_checkpoint_interval'] > 0: self.module.activation_checkpoint_interval = self._config.pipeline['activation_checkpoint_interval'] # set use_reentrant default to True. 
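`total_additional_losses` and `agg_additional_losses` let the pipeline engine aggregate and log extra named losses next to the main training loss; the values come from the module's `get_additional_losses()` hook added to PipelineModule further down. A hypothetical override, with a made-up loss name, assuming `deepspeed.pipe.PipelineModule` as the public import path:

```python
from collections import OrderedDict
from deepspeed.pipe import PipelineModule  # assumed public alias

class MoEPipelineModule(PipelineModule):
    """Hypothetical module that reports a load-balancing loss next to the train loss."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._balance_loss = None  # a forward stage is expected to fill this in

    def get_additional_losses(self):
        # Returning None keeps the engine's single-loss behavior.
        if self._balance_loss is None:
            return None
        return OrderedDict([("balance_loss", self._balance_loss)])
```

The engine scales these like the main loss, all-reduces them across data-parallel ranks in one flat tensor, and appends them to the per-step log line.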
@@ -201,6 +219,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self.module.activation_checkpoint_func = ds_checkpointing.non_reentrant_checkpoint if self.grid.get_global_rank() == 0: logger.info(f'CONFIG: activation_checkpoint_func=non_reentrant_checkpoint') + if self.module.activation_checkpoint_interval > 0: + self.module._precompute_checkpointable_values() self.module.checkpoint_parallel_write_pipeline = self._config.checkpoint_parallel_write_pipeline @@ -236,6 +256,8 @@ def __init__(self, has_bool_tensors=False, *super_args, **super_kwargs): self.timers(STEP_MICRO_TIMER).start() self.timers(STEP_MICRO_TIMER).stop() + self.dynamic_shape = self.module.dynamic_shape + def set_has_attention_mask(self, value): assert isinstance(value, bool) self.has_attention_mask = value @@ -265,7 +287,8 @@ def _exec_reduce_tied_grads(self): weight_group_list = self.module.get_tied_weights_and_groups() for weight, group in weight_group_list: grad = weight._hp_grad if self.using_bf16_optimizer else weight.grad - dist.all_reduce(grad, group=group) + if grad is not None: + dist.all_reduce(grad, group=group) def _exec_reduce_grads(self): self._force_grad_boundary = True @@ -278,10 +301,7 @@ def _exec_reduce_grads(self): self._force_grad_boundary = False def _bf16_reduce_grads(self): - # Make our own list of gradients from the optimizer's FP32 grads - grads = [] - self.buffered_allreduce_fallback(grads=self.optimizer.get_grads_for_reduction(), - elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) + self.buffered_allreduce_fallback(grads=None, elements_per_buffer=MEMORY_OPT_ALLREDUCE_SIZE) def _reserve_pipe_buffers(self, num_buffers): """Ensure that each pipeline buffer has at least ``num_buffers`` slots. @@ -307,8 +327,14 @@ def reset_activation_shape(self): self.first_output_send = True self.pipe_recv_buf = None self.grad_layer = None + self._grad_layer_buf = [] self.meta_buffer = None + self.pipe_partition_input_meta_cache = None + self.pipe_partition_output_meta_cache = None + self.pipe_partition_grad_meta_cache = None + self.grad_partition_grad_layer_meta_cache = None + def train_batch(self, data_iter=None): """Progress the pipeline to train the next batch of data. The engine will ingest ``self.train_batch_size()`` total samples collectively across all workers. 
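`reset_activation_shape()` now also clears the cached partition metadata and the reusable gradient buffers, and the engine picks up a `dynamic_shape` flag from the module. A usage sketch for inputs whose shapes change between batches; the helper and its argument structure are hypothetical:

```python
def train_variable_length(engine, iters_by_seqlen):
    """engine: a DeepSpeed pipeline engine; iters_by_seqlen: {seq_len: data_iter}."""
    for seq_len, data_iter in sorted(iters_by_seqlen.items()):
        # Drop pipe_recv_buf, grad_layer and the meta caches before the shape changes.
        engine.reset_activation_shape()
        engine.train_batch(data_iter=data_iter)
```

Alternatively, constructing the PipelineModule with `dynamic_shape=True` re-exchanges tensor metadata every iteration and avoids the manual resets, at some communication cost.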
@@ -352,6 +378,7 @@ def train_batch(self, data_iter=None): self.module.train() self.total_loss = None + self.total_additional_losses = None self._compute_loss = True # Do the work @@ -360,7 +387,9 @@ def train_batch(self, data_iter=None): stages=self.num_stages, stage_id=self.stage_id) self._exec_schedule(sched) - self.agg_train_loss = self._aggregate_total_loss() + + with torch.no_grad(): + self.agg_train_loss = self._aggregate_total_loss() self.timers(TRAIN_BATCH_TIMER).stop() @@ -369,10 +398,12 @@ def train_batch(self, data_iter=None): elapsed = self.timers(TRAIN_BATCH_TIMER).elapsed(reset=True) / 1000.0 iter_time = elapsed / self.steps_per_print() tput = self.train_batch_size() / iter_time - print(f'steps: {self.global_steps} ' - f'loss: {self.agg_train_loss:0.4f} ' - f'iter time (s): {iter_time:0.3f} ' - f'samples/sec: {tput:0.3f}') + log_str = f'steps: {self.global_steps} loss: {self.agg_train_loss:0.4f} ' + if self.agg_additional_losses is not None: + for loss_name, loss_value in self.agg_additional_losses.items(): + log_str += f'{loss_name}: {loss_value.item():0.4f} ' + log_str += f'iter time (s): {iter_time:0.3f} samples/sec: {tput:0.3f}' + print(log_str) else: self.timers(TRAIN_BATCH_TIMER).elapsed(reset=True) @@ -393,7 +424,13 @@ def train_batch(self, data_iter=None): # TODO: should return precisely what loss returned and allow others to be queried? return self.agg_train_loss - def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg', bcast_loss=True): + def eval_batch(self, + data_iter, + return_logits=False, + compute_loss=True, + reduce_output='avg', + bcast_loss=True, + num_micro_batches=None): """Evaluate the pipeline on a batch of data from ``data_iter``. The engine will evaluate ``self.train_batch_size()`` total samples collectively across all workers. 
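`eval_batch` can now run with a different number of micro batches than training via the new `num_micro_batches` keyword; leaving it unset keeps the previous behavior of using `self.micro_batches`. For example (`engine` and `val_iter` are assumed to exist):

```python
# Quick validation pass with only two micro batches.
val_loss = engine.eval_batch(val_iter, num_micro_batches=2)

# Unchanged default: evaluate with the training micro-batch count.
val_loss = engine.eval_batch(val_iter)
```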
@@ -442,10 +479,11 @@ def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_o train_iterator = self.data_iterator self.set_dataiterator(data_iter) + # set the number micro batches in case the user chose value than training + micro_batches = self.micro_batches if num_micro_batches is None else num_micro_batches + # Do the work - sched = schedule.InferenceSchedule(micro_batches=self.micro_batches, - stages=self.num_stages, - stage_id=self.stage_id) + sched = schedule.InferenceSchedule(micro_batches=micro_batches, stages=self.num_stages, stage_id=self.stage_id) # prevent dead-lock with multiple evals sequence dist.barrier() @@ -454,7 +492,7 @@ def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_o self._exec_schedule(sched) if self.is_last_stage(): - eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output) + eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output, micro_batches=micro_batches) if compute_loss and (bcast_loss or self.monitor.enabled): eval_output = self._bcast_pipe_scalar(eval_output) @@ -496,7 +534,7 @@ def is_last_stage(self): """True if this process is in the last stage in the pipeline.""" return self.stage_id == self.num_stages - 1 - def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True): + def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True, micro_batches=None): if reduce is None: return outputs @@ -511,7 +549,7 @@ def _reduce_outputs(self, outputs, reduce='avg', reduce_dp=True): reduced[idx] += out # Average over the microbatches - reduced = self._scale_loss_by_gas(reduced) + reduced = self._scale_loss_by_gas(reduced, eval_micro_batches=micro_batches) # Average over DP groups if reduce_dp and self.is_data_parallel: @@ -545,29 +583,67 @@ def _bcast_pipe_scalar(self, data, src_rank=None, dtype=torch.float32): def _aggregate_total_loss(self): # Scale loss, average among DP ranks, and bcast loss to the rest of my DP group if self.is_last_stage(): + # Scale loss and additional losses, if any loss = self._scale_loss_by_gas(self.total_loss) - self.dp_group_loss = loss.clone().detach() + self.agg_additional_losses = self.total_additional_losses + if self.agg_additional_losses is not None: + self.agg_additional_losses = OrderedDict({ + loss_name: self._scale_loss_by_gas(_loss.clone().detach()) + for loss_name, _loss in self.agg_additional_losses.items() + }) - ## Average loss across all data-parallel groups + self.dp_group_loss = loss.clone().detach() agg_loss = self.dp_group_loss.clone().detach() #print(f'RANK={self.global_rank} bcast SENDER src={self.global_rank} group={self.grid.pp_group}', flush=True) + + # Average loss across all data-parallel groups if self.is_data_parallel: - dist.all_reduce(agg_loss, group=self.mpu.get_data_parallel_group()) - agg_loss /= self.dp_world_size + if self.agg_additional_losses is None: + dist.all_reduce(agg_loss, group=self.mpu.get_data_parallel_group()) + agg_loss /= self.dp_world_size + else: + # use a single reduce op for agg_loss and additional losses, if any + assert '__train_loss__' not in self.agg_additional_losses.keys() + tensors = OrderedDict({'__train_loss__': agg_loss}) + tensors.update(self.agg_additional_losses.items()) + flat_tensor = torch.cat([t.clone().reshape(-1).detach() for t in tensors.values()]) + dist.all_reduce(flat_tensor, group=self.mpu.get_data_parallel_group()) + flat_tensor /= self.dp_world_size + offset = 0 + reduced_tensor = {} + for name, t in tensors.items(): + n_elem = t.numel() + reduced_tensor[name] = 
flat_tensor[offset:offset + n_elem].clone().detach().reshape(t.shape) + offset += n_elem + agg_loss = reduced_tensor['__train_loss__'] + self.agg_additional_losses = OrderedDict( + {name: reduced_tensor[name] + for name in self.agg_additional_losses.keys()}) assert self.global_rank in self.grid.pp_group - losses = torch.stack([self.dp_group_loss, agg_loss]) + losses = [self.dp_group_loss, agg_loss] + if self.agg_additional_losses is not None: + losses += list(self.agg_additional_losses.values()) + losses = torch.stack(losses).float() if self.is_pipe_parallel: dist.broadcast(tensor=losses, src=self.global_rank, group=self.mpu.get_pipe_parallel_group()) else: # Get loss from last stage src_rank = self.grid.stage_to_global(self.num_stages - 1) assert src_rank in self.grid.pp_group - losses = torch.Tensor([0., 0.]).to(self.device) + # losses to reduce are: dp_group_loss, agg_loss, model additional losses + # therefore: 2 + n_additional_losses + additional_losses = self.module.get_additional_losses() + n_additional_losses = 0 if additional_losses is None else len(additional_losses) + losses = torch.Tensor([0.] * (2 + n_additional_losses)).to(self.device) dist.broadcast(tensor=losses, src=src_rank, group=self.grid.get_pipe_parallel_group()) self.dp_group_loss = losses[0].clone().detach() agg_loss = losses[1].clone().detach() - + if additional_losses is not None: + self.agg_additional_losses = OrderedDict({ + name: losses[2 + i].clone().detach() + for i, name in enumerate(additional_losses.keys()) + }) return agg_loss def set_dataloader(self, loader): @@ -641,7 +717,9 @@ def _exec_forward_pass(self, buffer_id): # collect the partitioned input from the previous stage if self.is_pipe_partitioned and not self.is_first_stage(): - part_input = PartitionedTensor.from_meta(meta=inputs[0], + if self.pipe_partition_input_meta_cache is None: + self.pipe_partition_input_meta_cache = inputs[0].to('cpu') + part_input = PartitionedTensor.from_meta(meta=self.pipe_partition_input_meta_cache, local_part=inputs[1], group=self.grid.get_slice_parallel_group()) @@ -675,7 +753,7 @@ def _exec_forward_pass(self, buffer_id): raise ValueError("expecting a tensor or a tuple of tensors") part = PartitionedTensor(tensor=first_output, group=self.grid.get_slice_parallel_group()) # Clear the large output data, but save the computation graph - first_output.data = torch.zeros(1) + first_output.data = torch.zeros(1, device=first_output.data.device) self.pipe_buffers['output_tensors'][buffer_id] = first_output # Inject the partitioned tensor into the output before sending outputs = (part.to_meta(), part.data(), *outputs_tail) @@ -693,19 +771,34 @@ def _exec_forward_pass(self, buffer_id): self.loss = outputs if self.eval_return_logits: self.outputs = outputs + if isinstance(self.loss, torch.Tensor): self.fwd_outputs.append(self.loss.detach()) - - if self.total_loss is None: - self.total_loss = torch.zeros_like(self.loss) - self.total_loss += self.loss.detach() else: self.fwd_outputs.append([l.detach() for l in self.loss]) - if self.total_loss is None: - self.total_loss = [torch.zeros_like(l) for l in self.loss] - for idx, l in enumerate(self.loss): - self.total_loss[idx] += l.detach() + def add_to_total_loss(_total_loss, _loss): + if isinstance(_loss, torch.Tensor): + if _total_loss is None: + _total_loss = torch.zeros_like(_loss) + _total_loss += _loss.detach() + else: + if _total_loss is None: + _total_loss = [torch.zeros_like(_l) for _l in _loss] + for _idx, _l in enumerate(_loss): + _total_loss[_idx] += _l.detach() + return 
_total_loss + + self.total_loss = add_to_total_loss(self.total_loss, self.loss) + + # aggregate additional losses across gradient accumulation steps + additional_losses = self.module.get_additional_losses() + if additional_losses is not None: + if self.total_additional_losses is None: + self.total_additional_losses = OrderedDict() + for name, loss in additional_losses.items(): + total = self.total_additional_losses[name] if name in self.total_additional_losses else None + self.total_additional_losses[name] = add_to_total_loss(total, loss) def _exec_backward_pass(self, buffer_id): assert self.optimizer is not None, "must provide optimizer during " \ @@ -732,7 +825,9 @@ def _exec_backward_pass(self, buffer_id): # careful to also restore the computational graph of the tensors we partitioned. if self.is_pipe_partitioned: if self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta(meta=outputs[0], + if self.pipe_partition_output_meta_cache is None: + self.pipe_partition_output_meta_cache = outputs[0].to('cpu') + part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_output_meta_cache, local_part=outputs[1], group=self.grid.get_slice_parallel_group()) self.pipe_buffers['output_tensors'][buffer_id].data = part_output.full() @@ -745,7 +840,9 @@ def _exec_backward_pass(self, buffer_id): grad_tensors = self.grad_layer if self.is_grad_partitioned: #print(f'RANK={self.global_rank} BEFORE-BWD restoring grad={self.grad_layer[0].size()} {self.grad_layer[1].size()}') - part_grad = PartitionedTensor.from_meta(meta=self.grad_layer[0], + if self.grad_partition_grad_layer_meta_cache is None: + self.grad_partition_grad_layer_meta_cache = self.grad_layer[0].to('cpu') + part_grad = PartitionedTensor.from_meta(meta=self.grad_partition_grad_layer_meta_cache, local_part=self.grad_layer[1], group=self.grid.get_slice_parallel_group()) grad_tensors = (part_grad.full(), *grad_tensors[2:]) @@ -766,7 +863,8 @@ def _exec_backward_pass(self, buffer_id): if self.using_bf16_optimizer and not self.is_last_stage(): # manually call because we don't call optimizer.backward() - self.optimizer.update_hp_grads(clear_lp_grads=False) + if not self._config.bfloat16_immediate_grad_update: + self.optimizer.update_hp_grads(clear_lp_grads=False) # Free up the memory from the output of forward() self.pipe_buffers['output_tensors'][buffer_id] = None @@ -837,51 +935,38 @@ def _send_tensor_meta(self, buffer, recv_stage): * ndims * shape """ - send_bytes = 0 + meta_buffer = torch.empty(TENSOR_META_SIZE, dtype=torch.int32, device=self.device) if isinstance(buffer, torch.Tensor): - type_tensor = torch.LongTensor(data=[0]).to(self.device) - p2p.send(type_tensor, recv_stage) - send_shape = torch.LongTensor(data=buffer.size()).to(self.device) - send_ndims = torch.LongTensor(data=[len(buffer.size())]).to(self.device) - p2p.send(send_ndims, recv_stage) - p2p.send(send_shape, recv_stage) - send_bytes += _tensor_bytes(buffer) - elif isinstance(buffer, list): - assert (False) - type_tensor = torch.LongTensor(data=[1]).to(self.device) - p2p.send(type_tensor, recv_stage) - count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device) - p2p.send(count_tensor, recv_stage) - for tensor in buffer: - assert isinstance(tensor, torch.Tensor) - send_shape = torch.LongTensor(data=tensor.size()).to(self.device) - send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) - p2p.send(send_ndims, recv_stage) - p2p.send(send_shape, recv_stage) - send_bytes += _tensor_bytes(tensor) + meta_buf_list = [ + 0, # type of data (0: 
tensor, 1: list (unused), 2: tuple) + self.DTYPE_TO_ID[buffer.dtype], # dtype + len(buffer.size()) # ndims + ] + meta_buf_list.extend(buffer.size()) + assert len( + meta_buf_list + ) <= TENSOR_META_SIZE, f"Buffer for metadata is too small. Current buffer size: {TENSOR_META_SIZE} but required {len(meta_buf_list)}" + meta_buffer[:len(meta_buf_list)].copy_(torch.tensor(meta_buf_list, dtype=torch.int32)) + p2p.send(meta_buffer, recv_stage) + elif isinstance(buffer, tuple): - type_tensor = torch.LongTensor(data=[2]).to(self.device) - p2p.send(type_tensor, recv_stage) - count_tensor = torch.LongTensor(data=[len(buffer)]).to(self.device) - p2p.send(count_tensor, recv_stage) - for idx, tensor in enumerate(buffer): + meta_buf_list = [ + 2, # type of data (0: tensor, 1: list (unused), 2: tuple) + len(buffer) # num_tensors + ] + + for tensor in buffer: assert isinstance(tensor, torch.Tensor) - send_shape = torch.LongTensor(data=tensor.size()).to(self.device) - send_ndims = torch.LongTensor(data=[len(tensor.size())]).to(self.device) - send_dtype = torch.LongTensor(data=[self.DTYPE_TO_ID[tensor.dtype]]).to(self.device) - p2p.send(send_dtype, recv_stage) - p2p.send(send_ndims, recv_stage) - p2p.send(send_shape, recv_stage) - # Useful for performance debugging. - ''' - new_bytes = _tensor_bytes(tensor) - send_bytes += _tensor_bytes(tensor) - # Useful for performance debugging. - if self.grid.data_parallel_id == 0: - print( - f'STAGE={self.stage_id} pipe-send-volume[{idx}]: shape={send_shape} {new_bytes/1024**2:0.2f}MB' - ) - ''' + meta_buf_list.append(self.DTYPE_TO_ID[tensor.dtype]) + meta_buf_list.append(len(tensor.size())) + meta_buf_list.extend(tensor.size()) + + assert len( + meta_buf_list + ) <= TENSOR_META_SIZE, f"Buffer for metadata is too small. Current buffer size: {TENSOR_META_SIZE} but required {len(meta_buf_list)}" + meta_buffer[:len(meta_buf_list)].copy_(torch.tensor(meta_buf_list, dtype=torch.int32)) + p2p.send(meta_buffer, recv_stage) + else: raise NotImplementedError(f'Could not send meta type {type(buffer)}') @@ -894,49 +979,35 @@ def _send_tensor_meta(self, buffer, recv_stage): def _recv_tensor_meta(self, send_stage): """Receive metadata about upcoming p2p transfers and return allocated buffers. - Metadata is communicated in this order: - * type (0: tensor, 1: list) - * num_tensors if type=list - foreach tensor in buffer: - * ndims - * shape - Returns: Allocated buffer for receiving from send_stage. """ + buffer = torch.empty(TENSOR_META_SIZE, dtype=torch.int32, device=self.device) + p2p.recv(buffer, send_stage) - type_tensor = torch.LongTensor(data=[0]).to(self.device) - p2p.recv(type_tensor, send_stage) - recv_type = type_tensor.item() + recv_type = buffer[0].item() # A single tensor will be sent. 
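The metadata exchange now packs everything into a single int32 buffer of TENSOR_META_SIZE entries instead of issuing a separate p2p send per field: [type, dtype_id, ndims, shape...] for a tensor, or [2, num_tensors, then dtype_id/ndims/shape per tensor] for a tuple. A standalone sketch of that layout; the dtype table here is an illustrative subset, not the engine's full ID_TO_DTYPE list, and the pack/unpack helpers are not DeepSpeed functions:

```python
import torch

TENSOR_META_SIZE = 256
ID_TO_DTYPE = [torch.float32, torch.float16, torch.bfloat16, torch.int64]  # illustrative subset
DTYPE_TO_ID = {dt: i for i, dt in enumerate(ID_TO_DTYPE)}

def pack_meta(buffer):
    """Flatten tensor metadata into one list of ints, mirroring _send_tensor_meta."""
    if isinstance(buffer, torch.Tensor):
        meta = [0, DTYPE_TO_ID[buffer.dtype], buffer.dim(), *buffer.size()]
    else:  # tuple of tensors
        meta = [2, len(buffer)]
        for t in buffer:
            meta += [DTYPE_TO_ID[t.dtype], t.dim(), *t.size()]
    assert len(meta) <= TENSOR_META_SIZE, "metadata buffer too small"
    return meta

def unpack_meta(meta):
    """Recover (dtype, shape) pairs from the flat layout, mirroring _recv_tensor_meta."""
    if meta[0] == 0:  # single tensor
        ndims = meta[2]
        return [(ID_TO_DTYPE[meta[1]], tuple(meta[3:3 + ndims]))]
    shapes, offset = [], 2
    for _ in range(meta[1]):  # tuple of tensors
        dtype, ndims = ID_TO_DTYPE[meta[offset]], meta[offset + 1]
        shapes.append((dtype, tuple(meta[offset + 2:offset + 2 + ndims])))
        offset += 2 + ndims
    return shapes

# Round-trip example:
x = (torch.zeros(2, 3), torch.zeros(5, dtype=torch.float16))
assert unpack_meta(pack_meta(x)) == [(torch.float32, (2, 3)), (torch.float16, (5,))]
```

One send of this buffer replaces the several small LongTensor sends the old protocol needed, which matters on links with high per-message latency.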
if recv_type == 0: - recv_ndims = torch.LongTensor(data=[0]).to(self.device) - p2p.recv(recv_ndims, send_stage) - recv_ndims = recv_ndims.item() - recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device) - p2p.recv(recv_shape, send_stage) - recv_shape = recv_shape.tolist() - return self._allocate_buffer(recv_shape, num_buffers=1)[0] - - # List or tuple of tensors + recv_dtype = self.ID_TO_DTYPE[buffer[1].item()] + recv_ndims = buffer[2].item() + recv_shape = buffer[3:3 + recv_ndims].tolist() + return self._allocate_or_extend_buffers(0, recv_shape, recv_dtype) + + # List or tuple of tensors (recv_type == 1 (list) is currently unused) elif recv_type == 1 or recv_type == 2: - count_tensor = torch.LongTensor(data=[0]).to(self.device) - p2p.recv(count_tensor, send_stage) - num_tensors = count_tensor.item() - recv_shapes_and_dtypes = [] + num_tensors = buffer[1].item() + + buffers = [] + offset = 2 for idx in range(num_tensors): - recv_dtype = torch.LongTensor(data=[0]).to(self.device) - p2p.recv(recv_dtype, send_stage) - recv_dtype = self.ID_TO_DTYPE[recv_dtype.item()] - recv_ndims = torch.LongTensor(data=[0]).to(self.device) - p2p.recv(recv_ndims, send_stage) - recv_ndims = recv_ndims.item() - recv_shape = torch.LongTensor([1] * recv_ndims).to(self.device) - p2p.recv(recv_shape, send_stage) - recv_shapes_and_dtypes.append((recv_shape.tolist(), recv_dtype)) - - buffers = self._allocate_buffers(recv_shapes_and_dtypes, num_buffers=1)[0] + recv_dtype = self.ID_TO_DTYPE[buffer[offset].item()] + recv_ndims = buffer[offset + 1].item() + recv_shape = buffer[offset + 2:offset + 2 + recv_ndims].tolist() + offset += 2 + recv_ndims + + buffers.append(self._allocate_or_extend_buffers(idx, recv_shape, recv_dtype)) + # Convert to tuples if requested. if recv_type == 2: buffers = tuple(buffers) @@ -959,7 +1030,7 @@ def _exec_send_activations(self, buffer_id): outputs[-1] = outputs[-1].half() outputs = tuple(outputs) - if self.first_output_send: + if self.dynamic_shape or self.first_output_send: self.first_output_send = False self._send_tensor_meta(outputs, self.next_stage) @@ -1044,7 +1115,7 @@ def _exec_recv_activations(self, buffer_id): recvd = None # Allocate the buffer if necessary - if self.pipe_recv_buf is None: + if self.dynamic_shape or self.pipe_recv_buf is None: self.pipe_recv_buf = self._recv_tensor_meta(self.prev_stage) if isinstance(self.pipe_recv_buf, torch.Tensor): @@ -1088,7 +1159,9 @@ def _exec_recv_grads(self, buffer_id): # XXX these shapes are hardcoded for Megatron # Restore partitioned output if it was partitioned and we are sending full gradients if self.is_pipe_partitioned and not self.is_grad_partitioned: - part_output = PartitionedTensor.from_meta(meta=outputs[0], + if self.pipe_partition_grad_meta_cache is None: + self.pipe_partition_grad_meta_cache = outputs[0].to('cpu') + part_output = PartitionedTensor.from_meta(meta=self.pipe_partition_grad_meta_cache, local_part=outputs[1], group=self.grid.get_slice_parallel_group()) outputs[0].data = part_output.full() @@ -1097,10 +1170,9 @@ def _exec_recv_grads(self, buffer_id): self.pipe_buffers['outputs'][buffer_id] = outputs # Allocate gradient if necessary - if self.grad_layer is None: + if self.dynamic_shape or self.grad_layer is None: if isinstance(outputs, torch.Tensor): - s = list(outputs.size()) - self.grad_layer = self._allocate_buffer(s, dtype=outputs.dtype, num_buffers=1)[0] + self.grad_layer = self._allocate_or_extend_buffers(0, list(outputs.size()), outputs.dtype) else: # XXX This is a HACK # When we exchange 
activations/gradients, the two pipe stages @@ -1122,7 +1194,11 @@ def _exec_recv_grads(self, buffer_id): for t in outputs[2:] if t.is_floating_point()] else: sizes_and_dtypes = [(list(t.size()), t.dtype) for t in outputs if t.is_floating_point()] - self.grad_layer = self._allocate_buffers(sizes_and_dtypes, num_buffers=1)[0] + + self.grad_layer = [ + self._allocate_or_extend_buffers(i, size, dtype) + for i, (size, dtype) in enumerate(sizes_and_dtypes) + ] if isinstance(self.grad_layer, torch.Tensor): p2p.recv(self.grad_layer, self.next_stage) @@ -1203,16 +1279,17 @@ def _allocate_buffer(self, shape, num_buffers=-1, **kwargs): buffers.append(self._allocate_zeros(shape, **kwargs)) return buffers - def _allocate_buffers(self, shapes_and_dtypes, requires_grad=False, num_buffers=-1): - buffers = [] - if num_buffers == -1: - num_buffers = self.num_pipe_buffers - for count in range(num_buffers): - buffer = [] - for shape, dtype in shapes_and_dtypes: - buffer.append(self._allocate_zeros(shape, dtype=dtype, requires_grad=requires_grad)) - buffers.append(buffer) - return buffers + def _allocate_or_extend_buffers(self, idx, shape, dtype): + numel = reduce(mul, shape) if len(shape) > 0 else 1 + if len(self._grad_layer_buf) <= idx or self._grad_layer_buf[idx].numel() < numel: + new_buf = self._allocate_buffer(shape, dtype=dtype, num_buffers=1)[0] + if len(self._grad_layer_buf) <= idx: + self._grad_layer_buf.append(new_buf) + else: + self._grad_layer_buf[idx] = new_buf + return self._grad_layer_buf[idx] + else: + return self._grad_layer_buf[idx].flatten()[:numel].view(shape) def forward(self, *args, **kwargs): """Disabled for pipeline parallel training. See ``train_batch()``. """ @@ -1304,7 +1381,7 @@ def load_module_state_dict(self, checkpoint, strict=True, custom_load_fn=None, f strict (bool, optional): Strict state loading. Defaults to True. """ assert custom_load_fn is None, "custom_load_fn not supported w. pipeline parallelism" - state_dict = checkpoint['module'] + state_dict = checkpoint if self.has_moe_layers else checkpoint['module'] if (state_dict is not None) and (not isinstance(state_dict, str)): super().load_module_state_dict(state_dict, strict) return @@ -1343,3 +1420,6 @@ def _exec_schedule(self, pipe_schedule): # Equivalent to: self._exec_forward_pass(buffer_id=0) self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) self._exec_instr(**cmd.kwargs) + + def get_additional_losses(self): + return self.agg_additional_losses diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py index c11379b0a0d7..49fa2807c355 100644 --- a/deepspeed/runtime/pipe/module.py +++ b/deepspeed/runtime/pipe/module.py @@ -116,7 +116,10 @@ def forward(self, inputs): partition_method (str, optional): The method upon which the layers are partitioned. Defaults to 'parameters'. activation_checkpoint_interval (int, optional): The granularity activation checkpointing in terms of number of layers. 0 disables activation checkpointing. activation_checkpoint_func (callable, optional): The function to use for activation checkpointing. Defaults to ``deepspeed.checkpointing.checkpoint``. - checkpointable_layers(list, optional): Checkpointable layers may not be checkpointed. Defaults to None which does not additional filtering. + checkpointable_layers (list[str], optional): List of layer class names that are eligible for checkpointing. For GPT models, + ParallelTransformerLayerPipe is always checkpointed regardless of this list. 
If None, all layers with parameters are + considered checkpointable. Defaults to None. + dynamic_shape: Allows dynamic shapes of inputs. This might have a performance impact. """ def __init__(self, @@ -130,7 +133,8 @@ def __init__(self, partition_method='parameters', activation_checkpoint_interval=0, activation_checkpoint_func=checkpointing.checkpoint, - checkpointable_layers=None): + checkpointable_layers=None, + dynamic_shape=False): super().__init__() @@ -196,6 +200,16 @@ def __init__(self, #newseed = get_accelerator().initial_seed() + self._grid.get_stage_id() #ds_utils.set_random_seed(newseed) + self.activation_checkpoint_interval = activation_checkpoint_interval + + self.activation_checkpoint_func = activation_checkpoint_func + + #storage for precomputed checkpointable results + self.is_checkpointable_results = [] + self.is_checkpointable_results_interval = None + + # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint`` + #with torch.random.fork_rng(devices=[get_accelerator().current_device_name()]): self._build() self.to(get_accelerator().device_name(self.local_rank)) @@ -203,10 +217,17 @@ def __init__(self, self.tied_comms = self._index_tied_modules() self._synchronize_tied_weights() - self.activation_checkpoint_interval = activation_checkpoint_interval + self.dynamic_shape = dynamic_shape - self.activation_checkpoint_func = activation_checkpoint_func - # if configuration use_reentrant = False, self.activation_checkpoint_func will be set to ``checkpointing.non_reentrant_checkpoint`` + def _precompute_checkpointable_values(self): + if self.activation_checkpoint_interval > 0 and self.is_checkpointable_results_interval != self.activation_checkpoint_interval: + num_layers = len(self.forward_funcs) + self.interval_was_zero = False + for start_idx in range(0, num_layers, self.activation_checkpoint_interval): + end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers) + funcs = self.forward_funcs[start_idx:end_idx] + self.is_checkpointable_results.append(self._is_checkpointable(funcs)) + self.is_checkpointable_results_interval = self.activation_checkpoint_interval def _build(self): specs = self._layer_specs @@ -352,7 +373,9 @@ def exec_func(*inputs): else: num_layers = len(self.forward_funcs) x = forward_input - for start_idx in range(0, num_layers, self.activation_checkpoint_interval): + for start_idx, is_checkpointable_result in \ + zip(range(0, num_layers, self.activation_checkpoint_interval), self.is_checkpointable_results): + end_idx = min(start_idx + self.activation_checkpoint_interval, num_layers) funcs = self.forward_funcs[start_idx:end_idx] @@ -361,7 +384,7 @@ def exec_func(*inputs): if not isinstance(x, tuple): x = (x, ) - if self._is_checkpointable(funcs): + if is_checkpointable_result: x = self.activation_checkpoint_func(exec_range_func(start_idx, end_idx), *x) else: x = exec_range_func(start_idx, end_idx)(*x) @@ -629,8 +652,31 @@ def _is_checkpointable(self, funcs): # because only non_reentrant_checkpoint can accept inputs with requires_grad=False # otherwise, the backward of the embedding layer won't receive gradients.
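With the revised `_is_checkpointable`, `checkpointable_layers` both restricts checkpointing for generic models and extends it beyond ParallelTransformerLayerPipe for GPT-style models, while `_precompute_checkpointable_values` caches the per-interval decisions once. A hedged construction example; the layer classes are stand-ins, and building a PipelineModule requires a distributed launch with enough ranks for the requested stages:

```python
import torch.nn as nn
from deepspeed.pipe import PipelineModule, LayerSpec  # assumed public aliases

class TransformerBlock(nn.Linear):   # hypothetical stand-in layer
    pass

class CustomAdapter(nn.Linear):      # hypothetical stand-in layer
    pass

model = PipelineModule(
    layers=[LayerSpec(TransformerBlock, 1024, 1024) for _ in range(24)]
           + [LayerSpec(CustomAdapter, 1024, 1024)],
    num_stages=4,
    activation_checkpoint_interval=1,
    # Only layer classes named here are checkpointed; for GPT-style models,
    # ParallelTransformerLayerPipe is checkpointed regardless of this list.
    checkpointable_layers=["TransformerBlock", "CustomAdapter"],
    dynamic_shape=False,  # new flag; True re-exchanges tensor shapes every step
)
```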
if self.__class__.__name__ in ('GPTModelPipe', 'GPT2ModelPipe'): - return all('ParallelTransformerLayerPipe' in f.__class__.__name__ for f in funcs) + # For GPT models, checkpoint both transformer layers and any additional + # layers specified in checkpointable_layers (if provided) + return all('ParallelTransformerLayerPipe' in f.__class__.__name__ or ( + self.checkpointable_layers is not None and f.__class__.__name__ in self.checkpointable_layers) + for f in funcs) + if self.checkpointable_layers is not None: + # For non-GPT models, only checkpoint layers specified in checkpointable_layers return all(f.__class__.__name__ in self.checkpointable_layers for f in funcs) + + # Default behavior: checkpoint any layer that has parameters params = [f.parameters() for f in funcs if isinstance(f, torch.nn.Module)] return any(len(list(p)) > 0 for p in params) + + def get_additional_losses(self): + """ Returns model specific additional losses for reporting + + Return a dictionary of {"loss name": loss_value} or None if no additional losses. + """ + return None + + def compile(self, *args, **kwargs): + for idx, layer in enumerate(self.forward_funcs): + if isinstance(layer, nn.Module): + layer.compile(*args, **kwargs) + else: + new_layer = torch.compile(layer, *args, **kwargs) + self.forward_funcs[idx] = new_layer diff --git a/deepspeed/runtime/pipe/p2p.py b/deepspeed/runtime/pipe/p2p.py index 31b9a14bf0f4..ed6d80b8d4fb 100644 --- a/deepspeed/runtime/pipe/p2p.py +++ b/deepspeed/runtime/pipe/p2p.py @@ -3,15 +3,13 @@ # DeepSpeed Team -import pickle +import msgpack import typing import torch from deepspeed import comm as dist -# To query whether we have send/recv support -from packaging.version import Version -from deepspeed.git_version_info import torch_info +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator _groups = None @@ -21,9 +19,7 @@ def can_send_recv() -> bool: - torch_version = Version(torch_info['version']) - sendrecv_min = Version('1.8') - return torch_version >= sendrecv_min + return required_torch_version(min_version=1.8) #initializes adjacent process groups @@ -100,7 +96,7 @@ def wait(): def send_obj(msg: typing.Any, dest: int): """Send an arbitrary python object to ``dest``. - Note: ``msg`` must be pickleable. + Note: ``msg`` must be serializable by msgpack. WARN: This incurs a CPU -> GPU transfer and should be used sparingly for performance reasons. @@ -110,7 +106,7 @@ def send_obj(msg: typing.Any, dest: int): dest (int): Destination rank. 
""" # serialize the message - msg = pickle.dumps(msg) + msg = msgpack.packb(msg) # construct a tensor to send msg = torch.ByteTensor(torch.ByteStorage.from_buffer(msg)).to(get_accelerator().device_name()) @@ -137,7 +133,7 @@ def recv_obj(sender: int) -> typing.Any: msg = torch.empty(length.item(), dtype=torch.uint8).to(get_accelerator().device_name()) dist.recv(msg, src=sender) - msg = pickle.loads(msg.cpu().numpy().tobytes()) + msg = msgpack.unpackb(msg.cpu().numpy().tobytes()) def _to(x): """Recursively move to the current device.""" diff --git a/deepspeed/runtime/sparse_tensor.py b/deepspeed/runtime/sparse_tensor.py index f0bb5c75530e..291ba5f0c786 100644 --- a/deepspeed/runtime/sparse_tensor.py +++ b/deepspeed/runtime/sparse_tensor.py @@ -15,6 +15,7 @@ class SparseTensor(object): def __init__(self, dense_tensor=None): self.orig_dense_tensor = dense_tensor + self.dtype = self.orig_dense_tensor.dtype self.is_sparse = dense_tensor.is_sparse if dense_tensor is not None: if dense_tensor.is_sparse: diff --git a/deepspeed/runtime/swap_tensor/aio_config.py b/deepspeed/runtime/swap_tensor/aio_config.py index df4a38380089..be6c7d93c86a 100644 --- a/deepspeed/runtime/swap_tensor/aio_config.py +++ b/deepspeed/runtime/swap_tensor/aio_config.py @@ -5,25 +5,39 @@ from deepspeed.runtime.config_utils import get_scalar_param from deepspeed.runtime.swap_tensor.constants import * +from deepspeed.accelerator import get_accelerator AIO_DEFAULT_DICT = { AIO_BLOCK_SIZE: AIO_BLOCK_SIZE_DEFAULT, AIO_QUEUE_DEPTH: AIO_QUEUE_DEPTH_DEFAULT, - AIO_THREAD_COUNT: AIO_THREAD_COUNT_DEFAULT, + AIO_INTRA_OP_PARALLELISM: AIO_INTRA_OP_PARALLELISM_DEFAULT, AIO_SINGLE_SUBMIT: AIO_SINGLE_SUBMIT_DEFAULT, - AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT + AIO_OVERLAP_EVENTS: AIO_OVERLAP_EVENTS_DEFAULT, + AIO_USE_GDS: AIO_USE_GDS_DEFAULT } def get_aio_config(param_dict): if AIO in param_dict.keys() and param_dict[AIO] is not None: aio_dict = param_dict[AIO] - return { - AIO_BLOCK_SIZE: get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), - AIO_QUEUE_DEPTH: get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), - AIO_THREAD_COUNT: get_scalar_param(aio_dict, AIO_THREAD_COUNT, AIO_THREAD_COUNT_DEFAULT), - AIO_SINGLE_SUBMIT: get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), - AIO_OVERLAP_EVENTS: get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT) + aio_config = { + AIO_BLOCK_SIZE: + get_scalar_param(aio_dict, AIO_BLOCK_SIZE, AIO_BLOCK_SIZE_DEFAULT), + AIO_QUEUE_DEPTH: + get_scalar_param(aio_dict, AIO_QUEUE_DEPTH, AIO_QUEUE_DEPTH_DEFAULT), + AIO_INTRA_OP_PARALLELISM: + get_scalar_param(aio_dict, AIO_INTRA_OP_PARALLELISM, AIO_INTRA_OP_PARALLELISM_DEFAULT), + AIO_SINGLE_SUBMIT: + get_scalar_param(aio_dict, AIO_SINGLE_SUBMIT, AIO_SINGLE_SUBMIT_DEFAULT), + AIO_OVERLAP_EVENTS: + get_scalar_param(aio_dict, AIO_OVERLAP_EVENTS, AIO_OVERLAP_EVENTS_DEFAULT), + AIO_USE_GDS: + get_scalar_param(aio_dict, AIO_USE_GDS, AIO_USE_GDS_DEFAULT) } + if aio_config[AIO_USE_GDS]: + assert get_accelerator().device_name() == 'cuda', 'GDS currently only supported for CUDA accelerator' + + return aio_config + return AIO_DEFAULT_DICT diff --git a/deepspeed/runtime/swap_tensor/constants.py b/deepspeed/runtime/swap_tensor/constants.py index 4c9722bc4e4f..c1207749eac6 100644 --- a/deepspeed/runtime/swap_tensor/constants.py +++ b/deepspeed/runtime/swap_tensor/constants.py @@ -9,9 +9,10 @@ "aio": { "block_size": 1048576, "queue_depth": 8, - "thread_count": 1, + "intra_op_parallelism": 1, 
"single_submit": false, - "overlap_events": true + "overlap_events": true, + "use_gds": false } ''' AIO = "aio" @@ -19,9 +20,11 @@ AIO_BLOCK_SIZE_DEFAULT = 1048576 AIO_QUEUE_DEPTH = "queue_depth" AIO_QUEUE_DEPTH_DEFAULT = 8 -AIO_THREAD_COUNT = "thread_count" -AIO_THREAD_COUNT_DEFAULT = 1 +AIO_INTRA_OP_PARALLELISM = "intra_op_parallelism" +AIO_INTRA_OP_PARALLELISM_DEFAULT = 1 AIO_SINGLE_SUBMIT = "single_submit" AIO_SINGLE_SUBMIT_DEFAULT = False AIO_OVERLAP_EVENTS = "overlap_events" AIO_OVERLAP_EVENTS_DEFAULT = True +AIO_USE_GDS = "use_gds" +AIO_USE_GDS_DEFAULT = False diff --git a/deepspeed/runtime/swap_tensor/optimizer_utils.py b/deepspeed/runtime/swap_tensor/optimizer_utils.py index 86e43c98e7e5..d7b0ea9634b2 100644 --- a/deepspeed/runtime/swap_tensor/optimizer_utils.py +++ b/deepspeed/runtime/swap_tensor/optimizer_utils.py @@ -30,7 +30,7 @@ class OptimizerStateSwapInfo(object): def __init__(self, parameter, numel, base_folder): self.tensors = [] - self.param_id = id(parameter) + self.param_id = OptimizerSwapper.parameter_id(parameter) self.swap_folder = base_folder self.swap_paths = [] self.swapped_gradients = {} @@ -50,7 +50,7 @@ def has_gradients(self): def _add_tensors(self, tensor_list): for t in tensor_list: self.tensors.append(t) - self.swap_paths.append(os.path.join(self.swap_folder, f'{id(t)}.tensor.swp')) + self.swap_paths.append(os.path.join(self.swap_folder, f'{OptimizerSwapper.parameter_id(t)}.tensor.swp')) def add_state_tensors(self, tensor_list): self.has_state_tensors = True @@ -112,6 +112,10 @@ def release_unswapped_gradients(self): class OptimizerSwapper(object): + @staticmethod + def parameter_id(param): + return param.ds_id + def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_numel, device, dtype, timers): self.swap_config = swap_config self.aio_config = aio_config @@ -126,7 +130,7 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume # Read/Write alignment for each thread during Intra-request parallelism self.min_aio_bytes = max(MIN_AIO_BYTES, aio_config[AIO_BLOCK_SIZE]) - self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_THREAD_COUNT] + self.aligned_bytes = AIO_ALIGNED_BYTES * aio_config[AIO_INTRA_OP_PARALLELISM] self.numel_alignment = self.aligned_bytes // self.swap_element_size # Swap buffer management @@ -149,6 +153,11 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume 'timer_names', ] + def purge_state(self): + for swap_info in self.swap_params_info.values(): + swap_info.tensors = [swap_info.tensors[0]] + swap_info.has_state_tensors = False + def swappable_tensor(self, param=None, numel=None): assert param is not None or numel is not None, "Either param or numel must be provided" if param is not None: @@ -178,10 +187,10 @@ def _flush_gradient_swapper(self, gradient_swapper): self.timer_names.update(gradient_swapper.get_timer_names()) def _swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors, gradient_swapper): - if not id(parameter) in self.swap_params_info.keys(): + if not OptimizerSwapper.parameter_id(parameter) in self.swap_params_info.keys(): return - swap_info = self.swap_params_info[id(parameter)] + swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)] swappable_tensors = [] swappable_offsets = [] @@ -241,7 +250,7 @@ def _initialize_from_swapped_fp16_params(self, aio_handle, fp16_partitions_info, for i, tensor in enumerate(fp16_pinned_tensors): true_index = curr_index + i logger.info( - f'swap_in_fp16_param: fp32_id = 
{id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' + f'swap_in_fp16_param: fp32_id = {OptimizerSwapper.parameter_id(fp32_parameters[true_index])} index = {true_index} orig_num_elem = {fp16_num_elems[true_index]}, swap_num_elem = {fp16_pinned_tensors[i].numel()}' ) swap_out_count = self._swap_out_fp16_params(aio_handle=aio_handle, @@ -330,7 +339,7 @@ def _initialize_parameters(self, parameters, src_tensors, aio_handle): if dist.get_rank() == 0 and SWAPPER_DEBUG_MODE: for i, tensor in enumerate(src_tensors): logger.info( - f'copy_in_fp16_param: fp32_id = {id(parameters[i])} index = {i}, swap_num_elem = {src_tensors[i].numel()}' + f'copy_in_fp16_param: fp32_id = {OptimizerSwapper.parameter_id(parameters[i])} index = {i}, swap_num_elem = {src_tensors[i].numel()}' ) self.swap_buffer_manager.free(pinned_buffers) @@ -420,8 +429,9 @@ def _get_state_tensors(self, parameter): return [] tensor_list = [] - for value in self.optimizer.state[parameter].values(): + for state_name, value in self.optimizer.state[parameter].items(): if torch.is_tensor(value): + value.ds_id = state_name + '-' + parameter.ds_id tensor_list.append(value) return tensor_list @@ -433,7 +443,7 @@ def _update_param_state_info(self, swap_info, parameter): swap_info.add_state_tensors(state_tensors) def _create_param_swap_info(self, parameter, numel): - param_id = id(parameter) + param_id = OptimizerSwapper.parameter_id(parameter) assert not param_id in self.swap_params_info self.swap_params_info[param_id] = OptimizerStateSwapInfo(parameter=parameter, @@ -446,7 +456,7 @@ def _create_param_swap_info(self, parameter, numel): return swap_info def _get_param_swap_info(self, parameter): - param_id = id(parameter) + param_id = OptimizerSwapper.parameter_id(parameter) swap_info = self.swap_params_info.get(param_id, None) if swap_info is not None: diff --git a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py index e7bf06043fd7..8b6cbe8fbb51 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py @@ -33,9 +33,11 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume largest_numel, device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) + self.aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) # Overlap swapping out self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.aio_handle, @@ -185,7 +187,7 @@ def _separate_pinned_tensors(self, swap_info): return pinned_tensors, pinned_paths, unpinned_tensors, unpinned_paths def _swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): - swap_info = self.swap_params_info[id(parameter)] + swap_info = self.swap_params_info[OptimizerSwapper.parameter_id(parameter)] param_gradients = swap_info.swapped_gradients.values() swap_buffers = [gradient_tensor.narrow(0, grad.offset, grad.length) for grad in param_gradients] swap_paths = [grad.path for grad in param_gradients] @@ -203,7 +205,7 @@ def 
_swap_in_pinned_gradients(self, aio_handle, parameter, gradient_tensor): self._log_timers([SWAP_READ_GRADIENTS, SWAP_WAIT_GRADIENTS]) def _swap_in_gradients(self, aio_handle, parameter, dest_buffer): - swap_info = self.swap_params_info.get(id(parameter), None) + swap_info = self.swap_params_info.get(OptimizerSwapper.parameter_id(parameter), None) if not (swap_info and swap_info.has_gradients()): return diff --git a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py index fcc6a272883f..f80fe1501c00 100644 --- a/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py +++ b/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py @@ -13,6 +13,7 @@ from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import AsyncIOBuilder +from deepspeed.ops.op_builder import GDSBuilder from .constants import * from .utils import swap_in_tensors, swap_out_tensors, MIN_AIO_BYTES, AIO_ALIGNED_BYTES, print_object, SwapBufferPool @@ -37,8 +38,6 @@ class AsyncPartitionedParameterSwapper(object): def __init__(self, ds_config, model_dtype): - aio_op = AsyncIOBuilder().load(verbose=False) - self.aio_handle = aio_op.aio_handle self.dtype = model_dtype #set swap buffers, create aio handles @@ -93,9 +92,13 @@ def _configure_aio(self, ds_config): self.aio_config = ds_config.aio_config + self.use_gds = self.aio_config[AIO_USE_GDS] + self.aio_handle = GDSBuilder().load(verbose=False).gds_handle if self.use_gds else AsyncIOBuilder().load( + verbose=False).aio_handle + # Read/Write alignment for each thread during Intra-request parallelism self.min_aio_bytes = max(MIN_AIO_BYTES, self.aio_config[AIO_BLOCK_SIZE]) - self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_THREAD_COUNT] + self.aligned_bytes = AIO_ALIGNED_BYTES * self.aio_config[AIO_INTRA_OP_PARALLELISM] self.numel_alignment = self.aligned_bytes // self.swap_element_size self.elements_per_buffer = self.swap_config.buffer_size @@ -104,19 +107,31 @@ def _configure_aio(self, ds_config): self.available_buffer_ids = [i for i in range(self.param_buffer_count)] self.reserved_buffer_ids = [] - self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer * - self.param_buffer_count), - dtype=self.dtype, - requires_grad=False), - align_bytes=0) - - self.aio_read_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], self.aio_config[AIO_OVERLAP_EVENTS], - self.aio_config[AIO_THREAD_COUNT]) - - self.aio_write_handle = self.aio_handle(self.aio_config[AIO_BLOCK_SIZE], self.aio_config[AIO_QUEUE_DEPTH], - self.aio_config[AIO_SINGLE_SUBMIT], - self.aio_config[AIO_OVERLAP_EVENTS], self.aio_config[AIO_THREAD_COUNT]) + + self.aio_read_handle = self.aio_handle(block_size=self.aio_config[AIO_BLOCK_SIZE], + queue_depth=self.aio_config[AIO_QUEUE_DEPTH], + single_submit=self.aio_config[AIO_SINGLE_SUBMIT], + overlap_events=self.aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=self.aio_config[AIO_INTRA_OP_PARALLELISM]) + + self.aio_write_handle = self.aio_handle(block_size=self.aio_config[AIO_BLOCK_SIZE], + queue_depth=self.aio_config[AIO_QUEUE_DEPTH], + single_submit=self.aio_config[AIO_SINGLE_SUBMIT], + overlap_events=self.aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=self.aio_config[AIO_INTRA_OP_PARALLELISM]) + + if self.use_gds: + self.buffers = torch.empty(int(self.aligned_elements_per_buffer * self.param_buffer_count), + 
dtype=self.dtype, + device=get_accelerator().device_name(), + requires_grad=False) + self.aio_read_handle.pin_device_tensor(self.buffers) + else: + self.buffers = get_accelerator().pin_memory(torch.empty(int(self.aligned_elements_per_buffer * + self.param_buffer_count), + dtype=self.dtype, + requires_grad=False), + align_bytes=0) self.swap_out_params = [] diff --git a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py index cb00e3dc2fad..8f6d72e35f63 100644 --- a/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py +++ b/deepspeed/runtime/swap_tensor/pipelined_optimizer_swapper.py @@ -8,6 +8,7 @@ from deepspeed.ops.op_builder import AsyncIOBuilder from deepspeed import comm as dist +import torch from deepspeed.runtime.swap_tensor.constants import * from deepspeed.runtime.swap_tensor.utils import swap_in_tensors, swap_out_tensors, print_object @@ -28,7 +29,7 @@ def __init__(self, aio_handle, read_op, param_info, allocated_buffers, state_buf self.num_ops = num_ops def is_parameter(self, parameter): - return id(parameter) == self.param_info.param_id + return OptimizerSwapper.parameter_id(parameter) == self.param_info.param_id def wait(self): assert self.wait_required @@ -55,13 +56,17 @@ def __init__(self, swap_config, aio_config, base_folder, optimizer, largest_nume device, dtype, timers) aio_op = AsyncIOBuilder().load() - self.write_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) - - self.read_aio_handle = aio_op.aio_handle(aio_config[AIO_BLOCK_SIZE], aio_config[AIO_QUEUE_DEPTH], - aio_config[AIO_SINGLE_SUBMIT], aio_config[AIO_OVERLAP_EVENTS], - aio_config[AIO_THREAD_COUNT]) + self.write_aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) + + self.read_aio_handle = aio_op.aio_handle(block_size=aio_config[AIO_BLOCK_SIZE], + queue_depth=aio_config[AIO_QUEUE_DEPTH], + single_submit=aio_config[AIO_SINGLE_SUBMIT], + overlap_events=aio_config[AIO_OVERLAP_EVENTS], + intra_op_parallelism=aio_config[AIO_INTRA_OP_PARALLELISM]) # Overlap gradient swap out self.gradient_swapper = AsyncTensorSwapper(aio_handle=self.write_aio_handle, @@ -154,6 +159,8 @@ def swap_out_gradients(self, parameter, gradient_offsets, gradient_tensors): def _complete_swap_out(self, swap_out_type): self.swap_ops[swap_out_type].wait() + for buffer in self.swap_ops[swap_out_type].state_buffers: + buffer = torch.Tensor() self.swap_buffer_manager.free(self.swap_ops[swap_out_type].allocated_buffers) self.swap_ops[swap_out_type] = None diff --git a/deepspeed/runtime/swap_tensor/utils.py b/deepspeed/runtime/swap_tensor/utils.py index 90b2d9b8bd31..1f9825c34638 100644 --- a/deepspeed/runtime/swap_tensor/utils.py +++ b/deepspeed/runtime/swap_tensor/utils.py @@ -18,12 +18,12 @@ def swap_in_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pread(buffer, path) == 0) + assert (swap_handle.async_pread(buffer, path, 0) == 0) def swap_out_tensors(swap_handle, tensor_buffers, swap_paths): for buffer, path in zip(tensor_buffers, swap_paths): - assert (swap_handle.async_pwrite(buffer, path) == 0) + assert (swap_handle.async_pwrite(buffer, 
path, 0) == 0) def print_object(obj, name, exclude_list=[]): diff --git a/deepspeed/runtime/tensor_parallel/__init__.py b/deepspeed/runtime/tensor_parallel/__init__.py new file mode 100644 index 000000000000..388239345351 --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .config import AUTOTP_MODE, get_tensor_parallel_config +from .tp_manager import TpTrainingManager diff --git a/deepspeed/runtime/tensor_parallel/config.py b/deepspeed/runtime/tensor_parallel/config.py new file mode 100644 index 000000000000..1300bf9323cd --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/config.py @@ -0,0 +1,81 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from enum import Enum +from deepspeed.runtime.config_utils import DeepSpeedConfigModel +import torch +from pydantic import Field +from typing import Optional + + +class AUTOTP_MODE(Enum): + TRAINING = "TRAINING" + INFERENCE = "INFERENCE" + + +class TPConfig(DeepSpeedConfigModel): + """ Configure tensor parallelism settings """ + + tp_size: int = 1 + """ Number of devices to split the model across using tensor parallelism. """ + + tp_grain_size: int = 1 + "The variable required by the autoTP parser has not been activated in training yet" + "as it depends on the gather logic that supports uneven partitioning. " + "Desired MLP/lm_head tp size granularity. DNN library favors tensor size in granularity of power of 2, we pick 64 as a default size." + + mpu: object = None + """ + A model parallelism unit object that implements + ``get_{model,data}_parallel_{rank,group,world_size}()``. + """ + + tp_group: object = None + + +class TPTrainingConfig(DeepSpeedConfigModel): + + dtype: torch.dtype = torch.float16 + """ + Desired model data type, will convert model to this type. + """ + + autotp_size: int = 0 + """ + In automatic tensor-parallelism training, 'tensor_parallel_size' + When set to 0, indicates that it is disabled. + """ + tensor_parallel: TPConfig = Field({}, alias="tp") + """ + Configuration for tensor parallelism used to split the model across several + GPUs. Expects a dictionary containing values for :any:`DeepSpeedTPConfig`. + """ + + injection_policy_tuple: Optional[tuple] = None + #The following parameters are required by autoTP parser. + ######################################## + keep_module_on_host: bool = False + """ + When loading checkpoints to model parameters, they are moved to the device. In very large models + this might fill the device and cause OOM. Setting this flag to true, will keep checkpoints on + host and not move them directly to the device (giving an option to quantize checkpoint data before + moving it to the device for example). + """ + + replace_with_kernel_inject: bool = Field(False, alias="kernel_inject") + """ + Set to true to inject inference kernels for models such as, Bert, GPT2, + GPT-Neo and GPT-J. 
Otherwise, the injection_dict provides the names of two + linear layers as a tuple: + `(attention_output projection, transformer output projection)` + """ + ######################################## + + +def get_tensor_parallel_config(ds_config): + + if 'tensor_parallel' in ds_config: + return TPTrainingConfig(**ds_config['tensor_parallel']) + return TPTrainingConfig() diff --git a/deepspeed/runtime/tensor_parallel/tp_manager.py b/deepspeed/runtime/tensor_parallel/tp_manager.py new file mode 100644 index 000000000000..cf0b5a75c92a --- /dev/null +++ b/deepspeed/runtime/tensor_parallel/tp_manager.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from .config import TPTrainingConfig, TPConfig +from deepspeed.utils import groups +import deepspeed.comm as dist + + +class TpTrainingManager(): + + def __init__(self, model, tp_size, dtype): + self.module = model + self.config = self._initialize_config(dtype) + + from deepspeed.module_inject.auto_tp import AutoTP + from deepspeed import get_accelerator + + # Parse model configuration + parser_dict = AutoTP.tp_parser(model) + print("AutoTP: ", parser_dict) + + # Initialize TP configuration and model + self._initialize_tp_config(tp_size) + self._get_model_config_generate() + + # Synchronize random number generator state across devices + _rng_state = get_accelerator().get_rng_state().to(get_accelerator().current_device_name()) + dist.broadcast(_rng_state, groups.get_tensor_model_parallel_src_rank(), self.tp_config.tp_group) + get_accelerator().set_rng_state(_rng_state.cpu()) + + # Apply injection policies + self._apply_policies(parser_dict) + + def _initialize_config(self, dtype): + """Initialize and return the DeepSpeed TP training configuration.""" + config = TPTrainingConfig() + config.dtype = dtype + return config + + def _apply_policies(self, parser_dict): + """Apply injection policies to the parsed modules.""" + for client_module, injection_policy in parser_dict: + self.config.injection_policy_tuple = injection_policy + self._apply_injection_policy(self.config, client_module) + + def _apply_injection_policy(self, config, client_module=None): + from deepspeed.module_inject import replace_transformer_layer + """Apply the given injection policy to a client module.""" + if isinstance(self.module, torch.nn.Module): + replace_transformer_layer(client_module, self.module, None, self.config, self.model_config) + + def _initialize_tp_config(self, tp_size): + """Perform TP configuration initialization.""" + self.tp_config = TPConfig() + self.tp_config.tp_size = tp_size + + groups._init_tp_mesh_device(tp_size) + self.tp_config.tp_group = groups.get_tensor_model_parallel_group() + self.config.tensor_parallel = self.tp_config + + def _get_model_config_generate(self): + """Generate and apply HF model configuration.""" + self.model_config = getattr(self.module, 'config', None) diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py index 108c7775530b..9fd7a65a53ba 100755 --- a/deepspeed/runtime/utils.py +++ b/deepspeed/runtime/utils.py @@ -9,29 +9,28 @@ """ from collections.abc import Iterable -from deepspeed.moe.utils import is_moe_param import os import psutil import gc from math import sqrt -from bisect import bisect_left -from packaging import version as pkg_version -import torch -from deepspeed import comm as dist +from numpy import prod +import torch +from torch.nn import functional as F try: from torch._six import inf except ModuleNotFoundError: from torch 
import inf - +from typing import Union, List, Dict +from deepspeed import comm as dist +from deepspeed.moe.utils import is_moe_param from deepspeed.utils import groups, logger +from deepspeed.utils.bwc import (bwc_tensor_model_parallel_rank, bwc_pipeline_parallel_world_size, + bwc_pipeline_parallel_group) from deepspeed.runtime.constants import PIPE_REPLICATED -from numpy import prod from deepspeed.accelerator import get_accelerator - from deepspeed.module_inject.policy import transpose -from torch.nn import functional as F torch_memory_reserved = get_accelerator().memory_reserved torch_max_memory_reserved = get_accelerator().max_memory_reserved @@ -48,6 +47,27 @@ def __init__(self, params): self.param_groups.append({'params': params}) +graph_cache = {} + + +def graph_process(replay_first_step, func, *args, **kwargs): + # `func` should only contain operations on the GPU + # Please ensure that the memory address of the data required by 'func' remains constant + if func.__name__ not in graph_cache: + cuda_stream = get_accelerator().Stream() + cuda_stream.wait_stream(get_accelerator().current_stream()) + with get_accelerator().stream(cuda_stream): + func(*args, **kwargs) + get_accelerator().current_stream().wait_stream(cuda_stream) + graph_cache[func.__name__] = get_accelerator().create_graph() + with get_accelerator().capture_to_graph(graph_cache[func.__name__]): + func(*args, **kwargs) + if replay_first_step: + get_accelerator().replay_graph(graph_cache[func.__name__]) + else: + get_accelerator().replay_graph(graph_cache[func.__name__]) + + def noop_decorator(func): return func @@ -97,44 +117,6 @@ def is_model_parallel_parameter(p) -> bool: return False -def bwc_tensor_model_parallel_rank(mpu=None): - """Backwards-compatible way of querying the tensor model parallel rank from - an ``mpu`` object. - - *Tensor* model parallelism means that tensors are physically split across - processes. This contrasts with *pipeline* model parallelism, in which the - layers are partitioned but tensors left intact. - - The API for tensor model parallelism has changed across versions and this - helper provides a best-effort implementation across versions of ``mpu`` - objects. The preferred mechanism is - ``mpu.get_tensor_model_parallel_rank()``. - - This should "just work" with both Megatron-LM and DeepSpeed's pipeline - parallelism. - - Args: - mpu (model parallel unit, optional): The tensor model parallel rank. - If ``mpu=None``, returns 0. Defaults to ``None``. - - Returns: - int: the rank - """ - if mpu is None: - # No model parallelism in easy :) - return 0 - - if hasattr(mpu, 'get_tensor_model_parallel_rank'): - # New Megatron and DeepSpeed convention (post pipeline-parallelism release) - return mpu.get_tensor_model_parallel_rank() - elif hasattr(mpu, 'get_slice_parallel_rank'): - # Some DeepSpeed + pipeline parallelism versions - return mpu.get_slice_parallel_rank() - else: - # Deprecated Megatron and DeepSpeed convention - return mpu.get_model_parallel_rank() - - def copy_to_device(item, device, criterion_func): """ Return a copy of tensor on specified device. @@ -185,6 +167,17 @@ def move_to_device(item, device, criterion_func): return item +def get_norm_with_moe_layers_fast(all_groups_norm, group): + # This implementation standardizes the grad_norm across ranks. A more precise implementation can be found in 'get_norm_with_moe_layers'. 
+ # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce + scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=group)) + scaled_norm_tensor = torch.tensor(scaled_norm, device=get_accelerator().current_device_name(), dtype=torch.float) + dist.all_reduce(scaled_norm_tensor, group=group) + all_groups_norm = scaled_norm_tensor.item() + #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {deepspeed.comm.get_rank()}") + return all_groups_norm + + class CheckOverflow(object): '''Checks for overflow in gradient across parallel process''' @@ -264,8 +257,8 @@ def has_overflow(self, params, has_moe_params=None): elif self.mpu is not None: if self.deepspeed is not None: using_pipeline = hasattr(self.deepspeed, 'pipeline_enable_backward_allreduce') - if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce is False) or ( - not using_pipeline and self.deepspeed.enable_backward_allreduce is False): + if (using_pipeline and self.deepspeed.pipeline_enable_backward_allreduce + is False) or (not using_pipeline and self.deepspeed.enable_backward_allreduce is False): dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_data_parallel_group()) dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.mpu.get_model_parallel_group()) elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False: @@ -343,48 +336,55 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None): if isinstance(parameters, torch.Tensor): parameters = [parameters] parameters = list(filter(lambda p: p.grad is not None, parameters)) - max_norm = float(max_norm) norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(p.grad.data.abs().max() for p in parameters) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for p in parameters: + all_norms.append(p.grad.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + total_norm = total_norm.to(get_accelerator().current_device_name()) # Take max across all GPUs. if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) else: total_norm = 0 for p in parameters: if mpu is not None: if (mpu.get_model_parallel_rank() == 0) or is_model_parallel_parameter(p): - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item()**norm_type + param_norm = p.grad.data.detach().float().norm(norm_type) + all_norms.append(param_norm) else: - param_norm = p.grad.data.float().norm(norm_type) - total_norm += param_norm.item()**norm_type - + param_norm = p.grad.data.detach().float().norm(norm_type) + all_norms.append(param_norm) + if len(all_norms) > 0: + total_norm = torch.stack(all_norms).square().sum().float() + else: + total_norm = get_accelerator().FloatTensor([0.0]) + total_norm = total_norm.to(get_accelerator().current_device_name()) # Sum across all model parallel GPUs. - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) + total_norm = total_norm.pow(1. 
/ norm_type) # Need to average total_norm across different GPUs due to the presence of moe params pg = groups._get_data_parallel_group() scaled_norm = total_norm * 1.0 / float(dist.get_world_size(group=pg)) + scaled_norm_tensor = scaled_norm - scaled_norm_tensor = get_accelerator().FloatTensor([float(scaled_norm)]) dist.all_reduce(scaled_norm_tensor, group=pg) - total_norm = scaled_norm_tensor.item() + total_norm = scaled_norm_tensor + total_norm = total_norm.to(parameters[0].device) + max_norm = torch.tensor([float(max_norm)], device=total_norm.device) clip_coef = max_norm / (total_norm + 1e-6) - if clip_coef < 1: - for p in parameters: - p.grad.data.mul_(clip_coef) + tmp_tensor = torch.tensor([1.0], device=clip_coef.device) + clip_coef = torch.min(tmp_tensor, clip_coef) + for p in parameters: + p.grad.data.mul_(clip_coef) return total_norm -def get_grad_norm(parameters, norm_type=2, mpu=None): +def get_flattened_grad_norm(parameters, norm_type=2, mpu=None, grad_norm_mask=None): """Get grad norm of an iterable of parameters. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and @@ -396,7 +396,8 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): single Tensor that will have gradients normalized norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. - + grad_norm_mask (List[Tensor]): A list of Tensor, where + each Tensor is a 2D Tensor containing ranges of [start_index, end_index]. Returns: Total norm of the parameters (viewed as a single vector). """ @@ -414,18 +415,27 @@ def get_grad_norm(parameters, norm_type=2, mpu=None): total_norm = total_norm_cuda[0].item() else: total_norm = 0. - tensor_mp_rank = bwc_tensor_model_parallel_rank(mpu=mpu) - for p in parameters: - # Pipeline parallelism may replicate parameters. Avoid multi-counting. - if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: - continue - - # Filter to avoid over-counting replicated tensors from tensor - # model parallelism - if (tensor_mp_rank > 0) and not is_model_parallel_parameter(p): - continue + for idx, p in enumerate(parameters): + # Use grad_norm_mask to avoid redundant computation of flattened gradient norm + if grad_norm_mask is not None and len(grad_norm_mask[idx]) > 0: + + # A loop-free implementation to create a mask tensor based on a range list + # which is logically equivalent to the following implementation. + # # mask_tensor_ = torch.zeros_like(p, device=p.device, dtype=bool) + # # for mask_idx in grad_norm_mask[idx]: + # # mask_tensor_[mask_idx[0]:mask_idx[1]] = True + cum_sum_pairs = torch.tensor([1, -1], device=get_accelerator().current_device_name(), + dtype=p.dtype).repeat(grad_norm_mask[idx].shape[0], 1) + mask_tensor = torch.zeros(p.shape[0] + 1, + device=get_accelerator().current_device_name(), + dtype=p.dtype) + mask_tensor = mask_tensor.scatter_(0, grad_norm_mask[idx].view(-1), + cum_sum_pairs.view(-1)).cumsum(0).bool()[:-1] + + param_norm = torch.masked_fill(p.grad.data, mask_tensor, 0).float().norm(norm_type) - param_norm = p.grad.data.float().norm(norm_type) + else: + param_norm = p.grad.data.float().norm(norm_type) total_norm += param_norm.item()**norm_type # Sum across all model parallel GPUs. 
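
The `grad_norm_mask` path above builds a boolean mask from a list of [start_index, end_index) ranges without a Python loop, using `scatter_` followed by `cumsum`; positions covered by the ranges are then zeroed with `masked_fill` so they are excluded from the flattened gradient norm. The following standalone sketch (CPU, integer dtype, made-up ranges; the helper names are illustrative and not part of this patch) reproduces the trick and checks it against the explicit loop quoted in the comment above:

    import torch

    def mask_from_ranges(numel, ranges):
        # ranges: (N, 2) tensor of [start, end) index pairs into a flat tensor.
        # Scatter +1 at every start and -1 at every end, then a running sum marks
        # each position that falls inside one of the ranges.
        # (Assumes range endpoints are distinct, since scatter_ overwrites duplicates.)
        deltas = torch.tensor([1, -1], dtype=torch.long).repeat(ranges.shape[0], 1)
        acc = torch.zeros(numel + 1, dtype=torch.long)
        acc = acc.scatter_(0, ranges.view(-1), deltas.view(-1))
        return acc.cumsum(0).bool()[:-1]

    def mask_from_ranges_loop(numel, ranges):
        # Reference implementation, equivalent to the loop shown in the comment above.
        mask = torch.zeros(numel, dtype=torch.bool)
        for start, end in ranges.tolist():
            mask[start:end] = True
        return mask

    ranges = torch.tensor([[2, 5], [8, 10]])
    assert torch.equal(mask_from_ranges(12, ranges), mask_from_ranges_loop(12, ranges))

    # Norm that skips the masked ranges, mirroring the patched code path:
    grad = torch.randn(12)
    partial_norm = torch.masked_fill(grad, mask_from_ranges(12, ranges), 0).norm(2)
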
@@ -570,67 +580,43 @@ def partition_uniform(num_items, num_parts): return parts -def _lprobe(weights, num_parts, bottleneck): - num_items = len(weights) - total_weight = weights[-1] - - # initialize partitioning - parts = [0] * (num_parts + 1) - for p in range(1, num_parts + 1): - parts[p] = num_items - - bsum = bottleneck # running sum of target weight for pth partition - chunksize = num_items // num_parts - step = chunksize - for p in range(1, num_parts): - # Jump to the next bucket - while (step < num_items) and (weights[step] < bsum): - step += chunksize - - # Find the end index of partition p - parts[p] = bisect_left(weights, bsum, lo=step - chunksize, hi=min(step, num_items)) - # Nothing more to partition, return early - if parts[p] == num_items: - # See if the current partition is overweight. - part_size = weights[-1] - weights[parts[p - 1]] - return parts, part_size < bottleneck - - # Next partition target - bsum = weights[parts[p] - 1] + bottleneck - - return parts, bsum >= total_weight - - -def _rb_partition_balanced(weights, num_parts, eps): - total_weight = weights[-1] - lower = total_weight / num_parts # best case heaviest partition - upper = total_weight # worst case heaviest partition - - # Do a binary search for the best partitioning - while upper > lower + eps: - mid = lower + ((upper - lower) / 2) - parts, success = _lprobe(weights, num_parts, mid) - if success: - upper = mid - else: - lower = mid + eps - return upper - - -def partition_balanced(weights, num_parts, eps=1e-3): - num_items = len(weights) - # First check for the trivial edge case - if num_items <= num_parts: - return partition_uniform(num_items, num_parts) - - weights_ = prefix_sum_inc(weights) - - # Find the smallest bottleneck (weight of heaviest partition) - bottleneck = _rb_partition_balanced(weights_, num_parts, eps=eps) - - # Now compute that partitioning - parts, success = _lprobe(weights_, num_parts, bottleneck) - assert success +def partition_balanced(weights, num_parts): + """ + use dynamic programming solve `The Linear Partition Problem`. 
+ see https://www8.cs.umu.se/kurser/TDBAfl/VT06/algorithms/BOOK/BOOK2/NODE45.HTM + """ + import numpy as np + n = len(weights) + m = num_parts + + if n <= m: + return partition_uniform(n, m) + + dp_max = np.full((n + 1, m + 1), np.inf) + dp_min = np.full((n + 1, m + 1), np.inf) + dp_cost = np.full((n + 1, m + 1), np.inf) + position = np.zeros((n + 1, m + 1), dtype=int) + prefix_sum = np.zeros((n + 1)) + prefix_sum[1:] = np.cumsum(weights) + + dp_max[0, 0] = 0 + dp_cost[0, 0] = 0 + for i in range(1, n + 1): + for j in range(1, min(i, m) + 1): + for k in range(i): + max_sum = max(dp_max[k, j - 1], prefix_sum[i] - prefix_sum[k]) + min_sum = min(dp_min[k, j - 1], prefix_sum[i] - prefix_sum[k]) + cost = max_sum - min_sum + if dp_cost[i, j] >= cost: + dp_cost[i, j] = cost + dp_max[i, j] = max_sum + dp_min[i, j] = min_sum + position[i, j] = k + + parts = [n] + for i in reversed(range(1, m + 1)): + parts.append(position[parts[-1], i]) + parts.reverse() return parts @@ -643,10 +629,10 @@ def __init__(self, tensor, group, partition_meta=None): self.group = group self.num_parts = dist.get_world_size(group=self.group) self.rank = dist.get_rank(group=self.group) - self.orig_size = list(tensor.size()) self.orig_device = tensor.device self.local_data, self.partition = self._partition_tensor(tensor) + self.even_split = tensor.numel() % self.num_parts == 0 @classmethod def from_meta(cls, meta, local_part, group, device=get_accelerator().device_name()): @@ -689,23 +675,16 @@ def full(self, device=None): # Allocate the full tensor as a flat buffer. full_numel = prod(self.full_size()) flat_tensor = torch.zeros([full_numel], dtype=self.local_data.dtype, device=device) - - # Prepare all-gather buffer - partition_tensors = [] - for part_id in range(self.num_parts): - part_size = self.partition[part_id + 1] - self.partition[part_id] - buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size) - if part_id == self.rank: - buf.copy_(self.local_data) - partition_tensors.append(buf) - - # Collect the full tensor - dist.all_gather(partition_tensors, partition_tensors[self.rank], group=self.group) - - for i in range(len(partition_tensors)): - partition_tensors[i].data = torch.zeros(1) - partition_tensors[i] = None - + if self.even_split: + # Collect the full tensor + dist.all_gather_into_tensor(flat_tensor, self.local_data, group=self.group) + else: + for part_id in range(self.num_parts): + part_size = self.partition[part_id + 1] - self.partition[part_id] + buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size) + if part_id == self.rank: + buf.copy_(self.local_data) + dist.broadcast(buf, part_id, self.group) return flat_tensor.view(self.full_size()).clone().detach() def to_meta(self): @@ -844,26 +823,7 @@ def get_only_unique_item(items): return unique_item -def clip_gradients(parameters, max_norm=1.0, global_grad_norm=None, mpu=None, eps=1e-6): - """Clip the gradient of a list of parameters. - Args: - parameters: List of parameters whose .grad will be clipped. - global_grad_norm (float, optional): Precomputed gradient norm. Defaults to None. - mpu (optional): model parallelism unit. Defaults to None. - eps (float, optional): epsilon value added to grad norm. 
Defaults to 1e-6 - Returns: - float: the global gradient norm - """ - if global_grad_norm is None: - global_grad_norm = get_grad_norm(parameters, mpu=mpu) - clip_coef = max_norm / (global_grad_norm + eps) - if clip_coef < 1: - for p in parameters: - p.grad.detach().mul_(clip_coef) - return global_grad_norm - - -def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): +def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None, use_graph=False, moe_ep_group=None): """Get norm of an iterable of tensors. This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and @@ -877,31 +837,73 @@ def get_global_norm_of_tensors(input_tensors, norm_type=2, mpu=None): Returns: Total norm of the tensors (viewed as a single vector). """ - assert isinstance(input_tensors, Iterable), f'expected Iterable type not {type(input_tensors)}' assert all([torch.is_tensor(t) for t in input_tensors]), f'expected list of only tensors' norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(t.data.abs().max() for t in input_tensors) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + for t in input_tensors: + all_norms.append(t.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + device_total_norm = total_norm.to(get_accelerator().current_device_name()) + # Max across model parallel if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item() + # For MoE grads, max over model parallel only if MoE-TP is enabled + if moe_ep_group is None or groups._get_expert_model_parallel_world_size() > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=mpu.get_model_parallel_group()) + # If MoE grads and MoE-TP disabled, max over pipeline parallel + elif bwc_pipeline_parallel_world_size(mpu) > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=bwc_pipeline_parallel_group(mpu)) + + # MoE grads: max across expert parallel group + if moe_ep_group is not None: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.MAX, group=moe_ep_group) + total_norm = device_total_norm.to(input_tensors[0].device) else: - total_norm = sum([t.data.float().norm(norm_type).item()**norm_type for t in input_tensors]) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) + + if 'norm_tensors_compute_buffer' not in graph_cache or len( + graph_cache['norm_tensors_compute_buffer']) != len(input_tensors): + graph_cache['norm_tensors_compute_buffer'] = [ + torch.empty([], dtype=torch.float, device=get_accelerator().current_device_name()) + for t in input_tensors + ] + compute_buffer = graph_cache['norm_tensors_compute_buffer'] + + def _norm_tensors(tensor_list, _compute_buffer, _norm_type): + for i, t in enumerate(tensor_list): + _compute_buffer[i].data.copy_(t.data.float().norm(_norm_type)**_norm_type) + if i != 0: + _compute_buffer[0].data.add_(_compute_buffer[i].data) + + if use_graph: + graph_process(False, _norm_tensors, input_tensors, compute_buffer, norm_type) + else: + _norm_tensors(input_tensors, compute_buffer, norm_type) + + device_total_norm = compute_buffer[0].float().detach() + + # Sum across model parallel if mpu is not None: - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) - total_norm = total_norm_cuda[0].item()**(1. 
/ norm_type) + # For MoE grads, sum over model parallel only if MoE-TP is enabled + if moe_ep_group is None or groups._get_expert_model_parallel_world_size() > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=mpu.get_model_parallel_group()) + # If MoE grads and MoE-TP disabled, sum over pipeline parallel + elif bwc_pipeline_parallel_world_size(mpu) > 1: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=bwc_pipeline_parallel_group(mpu)) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + # MoE grads: sum across expert parallel group + if moe_ep_group is not None: + dist.all_reduce(device_total_norm, op=dist.ReduceOp.SUM, group=moe_ep_group) + total_norm = device_total_norm.to(input_tensors[0].device).pow(1. / norm_type) + + inf_or_nan = total_norm.isinf().logical_or(total_norm.isnan()) + total_norm.masked_fill_(inf_or_nan, -1) return total_norm -def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, mpu=None, eps=1e-6): +def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, mpu=None, eps=1e-6, use_graph=False): """Clip list of tensors by global norm. Args: input_tensors: List of tensors to be clipped @@ -912,14 +914,26 @@ def clip_tensors_by_global_norm(input_tensors, max_norm=1.0, global_norm=None, m float: the global norm """ if global_norm is None: - global_norm = get_global_norm_of_tensors(input_tensors, mpu=mpu) - + global_norm = get_global_norm_of_tensors(input_tensors, mpu=mpu, use_graph=use_graph) clip_coef = max_norm / (global_norm + eps) - if clip_coef < 1: - for t in input_tensors: - t.detach().mul_(clip_coef) + if use_graph: + + def clip_tensors(_tensor_list, _clip_coef_tensor): + for t in _tensor_list: + t.detach().mul_(_clip_coef_tensor) + + if 'clip_coef_tensor' not in graph_cache: + # Alloc memory + graph_cache['clip_coef_tensor'] = torch.tensor(clip_coef, + dtype=torch.float32).to(get_accelerator().device_name()) + clip_coef_tensor = graph_cache['clip_coef_tensor'] + clip_coef_tensor.copy_(torch.tensor(clip_coef, dtype=torch.float32)) + graph_process(False, clip_tensors, input_tensors, clip_coef_tensor) + else: + for t in input_tensors: + t.detach().mul_(clip_coef) return global_norm @@ -937,16 +951,22 @@ def align_dense_tensors(tensor_list, alignment): return padded_tensor_list -def all_gather_all_partitions(global_flatten_group, partitioned_param_groups, dp_process_group): - for group_id, partitioned_params in enumerate(partitioned_param_groups): - # Sequential AllGather Best of both worlds +def all_gather_into_tensor_dp_groups(groups_flat, partitioned_param_groups, dp_process_group): + for group_id, (group_flat, partitioned_params) in enumerate(zip(groups_flat, partitioned_param_groups)): partition_id = dist.get_rank(group=dp_process_group[group_id]) dp_world_size = dist.get_world_size(group=dp_process_group[group_id]) - dist.all_gather_into_tensor(global_flatten_group[group_id], partitioned_params[partition_id], - dp_process_group[group_id]) + if dp_world_size == 1: + # no groups share optimizer states + # pipeline parallel with bf16 will default call this even if dp size = 1. 
+ continue + dist.all_gather_into_tensor(group_flat, partitioned_params[partition_id], dp_process_group[group_id]) + +def all_gather_dp_groups(groups_flat, partitioned_param_groups, dp_process_group, start_alignment_factor, + allgather_bucket_size): + if dist.has_all_gather_into_tensor(): + return all_gather_into_tensor_dp_groups(groups_flat, partitioned_param_groups, dp_process_group) -def all_gather_dp_groups(partitioned_param_groups, dp_process_group, start_alignment_factor, allgather_bucket_size): for group_id, partitioned_params in enumerate(partitioned_param_groups): # Sequential AllGather Best of both worlds partition_id = dist.get_rank(group=dp_process_group[group_id]) @@ -1005,15 +1025,123 @@ def get_inactive_params(param_list): param.ds_status == ZeroParamStatus.NOT_AVAILABLE)] -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" +def get_norm_with_moe_layers(non_expert_norm, mpu, expert_tensors, norm_type=2): + """ Compute the global norm with MoE experts - torch_version = pkg_version.parse(torch.__version__) + Inputs: + non_expert_norm (float) : the calculated norm of the non-expert params + expert_tensors (Dict[ep_name, List[Tensor]): Dictionary of expert group name to list of grad tensors + norm_type (int): the norm to use - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False + Returns: + if norm is (-/+) inf, returns -1 + otherwise the global norm (float) + """ + + def to_tensor(v): + return get_accelerator().FloatTensor(float(v)).detach() + + group_norms = [non_expert_norm] + for exp_name, tensors in expert_tensors.items(): + group_norm = get_global_norm_of_tensors(input_tensors=tensors, + mpu=mpu, + norm_type=norm_type, + use_graph=False, + moe_ep_group=groups._get_expert_parallel_group(exp_name)) + group_norms.append(group_norm) + + # check if all norms are valid + group_norms = torch.stack([to_tensor(norm) for norm in group_norms]) + if group_norms.eq(-1).any(): + return -1 + + # combine norms + if norm_type == inf: + total_norm = group_norms.max().item() + else: + total_norm = group_norms.pow(norm_type).sum() + total_norm = total_norm.item()**(1. / norm_type) + if total_norm == float('inf') or total_norm == -float('inf'): + total_norm = -1 + + return total_norm + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") - if max_version and pkg_version.parse(str(max_version)) < torch_version: + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. 
Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") + + +def compare_tensors_in_structures(inputs1: Union[List, Dict], inputs2: Union[List, Dict]) -> bool: + """ + Compare two lists or dictionaries for equality, including any tensors they may contain. + + Args: + inputs1: First input, either a list or a dictionary. + inputs2: Second input, either a list or a dictionary. + + Returns: + True if inputs1 and inputs2 are equal; False otherwise. + """ + if type(inputs1) != type(inputs2): # Ensure types match return False - return True + if isinstance(inputs1, list) and isinstance(inputs2, list): + if len(inputs1) != len(inputs2): + return False + for val1, val2 in zip(inputs1, inputs2): + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + val1 = val1.to(get_accelerator().current_device()) + val2 = val2.to(get_accelerator().current_device()) + if not torch.equal(val1, val2): + return False + elif val1 != val2: + return False + return True + + elif isinstance(inputs1, dict) and isinstance(inputs2, dict): + if inputs1.keys() != inputs2.keys(): + return False + for key in inputs1: + val1, val2 = inputs1[key], inputs2[key] + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + val1 = val1.to(get_accelerator().current_device()) + val2 = val2.to(get_accelerator().current_device()) + if not torch.equal(val1, val2): + return False + elif val1 != val2: + return False + return True + + return False diff --git a/deepspeed/runtime/zero/__init__.py b/deepspeed/runtime/zero/__init__.py index 1ccca09a9e69..23fcf9ec13fb 100644 --- a/deepspeed/runtime/zero/__init__.py +++ b/deepspeed/runtime/zero/__init__.py @@ -13,3 +13,5 @@ from .tiling import TiledLinearReturnBias from .mics import MiCS_Init + +from .stage3 import unwrap_model_for_generation diff --git a/deepspeed/runtime/zero/config.py b/deepspeed/runtime/zero/config.py index 76583c129cb9..19ee9b51702e 100644 --- a/deepspeed/runtime/zero/config.py +++ b/deepspeed/runtime/zero/config.py @@ -4,9 +4,9 @@ # DeepSpeed Team import sys -from typing import Optional +from typing import Optional, Dict, Any from enum import Enum -from deepspeed.pydantic_v1 import Field, validator, root_validator +from pydantic import Field, model_validator from deepspeed.runtime.config_utils import get_scalar_param, pp_int, DeepSpeedConfigModel from deepspeed.utils import logger from .offload_config import DeepSpeedZeroOffloadParamConfig, DeepSpeedZeroOffloadOptimizerConfig, OffloadDeviceEnum @@ -20,6 +20,8 @@ "stage": [0|1|2], "stage3_max_live_parameters" : 1000000000, "stage3_max_reuse_distance" : 1000000000, + "stage3_use_all_reduce_for_fetch_params": [true|false], + "stage3_module_granularity_threshold": 0, "allgather_partitions": [true|false], "use_multi_rank_bucket_allreduce": [true|false], "allgather_bucket_size": 500000000, @@ -29,7 +31,7 @@ "reduce_bucket_size": 500000000, "load_from_fp32_weights": [true|false], "cpu_offload": [true|false] (deprecated), - "cpu_offload_params" : [true|false] (deprecated), + "cpu_offload_param" : [true|false] (deprecated), "cpu_offload_use_pin_memory": [true|false] (deprecated), "sub_group_size" : 1000000000000, "offload_param": {...}, @@ -42,6 +44,8 @@ "zero_quantized_gradients": [true|false], 
"memory_efficient_linear": [true|false], "override_module_apply": [true|false], + "zeropp_loco_param": {...}, + "log_trace_cache_warnings" : [true|false], } } """ @@ -127,7 +131,7 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): the allgather for large model sizes """ - overlap_comm: bool = None # None for dynamic default value (see validator `overlap_comm_valid` below) + overlap_comm: Optional[bool] = None # None for dynamic default value (see validator `overlap_comm_valid` below) """ Attempts to overlap the reduction of the gradients with backward computation """ @@ -167,27 +171,37 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): parameters). Used by ZeRO3-Offload and ZeRO-Infinity """ - cpu_offload_param: bool = Field( + cpu_offload_param: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_param", - new_param_fn=(lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) if val else None), + json_schema_extra={ + "deprecated": True, + "new_param": "offload_param", + "new_param_fn": (lambda val: DeepSpeedZeroOffloadParamConfig(device=OffloadDeviceEnum.cpu) + if val else None) + }, ) """ Deprecated, please use ``offload_param`` """ - cpu_offload_use_pin_memory: bool = Field( + cpu_offload_use_pin_memory: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_param or offload_optimizer", - set_new_param=False, + json_schema_extra={ + "deprecated": True, + "new_param": "offload_param or offload_optimizer", + "set_new_param": False + }, ) """ Deprecated, please use ``offload_param`` or ``offload_optimizer`` """ - cpu_offload: bool = Field( + cpu_offload: Optional[bool] = Field( None, - deprecated=True, - new_param="offload_optimizer", - new_param_fn=(lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) if val else None), + json_schema_extra={ + "deprecated": + True, + "new_param": + "offload_optimizer", + "new_param_fn": (lambda val: DeepSpeedZeroOffloadOptimizerConfig(device=OffloadDeviceEnum.cpu) + if val else None) + }, ) """ Deprecated, please use ``offload_optimizer`` """ @@ -234,9 +248,25 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): this option is enabled and then saves the fp16 model weights. """ + module_granularity_threshold: int = Field(pp_int(0), alias="stage3_module_granularity_threshold") + """ + The granularity of a module is determined by the ratio of "parameter_count / (1 + descendant count)". + ZeRO3 classifies modules with a granularity below the threshold as fine-grained, + which are treated as integral units during parameter fetching. This reduces host overhead + and the separate allgather overhead introduced by hooks for fine-grained layers when fetching parameters. + """ + + use_all_reduce_for_fetch_params: bool = Field(False, alias="stage3_use_all_reduce_for_fetch_params") + """ + Use all_reduce op when fetching module parameters at stage3. This improves performance by reducing + the overhead of concatenation and slicing on the host. 
+ """ + stage3_gather_fp16_weights_on_model_save: bool = Field(False, - deprecated=True, - new_param="gather_16bit_weights_on_model_save") + json_schema_extra={ + "deprecated": True, + "new_param": "gather_16bit_weights_on_model_save" + }) """ Deprecated, please use ``gather_16bit_weights_on_model_save`` """ ignore_unused_parameters: bool = True @@ -282,8 +312,18 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): Boolean indicating whether to use quantized zero gradients for efficient all_2_all_reduce comm """ + zeropp_loco_param: Optional[Dict[str, Any]] = None + """ + This dictionary contains parameters for using LoCo-Zero++, with two key parameters: + - `err_beta`: A coefficient for the moving average of quantization errors before and after gradient computation. + It ranges between 0 and 1, with a default value of 0.8. + - `reset_T`: The number of steps after which the moving-average error buffer is cleared. The default value is 1024. + These parameters can be adjusted based on performance needs. Example configuration in ds config: + "zeropp_loco_param": { "err_beta": 0.8, "reset_T": 1024 }. + See LoCo paper for more details: (https://arxiv.org/abs/2407.04480). + """ - mics_shard_size: int = Field(-1, new_param="mics_shard_size") + mics_shard_size: int = Field(-1, json_schema_extra={"new_param": "mics_shard_size"}) mics_hierarchical_params_gather: bool = False @@ -301,17 +341,21 @@ class DeepSpeedZeroConfig(DeepSpeedConfigModel): Override nn.Module apply function, for Stage 3. """ + log_trace_cache_warnings: bool = False + """ + Whether to log warnings from trace cache, such as invalidation events. + """ + # Validators - @validator("overlap_comm") - def overlap_comm_valid(cls, field_value, values): - if field_value is None: - assert ("stage" in values), "DeepSpeedZeroConfig: 'stage' must be defined before 'overlap_comm'" - field_value = values["stage"] == ZeroStageEnum.weights - return field_value - - @root_validator - def offload_ratio_check(cls, values): - offload_config = getattr(values, "offload_optimizer", {}) + @model_validator(mode="after") + def overlap_comm_valid(self): + if self.overlap_comm is None: + self.overlap_comm = self.stage == ZeroStageEnum.weights + return self + + @model_validator(mode="after") + def offload_ratio_check(self): + offload_config = self.offload_optimizer if offload_config and offload_config.ratio < 1.0: - assert values.get("stage") == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3." - return values + assert self.stage == ZeroStageEnum.weights, "Partial offloading only supported for ZeRO Stage 3." + return self diff --git a/deepspeed/runtime/zero/contiguous_memory_allocator.py b/deepspeed/runtime/zero/contiguous_memory_allocator.py index 632f68ca1718..35b3d5c7dd5d 100644 --- a/deepspeed/runtime/zero/contiguous_memory_allocator.py +++ b/deepspeed/runtime/zero/contiguous_memory_allocator.py @@ -193,7 +193,7 @@ def _defragment_memory(self): tensor = self.tensor_map[self.tensor_ids[tensor_addr]] assert tensor_size == tensor.numel(), \ - "Size mismatch. {tensor_size} is allocated at addr {tensor_addr} but tensor size is {tensor.numel()} " + f"Size mismatch. 
{tensor_size} is allocated at addr {tensor_addr} but tensor size is {tensor.numel()} " assert empty_addr != tensor_addr, \ f"Cannot have same empty address {empty_addr} and tensor address {tensor_addr}" diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index e9dd78864cde..8c8db60768eb 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -16,6 +16,7 @@ #when implemented outside of torch.autograd.Function import math +import functools import torch from torch import Tensor @@ -33,8 +34,14 @@ def print_rank_0(message, debug=False, force=False): try: - autocast_custom_fwd = get_accelerator().amp().custom_fwd - autocast_custom_bwd = get_accelerator().amp().custom_bwd + # Fix `torch.[device].amp.custom_fwd/bwd` FutureWarning in torch 2.4 + if hasattr(torch, 'amp') and hasattr(torch.amp, 'custom_fwd') and hasattr(torch.amp, 'custom_bwd'): + autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name()) + autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name()) + else: + # original implementation + autocast_custom_fwd = get_accelerator().amp().custom_fwd + autocast_custom_bwd = get_accelerator().amp().custom_bwd except (ImportError, AttributeError) as exp: autocast_custom_fwd = noop_decorator autocast_custom_bwd = noop_decorator diff --git a/deepspeed/runtime/zero/mics.py b/deepspeed/runtime/zero/mics.py index 95e48053abe4..628bf86a61da 100755 --- a/deepspeed/runtime/zero/mics.py +++ b/deepspeed/runtime/zero/mics.py @@ -12,11 +12,12 @@ import deepspeed import torch from deepspeed import comm as dist +from deepspeed.runtime.zero.utils import is_zero_param from deepspeed.runtime.zero.mics_utils import (MiCS_CommGroups, create_mics_comm_groups, scale_tensors) -from deepspeed.runtime.zero.parameter_offload import (DeepSpeedZeRoOffload, is_zero_param) +from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload from deepspeed.runtime.zero.partition_parameters import Init, AllGatherCoalescedHandle, ZeroParamStatus from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 -from deepspeed.utils import instrument_w_nvtx, log_dist +from deepspeed.utils import instrument_w_nvtx, log_dist, logger from deepspeed.accelerator import get_accelerator from torch import Tensor from torch.nn import Parameter @@ -37,11 +38,19 @@ class MiCS_AllGatherCoalescedHandle(AllGatherCoalescedHandle): def __init__(self, allgather_handle, params: List[Parameter], partitions: List[Tensor], world_size: int) -> None: super().__init__(allgather_handle, params, partitions, world_size) - def wait(self) -> None: + def wait(self, **kwargs) -> None: """ """ # let the current stream to op - instrument_w_nvtx(self.allgather_handle.wait)() + try: + # print("HANDLE", self.allgather_handle) + instrument_w_nvtx(self.allgather_handle.wait)() + except (ValueError, RuntimeError) as e: + log_dist( + f"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork", + ranks=[0]) + log_dist(f"Error message: {e}", ranks=[0]) + if self.complete: return @@ -57,6 +66,7 @@ class MiCS_Init(Init): def __init__(self, module=None, data_parallel_group=None, + sequence_data_parallel_group=None, mem_efficient_linear=True, remote_device=None, pin_memory=False, @@ -78,6 +88,8 @@ def __init__(self, if it was constructed in the context. data_parallel_group (``deepspeed.comm`` process group, optional): The group of processes to partition among. 
Defaults to all processes. + Synonymous with sequence data parallel group for param partitioning + across both sequence and data parallel groups. mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows DeepSpeed to partition parameters. Defaults to ``True``. @@ -138,9 +150,24 @@ def __init__(self, if not dist.is_initialized(): dist.init_distributed() assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" + + if data_parallel_group is None: + ds_process_group = dist.get_world_group() + else: + ds_process_group = data_parallel_group + + if sequence_data_parallel_group is not None: + logger.warning( + f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") + if data_parallel_group is not None: + raise ValueError( + "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." + ) + self.ds_process_group = sequence_data_parallel_group + self.mics_comm_groups = create_mics_comm_groups( _ds_config.mics_shard_size, - data_parallel_group, + ds_process_group, hierarchical_allgather=_ds_config.mics_hierarchial_params_gather, mpu=mpu) @@ -359,6 +386,7 @@ def __init__(self, offload_optimizer_config=None, offload_param_config=None, sub_group_size=1000000000000, + offload_ratio=0.0, mpu=None, clip_grad=0, gradient_accumulation_dtype=torch.float16, @@ -374,7 +402,7 @@ def __init__(self, dynamic_loss_args, verbose, contiguous_gradients, reduce_bucket_size, prefetch_bucket_size, max_reuse_distance, max_live_parameters, param_persistence_threshold, model_persistence_threshold, dp_process_group, reduce_scatter, overlap_comm, - offload_optimizer_config, offload_param_config, sub_group_size, mpu, clip_grad, + offload_optimizer_config, offload_param_config, sub_group_size, offload_ratio, mpu, clip_grad, gradient_accumulation_dtype, communication_data_type, postscale_gradients, gradient_predivide_factor, gradient_accumulation_steps, elastic_checkpoint, aio_config) first_param = next(module.parameters()) diff --git a/deepspeed/runtime/zero/offload_config.py b/deepspeed/runtime/zero/offload_config.py index b7adc13a0ea2..ca35d7a7d169 100644 --- a/deepspeed/runtime/zero/offload_config.py +++ b/deepspeed/runtime/zero/offload_config.py @@ -5,7 +5,9 @@ from enum import Enum from pathlib import Path -from deepspeed.pydantic_v1 import Field, validator +from pydantic import Field, model_validator +from typing import Optional + from deepspeed.runtime.config_utils import DeepSpeedConfigModel, pp_int @@ -25,7 +27,7 @@ class DeepSpeedZeroOffloadParamConfig(DeepSpeedConfigModel): `nvme`. """ - nvme_path: Path = None + nvme_path: Optional[Path] = None """ Filesystem path for NVMe device for parameter offloading. """ buffer_count: int = Field(5, ge=0) @@ -56,7 +58,7 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): `nvme`. Optimizer computation is offload to CPU regardless of device option. """ - nvme_path: Path = None + nvme_path: Optional[Path] = None """ Filesystem path for NVMe device for optimizer state offloading. """ buffer_count: int = Field(4, ge=0) @@ -88,10 +90,20 @@ class DeepSpeedZeroOffloadOptimizerConfig(DeepSpeedConfigModel): fast_init: bool = False """ Enable fast optimizer initialization when offloading to NVMe. 
""" - @validator("pipeline_read", "pipeline_write", always=True) - def set_pipeline(cls, field_value, values): - values["pipeline"] = field_value or values.get("pipeline", False) - return field_value - ratio: float = Field(1.0, ge=0.0, le=1.0) """ Percentage of offloaded optimizer states to CPU Adam. Only valid with ZeRO Stage 3.""" + + @model_validator(mode="after") + def set_pipeline(self): + pipeline = self.pipeline_read or self.pipeline_write + self.__dict__["pipeline"] = pipeline + return self + + +class OffloadStateTypeEnum(str, Enum): + """ Enum for internal buffer types """ + optim_states = "optim_states" + hp_params = "hp_params" + lp_params = "lp_params" + lp_grads = "lp_grads" + contiguous_grad_buffer = "contiguous_grad_buffer" diff --git a/deepspeed/runtime/zero/offload_states.py b/deepspeed/runtime/zero/offload_states.py new file mode 100644 index 000000000000..f521a11a7aa4 --- /dev/null +++ b/deepspeed/runtime/zero/offload_states.py @@ -0,0 +1,74 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Set +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.zero.offload_config import OffloadStateTypeEnum + +from deepspeed.utils.tensor_fragment import safe_get_local_fp32_param, safe_get_local_optimizer_state + + +def _make_offload_state_key(key): + return f"{key}_offload_buffer" + + +def offload_adam_states(optimizer, device, pin_memory: bool = False, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_key(state, key): + offload_buf_key = _make_offload_state_key(key) + if offload_buf_key not in state: + state[offload_buf_key] = torch.empty_like(state[key], device=device) + if pin_memory: + state[offload_buf_key] = get_accelerator().pin_memory(state[offload_buf_key]) + state[offload_buf_key].copy_(state[key], non_blocking=non_blocking) + state[key].data = state[offload_buf_key] + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_key(state, "exp_avg_sq") + + +def reload_adam_states(optimizer, device, non_blocking: bool = False): + """Move optimizer states to device. Note that this assumes the state structure of DeepSpeed Adam.""" + + def move_back_key(state, key): + state[key].data = state[_make_offload_state_key(key)].to(device, non_blocking=non_blocking) + + for _, state in optimizer.state.items(): + if "exp_avg" in state: + move_back_key(state, "exp_avg") + if "exp_avg_sq" in state: + move_back_key(state, "exp_avg_sq") + + +def get_state_devices(model, state: OffloadStateTypeEnum) -> Set[torch.device]: + """Retrieve the devices of the specified state of the model. + + Args: + model (DeepSpeedEngine): The model whose device allocations are to be checked. + state (OffloadStateTypeEnum): The specific state for which the devices should be retrieved. + + Returns: + Set[torch.device]: A set of devices of the specified state. 
+ + """ + if state == OffloadStateTypeEnum.hp_params: + return set(safe_get_local_fp32_param(p).device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_params: + return set(p.ds_tensor.device for p in model.parameters()) + elif state == OffloadStateTypeEnum.lp_grads: + return {model.optimizer.grad_partitions_flat_buffer.device} + elif state == OffloadStateTypeEnum.optim_states: + return set(safe_get_local_optimizer_state(p, "exp_avg").device for p in model.parameters()) | \ + set(safe_get_local_optimizer_state(p, "exp_avg_sq").device for p in model.parameters()) + elif state == OffloadStateTypeEnum.contiguous_grad_buffer: + if model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer == None: + return {} + return {model.optimizer._DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer.device} diff --git a/deepspeed/runtime/zero/parameter_offload.py b/deepspeed/runtime/zero/parameter_offload.py index d1ecd9288878..2706d4474515 100644 --- a/deepspeed/runtime/zero/parameter_offload.py +++ b/deepspeed/runtime/zero/parameter_offload.py @@ -6,91 +6,19 @@ import sys import torch from collections import OrderedDict +from deepspeed.utils import z3_leaf_module, set_z3_leaf_module from deepspeed.runtime.utils import see_memory_usage +from deepspeed.runtime.zero.utils import apply_to_tensors_only, is_zero_param from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import _init_external_params from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.partitioned_param_coordinator import PartitionedParameterCoordinator, InflightParamRegistry, iter_params -from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator +from deepspeed import utils FWD_MODULE_STACK = list() -def is_builtin_type(obj): - # https://stackoverflow.com/a/17795199 - return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" - - -def isinstance_namedtuple(obj: object) -> bool: - """ - Is this an instance of namedtuple/NamedTuple? - From: https://stackoverflow.com/a/62692640 - - Args: - obj (object): An object. - - Returns: - bool: True if namedtuple/NamedTuple else False. - """ - return isinstance(obj, tuple) and hasattr(obj, '_asdict') and hasattr(obj, '_fields') - - -# ensure we only warn once, otherwise every iteration will trigger a warning -warned = False - - -def _apply_to_tensors_only(module, functional, backward_function, outputs): - """ - Apply a torch.autograd.Function that calls a `backward_function` to every Tensor in `outputs`. - - Args: - module (torch.nn.Module): A torch module - functional (Type[torch.autograd.Function]): The function class to apply. - backward_function (Callable[[torch.nn.Module], None]): A backward_function to pass to - `functional.apply`. - outputs (Any): The output of `module`. - - Returns: - Any: The output of `module`. - """ - if isinstance(outputs, (tuple, list)): - touched_outputs = [] - for output in outputs: - touched_output = _apply_to_tensors_only(module, functional, backward_function, output) - touched_outputs.append(touched_output) - - if isinstance_namedtuple(outputs): - # namedtuples require a slightly different syntax. 
- return outputs.__class__(*touched_outputs) - - return outputs.__class__(touched_outputs) - elif isinstance(outputs, dict): - # apply inplace to avoid recreating dict inherited objects - for key in outputs.keys(): - outputs[key] = _apply_to_tensors_only(module, functional, backward_function, outputs[key]) - return outputs - - elif isinstance(outputs, torch.Tensor): - # this also applies to torch.Tensor's subclasses like torch.nn.parameter.Parameter - touched_outputs = functional.apply(module, backward_function, outputs) - - # restore zero param attributes if those get stripped by `backward_function` - if not is_zero_param(touched_outputs) and is_zero_param(outputs): - touched_outputs.ds_param_alias = outputs - return touched_outputs - else: - if not is_builtin_type(outputs): - global warned - if not warned and dist.get_rank() == 0: - logger.warning( - f"A module has unknown inputs or outputs type ({type(outputs)}) and the tensors embedded in it cannot be detected. " - "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " - "output tensors and therefore may not get triggered properly.") - warned = True - return outputs - - #for each tensor in outputs run the forward_function and register backward_function as hook def _apply_forward_and_backward_to_tensors_only(module, forward_function, backward_function, outputs): if type(outputs) is tuple: @@ -122,6 +50,10 @@ def __init__(self, parent_module, *args, **kwargs): self._parent_module = parent_module self._in_forward = False + def __reduce__(self): + r0, _, *r2 = super().__reduce__() + return (r0, (self._parent_module, )) + tuple(r2) + def __getitem__(self, key): param = super().__getitem__(key) @@ -129,7 +61,8 @@ def __getitem__(self, key): if param is None: return param - if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: + # TODO: only weaken this check during compilation + if hasattr(param, "ds_status") and param.ds_status == ZeroParamStatus.NOT_AVAILABLE: if self._parent_module._parameters._in_forward: register_external_parameter(FWD_MODULE_STACK[-1], param) param.all_gather() @@ -150,54 +83,6 @@ def _inject_parameters(module, cls): module._parameters = new_param -class PreBackwardFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, module, pre_backward_function, outputs): - ctx.module = module - ctx.pre_backward_function = pre_backward_function - if not hasattr(module, "applied_pre_backward_ref_cnt"): - module.applied_pre_backward_ref_cnt = 0 - module.applied_pre_backward_ref_cnt += 1 - #print(f"After Forward: {ctx.module.__class__.__name__}") - outputs = outputs.detach() - return outputs - - @staticmethod - def backward(ctx, *args): - #print(f"Before Backward: {ctx.module.__class__.__name__}") - ctx.pre_backward_function(ctx.module) - return (None, None) + args - - -class PostBackwardFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, module, post_backward_function, output): - ctx.module = module - if output.requires_grad: - #TODO SOME TIMES post backward does not seem to be triggered debug in detail - #Should only cause increase in memory not correctness issue - #if output.grad_fn.__class__.__name__ == 'ViewBackward': - # ctx.view=True - # print(f"Warning view tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") - #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." 
- #if module.ds_grads_remaining == 0: - # print(f"Before Forward: {ctx.module.__class__.__name__}") - module.ds_grads_remaining += 1 - ctx.post_backward_function = post_backward_function - output = output.detach() - return output - - @staticmethod - def backward(ctx, *args): - ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 - if ctx.module.ds_grads_remaining == 0: - ctx.post_backward_function(ctx.module) - #print(f"After Backward: {ctx.module.__class__.__name__}") - return (None, None) + args - - class DeepSpeedZeRoOffload(object): def __init__( @@ -217,6 +102,8 @@ def __init__( zero_param_parallel_group=None, zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, + zero_module_granularity_threshold=0, + log_trace_cache_warnings=False, ): see_memory_usage("DeepSpeedZeRoOffload initialize [begin]", force=True) @@ -232,6 +119,7 @@ def __init__( self.zero_param_parallel_group = zero_param_parallel_group self.zero_quantized_weights = zero_quantized_weights self.zero_quantized_nontrainable_weights = zero_quantized_nontrainable_weights + self.log_trace_cache_warnings = log_trace_cache_warnings if offload_param_config is not None and offload_param_config.device != OffloadDeviceEnum.none: self.offload_device = offload_param_config.device @@ -249,7 +137,6 @@ def __init__( self.persistent_parameters = self.mark_persistent_parameters(self.param_numel_persistence_threshold, self.model_persistence_threshold) - self.param_coordinators = {} self._prefetch_bucket_sz = int(prefetch_bucket_size) self._max_reuse_distance_in_numel = int(max_reuse_distance) self._max_available_parameters_in_numel = int(max_live_parameters) @@ -257,14 +144,36 @@ def __init__( ) if overlap_comm else get_accelerator().default_stream() if not hasattr(module, "ds_inflight_param_registry"): - module.ds_inflight_param_registry = dict() - # we need two registries, one for training and one for eval. 
They will be used when creating PartitionedParameterCoordinator - module.ds_inflight_param_registry[True] = InflightParamRegistry() - module.ds_inflight_param_registry[False] = InflightParamRegistry() + module.ds_inflight_param_registry = InflightParamRegistry() self.__inflight_param_registry = module.ds_inflight_param_registry + self.fast_sharding_for_leaf_module = False + + if zero_module_granularity_threshold > 0: + self.min_granularity_value = sys.maxsize + self.min_granularity_layer = None + self.granularity_info = set() + self.z3_leaf_layers = [] + self._set_z3_leaf_modules_by_threshold(module, zero_module_granularity_threshold) + self.fast_sharding_for_leaf_module = True + + self.param_coordinator = PartitionedParameterCoordinator( + prefetch_bucket_sz=self._prefetch_bucket_sz, + max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, + max_available_parameters_in_numel=self._max_available_parameters_in_numel, + allgather_stream=self.__allgather_stream, + inflight_param_registry=self.__inflight_param_registry, + prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, + timers=self.timers, + zero_quantized_weights=self.zero_quantized_weights, + zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights, + fast_sharding_for_leaf_module=self.fast_sharding_for_leaf_module, + log_trace_cache_warnings=self.log_trace_cache_warnings, + ) + self.forward_hooks = [] self.backward_hooks = [] + self.setup_zero_stage3_hooks() print_rank_0( f'Created module hooks: forward = {len(self.forward_hooks)}, backward = {len(self.backward_hooks)}', @@ -277,26 +186,13 @@ def partition_all_parameters(self): """Partitioning Parameters that were not partitioned usually if parameters of modules whose input parameters do not require grad computation do not trigger post call and will therefore will remain unpartitioned""" - self.get_param_coordinator(training=self.module.training).release_and_reset_all(self.module) + self.get_param_coordinator().release_and_reset_all(self.module) for param in iter_params(self.module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") - def get_param_coordinator(self, training): - if not training in self.param_coordinators: - self.param_coordinators[training] = PartitionedParameterCoordinator( - prefetch_bucket_sz=self._prefetch_bucket_sz, - max_reuse_distance_in_numel=self._max_reuse_distance_in_numel, - max_available_parameters_in_numel=self._max_available_parameters_in_numel, - allgather_stream=self.__allgather_stream, - inflight_param_registry=self.__inflight_param_registry[training], - prefetch_nvme=self.offload_device == OffloadDeviceEnum.nvme, - timers=self.timers, - zero_quantized_weights=self.zero_quantized_weights, - zero_quantized_nontrainable_weights=self.zero_quantized_nontrainable_weights, - ) - - return self.param_coordinators[training] + def get_param_coordinator(self): + return self.param_coordinator def empty_partition_cache(self): self.partition_all_parameters() @@ -344,14 +240,14 @@ def setup_zero_stage3_hooks(self): #reset step if in inference mode @instrument_w_nvtx - def _end_of_forward_hook(module, *args): + def _start_of_forward_hook(module, *args): + + self.get_param_coordinator().reset_step() - if not torch._C.is_grad_enabled(): - self.get_param_coordinator(training=False).reset_step() + self.module.register_forward_pre_hook(_start_of_forward_hook) #likely one of them should be enough but just to be safe - 
self._register_hooks_recursively(self.module) - self.module.register_forward_hook(_end_of_forward_hook) + self._register_deepspeed_module(self.module) # Add top module to stack trace global FWD_MODULE_STACK @@ -377,15 +273,19 @@ def mark_persistent_parameters(self, param_threshold, model_threshold): return persistent_params - def _register_hooks_recursively(self, module, count=[0]): + def _register_deepspeed_module(self, module, count=[0]): my_count = count[0] - module.id = my_count + module.ds_id = my_count - #print(f"{module.__class__} : {module.id}") + #print(f"{module.__class__} : {module.ds_id}") - for child in module.children(): - count[0] = count[0] + 1 - self._register_hooks_recursively(child, count=count) + if z3_leaf_module(module): + for param in module.parameters(): + param.ds_z3_leaf_module = module + else: + for child in module.children(): + count[0] = count[0] + 1 + self._register_deepspeed_module(child, count=count) @instrument_w_nvtx def _pre_forward_module_hook(module, *args): @@ -393,6 +293,7 @@ def _pre_forward_module_hook(module, *args): @instrument_w_nvtx def _post_forward_module_hook(module, input, output): + global FWD_MODULE_STACK FWD_MODULE_STACK.pop() if output is None: @@ -433,20 +334,16 @@ def _post_forward_module_hook(module, input, output): self.post_sub_module_forward_function(module) - def _pre_backward_module_hook(module, inputs, output): + def _bwd_hook_unexpected_inputs_msg(value): + return f"A module has unknown inputs or outputs type ({type(value)}) and the tensors embedded in it cannot be detected. " \ + "The ZeRO-3 hooks designed to trigger before or after backward pass of the module relies on knowing the input and " \ + "output tensors and therefore may not get triggered properly." - @instrument_w_nvtx - def _run_before_backward_function(sub_module): - # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop - # before doing backwards, so each backward will need a pre-fetch - using reference - # counting to support this scenario - #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") - if sub_module.applied_pre_backward_ref_cnt > 0: - self.pre_sub_module_backward_function(sub_module) - sub_module.applied_pre_backward_ref_cnt -= 1 - #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") + def _pre_backward_module_hook(module, inputs, output): - return _apply_to_tensors_only(module, PreBackwardFunction, _run_before_backward_function, output) + return apply_to_tensors_only(module.pre_bwd_fn.apply, + output, + warning_msg_fn=_bwd_hook_unexpected_inputs_msg) #This is an alternate to doing _post_backward_module_hook #it uses tensor.register_hook instead of using torch.autograd.Function @@ -469,14 +366,12 @@ def _run_before_forward_function(input): _run_after_backward_hook, inputs) def _post_backward_module_hook(module, inputs): - module.ds_grads_remaining = 0 - - @instrument_w_nvtx - def _run_after_backward_function(sub_module): - if sub_module.ds_grads_remaining == 0: - self.post_sub_module_backward_function(sub_module) + if not hasattr(module, "ds_grads_remaining"): + module.ds_grads_remaining = 0 - return _apply_to_tensors_only(module, PostBackwardFunction, _run_after_backward_function, inputs) + return apply_to_tensors_only(module.post_bwd_fn.apply, + inputs, + warning_msg_fn=_bwd_hook_unexpected_inputs_msg) # Pre forward hook self.forward_hooks.append(module.register_forward_pre_hook(_pre_forward_module_hook)) @@ -485,42 +380,111 @@ def _run_after_backward_function(sub_module): self.forward_hooks.append(module.register_forward_hook(_post_forward_module_hook)) # Pre backward hook + if not hasattr(module, "pre_bwd_fn"): + + @instrument_w_nvtx + def _run_before_backward_function(sub_module): + # some models (e.g. 
Albert) may run multiple forwards on the same layer in a loop + # before doing backwards, so each backward will need a pre-fetch - using reference + # counting to support this scenario + #print(f"COUNTER before: {sub_module.applied_pre_backward_ref_cnt}") + if sub_module.applied_pre_backward_ref_cnt > 0: + self.pre_sub_module_backward_function(sub_module) + sub_module.applied_pre_backward_ref_cnt -= 1 + #print(f"COUNTER after: {sub_module.applied_pre_backward_ref_cnt}") + + class PreBackwardFunctionForModule(torch.autograd.Function): + + @staticmethod + def forward(ctx, outputs): + # Capture `module` and _run_before_backward_function + ctx.module = module + ctx.pre_backward_function = _run_before_backward_function + if not hasattr(ctx.module, "applied_pre_backward_ref_cnt"): + ctx.module.applied_pre_backward_ref_cnt = 0 + ctx.module.applied_pre_backward_ref_cnt += 1 + outputs = outputs.detach() + return outputs + + @staticmethod + def backward(ctx, *args): + ctx.pre_backward_function(ctx.module) + return args + + module.pre_bwd_fn = PreBackwardFunctionForModule + self.backward_hooks.append(module.register_forward_hook(_pre_backward_module_hook)) # post backward hook + if not hasattr(module, "post_bwd_fn"): + + @instrument_w_nvtx + def _run_after_backward_function(sub_module): + if sub_module.ds_grads_remaining == 0: + self.post_sub_module_backward_function(sub_module) + + class PostBackwardFunctionModule(torch.autograd.Function): + + @staticmethod + def forward(ctx, output): + ctx.module = module + if output.requires_grad: + #TODO SOME TIMES post backward does not seem to be triggered debug in detail + #Should only cause increase in memory not correctness issue + #if output.grad_fn.__class__.__name__ == 'ViewBackward': + # ctx.view=True + # print(f"Warning view tensor for input to module : {module.__class__.__name__}. Backward hooks may not trigger properly") + #assert len(module.parameters(recurse=False)), "The input tensor to the module is a view, and autograd Function or register_hook is not triggered with view tensors." 
+ #if module.ds_grads_remaining == 0: + # print(f"Before Forward: {ctx.module.__class__.__name__}") + module.ds_grads_remaining += 1 + ctx.post_backward_function = _run_after_backward_function + output = output.detach() + return output + + @staticmethod + def backward(ctx, *args): + ctx.module.ds_grads_remaining = ctx.module.ds_grads_remaining - 1 + if ctx.module.ds_grads_remaining == 0: + ctx.post_backward_function(ctx.module) + return args + + module.post_bwd_fn = PostBackwardFunctionModule + self.backward_hooks.append(module.register_forward_pre_hook(_post_backward_module_hook)) + @torch.no_grad() def pre_sub_module_forward_function(self, sub_module): see_memory_usage(f"Before sub module function {sub_module.__class__.__name__}", force=False) - prev_grad_state = torch.is_grad_enabled( - ) # we don't want to enable grad for sub modules fetching, yet the subfunction need to know if grad is enabled - torch.set_grad_enabled(False) + global FWD_MODULE_STACK FWD_MODULE_STACK.append(sub_module) - param_coordinator = self.get_param_coordinator(training=sub_module.training) + param_coordinator = self.get_param_coordinator() param_coordinator.trace_prologue(sub_module) if param_coordinator.is_record_trace(): param_coordinator.record_module(sub_module) - param_coordinator.fetch_sub_module(sub_module, forward=prev_grad_state) - torch.set_grad_enabled(prev_grad_state) + param_coordinator.fetch_sub_module(sub_module, forward=True) + see_memory_usage(f"Before sub module function {sub_module.__class__.__name__} after fetch", force=False) @torch.no_grad() def post_sub_module_forward_function(self, sub_module): - see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} before release", - force=False) + see_memory_usage( + f"After sub module function {sub_module.__class__.__name__} {sub_module.ds_id} before release", + force=False) - param_coordinator = self.get_param_coordinator(training=sub_module.training) - param_coordinator.release_sub_module(sub_module, backward=False) + param_coordinator = self.get_param_coordinator() + param_coordinator.release_sub_module(sub_module) - see_memory_usage(f"After sub module function {sub_module.__class__.__name__} {sub_module.id} after release", - force=False) + see_memory_usage( + f"After sub module function {sub_module.__class__.__name__} {sub_module.ds_id} after release", + force=False) @torch.no_grad() def pre_sub_module_backward_function(self, sub_module): - assert sub_module.training, "backward pass is invalid for module in evaluation mode" - param_coordinator = self.get_param_coordinator(training=True) + # assert sub_module.training, "backward pass is invalid for module in evaluation mode" + param_coordinator = self.get_param_coordinator() param_coordinator.trace_prologue(sub_module) if param_coordinator.is_record_trace(): param_coordinator.record_module(sub_module) @@ -528,13 +492,92 @@ def pre_sub_module_backward_function(self, sub_module): @torch.no_grad() def post_sub_module_backward_function(self, sub_module): - assert sub_module.training, "backward pass is invalid for module in evaluation mode" + # assert sub_module.training, "backward pass is invalid for module in evaluation mode" see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} before release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.ds_id} before release", force=False) - self.get_param_coordinator(training=True).release_sub_module(sub_module, backward=True) + 
self.get_param_coordinator().release_sub_module(sub_module) see_memory_usage( - f"After sub module backward function {sub_module.__class__.__name__} {sub_module.id} after release", + f"After sub module backward function {sub_module.__class__.__name__} {sub_module.ds_id} after release", force=False) + + def _set_z3_leaf_modules_by_threshold(self, module, zero_module_granularity_threshold): + + self._get_granularity_recursively(module) + print_rank_0(f"{'MODULE NAME'.ljust(30)}|{'GRANULARITY VALUE'.rjust(20)}", force=True) + for granularity in self.granularity_info: + print_rank_0(granularity, force=True) + + if self.min_granularity_value <= zero_module_granularity_threshold: + self._set_leaf_by_threshold_preorder(module, zero_module_granularity_threshold) + utils.logger.info( + f"z3_leaf_module was set by stage3_module_granularity_threshold:{zero_module_granularity_threshold}") + for layer in self.z3_leaf_layers: + print_rank_0(f"{layer.__class__.__name__}:{layer.ds_model_granularity}", force=True) + else: + utils.logger.warning( + f"The smallest module granularity is [{self.min_granularity_layer}:{self.min_granularity_value}]. "\ + f"To make stage3_module_granularity_threshold effective, you need to set stage3_module_granularity_threshold >= {self.min_granularity_value}. "\ + f"Current Value:{zero_module_granularity_threshold}" + ) + + def _get_granularity_recursively(self, module): + """This function is used to recursively obtain the granularity of each module.""" + + # avoid setting as leaf for particularly large models, even if the granularity is very small + # an oversized leaf module increases the number of live parameters, introducing memory overhead + Z3_MAX_LEAF_SIZE = 1e9 + + if not list(module.parameters()): + # skip Modules without parameters, such as GELU, etc. + module.ds_model_granularity = sys.maxsize + return 0, 0 + + num_layers = 0 + num_params = 0 + num_params += sum(p.ds_numel for p in module.parameters(recurse=False)) + if not any(module.children()): + # torch leaf module + module.ds_model_granularity = sys.maxsize + return 1, num_params + + for child in module.children(): + layers_in_child, params_in_child = self._get_granularity_recursively(child) + num_layers += layers_in_child + num_params += params_in_child + + if module.__class__.__name__ in torch.nn.modules.container.__all__: + # Do not set container modules like ModuleList as leaf modules + # as this will prevent hooks from being set on their children + # and they may do not invoke the forward method + module.ds_model_granularity = sys.maxsize + return num_layers, num_params + + num_layers += 1 + ds_model_granularity = (num_params // num_layers) if num_params <= Z3_MAX_LEAF_SIZE else sys.maxsize + module.ds_model_granularity = ds_model_granularity + # module.ds_model_num_layers = num_layers + # module.ds_model_num_params = num_params + if self.min_granularity_value > ds_model_granularity: + self.min_granularity_value = ds_model_granularity + self.min_granularity_layer = module.__class__.__name__ + self.granularity_info.add(f"{module.__class__.__name__.ljust(30)}|{str(ds_model_granularity).rjust(20)}") + + return num_layers, num_params + + def _set_leaf_by_threshold_preorder(self, module, granularity_treshhold): + '''Set modules as leaf modules based on the threshold, prioritizing parent nodes.''' + + num_params = sum(p.ds_numel for p in module.parameters()) + if num_params == 0: + # skip Modules without parameters, such as GELU, etc. 
+ return + if module.ds_model_granularity <= granularity_treshhold: + set_z3_leaf_module(module, True) + self.z3_leaf_layers.append(module) + return + + for sub_module in module.children(): + self._set_leaf_by_threshold_preorder(sub_module, granularity_treshhold) diff --git a/deepspeed/runtime/zero/partition_parameters.py b/deepspeed/runtime/zero/partition_parameters.py index c0fcb8d34b80..db03a4b86134 100755 --- a/deepspeed/runtime/zero/partition_parameters.py +++ b/deepspeed/runtime/zero/partition_parameters.py @@ -23,9 +23,9 @@ from deepspeed.utils import groups import deepspeed -from ..utils import see_memory_usage +from ..utils import see_memory_usage, get_only_unique_item from deepspeed.runtime.zero.config import DeepSpeedZeroConfig -from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks +from deepspeed.runtime.zero.utils import assert_ints_same_as_other_ranks, is_zero_param from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.config_utils import get_config_default from deepspeed.utils import instrument_w_nvtx, logger @@ -55,8 +55,9 @@ def __init__(self, param: Parameter) -> None: non_blocking=True).view(param.ds_shape) self.__param = param - def wait(self) -> None: - get_accelerator().current_stream().synchronize() + def wait(self, **kwargs) -> None: + if not get_accelerator().resolves_data_dependency(): + get_accelerator().current_stream().synchronize() self.__param.ds_status = ZeroParamStatus.AVAILABLE @@ -77,11 +78,12 @@ def __init__(self, params: List[Parameter]) -> None: non_blocking=True).view(param.ds_shape) @instrument_w_nvtx - def wait(self) -> None: + def wait(self, **kwargs) -> None: if self.__complete: return - get_accelerator().current_stream().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().current_stream().synchronize() for param in self.__params: assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" param.ds_status = ZeroParamStatus.AVAILABLE @@ -109,12 +111,6 @@ def debug_rank0(msg: str) -> None: logger.debug(msg) -def is_zero_param(parameter): - if not torch.is_tensor(parameter): - return False - return hasattr(parameter, 'ds_id') - - def _init_external_params(module): if not hasattr(module, '_external_params'): module._external_params = {} @@ -266,7 +262,7 @@ def new_tensor(cls, *args, **kwargs) -> Tensor: # https://stackoverflow.com/a/63851681/9201239 -def get_all_subclasses(cls): +def get_all_subclasses(cls, include_root=True): subclass_list = [] def recurse(cl): @@ -276,7 +272,10 @@ def recurse(cl): recurse(cls) - return set(subclass_list) + ret = set(subclass_list) + if include_root: + ret.add(cls) + return ret @instrument_w_nvtx @@ -312,6 +311,7 @@ def __init__(self, enabled=True, mem_efficient_linear=True, ds_config=None, dtyp torch.half, torch.bfloat16, torch.float ], f"Invalid data type {self.dtype}, allowed values are [torch.half, torch.bfloat16, torch.float]" self.wrapped_cls = set() + self.skip_init_depth = 0 self.quantized_initialization = None if ds_config is not None and ds_config.weight_quantization_config and ds_config.weight_quantization_config.quantized_initialization: @@ -368,7 +368,8 @@ def _set_dtype(self, ds_config, dtype): else: self.dtype = torch.float else: - self.dtype = dtype or torch.half + self.dtype = dtype or torch.float16 if get_accelerator().is_fp16_supported( + ) else torch.bfloat16 if get_accelerator().is_bf16_supported else torch.float32 def patch_init_and_builtins(self): @@ -435,6 
+436,53 @@ def wrapped_apply(module: Module, fn_to_apply: Callable) -> None: return wrapped_apply + def hook_for_skip_init(module): + # this function is intended for handling the logic of torch.nn.utils.skip_init + # skip_init:module_cls(*args, **kwargs).to_empty(device=final_device), where kwargs['device']='meta' + # the function call occurs between module_cls(*args, **kwargs) and to_empty(device=final_device). + def partition_after_empty_init(f): + + @functools.wraps(f) + def wrapper(module, *args, **kwargs): + _module = f(module, *args, **kwargs) + # here is the post-hook for module.apply(empty_like...) + # after module.apply(empty_like...), the module has completed its empty init on real device + # since skip_init won't involve any computations or weight adjustments, we can directly utilize post_init + self._post_init_method(_module) + return _module + + return wrapper + + def post_wrapper_to_empty(f): + # append some wrapper restoration after to_empty() call + @functools.wraps(f) + def wrapper(*args, **kwargs): + res = f(*args, **kwargs) + # restore _apply hook + for subclass in get_all_subclasses(torch.nn.modules.module.Module): + _disable_class_apply(subclass) + # self restore + module.to_empty = f + return res + + return wrapper + + def _enable_class_apply(cls): + if '_apply' in cls.__dict__: + cls._old_apply_of_skip_init_hook = cls._apply + cls._apply = partition_after_empty_init(cls._apply) + + def _disable_class_apply(cls): + if hasattr(cls, '_old_apply_of_skip_init_hook'): + cls._apply = cls._old_apply_of_skip_init_hook + + # add hooks for to_empty: apply_(empty_like) + for subclass in get_all_subclasses(torch.nn.modules.module.Module): + _enable_class_apply(subclass) + + # add a restore hook when exiting skip_init + module.to_empty = post_wrapper_to_empty(module.to_empty) + def partition_after(f): @functools.wraps(f) @@ -456,26 +504,37 @@ def wrapper(module, *args, **kwargs): is_child_module = True setattr(module, "_ds_child_entered", True) - f(module, *args, **kwargs) + init_on_meta = 'device' in kwargs and kwargs['device'] == 'meta' + if init_on_meta: + self.skip_init_depth += 1 + f(module, *args, **kwargs) + if init_on_meta and self.skip_init_depth == 1: + # check and handle the logic of empty_init + hook_for_skip_init(module) if is_child_module: # child's __init__ is done, now we can run a single post_init on the child object delattr(module, "_ds_child_entered") print_rank_0(f'Running post_init for {module.__class__.__name__}', force=False) - self._post_init_method(module) + if self.skip_init_depth == 0: + self._post_init_method(module) print_rank_0(f'After initializing followed by post init for {module.__class__.__name__}', force=False) + if init_on_meta: + self.skip_init_depth -= 1 return wrapper def _enable_class(cls): - cls._old_init = cls.__init__ - cls.__init__ = partition_after(cls.__init__) + if '__init__' in cls.__dict__: + cls._old_init = cls.__init__ + cls.__init__ = partition_after(cls.__init__) def _init_subclass(cls, **kwargs): - cls._old_init = cls.__init__ - cls.__init__ = partition_after(cls.__init__) + if '__init__' in cls.__dict__: + cls._old_init = cls.__init__ + cls.__init__ = partition_after(cls.__init__) # Replace .__init__() for all existing subclasses of torch.nn.Module recursively for subclass in get_all_subclasses(torch.nn.modules.module.Module): @@ -512,11 +571,11 @@ def _init_subclass(cls, **kwargs): self.patched = True def unpatch_init_and_builtins(self): - if self.patched: def _disable_class(cls): - cls.__init__ = cls._old_init + if 
hasattr(cls, '_old_init'): + cls.__init__ = cls._old_init for subclass in get_all_subclasses(torch.nn.modules.module.Module): _disable_class(subclass) @@ -580,7 +639,7 @@ def __init__(self, handle, param: Parameter, quantization=None) -> None: self.__param = param self.__quantization = quantization - def wait(self) -> None: + def wait(self, handle_dependency=True) -> None: instrument_w_nvtx(self.__handle.wait)() if self.__quantization: instrument_w_nvtx(self.__quantization.quant_handle.wait)() @@ -591,6 +650,8 @@ def wait(self) -> None: class AllGatherCoalescedHandle: + data_buffer = [] + def __init__( self, allgather_handle, @@ -598,7 +659,6 @@ def __init__( partitions: List[Tensor], world_size: int, use_secondary_tensor=False, - forward=False, quantization=None, ) -> None: self.allgather_handle = allgather_handle @@ -606,7 +666,6 @@ def __init__( self.partitions = partitions self.world_size = world_size self.use_secondary_tensor = use_secondary_tensor - self.forward = forward self.complete = False self.quantization = quantization @@ -615,7 +674,7 @@ def __init__( raise RuntimeError(f"expected param {param.ds_summary()} to not be available") @instrument_w_nvtx - def wait(self) -> None: + def wait(self, handle_dependency=True) -> None: if self.complete: return @@ -637,7 +696,7 @@ def wait(self) -> None: assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" partitions: List[Tensor] = [] ds_tensor_numel = param.ds_tensor.ds_numel - if self.use_secondary_tensor and not self.forward: + if self.use_secondary_tensor: ds_tensor_numel *= param.ds_secondary_tensor_num_of_groups for rank in range(self.world_size): param_start = rank * ds_tensor_numel @@ -647,14 +706,20 @@ def wait(self) -> None: partitions.append(part_to_copy) param.data = instrument_w_nvtx(torch.cat)(partitions).view(param.ds_shape) param.ds_status = ZeroParamStatus.AVAILABLE - - for part_to_copy in partitions: - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().is_synchronized_device() and handle_dependency: + for part_to_copy in partitions: part_to_copy.record_stream(get_accelerator().current_stream()) param_offset += ds_tensor_numel self.complete = True + if not get_accelerator().is_synchronized_device() and not handle_dependency: + # if the device needs to handle dependencies and opts for explicit processing outside the function. 
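Editor's note: a hedged sketch of the deferred-release path added above. Only wait(handle_dependency=...), data_buffer, and free_buffer() come from this patch; the call site and helper name are hypothetical.

from deepspeed.runtime.zero.partition_parameters import AllGatherCoalescedHandle

def gather_without_record_stream(params):
    # Hypothetical call site; `all_gather_coalesced` is the method this patch attaches to ZeRO params.
    handle = params[0].all_gather_coalesced(params)
    # Skip the per-tensor record_stream() bookkeeping; the gathered partition buffers are
    # parked in AllGatherCoalescedHandle.data_buffer instead of being released right away.
    handle.wait(handle_dependency=False)
    return handle

# Later, once the consumer stream is known to be finished with the gathered data:
# AllGatherCoalescedHandle.free_buffer()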
+ AllGatherCoalescedHandle.data_buffer.append(partitions) + + @staticmethod + def free_buffer(): + AllGatherCoalescedHandle.data_buffer = [] class MultipleAllGatherHandles: @@ -662,9 +727,34 @@ class MultipleAllGatherHandles: def __init__(self, handles: List[AllGatherCoalescedHandle]): self.handles = handles - def wait(self) -> None: + def wait(self, handle_dependency=True) -> None: for handle in self.handles: - handle.wait() + handle.wait(handle_dependency) + + +class AllReduceCoalescedHandle: + + def __init__(self, handle, params: List[Parameter]) -> None: + self.handle = handle + self.params = params + self.complete = False + + for param in self.params: + if param.ds_status != ZeroParamStatus.INFLIGHT: + raise RuntimeError(f"expected param {param.ds_summary()} to not be available") + + @instrument_w_nvtx + def wait(self) -> None: + if self.complete: + return + + instrument_w_nvtx(self.handle.wait)() + + for param in self.params: + assert param.ds_status == ZeroParamStatus.INFLIGHT, f"expected param {param.ds_summary()} to be inflight" + param.ds_status = ZeroParamStatus.AVAILABLE + + self.complete = True class QuantizationInfo: @@ -740,24 +830,22 @@ class Init(InsertPostInitMethodToModuleSubClasses): apply_param_persistence = False override_module_apply = get_config_default(DeepSpeedZeroConfig, "override_module_apply") - def __init__( - self, - module=None, - data_parallel_group=None, - mem_efficient_linear=True, - remote_device=None, - pin_memory=False, - config_dict_or_path=None, - config=None, - enabled=True, - dtype=None, - mpu=None, - zero_param_parallel_group=None, - zero_quantized_weights=False, - zero_quantized_nontrainable_weights=False, - sequence_data_parallel_group=None, - param_swapper=None, - ): + def __init__(self, + module=None, + data_parallel_group=None, + mem_efficient_linear=True, + remote_device=None, + pin_memory=False, + config_dict_or_path=None, + config=None, + enabled=True, + dtype=None, + mpu=None, + zero_param_parallel_group=None, + zero_quantized_weights=False, + zero_quantized_nontrainable_weights=False, + sequence_data_parallel_group=None, + param_swapper=None): """A context to enable massive model construction for training with ZeRO-3. Models are automatically partitioned (or, sharded) across the system and converted to half precision. @@ -767,6 +855,8 @@ def __init__( if it was constructed in the context. data_parallel_group (``deepspeed.comm`` process group, optional): The group of processes to partition among. Defaults to all processes. + Synonymous with sequence data parallel group for param partitioning + across both sequence and data parallel groups. mem_efficient_linear (bool, optional): Replace torch.nn.functional.linear with an implementation that allows DeepSpeed to partition parameters. Defaults to ``True``. @@ -854,27 +944,30 @@ def __init__( """ if config is not None: config_dict_or_path = config - logger.warning( - f'zero.Init: the `config` argument is deprecated. Please use `config_dict_or_path` instead.') + logger.warning('zero.Init: the `config` argument is deprecated. 
Please use `config_dict_or_path` instead.') _ds_config = deepspeed.runtime.config.DeepSpeedConfig(config_dict_or_path, mpu) if config_dict_or_path is not None else None if _ds_config is not None: mem_efficient_linear = _ds_config.zero_config.memory_efficient_linear + super().__init__(enabled=enabled, mem_efficient_linear=mem_efficient_linear, ds_config=_ds_config, dtype=dtype) if not dist.is_initialized(): init_distributed() assert dist.is_initialized(), "Parameters cannot be scattered without initializing deepspeed.comm" - if data_parallel_group is None and sequence_data_parallel_group is None: + if data_parallel_group is None: self.ds_process_group = dist.get_world_group() - elif sequence_data_parallel_group is not None: - self.ds_process_group = sequence_data_parallel_group - elif data_parallel_group is not None: + else: self.ds_process_group = data_parallel_group - else: # both given - raise ValueError( - "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." - ) + + if sequence_data_parallel_group is not None: + logger.warning( + f"sequence_data_parallel_group' is deprecated and will be removed. Use 'data_parallel_group' instead.") + if data_parallel_group is not None: + raise ValueError( + "Both 'data_parallel_group' and 'sequence_data_parallel_group' were specified. Please provide only one of these arguments." + ) + self.ds_process_group = sequence_data_parallel_group self.rank = dist.get_rank(group=self.ds_process_group) self.dp_world_size = dist.get_world_size(group=self.ds_process_group) @@ -892,7 +985,7 @@ def __init__( self.num_ranks_in_param_group = groups._get_zero_param_intra_parallel_group_world_size() self.num_param_groups = int(self.dp_world_size / self.num_ranks_in_param_group) self.rank_in_group = groups._get_zero_param_intra_parallel_rank_in_mygroup() - print_rank_0(f"hpZeRO group size? 
{self.num_ranks_in_param_group}", force=True) + print_rank_0(f"hpZeRO group size: {self.num_ranks_in_param_group}", force=True) logger.debug( "hpZeRO partition parameter my rank in world {} my rank in group {} ranks in my param partition group: {} " @@ -945,6 +1038,11 @@ def __init__( if not self.use_all_gather_into_tensor: logger.info(f"all_gather_into_tensor API is not available in torch {torch.__version__}") + self.use_all_reduce_for_fetch_params = get_config_default(DeepSpeedZeroConfig, + "use_all_reduce_for_fetch_params") + if _ds_config is not None: + self.use_all_reduce_for_fetch_params = _ds_config.zero_config.use_all_reduce_for_fetch_params + def _update_persist_config(self, ds_config): Init.apply_param_persistence = True Init.param_persistence_threshold = ds_config.zero_config.param_persistence_threshold @@ -953,9 +1051,10 @@ def _update_persist_config(self, ds_config): def _zero_init_param(self, param): self._convert_to_deepspeed_param(param) if dist.get_world_group() == self.get_dp_process_group(): - dist.broadcast(param, 0, self.get_dp_process_group()) + dist.broadcast(param.data, 0, self.get_dp_process_group()) else: - dist.broadcast(param, dist.get_global_rank(self.get_dp_process_group(), 0), self.get_dp_process_group()) + dist.broadcast(param.data, dist.get_global_rank(self.get_dp_process_group(), 0), + self.get_dp_process_group()) param.partition() def _convert_to_zero_parameters(self, param_list): @@ -1061,10 +1160,12 @@ def all_gather(param_list=None, async_op=False, hierarchy=0): param_list = [cls] return self._all_gather(param_list, async_op=async_op, hierarchy=hierarchy) - def _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_process_group): + def _all_gather_dtype(dtype, params, world_size, rank_in_group, ds_process_group): partition_sz = sum(p.ds_tensor.ds_numel for p in params) - if params[0].ds_secondary_tensor is not None and not forward: + use_secondary_tensor = params[0].ds_secondary_tensor is not None + + if use_secondary_tensor: partition_sz = sum(p.ds_tensor.ds_numel * p.ds_secondary_tensor_num_of_groups for p in params) flat_tensor = torch.empty(partition_sz * world_size, @@ -1076,13 +1177,11 @@ def _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_proc for i in range(world_size): partitions.append(flat_tensor.narrow(0, partition_sz * i, partition_sz)) - if params[0].ds_secondary_tensor is not None and not forward: - use_secondary_tensor = True + if use_secondary_tensor: instrument_w_nvtx( torch.cat)([p.ds_secondary_tensor.to(get_accelerator().current_device_name()) for p in params], out=partitions[rank_in_group]) else: - use_secondary_tensor = False instrument_w_nvtx(torch.cat)([p.ds_tensor.to(get_accelerator().current_device_name()) for p in params], out=partitions[rank_in_group]) handle = _dist_allgather_fn(partitions[rank_in_group], flat_tensor, ds_process_group) @@ -1094,12 +1193,10 @@ def _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_proc partitions=partitions, world_size=world_size, use_secondary_tensor=use_secondary_tensor, - forward=forward, ) @instrument_w_nvtx def all_gather_coalesced(params: Iterable[Parameter], - forward: bool = True, safe_mode: bool = False, quantize: bool = False) -> AllGatherCoalescedHandle: @@ -1118,8 +1215,8 @@ def all_gather_coalesced(params: Iterable[Parameter], ds_process_group = self.ds_process_group rank_in_group = self.rank world_size = self.dp_world_size - use_secondary_tensor = False - if self.zero_param_process_group and not forward: + 
use_secondary_tensor = params[0].ds_secondary_tensor is not None + if self.zero_param_process_group and use_secondary_tensor: ds_process_group = self.zero_param_process_group #intragroup rank_in_group = self.rank_in_group world_size = self.num_ranks_in_param_group @@ -1147,12 +1244,12 @@ def all_gather_coalesced(params: Iterable[Parameter], if len(params) == 1: # have an opportunity to avoid some intermediate memory allocations - param, = params + param = params[0] buffer_size = math.ceil(param.ds_numel / world_size) * world_size - if not forward and param.ds_secondary_tensor is not None: + if use_secondary_tensor: buffer_size = param.ds_secondary_tensor.shape[0] * world_size #make sure out is appropriately sized - param_ds_tensor = param.ds_secondary_tensor if not forward and param.ds_secondary_tensor is not None else param.ds_tensor + param_ds_tensor = param.ds_secondary_tensor if use_secondary_tensor else param.ds_tensor param_buffer = torch.empty( buffer_size, dtype=param_ds_tensor.dtype if not quantize else torch.int8, @@ -1193,86 +1290,107 @@ def all_gather_coalesced(params: Iterable[Parameter], return AllGatherHandle(handle, param, quantization=quant_info) else: - if not quantize: - dtype_params = defaultdict(list) - for p in params: - dtype_params[p.ds_tensor.dtype].append(p) - handles = [] - for dtype, params in dtype_params.items(): - handles.append( - _all_gather_dtype(dtype, params, forward, world_size, rank_in_group, ds_process_group)) + if self.use_all_reduce_for_fetch_params and not quantize and not use_secondary_tensor: + # Use all_reduce instead of all_gather to fetch the module params + flat_buffer_size = sum(p.ds_numel_aligned for p in params) + flat_tensor = torch.zeros(flat_buffer_size, + dtype=get_only_unique_item(p.ds_tensor.dtype for p in params), + device=get_accelerator().current_device_name(), + requires_grad=False) + start_param = 0 + for param in params: + param.data = flat_tensor.narrow(0, start_param, param.ds_numel).view(param.ds_shape) + start = start_param + param.ds_tensor.ds_numel * self.get_partition_rank() + flat_tensor.narrow(0, start, param.ds_tensor.ds_numel).copy_(param.ds_tensor) - return MultipleAllGatherHandles(handles) + start_param += param.ds_numel + handle = dist.all_reduce(flat_tensor, group=ds_process_group, async_op=True) + + return AllReduceCoalescedHandle(handle=handle, params=params) else: - partition_sz = sum(p.ds_tensor.ds_numel for p in params) + if not quantize: + dtype_params = defaultdict(list) + for p in params: + dtype_params[p.ds_tensor.dtype].append(p) + handles = [] + for dtype, params in dtype_params.items(): + handles.append( + _all_gather_dtype(dtype, params, world_size, rank_in_group, ds_process_group)) - if params[0].ds_secondary_tensor is not None and not forward: - partition_sz = sum(p.ds_tensor.ds_numel * p.ds_secondary_tensor_num_of_groups for p in params) + return MultipleAllGatherHandles(handles) - flat_tensor = torch.empty(partition_sz * world_size, - dtype=torch.int8, - device=get_accelerator().current_device_name(), - requires_grad=False) - - if params[0].ds_secondary_tensor is not None and not forward: - use_secondary_tensor = True - if hasattr(params[0].ds_secondary_tensor, "ds_quant_scale"): - quantized_param = instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.data.to(get_accelerator().current_device_name()) for p in params - ]) - scales = instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) - for p in params - ]) - else: - quantized_param, 
scales = self.quantizer_module.quantize( - instrument_w_nvtx(torch.cat)([ - p.ds_secondary_tensor.to(get_accelerator().current_device_name()) for p in params - ])) else: - if hasattr(params[0].ds_tensor, "ds_quant_scale"): - quantized_param = instrument_w_nvtx(torch.cat)( - [p.ds_tensor.data.to(get_accelerator().current_device_name()) for p in params]) - scales = instrument_w_nvtx(torch.cat)([ - p.ds_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) for p in params - ]) + partition_sz = sum(p.ds_tensor.ds_numel for p in params) + + if use_secondary_tensor: + partition_sz = sum(p.ds_tensor.ds_numel * p.ds_secondary_tensor_num_of_groups + for p in params) + + flat_tensor = torch.empty(partition_sz * world_size, + dtype=torch.int8, + device=get_accelerator().current_device_name(), + requires_grad=False) + + if use_secondary_tensor: + if hasattr(params[0].ds_secondary_tensor, "ds_quant_scale"): + quantized_param = instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.data.to(get_accelerator().current_device_name()) + for p in params + ]) + scales = instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) + for p in params + ]) + else: + quantized_param, scales = self.quantizer_module.quantize( + instrument_w_nvtx(torch.cat)([ + p.ds_secondary_tensor.to(get_accelerator().current_device_name()) + for p in params + ])) else: - quantized_param, scales = self.quantizer_module.quantize( - instrument_w_nvtx(torch.cat)( - [p.ds_tensor.to(get_accelerator().current_device_name()) for p in params])) - quant_scale_buffer = torch.empty( - scales.numel() * world_size, - dtype=torch.float32, - device=get_accelerator().current_device_name(), - requires_grad=False, - ) - handle = _dist_allgather_fn(quantized_param, flat_tensor, ds_process_group) - quant_handle = _dist_allgather_fn(scales, quant_scale_buffer, ds_process_group) - quant_info = QuantizationInfo() - quant_info.quantized_param = flat_tensor - quant_info.backend = self.quantizer_module - quant_info.quant_handle = quant_handle - quant_info.scale_buffer = quant_scale_buffer - quant_info.partition_sz = partition_sz - quant_info.world_size = world_size - return AllGatherCoalescedHandle( - allgather_handle=handle, - params=params, - partitions=None, - world_size=world_size, - use_secondary_tensor=use_secondary_tensor, - forward=forward, - quantization=quant_info, - ) - - def partition(param_list=None, backward=False, hierarchy=0, has_been_updated=False): + if hasattr(params[0].ds_tensor, "ds_quant_scale"): + quantized_param = instrument_w_nvtx(torch.cat)( + [p.ds_tensor.data.to(get_accelerator().current_device_name()) for p in params]) + scales = instrument_w_nvtx(torch.cat)([ + p.ds_tensor.ds_quant_scale.to(get_accelerator().current_device_name()) + for p in params + ]) + else: + quantized_param, scales = self.quantizer_module.quantize( + instrument_w_nvtx(torch.cat)( + [p.ds_tensor.to(get_accelerator().current_device_name()) for p in params])) + quant_scale_buffer = torch.empty( + scales.numel() * world_size, + dtype=torch.float32, + device=get_accelerator().current_device_name(), + requires_grad=False, + ) + handle = _dist_allgather_fn(quantized_param, flat_tensor, ds_process_group) + quant_handle = _dist_allgather_fn(scales, quant_scale_buffer, ds_process_group) + quant_info = QuantizationInfo() + quant_info.quantized_param = flat_tensor + quant_info.backend = self.quantizer_module + quant_info.quant_handle = quant_handle + quant_info.scale_buffer = quant_scale_buffer + 
quant_info.partition_sz = partition_sz + quant_info.world_size = world_size + return AllGatherCoalescedHandle( + allgather_handle=handle, + params=params, + partitions=None, + world_size=world_size, + use_secondary_tensor=use_secondary_tensor, + quantization=quant_info, + ) + + def partition(param_list=None, hierarchy=0, has_been_updated=False, free_data=True): cls = param print_rank_0(f"{'--'*hierarchy}----Partitioning param {debug_param2name_id_shape_device(cls)}", force=False) if param_list is None: param_list = [cls] - self._partition(param_list, has_been_updated=has_been_updated) + self._partition(param_list, has_been_updated=has_been_updated, free_data=True) def reduce_gradients_at_owner(param_list=None, hierarchy=0): cls = param @@ -1416,12 +1534,12 @@ def _all_gather(self, param_list, async_op=False, hierarchy=None): return handles - def _partition(self, param_list, force=False, has_been_updated=False): + def _partition(self, param_list, force=False, has_been_updated=False, free_data=True): for param in param_list: print_rank_0(f"Before Partitioning Param {param.ds_id}", force=False) if self.zero_param_process_group is not None: - self._partition_param_sec(param, has_been_updated=has_been_updated) - self._partition_param(param, has_been_updated=has_been_updated) + self._partition_param_sec(param) + self._partition_param(param, has_been_updated=has_been_updated, free_data=True) param.ds_status = ZeroParamStatus.NOT_AVAILABLE # if param.ds_tensor is not None: @@ -1429,7 +1547,7 @@ def _partition(self, param_list, force=False, has_been_updated=False): # "After the parameters are initially partitioned, make sure we are not recreating the partition." #print_rank_0(f"After Partitioning Param {param.ds_id} {param.ds_tensor.size()} {param.ds_tensor}",force=False) @instrument_w_nvtx - def _partition_param(self, param, buffer=None, has_been_updated=False): + def _partition_param(self, param, buffer=None, has_been_updated=False, free_data=True): assert param.ds_status is not ZeroParamStatus.INFLIGHT, f" {param} Cannot partition a param in flight" global reuse_buffers print_rank_0(f"Param id {param.ds_id} status is {param.ds_status}", force=False) @@ -1454,7 +1572,8 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): see_memory_usage(f'Before partitioning param {param.ds_id} {param.shape}', force=False) # param.data does not store anything meaningful in partitioned state - free_param(param) + if free_data: + free_param(param) see_memory_usage(f'After partitioning param {param.ds_id} {param.shape}', force=False) if param.ds_tensor.final_location == OffloadDeviceEnum.nvme: @@ -1500,6 +1619,7 @@ def _partition_param(self, param, buffer=None, has_been_updated=False): param.ds_tensor.ds_numel = partition_size param.ds_tensor.status = PartitionedParamStatus.AVAILABLE param.ds_tensor.final_location = final_location + param.ds_numel_aligned = tensor_size start = partition_size * self.get_partition_rank() end = start + partition_size @@ -1556,29 +1676,29 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False): #print_rank_0(f"SEC Param id {param.ds_id} status is {param.ds_status}", force=True) if param.ds_status is ZeroParamStatus.AVAILABLE: if param.ds_secondary_tensor is not None and not has_been_updated: ##param already partitioned - return #check padding tensor_size = self._aligned_size(param) partition_size = tensor_size // self.dp_world_size secondary_partition_size = int(tensor_size // self.num_ranks_in_param_group) - final_location = None - 
secondary_partitioned_tensor = torch.empty(secondary_partition_size, - dtype=param.dtype, - device=self.remote_device) - - if self.pin_memory: - secondary_partitioned_tensor = secondary_partitioned_tensor.pin_memory() - # quantize the tensor if it's not trainable - if not param.requires_grad and self.quantized_nontrainable_weights: - secondary_partitioned_tensor, secondary_partitioned_tensor.ds_quant_scale = self.quantizer_module.quantize( - secondary_partitioned_tensor) - secondary_partitioned_tensor.requires_grad = False - param.ds_secondary_tensor = secondary_partitioned_tensor - param.ds_secondary_tensor.ds_numel = secondary_partition_size - param.ds_secondary_tensor.status = PartitionedParamStatus.AVAILABLE - param.ds_secondary_tensor.final_location = final_location + if param.ds_secondary_tensor is None: + final_location = None + secondary_partitioned_tensor = torch.empty(secondary_partition_size, + dtype=param.dtype, + device=self.remote_device) + + if self.pin_memory: + secondary_partitioned_tensor = secondary_partitioned_tensor.pin_memory() + # quantize the tensor if it's not trainable + if not param.requires_grad and self.quantized_nontrainable_weights: + secondary_partitioned_tensor, secondary_partitioned_tensor.ds_quant_scale = self.quantizer_module.quantize( + secondary_partitioned_tensor) + secondary_partitioned_tensor.requires_grad = False + param.ds_secondary_tensor = secondary_partitioned_tensor + param.ds_secondary_tensor.ds_numel = secondary_partition_size + param.ds_secondary_tensor.status = PartitionedParamStatus.AVAILABLE + param.ds_secondary_tensor.final_location = final_location #use rank in group for secondary tensor secondary_start = secondary_partition_size * self.rank_in_group @@ -1586,19 +1706,17 @@ def _partition_param_sec(self, param, buffer=None, has_been_updated=False): secondary_end = secondary_start + secondary_partition_size one_dim_param = param.contiguous().view(-1) - start = partition_size * self.rank - end = start + partition_size - if start < param.ds_numel and end <= param.ds_numel: - if secondary_start < param.ds_numel and secondary_end <= param.ds_numel: - sec_src_tensor = one_dim_param.narrow(0, secondary_start, secondary_partition_size) - param.ds_secondary_tensor.copy_(sec_src_tensor) - else: - if start < param.ds_numel: - elements_to_copy = param.ds_numel - start - elements_to_copy_sec = elements_to_copy * param.ds_secondary_tensor_num_of_groups - param.ds_secondary_tensor.narrow(0, 0, elements_to_copy_sec).copy_( - one_dim_param.narrow(0, secondary_start, elements_to_copy_sec)) + # ds_numel is unpadded, so the last chunk of the secondary tensor might not be secondary_partition_size + sec_numel = max(0, min(param.ds_numel - secondary_start, secondary_partition_size)) + + # copy from full tensor to secondary tensor + param.ds_secondary_tensor.narrow(0, 0, + sec_numel).copy_(one_dim_param.narrow(0, secondary_start, sec_numel)) + + # TODO: This is a temporary fix to avoid the issue that 2nd tensor all-gather happens before 2nd tensor partition is done + if not get_accelerator().resolves_data_dependency(): + get_accelerator().current_stream().synchronize() print_rank_0(f"{param.ds_id} partitioned type {param.dtype} dev {param.device} shape {param.shape}", force=False) @@ -1633,7 +1751,8 @@ def _allgather_param(self, param, async_op=False, hierarchy=0): f'After allocate allgather param {debug_param2name_id_shape_status(param)} {aligned_param_size} {partition_size} ', force=False) - get_accelerator().synchronize() + if not 
get_accelerator().resolves_data_dependency(): + get_accelerator().synchronize() print_rank_0( f"{'--'* hierarchy}----allgather param with {debug_param2name_id_shape_status(param)} partition size={partition_size}" @@ -1766,10 +1885,12 @@ def _allgather_params_coalesced(self, param_list, hierarchy=0, quantize=False): param.data = gathered_tensor.narrow(0, 0, param.ds_numel).view(param.ds_shape).data # guarantee the communication to be completed - get_accelerator().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().synchronize() return None + @torch.no_grad() def _allgather_params(self, param_list, hierarchy=0): if len(param_list) == 0: return @@ -1815,10 +1936,10 @@ def _allgather_params(self, param_list, hierarchy=0): offset += param_scale_numel - dist.all_gather(partitions, - partitions[self.get_partition_rank()], - group=self.get_partition_dp_group(param), - async_op=False) + dist.all_gather_into_tensor(flat_tensor, + partitions[self.get_partition_rank()], + group=self.get_partition_dp_group(param), + async_op=False) if hasattr(param_list[0], 'ds_quant_scale'): dist.all_gather(flat_scale_tensor, param_list[0].ds_quant_scale, @@ -2129,7 +2250,7 @@ def __exit__(self, *exc): self.params[0].partition(param_list=self.params, has_been_updated=False) return - handles = [dist.broadcast(p, self.src_rank, group=p.ds_process_group, async_op=True) for p in self.params] + handles = [dist.broadcast(p.data, self.src_rank, group=p.ds_process_group, async_op=True) for p in self.params] for h in handles: h.wait() self.params[0].partition(param_list=self.params, has_been_updated=True) diff --git a/deepspeed/runtime/zero/partitioned_param_coordinator.py b/deepspeed/runtime/zero/partitioned_param_coordinator.py index 9bcf5a91bc95..3417080b1bea 100644 --- a/deepspeed/runtime/zero/partitioned_param_coordinator.py +++ b/deepspeed/runtime/zero/partitioned_param_coordinator.py @@ -9,6 +9,7 @@ from typing import Deque, Set from deepspeed import comm as dist +from deepspeed.utils import z3_leaf_module from deepspeed.utils.logging import logger from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.runtime.zero.partition_parameters import * @@ -16,6 +17,9 @@ from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus from deepspeed.utils.debug import debug_module2name_id, debug_param2name_id from deepspeed.accelerator import get_accelerator +import deepspeed.runtime.compiler as compiler +from deepspeed.runtime.compiler import is_compiling + import logging ENABLE_PROFILER = False @@ -31,6 +35,7 @@ def get_all_parameters(sub_module, recurse=False): return itertools.chain(sub_module.named_parameters(recurse=recurse), sub_module.ds_external_parameters()) +@compiler.disable def iter_params(module: Module, recurse=False) -> Iterable[Parameter]: return map(lambda pair: pair[1], get_all_parameters(module, recurse)) @@ -82,13 +87,15 @@ def __init__( timers=None, zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, + fast_sharding_for_leaf_module=False, + log_trace_cache_warnings=False, ) -> None: # mapping of param -> handle for each param that is currently in flight self.__inflight_param_registry = inflight_param_registry # keeps track of the number of submodules invoked so far. 
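Editor's note: to make the trace-cache logic in the coordinator changes below easier to follow, here is a simplified record-then-replay sketch. The class and method names are illustrative only and do not mirror the coordinator's internals.

class TinyTraceCache:
    def __init__(self):
        self.order = []        # submodules in first-seen execution order
        self.recording = True

    def step(self, module, step_id):
        # On the recording pass, remember the order; on replay, verify it.
        if self.recording:
            self.order.append(module)
        elif step_id >= len(self.order) or self.order[step_id] is not module:
            # Execution diverged from the recorded trace: invalidate and re-record,
            # mirroring the "Invalidate trace cache" path in the coordinator.
            self.order, self.recording = [module], True

    def upcoming(self, step_id, lookahead=2):
        # What a prefetcher could fetch ahead of time once the trace is trusted.
        return [] if self.recording else self.order[step_id + 1:step_id + 1 + lookahead]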
self.__step_id: int = 0 # network tracing mode - self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.RECORD + self.__trace_mode: ZeRoTraceMode = ZeRoTraceMode.INVALID # sequence of submodules/parameters in forward pass + backward pass self.__submodule_order: Iterable[Module] = [] self.__param_order: Iterable[__class__.__ParamInTrace] = [] @@ -125,6 +132,13 @@ def __init__( self.__max_ongoing_fetch_events: int = 2 self.__profiler = PartitionedParameterProfiler(timers if ENABLE_PROFILER else None) + # Whether to log trace cache warnings, e.g. invalidation events + self.__log_trace_cache_warnings = log_trace_cache_warnings + + # whether to enable fast fetch for the z3 leaf module. + # this will improve fetch speed but will not break down leaf module parameters to alleviate memory pressure. + self.fast_sharding_for_leaf_module = fast_sharding_for_leaf_module + """Tracing and Tracking TODO. consider performing trace before initializing PartitionedParameterCoordinator and passing trace results into constructor. This way all the code in here can @@ -149,46 +163,59 @@ def is_invalid_trace(self) -> bool: def is_record_trace(self) -> bool: return self.__trace_mode == ZeRoTraceMode.RECORD + def _clean_inflight_param_registry(self) -> None: + for param, handle in self.__inflight_param_registry.items(): + handle.wait() + self.__release_param(param) + self.__inflight_param_registry.clear() + def _invalidate_trace(self) -> None: if self.is_invalid_trace(): raise RuntimeError("attempted to invalidate already invalid trace") self.__trace_mode = ZeRoTraceMode.INVALID self._clear_trace_structures() + self._clean_inflight_param_registry() def trace_prologue(self, sub_module: Module) -> None: if self.is_complete_trace(): # sub_module must match expectation else invalidate trace cache if len(self.__submodule_order) <= self.__step_id: print_rank_0( - f"Invalidate trace cache @ step {self.__step_id} and module {sub_module.id}: " + f"Invalidate trace cache @ step {self.__step_id} and module {sub_module.ds_id}: " f"cache has only {len(self.__submodule_order)} modules", - force=True) + force=self.__log_trace_cache_warnings) self._invalidate_trace() return if sub_module != self.__submodule_order[self.__step_id]: - expected_module_id = self.__submodule_order[self.__step_id].id + expected_module_id = self.__submodule_order[self.__step_id].ds_id print_rank_0( f"Invalidate trace cache @ step {self.__step_id}: " - f"expected module {expected_module_id}, but got module {sub_module.id}", - force=True) + f"expected module {expected_module_id}, but got module {sub_module.ds_id}", + force=self.__log_trace_cache_warnings) self._invalidate_trace() + @compiler.disable def record_module(self, sub_module: Module) -> None: """adds sub module to trace""" + if is_compiling(): + return + if not self.is_record_trace(): raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") self.__submodule_order.append(sub_module) - self.__step_id_module_fetched_for[sub_module.id].append(self.__step_id) + self.__step_id_module_fetched_for[sub_module.ds_id].append(self.__step_id) def record_parameters(self, sub_module: Module) -> None: + if is_compiling(): + return """adds sub module to trace""" if not self.is_record_trace(): raise RuntimeError(f"attempted to record trace when status = {self.__trace_mode}") - step_id = self.__step_id_module_fetched_for[sub_module.id].popleft() - for param in sorted(set(iter_params(sub_module)), key=lambda p: p.ds_id): + step_id = self.__step_id_module_fetched_for[sub_module.ds_id].popleft() + for 
param in sorted(set(iter_params(sub_module, recurse=z3_leaf_module(sub_module))), key=lambda p: p.ds_id): self.__param_order.append(__class__.__ParamInTrace(param=param, step_id_last_used_at=step_id)) def construct_parameter_trace_from_module_trace(self): @@ -197,15 +224,17 @@ def construct_parameter_trace_from_module_trace(self): for sub_module in self.__submodule_order: self.record_parameters(sub_module) + @compiler.disable def reset_step(self) -> None: """indicate that we have completed one fwd+bwd for the model""" - if self.__inflight_param_registry: - raise RuntimeError(f"still have inflight params " - f"{[p.ds_summary() for p in self.__inflight_param_registry.keys()]}") + if is_compiling(): + return + + self._clean_inflight_param_registry() if not self.is_complete_trace(): # not self.trace_complete: # Make sure that recorded submodule orders are identical across ranks - assert_ints_same_as_other_ranks([m.id for m in self.__submodule_order]) + assert_ints_same_as_other_ranks([m.ds_id for m in self.__submodule_order]) if self.is_record_trace(): # Successfully recorded a trace @@ -218,7 +247,7 @@ def reset_step(self) -> None: self.__param_order = tuple(self.__param_order) # freeze self.__trace_mode = ZeRoTraceMode.COMPLETE print_rank_0( - f"completed record trace of {len(self.__submodule_order)} sub modules: {[m.id for m in self.__submodule_order]}", + f"completed record trace of {len(self.__submodule_order)} sub modules: {[m.ds_id for m in self.__submodule_order]}", force=False) else: # Enable trace recording for next forward/backward pass @@ -232,7 +261,6 @@ def reset_step(self) -> None: self.__most_recent_step_id_param_fetched_for = collections.defaultdict(lambda: int(-1e10)) self.__step_id_module_fetched_for = collections.defaultdict(lambda: collections.deque()) self.__step_id = 0 - self.__n_available_params = 0 self.__profiler.reset_events() def _dump_params(self, tag, sub_module, params, step_id=None): @@ -251,6 +279,7 @@ def _dump_param_ids(self, tag, mod_id, p_ids, step_id=None): Fetching, prefetching, and releasing parameters """ + @compiler.disable @instrument_w_nvtx @torch.no_grad() def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: @@ -261,19 +290,20 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: """ if logger.isEnabledFor(logging.DEBUG): debug_rank0( - f"{self.__step_id}: M{current_submodule.id}({type(current_submodule).__name__}) P{[p.ds_id for p in iter_params(current_submodule)]} " + f"{self.__step_id}: M{current_submodule.ds_id}({type(current_submodule).__name__}) P{[p.ds_id for p in iter_params(current_submodule, recurse=z3_leaf_module(current_submodule))]} " + str({ "avail": f"{self.__n_available_params:.1e}", "queue_sz": f"{len(self.__param_queue or [])}", "inflight": [p.ds_id for p in self.__inflight_param_registry], })) - params_to_fetch = frozenset(iter_params(current_submodule)) + params_to_fetch = set(iter_params(current_submodule, recurse=z3_leaf_module(current_submodule))) fetch_numel = sum( [p.partition_numel() for p in params_to_fetch if p.ds_status == ZeroParamStatus.NOT_AVAILABLE]) + if fetch_numel > 0: event_name = __class__.FORWARD_FETCH_SUBMIT if forward else __class__.BACKWARD_FETCH_SUBMIT - self._dump_param_ids(event_name, current_submodule.id, + self._dump_param_ids(event_name, current_submodule.ds_id, [p.ds_id for p in params_to_fetch if p.ds_status == ZeroParamStatus.NOT_AVAILABLE]) self.__profiler.start_event(event_name) # kick off all gather for params in the immediately required 
submodule @@ -287,9 +317,10 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: wait_numel = 0 wait_event_name = __class__.FORWARD_FETCH_WAIT if forward else __class__.BACKWARD_FETCH_WAIT self.__profiler.start_event(wait_event_name) + fast_fetch = self.fast_sharding_for_leaf_module and z3_leaf_module(current_submodule) # wait for parameters in the immediately needed submodule to become available for param in params_to_fetch: - param.ds_active_sub_modules.add(current_submodule.id) + param.ds_active_sub_modules.add(current_submodule.ds_id) if logger.isEnabledFor(logging.DEBUG): debug_rank0(f"-wait: {param.ds_summary()}") if param in self.__inflight_param_registry: @@ -300,16 +331,18 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: if len(self.__ongoing_fetch_events) > self.__max_ongoing_fetch_events: self.__ongoing_fetch_events.popleft().synchronize() - self.__inflight_param_registry.pop(param).wait() + self.__inflight_param_registry.pop(param).wait(handle_dependency=not fast_fetch) - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().handles_memory_backpressure() and not fast_fetch: event = get_accelerator().Event() event.record() self.__ongoing_fetch_events.append(event) assert param.ds_status == ZeroParamStatus.AVAILABLE, param.ds_summary() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): get_accelerator().current_stream().wait_stream(self.__allgather_stream) + if fast_fetch: + AllGatherCoalescedHandle.free_buffer() self.__profiler.stop_event(wait_event_name, wait_numel) # kick off parameter prefetches for upcoming modules @@ -331,7 +364,7 @@ def fetch_sub_module(self, current_submodule: Module, forward: bool) -> None: if discarded_from_prefetch_queue != params_not_already_fetched: raise RuntimeError( f"tracing error at step {self.__step_id}: \n" - f"module id: {current_submodule.id}, training: {current_submodule.training}\n" + f"module id: {current_submodule.ds_id}, training: {current_submodule.training}\n" f"expected the next {len(params_not_already_fetched)} parameters in the " f"parameter fetch queue to be {tuple(p.ds_summary(use_debug_name=True) for p in params_not_already_fetched)} \n" f"but got \n {tuple(p.ds_summary(use_debug_name=True) for p in discarded_from_prefetch_queue)}.") @@ -386,15 +419,25 @@ def _is_currently_on_nvme(param): @instrument_w_nvtx @torch.no_grad() - def release_sub_module(self, submodule: Module, backward: bool) -> None: + def release_sub_module(self, submodule: Module) -> None: """release the parameters of a sub module, assuming they meet conditions to be released.""" params_to_release = (self.__params_to_release(submodule, self.__step_id) if self.is_complete_trace() else set( - p.ds_id for p in iter_params(submodule))) - for param in iter_params(submodule): - param.ds_active_sub_modules.discard(submodule.id) + p.ds_id for p in iter_params(submodule, recurse=z3_leaf_module(submodule)))) + + free_data = not z3_leaf_module(submodule) or not self.fast_sharding_for_leaf_module + if not free_data: + # wait for the computation to finish and launch as early as possible. 
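+            # when fast sharding for leaf modules is enabled, __release_param() below is called
+            # with free_data=False and param.data is redirected to this 1-element placeholder,
+            # instead of freeing each parameter's gathered data individually inside the leaf module.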
+ empty_buffer = torch.empty(1, device=get_accelerator().current_device()) + + for param in iter_params(submodule, recurse=z3_leaf_module(submodule)): + param.ds_active_sub_modules.discard(submodule.ds_id) if param.ds_id in params_to_release and not param.is_external_param: - self.__release_param(param, backward) + self.__release_param(param, free_data) + if not free_data: + if param.ds_id in params_to_release and not param.is_external_param: + # empty buffer ensures that all computations are complete + param.data = empty_buffer @instrument_w_nvtx @torch.no_grad() @@ -402,13 +445,13 @@ def release_and_reset_all(self, module: Module) -> None: """release all module parameters""" for param in iter_params(module, recurse=True): if param in self.__inflight_param_registry: - raise RuntimeError(f"param {param.ds_summary()} still in flight") + self.__inflight_param_registry.pop(param).wait() # TODO. make this throw if if there are still active submodules. currently # there's a hook execution issue param.ds_active_sub_modules.clear() - self.__release_param(param, backward=False) - + self.__release_param(param) + self.__n_available_params = 0 for param in iter_params(module, recurse=True): if param.ds_status != ZeroParamStatus.NOT_AVAILABLE: raise RuntimeError(f"{param.ds_summary()} expected to be released") @@ -431,26 +474,34 @@ def __all_gather_params_(self, params: Set[Parameter], forward: bool, quantize: """for each partitioned parameter, kick off an async allgather and store the work handle for the in flight parameters.""" partitioned_params = [] - all_gather_numel = 0 + all_gather_numel = 0 # numel = num of elements for param in params: if param.ds_status == ZeroParamStatus.NOT_AVAILABLE: partitioned_params.append(param) all_gather_numel += param.ds_numel if partitioned_params: - partitioned_params self.__n_available_params += all_gather_numel - with get_accelerator().stream(self.__allgather_stream): - event_name = __class__.FORWARD_ALL_GATHER if forward else __class__.BACKWARD_ALL_GATHER - self.__profiler.start_event(event_name) - handle = partitioned_params[0].all_gather_coalesced(partitioned_params, - forward=forward, - quantize=quantize) - self.__profiler.stop_event(event_name, all_gather_numel) - - for param in partitioned_params: - assert param.ds_status == ZeroParamStatus.INFLIGHT, param.ds_summary() - self.__inflight_param_registry[param] = handle + # here we need to handle a special case where some of the parameters have a valid hpz secondary tensor (e.g. they are not trainable so their secondary tensor never expire) but others do not. 
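+            # split the params into two groups and launch one coalesced all-gather per group,
+            # so that params served from a valid hpz secondary tensor are never mixed in the
+            # same handle with params that must be gathered from their primary partition.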
+ partitioned_params_with_secondary_tensors = [ + p for p in partitioned_params if p.ds_secondary_tensor is not None + ] + partitioned_params_without_secondary_tensors = [ + p for p in partitioned_params if p.ds_secondary_tensor is None + ] + for param_group in [ + partitioned_params_with_secondary_tensors, partitioned_params_without_secondary_tensors + ]: + if not param_group: + continue + with get_accelerator().stream(self.__allgather_stream): + event_name = __class__.FORWARD_ALL_GATHER if forward else __class__.BACKWARD_ALL_GATHER + self.__profiler.start_event(event_name) + handle = param_group[0].all_gather_coalesced(param_group, quantize=quantize) + self.__profiler.stop_event(event_name, all_gather_numel) + for param in param_group: + assert param.ds_status == ZeroParamStatus.INFLIGHT, param.ds_summary() + self.__inflight_param_registry[param] = handle # Release swap buffers for persisted params on nvme since they will never be partitioned or evicted from GPU swap_persisted_params = [ @@ -459,12 +510,13 @@ def __all_gather_params_(self, params: Set[Parameter], forward: bool, quantize: if swap_persisted_params: swap_persisted_params[0].nvme_swapper.remove_partition_and_release_buffers(swap_persisted_params) + @compiler.disable @instrument_w_nvtx - def __release_param(self, param: Parameter, backward: bool) -> None: + def __release_param(self, param: Parameter, free_data: bool = True) -> None: if param.ds_status == ZeroParamStatus.AVAILABLE and not param.ds_active_sub_modules: if logger.isEnabledFor(logging.DEBUG): debug_rank0(f"-release: {param.ds_summary()}") - param.partition(backward=backward) + param.partition(free_data=free_data) self.__n_available_params -= param.ds_numel @instrument_w_nvtx @@ -473,7 +525,9 @@ def __params_to_release(self, submodule_to_release: Module, step_id: int) -> Set if not self.is_complete_trace(): raise RuntimeError("expected trace to be complete") - params_to_release = set(p.ds_id for p in iter_params(submodule_to_release) if not p.ds_persist) + params_to_release = set( + p.ds_id for p in iter_params(submodule_to_release, recurse=z3_leaf_module(submodule_to_release)) + if not p.ds_persist) # Problem: When prefetcher scans the param trace, it skips AVAILABLE params. # This creates issues if those params are released before the skipped uses: @@ -482,7 +536,7 @@ def __params_to_release(self, submodule_to_release: Module, step_id: int) -> Set # diverges from the trace. # Solution: Don't release params whose reuse was skipped by prefetch. This is # possible because we detect such skips during prefetch and mark those params. 
- for param in iter_params(submodule_to_release): + for param in iter_params(submodule_to_release, recurse=z3_leaf_module(submodule_to_release)): if self.__most_recent_step_id_param_fetched_for[param] > step_id: params_to_release.discard(param.ds_id) @@ -493,7 +547,7 @@ def __params_to_release(self, submodule_to_release: Module, step_id: int) -> Set for module in self.__submodule_order[step_id:]: if params_traversed >= self.__max_reuse_dist_in_numel: break - for param in iter_params(module): + for param in iter_params(module, recurse=z3_leaf_module(submodule_to_release)): params_to_release.discard(param.ds_id) params_traversed += param.ds_numel diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index c0fd9d6625c7..ee97b6278d9e 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -6,22 +6,29 @@ import sys import gc import collections -from typing import Deque, Dict, Tuple +import itertools +from typing import Deque, Dict, Set, Tuple, Container +from contextlib import contextmanager + from deepspeed import comm as dist -from deepspeed.utils import groups +from deepspeed.utils import groups, z3_leaf_parameter from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.utils import logger +from deepspeed.utils.torch import register_grad_hook from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce -from deepspeed.runtime.utils import inf, get_global_norm, is_model_parallel_parameter, get_only_unique_item +from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce, all_to_all_loco_quant_reduce +from deepspeed.runtime.utils import inf, is_model_parallel_parameter, get_only_unique_item from deepspeed.runtime.zero.partition_parameters import * from deepspeed.runtime.zero.config import ZeroStageEnum -from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum from deepspeed.runtime.zero.parameter_offload import DeepSpeedZeRoOffload -from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam +from deepspeed.runtime.zero.utils import apply_to_tensors_only, get_mapping_to_flat_buffer +from deepspeed.runtime.zero.offload_states import offload_adam_states, reload_adam_states +from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.runtime.swap_tensor.partitioned_param_swapper import PartitionedParamStatus +from deepspeed.runtime.swap_tensor.optimizer_utils import OptimizerSwapper from deepspeed.runtime.swap_tensor.partitioned_optimizer_swapper import PartitionedOptimizerSwapper from deepspeed.runtime.swap_tensor.pipelined_optimizer_swapper import PipelinedOptimizerSwapper from deepspeed.checkpoint.constants import OPTIMIZER_STATE_DICT, FP32_FLAT_GROUPS, PARTITION_COUNT, ZERO_STAGE, LOSS_SCALER @@ -66,6 +73,39 @@ def move_to_cpu(tensor_list): tensor.data = tensor.data.cpu() +@contextmanager +def unwrap_model_for_generation(model): + """ + For ZeRO-3 models, we gather the weights once to speed up generation. + """ + with GatheredParameters(model.parameters()): + # Removes the optimizer hooks from a DeepSpeed ZeRO-3 model. 
+ + # Remove hooks + if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"): + optimizer_offload = model.optimizer.parameter_offload + elif model.optimizer is not None: + optimizer_offload = model.optimizer + + for hook in optimizer_offload.forward_hooks: + hook.remove() + for hook in optimizer_offload.backward_hooks: + hook.remove() + + optimizer_offload.forward_hooks = [] + optimizer_offload.backward_hooks = [] + + yield model + + # Adds the optimizer hooks from a DeepSpeed ZeRO-3 model. + if model.optimizer is not None and hasattr(model.optimizer, "parameter_offload"): + optimizer_offload = model.optimizer.parameter_offload + elif model.optimizer is not None: + optimizer_offload = model.optimizer + optimizer_offload._register_deepspeed_module(optimizer_offload.module) + return + + INITIAL_MICRO_STEP_ID = -1 @@ -118,6 +158,9 @@ def __init__( zero_hpz_partition_size=1, zero_quantized_weights=False, zero_quantized_nontrainable_weights=False, + zero_module_granularity_threshold=0, + zeropp_loco_param=None, + log_trace_cache_warnings=False, ): see_memory_usage("Stage 3 initialize beginning", force=True) @@ -188,23 +231,25 @@ def __init__( mpu=mpu, zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=zero_quantized_weights, - zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights) + zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights, + zero_module_granularity_threshold=zero_module_granularity_threshold, + log_trace_cache_warnings=log_trace_cache_warnings, + ) self.persistent_parameters = self.parameter_offload.persistent_parameters self._configure_offloading(offload_optimizer_config, offload_param_config) # backup fused_adam optimizer init if self.offload_optimizer and self.partial_offload != 1.0: - backup_gpu_tensor = torch.randn(1, device='cuda').to(self.dtype) + backup_gpu_tensor = torch.randn(1, device=get_accelerator().device_name()).to(self.dtype) backup_gpu_param = torch.nn.Parameter(backup_gpu_tensor) assert type(init_optimizer) == DeepSpeedCPUAdam, 'Hybrid Optimizer Only Supports DeepSpeedCPUAdam' - self.backup_optimizer = FusedAdam([backup_gpu_param], - lr=self.optimizer.param_groups[0]["lr"], - bias_correction=self.optimizer.param_groups[0]["bias_correction"], - betas=self.optimizer.param_groups[0]["betas"], - eps=self.optimizer.param_groups[0]["eps"], - weight_decay=self.optimizer.param_groups[0]["weight_decay"], - amsgrad=self.optimizer.param_groups[0]["amsgrad"]) + self.backup_optimizer = torch.optim.AdamW([backup_gpu_param], + lr=self.optimizer.param_groups[0]["lr"], + betas=self.optimizer.param_groups[0]["betas"], + eps=self.optimizer.param_groups[0]["eps"], + weight_decay=self.optimizer.param_groups[0]["weight_decay"], + amsgrad=self.optimizer.param_groups[0]["amsgrad"]) # Multiple param_groups configs for back-up optimizer if len(self.optimizer.param_groups) > 1: for i in range(1, len(self.optimizer.param_groups)): @@ -213,14 +258,12 @@ def __init__( self.module = module self.elastic_checkpoint = elastic_checkpoint - self.inf_or_nan_tracker: Tensor = torch.zeros(1, - dtype=torch.bool, - device=get_accelerator().current_device_name(), - requires_grad=False) + self.device = get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu + + self.inf_or_nan_tracker: Tensor = torch.zeros(1, dtype=torch.bool, device=self.device, requires_grad=False) self.deepspeed_adam_offload = (self.offload_optimizer and type(init_optimizer) == DeepSpeedCPUAdam) - self.device 
= get_accelerator().current_device_name() if not self.offload_optimizer else OffloadDeviceEnum.cpu ### streams used for overlapping computation with communication self.reduce_and_partition_stream = None if get_accelerator().is_synchronized_device() else get_accelerator( ).Stream() if overlap_comm else get_accelerator().default_stream() @@ -246,6 +289,8 @@ def __init__( self.partition_count = dist.get_world_size(group=self.dp_process_group) + self.zeropp_loco_param = zeropp_loco_param + if mpu is None: self.model_parallel_group = None self.model_parallel_rank = 0 @@ -282,6 +327,7 @@ def __init__( # Holds a fused and flattened copy of the parameters self.fp16_partitioned_groups_flat = [] self.fp16_partitioned_groups_flat_numel = [] + self.fp16_partitioned_groups_flat_id = [] #defragmented pinned memory self.param_groups_fp16_flat_cpu_memory = [] @@ -334,20 +380,16 @@ def __init__( self.grads_in_ipg_bucket = [] self.params_in_ipg_bucket = [] - self.is_gradient_accumulation_boundary = True + self.params_already_reduced = {} self._release_ipg_buffers() self.previous_reduced_grads = None - # simplified param id - self.param_id = {} - - count = 0 - for i, params_group in enumerate(self.fp16_groups): + # model parameter traversal-based param id that's stable across runs + for params_group in self.fp16_groups: for param in params_group: - unique_id = id(param) - self.param_id[unique_id] = count - self.param_dict[count] = param - count = count + 1 + param_id = self.get_param_id(param) + self.param_dict[param_id] = param + self.params_already_reduced[param_id] = False #Largest partitioned param largest_partitioned_param_numel = 0 @@ -377,6 +419,8 @@ def __init__( #creates backward hooks for gradient partitioning ###Calls all gather param + self._grad_acc_hooks = [] + self._leaf_module_hooks = [] self.create_reduce_and_remove_grad_hooks() #exit(0) @@ -392,11 +436,18 @@ def __init__( self._link_all_hp_params() + self.offloaded_states: Set(OffloadDeviceEnum) = set() + if dist.get_rank(group=self.dp_process_group) == 0: see_memory_usage(f"After initializing ZeRO optimizer", force=True) def destroy(self): self.parameter_offload.destroy() + for hook in self._grad_acc_hooks: + hook.remove() + for hook in self._leaf_module_hooks: + hook.remove() + print_rank_0("Removed grad acc hooks", force=False) del self.__ipg_bucket_flat_buffer def initialize_ds_offload( @@ -416,6 +467,8 @@ def initialize_ds_offload( zero_param_parallel_group, zero_quantized_weights, zero_quantized_nontrainable_weights, + zero_module_granularity_threshold, + log_trace_cache_warnings, ): return DeepSpeedZeRoOffload(module=module, timers=timers, @@ -431,7 +484,9 @@ def initialize_ds_offload( mpu=mpu, zero_param_parallel_group=zero_param_parallel_group, zero_quantized_weights=zero_quantized_weights, - zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights) + zero_quantized_nontrainable_weights=zero_quantized_nontrainable_weights, + zero_module_granularity_threshold=zero_module_granularity_threshold, + log_trace_cache_warnings=log_trace_cache_warnings) def _get_trainable_parameter_groups(self): param_groups = [] @@ -525,21 +580,15 @@ def defragment(tensors: List[Tensor]) -> Tensor: cpu_buffer = torch.empty(sum(p.numel() for p in tensors), dtype=get_only_unique_item(t.dtype for t in tensors), device="cpu") - tensor_infos: List[Tuple[Tensor, int, int]] = [] + tensor_infos: List[Tuple[Tensor, int, int]] = get_mapping_to_flat_buffer(tensors) orig_device = get_only_unique_item(t.device for t in tensors) offset = 0 - for tensor in 
tensors: - tensor_numel = tensor.numel() + for tensor, offset, tensor_numel in tensor_infos: # move the tensor from device memory to host memory cpu_buffer.narrow(0, offset, tensor_numel).copy_(tensor) tensor.data = torch.empty(0, dtype=tensor.dtype, device=tensor.device) - # record some data so we can restore the device tensor later - tensor_infos.append((tensor, offset, tensor_numel)) - - offset += tensor_numel - gc.collect() get_accelerator().empty_cache() @@ -552,8 +601,8 @@ def defragment(tensors: List[Tensor]) -> Tensor: return device_buffer - def _get_param_coordinator(self, training): - return self.parameter_offload.get_param_coordinator(training) + def _get_param_coordinator(self): + return self.parameter_offload.get_param_coordinator() def _configure_offloading(self, offload_optimizer_config, offload_param_config): ###################### offload optimizer setup ################################## @@ -672,6 +721,9 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): # record total elements of parameter partitions in sub group self.fp16_partitioned_groups_flat_numel.append(sum(p.partition_numel() for p in sub_group)) + # record ds_ids of parameter partitions in sub group + self.fp16_partitioned_groups_flat_id.append([p.ds_id for p in sub_group]) + # record padding required to align group to world size (only applies to last rank) rank_requires_padding = dist.get_rank( self.dp_process_group) == dist.get_world_size(self.dp_process_group) - 1 @@ -680,19 +732,12 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): # move parameters to flattened buffer if not self.offload_param: # partitioned params remain in GPU during training # move parameter partitions into a single contiguous flat buffer - parameter_partitions: List[Tensor] = [] - for sub_group in self.fp16_groups: - for param in sub_group: - parameter_partitions.append(param.ds_tensor) - device_buffer = __class__.defragment(parameter_partitions) + parameter_partitions = self._get_parameter_partitions() + + # We need to keep the reference to this buffer to make sure you can free it in `offload_states` + self.lp_param_buffer = __class__.defragment(parameter_partitions) + self._set_fp16_partitioned_groups_flat() - # setup flat buffers per subgroup, these are each just sections of the - # contiguous flat buffer for all parameters that we created earlier - offset = 0 - for sub_group in self.fp16_groups: - sub_group_numel = sum(param.partition_numel() for param in sub_group) - self.fp16_partitioned_groups_flat.append(device_buffer.narrow(0, offset, sub_group_numel)) - offset += sub_group_numel else: # partitioned params offloaded to CPU when not in use # create a flat CPU memory allocation for each param group self._create_param_groups_fp16_flat_cpu_memory() @@ -738,6 +783,9 @@ def _create_fp16_partitions_with_defragmentation(self, fp16_param_groups): assert len(largest_partition_numel) > 0, f'Unexpected that largest partition is empty' self.fp16_groups[0][0].nvme_swapper.reserve_partitioned_swap_space(largest_partition_numel) + def _get_parameter_partitions(self) -> List[Tensor]: + return [param.ds_tensor for sub_group in self.fp16_groups for param in sub_group] + def _swap_in_sub_group_to_flat_buffer(self, flat_buffer, sub_group_id): offset = 0 elements_in_sub_group = sum([t.ds_numel for t in self.fp16_partitioned_groups[sub_group_id]]) @@ -812,10 +860,14 @@ def _create_fp32_partitions(self): for i, tensor in enumerate(self.fp16_partitioned_groups_flat): num_elements = 
self.fp16_partitioned_groups_flat_numel[i] + ds_id_begin = str(self.fp16_partitioned_groups_flat_id[i][0]) + ds_id_end = str(self.fp16_partitioned_groups_flat_id[i][-1]) + ds_id = ds_id_begin + '_' + ds_id_end # a partition of the fp32 master weights that will be updated by this process if self._swappable_optimizer_subgroup(i): self.fp32_partitioned_groups_flat.append(torch.Tensor()) + self.fp32_partitioned_groups_flat[i].ds_id = ds_id nvme_memory_usage += (fp32_element_size * num_elements) num_swappable_partitions += 1 @@ -852,6 +904,7 @@ def _create_fp32_partitions(self): else: self.fp32_partitioned_groups_flat.append(self.fp16_partitioned_groups_flat[i].to( self.device).clone().float().detach()) + self.fp32_partitioned_groups_flat[i].ds_id = ds_id self.fp32_partitioned_groups_flat[i].requires_grad = True # keep this in case internal optimizer uses it @@ -962,6 +1015,15 @@ def _partitioned_params_swap_out(self, i): swap_fp16_params[0].nvme_swapper.swap_out_partitioned_params(dst_fp16_params=swap_fp16_params, src_fp32_params=swap_fp32_params) + def _set_fp16_partitioned_groups_flat(self): + # setup flat buffers per subgroup, these are each just sections of the + # contiguous flat buffer for all parameters that we created earlier + offset = 0 + for sub_group in self.fp16_groups: + sub_group_numel = sum(param.partition_numel() for param in sub_group) + self.fp16_partitioned_groups_flat.append(self.lp_param_buffer.narrow(0, offset, sub_group_numel)) + offset += sub_group_numel + def initialize_optimizer_states(self): num_subgroups = len(self.fp16_groups) @@ -1003,10 +1065,6 @@ def initialize_optimizer_states(self): else: self.fp32_partitioned_groups_flat[i].grad = gradient_buffer.narrow(0, 0, num_elements) - # Initialize the optimizer states with the flattened fp32 partition. 
- if not is_adagrad: - self._optimizer_step(i) - if swappable_param_subgroup: self._partitioned_params_swap_out(i) @@ -1075,9 +1133,12 @@ def independent_gradient_partition_epilogue(self): self.__reduce_and_partition_ipg_grads() self.report_ipg_memory_usage(f"In ipg_epilogue after reduce_ipg_grads", 0) - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() + for param_id in self.params_already_reduced.keys(): + self.params_already_reduced[param_id] = False + #in case of cpu offload, averaged gradients are already in fp32_partitioned_groups_flat.grad #TODO: use a similar code path for both cpu_offload and non-cpu offload if not self.offload_optimizer: @@ -1099,7 +1160,7 @@ def overlapping_partition_gradients_reduce_epilogue(self): def create_reduce_and_remove_grad_hooks(self): print_rank_0(f'[Begin] Create gradient reduction hooks') - self.grad_accs = [] + self.leaf_parameters = defaultdict(list) for i, param_group in enumerate(self.fp16_groups): for param in param_group: if param.requires_grad: @@ -1111,26 +1172,80 @@ def create_reduce_and_remove_grad_hooks(self): #print(f"After all gather {param.device}, {param.shape}") def wrapper(param): - param_tmp = param.expand_as(param) - grad_acc = param_tmp.grad_fn.next_functions[0][0] @instrument_w_nvtx def reduce_partition_and_remove_grads(*notneeded): self.reduce_ready_partitions_and_remove_grads(param) - grad_acc.register_hook(reduce_partition_and_remove_grads) - self.grad_accs.append(grad_acc) + self._grad_acc_hooks.append(register_grad_hook(param, reduce_partition_and_remove_grads)) #print(f"param grad fn {param.expand_as(param).grad_fn}") - wrapper(param) + if z3_leaf_parameter(param): + self.leaf_parameters[param.ds_z3_leaf_module].append(param) + else: + wrapper(param) # Partition the parameter after creating the hook param.partition() + + # We delay reduce-scatter for all gradients in the leaf modules until the backward pass of the leaf module is done + for leaf_module, leaf_parameters in self.leaf_parameters.items(): + + def wrapper_pre_hook(params): + + def forward_pre_hook(module, input): + """Pre-forward hook to set backward hook on input tensors to the leaf module""" + module._leaf_module_inputs_remaining = 0 + + @instrument_w_nvtx + def reduce_leaf_module_grads(grad): + module._leaf_module_inputs_remaining -= 1 + # Make sure everything is done in the leaf module + if module._leaf_module_inputs_remaining == 0: + for param in params: + if param.grad is None: + param.grad = torch.zeros_like(param) + self.reduce_ready_partitions_and_remove_grads(param) + + def set_module_bwd_hook(tensor): + if tensor.requires_grad: + module._leaf_module_inputs_remaining += 1 + tensor.register_hook(reduce_leaf_module_grads) + return tensor + + output = apply_to_tensors_only(set_module_bwd_hook, input) + + return output + + return forward_pre_hook + + def wrapper_post_hook(): + + def forward_post_hook(module, input, output): + """Pre-forward hook to set backward hook on input tensors to the leaf module""" + module._leaf_output_required_grad_num = 0 + + def increment_rg_count_bwd_hook(tensor): + if tensor.requires_grad: + module._leaf_output_required_grad_num += 1 + return tensor + + apply_to_tensors_only(increment_rg_count_bwd_hook, output) + + if module._leaf_module_inputs_remaining == 0 and module._leaf_output_required_grad_num > 0: + raise RuntimeError( + "A module cannot be set as a leaf module when it does not have any input tensors that require 
gradients and has output tensors that require gradients. This is because the gradient reduction hook will not be called in this case." + ) + + return forward_post_hook + + self._leaf_module_hooks.append(leaf_module.register_forward_pre_hook(wrapper_pre_hook(leaf_parameters))) + self._leaf_module_hooks.append(leaf_module.register_forward_hook(wrapper_post_hook())) + print_rank_0(f'[End] Create gradient reduction hooks') def get_param_id(self, param): - unique_id = id(param) - return self.param_id[unique_id] + return OptimizerSwapper.parameter_id(param) def report_ipg_memory_usage(self, tag, param_elems): elem_count = self.elements_in_ipg_bucket + param_elems @@ -1158,7 +1273,7 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param): @instrument_w_nvtx @torch.no_grad() def __add_grad_to_ipg_bucket(self, param: Parameter) -> None: - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.wait_stream(get_accelerator().default_stream()) if self.contiguous_gradients and self.elements_in_ipg_bucket + param.grad.numel() <= self.reduce_bucket_size: @@ -1207,7 +1322,7 @@ def __reduce_and_partition_ipg_grads(self, safe_mode: bool = False) -> None: self.params_in_ipg_bucket.clear() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().handles_memory_backpressure(): event = get_accelerator().Event() event.record() self.param_reduce_events.append(event) @@ -1215,7 +1330,7 @@ def __reduce_and_partition_ipg_grads(self, safe_mode: bool = False) -> None: @instrument_w_nvtx def __avg_scatter_contiguous_grads(self, buffer_to_reduce: Tensor) -> List[Tensor]: dtype = buffer_to_reduce.dtype - if self.communication_data_type == self.dtype: + if self.communication_data_type != dtype: buffer_to_reduce = buffer_to_reduce.to(self.communication_data_type) if self.postscale_gradients and self.gradient_predivide_factor != 1.0: buffer_to_reduce = buffer_to_reduce.div_(self.gradient_predivide_factor) @@ -1268,7 +1383,10 @@ def __avg_scatter_grads(self, params_to_reduce: List[Parameter]) -> List[Tensor] global_world_size = dist.get_world_size() num_nodes = global_world_size // local_world_size if self.all2all_process_group is not None and num_nodes > 1: - grad_partitions_for_rank = all_to_all_quant_reduce(full_grads_for_rank, self.all2all_process_group) + grad_partitions_for_rank = (all_to_all_loco_quant_reduce(params_to_reduce, self.all2all_process_group, + self.zeropp_loco_param) + if self.zeropp_loco_param is not None else all_to_all_quant_reduce( + full_grads_for_rank, self.all2all_process_group)) else: grad_partitions_for_rank = reduce_scatter_coalesced(full_grads_for_rank, self.dp_process_group) @@ -1324,7 +1442,7 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): param_id = self.get_param_id(p) if param_id in self.norm_for_param_grads.keys(): param_norm = self.norm_for_param_grads[param_id] - total_norm += param_norm.item()**2 + total_norm += param_norm**2 # Sum across all model parallel GPUs. total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) @@ -1333,12 +1451,16 @@ def complete_grad_norm_calculation_for_cpu_offload(self, params): self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm_cuda[0]**(1. 
/ norm_type) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) - return total_norm + err = torch.tensor(-1.0, device=inf_or_nan.device, dtype=torch.float) + total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm + + return total_norm.cpu() @instrument_w_nvtx def partition_grads(self, params_to_release: List[Parameter], grad_partitions: List[Tensor]) -> None: @@ -1390,7 +1512,7 @@ def partition_grads(self, params_to_release: List[Parameter], grad_partitions: L else: fp32_grad_tensor = self.fp32_partitioned_groups_flat[i].grad.narrow( 0, dest_offset, grad_buffer.numel()) - fp32_grad_tensor.copy_(grad_buffer) + fp32_grad_tensor.copy_(grad_buffer.float()) # free the gradient if not get_accelerator().is_synchronized_device(): @@ -1501,7 +1623,7 @@ def set_none_gradients_to_zero(self, i, partition_id): for param_id in self.is_grad_computed[i][partition_id]: param = self.param_dict[param_id] if param.grad is None: - param.grad = torch.zero_like(param) + param.grad = torch.zeros_like(param) ######################Reduction Related Methods############################## @@ -1665,7 +1787,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): # Take max across all GPUs. self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() + total_norm = total_norm_cuda[0] else: # if dist.get_rank() == 0: # logger.info(f"Total Norm beginning {total_norm}") @@ -1676,7 +1798,7 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): # Sum across all model parallel GPUs. if len(grad_norms) == 0: - # FIX https://github.com/microsoft/DeepSpeed/issues/3564 + # FIX https://github.com/deepspeedai/DeepSpeed/issues/3564 total_norm_cuda = torch.tensor(0, dtype=gradients[0].dtype).to(get_accelerator().device_name()).double() else: @@ -1686,10 +1808,14 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda.item()**(1. / norm_type) + total_norm = total_norm_cuda**(1. 
/ norm_type) + + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + err = torch.tensor(-1.0, device=self.device, dtype=torch.float) + total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm return total_norm @@ -1751,7 +1877,7 @@ def _pre_step(self): see_memory_usage(f"In step before checking overflow", force=False) print_rank_0("Finished Tracing at Beginning of Step") - self._get_param_coordinator(training=True).hierarchy = 0 + self._get_param_coordinator().hierarchy = 0 print_rank_0("Finished Tracing at Beginning of Step") @@ -1798,7 +1924,7 @@ def _prepare_sub_group(self, sub_group_id, timer_names): def _optimizer_states_and_gradient_swap_in(self, sub_group_id, timer_names): param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] - fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + fp32_param_id = self.get_param_id(self.fp32_partitioned_groups_flat[sub_group_id]) assert self._swappable_optimizer_subgroup(sub_group_id), \ f'Parameter {fp32_param_id} of numel={param_length} is not swappable' @@ -1846,7 +1972,7 @@ def flatten_dense_tensors_aligned(self, tensor_list, alignment): def _optimizer_states_and_gradient_swap_out(self, sub_group_id, timer_names): param_length = self.fp16_partitioned_groups_flat_numel[sub_group_id] - fp32_param_id = id(self.fp32_partitioned_groups_flat[sub_group_id]) + fp32_param_id = self.get_param_id(self.fp32_partitioned_groups_flat[sub_group_id]) assert self._swappable_optimizer_subgroup(sub_group_id), \ f'Parameter {fp32_param_id} of numel={param_length} is not swappable' @@ -1882,6 +2008,25 @@ def _overflow_clean_up(self, prev_scale): see_memory_usage('After overflow after clearing gradients', force=False) + def _loco_err_buf_update(self, overflow: bool, scale=1.0): + """ + Loco Error Buffer update. + """ + if not overflow and scale == 1.0: return + if dist.get_rank() == 0: + logger.info(f"update loco-zero++ error buffer with overflow: {overflow}") + # FP32 grad should never exist. 
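+        # on overflow the intra/inter error-feedback buffers are discarded; otherwise they are
+        # rescaled by loss_scale / prev_scale so they stay consistent with the updated loss scale.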
+ # For speed, set model fp16 grad to None by default + for group in self.fp16_groups: + for p in group: + if hasattr(p, 'intra_ef_buf'): + if overflow: + del p.intra_ef_buf + del p.inter_ef_buf + else: + p.intra_ef_buf[1] *= scale + p.inter_ef_buf[1] *= scale + @instrument_w_nvtx def _overflow_check_and_loss_scale_update(self): @@ -1896,6 +2041,9 @@ def _overflow_check_and_loss_scale_update(self): if self.overflow: self._overflow_clean_up(prev_scale) + #update loco error buf + self._loco_err_buf_update(self.overflow, self.loss_scale / prev_scale) + return self.overflow @instrument_w_nvtx @@ -1910,7 +2058,7 @@ def _post_step(self, timer_names): if self.swap_optimizer: self.optimizer_swapper.log_timers() - # self.invalidate_secondary_tensor() # given that we want hpz in forward pass when no_grad is set, we need to keep the secondary tensor + self.invalidate_secondary_tensor() self.timers.log(timer_names) @@ -1949,7 +2097,7 @@ def step(self, closure=None): return norm_groups = self._get_norm_groups() - scaled_global_grad_norm = get_global_norm(norm_list=norm_groups) + scaled_global_grad_norm = torch.linalg.vector_norm(torch.stack(norm_groups)) # Stash unscaled gradient norm self._global_grad_norm = scaled_global_grad_norm / self.loss_scale @@ -1984,7 +2132,7 @@ def step(self, closure=None): # warn user about caching allocator flushes memory_stats = get_accelerator().memory_stats() alloc_retries = memory_stats.get("num_alloc_retries") - if alloc_retries == None: + if alloc_retries is None: alloc_retries = 0 if alloc_retries > self.n_caching_allocator_flushes: if dist.get_rank() == 0: @@ -2033,8 +2181,8 @@ def unscale_and_clip_grads(self, sub_group_id, total_norm): if self.clip_grad > 0.: # norm is in fact norm*scale clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale + clip = torch.clamp(clip, min=1.0) + combined_scale = clip * self.loss_scale self.fp32_partitioned_groups_flat[sub_group_id].grad.mul_(1. 
/ combined_scale) @@ -2069,10 +2217,11 @@ def has_overflow(self, partition_gradients=True): self.inf_or_nan_tracker += torch.isnan(self.grad_partitions_flat_buffer).any() self.inf_or_nan_tracker = self.inf_or_nan_tracker > 0 - overflow_gpu = self.inf_or_nan_tracker.clone().to(torch.uint8) + overflow_gpu = self.inf_or_nan_tracker.clone().to(get_accelerator().current_device_name()).to( + torch.uint8) self.inf_or_nan_tracker.zero_() - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): get_accelerator().default_stream().wait_stream(self.reduce_and_partition_stream) dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) @@ -2134,8 +2283,6 @@ def backward(self, loss, retain_graph=False): else: self.loss_scaler.backward(loss.float(), retain_graph=retain_graph) - self._get_param_coordinator(training=True).reset_step() - if self.swap_optimizer: self.optimizer_swapper.post_backward() @@ -2143,7 +2290,7 @@ def get_fp32_grad_partitions(self) -> Dict[int, Dict[int, Tensor]]: """get fp32 gradient partition dictionary accessed as grad_dict[parameter_group_index][parameter_index] """ - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() grad_dict = collections.defaultdict(dict) if self.offload_optimizer: @@ -2160,26 +2307,20 @@ def get_fp32_grad_partitions(self) -> Dict[int, Dict[int, Tensor]]: return grad_dict def _fp32_state_allgather(self, param, fp32_state_partition): - reduce_buffer = torch.zeros(self.partition_count * fp32_state_partition.numel(), + reduce_buffer = torch.empty(self.partition_count * fp32_state_partition.numel(), dtype=torch.float32, - device=param.device).flatten() + device=param.device) my_rank = dist.get_rank(group=self.dp_process_group) - partitions = [ - reduce_buffer.narrow(0, - fp32_state_partition.numel() * i, fp32_state_partition.numel()) - for i in range(self.partition_count) - ] - partitions[my_rank].data.copy_(fp32_state_partition.data, non_blocking=False) - - dist.all_gather(partitions, partitions[my_rank], group=self.dp_process_group) - + partition = reduce_buffer.narrow(0, fp32_state_partition.numel() * my_rank, fp32_state_partition.numel()) + partition.data.copy_(fp32_state_partition.data, non_blocking=False) + dist.all_gather_into_tensor(reduce_buffer, partition, group=self.dp_process_group) return reduce_buffer.narrow(0, 0, param.ds_numel).view(param.ds_shape) def get_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() if self.offload_optimizer: @@ -2190,8 +2331,26 @@ def get_fp32_grad_for_param(self, param) -> Tensor: return self._fp32_state_allgather(param, fp32_grad) + def set_fp32_grad_for_param(self, value, param): + if not param.requires_grad: + return + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + my_rank = dist.get_rank(group=self.dp_process_group) + value_partition = value.flatten().narrow(0, fp32_grad.numel() * my_rank, fp32_grad.numel()) + + 
fp32_grad.data.copy_(value_partition.data) + def _get_fp32_opt_state_partition(self, param, optim_state_key=None): - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] @@ -2238,17 +2397,11 @@ def set_full_hp_param(self, value, param, optim_state_key=None): ### Local API START ### - def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: - if not param.requires_grad: - return None - fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) - return fp32_opt_state - def get_local_fp32_grad_for_param(self, param) -> Tensor: if not param.requires_grad: return None - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): self.reduce_and_partition_stream.synchronize() if self.offload_optimizer: @@ -2258,6 +2411,30 @@ def get_local_fp32_grad_for_param(self, param) -> Tensor: fp32_grad = self.__param_id_to_grad_partition[param.ds_id].float() return fp32_grad + def set_local_grad_for_param(self, value, param): + if not param.requires_grad: + return + + assert value.numel() == param.ds_tensor.numel( + ), f" Number of elements do not match: {value.numel()} != {param.ds_tensor.ds_numel}" + + if not get_accelerator().resolves_data_dependency(): + self.reduce_and_partition_stream.synchronize() + + if self.offload_optimizer: + group_idx, dest_offset, num_elements = self.grad_position[self.get_param_id(param)] + fp32_grad = self.fp32_partitioned_groups_flat[group_idx].grad.narrow(0, dest_offset, num_elements) + else: + fp32_grad = self.__param_id_to_grad_partition[param.ds_id] + + fp32_grad.data.copy_(value.flatten().data) + + def get_local_fp32_param(self, param, optim_state_key=None) -> Tensor: + if not param.requires_grad: + return None + fp32_opt_state, group_idx = self._get_fp32_opt_state_partition(param, optim_state_key) + return fp32_opt_state + def set_local_hp_param(self, value, param, optim_state_key=None): if not param.requires_grad: return @@ -2272,7 +2449,7 @@ def set_local_hp_param(self, value, param, optim_state_key=None): if self._swappable_optimizer_subgroup(group_idx): self._optimizer_states_and_gradient_swap_out(group_idx) - logger.info(f"[set_local_hp_param][update the params' value successfully]") + # logger.info(f"[set_local_hp_param][update the params' value successfully]") ### Local API END ### @@ -2397,10 +2574,6 @@ def state_dict(self): if self.elastic_checkpoint: raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") - if self.swap_optimizer or self.params_in_nvme_and_cpu: - raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") - return self._rigid_state_dict() @@ -2484,6 +2657,20 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): self.optimizer.load_state_dict(state_dict[OPTIMIZER_STATE_DICT]) self._clear_fp32_optimizer_param_groups() + if self.swap_optimizer: + # Purge the swapped optimizer state, it was initialized to the freshly created model and not the checkpoint + self.optimizer_swapper.purge_state() + + if self.swap_optimizer: + # Touch all parameters to synchronize all buffers + timer_names = set() + self._partition_all_parameters() + for sub_group_id, group in enumerate(self.fp16_groups): + self._prepare_sub_group(sub_group_id, timer_names) + 
self._reassign_or_swap_out_partitioned_parameters(sub_group_id) + self._release_sub_group(sub_group_id, timer_names) + self._post_step(timer_names) + # restore fp32 partitions for curr_param, saved_param in zip(self.fp32_partitioned_groups_flat, state_dict[FP32_FLAT_GROUPS]): curr_param.data.copy_(saved_param.data) @@ -2491,8 +2678,9 @@ def _rigid_load_state_dict(self, state_dict, load_optimizer_states=True): # restore fp16 partitions from fp32 for sub_group_id in range(len(self.fp32_partitioned_groups_flat)): fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] - fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] - fp16_param.data.copy_(fp32_param.data) + if sum(fp32_param.size()) > 0: + fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] + fp16_param.data.copy_(fp32_param.data) # update fp16 unflattened params for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): @@ -2508,7 +2696,8 @@ def load_state_dict(self, load_optimizer_states=True, load_from_fp32_weights=False, checkpoint_folder=None, - load_serial=None): + load_serial=None, + param_shapes=None): r"""Loading a ZeRO checkpoint Arguments: state_dict_list: List of all saved ZeRO checkpoints, one for each saved partition. @@ -2537,28 +2726,118 @@ def load_state_dict(self, if self.elastic_checkpoint: raise NotImplementedError("ZeRO-3 does not yet support elastic checkpointing, please disable for now.") - if self.swap_optimizer or self.params_in_nvme_and_cpu: - raise NotImplementedError( - "ZeRO-3 does not yet support checkpointing with NVMe offloading, please disable for now.") - - self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)], - load_optimizer_states=load_optimizer_states) - - # when use loading checkpoint serial, after finish loading, we need to - # delete the temp state_dict_list variable to save memory, then trigger - # the next rank's loading - if load_serial != None: - load_serial += 1 - rank = dist.get_rank(group=self.dp_process_group) - local_rank = dist.get_local_rank() - del state_dict_list[rank] - rank_end = dist.get_world_size() - 1 - if local_rank != rank_end: - dist.send(tensor=load_serial, dst=rank + 1) + if checkpoint_folder: + self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights, + param_shapes) + else: + self._rigid_load_state_dict(state_dict_list[dist.get_rank(group=self.dp_process_group)], + load_optimizer_states=load_optimizer_states) + + # when use loading checkpoint serial, after finish loading, we need to + # delete the temp state_dict_list variable to save memory, then trigger + # the next rank's loading + if load_serial is not None: + load_serial += 1 + rank = dist.get_rank(group=self.dp_process_group) + local_rank = dist.get_local_rank() + del state_dict_list[rank] + rank_end = dist.get_world_size() - 1 + if local_rank != rank_end: + dist.send(tensor=load_serial, dst=rank + 1) + + if len(self.persistent_parameters) > 0: + self.persistent_parameters[0].partition(self.persistent_parameters) + # self.persistent_parameters[0].all_gather(self.persistent_parameters) # this will be done in checkpoint_event_epilogue() so remove it to prevent double all_gather + + def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights, + param_shapes): + self.load_hp_checkpoint_state_from_checkpoint_dir_stage3(checkpoint_folder, param_shapes) + + def load_hp_checkpoint_state_from_checkpoint_dir_stage3(self, checkpoint_dir, param_shapes): + """ Load optimizer and 
model states from the checkpoint directory. """ + checkpoint_dir = os.path.join(checkpoint_dir, "zero") + optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") + assert os.path.isfile( + optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' + + optim_sd = torch.load(optim_state_path, weights_only=False) + self._load_global_state_stage3(optim_sd) + + key_list = ["fp32", "exp_avg", "exp_avg_sq"] + + for key in key_list: + key_tensor = torch.empty(0) + for layer in param_shapes[0].keys(): + key_layer_state_partition = self.load_hp_checkpoint_state(os.path.join(checkpoint_dir, layer), key) + key_tensor = torch.cat((key_tensor, key_layer_state_partition)) + if key == "fp32": + self.fp32_partitioned_groups_flat[0].data.copy_(key_tensor) + self.optimizer.param_groups[0]['params'].append(self.fp32_partitioned_groups_flat[0]) + else: + optim_sd[OPTIMIZER_STATE_DICT]['state'][0][key] = key_tensor - if len(self.persistent_parameters) > 0: - self.persistent_parameters[0].partition(self.persistent_parameters) - # self.persistent_parameters[0].all_gather(self.persistent_parameters) # this will be done in checkpoint_event_epilogue() so remove it to prevent double all_gather + if self.swap_optimizer: + # Purge the swapped optimizer state, it was initialized to the freshly created model and not the checkpoint + self.optimizer_swapper.purge_state() + + if self.swap_optimizer: + # Touch all parameters to synchronize all buffers + timer_names = set() + self._partition_all_parameters() + for sub_group_id, group in enumerate(self.fp16_groups): + self._prepare_sub_group(sub_group_id, timer_names) + self._reassign_or_swap_out_partitioned_parameters(sub_group_id) + self._release_sub_group(sub_group_id, timer_names) + self._post_step(timer_names) + + self.optimizer.load_state_dict(optim_sd[OPTIMIZER_STATE_DICT]) + for param_group in self.optimizer.param_groups: + param_group['params'] = [] + + for sub_group_id in range(len(self.fp32_partitioned_groups_flat)): + fp32_param = self.fp32_partitioned_groups_flat[sub_group_id] + if sum(fp32_param.size()) > 0: + fp16_param = self.fp16_partitioned_groups_flat[sub_group_id] + fp16_param.data.copy_(fp32_param.data) + + for sub_group_id in range(len(self.fp16_partitioned_groups_flat)): + updated_params = self.unflatten(self.fp16_partitioned_groups_flat[sub_group_id], + self.fp16_partitioned_groups[sub_group_id]) + + for partitioned_param, q in zip(self.fp16_partitioned_groups[sub_group_id], updated_params): + partitioned_param.data = q.data + + def _load_global_state_stage3(self, sd): + self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler) + self.dynamic_loss_scale = sd.get('dynamic_loss_scale', self.dynamic_loss_scale) + self.overflow = sd.get('overflow', self.overflow) + + def load_hp_checkpoint_state(self, folder, key): + local_rank = dist.get_local_rank() + + # Load tensors from files and reshape them to flat vectors + loaded_checkpoint_state = torch.load(os.path.join(folder, f"{key}.pt"), weights_only=False).view(-1) + + # Partition the loaded data according to the local rank + world_size = dist.get_world_size(group=self.dp_process_group) + unpartitioned_numel = loaded_checkpoint_state.numel() + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + + if world_size * partitioned_numel != unpartitioned_numel: + padding_size = world_size * partitioned_numel - unpartitioned_numel + padding_tensor = torch.zeros(padding_size, dtype=loaded_checkpoint_state.dtype) + loaded_checkpoint_state = 
torch.cat([loaded_checkpoint_state, padding_tensor]) + checkpoint_state_partition = loaded_checkpoint_state.narrow(0, local_rank * partitioned_numel, + partitioned_numel) + + return checkpoint_state_partition + + def reset_swap_buffers(self): + timer_names = set() + for sub_group_id, group in enumerate(self.fp16_groups): + self._prepare_sub_group(sub_group_id, timer_names) + self._reassign_or_swap_out_partitioned_parameters(sub_group_id) + self._release_sub_group(sub_group_id, timer_names) def checkpoint_event_prologue(self): self._partition_all_parameters() @@ -2570,6 +2849,149 @@ def checkpoint_event_epilogue(self): def empty_partition_cache(self): self.parameter_offload.empty_partition_cache() + def offload_states(self, + include: Container[OffloadStateTypeEnum] = None, + device: OffloadDeviceEnum = OffloadDeviceEnum.cpu, + pin_memory: bool = True, + non_blocking: bool = False): + device = device.value + + self.empty_partition_cache() + + assert self.optimizer.__class__ == deepspeed.ops.adam.fused_adam.FusedAdam, f"Offloading is supported only for DeepSpeed FusedAdam." + + def needs_offload(target): + # return True + return target not in self.offloaded_states and (include == None or target in include) + + # HP param + if needs_offload(OffloadStateTypeEnum.hp_params): + if pin_memory: + if not hasattr(self, "hp_params_pin_buffers"): + self.hp_params_pin_buffers = [ + get_accelerator().pin_memory(torch.empty_like(t, device=device)) + for t in self.fp32_partitioned_groups_flat + ] + + for src_tensor, dest_buf in zip(self.fp32_partitioned_groups_flat, self.hp_params_pin_buffers): + dest_buf.copy_(src_tensor, non_blocking=non_blocking) + src_tensor.data = dest_buf + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.hp_params) + + # LP param + if needs_offload(OffloadStateTypeEnum.lp_params): + if pin_memory: + if not hasattr(self, "lp_param_contiguous_pin_buffer"): + self.lp_param_contiguous_pin_buffer = get_accelerator().pin_memory( + torch.empty_like(self.lp_param_buffer, device=device)) + self.lp_param_contiguous_pin_buffer.copy_(self.lp_param_buffer, non_blocking=non_blocking) + cpu_buffer = self.lp_param_contiguous_pin_buffer + else: + cpu_buffer = self.lp_param_buffer.to(device, non_blocking=non_blocking) + + self.lp_param_buffer.data = cpu_buffer + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer( + [p.ds_tensor for p in self.module.parameters()]): + tensor.data = cpu_buffer.narrow(0, offset, tensor_numel) + + self.fp16_partitioned_groups_flat.clear() + self.offloaded_states.add(OffloadStateTypeEnum.lp_params) + + # LP grad + if needs_offload(OffloadStateTypeEnum.lp_grads): + if pin_memory: + if not hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.lp_grad_partitions_flat_pin_buffers = get_accelerator().pin_memory( + torch.empty_like(self.grad_partitions_flat_buffer, device=device)) + self.lp_grad_partitions_flat_pin_buffers.copy_(self.grad_partitions_flat_buffer, + non_blocking=non_blocking) + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers + else: + self.grad_partitions_flat_buffer.data = self.grad_partitions_flat_buffer.data.to(device) + self.averaged_gradients = {} + + self.__param_id_to_grad_partition = {} + + self.offloaded_states.add(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if needs_offload(OffloadStateTypeEnum.contiguous_grad_buffer): + if hasattr(self, 
"_DeepSpeedZeroOptimizer_Stage3__ipg_bucket_flat_buffer"): + # Record properties like shape, strides, etc. as a meta tensor + self.grad_buffer_meta = self.__ipg_bucket_flat_buffer.to("meta") + self.__ipg_bucket_flat_buffer = None + self.offloaded_states.add(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if needs_offload(OffloadStateTypeEnum.optim_states): + offload_adam_states(self.optimizer, device, pin_memory=pin_memory, non_blocking=non_blocking) + self.offloaded_states.add(OffloadStateTypeEnum.optim_states) + + gc.collect() + get_accelerator().empty_cache() + + def reload_states(self, non_blocking: bool = False): + + device = get_accelerator().current_device_name() + + # HP param + if OffloadStateTypeEnum.hp_params in self.offloaded_states: + if hasattr(self, "hp_params_pin_buffers"): + for src, dest in zip(self.hp_params_pin_buffers, self.fp32_partitioned_groups_flat): + dest.data = src.to(device, non_blocking=non_blocking) + else: + for buf in self.fp32_partitioned_groups_flat: + buf.data = buf.data.to(device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.hp_params) + + # LP Param + if OffloadStateTypeEnum.lp_params in self.offloaded_states: + cpu_buffer = self.lp_param_contiguous_pin_buffer if hasattr( + self, "lp_param_contiguous_pin_buffer") else self.lp_param_buffer + self.lp_param_buffer.data = cpu_buffer.data.to(device, non_blocking=non_blocking) + self._set_fp16_partitioned_groups_flat() + + parameter_partitions = self._get_parameter_partitions() + for tensor, offset, tensor_numel in get_mapping_to_flat_buffer(parameter_partitions): + tensor.data = self.lp_param_buffer.narrow(0, offset, tensor_numel) + self.offloaded_states.remove(OffloadStateTypeEnum.lp_params) + + # LP grad + if OffloadStateTypeEnum.lp_grads in self.offloaded_states: + if hasattr(self, "lp_grad_partitions_flat_pin_buffers"): + self.grad_partitions_flat_buffer.data = self.lp_grad_partitions_flat_pin_buffers.to( + device, non_blocking=non_blocking) + else: + self.grad_partitions_flat_buffer.data = self.grad_partitions_flat_buffer.data.to( + device, non_blocking=non_blocking) + self.averaged_gradients = {} + + offset = 0 + all_params = list(itertools.chain.from_iterable(self.fp16_groups)) + for param in all_params: + self.__param_id_to_grad_partition[param.ds_id] = self.grad_partitions_flat_buffer.narrow( + 0, offset, param.partition_numel()) + offset += param.partition_numel() + + self.offloaded_states.remove(OffloadStateTypeEnum.lp_grads) + + # contiguous bucket + if OffloadStateTypeEnum.contiguous_grad_buffer in self.offloaded_states: + self.__ipg_bucket_flat_buffer = torch.empty_like(self.grad_buffer_meta, device=device) + # self.__ipg_bucket_flat_buffer.data = self.__ipg_bucket_flat_buffer.data.to(device) + self.offloaded_states.remove(OffloadStateTypeEnum.contiguous_grad_buffer) + + # Adam + if OffloadStateTypeEnum.optim_states in self.offloaded_states: + reload_adam_states(self.optimizer, device, non_blocking=non_blocking) + self.offloaded_states.remove(OffloadStateTypeEnum.optim_states) + + if non_blocking: + get_accelerator().synchronize() + def _handle_overflow(cpu_sum, x, i): import math diff --git a/deepspeed/runtime/zero/stage_1_and_2.py b/deepspeed/runtime/zero/stage_1_and_2.py index 4b92fe319bfa..2bece09bffc4 100755 --- a/deepspeed/runtime/zero/stage_1_and_2.py +++ b/deepspeed/runtime/zero/stage_1_and_2.py @@ -4,22 +4,20 @@ # DeepSpeed Team import torch -import os from deepspeed import comm as dist from packaging import version as pkg_version from 
collections import OrderedDict from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors -from deepspeed.runtime import ZeROOptimizer +from deepspeed.runtime.base_optimizer import ZeROOptimizer from deepspeed.runtime.fp16.loss_scaler import CreateLossScaler -from deepspeed.runtime.utils import (bwc_tensor_model_parallel_rank, get_global_norm, empty_cache, see_memory_usage, - inf, is_model_parallel_parameter, align_dense_tensors, all_gather_dp_groups, - all_gather_all_partitions) - +from deepspeed.runtime.utils import (empty_cache, see_memory_usage, inf, is_model_parallel_parameter, + align_dense_tensors, all_gather_dp_groups) from deepspeed.runtime.zero.config import ZeroStageEnum from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.adam import DeepSpeedCPUAdam from deepspeed.utils import logger +from deepspeed.utils.bwc import bwc_tensor_model_parallel_rank from deepspeed.moe.utils import is_moe_param from deepspeed.git_version_info import version @@ -29,7 +27,7 @@ from deepspeed.checkpoint.constants import (DS_VERSION, GROUP_PADDINGS, PARTITION_COUNT, LOSS_SCALER, SINGLE_PARTITION_OF_FP32_GROUPS, BASE_OPTIMIZER_STATE, BASE_OPTIMIZER_STATE_STEP, CLIP_GRAD, ZERO_STAGE, PARAM_SLICE_MAPPINGS) -from deepspeed.utils import link_hp_params +from deepspeed.utils import link_hp_params, lazy_init_hp_params_optimizer_state from deepspeed.checkpoint import enable_universal_checkpoint from deepspeed.utils import groups @@ -41,6 +39,7 @@ OPTIMIZER_GRADIENTS_TIMER = 'optimizer_gradients' OPTIMIZER_STEP_TIMER = 'optimizer_step' OPTIMIZER_TIMERS = [OPTIMIZER_ALLGATHER_TIMER, OPTIMIZER_GRADIENTS_TIMER, OPTIMIZER_STEP_TIMER] +INITIAL_MICRO_STEP_ID = -1 def input(msg): @@ -76,11 +75,6 @@ def get_alignment_padding(tensor_list, alignment): return (alignment - remainder) if remainder else remainder -def move_to_cpu(tensor_list): - for tensor in tensor_list: - tensor.data = tensor.data.cpu() - - def print_rank_msg(msg): print(f"rank {dist.get_rank()} - {msg}") @@ -94,6 +88,12 @@ def _get_padded_tensor(src_tensor, size): return padded_tensor +def _pad_tensor_by_size(src_tensor, pad_size, dtype, device): + padded_tensor = torch.zeros(src_tensor.numel() + pad_size, dtype=dtype, device=device) + padded_tensor.data[:src_tensor.numel()].copy_(src_tensor.data) + return padded_tensor + + class DeepSpeedZeroOptimizer(ZeROOptimizer): """ DeepSpeedZeroOptimizer designed to reduce the memory footprint @@ -225,7 +225,7 @@ def __init__(self, self.gradient_predivide_factor = gradient_predivide_factor self.postscale_gradients = postscale_gradients self.gradient_accumulation_steps = gradient_accumulation_steps - self.micro_step_id = 0 + self.micro_step_id = INITIAL_MICRO_STEP_ID self.ignore_unused_parameters = ignore_unused_parameters self.round_robin_gradients = round_robin_gradients @@ -238,11 +238,11 @@ def __init__(self, f"Currently only supported using ZeRO-Offload with DeepSpeedCPUAdam. But current setting is ZeRO-Offload:{self.cpu_offload} and optimizer type {type(self.optimizer)}." \ f"Either disable fp16_master_weights_and_gradients or enable {self.zero_stage_string} Offload with DeepSpeedCPUAdam." - if self.reduce_scatter: + if self.reduce_scatter and self.partition_gradients: valid_reduce_scatter_dtypes = (torch.float16, torch.bfloat16, torch.float32) assert self.communication_data_type in valid_reduce_scatter_dtypes, f"{self.zero_stage_string} supports {valid_reduce_scatter_dtypes} communication_data_type with reduce scatter enabled. 
Got: '{self.communication_data_type}'" - assert self.gradient_predivide_factor == 1.0, "gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled" - assert self.postscale_gradients, "pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled" + assert self.gradient_predivide_factor == 1.0, f"gradient_predivide_factor != 1.0 is not yet supported with {self.zero_stage_string} with reduce scatter enabled" + assert self.postscale_gradients, f"pre-scale gradients is not yet supported with {self.zero_stage_string} with reduce scatter enabled" # param flattened by groups self.bit16_groups = [] @@ -295,6 +295,7 @@ def __init__(self, self.round_robin_bit16_groups = [] self.round_robin_bit16_indices = [] + self.round_robin_bit16_meta = [] # Use different parallel to do all_to_all_reduce related things # padding on each partition for alignment purposes @@ -309,6 +310,7 @@ def __init__(self, for param in param_group['params']: if param.requires_grad: param.grad_accum = None + param.param_idx_in_group = len(trainable_parameters) trainable_parameters.append(param) self.bit16_groups.append(trainable_parameters) @@ -317,7 +319,14 @@ def __init__(self, see_memory_usage(f"Before moving param group {i} to CPU") # move all the parameters to cpu to free up GPU space for creating flat buffer - move_to_cpu(self.bit16_groups[i]) + + # Create temp CPU param copies, free accelerator tensors + orig_group_numel = 0 + for param in self.bit16_groups[i]: + orig_group_numel += param.numel() + param.cpu_data = param.data.cpu() + param.data = torch.empty(1).to(param.device) + empty_cache() see_memory_usage(f"After moving param group {i} to CPU", force=False) @@ -335,21 +344,27 @@ def __init__(self, self.round_robin_bit16_groups.append(round_robin_tensors) self.round_robin_bit16_indices.append(round_robin_indices) - # create flat buffer in CPU and move to GPU - self.bit16_groups_flat.append( - self.flatten_dense_tensors_aligned( - self.round_robin_bit16_groups[i], - self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i])).to( - get_accelerator().current_device_name())) - see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) + # Create meta tensors list, ordered according to round_robin_tensors + meta_tensors = [] + for param in round_robin_tensors: + meta_tensors.append(torch.zeros_like(param.cpu_data, device="meta")) + self.round_robin_bit16_meta.append(meta_tensors) - # Record padding required for alignment - if partition_id == dist.get_world_size(group=self.real_dp_process_group[i]) - 1: - padding = self.bit16_groups_flat[i].numel() - sum( - [t.numel() for t in self.round_robin_bit16_groups[i]]) - else: - padding = 0 - self.groups_padding.append(padding) + # create flat buffer in CPU + flattened_buffer = self.flatten_dense_tensors_aligned( + self.round_robin_bit16_groups[i], + self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[i]), + use_cpu_data=True) + + # free temp CPU params + for param in self.bit16_groups[i]: + del param.cpu_data + + # Move CPU flat tensor to the accelerator memory. 
+ self.bit16_groups_flat.append(flattened_buffer.to(get_accelerator().current_device_name())) + del flattened_buffer + + see_memory_usage(f"After flattening and moving param group {i} to GPU", force=False) if dist.get_rank(group=self.real_dp_process_group[i]) == 0: see_memory_usage(f"After Flattening and after emptying param group {i} cache", force=False) @@ -362,6 +377,18 @@ def __init__(self, data_parallel_partitions = self.get_data_parallel_partitions(self.bit16_groups_flat[i], i) self.parallel_partitioned_bit16_groups.append(data_parallel_partitions) + # Record padding required for alignment + left_boundary = sum([t.numel() for t in data_parallel_partitions[:partition_id]]) + curr_partition_size = data_parallel_partitions[partition_id].numel() + + if orig_group_numel <= left_boundary: + padding = curr_partition_size + elif orig_group_numel < left_boundary + curr_partition_size: + padding = left_boundary + curr_partition_size - orig_group_numel + else: + padding = 0 + self.groups_padding.append(padding) + # verify that data partition start locations are 4-byte aligned for partitioned_data in data_parallel_partitions: assert (partitioned_data.data_ptr() % (2 * self.nccl_start_alignment_factor) == 0) @@ -370,11 +397,16 @@ def __init__(self, # Note that the params in single_partition_of_fp32_groups is cloned and detached # from the origin params of the model. if not fp16_master_weights_and_gradients: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().float().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().float().detach() else: - self.single_partition_of_fp32_groups.append(self.parallel_partitioned_bit16_groups[i][partition_id].to( - self.device).clone().half().detach()) + weights_partition = self.parallel_partitioned_bit16_groups[i][partition_id].to( + self.device).clone().half().detach() + + if self.cpu_offload: + weights_partition = get_accelerator().pin_memory(weights_partition) + + self.single_partition_of_fp32_groups.append(weights_partition) # Set local optimizer to have flat params of its own partition. # After this, the local optimizer will only contain its own partition of params. 
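The padding bookkeeping added above records, for each rank, how many trailing elements of its slice of the aligned flat buffer are alignment padding rather than real parameter data. Below is a minimal standalone sketch of that arithmetic (not DeepSpeed code), assuming equal-sized partitions, which the alignment to a multiple of the data-parallel world size guarantees, and using made-up sizes:

def partition_padding(orig_group_numel: int, partition_size: int, partition_id: int) -> int:
    # First flat-buffer index owned by this rank (partitions are equal-sized in this sketch).
    left_boundary = partition_id * partition_size
    if orig_group_numel <= left_boundary:
        return partition_size                    # slice lies entirely in the alignment padding
    if orig_group_numel < left_boundary + partition_size:
        return left_boundary + partition_size - orig_group_numel   # slice straddles the end of real data
    return 0                                     # slice is fully covered by real parameters

if __name__ == "__main__":
    # Hypothetical sizes: a 1000-element group flattened into a 1024-element aligned buffer, 4 ranks.
    for rank in range(4):
        print(rank, partition_padding(orig_group_numel=1000, partition_size=256, partition_id=rank))
    # Ranks 0-2 report 0; rank 3 owns indices [768, 1024) and reports 24 padding elements.

The real code derives left_boundary by summing the numels of the preceding partitions, which reduces to the same value when all partitions are equal-sized.
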
@@ -491,6 +523,7 @@ def __init__(self, self.reset_partition_gradient_structures() # creates backward hooks for gradient partitioning + self._grad_acc_hooks = [] if self.partition_gradients or self.overlap_comm: self.create_reduce_and_remove_grad_hooks() @@ -520,9 +553,20 @@ def __init__(self, see_memory_usage(f"After initializing ZeRO optimizer", force=True) self._link_all_hp_params() + self._hp_optimizer_states_linked = False + self._enable_universal_checkpoint() self._param_slice_mappings = self._create_param_mapping() + def destroy(self): + for i, _ in enumerate(self.optimizer.param_groups): + for p in self.bit16_groups[i]: + if getattr(p, '_hp_mapping', None): + p._hp_mapping = None + for hook in self._grad_acc_hooks: + hook.remove() + self.print_rank_0("Removed grad acc hooks") + def _enable_universal_checkpoint(self): for lp_param_group in self.bit16_groups: enable_universal_checkpoint(param_list=lp_param_group) @@ -540,14 +584,14 @@ def _create_param_mapping(self): return param_mapping def _link_all_hp_params(self): - dp_world_size = dist.get_world_size(group=self.dp_process_group) if self.cpu_offload: self._get_offload_gradient_dict() for i, _ in enumerate(self.optimizer.param_groups): # Link bit16 and fp32 params in partition partition_id = dist.get_rank(group=self.real_dp_process_group[i]) - partition_size = self.bit16_groups_flat[i].numel() // dp_world_size + partition_size = self.bit16_groups_flat[i].numel() // dist.get_world_size( + group=self.real_dp_process_group[i]) flat_hp_partition = self.single_partition_of_fp32_groups[i] link_hp_params(lp_param_list=self.bit16_groups[i], flat_hp_partition=flat_hp_partition, @@ -557,9 +601,15 @@ def _link_all_hp_params(self): param_group_index=i, partition_start=partition_id * partition_size, partition_size=partition_size, - partition_optimizer_state=self.optimizer.state[flat_hp_partition], dp_group=self.real_dp_process_group[i]) + def _lazy_init_hp_params_optimizer_state(self): + if not self._hp_optimizer_states_linked: + for i, _ in enumerate(self.optimizer.param_groups): + lazy_init_hp_params_optimizer_state(self.bit16_groups[i], self.single_partition_of_fp32_groups[i], + self.optimizer.state) + self._hp_optimizer_states_linked = True + def is_moe_group(self, group): return 'moe' in group and group['moe'] @@ -569,7 +619,7 @@ def _configure_moe_settings(self): assert self.contiguous_gradients, "Contiguous Gradients in ZeRO Stage 2 must be set to True for MoE. Other code paths are not tested with MoE" # NOTE: To run ZeRO stage 1 with MoE, we need to set self.contiguous_gradients to True or ignore the assertion if not self.partition_gradients and not self.contiguous_gradients: - logger.warn( + logger.warning( "ZeRO Stage 1 has not been thoroughly tested with MoE. This configuration is still experimental.") assert self.reduce_scatter, "Reduce Scatter in ZeRO Stage 2 must be set to True for MoE. 
Other code paths are not tested with MoE" @@ -591,8 +641,7 @@ def _configure_moe_settings(self): assert self.ep_process_group is not None, "Expert parallel group should be configured with MoE" def _update_model_bit16_weights(self, group_index): - updated_params = self.unflatten(self.bit16_groups_flat[group_index], - self.round_robin_bit16_groups[group_index]) + updated_params = self.unflatten(self.bit16_groups_flat[group_index], self.round_robin_bit16_meta[group_index]) for p, q in zip(self.round_robin_bit16_groups[group_index], updated_params): p.data = q.data @@ -644,8 +693,6 @@ def initialize_optimizer_states(self): # which do lazy initialization of the state at the first call to step. if isinstance(self.optimizer, torch.optim.Adagrad): self.optimizer = torch.optim.Adagrad(self.single_partition_of_fp32_groups, **self.optimizer.defaults) - else: - self.optimizer.step() if not self.cpu_offload: for group in self.single_partition_of_fp32_groups: @@ -685,8 +732,9 @@ def reduce_gradients(self, pipeline_parallel=False): def get_first_param_index(self, group_id, param_group, partition_id): for index, param in enumerate(param_group): param_id = self.get_param_id(param) - if partition_id in self.param_to_partition_ids[group_id][param_id]: - return index + if group_id in self.param_to_partition_ids and param_id in self.param_to_partition_ids[group_id]: + if partition_id in self.param_to_partition_ids[group_id][param_id]: + return index return None def initialize_gradient_partitioning_data_structures(self): @@ -724,7 +772,8 @@ def independent_gradient_partition_epilogue(self): self.params_already_reduced[i] = False if self.overlap_comm: - get_accelerator().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().synchronize() # It is safe to clear previously reduced grads of other partitions self._clear_previous_reduced_grads() @@ -865,7 +914,7 @@ def wrapper(param, i): def reduce_partition_and_remove_grads(*notneeded): self.reduce_ready_partitions_and_remove_grads(param, i) - grad_acc.register_hook(reduce_partition_and_remove_grads) + self._grad_acc_hooks.append(grad_acc.register_hook(reduce_partition_and_remove_grads)) self.grad_accs.append(grad_acc) wrapper(param, i) @@ -882,7 +931,8 @@ def report_ipg_memory_usage(self, tag, param_elems): ) # create a flat tensor aligned at the alignment boundary - def flatten_dense_tensors_aligned(self, tensor_list, alignment): + def flatten_dense_tensors_aligned(self, tensor_list, alignment, use_cpu_data=False): + tensor_list = [param.cpu_data for param in tensor_list] if use_cpu_data else tensor_list return self.flatten(align_dense_tensors(tensor_list, alignment)) ############### Independent Partition Gradient ######################## @@ -917,7 +967,7 @@ def reduce_independent_p_g_buckets_and_remove_grads(self, param, i): assert grad_reduc is not None, f"rank {dist.get_rank()} - Invalid to reduce Param {param_id} with None gradient" self.grads_in_ipg_bucket.append(grad_reduc) - self.params_in_ipg_bucket.append((i, param, param_id)) + self.params_in_ipg_bucket.append((i, param.param_idx_in_group, param_id)) #make sure the average tensor function knows how to average the gradients if is_moe_param(param): @@ -930,6 +980,8 @@ def print_rank_0(self, message): logger.info(message) def gradient_reduction_w_predivide(self, tensor): + if tensor.size().numel() == 0: + return tensor dp_world_size = dist.get_world_size(group=self.dp_process_group) @@ -999,8 +1051,9 @@ def allreduce_and_scatter(self, bucket, numel_per_bucket=500000000, 
log=None, di def average_tensor(self, tensor): if self.overlap_comm: stream = self.reduction_stream - if not get_accelerator().is_synchronized_device(): + if not get_accelerator().resolves_data_dependency(): stream.wait_stream(get_accelerator().current_stream()) + get_accelerator().current_stream().wait_stream(stream) else: stream = get_accelerator().current_stream() @@ -1020,17 +1073,14 @@ def average_tensor(self, tensor): process_group = self.dp_process_group # count = 0 - for i, param, param_id in self.params_in_ipg_bucket: + for i, param_idx_in_group, param_id in self.params_in_ipg_bucket: + param = self.bit16_groups[i][param_idx_in_group] process_group = self.dp_process_group - grad_reduc = self.get_gradient_for_reduction(param) - #Averages gradients at parameter level if ipg has a moe param - #Otherwise averaging is done at the entire buffer level at the end of the loop - # MoE param have different groups + if self.ipg_bucket_has_moe_params: process_group = self.expert_dp_process_group[param.group_name] if is_moe_param( param) else self.dp_process_group - grad_reduc.data.div_(dist.get_world_size(group=process_group) / float(self.sequence_parallel_size)) partition_ids = self.param_to_partition_ids[i][param_id] assert all([p_id < dist.get_world_size(group=process_group) for p_id in partition_ids @@ -1069,8 +1119,7 @@ def average_tensor(self, tensor): curr_size += numel prev_id, prev_process_group = partition_id, process_group - if not self.ipg_bucket_has_moe_params: - tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size)) + tensor.div_(dist.get_world_size(group=self.dp_process_group) / float(self.sequence_parallel_size)) buckets = {} for i, (dst, bucket_offset, numel) in enumerate(rank_and_offsets): @@ -1088,14 +1137,14 @@ def average_tensor(self, tensor): if self.use_multi_rank_bucket_allreduce: self.allreduce_and_scatter(buckets[bucket_key], numel_per_bucket=self.reduce_bucket_size, - divide=self.ipg_bucket_has_moe_params, + divide=False, process_group=bucket_key) else: dst, process_group = bucket_key self.allreduce_no_retain(buckets[bucket_key], numel_per_bucket=self.reduce_bucket_size, rank=dst, - divide=self.ipg_bucket_has_moe_params, + divide=False, process_group=process_group) ############################################################################## @@ -1185,9 +1234,7 @@ def copy_gradients_to_cpu(): if self.micro_step_id > 0: accumulate_gradients() - - # at the boundary we will send 32bit directly - if not self.is_gradient_accumulation_boundary: + else: copy_gradients_to_cpu() def set_norm_for_param_grad(self, param): @@ -1326,7 +1373,7 @@ def reduce_ipg_grads(self): self.average_tensor(extra_large_grad_reduc.view(-1)) self.extra_large_param_to_reduce = None else: - self.average_tensor(self.ipg_buffer[self.ipg_index]) + self.average_tensor(self.ipg_buffer[self.ipg_index].narrow(0, 0, self.elements_in_ipg_bucket)) else: self.buffered_reduce_fallback(None, self.grads_in_ipg_bucket, @@ -1343,7 +1390,8 @@ def reduce_ipg_grads(self): stream = get_accelerator().current_stream() with get_accelerator().stream(stream): - for _, param, param_id in self.params_in_ipg_bucket: + for group_idx, param_idx_in_group, param_id in self.params_in_ipg_bucket: + param = self.bit16_groups[group_idx][param_idx_in_group] assert self.params_already_reduced[param_id] == False, \ f"The parameter {param_id} has already been reduced. 
\ @@ -1436,11 +1484,10 @@ def set_none_gradients_to_zero(self, i, partition_id): for param_id in self.is_grad_computed[i][partition_id]: param = self.param_dict[param_id] if param.grad is None: - param.grad = torch.zero_like(param) + param.grad = torch.zeros_like(param) ######################Reduction Related Methods############################## def allreduce_bucket(self, bucket, rank=None, log=None, divide=True, process_group=None): - rank = None tensor = self.flatten(bucket) process_group = self.dp_process_group if process_group is None else process_group @@ -1481,7 +1528,8 @@ def _clear_previous_reduced_grads(self): def allreduce_and_copy(self, small_bucket, rank=None, log=None, divide=True, process_group=None): process_group = self.dp_process_group if process_group is None else process_group if self.overlap_comm: - get_accelerator().synchronize() + if not get_accelerator().resolves_data_dependency(): + get_accelerator().synchronize() # It is safe to clear the previously reduced grads of other partitions self._clear_previous_reduced_grads() stream = self.reduction_stream @@ -1630,16 +1678,16 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): Total norm of the parameters (viewed as a single vector). """ norm_type = float(norm_type) + all_norms = [] if norm_type == inf: - total_norm = max(g.data.abs().max() for g in gradients) - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=self.dp_process_group) + for g in gradients: + all_norms.append(g.data.abs().max().float()) + total_norm = torch.stack(all_norms).max() + dist.all_reduce(total_norm, op=dist.ReduceOp.MAX, group=self.dp_process_group) # Take max across all GPUs. - self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.MAX) - total_norm = total_norm_cuda[0].item() + self._model_parallel_all_reduce(tensor=total_norm, op=dist.ReduceOp.MAX) else: - total_norm = 0.0 # if dist.get_rank() == 0: # logger.info(f"Total Norm beginning {total_norm}") for g, p in zip(gradients, params): @@ -1647,19 +1695,26 @@ def get_grad_norm_direct(self, gradients, params, norm_type=2): if hasattr(p, PIPE_REPLICATED) and p.ds_pipe_replicated: continue if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0): - param_norm = g.data.double().norm(2) - total_norm += param_norm.item()**2 - # Sum across all model parallel GPUs. - total_norm_cuda = get_accelerator().FloatTensor([float(total_norm)]) - dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.SUM, group=self.dp_process_group) + all_norms.append( + torch.linalg.vector_norm(g.data.double().detach(), + ord=norm_type).to(get_accelerator().current_device_name())) + if len(all_norms) > 0: + total_norm = torch.stack(all_norms).square().sum().float() + else: + total_norm = torch.tensor(0.0, dtype=torch.float32).to(self.device) + # Sum across all model parallel Device. + dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=self.dp_process_group) - self._model_parallel_all_reduce(tensor=total_norm_cuda, op=dist.ReduceOp.SUM) + self._model_parallel_all_reduce(tensor=total_norm, op=dist.ReduceOp.SUM) - total_norm = total_norm_cuda[0].item()**(1. / norm_type) + total_norm = total_norm.pow(1. 
/ norm_type) - if total_norm == float('inf') or total_norm == -float('inf') or total_norm != total_norm: - total_norm = -1 + norm_is_inf = total_norm.isinf() + norm_is_nan = total_norm.isnan() + inf_or_nan = norm_is_nan.logical_or(norm_is_inf) + err = torch.tensor(-1.0, device=self.device, dtype=torch.float) + total_norm = inf_or_nan * err + inf_or_nan.logical_not() * total_norm return total_norm # creates a flat fused tensor from the tensor list starting at the first_offset @@ -1733,18 +1788,20 @@ def scaled_global_norm(self, norm_type=2): assert norm_type == 2, "only L2 norm supported" norm_groups = [] for i, group in enumerate(self.bit16_groups): - partition_id = dist.get_rank(group=self.real_dp_process_group[i]) if self.cpu_offload: - norm_groups.append(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i])) - single_grad_partition = self.single_partition_of_fp32_groups[i].grad + # complete complete_grad_norm_calculation_for_cpu_offload return python float, moving back to + # torch.tensor as else statement returns tensor as well + norm = torch.tensor(self.complete_grad_norm_calculation_for_cpu_offload(self.params_in_partition[i]), + device=self.device) + norm_groups.append(norm) else: norm_groups.append(self.get_grad_norm_direct(self.averaged_gradients[i], self.params_in_partition[i])) if self.has_moe_layers: self._average_expert_grad_norms(norm_groups) - # note that the get_global_norm function only supports l2 norm - return get_global_norm(norm_list=norm_groups) + # calculating L2 norm + return torch.linalg.vector_norm(torch.stack(norm_groups), ord=norm_type) def get_bit16_param_group(self, group_no): bit16_partitions = self.parallel_partitioned_bit16_groups[group_no] @@ -1763,11 +1820,14 @@ def _optimizer_step(self, group_no): self.optimizer.step() self.optimizer.param_groups = original_param_groups + # We need to link optimizer state after the first step() call + self._lazy_init_hp_params_optimizer_state() + def step(self, closure=None): """ Not supporting closure. """ - self.micro_step_id = -1 + self.micro_step_id = INITIAL_MICRO_STEP_ID see_memory_usage(f"In step before checking overflow") @@ -1818,7 +1878,8 @@ def step(self, closure=None): # bit16_partitions[partition_id].data.copy_(fp32_partition.data) bit16_partitions = self.parallel_partitioned_bit16_groups[i] fp32_partition = self.single_partition_of_fp32_groups[i] - bit16_partitions[partition_id].data.copy_(fp32_partition.data) + bit16_partitions[partition_id].data.copy_( + fp32_partition.to(get_accelerator().current_device_name()).data) self.timers(OPTIMIZER_STEP_TIMER).stop() else: @@ -1866,16 +1927,11 @@ def step(self, closure=None): self.timers(OPTIMIZER_ALLGATHER_TIMER).start() # Gather the updated weights from everyone. # Then all partitions of the model parameters are updated and ready for next round forward. 
- if dist.has_all_gather_into_tensor(): - all_gather_all_partitions(global_flatten_group=self.bit16_groups_flat, - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group) - else: - all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) - + all_gather_dp_groups(groups_flat=self.bit16_groups_flat, + partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) self.timers(OPTIMIZER_ALLGATHER_TIMER).stop() # TODO: we probably don't need this? just to be safe @@ -1896,26 +1952,20 @@ def update_lp_params(self): # print_rank_0(f'update_lp_params {i=} {partition_id=}', force=True) # if i == 0: # print_rank_0(f'{fp32_partition[:10]=}', force=True) - - if dist.has_all_gather_into_tensor(): - all_gather_all_partitions(global_flatten_group=self.bit16_groups_flat, - partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group) - else: - all_gather_dp_groups(partitioned_param_groups=self.parallel_partitioned_bit16_groups, - dp_process_group=self.real_dp_process_group, - start_alignment_factor=self.nccl_start_alignment_factor, - allgather_bucket_size=self.allgather_bucket_size) + all_gather_dp_groups(groups_flat=self.bit16_groups_flat, + partitioned_param_groups=self.parallel_partitioned_bit16_groups, + dp_process_group=self.real_dp_process_group, + start_alignment_factor=self.nccl_start_alignment_factor, + allgather_bucket_size=self.allgather_bucket_size) def _average_expert_grad_norms(self, norm_groups): for i, norm in enumerate(norm_groups): if self.is_moe_param_group[i]: - scaled_norm = norm * 1.0 / float(dist.get_world_size(group=self.real_dp_process_group[i])) - scaled_norm_tensor = torch.tensor(scaled_norm, - device=get_accelerator().device_name(), - dtype=torch.float) + scaled_norm_tensor = norm * 1.0 / dist.get_world_size(group=self.real_dp_process_group[i]) + if self.device == 'cpu': + scaled_norm_tensor = scaled_norm_tensor.to(get_accelerator().current_device_name()) dist.all_reduce(scaled_norm_tensor, group=self.real_dp_process_group[i]) - norm_groups[i] = scaled_norm_tensor.item() + norm_groups[i] = scaled_norm_tensor.to(self.device) def unscale_and_clip_grads(self, grad_groups_flat, total_norm): # compute combined scale factor for this group @@ -1923,8 +1973,8 @@ def unscale_and_clip_grads(self, grad_groups_flat, total_norm): if self.clip_grad > 0.: # norm is in fact norm*scale clip = ((total_norm / self.loss_scale) + 1e-6) / self.clip_grad - if clip > 1: - combined_scale = clip * self.loss_scale + clip = torch.clamp(clip, min=1.0) + combined_scale = clip * self.loss_scale for grad in grad_groups_flat: if isinstance(grad, list): @@ -1938,24 +1988,26 @@ def _check_overflow(self, partition_gradients=True): self.overflow = self.has_overflow(partition_gradients) # `params` is a list / generator of torch.Variable - def has_overflow_serial(self, params, is_grad_list=False): + def has_overflow_serial(self, params): + invalid_grad_count = torch.zeros([1], dtype=torch.float, device=get_accelerator().current_device_name()) for p in params: - if p.grad is not None and self._has_inf_or_nan(p.grad.data): - return True - - return False + if p.grad is not 
None: + invalid_grad_count += self._has_inf_or_nan(p.grad) + return invalid_grad_count.bool() def has_overflow_partitioned_grads_serial(self): + invalid_grad_count = torch.zeros([1], dtype=torch.float, device=get_accelerator().current_device_name()) for i in range(len(self.bit16_groups)): for j, grad in enumerate(self.averaged_gradients[i]): - if grad is not None and self._has_inf_or_nan(grad.data, j): - return True - return False + if grad is not None: + invalid_grad_count += self._has_inf_or_nan(grad) + return invalid_grad_count.bool() def has_overflow(self, partition_gradients=True): if partition_gradients: overflow = self.local_overflow if self.cpu_offload else self.has_overflow_partitioned_grads_serial() - overflow_gpu = get_accelerator().ByteTensor([overflow]) + overflow_gpu = get_accelerator().ByteTensor([overflow]) if self.cpu_offload else overflow.byte().to( + get_accelerator().current_device_name()) '''This will capture overflow across all data parallel and expert parallel process Since expert parallel process are a subset of data parallel process''' dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX, group=self.dp_process_group) @@ -1965,9 +2017,7 @@ def has_overflow(self, partition_gradients=True): for group in self.bit16_groups: for param in group: params.append(param) - - overflow = self.has_overflow_serial(params, is_grad_list=partition_gradients) - overflow_gpu = get_accelerator().ByteTensor([overflow]) + overflow_gpu = self.has_overflow_serial(params).byte().to(get_accelerator().current_device_name()) # Since each model parallel GPU carries only part of the model, # make sure overflow flag is synced across all the model parallel GPUs @@ -1979,24 +2029,11 @@ def has_overflow(self, partition_gradients=True): # `x` is a torch.Tensor @staticmethod def _has_inf_or_nan(x, j=None): - try: - # if x is half, the .float() incurs an additional deep copy, but it's necessary if - # Pytorch's .sum() creates a one-element tensor of the same type as x - # (which is true for some recent version of pytorch). - cpu_sum = float(x.float().sum()) - # More efficient version that can be used if .sum() returns a Python scalar - # cpu_sum = float(x.sum()) - except RuntimeError as instance: - # We want to check if inst is actually an overflow exception. - # RuntimeError could come from a different error. - # If so, we still want the exception to propagate. 
- if "value cannot be converted" not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum: - return True - return False + float_x = x.float() + nan = float_x.isnan() + inf = float_x.isinf() + inf_or_nan = nan.logical_or(inf) + return inf_or_nan.float().max() def backward(self, loss, retain_graph=False): """ @@ -2182,7 +2219,7 @@ def refresh_fp32_params(self): # Extract optimizer state for current partition from merged states of all partitions def _partition_base_optimizer_state(self, state_key, all_partition_states, group_id): partition_id = dist.get_rank(group=self.real_dp_process_group[group_id]) - alignment = dist.get_world_size(group=self.real_dp_process_group[group_id]) + alignment = self.nccl_start_alignment_factor * dist.get_world_size(group=self.real_dp_process_group[group_id]) if torch.is_tensor(all_partition_states[0]): flat_merged_partitions = self.flatten_dense_tensors_aligned(all_partition_states, alignment) dp_partitions = self.get_data_parallel_partitions(flat_merged_partitions, group_id) @@ -2191,19 +2228,39 @@ def _partition_base_optimizer_state(self, state_key, all_partition_states, group # Assume non-tensor states are not partitioned and equal across ranks, so return first one return all_partition_states[0] - def _restore_base_optimizer_state(self, base_optimizer_group_states): + def _restore_step_from_elastic_checkpoint(self, all_state_dict): + assert BASE_OPTIMIZER_STATE_STEP in all_state_dict[0] + assert all(sd[BASE_OPTIMIZER_STATE_STEP] == all_state_dict[0][BASE_OPTIMIZER_STATE_STEP] + for sd in all_state_dict), "State dicts of all partitions must have the same step value" + return all_state_dict[0][BASE_OPTIMIZER_STATE_STEP] + + def _restore_base_optimizer_state(self, base_optimizer_group_states, base_optimizer_state_step, group_paddings): if type(base_optimizer_group_states) == dict: base_optimizer_group_states = base_optimizer_group_states['state'] + + saved_keys = base_optimizer_group_states[0].keys() + for i, group in enumerate(self.optimizer.param_groups): p = group['params'][0] - for key, saved in base_optimizer_group_states[i].items(): - if torch.is_tensor(self.optimizer.state[p][key]): - dst_tensor = self.optimizer.state[p][key] - src_tensor = _get_padded_tensor(saved, dst_tensor.numel()) - self.optimizer.state[p][key].data.copy_(src_tensor.data) + padding = 0 if group_paddings is None else group_paddings[i] + for key in saved_keys: + saved = base_optimizer_group_states[i][key] + + if torch.is_tensor(saved): + if key in self.optimizer.state[p]: + dst_tensor = self.optimizer.state[p][key] + src_tensor = _get_padded_tensor(saved, dst_tensor.numel()) + self.optimizer.state[p][key].data.copy_(src_tensor.data) + else: + self.optimizer.state[p][key] = _pad_tensor_by_size( + saved, padding, torch.float32, + torch.device('cpu') if self.cpu_offload else self.device) else: self.optimizer.state[p][key] = saved + for param_group in self.optimizer.param_groups: + param_group['step'] = base_optimizer_state_step + def get_ep_ranks(self, rank=0, group_name=None): from deepspeed.utils import groups expert_parallel_size_ = groups._get_expert_parallel_world_size(group_name) @@ -2231,51 +2288,23 @@ def _restore_elastic_base_optimizer_state(self, all_state_dict): partition_states[key] = self._partition_base_optimizer_state(key, all_partition_states, i) base_optimizer_group_states.append(partition_states) - self._restore_base_optimizer_state(base_optimizer_group_states) - - # Restore step - if 
BASE_OPTIMIZER_STATE_STEP in all_state_dict[0]: - assert all(sd[BASE_OPTIMIZER_STATE_STEP] == all_state_dict[0][BASE_OPTIMIZER_STATE_STEP] - for sd in all_state_dict), "State dicts of all partitions must have the same step value" - loaded_param_groups_step = all_state_dict[0][BASE_OPTIMIZER_STATE_STEP] - for param_group in self.optimizer.param_groups: - param_group['step'] = loaded_param_groups_step + self._restore_base_optimizer_state(base_optimizer_group_states, + self._restore_step_from_elastic_checkpoint(all_state_dict), None) def load_state_dict(self, state_dict_list, load_optimizer_states=True, load_from_fp32_weights=False, checkpoint_folder=None, - load_serial=None): + load_serial=None, + param_shapes=None): if checkpoint_folder: self._load_universal_checkpoint(checkpoint_folder, load_optimizer_states, load_from_fp32_weights) else: self._load_legacy_checkpoint(state_dict_list, load_optimizer_states, load_from_fp32_weights) def _load_universal_checkpoint(self, checkpoint_folder, load_optimizer_states, load_from_fp32_weights): - self._load_hp_checkpoint_state(checkpoint_folder) - - @property - def param_groups(self): - """Forward the wrapped optimizer's parameters.""" - return self.optimizer.param_groups - - def _load_hp_checkpoint_state(self, checkpoint_dir): - checkpoint_dir = os.path.join(checkpoint_dir, "zero") - optim_state_path = os.path.join(checkpoint_dir, "optimizer_state.pt") - assert os.path.isfile( - optim_state_path), f'{optim_state_path} containing optimizer global state is missing! Cannot proceed.' - optim_sd = torch.load(optim_state_path) - self._load_global_state(optim_sd) - - tp_rank = bwc_tensor_model_parallel_rank(mpu=self.mpu) - tp_world_size = self.mpu.get_slice_parallel_world_size() - for i, _ in enumerate(self.optimizer.param_groups): - for lp in self.bit16_groups[i]: - if lp._hp_mapping is not None: - #print(f"Loading {self.param_names[lp]} {tp_rank=} {tp_world_size=}") - lp.load_hp_checkpoint_state(os.path.join(checkpoint_dir, self.param_names[lp]), tp_rank, - tp_world_size) + self.load_hp_checkpoint_state_from_checkpoint_dir("bit16_groups", checkpoint_folder) def _load_global_state(self, sd): self.loss_scaler = sd.get(LOSS_SCALER, self.loss_scaler) @@ -2349,7 +2378,9 @@ def _load_legacy_checkpoint(self, state_dict_list, load_optimizer_states=True, l self._restore_elastic_base_optimizer_state(state_dict_list) else: # loading an elastic checkpoint into rigid exec - self._restore_base_optimizer_state(current_rank_sd[BASE_OPTIMIZER_STATE]) + self._restore_base_optimizer_state(current_rank_sd[BASE_OPTIMIZER_STATE], + current_rank_sd[BASE_OPTIMIZER_STATE_STEP], + current_rank_sd[GROUP_PADDINGS]) # At this point, the optimizer's references to the model's fp32 parameters are up to date. # The optimizer's hyperparameters and internal buffers are also up to date. 
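A few hunks above, the exception-based _has_inf_or_nan is replaced with pure tensor ops and has_overflow_serial accumulates a device-side counter, so the overflow flag never has to leave the accelerator until the single all-reduce. Below is a minimal standalone sketch of that pattern, using CPU tensors and hypothetical helper names rather than the optimizer's actual methods:

import torch

def has_inf_or_nan(x: torch.Tensor) -> torch.Tensor:
    # Reduce a tensor to a 0.0/1.0 flag without synchronizing with the host.
    fx = x.float()
    return fx.isnan().logical_or(fx.isinf()).float().max()

def local_overflow(grads) -> torch.Tensor:
    # Sum the per-gradient flags into one counter; in the optimizer this counter lives on the accelerator.
    invalid = torch.zeros(1, dtype=torch.float)
    for g in grads:
        if g is not None:
            invalid += has_inf_or_nan(g)
    return invalid.bool()

if __name__ == "__main__":
    clean = [torch.randn(4), torch.randn(3)]
    broken = [torch.randn(4), torch.tensor([1.0, float("inf")])]
    print(local_overflow(clean).item(), local_overflow(broken).item())   # False True
    # In the optimizer this flag is cast to bytes and combined across ranks with dist.all_reduce(ReduceOp.MAX).
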
@@ -2408,7 +2439,9 @@ def estimate_zero2_model_states_mem_needs(total_params, gpu_mem = 2 * total_params cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor else: - gpu_mem = 4 * total_params + int(16 * total_params / total_gpus) + # GPU's total_params multipliers: 2 = params_16bit, + # 18 = 2_grads_16bit + 4_grads_32bit + 4_params_32bit + 8_optimizer_states_32bit(momentum and variance) + gpu_mem = 2 * total_params + int(18 * total_params / total_gpus) cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor return int(cpu_mem), int(gpu_mem) diff --git a/deepspeed/runtime/zero/utils.py b/deepspeed/runtime/zero/utils.py index 0bf1ca4a894d..2d1cf17962d8 100755 --- a/deepspeed/runtime/zero/utils.py +++ b/deepspeed/runtime/zero/utils.py @@ -4,7 +4,7 @@ # DeepSpeed Team import os -from typing import List +from typing import List, Tuple import torch from deepspeed import comm as dist @@ -16,6 +16,9 @@ from deepspeed.utils.nvtx import instrument_w_nvtx from deepspeed.accelerator import get_accelerator +# ensure we only warn once, otherwise every iteration will trigger a warning +warned = False + def _initialize_parameter_parallel_groups(parameter_parallel_size=None): data_parallel_size = int(dist.get_world_size()) @@ -65,7 +68,6 @@ def get_lst_from_rank0(lst: List[int]) -> None: lst_tensor = torch.tensor( lst if dist.get_rank() == 0 else [-1] * len(lst), dtype=int, - # device=get_accelerator().current_device_name(), device=torch.device(get_accelerator().device_name(os.environ["LOCAL_RANK"])), requires_grad=False, ) @@ -87,3 +89,87 @@ def assert_ints_same_as_other_ranks(ints: List[int]) -> None: if ints != rank0_ints: raise RuntimeError(f"disagreement between rank0 and rank{dist.get_rank()}: " f"rank0: {rank0_ints}, rank{dist.get_rank()}: {ints}") + + +def is_builtin_type(obj): + # https://stackoverflow.com/a/17795199 + return obj.__class__.__module__ == '__builtin__' or obj.__class__.__module__ == "builtins" + + +def isinstance_namedtuple(obj: object) -> bool: + """ + Is this an instance of namedtuple/NamedTuple? + From: https://stackoverflow.com/a/62692640 + + Args: + obj (object): An object. + + Returns: + bool: True if namedtuple/NamedTuple else False. + """ + return isinstance(obj, tuple) and hasattr(obj, '_asdict') and hasattr(obj, '_fields') + + +def is_zero_param(parameter): + if not torch.is_tensor(parameter): + return False + return hasattr(parameter, 'ds_id') + + +def apply_to_tensors_only(function, value, warning_msg_fn=None): + """ + Apply `function` to every Tensor in `value`. + + Args: + functional: The function class to apply. + value (Any): Target object to apply `function` to. + + Returns: + Any: Output of `function`. + """ + if isinstance(value, (tuple, list)): + touched_outputs = [] + for elem in value: + touched_output = apply_to_tensors_only(function, elem) + touched_outputs.append(touched_output) + + if isinstance_namedtuple(value): + # namedtuples require a slightly different syntax. 
+ return value.__class__(*touched_outputs) + + return value.__class__(touched_outputs) + elif isinstance(value, dict): + # apply inplace to avoid recreating dict inherited objects + for key in value.keys(): + value[key] = apply_to_tensors_only(function, value[key]) + return value + + elif isinstance(value, torch.Tensor): + # this also applies to torch.Tensor's subclasses like torch.nn.parameter.Parameter + touched_output = function(value) + + # restore zero param attributes if those get stripped by `backward_function` + if not is_zero_param(touched_output) and is_zero_param(value): + touched_output.ds_param_alias = value + + return touched_output + else: + if not is_builtin_type(value): + global warned + if warning_msg_fn and not warned and dist.get_rank() == 0: + logger.warning(warning_msg_fn(value)) + warned = True + return value + + +def get_mapping_to_flat_buffer(tensors: List[torch.Tensor]) -> List[Tuple[torch.Tensor, int, int]]: + tensor_infos: List[Tuple[torch.Tensor, int, int]] = [] + + offset = 0 + for tensor in tensors: + tensor_numel = tensor.numel() + # record some data so we can restore the device tensor later + tensor_infos.append((tensor, offset, tensor_numel)) + offset += tensor_numel + + return tensor_infos diff --git a/deepspeed/sequence/cross_entropy.py b/deepspeed/sequence/cross_entropy.py new file mode 100644 index 000000000000..baa7bc1ea7a8 --- /dev/null +++ b/deepspeed/sequence/cross_entropy.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +import deepspeed.comm as dist + + +class _VocabSequenceParallelCrossEntropy(torch.autograd.Function): + + @staticmethod + def forward(ctx, vocab_seq_parallel_logits, target, sp_group): + # vocab_seq_parallel_logits: [S/P, B, V] + # target: [S/P, B] + # return: [S, B] + + # Need softmax for backward + softmax = torch.nn.functional.softmax(vocab_seq_parallel_logits, dim=-1) + ctx.vocab_size = vocab_seq_parallel_logits.size(2) + loss = torch.nn.functional.nll_loss(softmax.log().view(-1, ctx.vocab_size), target.view(-1), reduction='none') + + sp_world_size = dist.get_world_size(sp_group) + sp_rank = dist.get_rank(sp_group) + ctx.sp_world_size = sp_world_size + ctx.sp_rank = sp_rank + ctx.seqlen = vocab_seq_parallel_logits.size(0) * sp_world_size + batch_size = vocab_seq_parallel_logits.size(1) + + loss_all = torch.empty(ctx.seqlen, + batch_size, + dtype=vocab_seq_parallel_logits.dtype, + device=vocab_seq_parallel_logits.device) + dist.all_gather_into_tensor(loss_all, loss, group=sp_group) + + ctx.save_for_backward(softmax, target) + + return loss_all + + @staticmethod + def backward(ctx, grad_output): + softmax, target = ctx.saved_tensors + + step_seqlen = ctx.seqlen // ctx.sp_world_size + sp_rank = ctx.sp_rank + grad_output_part = grad_output[step_seqlen * sp_rank:step_seqlen * (sp_rank + 1), :] + + grad_input = softmax + grad_2d = grad_input.view(-1, ctx.vocab_size) + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + + grad_2d[arange_1d, target.view(-1)] -= 1 + grad_input.mul_(grad_output_part.unsqueeze(dim=-1)) + + return grad_input, None, None, None + + +def vocab_sequence_parallel_cross_entropy(vocab_parallel_logits, target, sp_group): + return _VocabSequenceParallelCrossEntropy.apply(vocab_parallel_logits, target, sp_group) diff --git a/deepspeed/sequence/fpdt_layer.py b/deepspeed/sequence/fpdt_layer.py new file mode 100644 index 000000000000..4fa2cc988a19 --- /dev/null +++ b/deepspeed/sequence/fpdt_layer.py @@ 
-0,0 +1,1225 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch + +from typing import Optional, Any, Tuple +from torch import Tensor +from packaging import version +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator + +try: + import flash_attn + from flash_attn.flash_attn_interface import _flash_attn_forward, _flash_attn_backward + flash_attn_version = version.parse(flash_attn.__version__) +except ImportError: + _flash_attn_forward = None + _flash_attn_backward = None + +from einops import rearrange +from .layer import single_all_to_all, apply_rotary_pos_emb + + +def _rotate_half_backward(x): + x = rearrange(x, '... (j d) -> ... j d', j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((x2, -x1), dim=-1) + + +def apply_rotary_pos_emb_backward(grad_output, freqs_cos, freqs_sin): + rot_dim = freqs_cos.shape[-1] + grad, grad_pass = grad_output[..., :rot_dim], grad_output[..., rot_dim:] + grad_t = (grad * freqs_cos) + (_rotate_half_backward(grad * freqs_sin)) + grad = grad_t if grad_pass.shape[-1] == 0 else torch.cat((grad_t, grad_pass), dim=-1) + return grad + + +def _update_out_and_lse( + out: torch.Tensor, + lse: torch.Tensor, + block_out: torch.Tensor, + block_lse: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + + block_out = block_out.to(torch.float32) + block_lse = block_lse.transpose(-2, -1).unsqueeze(dim=-1) + + new_lse = lse + torch.log1p(torch.exp(block_lse - lse)) + + out = torch.exp(lse - new_lse) * out + torch.exp(block_lse - new_lse) * block_out + + lse = new_lse + return out, lse + + +def update_out_and_lse( + out: Optional[torch.Tensor], + lse: Optional[torch.Tensor], + block_out: torch.Tensor, + block_lse: torch.Tensor, + slice_=None, +) -> Tuple[torch.Tensor, torch.Tensor]: + if out is None: + if slice_ is not None: + raise RuntimeError("first update_out_and_lse should not pass slice_ args") + out = block_out.to(torch.float32) + lse = block_lse.permute(0, 2, 1).contiguous().unsqueeze(dim=-1).contiguous() + elif slice_ is not None: + slice_out, slice_lse = out[slice_], lse[slice_] + slice_out, slice_lse = _update_out_and_lse(slice_out, slice_lse, block_out, block_lse) + out[slice_], lse[slice_] = slice_out, slice_lse + else: + out, lse = _update_out_and_lse(out, lse, block_out, block_lse) + return out, lse + + +class FPDT_InputConstruct(torch.nn.Module): + + def __init__(self, tokens, labels, loss_mask, attention_mask, position_ids, args, sp_size, sp_rank) -> None: + + super(FPDT_InputConstruct, self).__init__() + self.tokens = tokens + self.labels = labels + self.loss_mask = loss_mask + self.attention_mask = attention_mask + self.position_ids = position_ids + global_seq_len = tokens.shape[1] + batch_size = tokens.shape[0] + assert global_seq_len % sp_size == 0 + assert global_seq_len % args.ds_sequence_parallel_fpdt_chunk_size == 0 + num_chunk_per_gpu = global_seq_len // args.ds_sequence_parallel_fpdt_chunk_size + local_seq_len = global_seq_len // sp_size + assert local_seq_len % num_chunk_per_gpu == 0 + + self.num_chunk_per_gpu = num_chunk_per_gpu + self.chunk_size = local_seq_len // num_chunk_per_gpu + self.sp_size = sp_size + self.sp_rank = sp_rank + self.global_seq_len = global_seq_len + self.local_seq_len = local_seq_len + self.batch_size = batch_size + self.device = tokens.device + + def generate(self): + device = self.device + totalChunks = self.global_seq_len // self.chunk_size + token_chunk_idx = torch.arange(self.global_seq_len, device=device, dtype=torch.int) 
// self.chunk_size + chunk_to_gpu = torch.arange(totalChunks, device=device, dtype=torch.int) + chunk_to_gpu = chunk_to_gpu.reshape(self.num_chunk_per_gpu, -1).t().contiguous() + + gather_chunk = chunk_to_gpu.flatten().unsqueeze(1).contiguous() + mask = gather_chunk == token_chunk_idx + + indices = mask.nonzero(as_tuple=False) + gather_indices = indices[:, 0] + token_chunk_indices = indices[:, 1] + indices = torch.cat([token_chunk_indices[gather_indices == i] for i in range(gather_chunk.shape[0])]) + load_balanced_loss_mask = self.loss_mask[:, indices] if self.loss_mask is not None else self.loss_mask + + indices = indices.reshape(-1, self.chunk_size)[self.num_chunk_per_gpu * self.sp_rank:self.num_chunk_per_gpu * + (self.sp_rank + 1)].flatten().contiguous() + load_balanced_tokens = self.tokens[:, indices] + load_balanced_labels = self.labels[:, indices] if self.labels is not None else self.labels + + load_balanced_attention_mask = self.attention_mask if self.attention_mask is not None else self.attention_mask + load_balanced_position_ids = self.position_ids[:, + indices] if self.position_ids is not None else self.position_ids + + return load_balanced_tokens, load_balanced_labels, load_balanced_loss_mask, load_balanced_attention_mask, load_balanced_position_ids + + +class _FPDTGPUAttentionImpl_(torch.autograd.Function): + generate_vmap_rule = False + + @staticmethod + def forward(ctx: Any, + layernorm_output, + attention_mask, + inference_params, + rotary_pos_emb, + spg, + scatter_idx, + gather_idx, + hidden_size, + projection_size, + hidden_size_per_attention_head, + kv_projection_size, + qkv_linear_weight, + qkv_linear_bias, + dropout, + num_chunks=8, + cpu_offloading=True): + + do_save = layernorm_output.requires_grad + + if rotary_pos_emb is not None: + pos_emb_cos, pos_emb_sin = rotary_pos_emb[0].permute(1, 0, 2, 3), rotary_pos_emb[1].permute(1, 0, 2, 3) + ctx.pos_emb_cos = pos_emb_cos + ctx.pos_emb_sin = pos_emb_sin + else: + ctx.pos_emb_cos = None + ctx.pos_emb_sin = None + + with torch.no_grad(): + per_gpu_seq_len = layernorm_output.shape[0] + chunk_size = per_gpu_seq_len // num_chunks + assert chunk_size * num_chunks == per_gpu_seq_len + assert attention_mask is None + ctx.num_chunks = num_chunks + ctx.cpu_offloading = cpu_offloading + ctx.spg = spg + ctx.scatter_idx = scatter_idx + ctx.gather_idx = gather_idx + + device = get_accelerator().current_device_name() + ctx.device = device + ctx.dtype = layernorm_output.dtype + ctx.projection_size = projection_size + ctx.kv_projection_size = kv_projection_size + + global_q = [] + global_k = [] + global_v = [] + + ctx.softmax_scale = hidden_size_per_attention_head**(-0.5) + + ctx.dropout_p = dropout + ctx.window_size = (-1, -1) + ctx.alibi_slopes = None + + batch_size = layernorm_output.shape[1] + + global_o = [None for _ in range(num_chunks)] + global_lse = [None for _ in range(num_chunks)] + + for i in range(num_chunks): + + st = chunk_size * i + ed = st + chunk_size + + qkv_chunk = torch.matmul(layernorm_output[st:ed], qkv_linear_weight.t()) + qkv_linear_bias + + q_chunk = qkv_chunk[:, :, :projection_size].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + q_chunk = single_all_to_all(q_chunk, scatter_idx, gather_idx, 0, spg) + global_q_chunk_len = q_chunk.shape[1] + if rotary_pos_emb is not None: + q_chunk = apply_rotary_pos_emb(q_chunk, + pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)], + pos_emb_sin[:, 
global_q_chunk_len * i:global_q_chunk_len * (i + 1)]) + global_q.append(q_chunk) + + k_chunk = qkv_chunk[:, :, projection_size:projection_size + kv_projection_size].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + k_chunk = single_all_to_all(k_chunk, scatter_idx, gather_idx, 0, spg) + if rotary_pos_emb is not None: + k_chunk = apply_rotary_pos_emb(k_chunk, + pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)], + pos_emb_sin[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)]) + global_k.append(k_chunk) + + v_chunk = qkv_chunk[:, :, projection_size + kv_projection_size:].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + v_chunk = single_all_to_all(v_chunk, scatter_idx, gather_idx, 0, spg) + global_v.append(v_chunk) + + for k_i in range(len(global_k)): + causal_chunk = i == k_i + if flash_attn_version >= version.parse("2.6.0"): + block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(global_q[i], + global_k[k_i], + global_v[k_i], + ctx.dropout_p, + ctx.softmax_scale, + causal=causal_chunk, + window_size=ctx.window_size, + softcap=0.0, + alibi_slopes=ctx.alibi_slopes, + return_softmax=False) + else: + block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward(global_q[i], + global_k[k_i], + global_v[k_i], + ctx.dropout_p, + ctx.softmax_scale, + causal=causal_chunk, + window_size=ctx.window_size, + alibi_slopes=ctx.alibi_slopes, + return_softmax=False) + + global_o[i], global_lse[i] = update_out_and_lse(global_o[i], global_lse[i], block_out, block_lse) + + global_o[i] = global_o[i].to(q_chunk.dtype) + + output = [None for i in range(num_chunks)] + + for i in range(num_chunks): + global_lse[i] = global_lse[i][:, :, :, 0].permute(0, 2, 1).contiguous() + output[i] = single_all_to_all(global_o[i].to(ctx.dtype).contiguous(), gather_idx, scatter_idx, 0, spg) + output = torch.cat(output, dim=1) + + head_dim = output.shape[-1] + + if do_save: + ctx.save_for_backward(layernorm_output) + ctx.global_q = global_q + ctx.global_k = global_k + ctx.global_v = global_v + ctx.attn_output = global_o + ctx.attn_lse = global_lse + ctx.head_dim = head_dim + ctx.batch_size = batch_size + + ctx.qkv_linear_weight = qkv_linear_weight + ctx.qkv_linear_bias = qkv_linear_bias + + return output + + @staticmethod + def backward(ctx, grad_output): + + num_chunks = ctx.num_chunks + device = ctx.device + dtype = ctx.dtype + spg = ctx.spg + scatter_idx = ctx.scatter_idx + gather_idx = ctx.gather_idx + softmax_scale = ctx.softmax_scale + dropout_p = ctx.dropout_p + window_size = ctx.window_size + alibi_slopes = ctx.alibi_slopes + + projection_size = ctx.projection_size + kv_projection_size = ctx.kv_projection_size + + layernorm_output = ctx.saved_tensors[0] + + global_q = ctx.global_q + global_k = ctx.global_k + global_v = ctx.global_v + attn_output = ctx.attn_output + lse = ctx.attn_lse + + qkv_linear_weight = ctx.qkv_linear_weight + qkv_linear_bias = ctx.qkv_linear_bias + + input_chunk_size = layernorm_output.shape[0] // num_chunks + grad_layernorm_output = [ + torch.zeros((input_chunk_size, layernorm_output.shape[1], layernorm_output.shape[2]), + device=device, + dtype=dtype) for _ in range(num_chunks) + ] + + grad_global_attn_output = [] + chunk_size = grad_output.shape[1] // num_chunks + + for i in range(num_chunks): + st = chunk_size * i + ed = st + chunk_size + grad_global_attn_output.append( 
+ single_all_to_all(grad_output[:, st:ed].contiguous(), scatter_idx, gather_idx, 0, spg)) + + del grad_output + + dq = [torch.zeros(global_q[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)] + dk = [torch.zeros(global_q[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)] + dv = [torch.zeros(global_q[0].shape, dtype=torch.float, device=device) for _ in range(num_chunks)] + + grad_qkv_linear_weight = torch.zeros(qkv_linear_weight.shape, + device=qkv_linear_weight.device, + dtype=torch.float) + grad_qkv_linear_bias = torch.zeros(qkv_linear_bias.shape, device=qkv_linear_weight.device, dtype=torch.float) + + for i in range(num_chunks): + k_chunk = global_k[i] + v_chunk = global_v[i] + + for q_i in range(num_chunks): + no_computation = q_i < i + if no_computation: + continue + + causal_chunk = q_i == i + + q_chunk = global_q[q_i] + attn_output_chunk = attn_output[q_i] + lse_chunk = lse[q_i] + d_out = grad_global_attn_output[q_i] + + dq_this = torch.zeros(global_q[0].shape, dtype=dtype, device=device) + dk_this = torch.zeros(global_k[0].shape, dtype=dtype, device=device) + dv_this = torch.zeros(global_v[0].shape, dtype=dtype, device=device) + + if flash_attn_version >= version.parse("2.6.0"): + _flash_attn_backward(d_out, + q_chunk, + k_chunk, + v_chunk, + attn_output_chunk, + lse_chunk, + dq_this, + dk_this, + dv_this, + dropout_p, + softmax_scale, + causal_chunk, + window_size, + softcap=0.0, + alibi_slopes=alibi_slopes, + deterministic=False, + rng_state=None) + else: + _flash_attn_backward(d_out, + q_chunk, + k_chunk, + v_chunk, + attn_output_chunk, + lse_chunk, + dq_this, + dk_this, + dv_this, + dropout_p, + softmax_scale, + causal_chunk, + window_size, + alibi_slopes=alibi_slopes, + deterministic=False, + rng_state=None) + + dq[q_i].add_(dq_this.to(torch.float)) + dk[i].add_(dk_this.to(torch.float)) + dv[i].add_(dv_this.to(torch.float)) + + dk_seq_len = dk[i].shape[1] + + if ctx.pos_emb_cos is not None: + dk[i] = apply_rotary_pos_emb_backward(dk[i].to(dtype), + ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)], + ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)]) + else: + dk[i] = dk[i].to(dtype) + dv[i] = dv[i].to(dtype) + dk[i] = single_all_to_all(dk[i].contiguous(), gather_idx, scatter_idx, 0, spg) + dv[i] = single_all_to_all(dv[i].contiguous(), gather_idx, scatter_idx, 0, spg) + + input_st = i * input_chunk_size + input_ed = input_st + input_chunk_size + + input_chunk = layernorm_output[input_st:input_ed].reshape(-1, layernorm_output.shape[-1]) + + dk[i] = dk[i].flatten(2).permute(1, 0, 2) + dv[i] = dv[i].flatten(2).permute(1, 0, 2) + l, b = dk[i].shape[0], dk[i].shape[1] + grad_qkv_linear_weight[projection_size:projection_size + kv_projection_size].add_( + torch.matmul(dk[i].reshape(l * b, -1).t(), input_chunk)) + grad_qkv_linear_weight[projection_size + kv_projection_size:].add_( + torch.matmul(dv[i].reshape(l * b, -1).t(), input_chunk)) + grad_qkv_linear_bias[projection_size:projection_size + kv_projection_size].add_(dk[i].sum(0).sum(0)) + grad_qkv_linear_bias[projection_size + kv_projection_size:].add_(dv[i].sum(0).sum(0)) + + grad_layernorm_output[i].add_( + torch.matmul(dk[i], qkv_linear_weight[projection_size:projection_size + kv_projection_size])) + grad_layernorm_output[i].add_(torch.matmul(dv[i], + qkv_linear_weight[projection_size + kv_projection_size:])) + + dk[i] = None + dv[i] = None + + for i in range(num_chunks): + dq_seq_len = dq[i].shape[1] + if ctx.pos_emb_cos is not None: + dq[i] = 
apply_rotary_pos_emb_backward(dq[i].to(dtype), + ctx.pos_emb_cos[:, dq_seq_len * i:dq_seq_len * (i + 1)], + ctx.pos_emb_sin[:, dq_seq_len * i:dq_seq_len * (i + 1)]) + else: + dq[i] = dq[i].to(dtype) + dq[i] = single_all_to_all(dq[i].to(dtype).contiguous(), gather_idx, scatter_idx, 0, spg) + + input_chunk = layernorm_output[:input_chunk_size].reshape(-1, layernorm_output.shape[-1]) + layernorm_output = layernorm_output[input_chunk_size:] + + dq[i] = dq[i].flatten(2).permute(1, 0, 2) + l, b = dq[i].shape[0], dq[i].shape[1] + grad_qkv_linear_weight[:projection_size].add_(torch.matmul(dq[i].reshape(l * b, -1).t(), input_chunk)) + grad_qkv_linear_bias[:projection_size].add_(dq[i].sum(0).sum(0)) + + grad_layernorm_output[i].add_(torch.matmul(dq[i], qkv_linear_weight[:projection_size])) + + dq[i] = None + + return torch.cat( + grad_layernorm_output, + dim=0).to(dtype), None, None, None, None, None, None, None, None, None, None, grad_qkv_linear_weight.to( + dtype), grad_qkv_linear_bias.to(dtype), None, None, None + + +class SequenceChunk: + + def __init__(self, chunk: torch.Tensor, device=None, is_in_use=False): + + self.chunk_shape = chunk.shape + self.chunk_dtype = chunk.dtype + self.device = chunk.device if device is None else device + + cpu_chunk = torch.empty(chunk.shape, dtype=chunk.dtype, device='cpu', pin_memory=True) + + if get_accelerator().on_accelerator(chunk): + cpu_chunk.copy_(chunk, non_blocking=True) + else: + cpu_chunk = chunk + + self.cpu_chunk = cpu_chunk + + self.gpu_chunk = chunk if is_in_use else None + + def load_to_gpu(self): + assert self.gpu_chunk is None + if self.gpu_chunk is not None: + pass + else: + gpu_chunk = torch.empty(self.chunk_shape, device=self.device, dtype=self.chunk_dtype) + gpu_chunk.copy_(self.cpu_chunk, non_blocking=True) + self.gpu_chunk = gpu_chunk + + def get_gpu_chunk(self): + assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device + return self.gpu_chunk + + def check_gpu_chunk(self, ): + assert (self.gpu_chunk is not None) and ( + self.gpu_chunk.device == self.device + ), f"gpu_chunk {self.gpu_chunk is not None} shound be on {self.device}, but it is now on {self.gpu_chunk.device}" + return True + + def offload(self): + assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device + del self.gpu_chunk + self.gpu_chunk = None + + def overwrite_to_cpu(self): + assert self.gpu_chunk is not None and self.gpu_chunk.device == self.device + self.cpu_chunk.copy_(self.gpu_chunk, non_blocking=True) + + +class _FPDTGPUOffloadingAttentionImpl_(torch.autograd.Function): + generate_vmap_rule = False + + @staticmethod + def forward(ctx: Any, + layernorm_output, + attention_mask, + inference_params, + rotary_pos_emb, + spg, + scatter_idx, + gather_idx, + hidden_size, + projection_size, + hidden_size_per_attention_head, + kv_projection_size, + qkv_linear_weight, + qkv_linear_bias, + dropout, + num_chunks=8, + cpu_offloading=True): + + do_save = layernorm_output.requires_grad + + if rotary_pos_emb is not None: + pos_emb_cos, pos_emb_sin = rotary_pos_emb[0].permute(1, 0, 2, 3), rotary_pos_emb[1].permute(1, 0, 2, 3) + ctx.pos_emb_cos = pos_emb_cos + ctx.pos_emb_sin = pos_emb_sin + else: + ctx.pos_emb_cos = None + ctx.pos_emb_sin = None + with torch.no_grad(): + per_gpu_seq_len = layernorm_output.shape[0] + chunk_size = per_gpu_seq_len // num_chunks + assert chunk_size * num_chunks == per_gpu_seq_len + assert attention_mask is None + ctx.num_chunks = num_chunks + ctx.cpu_offloading = cpu_offloading + ctx.spg = spg + ctx.scatter_idx = 
scatter_idx + ctx.gather_idx = gather_idx + + ctx.chunk_size = chunk_size + device = get_accelerator().current_device_name() + ctx.device = device + ctx.dtype = layernorm_output.dtype + ctx.projection_size = projection_size + ctx.kv_projection_size = kv_projection_size + + global_q = [] + global_k = [] + global_v = [] + + ctx.softmax_scale = hidden_size_per_attention_head**(-0.5) + + ctx.dropout_p = dropout + ctx.window_size = (-1, -1) + ctx.alibi_slopes = None + + batch_size = layernorm_output.shape[1] + + global_o = [] + global_lse = [] + + layernorm_output_cpu = [] + final_output = [] + + offload_stream = get_accelerator().Stream() + general_offload_stream = get_accelerator().Stream() + compute_stream = get_accelerator().default_stream() + + q_compute_chunk_idx = 0 + kv_compute_chunk_idx = 0 + for i in range(num_chunks): + + qkv_chunk = torch.matmul(layernorm_output[:chunk_size], + qkv_linear_weight.t()) + qkv_linear_bias # torch.Size([18126, 1, 12288]) + + with get_accelerator().stream(general_offload_stream): + layernorm_output_cpu.append(SequenceChunk(layernorm_output[:chunk_size])) + + layernorm_output = layernorm_output[chunk_size:] + + q_chunk = qkv_chunk[:, :, :projection_size].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + q_chunk = single_all_to_all(q_chunk, scatter_idx, gather_idx, 0, spg) + global_q_chunk_len = q_chunk.shape[1] + + k_chunk = qkv_chunk[:, :, projection_size:projection_size + kv_projection_size].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + k_chunk = single_all_to_all(k_chunk, scatter_idx, gather_idx, 0, spg) + + v_chunk = qkv_chunk[:, :, projection_size + kv_projection_size:].contiguous().reshape( + qkv_chunk.shape[0], qkv_chunk.shape[1], -1, + hidden_size_per_attention_head).permute(1, 0, 2, 3).contiguous() # b, l, nh, hd + v_chunk = single_all_to_all(v_chunk, scatter_idx, gather_idx, 0, spg) + + dist.barrier() + + if ctx.pos_emb_cos is not None: + pos_emb_cos_chunk = pos_emb_cos[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)] + pos_emb_sin_chunk = pos_emb_sin[:, global_q_chunk_len * i:global_q_chunk_len * (i + 1)] + + q_chunk = apply_rotary_pos_emb(q_chunk, pos_emb_cos_chunk, pos_emb_sin_chunk) + k_chunk = apply_rotary_pos_emb(k_chunk, pos_emb_cos_chunk, pos_emb_sin_chunk) + + compute_stream.wait_stream(offload_stream) + compute_stream.synchronize() + with get_accelerator().stream(offload_stream): + global_q.append(SequenceChunk(q_chunk, is_in_use=True)) + global_k.append(SequenceChunk(k_chunk, is_in_use=True)) + global_v.append(SequenceChunk(v_chunk, is_in_use=True)) + + del qkv_chunk + + cur_attn_output = None + cur_attn_lse = None + for k_i in range(len(global_k)): + causal_chunk = i == k_i + with get_accelerator().stream(compute_stream): + if flash_attn_version >= version.parse("2.6.0"): + block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward( + global_q[q_compute_chunk_idx].get_gpu_chunk(), + global_k[kv_compute_chunk_idx].get_gpu_chunk(), + global_v[kv_compute_chunk_idx].get_gpu_chunk(), + ctx.dropout_p, + ctx.softmax_scale, + causal=causal_chunk, + window_size=ctx.window_size, + softcap=0.0, + alibi_slopes=ctx.alibi_slopes, + return_softmax=False) + else: + block_out, _, _, _, _, block_lse, _, _ = _flash_attn_forward( + global_q[q_compute_chunk_idx].get_gpu_chunk(), + global_k[kv_compute_chunk_idx].get_gpu_chunk(), + 
global_v[kv_compute_chunk_idx].get_gpu_chunk(), + ctx.dropout_p, + ctx.softmax_scale, + causal=causal_chunk, + window_size=ctx.window_size, + alibi_slopes=ctx.alibi_slopes, + return_softmax=False) + cur_attn_output, cur_attn_lse = update_out_and_lse(cur_attn_output, cur_attn_lse, block_out, + block_lse) + + can_offload_kv = True + if k_i != (len(global_k) - 1) or i != (num_chunks - 1): + if k_i != (len(global_k) - 1): + next_kv_compute_chunk_idx = k_i + 1 + else: + next_kv_compute_chunk_idx = 0 + + if next_kv_compute_chunk_idx == kv_compute_chunk_idx: + can_offload_kv = False + else: + if next_kv_compute_chunk_idx != (len(global_k) - 1): + with get_accelerator().stream(offload_stream): + global_k[next_kv_compute_chunk_idx].load_to_gpu() + global_v[next_kv_compute_chunk_idx].load_to_gpu() + + if i == num_chunks - 1 and k_i == num_chunks - 1: + with get_accelerator().stream(offload_stream): + global_q[0].load_to_gpu() + global_k[0].load_to_gpu() + global_v[0].load_to_gpu() + global_o[0].load_to_gpu() + global_lse[0].load_to_gpu() + + compute_stream.wait_stream(offload_stream) + compute_stream.synchronize() + + if can_offload_kv: + global_k[kv_compute_chunk_idx].offload() + global_v[kv_compute_chunk_idx].offload() + kv_compute_chunk_idx = next_kv_compute_chunk_idx + + global_q[q_compute_chunk_idx].offload() + q_compute_chunk_idx += 1 + + all2all_output = single_all_to_all( + cur_attn_output.to(ctx.dtype).contiguous(), gather_idx, scatter_idx, 0, spg) + final_output.append(all2all_output) + with get_accelerator().stream(general_offload_stream): + global_o.append(SequenceChunk(cur_attn_output.to(ctx.dtype))) + global_lse.append(SequenceChunk(cur_attn_lse[:, :, :, 0].permute(0, 2, 1).contiguous())) + + compute_stream.wait_stream(general_offload_stream) + compute_stream.synchronize() + + final_output = torch.cat(final_output, dim=1) + + head_dim = final_output.shape[-1] + + if do_save: + ctx.layernorm_output = layernorm_output_cpu + ctx.global_q = global_q + ctx.global_k = global_k + ctx.global_v = global_v + ctx.attn_output = global_o + ctx.attn_lse = global_lse + ctx.head_dim = head_dim + ctx.batch_size = batch_size + + ctx.qkv_linear_weight = qkv_linear_weight + ctx.qkv_linear_bias = qkv_linear_bias + + return final_output + + @staticmethod + def backward(ctx, grad_output): + num_chunks = ctx.num_chunks + device = grad_output.device + dtype = ctx.dtype + spg = ctx.spg + scatter_idx = ctx.scatter_idx + gather_idx = ctx.gather_idx + softmax_scale = ctx.softmax_scale + dropout_p = ctx.dropout_p + window_size = ctx.window_size + alibi_slopes = ctx.alibi_slopes + + projection_size = ctx.projection_size + kv_projection_size = ctx.kv_projection_size + + layernorm_output = ctx.layernorm_output + + global_q = ctx.global_q + global_k = ctx.global_k + global_v = ctx.global_v + attn_output = ctx.attn_output + lse = ctx.attn_lse + + qkv_linear_weight = ctx.qkv_linear_weight + qkv_linear_bias = ctx.qkv_linear_bias + + offload_stream = get_accelerator().Stream() + general_offload_stream = get_accelerator().Stream() + compute_stream = get_accelerator().default_stream() + + chunk_size = grad_output.shape[1] // num_chunks + assert chunk_size == layernorm_output[0].cpu_chunk.shape[0] + + grad_layernorm_output = [ + torch.zeros(layernorm_output[0].chunk_shape, device=device, dtype=dtype) for _ in range(num_chunks) + ] + + grad_global_attn_output = [None for _ in range(num_chunks)] + + q_compute_chunk_idx = 0 + kv_compute_chunk_idx = 0 + last_q_accum_idx = 0 + + with 
get_accelerator().stream(general_offload_stream): + layernorm_output[0].load_to_gpu() + grad_qkv_linear_weight = torch.zeros(qkv_linear_weight.shape, + device=qkv_linear_weight.device, + dtype=torch.float) + grad_qkv_linear_bias = torch.zeros(qkv_linear_bias.shape, + device=qkv_linear_weight.device, + dtype=torch.float) + + grad_global_attn_output_chunk = single_all_to_all(grad_output[:, :chunk_size].contiguous(), scatter_idx, + gather_idx, 0, spg) + get_accelerator().synchronize() + grad_output = grad_output[:, chunk_size:] + + with get_accelerator().stream(offload_stream): + grad_global_attn_output[0] = SequenceChunk(grad_global_attn_output_chunk, is_in_use=True) + dq = [ + SequenceChunk(torch.zeros(global_q[0].chunk_shape, dtype=torch.float, device=device), is_in_use=True) + ] + [ + SequenceChunk(torch.zeros(global_q[0].chunk_shape, dtype=torch.float, device='cpu', pin_memory=True), + device) for _ in range(num_chunks - 1) + ] + dk_accum = torch.zeros(global_k[0].chunk_shape, dtype=torch.float, device=device) + dv_accum = torch.zeros(global_v[0].chunk_shape, dtype=torch.float, device=device) + + for i in range(num_chunks): + for q_i in range(num_chunks): + no_computation = q_i < i + if no_computation: + continue + + causal_chunk = q_i == i + + dq_this = torch.zeros(global_q[0].chunk_shape, dtype=dtype, device=device) + dk_this = torch.zeros(global_k[0].chunk_shape, dtype=dtype, device=device) + dv_this = torch.zeros(global_v[0].chunk_shape, dtype=dtype, device=device) + + with get_accelerator().stream(compute_stream): + if flash_attn_version >= version.parse("2.6.0"): + _flash_attn_backward(grad_global_attn_output[q_compute_chunk_idx].get_gpu_chunk(), + global_q[q_compute_chunk_idx].get_gpu_chunk(), + global_k[kv_compute_chunk_idx].get_gpu_chunk(), + global_v[kv_compute_chunk_idx].get_gpu_chunk(), + attn_output[q_compute_chunk_idx].get_gpu_chunk(), + lse[q_compute_chunk_idx].get_gpu_chunk(), + dq_this, + dk_this, + dv_this, + dropout_p, + softmax_scale, + causal_chunk, + window_size, + softcap=0.0, + alibi_slopes=alibi_slopes, + deterministic=False, + rng_state=None) + else: + _flash_attn_backward(grad_global_attn_output[q_compute_chunk_idx].get_gpu_chunk(), + global_q[q_compute_chunk_idx].get_gpu_chunk(), + global_k[kv_compute_chunk_idx].get_gpu_chunk(), + global_v[kv_compute_chunk_idx].get_gpu_chunk(), + attn_output[q_compute_chunk_idx].get_gpu_chunk(), + lse[q_compute_chunk_idx].get_gpu_chunk(), + dq_this, + dk_this, + dv_this, + dropout_p, + softmax_scale, + causal_chunk, + window_size, + alibi_slopes=alibi_slopes, + deterministic=False, + rng_state=None) + + if i != (len(global_k) - 1): + if q_i != (len(global_q) - 1): + next_q_compute_chunk_idx = q_i + 1 + else: + next_q_compute_chunk_idx = i + 1 + + can_offload_q = True + + if next_q_compute_chunk_idx == q_compute_chunk_idx: + can_offload_q = False + else: + with get_accelerator().stream(offload_stream): + if i > 0 or q_i > 0: + if can_offload_q and last_q_accum_idx != i: # the first q chunk calculate in the loop will be sent out, therefore we do not offload it + dq[last_q_accum_idx].offload() + dq[next_q_compute_chunk_idx].load_to_gpu() + global_q[next_q_compute_chunk_idx].load_to_gpu() + attn_output[next_q_compute_chunk_idx].load_to_gpu() + lse[next_q_compute_chunk_idx].load_to_gpu() + if grad_global_attn_output[next_q_compute_chunk_idx] is not None: + grad_global_attn_output[next_q_compute_chunk_idx].load_to_gpu() + + if grad_global_attn_output[next_q_compute_chunk_idx] is None: + grad_global_attn_output_chunk = 
single_all_to_all(grad_output[:, :chunk_size].contiguous(), + scatter_idx, gather_idx, 0, spg) + dist.barrier() + grad_output = grad_output[:, chunk_size:] + grad_global_attn_output[next_q_compute_chunk_idx] = SequenceChunk( + grad_global_attn_output_chunk, is_in_use=True) + + compute_stream.wait_stream(offload_stream) + compute_stream.synchronize() + + with get_accelerator().stream(compute_stream): + dq[q_compute_chunk_idx].check_gpu_chunk() + dq[q_compute_chunk_idx].gpu_chunk.add_(dq_this) + dk_accum.add_(dk_this) + dv_accum.add_(dv_this) + + offload_stream.wait_stream(compute_stream) + with get_accelerator().stream(offload_stream): + dq[q_compute_chunk_idx].overwrite_to_cpu() + + if can_offload_q: + global_q[q_compute_chunk_idx].offload() + attn_output[q_compute_chunk_idx].offload() + lse[q_compute_chunk_idx].offload() + grad_global_attn_output[q_compute_chunk_idx].offload() + + last_q_accum_idx = q_compute_chunk_idx + q_compute_chunk_idx = next_q_compute_chunk_idx + + compute_stream.wait_stream(offload_stream) + compute_stream.synchronize() + + dk_seq_len = dk_accum.shape[1] + + if ctx.pos_emb_cos is not None: + dq_accum = apply_rotary_pos_emb_backward(dq[kv_compute_chunk_idx].get_gpu_chunk().to(dtype), + ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)], + ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)]) + dk_accum = apply_rotary_pos_emb_backward(dk_accum.to(dtype), + ctx.pos_emb_cos[:, dk_seq_len * i:dk_seq_len * (i + 1)], + ctx.pos_emb_sin[:, dk_seq_len * i:dk_seq_len * (i + 1)]) + else: + dq_accum = dq[kv_compute_chunk_idx].get_gpu_chunk().to(dtype) + dk_accum = dk_accum.to(dtype) + dv_accum = dv_accum.to(dtype) + + dq_accum = single_all_to_all(dq_accum.contiguous(), gather_idx, scatter_idx, 0, spg) + dk_accum = single_all_to_all(dk_accum.contiguous(), gather_idx, scatter_idx, 0, spg) + dv_accum = single_all_to_all(dv_accum.contiguous(), gather_idx, scatter_idx, 0, spg) + + general_offload_stream.synchronize() + compute_stream.wait_stream(general_offload_stream) + dist.barrier() + + with get_accelerator().stream(compute_stream): + input_chunk = layernorm_output[i].get_gpu_chunk().reshape(-1, layernorm_output[i].chunk_shape[-1]) + + dq_accum = dq_accum.flatten(2).permute(1, 0, 2) + dk_accum = dk_accum.flatten(2).permute(1, 0, 2) + dv_accum = dv_accum.flatten(2).permute(1, 0, 2) + + l, b = dk_accum.shape[0], dk_accum.shape[1] + + grad_qkv_linear_weight[:projection_size].add_( + torch.matmul(dq_accum.reshape(l * b, -1).t(), input_chunk)) + grad_qkv_linear_weight[projection_size:projection_size + kv_projection_size].add_( + torch.matmul(dk_accum.reshape(l * b, -1).t(), input_chunk)) + grad_qkv_linear_weight[projection_size + kv_projection_size:].add_( + torch.matmul(dv_accum.reshape(l * b, -1).t(), input_chunk)) + + grad_qkv_linear_bias[:projection_size].add_(dq_accum.sum(0).sum(0)) + grad_qkv_linear_bias[projection_size:projection_size + kv_projection_size].add_(dk_accum.sum(0).sum(0)) + grad_qkv_linear_bias[projection_size + kv_projection_size:].add_(dv_accum.sum(0).sum(0)) + + grad_layernorm_output[i].add_(torch.matmul(dq_accum, qkv_linear_weight[:projection_size])) + grad_layernorm_output[i].add_( + torch.matmul(dk_accum, qkv_linear_weight[projection_size:projection_size + kv_projection_size])) + grad_layernorm_output[i].add_( + torch.matmul(dv_accum, qkv_linear_weight[projection_size + kv_projection_size:])) + + del dq_accum, dk_accum, dv_accum + dk_accum = torch.zeros(global_k[i].chunk_shape, dtype=torch.float, device=device) + dv_accum = 
torch.zeros(global_v[i].chunk_shape, dtype=torch.float, device=device) + dq[kv_compute_chunk_idx].offload() + dq[kv_compute_chunk_idx] = None + + if i != (len(global_k) - 1): + next_kv_compute_chunk_idx = kv_compute_chunk_idx + 1 + with get_accelerator().stream(offload_stream): + global_k[next_kv_compute_chunk_idx].load_to_gpu() + global_v[next_kv_compute_chunk_idx].load_to_gpu() + + with get_accelerator().stream(general_offload_stream): + layernorm_output[next_kv_compute_chunk_idx].load_to_gpu() + + compute_stream.wait_stream(offload_stream) + compute_stream.synchronize() + + layernorm_output[kv_compute_chunk_idx].offload() + global_k[kv_compute_chunk_idx].offload() + global_v[kv_compute_chunk_idx].offload() + kv_compute_chunk_idx = next_kv_compute_chunk_idx + + return torch.cat( + grad_layernorm_output, + dim=0).to(dtype), None, None, None, None, None, None, None, None, None, None, grad_qkv_linear_weight.to( + dtype), grad_qkv_linear_bias.to(dtype), None, None, None + + +class FPDT_Attention(torch.nn.Module): + + def __init__(self, + config, + first_weight, + first_bias, + second_weight, + second_bias, + sequence_process_group, + gather_idx: int = 0, + scatter_idx: int = 2, + return_bias=True, + chunk_size=65536, + enable_offloading=True) -> None: + + super(FPDT_Attention, self).__init__() + if _flash_attn_forward is None or _flash_attn_backward is None: + raise ImportError( + "DeepSpeed FPDT requires flash-attn 2.6.3. Please install it with `pip install flash-attn --no-build-isolation`." + ) + + self.spg = sequence_process_group + self.scatter_idx = scatter_idx + self.gather_idx = gather_idx + self.config = config + + self.projection_size = config.kv_channels * config.num_attention_heads + self.hidden_size_per_attention_head = self.projection_size // config.num_attention_heads + self.kv_projection_size = config.kv_channels * config.num_key_value_heads + self.hidden_size = config.hidden_size + + self.qkv_linear_weight = first_weight + self.qkv_linear_bias = first_bias + self.qkv_dense_weight = second_weight + self.qkv_dense_bias = second_bias + + self.reture_bias = return_bias + self.dropout = config.attention_dropout + + self.chunk_size = chunk_size + self.double_buffer = enable_offloading + + def forward(self, + layernorm_output, + attention_mask, + inference_params, + rotary_pos_emb, + cpu_offloading=True) -> Tensor: + self.num_chunks_attn = layernorm_output.shape[0] * dist.get_world_size(self.spg) // self.chunk_size + + if not cpu_offloading or self.num_chunks_attn == 1: + output = _FPDTGPUAttentionImpl_.apply(layernorm_output, attention_mask, inference_params, rotary_pos_emb, + self.spg, self.scatter_idx, self.gather_idx, self.hidden_size, + self.projection_size, self.hidden_size_per_attention_head, + self.kv_projection_size, self.qkv_linear_weight, + self.qkv_linear_bias, self.dropout, self.num_chunks_attn, + cpu_offloading) + else: + output = _FPDTGPUOffloadingAttentionImpl_.apply( + layernorm_output, attention_mask, inference_params, rotary_pos_emb, self.spg, self.scatter_idx, + self.gather_idx, self.hidden_size, self.projection_size, self.hidden_size_per_attention_head, + self.kv_projection_size, self.qkv_linear_weight, self.qkv_linear_bias, self.dropout, + self.num_chunks_attn, cpu_offloading) + + output = output.flatten(2).permute(1, 0, 2).contiguous() + + output = torch.matmul(output, self.qkv_dense_weight.t()) + if not self.reture_bias: + output += self.qkv_dense_bias + return output, self.qkv_dense_bias if self.reture_bias else None + + +@torch.jit.script +def bias_gelu(x): 
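+    """Tanh approximation of GELU applied to a pre-biased input.
+
+    The caller folds the bias into ``x`` before calling (e.g. ``x = matmul(inp, w1.t()) + b1``
+    in FPDT_FFN), so this evaluates 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))),
+    where the constant 0.79788456 approximates sqrt(2/pi).
+    """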
+ return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +@torch.jit.script +def bias_gelu_back(g, x): + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +class FPDT_FFN(torch.autograd.Function): + generate_vmap_rule = False + + @staticmethod + def forward(ctx: Any, x, w1, b1, w2, b2, add_bias, chunk_size): + do_save = x.requires_grad + ctx.add_bias = add_bias + device = x.device + + with torch.no_grad(): + num_chunk = x.shape[0] // chunk_size + ctx.num_chunk = num_chunk + result = torch.empty(x.shape, device=device, dtype=x.dtype) + assert chunk_size * num_chunk == x.shape[0] + for i in range(num_chunk): + st = i * chunk_size + ed = st + chunk_size + x_ = torch.matmul(x[st:ed], w1.t()) + b1 + x_ = bias_gelu(x_) + if add_bias: + result[st:ed] = torch.matmul(x_, w2.t()) + b2 + else: + result[st:ed] = torch.matmul(x_, w2.t()) + + del x_ + + if do_save: + ctx.device = device + ctx.dtype = x.dtype + ctx.save_for_backward(x, w1, b1, w2, b2) + ctx.grad_x_shape = x.shape + return result.to(x.dtype), b2 if not add_bias else None + + @staticmethod + def backward(ctx, grad_output, grad_bias): + x, w1, b1, w2, b2 = ctx.saved_tensors + device = ctx.device + dtype = ctx.dtype + add_bias = ctx.add_bias + + num_chunk = ctx.num_chunk + chunk_size = x.shape[0] // num_chunk + assert chunk_size * num_chunk == grad_output.shape[0] + + grad_w2 = torch.zeros(w2.shape, device=device, dtype=torch.float) + grad_b2 = torch.zeros(b2.shape, device=device, dtype=torch.float) + grad_w1 = torch.zeros(w1.shape, device=device, dtype=torch.float) + grad_b1 = torch.zeros(b1.shape, device=device, dtype=torch.float) + + for i in range(num_chunk): + st = i * chunk_size + ed = st + chunk_size + x_chunk = x[st:ed] + + before_act = (torch.matmul(x_chunk, w1.t()) + b1) + before_act_2 = before_act**2 + tanh_out = torch.tanh(0.79788456 * before_act * (1 + 0.044715 * before_act_2)) + ff = 0.5 * before_act * ((1 - tanh_out * tanh_out) * + (0.79788456 + 0.1070322243 * before_act_2)) + 0.5 * (1 + tanh_out) + grad_w2.add_( + torch.matmul(grad_output[st:ed].reshape(-1, grad_output.shape[2]).t(), + (before_act * 0.5 * (1 + tanh_out)).reshape(-1, before_act.shape[2]))) + del before_act, before_act_2, tanh_out + + grad_inter = torch.matmul(grad_output[st:ed], w2) * ff + del ff + + grad_w1.add_(torch.matmul( + grad_inter.reshape(-1, grad_inter.shape[2]).t(), x_chunk.reshape(-1, x.shape[2]))) + grad_b1.add_(grad_inter.sum(0).sum(0)) + + x[st:ed].copy_(torch.matmul(grad_inter, w1)) + + del grad_inter + + if add_bias: + grad_b2.add_(grad_output[st:ed].sum(0).sum(0)) + + return x, grad_w1.to(dtype), grad_b1.to(dtype), grad_w2.to(dtype), grad_b2.to(dtype), None, None + + +class FPDT_LogitsLoss(torch.autograd.Function): + generate_vmap_rule = False + + @staticmethod + def forward(ctx: Any, lm_output, labels, logit_weights, rank, spg_size, spg, num_chunk): + labels = labels.t() + chunk_size = lm_output.shape[0] // num_chunk + assert chunk_size * num_chunk == lm_output.shape[0] + batch_size, local_seq_len = lm_output.shape[1], lm_output.shape[0] + loss = torch.empty((batch_size, local_seq_len), dtype=torch.float, device=lm_output.device) + + ctx.num_chunk = num_chunk + ctx.chunk_size = chunk_size + ctx.device = lm_output.device + ctx.dtype = lm_output.dtype + + ctx.rank = rank + ctx.local_seq_len = local_seq_len + with torch.no_grad(): + for 
i in range(num_chunk): + st = i * chunk_size + ed = st + chunk_size + logits_chunk = torch.matmul(lm_output[st:ed], logit_weights.t()).float() + + vocab_size = logits_chunk.size(2) + # nll + softmax = torch.nn.functional.softmax(logits_chunk, dim=-1) + loss_chunk = torch.nn.functional.nll_loss(softmax.log().reshape(-1, vocab_size).contiguous(), + labels[st:ed, :].reshape(-1).contiguous(), + reduction='none') + loss[:, st:ed] = loss_chunk.reshape(chunk_size, batch_size).t() + + del logits_chunk + ctx.save_for_backward(lm_output.to('cpu'), labels) + ctx.logit_weights = logit_weights + + seqlen = local_seq_len * spg_size + batch_size = loss.size(0) + loss = loss.t().contiguous() + loss_all = torch.empty(seqlen, batch_size, dtype=loss.dtype, device=loss.device).contiguous() + + dist.allgather_fn(loss_all, loss, group=spg) + + return loss_all + + @staticmethod + def backward(ctx, grad_output): + lm_output, labels = ctx.saved_tensors + logit_weights = ctx.logit_weights + device = ctx.device + dtype = ctx.dtype + num_chunk = ctx.num_chunk + chunk_size = ctx.chunk_size + + rank = ctx.rank + local_seq_len = ctx.local_seq_len + + grad_output = grad_output[rank * local_seq_len:(rank + 1) * local_seq_len] + grad_lm_output = [None for _ in range(num_chunk)] + grad_logit_weights = torch.zeros(logit_weights.shape, device=grad_output.device, dtype=torch.float) + for i in range(num_chunk): + st = i * chunk_size + ed = st + chunk_size + lm_output_chunk = lm_output[st:ed].to(device) + logits_chunk = torch.matmul(lm_output_chunk, logit_weights.t()).float() + + # nll + softmax = torch.nn.functional.softmax(logits_chunk, dim=-1) + vocab_size = logits_chunk.size(2) + + grad_input = softmax + grad_2d = grad_input.reshape(-1, vocab_size).contiguous() + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=device) + + grad_2d[arange_1d, labels[st:ed, :].reshape(-1).contiguous()] -= 1 + grad_input.mul_(grad_output[:chunk_size, :].unsqueeze(dim=-1)) + grad_input = grad_input.to(dtype) + + grad_output = grad_output[chunk_size:].contiguous() + + grad_lm_output_chunk = torch.matmul(grad_input, logit_weights) + grad_lm_output[i] = grad_lm_output_chunk + + grad_logit_weights.add_( + torch.matmul( + grad_input.reshape(-1, grad_input.shape[2]).t(), + lm_output_chunk.reshape(-1, lm_output_chunk.shape[2]))) + + return torch.cat(grad_lm_output, dim=0).to(dtype), None, grad_logit_weights.to(dtype), None, None, None, None diff --git a/deepspeed/sequence/layer.py b/deepspeed/sequence/layer.py index e1dbff87f4ec..ba5e596e0d6d 100644 --- a/deepspeed/sequence/layer.py +++ b/deepspeed/sequence/layer.py @@ -2,59 +2,310 @@ # SPDX-License-Identifier: Apache-2.0 # DeepSpeed Team - import torch from typing import Any, Tuple from torch import Tensor from torch.nn import Module +from einops import rearrange + import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.module_inject.tp_shard import get_shard_size_list, set_num_kv_heads, get_num_kv_heads +from deepspeed.utils import groups + + +def _generate_layout_params(scatter_idx, batch_dim_idx, seq_world_size, input): + """ + This function generates the parameters required for `permute` and `reshape` operations, + which are used to process data before and after `all2all` communication. 
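+
+    A minimal shape illustration (assuming seq_world_size=2, scatter_idx=1, batch_dim_idx=0):
+
+        input shape:            [bs=2, global_seq_len=8, num_local_head=4, head_dim=16]
+        pre_all2all_inp_shape:  [2, 2, 4, 4, 16]   (sequence split into seq_world_size groups)
+        post_all2all_res_shape: [2, 4, 8, 16]      (sequence scattered, heads gathered)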
+ """ + if batch_dim_idx == 0: + if scatter_idx < 2: + bs, global_seq_len, num_local_head, head_dim = input.shape + pre_all2all_inp_shape = [bs, seq_world_size, global_seq_len // seq_world_size, num_local_head, head_dim] + pre_all2all_permute_idx = (1, 0, 2, 3, 4) + + post_all2all_permute_idx = (1, 2, 0, 3, 4) + post_all2all_res_shape = [bs, global_seq_len // seq_world_size, seq_world_size * num_local_head, head_dim] + else: + bs, local_seq_len, num_total_head, head_dim = input.shape + assert num_total_head % seq_world_size == 0, f"Number of heads ({num_total_head}) must be divisible by the sequence parallel size ({seq_world_size})!" + pre_all2all_inp_shape = [bs, local_seq_len, seq_world_size, num_total_head // seq_world_size, head_dim] + pre_all2all_permute_idx = (2, 0, 1, 3, 4) + + post_all2all_permute_idx = (1, 0, 2, 3, 4) + post_all2all_res_shape = [bs, seq_world_size * local_seq_len, num_total_head // seq_world_size, head_dim] + else: + if scatter_idx < 2: + global_seq_len, bs, num_local_head, head_dim = input.shape + pre_all2all_inp_shape = [seq_world_size, global_seq_len // seq_world_size, bs, num_local_head, head_dim] + pre_all2all_permute_idx = None + + post_all2all_permute_idx = (1, 2, 0, 3, 4) + post_all2all_res_shape = [bs, seq_world_size * global_seq_len, num_local_head // seq_world_size, head_dim] + else: + local_seq_len, bs, num_total_head, head_dim = input.shape + assert num_total_head % seq_world_size == 0, f"Number of heads ({num_total_head}) must be divisible by the sequence parallel size ({seq_world_size})!" + pre_all2all_inp_shape = [local_seq_len, bs, seq_world_size, num_total_head // seq_world_size, head_dim] + pre_all2all_permute_idx = (2, 0, 1, 3, 4) + post_all2all_permute_idx = None + post_all2all_res_shape = [local_seq_len * seq_world_size, bs, num_total_head // seq_world_size, head_dim] + + return pre_all2all_permute_idx, pre_all2all_inp_shape, post_all2all_permute_idx, post_all2all_res_shape + + +def post_all2all(permute_idx, res_shape): + """ + Post-processing function for `all2all` communication. + """ + + def post_func(input): + if permute_idx is not None: + input = input.permute(permute_idx).contiguous() + output = input.reshape(res_shape).contiguous() + + return output + + return post_func + +def pre_all2all_fun(permute_idx, inp_shape, input): + """ + Pre-processing function for `all2all` communication. + """ + input_t = input.reshape(inp_shape).contiguous() + if permute_idx is not None: + input_t = input_t.permute(permute_idx).contiguous() + return input_t + + +def _rotate_half(x): + """ + change sign so the last dimension becomes [-odd, +even] + """ + x = rearrange(x, '... (j d) -> ... 
j d', j=2) + x1, x2 = x.unbind(dim=-2) + return torch.cat((-x2, x1), dim=-1) -def single_all_to_all(input, scatter_idx, gather_idx, group): + +def apply_rotary_pos_emb(t, freqs_cos, freqs_sin): + """ + input tensor t is of shape [seq_length, ..., dim] + rotary positional embeding tensor freqs is of shape [seq_length, ..., dim] + check https://kexue.fm/archives/8265 for detailed formulas + """ + rot_dim = freqs_cos.shape[-1] + # ideally t_pass is empty so rotary pos embedding is applied to all tensor t + t, t_pass = t[..., :rot_dim], t[..., rot_dim:] + + # first part is cosine component + # second part is sine component, need to change signs with _rotate_half method + t = (t * freqs_cos) + (_rotate_half(t) * freqs_sin) + + res = t if t_pass.shape[-1] == 0 else torch.cat((t, t_pass), dim=-1) + return res + + +def uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group): seq_world_size = dist.get_world_size(group) inp_shape = list(input.shape) - inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size - if scatter_idx < 2: - input_t = input.reshape( - [seq_world_size, inp_shape[scatter_idx]] + \ - inp_shape[scatter_idx + 1:] - ).contiguous() + assert batch_dim_idx in [0, 1], "batch_dim_idx must be either 0 or 1" + + if not (scatter_idx < 2): + input_splits = get_shard_size_list(inp_shape[scatter_idx], seq_world_size) + input = input.transpose(0, scatter_idx).contiguous() + local_heads = input_splits[groups._get_sequence_parallel_rank()] + output_splits = [local_heads] * seq_world_size + + output_buffer_shape = [seq_world_size * local_heads] + list(input.shape[1:]) + output = torch.empty(output_buffer_shape, device=input.device, dtype=input.dtype) + dist.all_to_all_single(output,input,output_split_sizes=output_splits,\ + input_split_sizes=input_splits,group=group) + ###[seq_ws*local_heads, ...] to [seq_ws, local_heads, ...] + output = output.view(seq_world_size, local_heads, *output.shape[1:]) + ###[seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...] + + ### batch_dim_idx=0 [seq_ws,local_heads,seq_len,b,...] to [b, seq_ws, seq_len, local_heads ...] + ### batch_dim_idx=1 [seq_ws,local_heads,b,seq_len,...] to [seq_ws,seq_len,b,local_heads,...] + if batch_dim_idx == 0: + order = [3, 0, 2, 1] + list(range(4, len(output.shape))) + output = output.permute(order).contiguous() + ###[b, seq_ws*local_seq_len, local_heads,...] + output = output.view(output.shape[0], inp_shape[gather_idx] * seq_world_size, + *output.shape[3:]).contiguous() + elif batch_dim_idx == 1: + output = output.transpose(1, 3).contiguous() + ###[seq_ws*local_seq_len, b, local_heads,...] + output = output.view(inp_shape[gather_idx] * seq_world_size, *output.shape[2:]).contiguous() else: - # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them! - input_t = input.reshape( - [-1, seq_world_size, inp_shape[scatter_idx]] + \ - inp_shape[scatter_idx + 1:] - ).transpose(0, 1).contiguous() + # The compatibility handling of 4D and 3D tensors, standardizing to 3D. 
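+        # e.g. a 4D (.., .., num_heads, head_dim) tensor is flattened to
+        # 3D (.., .., num_heads * head_dim) so the uneven-head split below
+        # only has to handle a single hidden dimension for either layout.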
+ input = input.reshape(input.shape[0], input.shape[1], -1) + + if batch_dim_idx == 0: #b,s,h + input = input.permute(1, 2, 0).contiguous() #s,h,b + elif batch_dim_idx == 1: #s,b,h + input = input.transpose(1, 2).contiguous() #s,h,b + seq_len, h, batch_size = input.shape + num_local_heads_list = get_shard_size_list(get_num_kv_heads(), seq_world_size) + local_heads = num_local_heads_list[groups._get_sequence_parallel_rank()] + h_dim = h // local_heads + local_seq_len = seq_len // seq_world_size + + input = input.view(seq_len * h, batch_size) + local_seq_len_with_heads = int(input.shape[0] / seq_world_size) # dim size of local_seq_len*local_heads*hdim + input_splits = [local_seq_len_with_heads] * seq_world_size + coeff = local_seq_len_with_heads // local_heads #per head: dim size of local_seq_len*hdim + + #uneven seq_world_size coeff, total_heads/local_heads. + heads_scale_coeff = get_num_kv_heads() / local_heads + + output_splits = [num_local_heads * coeff for num_local_heads in num_local_heads_list] + output_buff_d1_size = int(heads_scale_coeff * local_seq_len_with_heads) + total_h = int(inp_shape[gather_idx] * heads_scale_coeff) + output = torch.empty(output_buff_d1_size, input.shape[1], device=input.device, dtype=input.dtype) + dist.all_to_all_single(output,input,output_split_sizes=output_splits, \ + input_split_sizes=input_splits,group=group) + ################## + #suppose 7 heads divide into 4 ranks [2,2,2,1] + #chunk_num_heads_small=floor(7/4)=1 + #chunk_num_heads_large=ceil(7/4)=2 + #num_chunk_heads_large=len([2,2,2])=3, all2all_buffer_counts + #num_chunk_heads_small=len([1])=1, all2all_buffer_counts + #total_num_large_heads=sum([2,2,2])=7 + #total_num_small_heads=sum([1])=1 + + chunk_num_heads_small = get_num_kv_heads() // seq_world_size # even heads compatible + chunk_num_heads_large = chunk_num_heads_small + 1 + num_chunk_heads_large = get_num_kv_heads() % seq_world_size + num_chunk_heads_small = seq_world_size - num_chunk_heads_large + total_num_large_heads = num_chunk_heads_large * chunk_num_heads_large + total_num_small_heads = num_chunk_heads_small * chunk_num_heads_small + + heads_large_combine_size = coeff * total_num_large_heads + heads_small_combine_size = coeff * total_num_small_heads + heads_large_chunk, heads_small_chunk = output.split([heads_large_combine_size, heads_small_combine_size], + dim=0) + heads_large_chunk = heads_large_chunk.view(num_chunk_heads_large, local_seq_len, chunk_num_heads_large, h_dim, + batch_size) + heads_small_chunk = heads_small_chunk.view(num_chunk_heads_small, local_seq_len, chunk_num_heads_small, h_dim, + batch_size) + if batch_dim_idx == 0: + #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[batch,local_seq_len,all2all_buffer_counts*n_heads,dim] + order = [4, 1, 0, 2, 3] + heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(batch_size, local_seq_len, + total_num_large_heads, h_dim) + heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(batch_size, local_seq_len, + total_num_small_heads, h_dim) + elif batch_dim_idx == 1: + #[all2all_buffer_counts, local_seq_len, n_heads,dim,batch]->[local_seq_len,batch,all2all_buffer_counts*n_heads,dim] + order = [1, 4, 0, 2, 3] + heads_large_chunk = heads_large_chunk.permute(order).contiguous().view(local_seq_len, batch_size, + total_num_large_heads, h_dim) + heads_small_chunk = heads_small_chunk.permute(order).contiguous().view(local_seq_len, batch_size, + total_num_small_heads, h_dim) + + output = torch.cat([heads_large_chunk, heads_small_chunk], 
dim=2).contiguous() + + inp_shape[scatter_idx] = inp_shape[scatter_idx] // seq_world_size + output_shape= inp_shape[: gather_idx] + \ + [total_h,] + \ + inp_shape[gather_idx + 1:] + + output = output.view(output_shape) + + return output + + +def single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, async_op=False, handle=None, type=None): + seq_world_size = dist.get_world_size(group) + # we only need num_heads once + num_heads = input.shape[2] + + if get_num_kv_heads() is not None or (num_heads % seq_world_size != 0 and not scatter_idx < 2): + # Assuming here that the number of heads for q is consistent with kv + # If not, additional logic is required for cases like GQA + if get_num_kv_heads() is None: + assert num_heads > seq_world_size, f"Number of heads ({num_heads}) must be larger than sequence parallel size ({seq_world_size})" + # set heads at first call by num_total_heads. + # then use ``get_num_kv_heads() is not None`` to re-entry uneven path. + set_num_kv_heads(num_heads) + assert async_op == False, "uneven head sp does not support async op" + return uneven_heads_all2all(input, scatter_idx, gather_idx, batch_dim_idx, group) + + pre_all2all_permute_idx, pre_all2all_inp_shape, post_all2all_permute_idx, post_all2all_res_shape = _generate_layout_params( + scatter_idx, batch_dim_idx, seq_world_size, input) + input_t = pre_all2all_fun(pre_all2all_permute_idx, pre_all2all_inp_shape, input) + + post_all2all_fun = post_all2all(post_all2all_permute_idx, post_all2all_res_shape) output = torch.empty_like(input_t) - dist.all_to_all_single(output, input_t, group=group) + work = dist.all_to_all_single(output, input_t, group=group, async_op=async_op) - # if scattering the seq-dim, transpose the heads back to the original dimension - if scatter_idx < 2: - output = output.transpose(0, 1).contiguous() + if async_op: + if type in ('dq', 'dk'): + handle[type + '_work'] = work + handle[type + '_grad'] = output + handle[type + '_post_all2all_func'] = post_all2all_fun + return output.view(post_all2all_res_shape) - return output.reshape( - inp_shape[: gather_idx] + \ - [inp_shape[gather_idx] * seq_world_size,] + \ - inp_shape[gather_idx + 1:]).contiguous() + res = post_all2all_fun(output) + return res class _SeqAllToAll(torch.autograd.Function): @staticmethod - def forward(ctx: Any, group: dist.ProcessGroup, input: Tensor, scatter_idx: int, gather_idx: int) -> Tensor: - + def forward(ctx: Any, + group: dist.ProcessGroup, + input: Tensor, + scatter_idx: int, + gather_idx: int, + batch_dim_idx: int, + stream=None, + handle=None, + type=None, + is_fwd=True) -> Tensor: ctx.group = group ctx.scatter_idx = scatter_idx ctx.gather_idx = gather_idx + ctx.stream = stream + ctx.handle = handle + ctx.type = type + ctx.batch_dim_idx = batch_dim_idx + if ctx.handle is None: + res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False) + + else: + # overlap communication path + if not is_fwd and type == 'o': + assert ctx.stream != None + res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False) + get_accelerator().current_stream().wait_stream(ctx.stream) + # The computation of d o_weight can overlap with the communication of d o_input + + elif not is_fwd and type in ('q', 'k'): + # Achieve communication overlap by pipelining the matrix computation and communication of dq, dk, and dv + type = 'd' + type + res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, True, handle, type) - return single_all_to_all(input, scatter_idx, 
gather_idx, group) + elif is_fwd and type in ('q', 'k'): + # Achieve communication overlap by pipelining the matrix computation and communication of q, k, and v + type = 'fwd_' + type + res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False, handle, type) + + else: + res = single_all_to_all(input, scatter_idx, gather_idx, batch_dim_idx, group, False) + + return res @staticmethod def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor, None, None]: - return (None, _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx), None, None) + + return (None, + _SeqAllToAll.apply(ctx.group, *grad_output, ctx.gather_idx, ctx.scatter_idx, ctx.batch_dim_idx, + ctx.stream, ctx.handle, ctx.type, False), None, None, None, None, None, None, None) class DistributedAttention(torch.nn.Module): @@ -73,6 +324,7 @@ def __init__( sequence_process_group: dist.ProcessGroup, scatter_idx: int = 2, gather_idx: int = 0, + sp_stream=None, ) -> None: super(DistributedAttention, self).__init__() @@ -80,30 +332,89 @@ def __init__( self.spg = sequence_process_group self.scatter_idx = scatter_idx self.gather_idx = gather_idx + self.sp_overlap_comm = False + self.overlap_handles = None + self.sp_stream = sp_stream + if sp_stream is not None: + self.overlap_handles = {} + self.sp_overlap_comm = True + self.dafult_stream = get_accelerator().default_stream() + + def layer_sync(self, layer): + if self.sp_overlap_comm and hasattr(layer, 'done_event'): + self.dafult_stream.wait_event(layer.done_event) - def forward(self, query: Tensor, key: Tensor, value: Tensor, *args: Any) -> Tensor: + def forward(self, + query: Tensor, + key: Tensor, + value: Tensor, + batch_dim_idx: int, + rotary_pos_emb=None, + *args: Any, + **kwargs) -> Tensor: """ forward Arguments: query (Tensor): query input to the layer key (Tensor): key input to the layer value (Tensor): value input to the layer + batch_dim_idx (int): indicating which dim is batch args: other args Returns: * output (Tensor): context output """ + # TODO Merge three alltoall calls into one # TODO (Reza): change the api on the megatron-deepspeed side so that we only receive all data (q,k, and v) together! #in shape : e.g., [s/p:h:] - query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx) - key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx) - value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx) + + def bwd_hook(layer_type): + + def pre_hook_fun(grad): + type = 'd' + layer_type + self.overlap_handles[type + '_work'].wait() + self.sp_stream.wait_stream(self.dafult_stream) + all2all_output = self.overlap_handles[type + '_grad'] + grad = list(grad) + grad[0] = self.overlap_handles[type + '_post_all2all_func'](all2all_output) + grad = tuple(grad) + + return pre_hook_fun + + self.layer_sync(query) + query_layer = _SeqAllToAll.apply(self.spg, query, self.scatter_idx, self.gather_idx, batch_dim_idx, None, + self.overlap_handles, 'q') + self.layer_sync(key) + key_layer = _SeqAllToAll.apply(self.spg, key, self.scatter_idx, self.gather_idx, batch_dim_idx, None, + self.overlap_handles, 'k') + if self.sp_overlap_comm: + self.dafult_stream.wait_stream(self.sp_stream) + + value_layer = _SeqAllToAll.apply(self.spg, value, self.scatter_idx, self.gather_idx, batch_dim_idx, None, + self.overlap_handles, 'v') + + if self.sp_overlap_comm: + # Register a hook to synchronize dq and dk after the all-to-all + # operation when the gradient data is used. 
+ # Place this logic after the q, k, v all-to-all operation to + # improve interpreter speed to + # call and launch of the forward all-to-all communication. + grad_fn_q = query.grad_fn.next_functions[0][0] + grad_fn_q.register_prehook(bwd_hook(layer_type='q')) + grad_fn_k = key.grad_fn.next_functions[0][0] + grad_fn_k.register_prehook(bwd_hook(layer_type='k')) #out shape : e.g., [s:h/p:] - context_layer = self.local_attn(query_layer, key_layer, value_layer, *args) + if rotary_pos_emb is not None: + pos_emb_cos, pos_emb_sin = rotary_pos_emb[0].permute(1, 0, 2, 3), rotary_pos_emb[1].permute(1, 0, 2, 3) + query_layer = apply_rotary_pos_emb(query_layer, pos_emb_cos, pos_emb_sin) + key_layer = apply_rotary_pos_emb(key_layer, pos_emb_cos, pos_emb_sin) + + context_layer = self.local_attn(query_layer, key_layer, value_layer, *args, **kwargs) - output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx) + output = _SeqAllToAll.apply(self.spg, context_layer, self.gather_idx, self.scatter_idx, batch_dim_idx, + self.sp_stream, self.overlap_handles, 'o') #out e.g., [s/p::h] return output diff --git a/deepspeed/utils/__init__.py b/deepspeed/utils/__init__.py index 6237d7239682..983e64642c69 100644 --- a/deepspeed/utils/__init__.py +++ b/deepspeed/utils/__init__.py @@ -10,12 +10,13 @@ from .groups import * from .nvtx import instrument_w_nvtx # TODO: Move tensor fragment and mixed precision to zero utils -from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad +from .tensor_fragment import tensor_fragment, get_full_hp_param, get_hp_fragment_mapping, fragment_address, get_full_hp_grad, map_to_flat_opt_states from .tensor_fragment import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from .tensor_fragment import set_full_hp_param -from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state +from .tensor_fragment import set_full_hp_param, set_full_hp_grad +from .tensor_fragment import safe_set_full_fp32_param, safe_set_full_optimizer_state, safe_set_full_grad from .tensor_fragment import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_optimizer_state -from .mixed_precision_linkage import link_hp_params +from .tensor_fragment import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state +from .z3_leaf_module import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module, z3_leaf_parameter, set_z3_leaf_module +from .mixed_precision_linkage import link_hp_params, lazy_init_hp_params_optimizer_state from deepspeed.runtime.dataloader import RepeatingLoader from .numa import get_numactl_cmd diff --git a/deepspeed/utils/bwc.py b/deepspeed/utils/bwc.py new file mode 100644 index 000000000000..69fcc251a684 --- /dev/null +++ b/deepspeed/utils/bwc.py @@ -0,0 +1,104 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + + +def bwc_tensor_model_parallel_rank(mpu=None): + """Backwards-compatible way of querying the tensor model parallel rank from + an ``mpu`` object. + + *Tensor* model parallelism means that tensors are physically split across + processes. This contrasts with *pipeline* model parallelism, in which the + layers are partitioned but tensors left intact. 
+ + The API for tensor model parallelism has changed across versions and this + helper provides a best-effort implementation across versions of ``mpu`` + objects. The preferred mechanism is + ``mpu.get_tensor_model_parallel_rank()``. + + This should "just work" with both Megatron-LM and DeepSpeed's pipeline + parallelism. + + Args: + mpu (model parallel unit, optional): The tensor model parallel rank. + If ``mpu=None``, returns 0. Defaults to ``None``. + + Returns: + int: the rank + """ + if mpu is None: + # No model parallelism in easy :) + return 0 + + if hasattr(mpu, 'get_tensor_model_parallel_rank'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_rank() + elif hasattr(mpu, 'get_slice_parallel_rank'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_rank() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_rank() + + +def bwc_tensor_model_parallel_world_size(mpu=None): + """Backwards-compatible way of querying the tensor model parallel world size. + Similar to bwc_tensor_model_parallel_rank. + """ + if mpu is None: + return 1 + + if hasattr(mpu, 'get_tensor_model_parallel_world_size'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_world_size() + elif hasattr(mpu, 'get_slice_parallel_world_size'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_world_size() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_world_size() + + +def bwc_tensor_model_parallel_group(mpu=None): + """Backwards-compatible way of querying the tensor model parallel group. + Similar to bwc_tensor_model_parallel_rank. + """ + if mpu is None: + return None + + if hasattr(mpu, 'get_tensor_model_parallel_group'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + return mpu.get_tensor_model_parallel_group() + elif hasattr(mpu, 'get_slice_parallel_group'): + # Some DeepSpeed + pipeline parallelism versions + return mpu.get_slice_parallel_group() + else: + # Deprecated Megatron and DeepSpeed convention + return mpu.get_model_parallel_group() + + +def bwc_pipeline_parallel_world_size(mpu=None): + """Backwards-compatible way of querying the pipeline parallel world size.""" + world_size = 1 + if mpu is not None: + if hasattr(mpu, 'get_pipeline_model_parallel_world_size'): + # New Megatron and DeepSpeed convention (post pipeline-parallelism release) + world_size = mpu.get_pipeline_model_parallel_world_size() + elif hasattr(mpu, 'get_pipe_parallel_world_size'): + # DeepSpeed Topology + world_size = mpu.get_pipe_parallel_world_size() + return world_size + + +def bwc_pipeline_parallel_group(mpu=None): + """Backwards-compatible way of querying the pipeline parallel group.""" + if mpu is None: + return None + if hasattr(mpu, 'get_pipeline_model_parallel_group'): + # Megatron + return mpu.get_pipeline_model_parallel_group() + elif hasattr(mpu, 'get_pipe_parallel_group'): + # DeepSpeed Topology + return mpu.get_pipe_parallel_group() + assert False, 'mpu does not support pipeline parallel group' diff --git a/deepspeed/utils/config.py b/deepspeed/utils/config.py new file mode 100644 index 000000000000..15f37ca7d874 --- /dev/null +++ b/deepspeed/utils/config.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from deepspeed.runtime.config_utils import DeepSpeedConfigModel + +######################################### +# Timers +######################################### +# Timers. By default, timers are enabled. +# Users can configure in ds_config.json as below example: +TIMERS_FORMAT = ''' +Timers should be enabled as: +"timers": { + "throughput": { + "enabled": true, + "synchronized": true + } +} +''' + +TIMERS = "timers" +TIMERS_THROUGHPUT = "throughput" + + +def get_timers_config(param_dict): + if param_dict and TIMERS in param_dict and TIMERS_THROUGHPUT in param_dict[TIMERS]: + timers_config_dict = param_dict[TIMERS][TIMERS_THROUGHPUT] + else: + timers_config_dict = {} + return DeepSpeedThroughputTimerConfig(**timers_config_dict) + + +class DeepSpeedThroughputTimerConfig(DeepSpeedConfigModel): + """ Configure throughput timers """ + + enabled: bool = True + """ Turn on/off throughput timers """ + + synchronized: bool = True + """ Whether to synchronize a device when measuring the time. + Synchronizing a device is required to produce the most accurate timer measurements. + However, this comes at the expense of performance degradation. The CPU timer provides + sufficient accuracy in many cases. + """ diff --git a/deepspeed/utils/debug.py b/deepspeed/utils/debug.py index 02295fa98011..cebea56255d9 100644 --- a/deepspeed/utils/debug.py +++ b/deepspeed/utils/debug.py @@ -11,6 +11,13 @@ param_names = {} +def debug_clear_module_and_param_names(): + global module_names + global param_names + module_names = {} + param_names = {} + + def debug_extract_module_and_param_names(model): # extract the fully qualified names as soon as the model is acquired global module_names diff --git a/deepspeed/utils/groups.py b/deepspeed/utils/groups.py old mode 100644 new mode 100755 index 3ebe8cd75522..6dc750035061 --- a/deepspeed/utils/groups.py +++ b/deepspeed/utils/groups.py @@ -27,8 +27,10 @@ from deepspeed import comm as dist from deepspeed.utils import log_dist +from deepspeed.utils.bwc import bwc_tensor_model_parallel_world_size, bwc_pipeline_parallel_world_size from deepspeed.utils.exceptions import DeprecatedException from deepspeed.accelerator import get_accelerator + # Expert parallel group that the current rank belongs to. _EXPERT_PARALLEL_GROUP = {} # Expert data parallel group that the current rank belongs to. @@ -44,7 +46,7 @@ # All to All quantized graident communication groups _ALL_TO_ALL_GROUP = {} -_DATA_PARALLEL_GROUP = None +mesh_device = None # Deprecated groups initialize function. @@ -60,6 +62,127 @@ def _ensure_divisibility(numerator, denominator): assert numerator % denominator == 0, '{} is not divisible by {}'.format(numerator, denominator) +# ======== Start: Tensor Parallel Group Attributes ======== + +# Intra-layer model parallel group that the current rank belongs to. +_TENSOR_MODEL_PARALLEL_GROUP = None + +# Model parallel group (both intra- and pipeline) that the current rank belongs to. +_MODEL_PARALLEL_GROUP = None +# Data parallel group that the current rank belongs to. +_DATA_PARALLEL_GROUP = None + +# These values enable us to change the mpu sizes on the fly. 
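# Illustrative sketch (not part of the patch): how the throughput-timer config added in
# deepspeed/utils/config.py above is consumed. The dict mirrors the TIMERS_FORMAT
# example; when the section is missing, the defaults (enabled=True, synchronized=True)
# of DeepSpeedThroughputTimerConfig apply.
from deepspeed.utils.config import get_timers_config

ds_config = {"timers": {"throughput": {"enabled": True, "synchronized": False}}}

cfg = get_timers_config(ds_config)
assert cfg.enabled and not cfg.synchronized

default_cfg = get_timers_config({})  # no "timers" section -> defaults
assert default_cfg.enabled and default_cfg.synchronized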
+_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None +_MPU_TENSOR_MODEL_PARALLEL_RANK = None + + +def _init_tp_mesh_device(tensor_model_parallel_size=1, data_parallel_size=None): + """Initialize model data parallel groups.""" + + global _DATA_PARALLEL_GROUP + global _MODEL_PARALLEL_GROUP + global _TENSOR_MODEL_PARALLEL_GROUP + + if _TENSOR_MODEL_PARALLEL_GROUP is not None: + return + + if data_parallel_size is None: + data_parallel_size = dist.get_world_size() // tensor_model_parallel_size + + mesh_device = dist.initialize_mesh_device((data_parallel_size, tensor_model_parallel_size), + ("data_parallel", "tensor_parallel")) + _TENSOR_MODEL_PARALLEL_GROUP = mesh_device.get_group(mesh_dim="tensor_parallel") + _DATA_PARALLEL_GROUP = mesh_device.get_group(mesh_dim="data_parallel") + + # They are always equal only in 2D (DP + TP) parallelism. + # _MODEL_PARALLEL_GROUP is assigned the same value as _TENSOR_MODEL_PARALLEL_GROUP + # to allow for future potential changes. + _MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP + + return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP + + +def get_tensor_model_parallel_group(): + """Get the tensor model parallel group the caller rank belongs to.""" + + assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \ + 'intra_layer_model parallel group is not initialized' + return _TENSOR_MODEL_PARALLEL_GROUP + + +def get_model_parallel_group(): + """Get the model parallel group the caller rank belongs to.""" + + assert _MODEL_PARALLEL_GROUP is not None, \ + 'model parallel group is not initialized' + return _MODEL_PARALLEL_GROUP + + +def get_data_parallel_group(): + """Get the data parallel group the caller rank belongs to.""" + assert _DATA_PARALLEL_GROUP is not None, \ + 'data parallel group is not initialized' + return _DATA_PARALLEL_GROUP + + +def set_tensor_model_parallel_world_size(world_size): + """Set the tensor model parallel size""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size + + +def get_tensor_model_parallel_world_size(): + """Return world size for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None: + return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE + return dist.get_world_size(group=get_tensor_model_parallel_group()) + + +def get_model_parallel_world_size(): + return get_tensor_model_parallel_world_size() + + +def set_tensor_model_parallel_rank(rank): + """Set tensor model parallel rank.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + _MPU_TENSOR_MODEL_PARALLEL_RANK = rank + + +def get_tensor_model_parallel_rank(): + """Return my rank for the tensor model parallel group.""" + global _MPU_TENSOR_MODEL_PARALLEL_RANK + if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None: + return _MPU_TENSOR_MODEL_PARALLEL_RANK + return dist.get_rank(group=get_tensor_model_parallel_group()) + + +def get_model_parallel_rank(): + return get_tensor_model_parallel_rank() + + +def get_tensor_model_parallel_src_rank(): + """Calculate the global rank corresponding to the first local rank + in the tensor model parallel group.""" + global_rank = dist.get_rank() + local_world_size = get_tensor_model_parallel_world_size() + return (global_rank // local_world_size) * local_world_size + + +def get_data_parallel_world_size(): + """Return world size for the data parallel group.""" + return dist.get_world_size(group=get_data_parallel_group()) + + +def get_data_parallel_rank(): + """Return my rank for the data parallel group.""" + return 
dist.get_rank(group=get_data_parallel_group()) + + +# ======== End: Tensor Parallel Group Attributes ======== + + # Not currently used. Helper function to create a model (tensor) parallel group. def _create_model_parallel(model_parallel_size_): """ @@ -128,31 +251,32 @@ def _create_expert_and_data_parallel(expert_parallel_size_, use_data_before_expe log_dist(f'Creating expert and data parallel groups with size {expert_parallel_size_}', ranks=[0]) world_size = dist.get_world_size() + pp_world_size = 1 if mpu is None else bwc_pipeline_parallel_world_size(mpu) rank = dist.get_rank() - _ensure_divisibility(world_size, expert_parallel_size_) + pp_stride = world_size // pp_world_size + _ensure_divisibility(pp_stride, expert_parallel_size_) group_name = f"ep_size_{expert_parallel_size_}" # Build the expert data parallel groups. global _EXPERT_DATA_PARALLEL_GROUP - ep_stride = world_size // expert_parallel_size_ + ep_stride = pp_stride // expert_parallel_size_ # Only create group if it does not already exist if group_name not in _EXPERT_DATA_PARALLEL_GROUP: - for i in range(expert_parallel_size_): - if use_data_before_expert_parallel_: - ranks = range(i * ep_stride, (i + 1) * ep_stride) - else: - ranks = range(i, world_size, expert_parallel_size_) - group = dist.new_group(ranks) - log_dist(f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if use_data_before_expert_parallel_: - if i == (rank // ep_stride): - _EXPERT_DATA_PARALLEL_GROUP[group_name] = group - else: - if i == (rank % expert_parallel_size_): + for pp_stage_start in range(0, world_size, pp_stride): + for i in range(expert_parallel_size_): + if use_data_before_expert_parallel_: + ranks = range(pp_stage_start + i * ep_stride, pp_stage_start + (i + 1) * ep_stride) + else: + ranks = range(pp_stage_start + i, pp_stage_start + pp_stride, expert_parallel_size_) + group = dist.new_group(ranks) + log_dist( + f'Creating expert data parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: _EXPERT_DATA_PARALLEL_GROUP[group_name] = group # Build the expert parallel groups. 
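# Illustrative sketch (not part of the patch): the mesh-based group setup added earlier
# in this file. ``_init_tp_mesh_device`` is an internal helper, shown here only to make
# the 2D (data_parallel x tensor_parallel) layout concrete; it assumes the script runs
# under a distributed launcher, that the backend supports DeepSpeed's device-mesh
# initialization, and that the world size is divisible by the TP degree.
import deepspeed
from deepspeed.utils import groups

def init_2d_mesh(tp_size=2):
    deepspeed.init_distributed()
    groups._init_tp_mesh_device(tensor_model_parallel_size=tp_size)
    # Per-rank coordinates in the mesh:
    return groups.get_tensor_model_parallel_rank(), groups.get_data_parallel_rank()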
@@ -161,24 +285,29 @@ def _create_expert_and_data_parallel(expert_parallel_size_, use_data_before_expe # Only create group if it does not already exist if group_name not in _EXPERT_PARALLEL_GROUP: if use_data_before_expert_parallel_: - for i in range(ep_stride): - ranks = range(i, world_size, ep_stride) - group = dist.new_group(ranks) - log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if i == (rank % ep_stride): - _EXPERT_PARALLEL_GROUP[group_name] = group + for pp_stage_start in range(0, world_size, pp_stride): + for i in range(ep_stride): + ranks = range(pp_stage_start + i, pp_stage_start + pp_stride, ep_stride) + group = dist.new_group(ranks) + log_dist( + f'creating expert parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: + _EXPERT_PARALLEL_GROUP[group_name] = group else: for i in range(world_size // expert_parallel_size_): ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_) group = dist.new_group(ranks) - log_dist(f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}', [0]) - if i == (rank // expert_parallel_size_): + log_dist(f'creating expert parallel process group named {group_name} ' + f'with ranks: {list(ranks)}', [0]) + if rank in ranks: _EXPERT_PARALLEL_GROUP[group_name] = group def _get_expert_parallel_ranks(world_size, - model_parallel_size_, + tensor_parallel_size_, expert_parallel_size_, + pipeline_parallel_size_=1, use_data_before_expert_parallel_=False): """Generate expert parallel and expert data parallel group ranks list. @@ -193,32 +322,40 @@ def _get_expert_parallel_ranks(world_size, Args: world_size (int): Distributed world size. - model_parallel_size_ (int): Model parallel group size. + tensor_parallel_size_ (int): Tensor parallel group size. expert_parallel_size_ (int): Expert parallel group size. + pipeline_parallel_size_ (int): Pipeline parallel group size use_data_before_expert_parallel_ (bool): Use the D + E instead of E + D topology Returns: Expert parallel group ranks and Expert data parallel group ranks list. 
""" - _ensure_divisibility(world_size, model_parallel_size_) - dp_world_size = world_size // model_parallel_size_ + _ensure_divisibility(world_size, tensor_parallel_size_ * pipeline_parallel_size_) + dp_world_size = world_size // (tensor_parallel_size_ * pipeline_parallel_size_) _ensure_divisibility(dp_world_size, expert_parallel_size_) # Generate data parallel groups data_parallel_groups = [] - dp_group_size = model_parallel_size_ + dp_group_size = tensor_parallel_size_ + pp_stride = world_size // pipeline_parallel_size_ if use_data_before_expert_parallel_: - dp_stride = world_size // expert_parallel_size_ // model_parallel_size_ - for i in range(dp_group_size): - data_parallel_groups.append(list()) - for ds in range(dp_stride): - # [0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30] - # [1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31] - data_parallel_groups[-1].extend( - list(range(i + ds * model_parallel_size_, world_size, dp_stride * model_parallel_size_))) + dp_stride = world_size // expert_parallel_size_ // tensor_parallel_size_ // pipeline_parallel_size_ + for pp_stage_start in range(0, world_size, pp_stride): + pp_stage_next = pp_stage_start + pp_stride + for i in range(dp_group_size): + data_parallel_groups.append(list()) + for ds in range(dp_stride): + # [0, 4, 8, 12, 16, 20, 24, 28, 2, 6, 10, 14, 18, 22, 26, 30] + # [1, 5, 9, 13, 17, 21, 25, 29, 3, 7, 11, 15, 19, 23, 27, 31] + data_parallel_groups[-1].extend( + list( + range(pp_stage_start + i + ds * tensor_parallel_size_, pp_stage_next, + dp_stride * tensor_parallel_size_))) else: - for i in range(dp_group_size): - data_parallel_groups.append(list(range(i, world_size, dp_group_size))) + for pp_stage_start in range(0, world_size, pp_stride): + pp_stage_next = pp_stage_start + pp_stride + for i in range(dp_group_size): + data_parallel_groups.append(list(range(pp_stage_start + i, pp_stage_next, dp_group_size))) expert_parallel_groups = [] expert_data_parallel_groups = [] @@ -252,36 +389,33 @@ def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu, use_data_ expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ assert dist.is_initialized(), "dist is not initialized" - model_parallel_size_ = mpu.get_model_parallel_world_size() + tensor_parallel_size_ = bwc_tensor_model_parallel_world_size(mpu) global expert_tensor_parallel_world_size - expert_tensor_parallel_world_size = model_parallel_size_ + expert_tensor_parallel_world_size = tensor_parallel_size_ world_size = dist.get_world_size() rank = dist.get_rank() dp_world_size = mpu.get_data_parallel_world_size() - dp_rank = mpu.get_data_parallel_rank() + pp_world_size = 1 if mpu is None else bwc_pipeline_parallel_world_size(mpu) - _ensure_divisibility(world_size, model_parallel_size_) + _ensure_divisibility(world_size, tensor_parallel_size_) _ensure_divisibility(dp_world_size, expert_parallel_size_) log_dist( - f"Creating deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}", - [0]) + f"Creating deepspeed groups with model parallel size {tensor_parallel_size_}, " + f"pipeline parallel size {pp_world_size}, expert parallel size {expert_parallel_size_}, " + f"world size {world_size}, dp world size {dp_world_size}", [0]) global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP - # Get world size and rank. Ensure some consistencies. 
- _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group() - _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group() - group_name = f"ep_size_{expert_parallel_size_}" # Only create groups if they don't already exist # Need to check conditions outside the group creation loop because of the way torch.dist group creation works if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP: expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks( - world_size, model_parallel_size_, expert_parallel_size_, use_data_before_expert_parallel_) + world_size, tensor_parallel_size_, expert_parallel_size_, pp_world_size, use_data_before_expert_parallel_) for ranks in expert_parallel_groups: group = dist.new_group(ranks) if rank in list(ranks): @@ -386,8 +520,11 @@ def _get_data_parallel_group(): """Get the data parallel group the caller rank belongs to.""" assert dist.is_initialized(), 'dist is not initialized' global mpu + if mesh_device is not None: + return mesh_device.get_group(mesh_dim="data_parallel") if mpu is not None: return mpu.get_data_parallel_group() + # Return the clone of dist world group return _clone_world_group() @@ -430,6 +567,8 @@ def _get_expert_data_parallel_rank(group_name): def _get_data_parallel_world_size(): """Return world size for the data parallel group.""" + if mesh_device is not None: + return dist.get_world_size(mesh_device.get_group(mesh_dim="data_parallel")) global mpu if mpu is not None: return mpu.get_data_parallel_world_size() @@ -452,6 +591,8 @@ def _get_data_parallel_rank(): def _get_sequence_parallel_world_size(): """Return world size for the model parallel group.""" global mpu + if mesh_device is not None: + return dist.get_world_size(mesh_device.get_group(mesh_dim="sequence_parallel")) if mpu is not None and hasattr(mpu, 'get_sequence_parallel_world_size'): return mpu.get_sequence_parallel_world_size() return 1 @@ -462,14 +603,18 @@ def _get_sequence_parallel_rank(): global mpu if mpu is not None and hasattr(mpu, 'get_sequence_parallel_rank'): return mpu.get_sequence_parallel_rank() + if mesh_device is not None: + return dist.get_rank(mesh_device.get_group(mesh_dim="sequence_parallel")) return 0 def _get_sequence_parallel_group(): global mpu - if mpu is not None and hasattr(mpu, 'get_sequence_parallel_group'): - return mpu.get_sequence_parallel_group() - return None + if mpu is None or not hasattr(mpu, 'get_sequence_parallel_group'): + if mesh_device is None: + raise KeyError("No sequence parallel group found") + return mesh_device.get_group(mesh_dim="sequence_parallel") + return mpu.get_sequence_parallel_group() def _get_sequence_data_parallel_world_size(): @@ -508,7 +653,7 @@ def _create_zero_param_parallel_group(group_size): Example - ZP + D parallel world_size = 16 - zero_hpz_partition_size = 2 # number of ranks with with replicated params (dual partitioning) + zero_hpz_partition_size = 2 # number of ranks with replicated params (dual partitioning) zero_param_intra_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - segmented (subgroup) with rep partition data_parallel_group = [0,1,...,15] - all reduce is on ZeRO model """ diff --git a/deepspeed/utils/logging.py b/deepspeed/utils/logging.py index 1e62d96e1032..ed691e3985e1 100644 --- a/deepspeed/utils/logging.py +++ b/deepspeed/utils/logging.py @@ -7,6 +7,8 @@ import logging import sys import os +import torch +from deepspeed.utils.torch import required_torch_version log_levels = { "debug": logging.DEBUG, @@ -44,6 +46,15 @@ def create_logger(name=None, 
level=logging.INFO): ch.setLevel(level) ch.setFormatter(formatter) logger_.addHandler(ch) + if required_torch_version(min_version=2.6) and os.getenv("DISABLE_LOGS_WHILE_COMPILING", "0") == "1": + excluded_set = { + item.strip() + for item in os.getenv("LOGGER_METHODS_TO_EXCLUDE_FROM_DISABLE", "").split(",") + } + ignore_set = {'info', 'debug', 'error', 'warning', 'critical', 'exception', 'isEnabledFor'} - excluded_set + for method in ignore_set: + original_logger = getattr(logger_, method) + torch._dynamo.config.ignore_logger_methods.add(original_logger) return logger_ @@ -147,6 +158,6 @@ def should_log_le(max_log_level_str): max_log_level_str = max_log_level_str.lower() if max_log_level_str not in log_levels: - raise ValueError(f"{max_log_level_str} is not one of the `logging` levels") + raise ValueError(f"{max_log_level_str} is not one of the logging levels") return get_current_level() <= log_levels[max_log_level_str] diff --git a/deepspeed/utils/mixed_precision_linkage.py b/deepspeed/utils/mixed_precision_linkage.py index b1afa8f00aa3..c97515ca8fef 100644 --- a/deepspeed/utils/mixed_precision_linkage.py +++ b/deepspeed/utils/mixed_precision_linkage.py @@ -5,17 +5,23 @@ import types from deepspeed.utils import get_full_hp_param, get_full_hp_grad, get_hp_fragment_mapping -from deepspeed.utils import set_full_hp_param +from deepspeed.utils import set_full_hp_param, set_full_hp_grad def link_hp_params(lp_param_list, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, - param_group_index, partition_start, partition_size, partition_optimizer_state, dp_group): + param_group_index, partition_start, partition_size, dp_group): local_lp_param_and_offset = _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group) for lp_param, lp_start in local_lp_param_and_offset: lp_param._hp_mapping = get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, param_group_index, - partition_start, partition_size, partition_optimizer_state) + partition_start, partition_size) + + +def lazy_init_hp_params_optimizer_state(lp_param_list, flat_hp_partition, optimizer_state): + for lp in lp_param_list: + if lp._hp_mapping is not None: + lp._hp_mapping.set_optim_state_fragment(flat_hp_partition, optimizer_state[flat_hp_partition]) def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_group): @@ -29,6 +35,7 @@ def _init_lp_to_hp_mapping(lp_param_list, partition_start, partition_size, dp_gr lp_param.get_full_hp_param = types.MethodType(get_full_hp_param, lp_param) lp_param.get_full_hp_grad = types.MethodType(get_full_hp_grad, lp_param) lp_param.set_full_hp_param = types.MethodType(set_full_hp_param, lp_param) + lp_param.set_full_hp_grad = types.MethodType(set_full_hp_grad, lp_param) # lp_param overlaps with partition if both are true # 1) current_offset < partition_end, diff --git a/deepspeed/utils/numa.py b/deepspeed/utils/numa.py index 13617826b1ce..aba3b5179d41 100644 --- a/deepspeed/utils/numa.py +++ b/deepspeed/utils/numa.py @@ -23,7 +23,10 @@ # ] def get_numa_cores(): ret = [] - output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + try: + output = subprocess.check_output(['numactl', '--hardware']).decode("utf-8") + except: + return [] lines = output.split('\n') for line in lines: if line.startswith('available:'): @@ -49,8 +52,8 @@ def check_for_numactl_pkg(): flag, lib, tool = data path = distutils.spawn.find_executable(pkgmgr) if path is not None: - cmd = f"{pkgmgr} {flag} 
{lib}" - result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.wait() == 0: found = True else: diff --git a/deepspeed/utils/nvtx.py b/deepspeed/utils/nvtx.py index 3823599e7bf2..72d7c863a33f 100644 --- a/deepspeed/utils/nvtx.py +++ b/deepspeed/utils/nvtx.py @@ -4,16 +4,22 @@ # DeepSpeed Team from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.compiler import is_compiling + +enable_nvtx = True def instrument_w_nvtx(func): - """decorator that causes an NVTX range to be recorded for the duration of the - function call.""" + """Decorator that records an NVTX range for the duration of the function call. + Skips NVTX instrumentation when torch.compile is active to avoid graph breaks. + """ def wrapped_fn(*args, **kwargs): - get_accelerator().range_push(func.__qualname__) + if enable_nvtx and not is_compiling(): + get_accelerator().range_push(func.__qualname__) ret_val = func(*args, **kwargs) - get_accelerator().range_pop() + if enable_nvtx and not is_compiling(): + get_accelerator().range_pop() return ret_val return wrapped_fn diff --git a/deepspeed/utils/tensor_fragment.py b/deepspeed/utils/tensor_fragment.py index 5f94070dc4c7..053c8b5adad0 100644 --- a/deepspeed/utils/tensor_fragment.py +++ b/deepspeed/utils/tensor_fragment.py @@ -21,11 +21,11 @@ class tensor_fragment: lp_fragment_address: fragment_address hp_fragment: torch.Tensor hp_fragment_address: fragment_address - optim_fragment: Dict gradient_dict: Dict offload_gradient_dict: Dict use_offload: bool param_group_index: int + optim_fragment: Dict = None def update_hp(self): self.hp_fragment.data.copy_(self.lp_fragment.data) @@ -39,6 +39,13 @@ def get_optim_state_fragment(self, key): else: raise ValueError(f'{key} not found in optimizer state fragment') + def set_optim_state_fragment(self, flat_hp_partition, optim_fragment): + self.optim_fragment = { + key: value.narrow(0, self.hp_fragment_address.start, self.hp_fragment_address.numel) + for key, value in optim_fragment.items() + if torch.is_tensor(value) and value.shape == flat_hp_partition.shape + } + def get_hp_fragment_address(self): return self.hp_fragment_address @@ -50,6 +57,32 @@ def get_hp_fragment(self, optim_state_key=None): return self.hp_fragment return self.get_optim_state_fragment(optim_state_key) + def get_lp_grad_fragment(self, index_in_param_group): + if self.use_offload: + gradient_dict = self.offload_gradient_dict + else: + gradient_dict = self.gradient_dict + + if self.param_group_index not in gradient_dict or gradient_dict[self.param_group_index] is None: + raise ValueError("Gradients are only available immediately after backward and before engine step") + + return gradient_dict[self.param_group_index][index_in_param_group] + + +def map_to_flat_opt_states(flat_hp_tensor, lp_tensors, optim_state, opt_keys): + for key in opt_keys: + hp_param = flat_hp_tensor + buffer = torch.zeros_like(hp_param) + + for lp in lp_tensors: + if lp._hp_mapping is not None: + hp_fragment_address = lp._hp_mapping.get_hp_fragment_address() + hp_fragment = buffer.narrow(0, hp_fragment_address.start, hp_fragment_address.numel) + hp_fragment.data.copy_(lp._hp_mapping.get_hp_fragment(optim_state_key=key).data) + lp._hp_mapping.hp_fragment = hp_fragment + + optim_state[hp_param][key] = buffer + def get_full_hp_param(self, optim_state_key=None): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() @@ 
-73,17 +106,7 @@ def set_full_hp_param(self, value, optim_state_key=None): def get_full_hp_grad(self): reduce_buffer = torch.zeros_like(self, dtype=torch.float32).flatten() if self._hp_mapping is not None: - hp_mapping = self._hp_mapping - - if hp_mapping.use_offload: - gradient_dict = hp_mapping.offload_gradient_dict - else: - gradient_dict = hp_mapping.gradient_dict - - if hp_mapping.param_group_index not in gradient_dict or gradient_dict[hp_mapping.param_group_index] is None: - raise ValueError("Gradients are only available immediately after backward and before engine step") - - lp_grad_fragment = gradient_dict[hp_mapping.param_group_index][self._index_in_param_group] + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) hp_grad_fragment = lp_grad_fragment.to(torch.float32).flatten() lp_frag_address = self._hp_mapping.lp_fragment_address @@ -98,6 +121,14 @@ def get_full_hp_grad(self): return reduce_buffer.reshape_as(self) +def set_full_hp_grad(self, value): + if self._hp_mapping is not None: + lp_grad_fragment = self._hp_mapping.get_lp_grad_fragment(self._index_in_param_group) + lp_frag_address = self._hp_mapping.lp_fragment_address + value_fragment = torch.narrow(value.flatten(), 0, lp_frag_address.start, lp_frag_address.numel) + lp_grad_fragment.data.copy_(value_fragment.data.reshape_as(lp_grad_fragment.data)) + + def safe_get_full_fp32_param(param): """Assemble and return the fp32 parameter of a low-precision (e.g., fp16) parameter. @@ -166,7 +197,10 @@ def safe_set_full_optimizer_state(param, value, optim_state_key): # TODO: Figure out the correct return dtype def safe_get_full_grad(param): - """Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + """ + Assemble and return the fp32 gradient of a low-precision (e.g., fp16) parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). Args: param (``torch.nn.Parameter``): A model parameter @@ -185,77 +219,98 @@ def safe_get_full_grad(param): return None +def safe_set_full_grad(param, value): + """ + Update the partitioned gradient of a low-precision (e.g., fp16) parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter + value (``torch.Tensor``): The un-partitioned new gradient value. + """ + if param.grad is not None: + param.grad.copy_(value) + elif hasattr(param, 'ds_id'): + # ZeRO stage 3 param + param._z3_optimizer.set_fp32_grad_for_param(value, param) + elif hasattr(param, '_hp_mapping'): + # ZeRO stage 1, 2, and bf16_optimizer params + param.set_full_hp_grad(value) + + ### Local API START ### def safe_get_local_grad(param): - """Get the fp32 gradient of a partitioned parameter. + """ + Get the local gradient partition of a ZeRO-3 partitioned parameter. + The return data type is that used for gradient accumulation. This is usually the param data type, + but could also be different (e.g., bf16 param training with fp32 gradient accumulation). 
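# Illustrative sketch (not part of the patch): using the full-gradient getter/setter
# documented above. Both must be called after backward() and before the engine step,
# while the partitioned gradients still exist; ``engine`` is a hypothetical
# deepspeed.initialize() engine passed in by the caller.
from deepspeed.utils import safe_get_full_grad, safe_set_full_grad

def scale_full_grads(engine, loss, scale=0.5):
    engine.backward(loss)
    for param in engine.module.parameters():
        grad = safe_get_full_grad(param)  # un-partitioned grad in the accumulation dtype
        if grad is not None:
            safe_set_full_grad(param, grad * scale)  # write back the modified gradient
    engine.step()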
Args: param (``torch.nn.Parameter``): A model parameter """ - if param.grad is not None: - return param.grad + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_grad_for_param(param) - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_grad_for_param(param) - return None +def safe_set_local_grad(param, value): + """ + Update the local gradient partition of a ZeRO-3 partitioned parameter. + To avoid precision issues, the update value should have the data type of + gradient accumulation. + + Args: + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local gradient partition. + """ + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_grad_for_param(value, param) def safe_get_local_fp32_param(param): - """Get the fp32 partitioned parameter. + """Get the local partition of a ZeRO-3 partitioned parameter in fp32 precision. Args: - param (``torch.nn.Parameter``): A model parameter + param (``torch.nn.Parameter``): A model parameter. """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param) def safe_get_local_optimizer_state(param, optim_state_key): - """Get the fp32 optimizer state of a partitioned parameter. + """Get the local optimizer state partition of ZeRO-3 partitioned parameter in fp32 precision. Args: param (``torch.nn.Parameter``): A model parameter optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) - - return None + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + return param._z3_optimizer.get_local_fp32_param(param, optim_state_key) def safe_set_local_optimizer_state(param, value, optim_state_key): - """Update the fp32 optimizer state of a partitioned parameter. + """Update the local optimizer state partition of a ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value - optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer) + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local optimizer state partition. + optim_state_key (``string``): Key value of optimizer state (e.g., `exp_avg` in Adam optimizer). """ - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param, optim_state_key) def safe_set_local_fp32_param(param, value): - """Update the partitioned fp32 parameter. + """Update the local partition of ZeRO-3 partitioned parameter. Args: - param (``torch.nn.Parameter``): A model parameter - value (``torch.Tensor``): New value + param (``torch.nn.Parameter``): A model parameter. + value (``torch.Tensor``): New value of local parameter partition. 
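# Illustrative sketch (not part of the patch): the ZeRO-3 "local" APIs documented above
# operate only on this rank's partition, so no gather is performed. ``engine`` is a
# hypothetical ZeRO-3 deepspeed.initialize() engine, "exp_avg" assumes an Adam-style
# optimizer, and gradient access belongs between backward() and step().
from deepspeed.utils import (safe_get_local_fp32_param, safe_get_local_grad,
                             safe_set_local_grad, safe_get_local_optimizer_state)

def clip_local_grads_and_peek(engine, max_abs=1.0):
    peek = {}
    for name, param in engine.module.named_parameters():
        local_grad = safe_get_local_grad(param)          # this rank's gradient partition
        safe_set_local_grad(param, local_grad.clamp(-max_abs, max_abs))
        peek[name] = (safe_get_local_fp32_param(param),  # this rank's fp32 weight shard
                      safe_get_local_optimizer_state(param, "exp_avg"))
    return peek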
""" - # ZeRO stage 3 param - if hasattr(param, 'ds_id'): - param._z3_optimizer.set_local_hp_param(value, param) + assert hasattr(param, 'ds_id'), f'This API is only defined for ZeRO-3 partitioned parameters' + param._z3_optimizer.set_local_hp_param(value, param) ### Local API END ### -# TODO: Implement API for setting ZeRO partitioned gradients - def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict, offload_gradient_dict, use_offload, - param_group_index, partition_start, partition_size, optimizer_state_dict): + param_group_index, partition_start, partition_size): lp_end = lp_param.numel() + lp_start hp_start = partition_start hp_end = partition_start + partition_size @@ -268,11 +323,6 @@ def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict fragment_numel = fragment_end - fragment_start hp_frag_address = fragment_address(start=fragment_start - hp_start, numel=fragment_numel) hp_fragment_tensor = flat_hp_partition.narrow(0, hp_frag_address.start, hp_frag_address.numel) - optim_fragment = { - key: value.narrow(0, hp_frag_address.start, hp_frag_address.numel) - for key, value in optimizer_state_dict.items() - if torch.is_tensor(value) and value.shape == flat_hp_partition.shape - } lp_frag_address = fragment_address(start=fragment_start - lp_start, numel=fragment_numel) lp_fragment_tensor = lp_param.flatten().narrow(0, lp_frag_address.start, lp_frag_address.numel) @@ -281,7 +331,6 @@ def get_hp_fragment_mapping(lp_param, lp_start, flat_hp_partition, gradient_dict lp_fragment_address=lp_frag_address, hp_fragment=hp_fragment_tensor, hp_fragment_address=hp_frag_address, - optim_fragment=optim_fragment, gradient_dict=gradient_dict, offload_gradient_dict=offload_gradient_dict, use_offload=use_offload, diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py index 4aac4ded1243..64ae8ac0e5b4 100755 --- a/deepspeed/utils/timer.py +++ b/deepspeed/utils/timer.py @@ -18,6 +18,7 @@ BACKWARD_REDUCE_GLOBAL_TIMER = 'bwd_allreduce' STEP_MICRO_TIMER = 'step_microstep' STEP_GLOBAL_TIMER = 'step' +TIME_EPSILON = 1e-6 try: import psutil @@ -50,7 +51,7 @@ def __init__(self, name): self.name_ = name self.started_ = False self.event_timers = [] - self.use_host_timer = get_accelerator().is_synchronized_device() + self.use_host_timer = get_accelerator().use_host_timers() self.start_event = None self.elapsed_records = None self.start_time = 0.0 @@ -197,15 +198,9 @@ def get_mean(self, names, normalizer=1.0, reset=True): class ThroughputTimer: - def __init__( - self, - batch_size, - start_step=2, - steps_per_output=50, - monitor_memory=False, - logging_fn=None, - ): + def __init__(self, config, batch_size, start_step=2, steps_per_output=None, monitor_memory=False, logging_fn=None): from deepspeed.utils import logger + self.config = config self.start_time = 0 self.end_time = 0 self.started = False @@ -234,14 +229,22 @@ def _init_timer(self): self.initialized = True def start(self): + if not self.config.enabled: + return self._init_timer() self.started = True if self.global_step_count >= self.start_step: - get_accelerator().synchronize() + if self.config.synchronized: + get_accelerator().synchronize() self.start_time = time.time() + def _is_report_boundary(self): + if self.steps_per_output is None: + return False + return self.global_step_count % self.steps_per_output == 0 + def stop(self, global_step=False, report_speed=True): - if not self.started: + if not self.config.enabled or not self.started: return self.started = False self.micro_step_count += 1 @@ 
-249,14 +252,15 @@ def stop(self, global_step=False, report_speed=True): self.global_step_count += 1 if self.start_time > 0: - get_accelerator().synchronize() + if self.config.synchronized: + get_accelerator().synchronize() self.end_time = time.time() duration = self.end_time - self.start_time self.total_elapsed_time += duration self.step_elapsed_time += duration if global_step: - if report_speed and self.global_step_count % self.steps_per_output == 0: + if report_speed and self._is_report_boundary(): self.logging( "epoch={}/micro_step={}/global_step={}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, " "MemAllocated={}GB, MaxMemAllocated={}GB".format( @@ -264,7 +268,7 @@ def stop(self, global_step=False, report_speed=True): self.micro_step_count, self.global_step_count, self.avg_samples_per_sec(), - self.batch_size / self.step_elapsed_time, + self.batch_size / (self.step_elapsed_time + TIME_EPSILON), round(get_accelerator().memory_allocated() / 1024**3, 2), round(get_accelerator().max_memory_allocated() / 1024**3, 2), )) diff --git a/deepspeed/utils/torch.py b/deepspeed/utils/torch.py new file mode 100644 index 000000000000..1d32775fe64a --- /dev/null +++ b/deepspeed/utils/torch.py @@ -0,0 +1,31 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from packaging import version as pkg_version + +import torch + + +def required_torch_version(min_version=None, max_version=None): + assert min_version or max_version, "Must provide a min_version or max_version argument" + + torch_version = pkg_version.parse(torch.__version__) + + if min_version and pkg_version.parse(str(min_version)) > torch_version: + return False + + if max_version and pkg_version.parse(str(max_version)) < torch_version: + return False + + return True + + +def register_grad_hook(param, hook): + if required_torch_version(min_version=2.1): + return param.register_post_accumulate_grad_hook(hook) + else: + param_tmp = param.expand_as(param) + grad_acc = param_tmp.grad_fn.next_functions[0][0] + return grad_acc.register_hook(hook) diff --git a/deepspeed/utils/z3_leaf_module.py b/deepspeed/utils/z3_leaf_module.py new file mode 100644 index 000000000000..14e8ae2d2823 --- /dev/null +++ b/deepspeed/utils/z3_leaf_module.py @@ -0,0 +1,93 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +from typing import List, Type, Union + + +def z3_leaf_module(model: torch.nn.Module) -> bool: + """Returns whether a module in `model` has been flagged as a 'leaf' module. + See `set_z3_leaf_modules` for more details. + Args: + model (torch.nn.Module): The model to which the leaf module flag will be applied. + Returns: + bool: Whether the module has been flagged as a 'leaf' module. + """ + return hasattr(model, '_z3_leaf') and model._z3_leaf + + +def z3_leaf_parameter(model: torch.nn.Parameter) -> bool: + """Returns whether a parameter belongs to a leaf module. + See `set_z3_leaf_modules` for more details. + Args: + model (torch.nn.Parameter): The parameter to which the leaf module flag will be applied. + Returns: + bool: Whether the parameter belongs to a leaf module. + """ + return hasattr(model, 'ds_z3_leaf_module') + + +def get_z3_leaf_modules(model: torch.nn.Module) -> List[torch.nn.Module]: + """Returns a list of modules in `model` that have been flagged as 'leaf' modules. + See `set_z3_leaf_modules` for more details. + Args: + model (torch.nn.Module): The model to which the leaf module flag will be applied. 
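# Illustrative sketch (not part of the patch): flagging leaf modules with the helpers in
# this file. ``MyExpertBlock`` is a hypothetical module class name; classes may be passed
# as types or as class-name strings, and a ValueError is raised if nothing matches.
from deepspeed.utils import set_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module

def mark_experts_as_leaves(model):
    flagged = set_z3_leaf_modules(model, ["MyExpertBlock"])  # stop ZeRO-3 hook recursion here
    assert all(z3_leaf_module(m) for m in flagged)
    return get_z3_leaf_modules(model)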
+ Returns: + List[torch.nn.Module]: A list of modules that have been flagged as 'leaf' modules. + """ + return [module for module in model.modules() if z3_leaf_module(module)] + + +def set_z3_leaf_module(model: torch.nn.Module, flag: bool): + model._z3_leaf = flag + + +def _do_set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type], List[str]], + flag: bool) -> List[torch.nn.Module]: + assert all(isinstance(module_class, (type, str) ) for module_class in leaf_module_classes), \ + f'leaf_module_classes must be a list of types or names, got {leaf_module_classes}' + + leaf_modules = [] + + def _set_z3_leaf_flag(model: torch.nn.Module): + nonlocal leaf_modules + for module in leaf_module_classes: + if (isinstance(module, type) and model.__class__ == module) or \ + (isinstance(module, str) and model.__class__.__name__ == module): + model._z3_leaf = flag + leaf_modules.append(model) + + model.apply(_set_z3_leaf_flag) + + if len(leaf_modules) == 0: + raise ValueError(f'No modules of type {leaf_module_classes} found in model {model}') + + return leaf_modules + + +def set_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: Union[List[Type], + List[str]]) -> List[torch.nn.Module]: + """Sets a flag within a module in `model` to instruct ZeRO3 to stop setting hooks recursively when it encounters a module class listed in `leaf_module_classes`. + This is particularly useful in the context of Mixture of Experts (MoE) models. In MoE models, the computation order of experts varies across forward passes. This variability can disrupt ZeRO3's functionality, as ZeRO3 relies on tracking the computation order of modules to prefetch parameters efficiently. By designating a module as a 'leaf' node, ZeRO3 will prefetch parameters for all child modules upon entering the module. + Another scenario where this functionality is beneficial is in models with excessively fine-grained nested modules, where it helps to avoid the overhead associated with hooks. + Args: + model (torch.nn.Module): The model to which the leaf module flag will be applied. + leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules. + Returns: + List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`. + """ + return _do_set_z3_leaf_modules(model, leaf_module_classes, True) + + +def unset_z3_leaf_modules(model: torch.nn.Module, leaf_module_classes: List[Type]) -> List[torch.nn.Module]: + """Unsets a flag within a module in `model` to instruct ZeRO3 to resume setting hooks recursively when it encounters a module class listed in `leaf_module_classes`. + See `set_z3_leaf_modules` for more details. + Args: + model (torch.nn.Module): The model to which the leaf module flag will be applied. + leaf_module_classes (Union[List[Type], List[str]]): A list of module classes that should be flagged as 'leaf' modules. + Returns: + List[torch.nn.Module]: A list of modules that match the module classes in `leaf_module_classes`. + """ + return _do_set_z3_leaf_modules(model, leaf_module_classes, False) diff --git a/deepspeed/utils/zero_to_fp32.py b/deepspeed/utils/zero_to_fp32.py index c98caae31534..0e759146cadd 100755 --- a/deepspeed/utils/zero_to_fp32.py +++ b/deepspeed/utils/zero_to_fp32.py @@ -10,7 +10,10 @@ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any # application. # -# example: python zero_to_fp32.py . pytorch_model.bin +# example: +# python zero_to_fp32.py . 
output_dir/ +# or +# python zero_to_fp32.py . output_dir/ --safe_serialization import argparse import torch @@ -18,6 +21,10 @@ import math import os import re +import gc +import json +import numpy as np +from tqdm import tqdm from collections import OrderedDict from dataclasses import dataclass @@ -95,7 +102,7 @@ def get_model_state_files(checkpoint_dir): def parse_model_states(files): zero_model_states = [] for file in files: - state_dict = torch.load(file, map_location=device) + state_dict = torch.load(file, map_location=device, weights_only=False) if BUFFER_NAMES not in state_dict: raise ValueError(f"{file} is not a model state checkpoint") @@ -139,11 +146,10 @@ def parse_model_states(files): def parse_optim_states(files, ds_checkpoint_dir): - total_files = len(files) state_dicts = [] - for f in files: - state_dict = torch.load(f, map_location=device) + for f in tqdm(files, desc='Loading checkpoint shards'): + state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False) # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights # and also handle the case where it was already removed by another helper script state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) @@ -175,23 +181,11 @@ def parse_optim_states(files, ds_checkpoint_dir): else: raise ValueError(f"unknown zero stage {zero_stage}") - if zero_stage <= 2: - fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] - elif zero_stage == 3: - # if there is more than one param group, there will be multiple flattened tensors - one - # flattened tensor per group - for simplicity merge them into a single tensor - # - # XXX: could make the script more memory efficient for when there are multiple groups - it - # will require matching the sub-lists of param_shapes for each param group flattened tensor - - fp32_flat_groups = [ - torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) - ] - + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] return zero_stage, world_size, fp32_flat_groups -def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): """ Returns fp32 state_dict reconstructed from ds checkpoint @@ -211,9 +205,11 @@ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir): print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') if zero_stage <= 2: - return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states) + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) elif zero_stage == 3: - return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states) + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) def _zero2_merge_frozen_params(state_dict, zero_model_states): @@ -248,6 +244,11 @@ def _zero2_merge_frozen_params(state_dict, zero_model_states): print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): param_shapes = 
zero_model_states[0].param_shapes @@ -287,7 +288,7 @@ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero avail_numel = full_single_fp32_vector.numel() for name, shape in shapes.items(): - unpartitioned_numel = shape.numel() + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) total_numel += unpartitioned_numel total_params += 1 @@ -321,7 +322,8 @@ def zero2_align(x): print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") -def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states): +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): state_dict = OrderedDict() # buffers @@ -330,7 +332,8 @@ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zer if debug: print(f"added {len(buffers)} buffers") - _zero2_merge_frozen_params(state_dict, zero_model_states) + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) @@ -385,9 +388,56 @@ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") +class GatheredTensor: + """ + A pseudo tensor that collects partitioned weights. + It is more memory efficient when there are multiple groups. + """ + + def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape): + self.flat_groups = flat_groups + self.flat_groups_offset = flat_groups_offset + self.offset = offset + self.partitioned_numel = partitioned_numel + self.shape = shape + self.dtype = self.flat_groups[0][0].dtype + + def contiguous(self): + """ + Merge partitioned weights from flat_groups into a single tensor. 
+ """ + end_idx = self.offset + self.partitioned_numel + world_size = len(self.flat_groups) + pad_flat_param_chunks = [] + + for rank_i in range(world_size): + # for each rank, we need to collect weights from related group/groups + flat_groups_at_rank_i = self.flat_groups[rank_i] + start_group_id = None + end_group_id = None + for group_id in range(len(self.flat_groups_offset)): + if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]: + start_group_id = group_id + if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]: + end_group_id = group_id + break + # collect weights from related group/groups + for group_id in range(start_group_id, end_group_id + 1): + flat_tensor = flat_groups_at_rank_i[group_id] + start_offset = self.offset - self.flat_groups_offset[group_id] + end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id] + pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset]) + + # collect weights from all ranks + pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0) + param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous() + return param + + def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): param_shapes = zero_model_states[0].param_shapes - avail_numel = fp32_flat_groups[0].numel() * world_size + avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each # param, re-consolidating each param, while dealing with padding if any @@ -411,12 +461,11 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero offset = 0 total_numel = 0 total_params = 0 - for name, shape in param_shapes.items(): - + flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]])) + for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'): unpartitioned_numel = shape.numel() total_numel += unpartitioned_numel total_params += 1 - partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) if debug: @@ -424,10 +473,9 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" ) - # XXX: memory usage doubles here - state_dict[name] = torch.cat( - tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), - 0).narrow(0, 0, unpartitioned_numel).view(shape) + # memory efficient tensor + tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape) + state_dict[name] = tensor offset += partitioned_numel offset *= world_size @@ -439,7 +487,8 @@ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") -def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states): +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): state_dict = OrderedDict() # buffers @@ -448,7 +497,8 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zer if debug: print(f"added {len(buffers)} buffers") 
- _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) @@ -460,7 +510,30 @@ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zer return state_dict -def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): +def to_torch_tensor(state_dict, return_empty_tensor=False): + """ + Convert state_dict of GatheredTensor to torch tensor + """ + torch_state_dict = {} + converted_tensors = {} + for name, tensor in state_dict.items(): + tensor_id = id(tensor) + if tensor_id in converted_tensors: # shared tensors + shared_tensor = torch_state_dict[converted_tensors[tensor_id]] + torch_state_dict[name] = shared_tensor + else: + converted_tensors[tensor_id] = name + if return_empty_tensor: + torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype) + else: + torch_state_dict[name] = tensor.contiguous() + return torch_state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag=None, + exclude_frozen_parameters=False, + lazy_mode=False): """ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example @@ -469,14 +542,13 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): Args: - ``checkpoint_dir``: path to the desired checkpoint folder - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient. + Convert the pesduo tensor to torch tensor by ``.contiguous()`` Returns: - pytorch ``state_dict`` - Note: this approach may not work if your application doesn't have sufficient free CPU memory and - you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with - the checkpoint. - A typical usage might be :: from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint @@ -492,6 +564,16 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + Note: the above usage may not work if your application doesn't have sufficient free CPU memory. + You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. 
Or you can load state_dict in lazy mode :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu + for name, lazy_tensor in state_dict.item(): + tensor = lazy_tensor.contiguous() # to cpu + print(name, tensor) + # del tensor to release memory if it no longer in use """ if tag is None: latest_path = os.path.join(checkpoint_dir, 'latest') @@ -506,23 +588,96 @@ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None): if not os.path.isdir(ds_checkpoint_dir): raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") - return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir) + state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + if lazy_mode: + return state_dict + else: + return to_torch_tensor(state_dict) -def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None): +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, + output_dir, + max_shard_size="5GB", + safe_serialization=False, + tag=None, + exclude_frozen_parameters=False): """ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. Args: - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) - - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``output_dir``: directory to the pytorch fp32 state_dict output files + - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB + - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters """ - state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) - print(f"Saving fp32 state dict to {output_file}") - torch.save(state_dict, output_file) + # Dependency pre-check + if safe_serialization: + try: + from safetensors.torch import save_file + except ImportError: + print('If you want to use `safe_serialization`, please `pip install safetensors`') + raise + if max_shard_size is not None: + try: + from huggingface_hub import split_torch_state_dict_into_shards + except ImportError: + print('If you want to use `max_shard_size`, please `pip install huggingface_hub`') + raise + + # Convert zero checkpoint to state_dict + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, + tag, + exclude_frozen_parameters, + lazy_mode=True) + + # Shard the model if it is too big. 
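# Illustrative sketch (not part of the patch): calling the new directory-based interface
# documented above. Paths are placeholders; ``safe_serialization`` needs the safetensors
# package and sharding (max_shard_size not None) needs huggingface_hub, matching the
# dependency pre-check above. A CLI equivalent is:
#   python zero_to_fp32.py path/checkpoint-12 path/checkpoint-12-output/ --safe_serialization
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict

convert_zero_checkpoint_to_fp32_state_dict(
    "path/checkpoint-12",           # checkpoint_dir containing the tag folder
    "path/checkpoint-12-output",    # output_dir for the consolidated fp32 shards
    max_shard_size="5GB",
    safe_serialization=True,
    exclude_frozen_parameters=False,
)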
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin" + if max_shard_size is not None: + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + # a memory-efficient approach for sharding + empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True) + state_dict_split = split_torch_state_dict_into_shards(empty_state_dict, + filename_pattern=filename_pattern, + max_shard_size=max_shard_size) + else: + from collections import namedtuple + StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"]) + state_dict_split = StateDictSplit(is_sharded=False, + filename_to_tensors={weights_name: list(state_dict.keys())}) + + # Save the model shard by shard + os.makedirs(output_dir, exist_ok=True) + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"): + shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors} + shard_state_dict = to_torch_tensor(shard_state_dict) + output_path = os.path.join(output_dir, shard_file) + if safe_serialization: + save_file(shard_state_dict, output_path, metadata={"format": "pt"}) + else: + torch.save(shard_state_dict, output_path) + # release the memory of the current shard + for tensor_name in list(shard_state_dict.keys()): + del state_dict[tensor_name] + del shard_state_dict[tensor_name] + del shard_state_dict + gc.collect() + + # Save index if sharded + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json" + save_index_file = os.path.join(output_dir, save_index_file) + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): @@ -565,23 +720,41 @@ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): if __name__ == "__main__": - parser = argparse.ArgumentParser() parser.add_argument("checkpoint_dir", type=str, help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument("output_dir", + type=str, + help="directory to the pytorch fp32 state_dict output files " + "(e.g. path/checkpoint-12-output/)") parser.add_argument( - "output_file", + "--max_shard_size", type=str, - help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + default="5GB", + help="The maximum size for a checkpoint before being sharded. Checkpoint shards will then each be of a size " + "lower than this size. If expressed as a string, it needs to be digits followed by a unit (like `5MB`). " + "We default it to 5GB in order for models to be able to run easily on free-tier Google Colab instances " + "without CPU OOM issues.") + parser.add_argument( + "--safe_serialization", + default=False, + action='store_true', + help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).") parser.add_argument("-t", "--tag", type=str, default=None, help="checkpoint tag used as a unique identifier for checkpoint.
e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") parser.add_argument("-d", "--debug", action='store_true', help="enable debug") args = parser.parse_args() debug = args.debug - convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag) + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/docker/Dockerfile b/docker/Dockerfile index c5f0124b3908..263a30be27c5 100755 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.7.1-devel-ubuntu18.04 +FROM nvidia/cuda:12.2.2-devel-ubuntu20.04 ENV DEBIAN_FRONTEND noninteractive @@ -19,7 +19,7 @@ RUN apt-get update && \ curl wget vim tmux emacs less unzip \ htop iftop iotop ca-certificates openssh-client openssh-server \ rsync iputils-ping net-tools sudo \ - llvm-9-dev + llvm-dev ############################################################################## # Installation Latest Git @@ -40,20 +40,20 @@ RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \ ############################################################################## # Mellanox OFED ############################################################################## -ENV MLNX_OFED_VERSION=4.6-1.0.1.1 +ENV MLNX_OFED_VERSION=4.9-7.1.0.0 RUN apt-get install -y libnuma-dev RUN cd ${STAGE_DIR} && \ - wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \ - cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \ + wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64.tgz | tar xzf - && \ + cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64 && \ ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \ cd ${STAGE_DIR} && \ - rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64* + rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu20.04-x86_64* ############################################################################## # nv_peer_mem ############################################################################## -ENV NV_PEER_MEM_VERSION=1.1 -ENV NV_PEER_MEM_TAG=1.1-0 +ENV NV_PEER_MEM_VERSION=1.2 +ENV NV_PEER_MEM_TAG=${NV_PEER_MEM_VERSION}-0 RUN mkdir -p ${STAGE_DIR} && \ git clone https://github.com/Mellanox/nv_peer_memory.git --branch ${NV_PEER_MEM_TAG} ${STAGE_DIR}/nv_peer_memory && \ cd ${STAGE_DIR}/nv_peer_memory && \ @@ -69,8 +69,8 @@ RUN mkdir -p ${STAGE_DIR} && \ ############################################################################## # OPENMPI ############################################################################## -ENV OPENMPI_BASEVERSION=4.0 -ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1 +ENV OPENMPI_BASEVERSION=4.1 +ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.6 RUN cd ${STAGE_DIR} && \ wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \ cd openmpi-${OPENMPI_VERSION} && \ @@ -106,12 +106,6 @@ RUN apt-get install -y python3 python3-dev && \ RUN pip install pyyaml RUN pip install ipython -############################################################################## -# TensorFlow 
-############################################################################## -ENV TENSORFLOW_VERSION=1.15.2 -RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} - ############################################################################## # Some Packages ############################################################################## @@ -138,16 +132,13 @@ RUN pip install psutil \ sentencepiece \ msgpack \ requests \ - pandas \ sphinx \ sphinx_rtd_theme \ scipy \ numpy \ - sklearn \ scikit-learn \ nvidia-ml-py3 \ - mpi4py \ - cupy-cuda100 + mpi4py ############################################################################## ## SSH daemon port inside container cannot conflict with host OS port @@ -159,12 +150,8 @@ RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \ ############################################################################## # PyTorch ############################################################################## -ENV PYTORCH_VERSION=1.9.0 -ENV TORCHVISION_VERSION=0.10.0 -ENV TENSORBOARDX_VERSION=1.8 +ENV PYTORCH_VERSION=1.13.0 RUN pip install torch==${PYTORCH_VERSION} -RUN pip install torchvision==${TORCHVISION_VERSION} -RUN pip install tensorboardX==${TENSORBOARDX_VERSION} ############################################################################## # PyYAML build issue @@ -187,7 +174,7 @@ USER deepspeed ############################################################################## # DeepSpeed ############################################################################## -RUN git clone https://github.com/microsoft/DeepSpeed.git ${STAGE_DIR}/DeepSpeed +RUN git clone https://github.com/deepspeedai/DeepSpeed.git ${STAGE_DIR}/DeepSpeed RUN cd ${STAGE_DIR}/DeepSpeed && \ git checkout . && \ git checkout master && \ diff --git a/docker/gh-builder/Dockerfile.py311 b/docker/gh-builder/Dockerfile.py311 new file mode 100644 index 000000000000..603fb614314f --- /dev/null +++ b/docker/gh-builder/Dockerfile.py311 @@ -0,0 +1,35 @@ +# Start with NGC container +FROM nvcr.io/nvidia/pytorch:24.03-py3 + +# Set noninteractive mode for apt-get +ARG DEBIAN_FRONTEND=noninteractive + +# Install necessary dependencies for building Python +RUN apt-get update && apt-get install -y \ + wget \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + curl \ + libncursesw5-dev \ + libgdbm-dev \ + libc6-dev \ + libffi-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* + +# Download and install Python 3.11 +RUN wget https://www.python.org/ftp/python/3.11.9/Python-3.11.9.tgz \ + && tar xzf Python-3.11.9.tgz \ + && cd Python-3.11.9 \ + && ./configure --enable-optimizations \ + && make altinstall \ + && cd .. 
\ + && rm -rf Python-3.11.9 Python-3.11.9.tgz + +# Set Python 3.11 as the default Python version +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.11 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.11 1 diff --git a/docker/gh-builder/Dockerfile.py312 b/docker/gh-builder/Dockerfile.py312 new file mode 100644 index 000000000000..a0a7193201d4 --- /dev/null +++ b/docker/gh-builder/Dockerfile.py312 @@ -0,0 +1,35 @@ +# Start with NGC container +FROM nvcr.io/nvidia/pytorch:24.03-py3 + +# Set noninteractive mode for apt-get +ARG DEBIAN_FRONTEND=noninteractive + +# Install necessary dependencies for building Python +RUN apt-get update && apt-get install -y \ + wget \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + curl \ + libncursesw5-dev \ + libgdbm-dev \ + libc6-dev \ + libffi-dev \ + tk-dev \ + && rm -rf /var/lib/apt/lists/* + +# Download and install Python 3.12 +RUN wget https://www.python.org/ftp/python/3.12.5/Python-3.12.5.tgz \ + && tar xzf Python-3.12.5.tgz \ + && cd Python-3.12.5 \ + && ./configure --enable-optimizations \ + && make altinstall \ + && cd .. \ + && rm -rf Python-3.12.5 Python-3.12.5.tgz + +# Set Python 3.12 as the default Python version +RUN update-alternatives --install /usr/bin/python python /usr/local/bin/python3.12 1 \ + && update-alternatives --install /usr/bin/python3 python3 /usr/local/bin/python3.12 1 diff --git a/docker/Dockerfile.rocm b/docker/rocm/Dockerfile similarity index 100% rename from docker/Dockerfile.rocm rename to docker/rocm/Dockerfile diff --git a/docs/CNAME b/docs/CNAME index 72033bc5f7fe..47f170e64eeb 100644 --- a/docs/CNAME +++ b/docs/CNAME @@ -1 +1 @@ -www.deepspeed.ai +www.deepspeed.ai \ No newline at end of file diff --git a/docs/Gemfile b/docs/Gemfile index 888e3c8dfd6a..f40c61e4575f 100644 --- a/docs/Gemfile +++ b/docs/Gemfile @@ -20,3 +20,5 @@ end # Performance-booster for watching directories on Windows gem "wdm", "~> 0.1.1", :install_if => Gem.win_platform? + +gem "webrick", "~> 1.8" diff --git a/docs/README.md b/docs/README.md index 0c3aaaeda600..7333a119c7be 100644 --- a/docs/README.md +++ b/docs/README.md @@ -42,6 +42,16 @@ We now need to install the required Ruby packages for the website. 
bundle install ``` +Depending on your environment, you may need to add `webrick` to avoid the following [error](https://talk.jekyllrb.com/t/load-error-cannot-load-such-file-webrick/5417/6): + +> gems/gems/jekyll-3.9.5/lib/jekyll/commands/serve/servlet.rb:3:in `require': cannot load such file -- webrick (LoadError) + + +``` +bundle add webrick +``` + + You can now start a local webserver via: ``` bundle exec jekyll serve diff --git a/docs/_data/navigation.yml b/docs/_data/navigation.yml index 217d56c14812..46875d75059b 100755 --- a/docs/_data/navigation.yml +++ b/docs/_data/navigation.yml @@ -8,7 +8,7 @@ main: - title: 'Documentation' url: https://deepspeed.readthedocs.io/ - title: 'GitHub' - url: https://github.com/microsoft/DeepSpeed + url: https://github.com/deepspeedai/DeepSpeed lnav: - title: 'Training' @@ -41,7 +41,7 @@ lnav: - title: 'Flops Profiler' url: /docs/config-json/#flops-profiler - title: 'Monitoring' - url: /docs/config-json/#monitoring-module-tensorboard-wandb-csv + url: /docs/config-json/#monitoring-module - title: 'Communication Logging' url: /docs/config-json/#communication-logging - title: 'Model Compression' @@ -55,6 +55,10 @@ lnav: url: /getting-started/ - title: 'Getting started on Azure' url: /tutorials/azure/ + - title: 'Accelerator Abstraction' + url: /tutorials/accelerator-abstraction-interface/ + - title: 'Accelerator Setup Guides' + url: /tutorials/accelerator-setup-guide/ - title: 'Automatic Tensor Parallelism' url: /tutorials/automatic-tensor-parallelism/ - title: 'Autotuning' @@ -69,6 +73,10 @@ lnav: url: /tutorials/curriculum-learning/ - title: 'Data Efficiency' url: /tutorials/data-efficiency/ + - title: 'DeepNVMe' + url: /tutorials/deepnvme/ + - title: 'Domino' + url: /tutorials/domino/ - title: 'DS4Sci_EvoformerAttention' url: /tutorials/ds4sci_evoformerattention/ - title: 'Flops Profiler' diff --git a/docs/_pages/config-json.md b/docs/_pages/config-json.md index e9d7166b05b3..43de95b5210b 100755 --- a/docs/_pages/config-json.md +++ b/docs/_pages/config-json.md @@ -6,7 +6,7 @@ toc_label: "Contents" ### Batch Size Related Parameters -**Note:** **train_batch_size** must be equal to **train_micro_batch_size_per_gpu** * **gradient_accumulation** * number of GPUs. For simplicity, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed. +**Note:** **train_batch_size** must be equal to **train_micro_batch_size_per_gpu** * **gradient_accumulation_steps** * number of GPUs. For simplicity, you can choose to only specify two of the three parameters, the last one will be inferred automatically by DeepSpeed. 
{: .notice--warning} **train_batch_size**: [integer] @@ -371,11 +371,12 @@ Enabling and configuring ZeRO memory optimizations "sub_group_size" : 1e12, "elastic_checkpoint" : [true|false], "stage3_gather_16bit_weights_on_model_save": [true|false], - "ignore_unused_parameters": [true|false] - "round_robin_gradients": [true|false] - "zero_hpz_partition_size": 1 - "zero_quantized_weights": [true|false] - "zero_quantized_gradients": [true|false] + "ignore_unused_parameters": [true|false], + "round_robin_gradients": [true|false], + "zero_hpz_partition_size": 1, + "zero_quantized_weights": [true|false], + "zero_quantized_gradients": [true|false], + "log_trace_cache_warnings": [true|false], } ``` @@ -480,7 +481,7 @@ Enabling and configuring ZeRO memory optimizations | Description | Default | | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | `1e6` | +| Do not partition parameters smaller than this threshold. Smaller values use less memory, but can greatly increase communication (especially latency-bound messages). | `1e5` | ***stage3_gather_16bit_weights_on_model_save***: [boolean] @@ -489,6 +490,11 @@ Enabling and configuring ZeRO memory optimizations |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- | | Consolidate the weights before saving the model by `save_16bit_model()`. Since the weights are partitioned across GPUs, they aren't part of `state_dict`, so this function automatically gathers the weights when this option is enabled and then saves the fp16 model weights. | `False` | +***stage3_module_granularity_threshold***: [integer] +| Description | Default | +|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ------- | +| The granularity of a module is determined by the ratio of `parameter_count` / `(1 + descendant_count)`. ZeRO3 classifies modules with a granularity below the threshold as fine-grained, treating them as integral units during parameter fetching. This reduces host and communication overhead from separate hooks. | `0` | + ***zero_hpz_partition_size***: [integer] | Description | Default | @@ -507,6 +513,12 @@ Enabling and configuring ZeRO memory optimizations | ----------------------------------------------------------------------------------------------------------------------------------- | ------- | |Boolean indicating whether to enable communication efficient quantized gradients of ZeRO++. | `False` | +**log_trace_cache_warnings**: [boolean] + +| Description | Default | +| ------------------------------------------------------------------------------------------------------------------- | ------- | +| Log warnings from trace cache optimization of parameter sharding, such as cache invalidation events. 
| `False` | + ***cpu_offload***: [boolean] **Deprecated:** **cpu_offload** is deprecated and will be removed in future, please use `offload_optimizer` instead. @@ -663,7 +675,7 @@ Configuring the asynchronous I/O module for offloading parameter and optimizer s | Description | Default | | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | -| Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. This is set to `False` by default, which means unused parameters are ignored and training continues. Now is just used in stage 2. | `True` | +| Unused parameters in modules may be unexpected in static networks, but could be normal in dynamic networks. This controls whether or not training should terminate with an error message when unused parameters are detected. This is set to `True` by default, which means unused parameters are ignored and training continues. Now is just used in stage 2. | `True` | ### Logging @@ -1139,15 +1151,16 @@ DeepSpeed Data Efficiency Library includes two techniques: curriculum learning a | ---------------------------------------------------------------------------------------------------------------------------- | ------- | | List of which step to change difficulty level. One of the `schedule_config` when the `fixed_discrete` schedule_type is used. | N/A | -### Monitoring Module (TensorBoard, WandB, CSV) +### Monitoring Module **Note:** Deepspeed logs to TensorBoard through PyTorch. Logging to TensorBoard requires that the `tensorboard` package is installed (read more in the [PyTorch documentation](https://pytorch.org/docs/1.8.0/tensorboard.html)). {: .notice--warning} **Note:** Logging to WandB requires that the `wandb` package is installed (read more in the [WandB documentation](https://docs.wandb.ai/quickstart)). {: .notice--warning} +**Note:** Logging to Comet requires that the `comet_ml` package is installed (read more in the [Comet documentation](https://www.comet.com/docs/v2/guides/quickstart/#1-install-and-configure-the-comet-ml-sdk)). +{: .notice--warning} - -Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), or to simple CSV files. Below is an overview of what DeepSpeed will log automatically. +Deepspeed's Monitor module can log training details into a [Tensorboard](https://www.tensorflow.org/tensorboard)-compatible file, to [WandB](https://wandb.ai/site), to [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=docs) or to simple CSV files. Below is an overview of what DeepSpeed will log automatically. 
| Field | Description |Conditions | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----- | @@ -1201,6 +1214,36 @@ Example of **wandb** configuration: } ``` +**comet**: [dictionary] + +| Fields | Value | Default | +|--- |--- |--- | +| enabled | Whether logging to [Comet](https://www.comet.com/site/) is enabled. | `false` | +| workspace | Comet workspace name. | `None` | +| project | Comet project name. | `None` | +| samples_log_interval | Metrics will be submitted to Comet after processing every `samples_log_interval` samples. | `100` | +| experiment_name | The name of the Comet experiment to be used for logging. | `None` | +| api_key | Comet API key. It's not recommended to save the Comet API Key in code. | `None` | +| experiment_key | The key of the Comet experiment to be used for logging. Must be an alphanumeric string whose length is between 32 and 50 characters. | `None` | +| online | If True, the data will be logged to the Comet server, otherwise it will be stored locally in an offline experiment. Default is `True`. | `None` | +| mode | Control how the Comet experiment is started. "get": Continue logging to an existing experiment identified by the `experiment_key` value. "create": Always creates a new experiment, useful for HPO sweeps. "get_or_create" (default): Starts a fresh experiment if required, or persists logging to an existing one. | `None` | + + +Example of **comet** configuration: + +```json +"comet": { + "enabled": true, + "workspace": "my_workspace", + "project": "my_project", + "samples_log_interval": 50, + "experiment_name": "llama-fine-tuning", + "experiment_key": "0c4a1c4a90664f2a8084e600b19a9d7", + "online": false, + "mode": "get" +} +``` + **csv_monitor**: [dictionary] | Fields | Value |Default | diff --git a/docs/_pages/deepspeed4science.md b/docs/_pages/deepspeed4science.md index b35351838f22..b1aa706ad180 100755 --- a/docs/_pages/deepspeed4science.md +++ b/docs/_pages/deepspeed4science.md @@ -24,14 +24,14 @@ To cite DeepSpeed4Science, please cite our [white paper](https://arxiv.org/abs/2 ## New Megatron-DeepSpeed for Large-Scale AI4Science Model Training -We are proud to introduce [new Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed), which is an updated framework for large-scale model training. We rebased and enabled DeepSpeed with the newest Megatron-LM for long sequence support and many other capabilities. With the new Megatron-DeepSpeed, users can now train their large AI4Science models like GenSLMs with much longer sequences via a synergetic combination of ZeRO-style data parallelism, tensor parallelism, sequence parallelism, pipeline parallelism, model state offloading, and several newly added memory optimization techniques such as attention mask offloading and position embedding partitioning. +We are proud to introduce [new Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed), which is an updated framework for large-scale model training. We rebased and enabled DeepSpeed with the newest Megatron-LM for long sequence support and many other capabilities.
With the new Megatron-DeepSpeed, users can now train their large AI4Science models like GenSLMs with much longer sequences via a synergetic combination of ZeRO-style data parallelism, tensor parallelism, sequence parallelism, pipeline parallelism, model state offloading, and several newly added memory optimization techniques such as attention mask offloading and position embedding partitioning. ![new Megatron-DeepSpeed](/assets/images/new-megatron-ds.png){: .align-center}

The figure depicts system capability in terms of enabling long sequence lengths for training a 33B parameter GPT-like model using our new Megatron-DeepSpeed framework. The results show that the new Megatron-DeepSpeed enables 9x longer sequence lengths than NVIDIA's Megatron-LM without triggering out-of-memory error.

-To see how the new Megatron-DeepSpeed helps enabling new system capabilities, such as training models with massive sequences length, please read our [tutorial](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support). +To see how the new Megatron-DeepSpeed helps enabling new system capabilities, such as training models with massive sequences length, please read our [tutorial](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/deepspeed4science/megatron_long_seq_support). Meanwhile, our new Megatron-DeepSpeed has been applied to genome-scale foundation model [GenSLMs](https://github.com/ramanathanlab/genslm), which is a 2022 [ACM Gordon Bell award](https://www.acm.org/media-center/2022/november/gordon-bell-special-prize-covid-research-2022) winning genome-scale language model from Argonne National Lab. To achieve their scientific goal, GenSLMs and similar models require very long sequence support for both training and inference that is beyond generic LLM's long-sequence strategies. By leveraging DeepSpeed4Science's new Megatron-DeepSpeed, GenSLMs team is able to train their 25B model with 512K sequence length, much longer than their original 42K sequence length. Detailed information about the methodology can be found at [our website](https://deepspeed4science.ai/2023/09/18/model-showcase-genslms/). GenSLMs team also hosts an [example](https://github.com/ramanathanlab/genslm/tree/main/examples/long-sequences) about how to use DeepSpeed4Science in the GenSLMs repo. diff --git a/docs/_pages/inference.md b/docs/_pages/inference.md index d63604e1f022..fb3534872439 100755 --- a/docs/_pages/inference.md +++ b/docs/_pages/inference.md @@ -6,8 +6,10 @@ toc: true toc_label: "Contents" --- +>**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** + DeepSpeed-Inference introduces several features to efficiently serve transformer-based PyTorch models. It supports model parallelism (MP) to fit large models that would otherwise not fit in GPU memory. Even for smaller models, MP can be used to reduce latency for inference. To further reduce latency and cost, we introduce inference-customized kernels. Finally, we propose a novel approach to quantize models, called MoQ, to both shrink the model and reduce the inference cost at production. For more details on the inference related optimizations in DeepSpeed, please refer to our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/). -DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. 
For list of compatible models please see [here](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). +DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). To get started with DeepSpeed-Inference, please checkout our [tutorial](https://www.deepspeed.ai/tutorials/inference-tutorial/). diff --git a/docs/_posts/2020-05-19-bert-record.md b/docs/_posts/2020-05-19-bert-record.md index b47ad0b0beaf..67d00280348e 100644 --- a/docs/_posts/2020-05-19-bert-record.md +++ b/docs/_posts/2020-05-19-bert-record.md @@ -19,4 +19,4 @@ the same number and generation of GPUs. * Brief overview, see our [press release](https://www.microsoft.com/en-us/research/blog/ZeRO-2-deepspeed-shattering-barriers-of-deep-learning-speed-scale/). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/05/27/fastest-bert-training.html). * Tutorial on how to reproduce our results, see our [BERT pre-training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/). -* The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). +* The source code for our transformer kernels can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed) and BERT pre-training code can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples). diff --git a/docs/_posts/2020-05-28-fastest-bert-training.md b/docs/_posts/2020-05-28-fastest-bert-training.md index 62be6c1bffce..2154c36fe279 100644 --- a/docs/_posts/2020-05-28-fastest-bert-training.md +++ b/docs/_posts/2020-05-28-fastest-bert-training.md @@ -284,7 +284,7 @@ and faster convergence. To try out these optimizations and training recipe, please check out our [BERT training tutorial](https://www.deepspeed.ai/tutorials/bert-pretraining/) and source code at the [DeepSpeed GitHub -repo](https://github.com/microsoft/deepspeed). +repo](https://github.com/deepspeedai/deepspeed). ### References diff --git a/docs/_posts/2020-09-08-sparse-attention-news.md b/docs/_posts/2020-09-08-sparse-attention-news.md index 79de33a82e3a..b9a0aeb88d9b 100644 --- a/docs/_posts/2020-09-08-sparse-attention-news.md +++ b/docs/_posts/2020-09-08-sparse-attention-news.md @@ -11,4 +11,4 @@ DeepSpeed offers sparse attention kernels, an instrumental technology to support * Brief overview, see our [press release]({{ site.press_release_v3 }}). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/sparse-attention.html). * Tutorial on how to use sparse attention, see our [Sparse attention tutorial](https://www.deepspeed.ai/tutorials/sparse-attention/). 
-* The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples). +* The source code for our sparse attention kernels can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed) and BERT pre-training code using sparse attention can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples). diff --git a/docs/_posts/2020-09-09-ZeRO-Offload.md b/docs/_posts/2020-09-09-ZeRO-Offload.md index 8e2e8423fd55..e0626f791a4e 100755 --- a/docs/_posts/2020-09-09-ZeRO-Offload.md +++ b/docs/_posts/2020-09-09-ZeRO-Offload.md @@ -10,4 +10,4 @@ We introduce a new technology called ZeRO-Offload to enable **10X bigger model t * For more information on ZeRO-Offload, see our [press release]( {{ site.press_release_v3 }} ). * For more information on how to use ZeRO-Offload, see our [ZeRO-Offload tutorial](https://www.deepspeed.ai/tutorials/ZeRO-offload/). -* The source code for ZeRO-Offload can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). +* The source code for ZeRO-Offload can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). diff --git a/docs/_posts/2020-09-09-onebit-adam-news.md b/docs/_posts/2020-09-09-onebit-adam-news.md index d0adcb09987f..1fd8ef89edce 100644 --- a/docs/_posts/2020-09-09-onebit-adam-news.md +++ b/docs/_posts/2020-09-09-onebit-adam-news.md @@ -17,4 +17,4 @@ its efficient implementation in DeepSpeed. 1-bit Adam offers the ***same converg * Brief overview, see our [press release]({{ site.press_release_v3 }}). * Detailed technology deep dive, see our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html). * Tutorial on how to reproduce our results, see our [1-bit Adam tutorial](/tutorials/onebit-adam/). -* The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/microsoft/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). +* The source code for 1-bit Adam can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). The implementation of 1-bit Adam is in [onebit_adam.py](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/runtime/fp16/onebit_adam.py) and CUDA-Aware communication for 1-bit Adam is in [custom_collectives.py](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/runtime/custom_collectives.py). Example codes to try this feature can be found in the [DeepSpeedExamples repo](https://github.com/deepspeedai/deepspeedexamples) as shown in the [tutorial](/tutorials/onebit-adam/). diff --git a/docs/_posts/2020-09-09-pipeline-parallelism.md b/docs/_posts/2020-09-09-pipeline-parallelism.md index 48343ebd8d1e..fe708bc4d50d 100644 --- a/docs/_posts/2020-09-09-pipeline-parallelism.md +++ b/docs/_posts/2020-09-09-pipeline-parallelism.md @@ -14,5 +14,5 @@ low-bandwidth network by up to 7x. 
* For a brief overview and results including trillion-parameter capabilities, see our [press release]({{ site.press_release_v3 }}). * To get started with pipeline parallel training in DeepSpeed, we recommend our [tutorial](/tutorials/pipeline/). -* See our AlexNet example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). +* See our AlexNet example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). * Read our API documentation on [readthedocs](https://deepspeed.readthedocs.io/en/latest/pipeline.html). diff --git a/docs/_posts/2020-09-09-sparse-attention.md b/docs/_posts/2020-09-09-sparse-attention.md index 9675ef1058dd..1ab827d6fc8e 100644 --- a/docs/_posts/2020-09-09-sparse-attention.md +++ b/docs/_posts/2020-09-09-sparse-attention.md @@ -28,7 +28,7 @@ In a pre-training experiment, we ran BERT model under three settings: dense, den ![Maximum sequence runnable on BERT](/assets/images/sa_maximum_sequence_runnable_on_bert.png){: .align-center} * **Up to 6.3x faster computation** -We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. +We continued the pre-training experiment for different batch sizes and sequence lengths, using [BERT base/large](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert) and [Megatron GPT2](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM). In this experiment we let the training to continue for 100 iteration and recorded the average time per last 30 iterations. SA reduces total computation comparing with dense and improves training speed: the boost is higher with increased sequence length and it is up to 6.3x faster for BERT base, 5.3x for BERT large, and 6.1x for GPT2. Following charts show these results. ![Training time for BERT base with varying sequence length](/assets/images/sa_bert_base_time_result.png){: .align-center} diff --git a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md index ee518f53f012..da07edd7b922 100755 --- a/docs/_posts/2020-10-28-progressive-layer-dropping-news.md +++ b/docs/_posts/2020-10-28-progressive-layer-dropping-news.md @@ -10,4 +10,4 @@ We introduce a new technology called progressive layer dropping (PLD) to speedup * For detailed technology deep dive, see our [technical report](https://arxiv.org/pdf/2010.13369.pdf). * For more information on how to use PLD, see our [Progressive layer dropping tutorial](https://www.deepspeed.ai/tutorials/progressive_layer_dropping/). - * The source code for PLD is now available at the [DeepSpeed repo](https://github.com/microsoft/deepspeed). + * The source code for PLD is now available at the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). 
diff --git a/docs/_posts/2021-11-15-autotuning.md b/docs/_posts/2021-11-15-autotuning.md index 71acf54438ea..410e32c878a3 100644 --- a/docs/_posts/2021-11-15-autotuning.md +++ b/docs/_posts/2021-11-15-autotuning.md @@ -8,8 +8,8 @@ toc: false We introduce a new feature called Autotuning to automatically discover the optimal DeepSpeed configuration that delivers good training speed. One pain point in model training is to figure out good performance-relevant configurations such as micro-batch size to fully utilize the hardware and achieve a high throughput number. This configuration exploring process is commonly done manually but is important since model training is repeated many times and benefits from using a good configuration. Not only is the hand-tuning process time-consuming, but the outcome is hardware-dependent. This means that a good configuration on one hardware might not be the best on another different hardware. The user thus has to hand tune the configuration again. With DeepSpeed, there are more configuration parameters that could potentially affect the training speed, thus making it more tedious to manually tune the configuration. -The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) would demonstrate the effectiveness of autotuning across different models. +The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) would demonstrate the effectiveness of autotuning across different models. * For a brief overview, see the [Autotuning tutorial](https://www.deepspeed.ai/tutorials/autotuning/). -* For more information on how to use Autotuning, see the [Autotuning README](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#deepspeed-autotuning). -* The source code can be found in the [DeepSpeed repo](https://github.com/microsoft/deepspeed). +* For more information on how to use Autotuning, see the [Autotuning README](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#deepspeed-autotuning). +* The source code can be found in the [DeepSpeed repo](https://github.com/deepspeedai/deepspeed). diff --git a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md index 99a62fbe00ea..69fef131d3c0 100644 --- a/docs/_posts/2021-12-09-deepspeed-moe-nlg.md +++ b/docs/_posts/2021-12-09-deepspeed-moe-nlg.md @@ -170,9 +170,9 @@ high quality language models accessible to a broad audience, even with limited compute resources. 
To this end we are releasing our [end-to-end pipeline for training MoE based -NLG models](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training), +NLG models](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/moe-training), along with [specific example -scripts](https://github.com/microsoft/Megatron-DeepSpeed/tree/moe-training/examples_deepspeed/MoE) +scripts](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/moe-training/examples_deepspeed/MoE) and [tutorial](/tutorials/mixture-of-experts-nlg) to help get started with our pipeline. We look forward to the application and the innovations that this may bring to the deep learning community. diff --git a/docs/_posts/2022-07-26-deepspeed-azure.md b/docs/_posts/2022-07-26-deepspeed-azure.md index 749be582d9a0..540f74d4be1b 100644 --- a/docs/_posts/2022-07-26-deepspeed-azure.md +++ b/docs/_posts/2022-07-26-deepspeed-azure.md @@ -19,7 +19,7 @@ In this extended post, we share the details of how DeepSpeed users can train tri ## Making distributed training faster and easier on Azure using DeepSpeed -We compare the existing manual and error-prone workflow with our proposed easy-to-use workflow for DeepSpeed on Azure in *Figure 2*. Customers can now use easy-to-use [training pipelines](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed) to launch training jobs at scale. The new workflow reduces the number of steps from 11 to just 1 if users rely on the recommended [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/) [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). +We compare the existing manual and error-prone workflow with our proposed easy-to-use workflow for DeepSpeed on Azure in *Figure 2*. Customers can now use easy-to-use [training pipelines](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed) to launch training jobs at scale. The new workflow reduces the number of steps from 11 to just 1 if users rely on the recommended [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/) [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). ![Workflow](/assets/images/old-vs-new-azure.png){: .align-center} @@ -29,7 +29,7 @@ We compare the existing manual and error-prone workflow with our proposed easy-t For users who have custom environments built using Azure VMs or [Azure VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview), only two steps are needed: - 1) Run the cluster setup script (to be released in the next few weeks) -- 2) Use the Azure VMSS [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) to launch training. +- 2) Use the Azure VMSS [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) to launch training. ## Key Performance Benefits We already shared a summary of our key performance results in the Azure [announcement](https://azure.microsoft.com/en-us/blog/azure-empowers-easytouse-highperformance-and-hyperscale-model-training-using-deepspeed/). We enable the capability to train 2x larger model sizes (2 trillion vs. 1 trillion parameters), scale to 2x more GPUs (1024 vs. 512), and offer up to 1.8x higher compute throughput/GPU (150 TFLOPs vs. 81 TFLOPs) compared to other [cloud providers](https://medium.com/pytorch/training-a-1-trillion-parameter-model-with-pytorch-fully-sharded-data-parallel-on-aws-3ac13aa96cff). 
@@ -48,7 +48,7 @@ We share the details of our experimental setup and some of the best practices we We used [NDm A100 v4-series](https://docs.microsoft.com/en-us/azure/virtual-machines/ndm-a100-v4-series) instances in our experiments. Each instance includes two socket AMD EPYC 7V12 64-Core CPUs, 1.7TB main memory and eight A100 80GB GPUs. The system has a balanced PCIe topology connecting 4 GPU devices to each CPU socket. Each GPU within the VM is provided with its own dedicated, topology-agnostic 200 Gb/s NVIDIA Mellanox HDR InfiniBand connection providing an accelerated 200 Gbps high speed fabric. The DeepSpeed library exploits offload capabilities where the activation and optimizer states are allocated in the main memory. Hence, 1.7TB memory capacity per node helps us to scale to large model sizes. ### Training setup using AzureML -Users can directly use the AzureML studio and use our published [recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) to run experiments without any additional setup. This is the easiest and recommended way of running experiments on Azure. +Users can directly use the AzureML studio and use our published [recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) to run experiments without any additional setup. This is the easiest and recommended way of running experiments on Azure. ### Training setup using Azure VMSS @@ -59,7 +59,7 @@ A cluster is created using Azure Virtual Machine Scale Sets (VMSS) to provision | ------------------------------: | :----------------: | | PyTorch | 1.10.2 (installed from source) | | DeepSpeed | 0.6.2 (installed from source) | -| Megatron-LM | [https://github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) | +| Megatron-LM | [https://github.com/deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) | | Apex | 0.1 | | NCCL | 2.12.10 | | CUDNN | 8.2.4.15 | @@ -122,9 +122,9 @@ The 2T parameter model consists of 160 layers, 32k hidden dimension, and 128 att We recognize that DeepSpeed users are diverse and have different environments. In this tutorial, our focus is on making things simpler for users who plan to run large model training experiments on Azure. -> The easiest way to do model training on Azure is via the Azure ML recipes. The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). Users simply need to setup their Azure ML workspace following the [guide](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up) and submit experiment using the aml_submit.py file. +> The easiest way to do model training on Azure is via the Azure ML recipes. The job submission and data preparation scripts have been made available [here](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). Users simply need to setup their Azure ML workspace following the [guide](https://github.com/Azure/azureml-examples/tree/main/python-sdk#set-up) and submit experiment using the aml_submit.py file. -Some users have customized environments built on top of Azure VMs and VMSS based clusters. To simplify training on such setups, we are working on an easy-to-use cluster setup script that will be published in the next few weeks. 
If you already have a cluster setup running, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) for the 175B and the 1T model. The recipes can easily be modified to train other model configurations. +Some users have customized environments built on top of Azure VMs and VMSS based clusters. To simplify training on such setups, we are working on an easy-to-use cluster setup script that will be published in the next few weeks. If you already have a cluster setup running, you can use the [azure recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) for the 175B and the 1T model. The recipes can easily be modified to train other model configurations. ## Acknowledgement diff --git a/docs/_posts/2022-09-10-zero-inference.md b/docs/_posts/2022-09-10-zero-inference.md index 59a3e3bf15fa..3c588e39c1dc 100644 --- a/docs/_posts/2022-09-10-zero-inference.md +++ b/docs/_posts/2022-09-10-zero-inference.md @@ -83,7 +83,7 @@ Next, we measure the impact on generation throughput using four V100-32GB GPUs. We briefly discuss how users can determine when ZeRO-Inference is suitable for their application and how to enable ZeRO-Inference in DeepSpeed. ### When to use ZeRO-Inference -ZeRO-Inference is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. Also, ZeRO-Inference is optimized for inference applications that are **throughput-oriented** and allow **large batch sizes**. Alternative techniques, such as [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed-Inference](https://www.deepspeed.ai/inference/), and [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) that fit the entire model into GPU memory, possibly using multiple GPUs, are more suitable for inference applications that are latency sensitive or have small batch sizes. +ZeRO-Inference is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. Also, ZeRO-Inference is optimized for inference applications that are **throughput-oriented** and allow **large batch sizes**. Alternative techniques, such as [Accelerate](https://github.com/huggingface/accelerate), [DeepSpeed-Inference](https://www.deepspeed.ai/inference/), and [DeepSpeed-MII](https://github.com/deepspeedai/deepspeed-mii) that fit the entire model into GPU memory, possibly using multiple GPUs, are more suitable for inference applications that are latency sensitive or have small batch sizes. ### How to use ZeRO-Inference ZeRO-Inference is available in the DeepSpeed library versions >= 0.6.6. Integrating ZeRO-Inference into token generation pipelines, such as [Hugging Face generate](https://huggingface.co/docs/transformers/main_classes/text_generation), requires updating the DeepSpeed configuration to set [ZeRO optimization](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training) to stage 3 and [parameter offloading](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) to CPU or NVMe. 
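(Editor's note on the ZeRO-Inference configuration described just above: the sketch below shows one possible DeepSpeed config fragment, written as a Python dict, that sets ZeRO stage 3 with CPU parameter offloading. The specific values and the surrounding usage are assumptions for illustration, not part of this change.)

```python
# Assumed minimal DeepSpeed config sketch for ZeRO-Inference:
# ZeRO stage 3 plus parameter offloading to CPU, as the blog post describes.
ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,
        "offload_param": {          # parameter offloading target: "cpu" or "nvme"
            "device": "cpu",
            "pin_memory": True,
        },
    },
    "train_micro_batch_size_per_gpu": 1,  # batch size seen by the engine
}
# This dict would then typically be passed to deepspeed.initialize(...) before
# running a token generation pipeline such as Hugging Face generate().
```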
diff --git a/docs/_posts/2022-10-11-mii.md b/docs/_posts/2022-10-11-mii.md index e0b43f51b1e0..324b7ffbad33 100644 --- a/docs/_posts/2022-10-11-mii.md +++ b/docs/_posts/2022-10-11-mii.md @@ -11,7 +11,7 @@ The Deep Learning (DL) open-source community has seen tremendous growth in the l There has been significant progress in system optimizations for DL model inference that can drastically reduce both latency and cost, but those are not easily accessible. The main reason for this limited accessibility is that the DL model inference landscape is diverse with models varying in size, architecture, system performance characteristics, hardware requirements, etc. Identifying the appropriate set of system optimizations applicable to a given model and applying them correctly is often beyond the scope of most data scientists, making low latency and low-cost inference mostly inaccessible. -[DeepSpeed Model Implementations for Inference (MII)](https://github.com/microsoft/DeepSpeed-MII) is a new open-source python library from DeepSpeed, aimed towards making low-latency, low-cost inference of powerful models not only feasible but also easily accessible. +[DeepSpeed Model Implementations for Inference (MII)](https://github.com/deepspeedai/DeepSpeed-MII) is a new open-source python library from DeepSpeed, aimed towards making low-latency, low-cost inference of powerful models not only feasible but also easily accessible. * MII offers access to highly optimized implementations of **thousands of widely used DL models.** * MII supported models achieve significantly lower latency and cost compared to their original implementation. @@ -33,7 +33,7 @@ Under-the-hood MII is powered by [DeepSpeed-Inference](https://arxiv.org/abs/220 MII supports a growing list of tasks such as text generation, question-answering, text classification, etc, across thousands of transformer models available through multiple open-sourced model repositories such as Hugging Face, FairSeq, EluetherAI, etc. It supports dense models based on BERT, RoBERTa, GPT, OPT, and BLOOM architectures ranging from a few hundred million parameters in size to hundreds of billions of parameters in size. At the same time, it supports recent image generation models such as Stable Diffusion. -See the MII GitHub repo for an up-to-date list of [models and tasks supported by MII](https://github.com/microsoft/deepspeed-mii#supported-models-and-tasks). +See the MII GitHub repo for an up-to-date list of [models and tasks supported by MII](https://github.com/deepspeedai/deepspeed-mii#supported-models-and-tasks). # Inference Optimizations with MII @@ -133,7 +133,7 @@ mii.deploy(task="text-to-image", deployment_type=DeploymentType.AML) ``` -To learn more about these deployment options and get started with MII, please the [MII getting started guide](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii). +To learn more about these deployment options and get started with MII, please the [MII getting started guide](https://github.com/deepspeedai/deepspeed-mii#getting-started-with-mii). 
# Concluding Remarks diff --git a/docs/_posts/2022-12-12-data-efficiency.md b/docs/_posts/2022-12-12-data-efficiency.md index 52148707b767..82931a30e167 100644 --- a/docs/_posts/2022-12-12-data-efficiency.md +++ b/docs/_posts/2022-12-12-data-efficiency.md @@ -141,4 +141,4 @@ The composed DeepSpeed Data Efficiency solution leverages both data efficiency t # Concluding Remarks -We are very excited to share DeepSpeed Data Efficiency library with the community and improve it with your feedback. Please find the code, tutorial, and documents at the [DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed), and [website](/tutorials/data-efficiency/). And for more technical details please read our [Random-LTD paper](https://arxiv.org/abs/2211.11586) and [DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597). We believe that our composable library and novel data efficiency techniques will help users reduce training cost while maintaining model quality or achieve better quality under similar cost. And we hope DeepSpeed Data Efficiency could become a platform that motivates and accelerates future research on deep learning data efficiency. +We are very excited to share DeepSpeed Data Efficiency library with the community and improve it with your feedback. Please find the code, tutorial, and documents at the [DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed), and [website](/tutorials/data-efficiency/). And for more technical details please read our [Random-LTD paper](https://arxiv.org/abs/2211.11586) and [DeepSpeed Data Efficiency paper](https://arxiv.org/abs/2212.03597). We believe that our composable library and novel data efficiency techniques will help users reduce training cost while maintaining model quality or achieve better quality under similar cost. And we hope DeepSpeed Data Efficiency could become a platform that motivates and accelerates future research on deep learning data efficiency. diff --git a/docs/_posts/2023-03-31-multi-modal.md b/docs/_posts/2023-03-31-multi-modal.md index aaef9cfbfd2a..63ea2f94f850 100644 --- a/docs/_posts/2023-03-31-multi-modal.md +++ b/docs/_posts/2023-03-31-multi-modal.md @@ -34,4 +34,4 @@ Specifically, we incorporate the MoE structure into the classical single-tower m A sophisticated MoE model design requires a highly efficient and scalable training system that can support multi-dimensional parallelism and efficient memory management. [DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) training system offers such advanced capabilities including easy-to-use APIs enabling flexible combinations of data, tensor, and expert parallelism. Furthermore, DeepSpeed MoE enables larger model scale than state-of-the-art systems by exploiting expert parallelism and [ZeRO optimizations](https://arxiv.org/abs/1910.02054) together. By leveraging the DeepSpeed MoE system, VL-MoE Base with 32 experts achieves similar model quality as VLMO-dense Large with about 2.5x training speedup. -[DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) system is already open-sourced and can be easily used as plug-and-play component to achieve high-performance low-cost training for any large-scale MoE models. The tutorial of how to use DeepSpeed MoE is available [here](https://www.deepspeed.ai/tutorials/mixture-of-experts/). 
VL-MoE is currently in the process of being integrated as a model example of [DeepSpeed Examples](https://github.com/microsoft/DeepSpeedExamples). Please stay tuned for our upcoming updates on this thread. +[DeepSpeed MoE](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) system is already open-sourced and can be easily used as plug-and-play component to achieve high-performance low-cost training for any large-scale MoE models. The tutorial of how to use DeepSpeed MoE is available [here](https://www.deepspeed.ai/tutorials/mixture-of-experts/). VL-MoE is currently in the process of being integrated as a model example of [DeepSpeed Examples](https://github.com/deepspeedai/DeepSpeedExamples). Please stay tuned for our upcoming updates on this thread. diff --git a/docs/_posts/2023-04-24-deepspeed-chat-chinese.md b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md index 2fd962327b54..57a77caab32d 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat-chinese.md +++ b/docs/_posts/2023-04-24-deepspeed-chat-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: 一键式RLHF训练,让你的类ChatGPT千亿大模型提速省钱15倍" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/chinese/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF Chinese --- diff --git a/docs/_posts/2023-04-24-deepspeed-chat-japanese.md b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md index 63200846ab65..ee3c8dca00fa 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat-japanese.md +++ b/docs/_posts/2023-04-24-deepspeed-chat-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: ChatGPTライクなモデルを簡単・高速・低コストに、あらゆるスケールで学習" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/japanese/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF Japanese --- diff --git a/docs/_posts/2023-04-24-deepspeed-chat.md b/docs/_posts/2023-04-24-deepspeed-chat.md index 70b627b951ee..f6cad798ca99 100644 --- a/docs/_posts/2023-04-24-deepspeed-chat.md +++ b/docs/_posts/2023-04-24-deepspeed-chat.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-chat/README.md date: 2023-04-24 00:00:00 tags: training ZeRO RLHF English --- diff --git a/docs/_posts/2023-06-22-zeropp-chinese.md b/docs/_posts/2023-06-22-zeropp-chinese.md index ca52dd5f59ab..71dc2d51cb70 100644 --- a/docs/_posts/2023-06-22-zeropp-chinese.md +++ b/docs/_posts/2023-06-22-zeropp-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed ZeRO++:降低4倍网络通信,显著提高大模型及类ChatGPT模型训练效率" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/chinese/README.md date: 2023-06-22 00:00:00 tags: training ZeRO RLHF Chinese --- diff --git a/docs/_posts/2023-06-22-zeropp-japanese.md b/docs/_posts/2023-06-22-zeropp-japanese.md index 745fcac41d97..e81013d11aba 100644 --- a/docs/_posts/2023-06-22-zeropp-japanese.md +++ b/docs/_posts/2023-06-22-zeropp-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed ZeRO++: LLMやチャットモデルの訓練を劇的に高速化 – 
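For readers who want to see what the expert-parallel API referenced above looks like, here is a minimal, hedged sketch of wrapping a dense feed-forward block in DeepSpeed's MoE layer; the hidden size, expert count, and gating settings are illustrative, and expert parallelism (`ep_size` > 1) additionally requires a multi-rank launch:

```python
# Minimal sketch: turning a dense FFN into a DeepSpeed MoE layer.
# Run via the deepspeed launcher so distributed groups can be created.
import torch
import torch.nn as nn
import deepspeed
from deepspeed.moe.layer import MoE

deepspeed.init_distributed()

hidden = 512
expert = nn.Sequential(nn.Linear(hidden, 4 * hidden), nn.GELU(),
                       nn.Linear(4 * hidden, hidden))

moe_ffn = MoE(hidden_size=hidden,
              expert=expert,
              num_experts=8,  # illustrative expert count
              ep_size=1,      # raise to shard experts across ranks
              k=1)            # top-1 gating

x = torch.randn(4, 16, hidden)
out, aux_loss, _ = moe_ffn(x)  # MoE returns (output, auxiliary loss, expert counts)
```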
通信オーバヘッドを1/4に大幅削減 -" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/zeropp/japanese/README.md date: 2023-06-22 00:00:00 tags: training ZeRO RLHF Japanese --- diff --git a/docs/_posts/2023-08-24-ulysses-chinese.md b/docs/_posts/2023-08-24-ulysses-chinese.md index 613af2fe7583..f8d269217b7a 100644 --- a/docs/_posts/2023-08-24-ulysses-chinese.md +++ b/docs/_posts/2023-08-24-ulysses-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: 训练极长序列Transformer模型的系统优化" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/chinese/README.md date: 2023-08-24 00:00:00 tags: training ZeRO Chinese --- diff --git a/docs/_posts/2023-08-24-ulysses-japanese.md b/docs/_posts/2023-08-24-ulysses-japanese.md index 921c7c28739d..291407a5523e 100644 --- a/docs/_posts/2023-08-24-ulysses-japanese.md +++ b/docs/_posts/2023-08-24-ulysses-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: Transformerモデルを非常に長いシーケンスで訓練するための最適化" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/japanese/README.md date: 2023-08-24 00:00:00 tags: training ZeRO Japanese --- diff --git a/docs/_posts/2023-08-24-ulysses.md b/docs/_posts/2023-08-24-ulysses.md index a88a0d66080a..c10b2d599f02 100644 --- a/docs/_posts/2023-08-24-ulysses.md +++ b/docs/_posts/2023-08-24-ulysses.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-ulysses/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-ulysses/README.md date: 2023-08-24 00:00:00 tags: training ZeRO English --- diff --git a/docs/_posts/2023-09-12-ZeRO-Inference.md b/docs/_posts/2023-09-12-ZeRO-Inference.md index 7b9852dc160b..04a6347bec59 100644 --- a/docs/_posts/2023-09-12-ZeRO-Inference.md +++ b/docs/_posts/2023-09-12-ZeRO-Inference.md @@ -1,6 +1,6 @@ title: "ZeRO-Inference: 20X faster inference through weight quantization and KV cache offloading" excerpt: "" -link: https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md +link: https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md date: 2023-09-12 00:09:00 tags: inference ZeRO quantization English --- diff --git a/docs/_posts/2023-09-19-deepspeed4science-chinese.md b/docs/_posts/2023-09-19-deepspeed4science-chinese.md index 7b0ccf00aa61..651d61a3b79c 100644 --- a/docs/_posts/2023-09-19-deepspeed4science-chinese.md +++ b/docs/_posts/2023-09-19-deepspeed4science-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed4Science:利用先进的AI系统优化技术实现科学发现" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/chinese/README.md date: 2023-09-19 00:00:00 tags: training inference science Chinese --- diff --git a/docs/_posts/2023-09-19-deepspeed4science-japanese.md b/docs/_posts/2023-09-19-deepspeed4science-japanese.md index 8c0a1b6d0082..20d83c8e0b5a 100644 --- 
a/docs/_posts/2023-09-19-deepspeed4science-japanese.md +++ b/docs/_posts/2023-09-19-deepspeed4science-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed4Scienceイニシアティブ: 洗練されたAIシステムのテクノロジーにより大規模な科学的発見を可能に" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed4science/japanese/README.md date: 2023-09-19 00:00:00 tags: training inference science Japanese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md index 290b8b4b8ba4..1e0ef0bed34b 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat:多轮图像+文字,为你展现不一样的AI聊天魅力" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Chinese.md date: 2023-10-04 00:00:00 tags: training Chinese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md index f8b7e20cc2cf..745e9052358e 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat: 複数ラウンド・複数画像の入力が可能なAIチャット体験を実現" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md +link: https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-visualchat/10-03-2023/README-Japanese.md date: 2023-10-04 00:00:00 tags: training Japanese --- diff --git a/docs/_posts/2023-10-04-deepspeed-visualchat.md b/docs/_posts/2023-10-04-deepspeed-visualchat.md index 74a1eb66fd5c..8226597290b2 100644 --- a/docs/_posts/2023-10-04-deepspeed-visualchat.md +++ b/docs/_posts/2023-10-04-deepspeed-visualchat.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-VisualChat: Improve Your Chat Experience with Multi-Round Multi-Image Inputs" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-visualchat/10-03-2023/README.md date: 2023-10-04 00:00:00 tags: training English --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md b/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md index cc259c20361a..ec936bb6d79e 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen-chinese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen:通过 MII 和 DeepSpeed-Inference 实现 LLM 高吞吐量文本生成" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/chinese/README.md date: 2023-11-06 00:00:00 tags: inference Chinese --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md b/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md index a4bb8237783a..a64b29c88163 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen-japanese.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen: MIIとDeepSpeed-InferenceによるLLMのための高速なテキスト生成" excerpt: "" -link: 
https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen/japanese/README.md date: 2023-11-06 00:00:00 tags: inference Japanese --- diff --git a/docs/_posts/2023-11-06-deepspeed-fastgen.md b/docs/_posts/2023-11-06-deepspeed-fastgen.md index 2af103e81f15..d9062ce56da3 100644 --- a/docs/_posts/2023-11-06-deepspeed-fastgen.md +++ b/docs/_posts/2023-11-06-deepspeed-fastgen.md @@ -1,7 +1,7 @@ --- title: "DeepSpeed-FastGen: High-throughput Text Generation for LLMs via MII and DeepSpeed-Inference" excerpt: "" -link: https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen +link: https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen date: 2023-11-06 00:00:00 tags: inference English --- diff --git a/docs/_sass/minimal-mistakes/_sidebar.scss b/docs/_sass/minimal-mistakes/_sidebar.scss index 63cef338c583..5f657e112d82 100644 --- a/docs/_sass/minimal-mistakes/_sidebar.scss +++ b/docs/_sass/minimal-mistakes/_sidebar.scss @@ -76,7 +76,7 @@ @include breakpoint($large) { position: absolute; - top: 0; + top: auto; right: 0; width: $right-sidebar-width-narrow; margin-right: -1.5 * $right-sidebar-width-narrow; @@ -94,7 +94,7 @@ @include breakpoint($x-large) { width: $right-sidebar-width; - margin-right: -1.5 * $right-sidebar-width; + margin-right: -1.5 * $right-sidebar-width-narrow; } } diff --git a/docs/_tutorials/accelerator-abstraction-interface.md b/docs/_tutorials/accelerator-abstraction-interface.md index 0810c3c6b5d7..30a362b82d25 100644 --- a/docs/_tutorials/accelerator-abstraction-interface.md +++ b/docs/_tutorials/accelerator-abstraction-interface.md @@ -1,6 +1,6 @@ --- title: DeepSpeed Accelerator Abstraction Interface -tags: getting-started +tags: getting-started training accelerator --- # Contents @@ -12,7 +12,6 @@ tags: getting-started - [Tensor operations](#tensor-operations) - [Communication backend](#communication-backend) - [Run DeepSpeed model on different accelerators](#run-deepspeed-model-on-different-accelerators) -- [Run DeepSpeed model on CPU](#run-deepspeed-model-on-cpu) - [Implement new accelerator extension](#implement-new-accelerator-extension) # Introduction @@ -79,69 +78,10 @@ torch.distributed.init_process_group(get_accelerator().communication_backend_nam ``` # Run DeepSpeed model on different accelerators -Once a model is ported with DeepSpeed Accelerator Abstraction Interface, we can run this model on different accelerators using extension to DeepSpeed. DeepSpeed check whether certain extension is installed in the environment to decide whether to use the Accelerator backend in that extension. For example if we wish to run model on Intel GPU, we can install _Intel Extension for DeepSpeed_ following the instruction in [link](https://github.com/intel/intel-extension-for-deepspeed/) - -After the extension is installed, install DeepSpeed and run model. The model will be running on top of DeepSpeed. Because DeepSpeed installation is also accelerator related, it is recommended to install DeepSpeed accelerator extension before install DeepSpeed. - -`CUDA_Accelerator` is the default accelerator in DeepSpeed. If no other DeepSpeed accelerator extension is installed, `CUDA_Accelerator` will be used. - -When run a model on different accelerator in a cloud environment, the recommended practice is provision environment for each accelerator in different env with tool such as _anaconda/miniconda/virtualenv_. 
When run model on different Accelerator, load the env accordingly. - -Note that different accelerator may have different 'flavor' of float16 or bfloat16. So it is recommended to make the model configurable for both float16 and bfloat16, in that way model code does not need to be changed when running on different accelerators. - -# Run DeepSpeed model on CPU -DeepSpeed support using CPU as accelerator. DeepSpeed model using DeepSpeed Accelerator Abstraction Interface could run on CPU without change to model code. DeepSpeed decide whether _Intel Extension for PyTorch_ is installed in the environment. If this packaged is installed, DeepSpeed will use CPU as accelerator. Otherwise CUDA device will be used as accelerator. - -To run DeepSpeed model on CPU, use the following steps to prepare environment: - -``` -python -m pip install intel_extension_for_pytorch -python -m pip install oneccl_bind_pt==2.0 -f https://developer.intel.com/ipex-whl-stable-cpu -git clone https://github.com/oneapi-src/oneCCL -cd oneCCL -mkdir build -cd build -cmake .. -make -make install -``` - -Before run CPU workload, we need to source oneCCL environment variables -``` -source /build/_install/env/setvars.sh -``` - -After environment is prepared, we can launch DeepSpeed inference with the following command -``` -deepspeed --bind_cores_to_rank -``` - -This command would launch number of workers equal to number of CPU sockets on the system. Currently DeepSpeed support running inference model with AutoTP on top of CPU. The argument `--bind_cores_to_rank` distribute CPU cores on the system evenly among workers, to allow each worker running on a dedicated set of CPU cores. - -On CPU system, there might be daemon process that periodically activate which would increase variance of each worker. One practice is leave a couple of cores for daemon process using `--bind-core-list` argument: - -``` -deepspeed --bind_cores_to_rank --bind_core_list 0-51,56-107 -``` - -The command above leave 4 cores on each socket to daemon process (assume two sockets, each socket has 56 cores). - -We can also set an arbitrary number of workers. Unlike GPU, CPU cores on host can be further divided into subgroups. When this number is not set, DeepSpeed would detect number of NUMA nodes on the system and launch one worker for each NUMA node. - -``` -deepspeed --num_accelerators 4 --bind_cores_to_rank -``` - -Launching DeepSpeed model on multiple CPU nodes is similar to other accelerators. We need to specify `impi` as launcher and specify `--bind_cores_to_rank` for better core binding. Also specify `slots` number according to number of CPU sockets in host file. - -``` -# hostfile content should follow the format -# worker-1-hostname slots=<#sockets> -# worker-2-hostname slots=<#sockets> -# ... - -deepspeed --hostfile= --bind_cores_to_rank --launcher impi --master_addr -``` +[Accelerator Setup Guide](accelerator-setup-guide.md) provides a guide on how to setup different accelerators for DeepSpeed. It also comes with simple example how to run deepspeed for different accelerators. The following guides are provided: +1. Run DeepSpeed model on CPU +2. Run DeepSpeed model on XPU +3. Run DeepSpeed model on Huawei Ascend NPU # Implement new accelerator extension It is possible to implement a new DeepSpeed accelerator extension to support new accelerator in DeepSpeed. An example to follow is _[Intel Extension For DeepSpeed](https://github.com/intel/intel-extension-for-deepspeed/)_. 
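Before looking at the components of such an extension (listed next), here is a small illustrative sketch, not taken from the tutorial, of how model code ported to the accelerator abstraction interface stays device-agnostic; it assumes a working DeepSpeed install and a launch through the deepspeed launcher so the distributed environment variables are set:

```python
# Device-agnostic placement and backend selection via the accelerator interface.
import torch
import deepspeed
from deepspeed.accelerator import get_accelerator

accel = get_accelerator()
print("detected accelerator:", accel._name)

# Use the abstract device name instead of hard-coding 'cuda'.
device = torch.device(accel.device_name(0))
x = torch.ones(4, 4, device=device)

# The communication backend (nccl, ccl, hccl, ...) also comes from the accelerator.
deepspeed.init_distributed(dist_backend=accel.communication_backend_name())
print("backend:", accel.communication_backend_name(), "| tensor on:", x.device)
```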
An accelerator extension contains the following components: @@ -150,8 +90,8 @@ This class implements `class DeepSpeedAccelerator` and will be returned by `get_ 2. Op builders following https://github.com/intel/intel-extension-for-deepspeed/tree/main/intel_extension_for_deepspeed/op_builder. All op builders needs to inherit `deepspeed.ops.op_builder.builder.OpBuilder` directly or indirectly. A common practice is to implement a base op builder (SYCLOpBuilder in the case of Intel Extension for DeepSpeed) and inherit this base op builder instead. 3. Op kernels as in the following [link](https://github.com/intel/intel-extension-for-deepspeed/tree/main/intel_extension_for_deepspeed/op_builder/csrc). -Note that an extension does not have to implement all op builders under https://github.com/microsoft/DeepSpeed/tree/master/op_builder all at a time. A missing op builder usually means certain DeepSpeed functionality cannot be used for that Accelerator, but models that does not use that functionality can still run. +Note that an extension does not have to implement all op builders under https://github.com/deepspeedai/DeepSpeed/tree/master/op_builder all at a time. A missing op builder usually means certain DeepSpeed functionality cannot be used for that Accelerator, but models that does not use that functionality can still run. When implementing op builder for an accelerator extension, one thing needs to be noted is that the op builder native code is being built by DeepSpeed jit load mechanism. This mean the native source file being built needs to be in DeepSpeed installation directory. However these files are defined in accelerator extension installation directory, which cannot be built by DeepSpeed directly. To solve this, follow the example in https://github.com/intel/intel-extension-for-deepspeed/blob/main/intel_extension_for_deepspeed/op_builder/cpu_adam.py to use 'sycl_kernel_path' and 'sycl_kernel_include' (User can change 'sycl' to other prefix in their own accelerator extension) to allow native code be built during DeepSpeed jit load. -When accelerator extension is installed in the environment, it can be used by either explicit call deepspeed.accelerator.set_accelerator(XYZ_Accelerator()) following the example in https://github.com/microsoft/DeepSpeed/blob/master/accelerator/real_accelerator.py, or add an implicit detection code in get_accelerator in the same file above. +When accelerator extension is installed in the environment, it can be used by either explicit call deepspeed.accelerator.set_accelerator(XYZ_Accelerator()) following the example in https://github.com/deepspeedai/DeepSpeed/blob/master/accelerator/real_accelerator.py, or add an implicit detection code in get_accelerator in the same file above. diff --git a/docs/_tutorials/accelerator-setup-guide.md b/docs/_tutorials/accelerator-setup-guide.md new file mode 100644 index 000000000000..ecb77f475375 --- /dev/null +++ b/docs/_tutorials/accelerator-setup-guide.md @@ -0,0 +1,256 @@ +--- +title: DeepSpeed Accelerator Setup Guides +tags: getting-started training accelerator +--- + +# Contents +- [Contents](#contents) +- [Introduction](#introduction) +- [Intel Architecture (IA) CPU](#intel-architecture-ia-cpu) +- [Intel XPU](#intel-xpu) +- [Huawei Ascend NPU](#huawei-ascend-npu) +- [Intel Gaudi](#intel-gaudi) + +# Introduction +DeepSpeed supports different accelerators from different companies. Setup steps to run DeepSpeed on certain accelerators might be different. 
This guide lets users look up setup instructions for the accelerator family and hardware they are using. + +# Intel Architecture (IA) CPU +DeepSpeed supports CPUs with the Intel Architecture instruction set. The CPU should support at least the AVX2 instruction set; the AMX instruction set is recommended. + +DeepSpeed has been verified on the following CPU processors: +* 4th Gen Intel® Xeon® Scalable Processors +* 5th Gen Intel® Xeon® Scalable Processors +* 6th Gen Intel® Xeon® Scalable Processors + +## Installation steps for Intel Architecture CPU +To install DeepSpeed on an Intel Architecture CPU, use the following steps: +1. Install the gcc compiler +DeepSpeed requires gcc-9 or above to build kernels on Intel Architecture CPUs, so install gcc-9 or above. + +2. Install numactl +DeepSpeed uses `numactl` for fine-grained CPU core allocation and load balancing, so install numactl on your system. +For example, on an Ubuntu system, use the following command: +`sudo apt-get install numactl` + +3. Install PyTorch +`pip install torch` + +4. Install DeepSpeed +`pip install deepspeed` + +## How to launch DeepSpeed on Intel Architecture CPU +DeepSpeed can be launched on an Intel Architecture CPU with the default deepspeed command. However, for compute-intensive workloads, an Intel Architecture CPU works best when each worker process runs on a different set of physical CPU cores, so that workers do not compete with each other for CPU cores. To bind cores to each worker (rank), use the following command line switch for better performance. +``` +deepspeed --bind_cores_to_rank <program> +``` +This switch automatically detects the number of CPU NUMA nodes on the host, launches the same number of workers, and binds each worker to the cores/memory of a different NUMA node. This improves performance by ensuring that workers do not interfere with each other, and that all memory allocation is from local memory. + +If a user wishes to have more control over the number of workers and the specific cores used by the workload, the following command line switches can be used. +``` +deepspeed --num_accelerators <num_accelerators> --bind_cores_to_rank --bind_core_list <core_list> +``` +For example: +``` +deepspeed --num_accelerators 4 --bind_cores_to_rank --bind_core_list <0-27,32-59> inference.py +``` +This starts 4 workers for the workload. The core list range is divided evenly among the 4 workers: worker 0 takes cores 0-13, worker 1 takes 14-27, worker 2 takes 32-45, and worker 3 takes 46-59. Cores 28-31 and 60-63 are left out because background processes may run on the system; leaving some cores idle reduces performance jitter and straggler effects. + +Launching a DeepSpeed model on multiple CPU nodes is similar to other accelerators: specify `impi` as the launcher and pass `--bind_cores_to_rank` for better core binding. Also set the `slots` number in the host file according to the number of CPU sockets. + +``` +# hostfile content should follow the format +# worker-1-hostname slots=<#sockets> +# worker-2-hostname slots=<#sockets> +# ... + +deepspeed --hostfile=<hostfile> --bind_cores_to_rank --launcher impi --master_addr <master_addr> +``` +
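As a quick check that the core binding and accelerator selection described above behave as expected, the following is a minimal sketch (not part of the guide); it assumes a Linux host with a CPU build of DeepSpeed, and the script name is illustrative:

```python
# check_cpu_setup.py -- launch with: deepspeed --bind_cores_to_rank check_cpu_setup.py
import os
import deepspeed
from deepspeed.accelerator import get_accelerator

# Initialize the distributed backend reported by the current accelerator.
deepspeed.init_distributed()

rank = int(os.environ.get("RANK", "0"))
print(f"rank {rank}: accelerator={get_accelerator()._name}, "
      f"backend={get_accelerator().communication_backend_name()}, "
      f"cores bound to this rank={len(os.sched_getaffinity(0))}")
```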
+## Install with Intel Extension for PyTorch and oneCCL +Although not mandatory, Intel Extension for PyTorch and Intel oneCCL provide better optimizations for LLM models. Intel oneCCL also provides optimizations when running LLM models on multiple nodes. To use DeepSpeed with Intel Extension for PyTorch and oneCCL, use the following steps: +1. Install Intel Extension for PyTorch. This is suggested if you want to get better LLM inference performance on CPU. +`pip install intel-extension-for-pytorch` + +The following steps install the oneCCL binding for PyTorch. This is suggested if you are running DeepSpeed on multiple CPU nodes, for better communication performance. On a single node with multiple CPU sockets, these steps are not needed. + +2. Install Intel oneCCL binding for PyTorch +`python -m pip install oneccl_bind_pt -f https://developer.intel.com/ipex-whl-stable-cpu` + +3. Install Intel oneCCL, which will be used to build direct oneCCL kernels (CCLBackend kernels) +``` +pip install oneccl-devel +pip install impi-devel +``` +Then set the environment variables for Intel oneCCL (assuming a conda environment). +``` +export CPATH=${CONDA_PREFIX}/include:$CPATH +export CCL_ROOT=${CONDA_PREFIX} +export I_MPI_ROOT=${CONDA_PREFIX} +export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib/ccl/cpu:${CONDA_PREFIX}/lib/libfabric:${CONDA_PREFIX}/lib +``` + +## Optimize LLM inference with Intel Extension for PyTorch +Intel Extension for PyTorch is compatible with DeepSpeed AutoTP tensor parallel inference. It allows CPU inference to benefit from both DeepSpeed Automatic Tensor Parallelism and the LLM optimizations of Intel Extension for PyTorch. To use Intel Extension for PyTorch, after calling deepspeed.init_inference, call +``` +ipex_model = ipex.llm.optimize(deepspeed_model) +``` +to get a model optimized by Intel Extension for PyTorch. + +## More examples for using DeepSpeed on Intel CPU +Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/main/examples/cpu/llm) for more code samples of running inference with DeepSpeed on Intel CPUs. +
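To tie the AutoTP and `ipex.llm.optimize` steps above together, here is a minimal, hedged sketch; the model name and dtype are illustrative, and the tensor-parallel keyword of `deepspeed.init_inference` has varied between DeepSpeed releases:

```python
# run_cpu_autotp.py -- launch with: deepspeed --bind_cores_to_rank run_cpu_autotp.py
import os
import torch
import deepspeed
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForCausalLM

model_id = "facebook/opt-1.3b"                       # illustrative model choice
world_size = int(os.environ.get("WORLD_SIZE", "1"))  # set by the deepspeed launcher

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

# AutoTP: shard the model across the launched CPU ranks.
model = deepspeed.init_inference(model,
                                 tensor_parallel={"tp_size": world_size},
                                 dtype=torch.bfloat16)

# Apply the LLM optimizations of Intel Extension for PyTorch on top of the
# DeepSpeed model, as described in this section.
ipex_model = ipex.llm.optimize(model, dtype=torch.bfloat16)
# ipex_model is then used for generation in the usual Hugging Face way.
```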
+# Intel XPU +The DeepSpeed XPU accelerator supports the Intel® Data Center GPU Max Series. + +DeepSpeed has been verified on the following GPU products: +* Intel® Data Center GPU Max 1100 +* Intel® Data Center GPU Max 1550 + +## Installation steps for Intel XPU +To install DeepSpeed on Intel XPU, use the following steps: +1. Install the oneAPI base toolkit \ +The Intel® oneAPI Base Toolkit (Base Kit) is a core set of tools and libraries, including a DPC++/C++ compiler for building DeepSpeed XPU kernels like fusedAdam and CPUAdam, high-performance computation libraries required by IPEX, etc. +For easy download, usage and more details, check [Intel oneAPI base-toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). +2. Install PyTorch, Intel Extension for PyTorch, and Intel oneCCL Bindings for PyTorch. These packages are required by `xpu_accelerator` for torch functionality and performance, as well as for the communication backend on Intel platforms. The recommended installation reference: +https://intel.github.io/intel-extension-for-pytorch/index.html#installation?platform=gpu. + +3. Install DeepSpeed \ +`pip install deepspeed` + +## How to use DeepSpeed on Intel XPU +DeepSpeed can be launched on Intel XPU with the deepspeed launch command. Before that, the user needs to activate the oneAPI environment by running: \ +`source <oneAPI install path>/setvars.sh` + +To validate that the XPU is available and that the XPU accelerator is correctly chosen, here is an example: +``` +$ python +>>> import torch; print('torch:', torch.__version__) +torch: 2.3.0 +>>> import intel_extension_for_pytorch; print('XPU available:', torch.xpu.is_available()) +XPU available: True +>>> from deepspeed.accelerator import get_accelerator; print('accelerator:', get_accelerator()._name) +accelerator: xpu +``` + +## More examples for using DeepSpeed on Intel XPU +Refer to [LLM examples](https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main/examples/gpu/llm) and [Megatron-DeepSpeed training examples](https://github.com/intel/intel-extension-for-deepspeed/tree/main/examples) for more code samples of running LLMs with DeepSpeed on Intel XPU. + + +# Huawei Ascend NPU + +DeepSpeed has been verified on the following Huawei Ascend NPU products: +* Atlas 300T A2 + +## Installation steps for Huawei Ascend NPU + +The following steps outline the process for installing DeepSpeed on a Huawei Ascend NPU: +1. Install the Huawei Ascend NPU Driver and Firmware +
+ Click to expand + + Before proceeding with the installation, please download the necessary files from [Huawei Ascend NPU Driver and Firmware](https://www.hiascend.com/en/hardware/firmware-drivers/commercial?product=4&model=11). + + The following instructions below are sourced from the [Ascend Community](https://www.hiascend.com/document/detail/en/canncommercial/700/quickstart/quickstart/quickstart_18_0002.html) (refer to the [Chinese version](https://www.hiascend.com/document/detail/zh/canncommercial/700/quickstart/quickstart/quickstart_18_0002.html)): + + - Execute the following command to install the driver: + ``` + ./Ascend-hdk--npu-driver_x.x.x_linux-{arch}.run --full --install-for-all + ``` + + - Execute the following command to install the firmware: + ``` + ./Ascend-hdk--npu-firmware_x.x.x.x.X.run --full + ``` +
+ +2. Install CANN +
+ Click to expand + + Prior to installation, download the [CANN Toolkit](https://www.hiascend.com/en/software/cann/commercial). + + - Install third-party dependencies. + - Ubuntu (The operations are the same for Debian, UOS20, and Linux.) + ``` + apt-get install -y gcc g++ make cmake zlib1g zlib1g-dev openssl libsqlite3-dev libssl-dev libffi-dev unzip pciutils net-tools libblas-dev gfortran libblas3 + ``` + - openEuler (The operations are the same for EulerOS, CentOS, and BC-Linux.) + ``` + yum install -y gcc gcc-c++ make cmake unzip zlib-devel libffi-devel openssl-devel pciutils net-tools sqlite-devel lapack-devel gcc-gfortran + ``` + - Install the required Python dependencies: + ``` + pip3 install attrs numpy decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions + ``` + - Install the CANN Toolkit. + ``` + ./Ascend-cann-toolkit_x.x.x_linux-{arch}.run --install + ``` +
+ +3. Install PyTorch \ + `pip install torch torch_npu` + +4. Install DeepSpeed \ + `pip install deepspeed` + +You can view the installation results using the `ds_report` command, Here is an example: +``` +-------------------------------------------------- +DeepSpeed C++/CUDA extension op report +-------------------------------------------------- +NOTE: Ops not installed will be just-in-time (JIT) compiled at + runtime if needed. Op compatibility means that your system + meet the required dependencies to JIT install the op. +-------------------------------------------------- +JIT compiled ops requires ninja +ninja .................. [OKAY] +-------------------------------------------------- +op name ................ installed .. compatible +-------------------------------------------------- +deepspeed_not_implemented [NO] ....... [OKAY] +async_io ............... [NO] ....... [OKAY] +cpu_adagrad ............ [NO] ....... [OKAY] +cpu_adam ............... [NO] ....... [OKAY] +cpu_lion ............... [NO] ....... [OKAY] +fused_adam ............. [NO] ....... [OKAY] +transformer_inference .. [NO] ....... [OKAY] +-------------------------------------------------- +DeepSpeed general environment info: +torch install path ............... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/torch'] +torch version .................... 2.2.0 +deepspeed install path ........... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/deepspeed'] +deepspeed info ................... 0.14.4, unknown, unknown +deepspeed wheel compiled w. ...... torch 2.2 +torch_npu install path ........... ['/root/miniconda3/envs/ds/lib/python3.10/site-packages/torch_npu'] +torch_npu version ................ 2.2.0 +ascend_cann version .............. 8.0.RC2.alpha002 +shared memory (/dev/shm) size .... 20.00 GB +``` + +## How to launch DeepSpeed on Huawei Ascend NPU + +To validate the Huawei Ascend NPU availability and if the accelerator is correctly chosen, here is an example(Huawei Ascend NPU detection is automatic starting with DeepSpeed v0.12.6): +``` +>>> import torch +>>> print('torch:',torch.__version__) +torch: 2.2.0 +>>> import torch_npu +>>> print('torch_npu:',torch.npu.is_available(),",version:",torch_npu.__version__) +torch_npu: True ,version: 2.2.0 +>>> from deepspeed.accelerator import get_accelerator +>>> print('accelerator:', get_accelerator()._name) +accelerator: npu +``` + +## Multi-card parallel training using Huawei Ascend NPU + +To perform model training across multiple Huawei Ascend NPU cards using DeepSpeed, see the examples provided in [DeepSpeed Examples](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/cifar/cifar10_deepspeed.py). + +# Intel Gaudi +PyTorch models can be run on Intel® Gaudi® AI accelerator using DeepSpeed. 
Refer to the following user guides to start using DeepSpeed with Intel Gaudi: +* [Getting Started with DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Getting_Started_with_DeepSpeed/Getting_Started_with_DeepSpeed.html#getting-started-with-deepspeed) +* [DeepSpeed User Guide for Training](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/DeepSpeed_User_Guide/DeepSpeed_User_Guide.html#deepspeed-user-guide) +* [Optimizing Large Language Models](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Optimizing_LLM.html#llms-opt) +* [Inference Using DeepSpeed](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html#deepspeed-inference-user-guide) diff --git a/docs/_tutorials/advanced-install.md b/docs/_tutorials/advanced-install.md index 10197e62f681..b251485f8988 100755 --- a/docs/_tutorials/advanced-install.md +++ b/docs/_tutorials/advanced-install.md @@ -27,7 +27,7 @@ ds_report ## Pre-install DeepSpeed Ops -**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed c++/cuda ops. However, this is not required if using the default mode of JIT compilation of ops. +**Note:** [PyTorch](https://pytorch.org/) must be installed _before_ pre-compiling any DeepSpeed C++/CUDA ops. However, this is not required if using the default mode of JIT compilation of ops. {: .notice--info} Sometimes we have found it useful to pre-install either some or all DeepSpeed @@ -56,22 +56,22 @@ DS_BUILD_FUSED_LAMB=1 pip install deepspeed ``` Available `DS_BUILD` options include: -* `DS_BUILD_OPS` toggles all ops -* `DS_BUILD_AIO` builds asynchronous (NVMe) I/O op -* `DS_BUILD_CCL_COMM` builds the communication collective libs -* `DS_BUILD_CPU_ADAM` builds the CPUAdam op -* `DS_BUILD_CPU_LION` builds the CPULion op -* `DS_BUILD_EVOFORMER_ATTN` builds the EvoformerAttn op (from [Alphafold](https://www.deepspeed.ai/tutorials/ds4sci_evoformerattention/)) -* `DS_BUILD_FUSED_ADAM` builds the FusedAdam op (from [apex](https://github.com/NVIDIA/apex)) -* `DS_BUILD_FUSED_LION` builds the FusedLion op -* `DS_BUILD_CPU_ADAGRAD` builds the CPUAdagrad op -* `DS_BUILD_FUSED_LAMB` builds the FusedLamb op -* `DS_BUILD_QUANTIZER` builds the quantizer op -* `DS_BUILD_RANDOM_LTD` builds the random ltd op -* `DS_BUILD_SPARSE_ATTN` builds the sparse attention op -* `DS_BUILD_TRANSFORMER` builds the transformer op -* `DS_BUILD_TRANSFORMER_INFERENCE` builds the transformer-inference op -* `DS_BUILD_STOCHASTIC_TRANSFORMER` builds the stochastic transformer op +* `DS_BUILD_OPS` toggles all ops. +* `DS_BUILD_AIO` builds asynchronous (NVMe) I/O op. +* `DS_BUILD_CCL_COMM` builds the communication collective libs. +* `DS_BUILD_CPU_ADAM` builds the CPUAdam op. +* `DS_BUILD_CPU_LION` builds the CPULion op. +* `DS_BUILD_EVOFORMER_ATTN` builds the EvoformerAttn op (from [Alphafold](https://www.deepspeed.ai/tutorials/ds4sci_evoformerattention/)). +* `DS_BUILD_FUSED_ADAM` builds the FusedAdam op (from [apex](https://github.com/NVIDIA/apex)). +* `DS_BUILD_FUSED_LION` builds the FusedLion op. +* `DS_BUILD_CPU_ADAGRAD` builds the CPUAdagrad op. +* `DS_BUILD_FUSED_LAMB` builds the FusedLamb op. +* `DS_BUILD_QUANTIZER` builds the quantizer op. +* `DS_BUILD_RANDOM_LTD` builds the random ltd op. +* `DS_BUILD_SPARSE_ATTN` builds the sparse attention op. +* `DS_BUILD_TRANSFORMER` builds the transformer op. +* `DS_BUILD_TRANSFORMER_INFERENCE` builds the transformer-inference op. +* `DS_BUILD_STOCHASTIC_TRANSFORMER` builds the stochastic transformer op. 
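Since the `DS_BUILD_*` toggles are ordinary environment variables, the pre-build can also be driven from a small script; the following sketch is illustrative and simply wraps pip, and the chosen ops are arbitrary examples:

```python
# Pre-build a subset of DeepSpeed ops by exporting DS_BUILD_* toggles for pip.
import os
import subprocess

env = dict(os.environ,
           DS_BUILD_FUSED_ADAM="1",  # pre-compile the FusedAdam op
           DS_BUILD_CPU_ADAM="1")    # pre-compile the CPUAdam op
subprocess.run(["pip", "install", "deepspeed"], env=env, check=True)
```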
To speed up the build-all process, you can parallelize the compilation process with: @@ -81,10 +81,10 @@ DS_BUILD_OPS=1 pip install deepspeed --global-option="build_ext" --global-option This should complete the full build 2-3 times faster. You can adjust `-j` to specify how many cpu-cores are to be used during the build. In the example it is set to 8 cores. -You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, pytorch, python, etc.) +You can also build a binary wheel and install it on multiple machines that have the same type of GPUs and the same software environment (CUDA toolkit, PyTorch, Python, etc.) ```bash -DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel +DS_BUILD_OPS=1 python -m build --wheel --no-isolation --config-setting="--build-option=build_ext" --config-setting="--build-option=-j8" ``` This will create a pypi binary wheel under `dist`, e.g., ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` and then you can install it directly on multiple machines, in our example: @@ -106,8 +106,8 @@ pip install . For installs spanning multiple nodes we find it useful to install DeepSpeed using the -[install.sh](https://github.com/microsoft/DeepSpeed/blob/master/install.sh) -script in the repo. This will build a python wheel locally and copy it to all +[install.sh](https://github.com/deepspeedai/DeepSpeed/blob/master/install.sh) +script in the repo. This will build a Python wheel locally and copy it to all the nodes listed in your hostfile (either given via `--hostfile`, or defaults to `/job/hostfile`). @@ -118,7 +118,7 @@ extensions will be loaded form that directory. If you use multiple virtual environments this could be a problem, since by default there is only one `torch_extensions` directory, but different virtual environments may use different setups (e.g., different -python or cuda versions) and then the loading of a CUDA extension built by another environment will +Python or CUDA versions) and then the loading of a CUDA extension built by another environment will fail. Therefore, if you need to you can override the default location with the help of the `TORCH_EXTENSIONS_DIR` environment variable. So in each virtual environment you can point it to a unique directory and DeepSpeed will use it to save and load CUDA extensions. @@ -146,9 +146,9 @@ If you're getting the following error: ``` RuntimeError: CUDA error: no kernel image is available for execution on the device ``` -when running deepspeed, that means that the cuda extensions weren't built for the card you're trying to use it for. +when running deepspeed, that means that the CUDA extensions weren't built for the card you're trying to use it for. -When building from source deepspeed will try to support a wide range of architectures, but under jit-mode it'll only +When building from source DeepSpeed will try to support a wide range of architectures, but under jit-mode it'll only support the architectures visible at the time of building. You can build specifically for a desired range of architectures by setting a `TORCH_CUDA_ARCH_LIST` env variable: @@ -159,9 +159,9 @@ TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... It will also make the build faster when you only build for a few architectures. -This is also recommended to ensure your exact architecture is used. 
Due to a variety of technical reasons, a distributed pytorch binary isn't built to fully support all architectures, skipping binary compatible ones, at a potential cost of underutilizing your full card's compute capabilities. To see which architectures get included during the deepspeed build from source - save the log and grep for `-gencode` arguments. +This is also recommended to ensure your exact architecture is used. Due to a variety of technical reasons, a distributed PyTorch binary isn't built to fully support all architectures, skipping binary compatible ones, at a potential cost of underutilizing your full card's compute capabilities. To see which architectures get included during the DeepSpeed build from source - save the log and grep for `-gencode` arguments. -The full list of nvidia GPUs and their compute capabilities can be found [here](https://developer.nvidia.com/cuda-gpus). +The full list of Nvidia GPUs and their compute capabilities can be found [here](https://developer.nvidia.com/cuda-gpus). ## CUDA version mismatch @@ -171,7 +171,7 @@ If you're getting the following error: Exception: >- DeepSpeed Op Builder: Installed CUDA version {VERSION} does not match the version torch was compiled with {VERSION}, unable to compile cuda/cpp extensions without a matching cuda version. ``` You have a misaligned version of CUDA installed compared to the version of CUDA -used to compile torch. A mismatch in the major version is likely to result in +used to compile Torch. A mismatch in the major version is likely to result in errors or unexpected behavior. The easiest fix for this error is changing the CUDA version installed (check @@ -195,7 +195,7 @@ DS_SKIP_CUDA_CHECK=1 Some DeepSpeed features require specific dependencies outside the general dependencies of DeepSpeed. * Python package dependencies per feature/op please -see our [requirements directory](https://github.com/microsoft/DeepSpeed/tree/master/requirements). +see our [requirements directory](https://github.com/deepspeedai/DeepSpeed/tree/master/requirements). * We attempt to keep the system level dependencies to a minimum, however some features do require special system-level packages. Please see our `ds_report` tool output to see if you are missing any system-level packages for a given feature. diff --git a/docs/_tutorials/automatic-tensor-parallelism.md b/docs/_tutorials/automatic-tensor-parallelism.md old mode 100644 new mode 100755 index 5d182b2a4532..a7de4721a5ce --- a/docs/_tutorials/automatic-tensor-parallelism.md +++ b/docs/_tutorials/automatic-tensor-parallelism.md @@ -66,7 +66,7 @@ With automatic tensor parallelism, we do not need to provide the injection polic # Example Script -We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/microsoft/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). This script is for testing text-generation models and includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/microsoft/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. +We can observe performance improvement with automatic tensor parallelism using the [inference test suite](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/text-generation/inference-test.py). 
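For orientation, a stripped-down sketch of what such a test does is shown below; it is illustrative only (model name, dtype, and the tensor-parallel keyword differ across DeepSpeed versions), and the linked test suite remains the authoritative example:

```python
# autotp_example.py -- launch with: deepspeed --num_gpus 2 autotp_example.py
import os
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

world_size = int(os.environ.get("WORLD_SIZE", "1"))
model_id = "facebook/opt-1.3b"  # illustrative model choice

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# Automatic tensor parallelism: no injection policy, no kernel injection.
model = deepspeed.init_inference(model,
                                 tensor_parallel={"tp_size": world_size},
                                 dtype=torch.float16,
                                 replace_with_kernel_inject=False)

inputs = tokenizer("DeepSpeed is", return_tensors="pt").to(model.module.device)
print(tokenizer.decode(model.module.generate(**inputs, max_new_tokens=20)[0]))
```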
This script is for testing text-generation models and includes per token latency, bandwidth, throughput and memory checks for comparison. See the [README](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/inference/huggingface/text-generation#deepspeed-huggingface-text-generation-examples) for more information. ## Launching @@ -121,15 +121,21 @@ The following results were collected using V100 SXM2 32GB GPUs. The following model families have been successfully tested with automatic tensor parallelism. Other models may work but have not been tested yet. - albert +- arctic +- baichuan - bert - bigbird_pegasus - bloom - camembert +- chatglm2 +- chatglm3 - codegen +- codellama - deberta_v2 - electra - ernie - esm +- falcon - glm - gpt-j - gpt-neo @@ -137,8 +143,11 @@ The following model families have been successfully tested with automatic tensor - longt5 - luke - llama +- llama2 - m2m_100 - marian +- mistral +- mixtral - mpt - mvp - nezha @@ -146,15 +155,21 @@ The following model families have been successfully tested with automatic tensor - opt - pegasus - perceiver +- phi - plbart +- qwen +- qwen2 +- qwen2-moe - reformer - roberta - roformer - splinter +- starcode - t5 - xglm - xlm_roberta - yoso +- yuan # Unsupported Models diff --git a/docs/_tutorials/autotuning.md b/docs/_tutorials/autotuning.md index 38648daa89f2..2935f38946ac 100644 --- a/docs/_tutorials/autotuning.md +++ b/docs/_tutorials/autotuning.md @@ -8,23 +8,23 @@ Make sure you've read the DeepSpeed tutorials on [Getting Started](https://www.d One pain point in model training is to figure out good performance-relevant configurations such as micro-batch size to fully utilize the hardware and achieve a high throughput number. This configuration exploring process is commonly done manually but is important since model training is repeated many times and benefits from using a good configuration. Not only is the hand-tuning process time-consuming, but the outcome is hardware-dependent. This means that a good configuration on one hardware might not be the best on another different hardware. The user thus has to hand tune the configuration again. With DeepSpeed, there are more configuration parameters that could potentially affect the training speed, thus making it more tedious to manually tune the configuration. -The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. In this tutorial, we showcase the usage and benefits of the autotuning feature in DeepSpeed. For more details, please see the [README.md](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning). +The DeepSpeed Autotuner mitigates this pain point and automatically discovers the optimal DeepSpeed configuration that delivers good training speed. It not only reduces the time and resources users spend on tuning, but also can discover configurations better than hand-tuned methods. In this tutorial, we showcase the usage and benefits of the autotuning feature in DeepSpeed. For more details, please see the [README.md](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning). 
## Tuning scope and strategy The DeepSpeed Autotuner uses model information, system information, and heuristics to efficiently tune system knobs that affect compute and memory efficiencies, such as ZeRO optimization stages, micro-batch sizes, and many other ZeRO optimization configurations. Currently, the DeepSpeed Autotuner tunes ZeRO stages, micro-batch size per GPU, and ZeRO configurations (offloading is not yet supported) on top of other configurations such as optimizer, scheduler, fp16 defined by the user in the DeepSpeed configuration file. -Note that ZeRO stages, micro-batch sizes, and other ZeRO configurations to tune are also configurable and can be overwritten by the user through the DeepSpeed configuration file. See [Configuring Tuning Scope](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#configuring-tuning-scope) for details. +Note that ZeRO stages, micro-batch sizes, and other ZeRO configurations to tune are also configurable and can be overwritten by the user through the DeepSpeed configuration file. See [Configuring Tuning Scope](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#configuring-tuning-scope) for details. ## Ease of use DeepSpeed Autotuning is easy to use, requiring no code change from DeepSpeed users. -Compared to the original training script (`deepspeed your_program.py --deepspeed ds_config.json`), invoking the autotuning feature in DeepSpeed only requires setting an `autotuning` flag after the DeepSpeed launcher (see [Usage](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#usage) for details), and adding `" autotuning": {"enabled": true}` to the DeepSpeed configuration file. Users can further tailor the autotuning process by changing the autotuning configuration in the DeepSpeed configuration JSON file (See [Autotuning Configuration](https://github.com/microsoft/DeepSpeed/tree/master/deepspeed/autotuning#autotuning-configuration) for details). +Compared to the original training script (`deepspeed your_program.py --deepspeed ds_config.json`), invoking the autotuning feature in DeepSpeed only requires setting an `autotuning` flag after the DeepSpeed launcher (see [Usage](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#usage) for details), and adding `" autotuning": {"enabled": true}` to the DeepSpeed configuration file. Users can further tailor the autotuning process by changing the autotuning configuration in the DeepSpeed configuration JSON file (See [Autotuning Configuration](https://github.com/deepspeedai/DeepSpeed/tree/master/deepspeed/autotuning#autotuning-configuration) for details). ## Example -We demonstrate the usage and benefit of autotuning using the training of a 0.77 billion parameter [GPT2-large model](https://huggingface.co/gpt2-large) from Hugging Face on 16 Nvidia V100 GPUs. For more examples, refer to [autotuning](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. Note that autotuning works with any DeepSpeed-accelerated model training, not limited to Hugging Face models. +We demonstrate the usage and benefit of autotuning using the training of a 0.77 billion parameter [GPT2-large model](https://huggingface.co/gpt2-large) from Hugging Face on 16 Nvidia V100 GPUs. For more examples, refer to [autotuning](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning) in the DeepSpeedExamples repo. 
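On the configuration side, the only strictly required addition is the `autotuning` block; the sketch below writes such a config from Python, where the fp16 and batch-size entries are illustrative placeholders rather than recommended settings:

```python
# Write a minimal DeepSpeed config with autotuning enabled.
import json

ds_config = {
    "train_micro_batch_size_per_gpu": "auto",  # let the autotuner explore this
    "fp16": {"enabled": True},
    "autotuning": {"enabled": True},
}

with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)

# Then launch through the DeepSpeed launcher with its autotuning flag, e.g.
#   deepspeed --autotuning=run your_program.py --deepspeed ds_config.json
```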
Note that autotuning works with any DeepSpeed-accelerated model training, not limited to Hugging Face models. The model has: @@ -119,7 +119,7 @@ Note that the performance metric used in autotuning is calculated using the timi Tuning completed in 0:27:33.988447. Total number of experiments: 13. -As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/microsoft/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. +As we can see the DeepSpeed Autotuner can select a better than hand-tuned configuration with a reasonable number of experiments. Examples in [Autotuning Hugging Face Examples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/autotuning/hf#autotuning-hugging-face-examples) would demonstrate the effectiveness of autotuning across different models. ### DeepSpeed Autotuning with AzureML diff --git a/docs/_tutorials/azure.md b/docs/_tutorials/azure.md index 38af70b3f4b0..1bbfb687d812 100644 --- a/docs/_tutorials/azure.md +++ b/docs/_tutorials/azure.md @@ -13,10 +13,10 @@ The recommended and simplest method to try DeepSpeed on Azure is through [AzureM For AzureML v1 examples, please take a look at easy-to-use examples for Megatron-DeepSpeed, Transformers and CIFAR training [here](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed). -> Our [Megatron-DeepSpeed](https://github.com/microsoft/megatron-deepspeed) contains the most up to date [recipe](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) for end-to-end training on AzureML. +> Our [Megatron-DeepSpeed](https://github.com/deepspeedai/megatron-deepspeed) contains the most up to date [recipe](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml) for end-to-end training on AzureML. # DeepSpeed on Azure VMs If you don't have access to AzureML or if want to build a custom environments using [Azure virtual machines](https://azure.microsoft.com/en-us/services/virtual-machines/) or Azure VM Scale-Sets ([VMSS](https://docs.microsoft.com/en-us/azure/virtual-machine-scale-sets/overview)), we are working on easy-to-use cluster setup scripts that will be published in the next few weeks. -If you already have a cluster setup, you can use the [azure recipes](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) that can easily be modified to train various model configurations. +If you already have a cluster setup, you can use the [azure recipes](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azure) that can easily be modified to train various model configurations. diff --git a/docs/_tutorials/bert-finetuning.md b/docs/_tutorials/bert-finetuning.md index 3014be18d682..efb8fa268e29 100755 --- a/docs/_tutorials/bert-finetuning.md +++ b/docs/_tutorials/bert-finetuning.md @@ -10,14 +10,14 @@ In this tutorial we will be adding DeepSpeed to the BingBert model for the SQuAD If you don't already have a copy of the DeepSpeed repository, please clone in now and checkout the DeepSpeedExamples submodule the contains the BingBertSquad -example (DeepSpeedExamples/BingBertSquad) we will be going over in the rest of +example (DeepSpeedExamples/training/BingBertSquad) we will be going over in the rest of this tutorial. 
```shell -git clone https://github.com/microsoft/DeepSpeed +git clone https://github.com/deepspeedai/DeepSpeed cd DeepSpeed git submodule update --init --recursive -cd DeepSpeedExamples/BingBertSquad +cd DeepSpeedExamples/training/BingBertSquad ``` ### Pre-requisites diff --git a/docs/_tutorials/bert-pretraining.md b/docs/_tutorials/bert-pretraining.md index 14789d3fda96..342918de958d 100755 --- a/docs/_tutorials/bert-pretraining.md +++ b/docs/_tutorials/bert-pretraining.md @@ -5,7 +5,7 @@ tags: training pre-training --- **Note:** -On 08/15/2022 we have added another BERT pre-training/fine-tuning example at [github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile), which includes a README.md that describes how to use it. Compared to the example described below, the new example in Megatron-DeepSpeed adds supports of ZeRO and tensor-slicing model parallelism (thus support larger model scale), uses a public and richer [Pile dataset](https://github.com/EleutherAI/the-pile) (user can also use their own data), together with some changes to the model architecture and training hyperparameters as described in [this paper](https://arxiv.org/abs/1909.08053). As a result, the BERT models trained by the new example is able to provide better MNLI results than original BERT, but with a slightly different model architecture and larger computation requirements. If you want to train a larger-scale or better quality BERT-style model, we recommend to follow the new example in Megatron-DeepSpeed. If your goal is to strictly reproduce the original BERT model, we recommend to follow the example under DeepSpeedExamples/bing_bert as described below. On the other hand, the tutorial below helps explaining how to integrate DeepSpeed into a pre-training codebase, regardless of which BERT example you use. +On 08/15/2022 we have added another BERT pre-training/fine-tuning example at [github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/bert_with_pile), which includes a README.md that describes how to use it. Compared to the example described below, the new example in Megatron-DeepSpeed adds supports of ZeRO and tensor-slicing model parallelism (thus support larger model scale), uses a public and richer [Pile dataset](https://github.com/EleutherAI/the-pile) (user can also use their own data), together with some changes to the model architecture and training hyperparameters as described in [this paper](https://arxiv.org/abs/1909.08053). As a result, the BERT models trained by the new example is able to provide better MNLI results than original BERT, but with a slightly different model architecture and larger computation requirements. If you want to train a larger-scale or better quality BERT-style model, we recommend to follow the new example in Megatron-DeepSpeed. If your goal is to strictly reproduce the original BERT model, we recommend to follow the example under DeepSpeedExamples/bing_bert as described below. On the other hand, the tutorial below helps explaining how to integrate DeepSpeed into a pre-training codebase, regardless of which BERT example you use. 
{: .notice--info} In this tutorial we will apply DeepSpeed to pre-train the BERT @@ -26,7 +26,7 @@ We work from adaptations of [huggingface/transformers](https://github.com/huggingface/transformers) and [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples). We have forked this repo under -[DeepSpeedExamples/bing_bert](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert) +[DeepSpeedExamples/bing_bert](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert) and made several modifications in their script: * We adopted the modeling code from NVIDIA's BERT under `bing_bert/nvidia/`. @@ -360,7 +360,7 @@ the scripts/json configs in our DeepSpeedExamples repo. Below is a table contain summary of the configurations. Specifically see the `ds_train_bert_bsz64k_seq128.sh` and `ds_train_bert_bsz32k_seq512.sh` scripts for more details in -[DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert). +[DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/bing_bert). | Parameters | 128 Sequence | 512 Sequence | diff --git a/docs/_tutorials/cifar-10.md b/docs/_tutorials/cifar-10.md index 8b4990d0431e..2bd06abf0e89 100644 --- a/docs/_tutorials/cifar-10.md +++ b/docs/_tutorials/cifar-10.md @@ -16,7 +16,7 @@ First we will go over how to run the original CIFAR-10 model. Then we will proce ## Running Original CIFAR-10 -Original model code from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/main/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/training/cifar/](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/cifar) and made it available as a submodule. To download, execute: +Original model code from the [CIFAR-10 Tutorial](https://github.com/pytorch/tutorials/blob/main/beginner_source/blitz/cifar10_tutorial.py), We've copied this repo under [DeepSpeedExamples/training/cifar/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/cifar) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive diff --git a/docs/_tutorials/comms-logging.md b/docs/_tutorials/comms-logging.md index 2719f08ad200..c4f6141a5b6c 100644 --- a/docs/_tutorials/comms-logging.md +++ b/docs/_tutorials/comms-logging.md @@ -64,7 +64,7 @@ The steps to add DeepSpeed communication log summaries are as follows: 2. (Optional) If your application contains `torch.distributed` calls that you wish to log, import `deepspeed.comm` package and modify `torch.distributed` calls to use `deepspeed.comm` (Note: The `deepspeed.comm` collective and pt2pt APIs exactly match `torch.distributed`) 3. Call `deepspeed.comm.log_summary` -For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) example: ```python # Step 2: (Optional) Import deepspeed.comm diff --git a/docs/_tutorials/curriculum-learning.md b/docs/_tutorials/curriculum-learning.md index 29f9417363f0..0b74945d3715 100644 --- a/docs/_tutorials/curriculum-learning.md +++ b/docs/_tutorials/curriculum-learning.md @@ -8,7 +8,7 @@ On 12/12/2022, we released DeepSpeed Data Efficiency Library which provides a mo {: .notice--warning} **Note:** -This tutorial was updated on 10/29/2021. 
Changes include: 1) A more detailed tuning strategy. 2) Pipeline parallelism support. 3) Token-based learning rate decay. 4) A new GPT-2 example at [github.com/microsoft/Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). See details below. +This tutorial was updated on 10/29/2021. Changes include: 1) A more detailed tuning strategy. 2) Pipeline parallelism support. 3) Token-based learning rate decay. 4) A new GPT-2 example at [github.com/deepspeedai/Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). See details below. {: .notice--info} In this tutorial, we introduce DeepSpeed's curriculum learning-based data pipeline, which presents easier or simpler examples earlier during training. By enabling stable training with 8x/4x larger batch size/learning rate (whereas the baseline approach struggles with training divergence), we observe that curriculum learning (based on sequence length) provides stable and 3.3x faster GPT-2 pre-training (tested on 117M and 1.5B parameters), together with better token-wise convergence speed and zero-shot WikiText-103/LAMBADA evaluation results. In addition, since curriculum learning only affects the data pipeline, its benefit is complementary to many DeepSpeed features and other system optimization techniques. For example, curriculum learning is compatible with DeepSpeed's [ZeRO Redundancy Optimizer](/tutorials/zero/), [ZeRO-Offload](/tutorials/zero-offload/), and [3D Parallelism](/tutorials/pipeline/). @@ -114,17 +114,17 @@ After the update on 10/29/2021, now there are two curriculum learning examples f We provide two curriculum learning examples for Megatron-LM GPT-2 pre-training: -The first one is at [Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning). This integration is based on a newer Megatron-LM fork, and only this curriculum learning example supports pipeline parallelism. However, as of 10/29/2021, we haven't verified ZeRO-2 and ZeRO-3 on this fork. Overall, we highly recommend you to use this example if your model does not require ZeRO-2/3. +The first one is at [Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/curriculum_learning). This integration is based on a newer Megatron-LM fork, and only this curriculum learning example supports pipeline parallelism. However, as of 10/29/2021, we haven't verified ZeRO-2 and ZeRO-3 on this fork. Overall, we highly recommend you to use this example if your model does not require ZeRO-2/3. -The second one is at [DeepSpeedExamples/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/](https://github.com/microsoft/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning). This integration is based on an older Megatron-LM hard copy that we will eventually deprecate and this curriculum learning example does not support pipeline parallelism. We recommend you to ONLY use this example if your model requires ZeRO-2/3. +The second one is at [DeepSpeedExamples/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/Megatron-LM-v1.1.5-ZeRO3/curriculum_learning). This integration is based on an older Megatron-LM hard copy that we will eventually deprecate and this curriculum learning example does not support pipeline parallelism. We recommend you to ONLY use this example if your model requires ZeRO-2/3. 
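For orientation, a `seqlen`-based curriculum learning block in the DeepSpeed configuration (as described earlier in this tutorial) has roughly the following shape. The snippet below is only an illustration: it expresses the config as a Python dict (it could equally live in the usual `ds_config.json`), and the values are placeholders rather than tuned recommendations; consult the configuration documentation for the authoritative schema.

```python
# Illustrative sketch only: a seqlen-based curriculum learning block expressed
# as a Python dict. Values are placeholders, not tuned recommendations.
ds_config = {
    "train_batch_size": 4096,
    "curriculum_learning": {
        "enabled": True,
        "curriculum_type": "seqlen",
        "min_difficulty": 64,        # starting sequence length
        "max_difficulty": 1024,      # full training sequence length
        "schedule_type": "fixed_linear",
        "schedule_config": {
            "total_curriculum_step": 15000,
            "difficulty_step": 8
        }
    }
}
```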
Besides the DeepSpeed curriculum learning json configurations described above, there are some other necessary changes on the user side to integrate curriculum learning:

### 2.1 Training data truncation
-To enable `seqlen`-based curriculum learning, we need to add the functionality of training data truncation based on the given curriculum sequence length. For the case without pipeline parallelism, it is necessary to add a `curriculum_seqlen` argument in the model's forward pass and use it to perform training data sequence length truncation. For Megatron-LM GPT-2 pre-training, we implement this in `forward()` in [megatron/model/gpt2_model.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/megatron/model/gpt2_model.py) and in `forward_step()` in [pretrain_gpt2.py](https://github.com/microsoft/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/pretrain_gpt2.py).
+To enable `seqlen`-based curriculum learning, we need to add the functionality of training data truncation based on the given curriculum sequence length. For the case without pipeline parallelism, it is necessary to add a `curriculum_seqlen` argument in the model's forward pass and use it to perform training data sequence length truncation. For Megatron-LM GPT-2 pre-training, we implement this in `forward()` in [megatron/model/gpt2_model.py](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/megatron/model/gpt2_model.py) and in `forward_step()` in [pretrain_gpt2.py](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/Megatron-LM-v1.1.5-ZeRO3/pretrain_gpt2.py). (A minimal sketch of this truncation is shown below.)

-For the case with pipeline parallelism, due to DeepSpeed engine limitations we cannot inject the `curriculum_seqlen` argument in the forward pass. Instead, we create a duplicate of `deepspeed.runtime.data_pipeline.curriculum_scheduler` on the user side, and use it to retrieve the `curriculum_seqlen`. This implementation can be found in [megatron/training.py](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/training.py).
+For the case with pipeline parallelism, due to DeepSpeed engine limitations we cannot inject the `curriculum_seqlen` argument in the forward pass. Instead, we create a duplicate of `deepspeed.runtime.data_pipeline.curriculum_scheduler` on the user side, and use it to retrieve the `curriculum_seqlen`. This implementation can be found in [megatron/training.py](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/megatron/training.py).

### 2.2 Disable batch size warmup (`--rampup-batch-size`)
In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that curriculum learning (`seqlen`-based) provides much better training stability than the batch size warmup technique introduced by Open AI GPT-3. So when using curriculum learning you need to remove the `--rampup-batch-size` config in your training script. It is not recommended to use both curriculum learning and batch size warmup, because both of them reduce the number of tokens in a batch. Another related change you might want to make is to increase your micro batch size, since without batch size warmup your batch size is now fixed.
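As a concrete illustration of the data truncation described in Section 2.1 above, below is a minimal sketch of truncating a batch to the current `curriculum_seqlen` inside a forward step. The tensor names and shapes here are assumptions made purely for illustration; the authoritative implementations are the Megatron-LM files linked in Section 2.1.

```python
import torch

def truncate_to_curriculum_seqlen(tokens: torch.Tensor,
                                  position_ids: torch.Tensor,
                                  attention_mask: torch.Tensor,
                                  curriculum_seqlen: int):
    """Keep only the first curriculum_seqlen tokens of every sample.

    Assumed (hypothetical) shapes: tokens and position_ids are (batch, seq_len);
    attention_mask is (batch, 1, seq_len, seq_len).
    """
    full_seqlen = tokens.size(1)
    if curriculum_seqlen < full_seqlen:
        tokens = tokens[:, :curriculum_seqlen].contiguous()
        position_ids = position_ids[:, :curriculum_seqlen].contiguous()
        attention_mask = attention_mask[:, :, :curriculum_seqlen, :curriculum_seqlen].contiguous()
    return tokens, position_ids, attention_mask
```

In the non-pipeline case this truncation would be driven by the `curriculum_seqlen` value passed into the model's forward pass; in the pipeline case the value would instead come from the user-side duplicate of the curriculum scheduler, as described above.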
diff --git a/docs/_tutorials/data-efficiency.md b/docs/_tutorials/data-efficiency.md
index 9ea3a33dab92..b49974f1fa78 100644
--- a/docs/_tutorials/data-efficiency.md
+++ b/docs/_tutorials/data-efficiency.md
@@ -20,7 +20,7 @@ Curriculum learning has been successfully applied to various training tasks (see
### 1.3 How to use Curriculum Learning
#### 1.3.1 GPT-3 and BERT pretraining
-The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply curriculum learning to GPT-3 and BERT pretraining. There are 3 steps: data analysis, pretraining, and eval/finetuning.
+The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to apply curriculum learning to GPT-3 and BERT pretraining. There are 3 steps: data analysis, pretraining, and eval/finetuning.

**Data analysis:** Curriculum learning requires a data analysis before pretraining that calculates the difficulty of each data sample (based on the metric provided by the user), and builds an index that maps difficulty values to the corresponding data samples. (There are exceptions: for example the truncation-based sequence length metric can be achieved by data postprocessing without data analysis.) We provide a data analyzer to perform the offline CPU-only data analysis.

@@ -31,7 +31,7 @@ The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed re
**Eval/finetuning** `examples_deepspeed/data_efficiency/gpt/eval/` and `examples_deepspeed/data_efficiency/bert/finetune` include the example scripts for GPT-3 model's zero-/few-shot evaluation and BERT model's finetuning. Our [paper](https://arxiv.org/abs/2212.03597) includes the reference eval/finetune results if you follow our example scripts to perform the pretraining/eval/finetuning.

#### 1.3.2 GPT-2 finetuning
-The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply curriculum learning to GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. For CL metrics that require data analysis (e.g., the vocabulary rarity metric), you need to first use ```data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_*``` to analyze and index the dataset, similar to the GPT-3 pre-training case described above in 1.3.1.
+The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to apply curriculum learning to GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script. For CL metrics that require data analysis (e.g., the vocabulary rarity metric), you need to first use ```data_efficiency/gpt_finetuning/finetune/ds_analyze_gpt_data_*``` to analyze and index the dataset, similar to the GPT-3 pre-training case described above in 1.3.1.


## 2. Random layerwise token dropping (random-LTD)

@@ -44,14 +44,14 @@ When you want to pretrain/fine-tune a transformer-based model, it is always a go

### 2.3 How to use random-LTD
#### 2.3.1 GPT-3 and BERT pretraining
-The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to apply random-LTD to GPT-3 and BERT pretraining.
+The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to apply random-LTD to GPT-3 and BERT pretraining.

`examples_deepspeed/data_efficiency/gpt/pretrain` and `examples_deepspeed/data_efficiency/bert/pretrain` include the example pretraining scripts with the random-LTD feature. Several changes are needed to enable random-LTD during pretraining: (1) The user needs to provide a DeepSpeed json config file which includes configurations for random-LTD (see the [list of configurations](/docs/config-json/#data-efficiency) for details). We provide tested example configurations in `examples_deepspeed/data_efficiency/gpt/pretrain/ds_pretrain_gpt_1.3B_dense_run.sh` and `examples_deepspeed/data_efficiency/bert/pretrain/ds_pretrain_bert_336M_run.sh`. (2) After initializing the DeepSpeed engine via `deepspeed.initialize`, the user needs to use the `convert_to_random_ltd` API to convert and wrap the model layers in order to enable the random-LTD feature. We provide an example implementation of this change in `megatron/training.py` function `setup_model_and_optimizer`. (3) In order for random-LTD to understand the input argument mapping of the forward function, the user needs to change all the input arguments (except the hidden_states input) into keyword/named arguments. For example, in `megatron/model/transformer.py` we changed the forward function from `def forward(self, hidden_states, attention_mask, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):` to `def forward(self, hidden_states, attention_mask=None, encoder_output=None, enc_dec_attn_mask=None, layer_past=None, get_key_value=False):`. (4) When saving model checkpoints (especially if the state dictionary has a non-traditional structure), the user needs to use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to the original model layers. We provide an example implementation of this change in `megatron/model/language_model.py`.

For eval/finetuning of the pretrained model, see [previous section](#131-gpt-3-and-bert-pretraining) about how to use our example scripts.

#### 2.3.2 GPT-2 and ViT finetuning
-The `data_efficiency` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to apply random-LTD to GPT-2 and ViT finetuning.
+The `data_efficiency` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to apply random-LTD to GPT-2 and ViT finetuning.

Just like the pretraining case, similar changes are required to enable random-LTD for finetuning: (1) DeepSpeed json config file. (2) Use the `convert_to_random_ltd` API to convert and wrap the model layers. (3) When saving model checkpoints, use the `remove_random_ltd_state_dict` API to convert the random-LTD-wrapped layers back to the original model layers.

@@ -92,9 +92,9 @@ iter 5474 | LR [0.0001]| val_acc 97.97000122070312 | layer_token 305784192

## 3. Composing curriculum learning and random-LTD to achieve more
### 3.1 GPT-3 and BERT pretraining
-The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/microsoft/Megatron-DeepSpeed) includes our examples of how to compose curriculum learning random-LTD, and apply both of them to GPT-3 and BERT pretraining.
+The `examples_deepspeed/data_efficiency` directory in our [Megatron-DeepSpeed repo](https://github.com/deepspeedai/Megatron-DeepSpeed) includes our examples of how to compose curriculum learning and random-LTD, and apply both of them to GPT-3 and BERT pretraining.

The changes needed are the same as described in the previous two sections, since DeepSpeed Data Efficiency already handles the complexity when composing the two techniques. However, one thing to note is that since both random-LTD and some of the curriculum learning metrics will change the sequence length, it could require some extra code to calculate the effective sequence length at each step. We provide an example implementation of this change in `megatron/training.py` function `train` where we calculate the `actual_seq_length`.

#### 3.2 GPT-2 finetuning
-The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/microsoft/DeepSpeedExamples) includes our examples of how to compose curriculum learning random-LTD for GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script.
+The `data_efficiency/gpt_finetuning` directory in our [DeepSpeedExamples repo](https://github.com/deepspeedai/DeepSpeedExamples) includes our examples of how to compose curriculum learning and random-LTD for GPT-2 finetuning. `data_efficiency/gpt_finetuning/finetune/ds_finetune_gpt2_run.sh` is the example finetuning script.
diff --git a/docs/_tutorials/deepnvme.md b/docs/_tutorials/deepnvme.md
new file mode 100644
index 000000000000..a6d4545815dc
--- /dev/null
+++ b/docs/_tutorials/deepnvme.md
@@ -0,0 +1,297 @@
+---
+title: "DeepNVMe"
+tags: training inference IO large-model
+---
+This tutorial will show how to use [DeepNVMe](https://github.com/deepspeedai/DeepSpeed/blob/master/blogs/deepspeed-gds/README.md) for data transfers between persistent storage and tensors residing in host or device memory. DeepNVMe improves the performance and efficiency of I/O operations in Deep Learning applications through powerful optimizations built on Non-Volatile Memory Express (NVMe) Solid State Drives (SSDs), Linux Asynchronous I/O (`libaio`), and NVIDIA Magnum IO™ GPUDirect® Storage (GDS).
+
+## Requirements
+Ensure your environment is properly configured to use DeepNVMe. First, you need to install DeepSpeed version >= [0.15.0](https://github.com/deepspeedai/DeepSpeed/releases/tag/v0.15.0). Next, ensure that the DeepNVMe operators are available in the DeepSpeed installation. The `async_io` operator is required for any DeepNVMe functionality, while the `gds` operator is required only for GDS functionality. You can confirm availability of each operator by inspecting the output of `ds_report` to check that its compatible status is [OKAY]. Below is a snippet of `ds_report` output confirming the availability of both `async_io` and `gds` operators.
+
+![deepnvme_ops_report](/assets/images/deepnvme_ops_report.png)
+
+If the `async_io` operator is unavailable, you will need to install the appropriate `libaio` library binaries for your Linux flavor. For example, Ubuntu users will need to run `apt install libaio-dev`. In general, you should carefully inspect `ds_report` output for helpful tips such as the following:
+
+```bash
+[WARNING] async_io requires the dev libaio .so object and headers but these were not found.
+[WARNING] async_io: please install the libaio-dev package with apt
+[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+```
+
+To enable the `gds` operator, you will need to install NVIDIA GDS by consulting the appropriate guide for [bare-metal systems](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html) or Azure VMs (coming soon).
+
+
+## Creating DeepNVMe Handles
+DeepNVMe functionality can be accessed through two abstractions: `aio_handle` and `gds_handle`. The `aio_handle` is usable on both host and device tensors, while `gds_handle` works only on CUDA tensors but is more efficient. The first step to use DeepNVMe is to create a desired handle. `aio_handle` requires the `async_io` operator, while `gds_handle` requires both the `async_io` and `gds` operators. The following snippets illustrate `aio_handle` and `gds_handle` creation respectively.
+
+```python
+### Create aio_handle
+from deepspeed.ops.op_builder import AsyncIOBuilder
+aio_handle = AsyncIOBuilder().load().aio_handle()
+```
+
+```python
+### Create gds_handle
+from deepspeed.ops.op_builder import GDSBuilder
+gds_handle = GDSBuilder().load().gds_handle()
+```
+
+For simplicity, the above examples illustrate handle creation using default parameters. We expect handles created with default parameters to provide good performance in most environments. However, see [below](#advanced-handle-creation) for advanced handle creation.
+
+## Using DeepNVMe Handles
+`aio_handle` and `gds_handle` provide identical APIs for storing tensors to files or loading tensors from files. A common feature of these APIs is that they take a tensor and a file path as arguments for the desired I/O operation. For best performance, pinned device or host tensors should be used for I/O operations (see [here](#pinned-tensors) for details). For brevity, this tutorial will use `aio_handle` for illustration, but keep in mind that `gds_handle` works similarly.
+
+You can see the available APIs in a Python shell via tab completion on an `aio_handle` object. This is illustrated using tab completion of `h.`.
+
+```bash
+>python
+Python 3.10.12 (main, Jul 29 2024, 16:56:48) [GCC 11.4.0] on linux
+Type "help", "copyright", "credits" or "license" for more information.
+>>> from deepspeed.ops.op_builder import AsyncIOBuilder
+>>> h = AsyncIOBuilder().load().aio_handle()
+>>> h.
+h.async_pread(            h.free_cpu_locked_tensor(  h.get_overlap_events(     h.get_single_submit(      h.new_cpu_locked_tensor(  h.pwrite(                 h.sync_pread(             h.wait(
+h.async_pwrite(           h.get_block_size(         h.get_queue_depth(        h.get_intra_op_parallelism(        h.pread(                  h.read(                   h.sync_pwrite(            h.write(
+```
+The APIs of interest for performing I/O operations are those named with `pread` and `pwrite` substrings. For brevity, we will focus on the file write APIs, namely `sync_pwrite`, `async_pwrite`, and `pwrite`. We will discuss only `sync_pwrite` and `async_pwrite` below because they are specializations of `pwrite`.
+
+### Blocking File Write
+`sync_pwrite` provides the standard blocking semantics of Python file write. The example below illustrates using `sync_pwrite` to store a 1GB CUDA tensor to a local NVMe file.
+ +```bash +>>> import os +>>> os.path.isfile('/local_nvme/test_1GB.pt') +False +>>> import torch +>>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle() +>>> h.sync_pwrite(t,'/local_nvme/test_1GB.pt') +>>> os.path.isfile('/local_nvme/test_1GB.pt') +True +>>> os.path.getsize('/local_nvme/test_1GB.pt') +1073741824 + +``` + +### Non-Blocking File Write +An important DeepNVMe optimization is the non-blocking I/O semantics which enables Python threads to overlap computations with I/O operations. `async_pwrite` provides the non-blocking semantics for file writes. The Python thread can later use `wait()` to synchronize with the I/O operation. `async_write` can also be used to submit multiple back-to-back non-blocking I/O operations, of which can then be later blocked on using a single `wait()`. The example below illustrates using `async_pwrite` to store a 1GB CUDA tensor to a local NVMe file. + +```bash +>>> import os +>>> os.path.isfile('/local_nvme/test_1GB.pt') +False +>>> import torch +>>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle() +>>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') +>>> h.wait() +1 +>>> os.path.isfile('/local_nvme/test_1GB.pt') +True +>>> os.path.getsize('/local_nvme/test_1GB.pt') +1073741824 +``` + +Warning for non-blocking I/O operations: To avoid data races and corruptions, `.wait()` must be carefully used to serialize the writing of source tensors, and the reading of destination tensors. For example, the following update of `t` during a non-blocking file write is unsafe and could corrupt `/local_nvme/test_1GB.pt`. + +```bash +>>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle() +>>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') +>>> t += 1 # <--- Data race; avoid by preceding with `h.wait()` +``` + +Similar safety problems apply to reading the destination tensor of a non-blocking file read without `.wait()` synchronization. + + +### Parallel File Write +An important DeepNVMe optimization is the ability to parallelize individual I/O operations. This optimization is enabled by specifying the desired parallelism degree when constructing a DeepNVMe handle. Subsequent I/O operations with that handle are automatically parallelized over the requested number of host or device threads, as appropriate. I/O parallelism is composable with either the blocking or non-blocking I/O APIs. The example below illustrates 4-way parallelism of a file write using `async_pwrite`. Note the use of `intra_op_parallelism` argument to specify the desired parallelism degree in handle creation. + +```bash +>>> import os +>>> os.path.isfile('/local_nvme/test_1GB.pt') +False +>>> import torch +>>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle(intra_op_parallelism=4) +>>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') +>>> h.wait() +1 +>>> os.path.isfile('/local_nvme/test_1GB.pt') +True +>>> os.path.getsize('/local_nvme/test_1GB.pt') +1073741824 +``` + +### Pinned Tensors +A key part of DeepNVMe optimizations is using direct memory access (DMA) for I/O operations, which requires that the host or device tensor be pinned. 
To pin host tensors, you can use mechanisms provided by [Pytorch](https://pytorch.org/docs/stable/generated/torch.Tensor.pin_memory.html) or [DeepSpeed Accelerators](/tutorials/accelerator-abstraction-interface/#tensor-operations). The following example illustrates writing a pinned CPU tensor to a local NVMe file. + +```bash +>>> import os +>>> os.path.isfile('/local_nvme/test_1GB.pt') +False +>>> import torch +>>> t=torch.empty(1024**3, dtype=torch.uint8).pin_memory() +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle() +>>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') +>>> h.wait() +1 +>>> os.path.isfile('/local_nvme/test_1GB.pt') +True +>>> os.path.getsize('/local_nvme/test_1GB.pt') +1073741824 +``` + +On the other hand,`gds_handle` provides `new_pinned_device_tensor()` and `pin_device_tensor()` functions for pinning CUDA tensors. The following example illustrates writing a pinned CUDA tensor to a local NVMe file. + +```bash +>>> import os +>>> os.path.isfile('/local_nvme/test_1GB.pt') +False +>>> import torch +>>> t=torch.empty(1024**3, dtype=torch.uint8).cuda() +>>> from deepspeed.ops.op_builder import GDSBuilder +>>> h = GDSBuilder().load().gds_handle() +>>> h.pin_device_tensor(t) +>>> h.async_pwrite(t,'/local_nvme/test_1GB.pt') +>>> h.wait() +1 +>>> os.path.isfile('/local_nvme/test_1GB.pt') +True +>>> os.path.getsize('/local_nvme/test_1GB.pt') +1073741824 +>>> h.unpin_device_tensor(t) +``` + + +## Putting it together +We hope that the above material helps you to get started with DeepNVMe. You can also use the following links to see DeepNVMe usage in real-world Deep Learning applications. + +1. [Parameter swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_param_swapper.py#L111-L117) in [ZeRO-Inference](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/inference/huggingface/zero_inference/README.md) and [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +2. [Optimizer swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L36-L38) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +3. [Gradient swapper](https://github.com/deepspeedai/DeepSpeed/blob/9b7fc5452471392b0f58844219fcfdd14a9cdc77/deepspeed/runtime/swap_tensor/partitioned_optimizer_swapper.py#L41-L43) in [ZeRO-Infinity](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/). +4. Simple file read and write [operations](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/deepnvme/file_access/README.md). + + + + +## Acknowledgements +This tutorial has been significantly improved by feedback from [Guanhua Wang](https://github.com/GuanhuaWang), [Masahiro Tanaka](https://github.com/tohtana), and [Stas Bekman](https://github.com/stas00). + +## Appendix + +### Advanced Handle Creation +Achieving peak I/O performance with DeepNVMe requires careful configuration of handle creation. 
In particular, the parameters of `aio_handle` and `gds_handle` constructors are performance-critical because they determine how efficiently DeepNVMe interacts with the underlying storage subsystem (i.e., `libaio`, GDS, PCIe, and SSD). For convenience we make it possible to create handles using default parameter values which will provide decent performance in most scenarios. However, squeezing out every available performance in your environment will likely require tuning the constructor parameters, namely `block_size`, `queue_depth`, `single_submit`, `overlap_events`, and `intra_op_parallelism`. The `aio_handle` constructor parameters and default values are illustrated below: +```bash +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> help(AsyncIOBuilder().load().aio_handle()) +Help on aio_handle in module async_io object: + +class aio_handle(pybind11_builtins.pybind11_object) + | Method resolution order: + | aio_handle + | pybind11_builtins.pybind11_object + | builtins.object + | + | Methods defined here: + | + | __init__(...) + | __init__(self: async_io.aio_handle, block_size: int = 1048576, queue_depth: int = 128, single_submit: bool = False, overlap_events: bool = False, intra_op_parallelism: int = 1) -> None + | + | AIO handle constructor +``` + +### Performance Tuning +As discussed [earlier](#advanced-handle-creation), achieving peak DeepNVMe performance for a target workload or environment requires using optimally configured `aio_handle` or `gds_handle` handles. For configuration convenience, we provide a utility called `ds_nvme_tune` to automate the discovery of optimal DeepNVMe configurations. `ds_nvme_tune` automatically explores a user-specified or default configuration space and recommends the option that provides the best read and write performance. Below is an example usage of `ds_nvme_tune` to tune `aio_handle` data transfers between GPU memory and a local NVVMe SSD mounted on `/local_nvme`. This example used the default configuration space of `ds_nvme_tune` for tuning. + +```bash +$ ds_nvme_tune --nvme_dir /local_nvme --gpu +Running DeepNVMe performance tuning on ['/local_nvme/'] +Best performance (GB/sec): read = 3.69, write = 3.18 +{ + "aio": { + "single_submit": "false", + "overlap_events": "true", + "intra_op_parallelism": 8, + "queue_depth": 32, + "block_size": 1048576 + } +} +``` +The above tuning was executed on a Lambda workstation equipped with two NVIDIA A6000-48GB GPUs, 252GB of DRAM, and a [CS3040 NVMe 2TB SDD](https://www.pny.com/CS3040-M2-NVMe-SSD?sku=M280CS3040-2TB-RB) with peak read and write speeds of 5.6 GB/s and 4.3 GB/s respectively. The tuning required about four and half minutes. Based on the results, one can expect to achieve read and write transfer speeds of 3.69 GB/sec and 3.18 GB/sec respectively by using an `aio_handle` configured as below. + +```python +>>> from deepspeed.ops.op_builder import AsyncIOBuilder +>>> h = AsyncIOBuilder().load().aio_handle(block_size=1048576, + queue_depth=32, + single_submit=False, + overlap_events=True, + intra_op_parallelism=8) +``` + + +The full command line options of `ds_nvme_tune` can be obtained via the normal `-h` or `--help`. +```bash +usage: ds_nvme_tune [-h] --nvme_dir NVME_DIR [NVME_DIR ...] [--sweep_config SWEEP_CONFIG] [--no_read] [--no_write] [--io_size IO_SIZE] [--gpu] [--gds] [--flush_page_cache] [--log_dir LOG_DIR] [--loops LOOPS] [--verbose] + +options: + -h, --help show this help message and exit + --nvme_dir NVME_DIR [NVME_DIR ...] + Directory in which to perform I/O tests. 
A writeable directory on a NVMe device. + --sweep_config SWEEP_CONFIG + Performance sweep configuration json file. + --no_read Disable read performance measurements. + --no_write Disable write performance measurements. + --io_size IO_SIZE Number of I/O bytes to read/write for performance measurements. + --gpu Test tensor transfers between GPU device and NVME device. + --gds Run the sweep over NVIDIA GPUDirectStorage operator + --flush_page_cache Page cache will not be flushed and reported read speeds may be higher than actual ***Requires sudo access***. + --log_dir LOG_DIR Output directory for performance log files. Default is ./_aio_bench_logs + --loops LOOPS Count of operation repetitions + --verbose Print debugging information. +``` + +### DeepNVMe APIs +For convenience, we provide listing and brief descriptions of the DeepNVMe APIs. + +#### General I/O APIs +The following functions are used for I/O operations with both `aio_handle` and `gds_handle`. + +Function | Description | +|---|---| +async_pread | Non-blocking file read into tensor | +sync_pread | Blocking file read into tensor | +pread | File read with blocking and non-blocking options | +async_pwrite | Non-blocking file write from tensor | +sync_pwrite | Blocking file write from tensor | +pwrite | File write with blocking and non-blocking options | +wait | Wait for non-blocking I/O operations to complete | + +#### GDS-specific APIs +The following functions are available only for `gds_handle` + +Function | Description +|---|---| +new_pinned_device_tensor | Allocate and pin a device tensor | +free_pinned_device_tensor | Unpin and free a device tensor | +pin_device_tensor | Pin a device tensor | +unpin_device_tensor | unpin a device tensor | + + +#### Handle Settings APIs +The following APIs can be used to probe handle configuration. + +Function | Description +|---|---| +get_queue_depth | Return queue depth setting | +get_single_submit | Return whether single_submit is enabled | +get_intra_op_parallelism | Return I/O parallelism degree | +get_block_size | Return I/O block size setting | +get_overlap_events | Return whether overlap_event is enabled | diff --git a/docs/_tutorials/domino.md b/docs/_tutorials/domino.md new file mode 100644 index 000000000000..e1cb704fc229 --- /dev/null +++ b/docs/_tutorials/domino.md @@ -0,0 +1,6 @@ +--- +title: "Domino" +tags: training +--- + +Domino achieves near-complete communication hiding behind computation for tensor parallel training. Please find our [Domino-tutorial](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/training/DeepSpeed-Domino/README.md) in DeepSpeedExample repo. diff --git a/docs/_tutorials/ds-sequence.md b/docs/_tutorials/ds-sequence.md index 815b99d6de35..41a76b784782 100755 --- a/docs/_tutorials/ds-sequence.md +++ b/docs/_tutorials/ds-sequence.md @@ -3,7 +3,7 @@ title: "Getting Started with DeepSpeed-Ulysses for Training Transformer Models w tags: training --- -In this tutorial we describe how to enable DeepSpeed-Ulysses. DeepSpeed-Ulysses is a simple but highly communication and memory efficient mechanism sequence parallelism approach for training of large transformer models with massive sequence lengths. It partitions input tensors along the sequence dimension and uses a communication-efficient all-2-all collective for distributed attention computations. Additionally, DeepSpeed-Ulysses incorporates advanced modeling and system optimizations, such as Flash attention, sparse attention, and ZeRO optimizer, to optimize both computational efficiency and memory usage. 
Training with DeepSpeed sequence parallelism allows both model size and sequence length to scale near indefinitely unbounded by single GPU memory limitation and at a high fraction of peak compute performance. Currently, DeepSpeed-Ulysses can handle sequences up to 1 million in length (10 times the size of a complete Harry Potter book!) on 64 A100 GPUs. Please read our [DeepSpeed-Ulysses blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-ulysses) to learn more! +In this tutorial we describe how to enable DeepSpeed-Ulysses. DeepSpeed-Ulysses is a simple but highly communication and memory efficient mechanism sequence parallelism approach for training of large transformer models with massive sequence lengths. It partitions input tensors along the sequence dimension and uses a communication-efficient all-2-all collective for distributed attention computations. Additionally, DeepSpeed-Ulysses incorporates advanced modeling and system optimizations, such as Flash attention, sparse attention, and ZeRO optimizer, to optimize both computational efficiency and memory usage. Training with DeepSpeed sequence parallelism allows both model size and sequence length to scale near indefinitely unbounded by single GPU memory limitation and at a high fraction of peak compute performance. Currently, DeepSpeed-Ulysses can handle sequences up to 1 million in length (10 times the size of a complete Harry Potter book!) on 64 A100 GPUs. Please read our [DeepSpeed-Ulysses blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-ulysses) to learn more! ## 1. Installation @@ -12,10 +12,10 @@ You will need to install DeepSpeed v0.10.2 or higher to use the DeepSpeed Sequen ## 2. How to use DeepSpeed-Ulysses in your application? -Integrating DS-Seq into your training code is easy, and in this section we describe how to integrate DeepSpeed-Ulysses through our [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) code repo. +Integrating DS-Seq into your training code is easy, and in this section we describe how to integrate DeepSpeed-Ulysses through our [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) code repo. -* **Replace attention module**: First, you need to update your attention module with DeepSpeed-Ulysses DistributedAttention. Here, we use the attention from [Megatron-DeepSpeed ](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/megatron/model/transformer.py) which is the causal attention used in GPT-3 like model training. Rewrite the attention block: +* **Replace attention module**: First, you need to update your attention module with DeepSpeed-Ulysses DistributedAttention. Here, we use the attention from [Megatron-DeepSpeed ](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/megatron/model/transformer.py) which is the causal attention used in GPT-3 like model training. Rewrite the attention block: ```python def __init__(): @@ -49,7 +49,7 @@ def forward(): ``` -* **Add sequence parallel communication group**: Note that DistributedAttention takes `local_attn` and `sequence_parallel_group` as the parameters, where local_attn can be your original attention block. You also need to build the sequence parallel nication group and pass that the DistributedAttention. One way to do this is to build the sequence parallel group at the model initialization stage. 
+* **Add sequence parallel communication group**: Note that DistributedAttention takes `local_attn` and `sequence_parallel_group` as the parameters, where local_attn can be your original attention block. You also need to build the sequence parallel communication group and pass that the DistributedAttention. One way to do this is to build the sequence parallel group at the model initialization stage. ```python @@ -94,7 +94,7 @@ DeepSpeed's sequence parallelism can be combined with different types of attenti `FlashAttention`: the implementation from [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135). Enabled by `--use-flash-attn`. -`FlashAttention + Triton`: a of FlashAttention in Triton (tested with triton==2.0.0.dev20221202). Enabled by `--use-flash-attn-triton`. +`FlashAttention + Triton`: FlashAttention in Triton (tested with triton==2.0.0.dev20221202). Enabled by `--use-flash-attn-triton`. For the best performance, we recommend using FlashAttention + Triton. Below are the installation steps. Note that FlashAttention is compatible only with NVIDIA Turing, Ampere, Ada, or Hopper GPUs. @@ -111,7 +111,7 @@ pip install . cd ${WORK_DIR} git clone -b v1.0.4 https://github.com/HazyResearch/flash-attention cd flash-attention -python setup.py install +python -m pip install . ``` -You may also want to ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the document of FlashAttention for more details. +You may also want to ensure your model configuration is compliant with FlashAttention's requirements. For instance, to achieve optimal performance, the head size should be divisible by 8. Refer to the FlashAttention documentation for more details. diff --git a/docs/_tutorials/flops-profiler.md b/docs/_tutorials/flops-profiler.md index 24efc238615a..d4a7496405b9 100644 --- a/docs/_tutorials/flops-profiler.md +++ b/docs/_tutorials/flops-profiler.md @@ -184,7 +184,7 @@ When using DeepSpeed for model training, the profiler can be configured in the d #### Example: Megatron-LM -For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/microsoft/DeepSpeedExamples/tree/master/megatron/Megatron-LM). +For information on running Megatron-LM with DeepSpeed, please refer to our tutorial [Megatron-LM](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/megatron/Megatron-LM). An example output of 12-layer Megatron-LM model (`hidden_size = 8192, num_attention_heads = 32, batch_size = 1024, seq_length = 1024`) is shown below. diff --git a/docs/_tutorials/gan.md b/docs/_tutorials/gan.md index 09572a439eb0..db3734fb3b96 100755 --- a/docs/_tutorials/gan.md +++ b/docs/_tutorials/gan.md @@ -16,7 +16,7 @@ Please go through the [original tutorial](https://pytorch.org/tutorials/beginner ## Enabling DeepSpeed -The codes may be obtained [here](https://github.com/microsoft/DeepSpeedExamples/tree/master/gan). +The codes may be obtained [here](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/gan). 
### Argument Parsing
diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md
index 8d2bbf2d9964..2c6e27d1319d 100644
--- a/docs/_tutorials/getting-started.md
+++ b/docs/_tutorials/getting-started.md
@@ -8,9 +8,10 @@ tags: getting-started
## Installation
* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
-* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed)
-* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/main_classes/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed).
+* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/cli/jobs/deepspeed)
+* DeepSpeed has direct integrations with [HuggingFace Transformers](https://github.com/huggingface/transformers) and [PyTorch Lightning](https://github.com/PyTorchLightning/pytorch-lightning). HuggingFace Transformers users can now easily accelerate their models with DeepSpeed through a simple ``--deepspeed`` flag + config file [See more details](https://huggingface.co/docs/transformers/deepspeed). PyTorch Lightning provides easy access to DeepSpeed through the Lightning Trainer [See more details](https://pytorch-lightning.readthedocs.io/en/stable/advanced/multi_gpu.html?highlight=deepspeed#deepspeed).
* DeepSpeed on AMD can be used via our [ROCm images](https://hub.docker.com/r/deepspeed/rocm501/tags), e.g., `docker pull deepspeed/rocm501:ds060_pytorch110`.
+* DeepSpeed also supports Intel Xeon CPU, Intel Data Center Max Series XPU, Intel Gaudi HPU, Huawei Ascend NPU, etc. Please refer to the [accelerator setup guide](/tutorials/accelerator-setup-guide/)


@@ -226,6 +227,36 @@ deepspeed --include="worker-2:0,1" \
<client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
```
+### Launching without passwordless SSH
+
+DeepSpeed now supports launching training jobs without the need for passwordless SSH. This mode is
+particularly useful in cloud environments such as Kubernetes, where flexible container orchestration
+is possible, and setting up a leader-worker architecture with passwordless SSH adds unnecessary
+complexity.
+
+To use this mode, you need to run the DeepSpeed command separately on all nodes. The command should
+be structured as follows:
+
+```bash
+deepspeed --hostfile=myhostfile --no_ssh --node_rank=<n> \
+    --master_addr=<addr> --master_port=<port> \
+    <client_entry.py> <client args> \
+    --deepspeed --deepspeed_config ds_config.json
+```
+
+- `--hostfile=myhostfile`: Specifies the hostfile that contains information about the nodes and GPUs.
+- `--no_ssh`: Enables the no-SSH mode.
+- `--node_rank=<n>`: Specifies the rank of the node. This should be a unique integer from 0 to n - 1.
+- `--master_addr=<addr>`: The address of the leader node (rank 0).
+- `--master_port=<port>`: The port of the leader node.
+
+In this setup, the hostnames in the hostfile do not need to be reachable via passwordless SSH.
+However, the hostfile is still required for the launcher to collect information about the environment, +such as the number of nodes and the number of GPUs per node. + +Each node must be launched with a unique `node_rank`, and all nodes must be provided with the address +and port of the leader node (rank 0). This mode causes the launcher to act similarly to the `torchrun` +launcher, as described in the [PyTorch documentation](https://pytorch.org/docs/stable/elastic/run.html). ## Multi-Node Environment Variables @@ -285,10 +316,14 @@ local machine to discover the number of slots available. The `--include` and `--exclude` arguments work as normal, but the user should specify 'localhost' as the hostname. -Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control -which devices should be used. For example, to use only gpu1 of the current -node, do: +Also note that `CUDA_VISIBLE_DEVICES` can be used with `deepspeed` to control +which devices should be used on a single node. So either of these would work +to launch just on devices 0 and 1 of the current node: + +```bash +deepspeed --include localhost:0,1 ... +``` ```bash -deepspeed --include localhost:1 ... +CUDA_VISIBLE_DEVICES=0,1 deepspeed ... ``` diff --git a/docs/_tutorials/inference-tutorial.md b/docs/_tutorials/inference-tutorial.md index 6330198053e7..ddf287f24b96 100644 --- a/docs/_tutorials/inference-tutorial.md +++ b/docs/_tutorials/inference-tutorial.md @@ -3,15 +3,17 @@ title: "Getting Started with DeepSpeed for Inferencing Transformer based Models" tags: inference --- +>**DeepSpeed-Inference v2 is here and it's called DeepSpeed-FastGen! For the best performance, latest features, and newest model support please see our [DeepSpeed-FastGen release blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-fastgen)!** + DeepSpeed-Inference introduces several features to efficiently serve transformer-based PyTorch models. It supports model parallelism (MP) to fit large models that would otherwise not fit in GPU memory. Even for smaller models, MP can be used to reduce latency for inference. To further reduce latency and cost, we introduce inference-customized kernels. Finally, we propose a novel approach to quantize models, called MoQ, to both shrink the model and reduce the inference cost at production. For more details on the inference related optimizations in DeepSpeed, please refer to our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/). -DeepSpeed provides a seamless inference mode for compatible transformer based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high performance kernels into your model and manage the inter-gpu communication. For list of compatible models please see [here](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py). 
+DeepSpeed provides a seamless inference mode for compatible transformer-based models trained using DeepSpeed, Megatron, and HuggingFace, meaning that we don’t require any change on the modeling side such as exporting the model or creating a different checkpoint from your trained checkpoints. To run inference on multi-GPU for compatible models, provide the model parallelism degree and the checkpoint information or the model which is already loaded from a checkpoint, and DeepSpeed will do the rest. It will automatically partition the model as necessary, inject compatible high-performance kernels into your model and manage the inter-GPU communication. For a list of compatible models please see [here](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py).

## Initializing for Inference

For inference with DeepSpeed, use the `init_inference` API to load the model for inference. Here, you can specify the MP degree, and if the model has not been loaded with the appropriate checkpoint, you can also provide the checkpoint description using a `json` file or the checkpoint path.

-To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True for the compatible models. For models not supported by DeepSpeed, the users can submit a PR that defines a new policy in [replace_policy class](https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The policy classes in DeepSpeed create a mapping between the parameters of the original user-supplied layer implementation with DeepSpeed's inference-optimized Transformer layer.
+To inject the high-performance kernels, you need to set the `replace_with_kernel_inject` to True for the compatible models. For models not supported by DeepSpeed, the users can submit a PR that defines a new policy in [replace_policy class](https://github.com/deepspeedai/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py) that specifies the different parameters of a Transformer layer, such as attention and feed-forward parts. The policy classes in DeepSpeed create a mapping between the parameters of the original user-supplied layer implementation with DeepSpeed's inference-optimized Transformer layer.

```python
# create the model
@@ -19,18 +21,22 @@ if args.pre_load_checkpoint:
model = model_class.from_pretrained(args.model_name_or_path)
else:
model = model_class()
+
+# create the tokenizer (assumes a Hugging Face tokenizer class, e.g. AutoTokenizer)
+tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
...

import deepspeed

# Initialize the DeepSpeed-Inference engine
ds_engine = deepspeed.init_inference(model,
-                                 mp_size=2,
-                                 dtype=torch.half,
-                                 checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json,
-                                 replace_with_kernel_inject=True)
+                                 tensor_parallel={"tp_size": world_size},
+                                 dtype=torch.half,
+                                 checkpoint=None if args.pre_load_checkpoint else args.checkpoint_json,
+                                 replace_with_kernel_inject=True)
model = ds_engine.module
-output = model('Input String')
+pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+output = pipe('Input String')
```

To run inference with only model-parallelism for models for which we don't support kernels, you can pass an injection policy that shows the two specific linear layers on a Transformer Encoder/Decoder layer: 1) the attention output GeMM and 2) layer output GeMM.
We need these part of the layer to add the required all-reduce communication between GPUs to merge the partial results across model-parallel ranks. Below, we bring an example that shows how you can use deepspeed-inference with a T5 model: @@ -47,7 +53,7 @@ pipe = pipeline("text2text-generation", model="google/t5-v1_1-small", device=loc # Initialize the DeepSpeed-Inference engine pipe.model = deepspeed.init_inference( pipe.model, - mp_size=world_size, + tensor_parallel={"tp_size": world_size}, dtype=torch.float, injection_policy={T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')} ) @@ -108,7 +114,7 @@ generator = pipeline('text-generation', model='EleutherAI/gpt-neo-2.7B', generator.model = deepspeed.init_inference(generator.model, - mp_size=world_size, + tensor_parallel={"tp_size": world_size}, dtype=torch.float, replace_with_kernel_inject=True) diff --git a/docs/_tutorials/large-models-w-deepspeed.md b/docs/_tutorials/large-models-w-deepspeed.md index 8e09cccee1fe..3d0bae0144b4 100644 --- a/docs/_tutorials/large-models-w-deepspeed.md +++ b/docs/_tutorials/large-models-w-deepspeed.md @@ -28,7 +28,7 @@ Since, ZeRO is a replacement to data parallelism, it offers a seamless integrati ## Deciding which technology to use -**3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/microsoft/megatron-deepspeed) repo. For more information on 3D parallelism please checkout the resources below: +**3D Parallelism for GPT-2/GPT-3 like models**: If you are attempting to train a model whose architecture resembles very closely with GPT-2 or GPT-3, then we have already done the hard work of porting 3D parallelism to a GPT-2/GPT-3 architecture-based model and have created a training pipeline that you can use to efficiently train models with hundreds of billion or even trillions of parameters. Both Megatron-Turing NLG 530B and Big Science use a variation of this code base to scale the model training. You can find the code and tutorial to get started in the [DeepSpeed-Megatron GPT-3](https://github.com/deepspeedai/megatron-deepspeed) repo. For more information on 3D parallelism please checkout the resources below: [3D Parallelism Tutorial](https://www.deepspeed.ai/tutorials/pipeline/) A generic tutorial on how to port your model to use DeepSpeed 3D parallelism diff --git a/docs/_tutorials/megatron.md b/docs/_tutorials/megatron.md index 5242c8184db8..286a9a36a926 100644 --- a/docs/_tutorials/megatron.md +++ b/docs/_tutorials/megatron.md @@ -19,7 +19,7 @@ reduction_** from using DeepSpeed. ## Training GPT-2 with the Original Megatron-LM -We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/microsoft/Megatron-DeepSpeed) and made it available as a submodule. 
To download, execute: +We've copied the original model code from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) into DeepSpeed [Megatron-LM](https://github.com/deepspeedai/Megatron-DeepSpeed) and made it available as a submodule. To download, execute: ```bash git submodule update --init --recursive ``` diff --git a/docs/_tutorials/mixed_precision_zeropp.md b/docs/_tutorials/mixed_precision_zeropp.md index 12ad3556abde..9429b75bac10 100644 --- a/docs/_tutorials/mixed_precision_zeropp.md +++ b/docs/_tutorials/mixed_precision_zeropp.md @@ -3,7 +3,7 @@ title: "Mixed Precision ZeRO++" tags: training ZeRO communication-efficiency large-model --- -Mixed Precision ZeRO++ (MixZ++) is a set of optimization strategies based on [ZeRO](/tutorials/zero/) and [ZeRO++](/tutorials/zeropp/) to improve the efficiency and reduce memory usage for large model training and inference when users use [Low-Rank Adaptation (LoRA)]([/tutorials/zero/](https://arxiv.org/abs/2106.09685)) training. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by up to [3.3x](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31) for the Llama-2-70B model running on 128 V100 GPUs. Read our [DeepSpeed Chat Blog](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31), [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) and [paper](https://arxiv.org/pdf/2306.10209.pdf) to learn more! +Mixed Precision ZeRO++ (MixZ++) is a set of optimization strategies based on [ZeRO](/tutorials/zero/) and [ZeRO++](/tutorials/zeropp/) to improve the efficiency and reduce memory usage for large model training and inference when users use [Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685) training. MixZ++ partitions model parameters across GPUs to reduce footprint and gathers them with quantized communication only when needed, similar to its ZeRO and ZeRO++ siblings. Our evaluation indicates MixZ++ increases the training throughput by up to [3.3x](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31) for the Llama-2-70B model running on 128 V100 GPUs. Read our [DeepSpeed Chat Blog](https://github.com/deepspeedai/DeepSpeed/tree/master/blogs/deepspeed-chat/ds-chat-release-8-31), [ZeRO++ blog](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) and [paper](https://arxiv.org/pdf/2306.10209.pdf) to learn more! We recommend that you read the tutorials on [Getting Started](/getting-started/), [ZeRO](/tutorials/zero/) and [Megatron-DeepSpeed](/tutorials/megatron/) before stepping through this tutorial. @@ -16,7 +16,7 @@ Collectively, the optimizations bring better scalability and efficiency to LoRA ## Enabling Mixed Precision ZeRO++ (MixZ++) -A ready to go MixZ++ example has been prepared at [MixZ++ example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh). If you prefer to manually enable MixZ++ in your pipeline, please refer to the instructions below.
+A ready to go MixZ++ example has been prepared at [MixZ++ example script](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/applications/DeepSpeed-Chat/training/step3_rlhf_finetuning/training_scripts/llama2/run_llama2_7b_mixz.sh). If you prefer to manually enable MixZ++ in your pipeline, please refer to the instructions below. ### DeepSpeed Configuration Changes An example snippet of deepspeed configurations with all MixZ++ optimization enabled is shown below: diff --git a/docs/_tutorials/mixture-of-experts-inference.md b/docs/_tutorials/mixture-of-experts-inference.md index 882ad7aefd1f..675815dd5d57 100644 --- a/docs/_tutorials/mixture-of-experts-inference.md +++ b/docs/_tutorials/mixture-of-experts-inference.md @@ -54,7 +54,7 @@ output = model('Input String') Here, we show a text-generation example using an MoE model for which we can specify the model-parallel size and number of experts. DeepSpeed inference-engine takes care of creating the different parallelism groups using the tensor-slicing degree, number of experts, and the total number of GPUs used for running the MoE model. Regarding the expert parameters, we first use the expert-parallelism to assign each group of experts to one GPU. If number of GPUs is higher than number of experts, we use expert-slicing to partition each expert vertically/horizontally across the GPUs. -Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples_deepspeed/generate_text.sh) for a complete generate-text inference example. +Let's take a look at some of the parameters passed to run our example. Please refer to [DeepSpeed-Example](https://github.com/deepspeedai/Megatron-DeepSpeed/blob/main/examples_deepspeed/generate_text.sh) for a complete generate-text inference example. ```bash diff --git a/docs/_tutorials/mixture-of-experts-nlg.md b/docs/_tutorials/mixture-of-experts-nlg.md index 6fc7022ba1fb..c4fb072dd82d 100755 --- a/docs/_tutorials/mixture-of-experts-nlg.md +++ b/docs/_tutorials/mixture-of-experts-nlg.md @@ -7,7 +7,7 @@ In this tutorial, we introduce how to apply DeepSpeed Mixture of Experts (MoE) t ## 1. Installation -You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) repo under the MoE folder. +You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The MoE for NLG model examples are in the [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) repo under the MoE folder. ## 2. Training NLG+MoE models @@ -15,7 +15,7 @@ You would need to install DeepSpeed v0.6.0 or higher to use the MoE feature. The To apply MoE to the GPT-style model, we made several changes in Megatron framework, mostly in `megatron/model/` where we add the MoE layers into the model. ### 2.2. Pre-training the Standard MoE model -We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). There are a few new hyperparameters for standard MoE model: +We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE) which we used to perform the experiments in our [Blog]({{ site.press_release_v6 }}). 
There are a few new hyperparameters for standard MoE model: `--num-experts`: the number of experts per MoE layer. In our experiments we set it to 128. Larger number of experts tend to provide better convergence, but it's a diminishing return. @@ -30,7 +30,7 @@ We provide example training scripts under [examples_deepspeed/MoE](https://githu ### 2.3. Pre-training the PR-MoE model -PR-MoE is a new designed MoE models, standing for Pyramid-Residual-MoE, which improves the parameter efficiency up to 3x as compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). There are a few different hyperparameters for PR-MoE model compared to standard MoE: +PR-MoE is a newly designed MoE model, standing for Pyramid-Residual-MoE, which improves parameter efficiency by up to 3x compared to standard MoE. Please see our [Blog]({{ site.press_release_v6 }}) for more details. We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). There are a few different hyperparameters for the PR-MoE model compared to standard MoE: `--num-experts`: Instead of providing a single number, to enable Pyramid-MoE, you need to provide a list, whose length is the same as the number of MoE layers. We suggest to use more experts in the latter stage (close to output) of the model. @@ -67,4 +67,4 @@ MoS, standing for Mixture-of-Students, is a staged distillation-based technique In addition to the new parameters above, we observe that using the teacher PR-MoE during the entire training process may adversely impact the final student model accuracy. In our experiments, we use a staged distillation method by stopping distillation early in the training process (e.g., after 400K steps) and perform optimization only against the standard language modeling loss for the rest of the training. -We provide example training scripts under [examples_deepspeed/MoE](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). +We provide example training scripts under [examples_deepspeed/MoE](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/MoE). Details of our parameter settings can be found in the example training scripts. The performance results of MoS can be seen from our [blog post](https://www.microsoft.com/en-us/research/blog/deepspeed-powers-8x-larger-moe-model-training-with-high-performance/) and our [paper](https://arxiv.org/abs/2201.05596). diff --git a/docs/_tutorials/mixture-of-experts.md b/docs/_tutorials/mixture-of-experts.md index e7739a6a5051..d4604b929ff4 100644 --- a/docs/_tutorials/mixture-of-experts.md +++ b/docs/_tutorials/mixture-of-experts.md @@ -13,7 +13,7 @@ For more details on results and further discussion, please see our press release {: .notice--info} As a simple starting point we will show how to apply DeepSpeed MoE to a cifar10 example. Please refer to -our [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) going forward.
+our [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) going forward. If you are adding MoE to an existing model you can use the snippet below to help guide you: @@ -104,11 +104,11 @@ fc4 = torch.nn.Linear(84, 10) ``` -For a runnable end-to-end example that covers both the standard MoE architecture as well as the PR-MoE model , please look at the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar). In addition, see the advanced usage section of this tutorial that links to a more comprehensive example for NLG models. +For a runnable end-to-end example that covers both the standard MoE architecture as well as the PR-MoE model , please look at the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar). In addition, see the advanced usage section of this tutorial that links to a more comprehensive example for NLG models. ### Combining ZeRO-Offload and DeepSpeed MoE for very large models -To use MoE Layers in DeepSpeed, we rely on two parameter groups that are passed to an optimizer. A concrete example to create such groups is available from the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar). +To use MoE Layers in DeepSpeed, we rely on two parameter groups that are passed to an optimizer. A concrete example to create such groups is available from the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar). The relevant function that creates these param groups is as follows. @@ -124,7 +124,6 @@ def create_moe_param_groups(model): The above param groups can then be fed to the ZeRO stage-2 optimizer as follows. ```python - net = Net() parameters = create_moe_param_groups(net) @@ -135,7 +134,7 @@ model_engine, optimizer, trainloader, __ = deepspeed.initialize( We are working on automating this functionality in the DeepSpeed ZeRO optimizer so the model training code can be simplified further. -To run the [cifar10 example](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) with ZeRO-Offload (stage 2) and MoE, please set the ds_config flags +To run the [cifar10 example](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) with ZeRO-Offload (stage 2) and MoE, please set the ds_config flags ```json "zero_optimization": { diff --git a/docs/_tutorials/model-compression.md b/docs/_tutorials/model-compression.md index c8713cb1f616..d11eadc3d726 100644 --- a/docs/_tutorials/model-compression.md +++ b/docs/_tutorials/model-compression.md @@ -25,7 +25,7 @@ If the model is very deep, you may consider using this method. It works much bet Layer reduction can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#layer-reduction)). Users have the freedom to select any depth by `keep_number_layer` and any subset of the network layers by `teacher_layer`. In addition, users also can choose whether to reinitialize the input/output layers from the given model (teacher model) by `other_module_name`. -To apply layer reduction for task-specific compression, we provide an example on how to do so for BERT fine-tuning. Layer reduction is about resetting the depth of network architecture and reinitialization of weight parameters, which happens before the training process. 
The example includes the following changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): +To apply layer reduction for task-specific compression, we provide an example on how to do so for BERT fine-tuning. Layer reduction is about resetting the depth of network architecture and reinitialization of weight parameters, which happens before the training process. The example includes the following changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)): (1) When initial the model, the number of layers in the model config should be the same as `keep_number_layer` in DeepSpeed config JSON file. For Hugging Face BERT example, set `config.num_hidden_layers = ds_config["compression_training"]["layer_reduction"]["keep_number_layer"]`. @@ -33,7 +33,7 @@ To apply layer reduction for task-specific compression, we provide an example on (3) During training, if KD is not used, nothing needs to be done. Otherwise, one needs to consider applying KD with the `teacher_layer` JSON configuration when calculating the difference between teacher’s and student’s output. -One can run our layer reduction example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our layer reduction example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -49,7 +49,7 @@ Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.834029 To apply layer reduction for task-agnostic compression, we provide an example on how to do so in the GPT pre-training stage. -Step 1: Obtain the latest version of the [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed). +Step 1: Obtain the latest version of the [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed). Step 2: Enter `Megatron-DeepSpeed/examples_deepspeed/compression` directory. @@ -97,13 +97,13 @@ Weight quantization can be enabled and configured using the DeepSpeed config JSO (4)`start_bit` and `target_bit`, to simplify the first experiment we suggest to set them the same such that we apply quantization to the target bit once the iteration reaches `schedule_offset`. -There are two changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)): +There are two changes to the client code (`compression/bert/run_glue_no_trainer.py` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)): (1) After initialization of the model, apply `init_compression` function to the model with DeepSpeed JSON configurations. (2) After training, apply `redundancy_clean` function to save the quantized weight. -One can run our weight quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our weight quantization example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -130,13 +130,13 @@ It can improve computation efficiency similar to [weight quantization](#12-weigh Activation quantization can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#activation-quantization)). 
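Because the later subsections repeatedly note that "The client code change is the same as weight quantization", it may help to see the two client-code changes above in one place. A minimal sketch, assuming a Hugging Face BERT classifier, a hypothetical config file `ds_config_compression.json`, and the `init_compression`/`redundancy_clean` entry points described above (the import path is an assumption and the training loop is omitted):

```python
# A sketch of the two client-code changes described above; not the full
# run_glue_no_trainer.py example.
from transformers import AutoModelForSequenceClassification

from deepspeed.compression.compress import init_compression, redundancy_clean

ds_config_path = "ds_config_compression.json"  # hypothetical config enabling the desired compression
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# (1) After creating the model, apply init_compression so the configured
#     quantization/pruning/layer reduction takes effect during training.
model = init_compression(model, ds_config_path)

# ... fine-tune as usual (optionally with knowledge distillation against a teacher) ...

# (2) After training, apply redundancy_clean to materialize the quantized/pruned
#     weights before saving the final checkpoint.
model = redundancy_clean(model, ds_config_path)
model.save_pretrained("./compressed-bert")
```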
Some of the components are same as weight quantization, such as `schedule_offset` and `quantization_type`. The key configurations we would like to point out are: -(1)`range_calibration`, user has option to set dynamic or static. When using “dynamic”, the activation quantization groups will be automatically set to be token-wise (for Transformer-based models) and image-wise (for CNN-based models). See more in [our ZeroQuant paper](https://arxiv.org/abs/2206.01861) and the code (`deepspeed/compression/basic_layer.py` in [DeepSpeed](https://github.com/microsoft/DeepSpeed)). +(1)`range_calibration`, user has option to set dynamic or static. When using “dynamic”, the activation quantization groups will be automatically set to be token-wise (for Transformer-based models) and image-wise (for CNN-based models). See more in [our ZeroQuant paper](https://arxiv.org/abs/2206.01861) and the code (`deepspeed/compression/basic_layer.py` in [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)). (2)`aq1`/`aq2`, users can expand more groups such as `aq3`, `aq4`, etc. The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our activation quantization example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our activation quantization example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -188,7 +188,7 @@ Sparse pruning can be enabled and configured using the DeepSpeed config JSON fil The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our sparse pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our sparse pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -223,7 +223,7 @@ Row pruning can be enabled and configured using the DeepSpeed config JSON file ( The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our row pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our row pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -260,7 +260,7 @@ Head pruning can be enabled and configured using the DeepSpeed config JSON file The client code change is the same as [weight quantization](#12-weight-quantization). -One can run our head pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our head pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -286,7 +286,7 @@ Channel pruning is a feature designed for two back-to-back CONV2d layers (e.g., Channel pruning can be enabled and configured using the DeepSpeed config JSON file ([configuration details](/docs/config-json/#channel-pruning)). 
-One can run our channel pruning example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our channel pruning example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell pip install torch torchvision @@ -315,7 +315,7 @@ When you want to quantize the transformer-based model to INT8 or INT4/INT8 forma **How to use ZeroQuant** -One can run our BERT example in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) by: +One can run our BERT example in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -363,7 +363,7 @@ If you want to significantly compress your models while retaining competitive pe **How to use XTC** -**Installation:** Examples of XTC extreme compression for BERT models are at `compression/bert/bash_script/XTC` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). You will need to install the requirements by: +**Installation:** Examples of XTC extreme compression for BERT models are at `compression/bert/bash_script/XTC` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). You will need to install the requirements by: ```shell DeepSpeedExamples/compression/bert$ pip install -r requirements.txt @@ -373,7 +373,7 @@ DeepSpeedExamples/compression/bert$ pip install -r requirements.txt To accommodate users who do not have a fine-tuned model or task-specific model for compression, with the arg `--model_name_or_path yoshitomo-matsubara/bert-base-uncased-${TASK_NAME}` our python script `run_glue_no_trainer.py` automatically downloads the models from Hugging Face. Users can also use their own models with better accuracy as the teacher and the student model initialization. ### 3.1 One-bit or Two-bit BERT-base (12-layer) with 8-bit activation quantization -For the configurations, see `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples). In our paper, we used FP32 (`"fp16": {"enabled": false}`) to perform training, while directly applying 8-bit quantization (`"bits": 8`) to the activations and 1-bit quantization (`"start_bits": 1, "target_bits": 1`) to the attention (query, key, val) and feedforward weight matrices (`"modules": ["attention.self", "intermediate", "output.dense"]`) at the beginning of the training (`"schedule_offset": 0`). In addition, we also apply 1-bit quantization to `word_embeddings` as weight quantization. +For the configurations, see `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples). In our paper, we used FP32 (`"fp16": {"enabled": false}`) to perform training, while directly applying 8-bit quantization (`"bits": 8`) to the activations and 1-bit quantization (`"start_bits": 1, "target_bits": 1`) to the attention (query, key, val) and feedforward weight matrices (`"modules": ["attention.self", "intermediate", "output.dense"]`) at the beginning of the training (`"schedule_offset": 0`). In addition, we also apply 1-bit quantization to `word_embeddings` as weight quantization. 
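For orientation, the settings called out in the paragraph above can be sketched as a Python dict in the general shape of a DeepSpeed compression config. This is an illustrative paraphrase rather than the verbatim `ds_config_W1A8_Qgroup1_fp32.json`; the group names are hypothetical and the exact nesting should be taken from that file and the configuration docs linked earlier:

```python
# Illustrative, simplified shape of an XTC-style W1A8 setup; consult the example
# JSON in DeepSpeedExamples for the authoritative field names and grouping.
xtc_like_config = {
    "fp16": {"enabled": False},  # FP32 training, as used in the paper
    "compression_training": {     # assumed top-level section name
        "weight_quantization": {
            "shared_parameters": {
                "enabled": True,
                "quantize_groups": 1,  # a single group to match the FP32 setup
                "schedule_offset": 0,  # quantize from the very first step
            },
            "different_groups": {
                "wq1": {  # hypothetical group name
                    "params": {"start_bits": 1, "target_bits": 1},
                    "modules": ["attention.self", "intermediate", "output.dense", "word_embeddings"],
                },
            },
        },
        "activation_quantization": {
            "shared_parameters": {"enabled": True, "schedule_offset": 0},
            "different_groups": {
                "aq1": {"params": {"bits": 8}, "modules": ["attention.self", "intermediate", "output.dense"]},
            },
        },
    },
}
```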
One can run this example by: @@ -387,7 +387,7 @@ And the final result is: Clean the best model, and the accuracy of the clean model is acc/mm-acc:0.8293428425878757/0.8396053702196908 ``` -The other important feature we would like to mention is the `quantize_groups` inside `weight_quantization`, which is set to be 1 here to match our XTC paper's FP32 training setup. We find that under FP16 training, smaller number of quantization group (e.g., 1 or 2) could lead to unstable training. Thus, we recommend using larger number of groups (e.g., 64) under FP16. `compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the FP16 example configurations, where `"fp16": {"enabled": true}` and `"weight_quantization": {"shared_parameters": {"quantize_weight_in_forward": false}}` are different from FP32 case. +The other important feature we would like to mention is the `quantize_groups` inside `weight_quantization`, which is set to be 1 here to match our XTC paper's FP32 training setup. We find that under FP16 training, smaller number of quantization group (e.g., 1 or 2) could lead to unstable training. Thus, we recommend using larger number of groups (e.g., 64) under FP16. `compression/bert/config/ds_config_W1A8_Qgroup64_fp16.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the FP16 example configurations, where `"fp16": {"enabled": true}` and `"weight_quantization": {"shared_parameters": {"quantize_weight_in_forward": false}}` are different from FP32 case. With this config, we quantize the existing fined-tuned models downloaded from Hugging Face. For 2-bit weight quantization, user needs to update the ds_config JSON file. To give a sense of the compression performance of downloaded models compared to our paper, we collect the results (1/2-bit BERT on MNLI and QQP with 18 training epochs) in table below. The difference between this tutorial and paper is because they use different checkpoints. Data augmentation introduces in [TinyBERT](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) will help significantly for smaller tasks (such as mrpc, rte, sst-b and cola). See more details in [our paper](https://arxiv.org/abs/2206.01859). @@ -399,7 +399,7 @@ This section consists of two parts: (a) we first perform a light-weight layer re **3.2.1 Light-weight Layer Reduction** -`compression/bert/config/XTC/ds_config_layer_reduction_fp16.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration for reducing the 12-layer BERT-base to a 6-layer one. The student’s layers are initialized from i-layer of the teacher with i= [1, 3 ,5 ,7 ,9 ,11] (note that the layer starts from 0), which is called `Skip-BERT_5` in our XTC paper. In addition, student’s modules including embedding, pooler and classifier are also initialized from teacher. For 5-layer layer reduction, one needs to change the configs in `ds_config_layer_reduction_fp16.json` to `"keep_number_layer": 5`, `"teacher_layer": [2, 4 ,6, 8, 10]`(like in `compression/bert/config/ds_config_TEMPLATE.json`). +`compression/bert/config/XTC/ds_config_layer_reduction_fp16.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the example configuration for reducing the 12-layer BERT-base to a 6-layer one. 
The student’s layers are initialized from i-layer of the teacher with i= [1, 3 ,5 ,7 ,9 ,11] (note that the layer starts from 0), which is called `Skip-BERT_5` in our XTC paper. In addition, student’s modules including embedding, pooler and classifier are also initialized from teacher. For 5-layer layer reduction, one needs to change the configs in `ds_config_layer_reduction_fp16.json` to `"keep_number_layer": 5`, `"teacher_layer": [2, 4 ,6, 8, 10]`(like in `compression/bert/config/ds_config_TEMPLATE.json`). One can run this example by: @@ -421,7 +421,7 @@ For mnli/qqp, we set `--num_train_epochs 36`, `--learning_rate 5e-5`, and with t **3.2.2 One-bit or Two-bit quantization for 6-layer (5-layer) BERT** -Given the above layer-reduced models ready, we now continue to compress the model with 1/2-bit quantization. `compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json` in [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) is the example configuration where we set the layer reduction to be true on top of `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json`. In addition to the configuration, we need to update the path for the student model using `--pretrained_dir_student` in the script `compression/bert/bash_script/XTC/layer_reduction_1bit.sh`. User can train with a different teacher model by adding `--pretrained_dir_teacher`. +Given the above layer-reduced models ready, we now continue to compress the model with 1/2-bit quantization. `compression/bert/config/XTC/ds_config_layer_reduction_W1Q8_fp32.json` in [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) is the example configuration where we set the layer reduction to be true on top of `compression/bert/config/XTC/ds_config_W1A8_Qgroup1_fp32.json`. In addition to the configuration, we need to update the path for the student model using `--pretrained_dir_student` in the script `compression/bert/bash_script/XTC/layer_reduction_1bit.sh`. User can train with a different teacher model by adding `--pretrained_dir_teacher`. One can run this example by: diff --git a/docs/_tutorials/monitor.md b/docs/_tutorials/monitor.md index a9c111f8eeec..5e7a6fc4e834 100644 --- a/docs/_tutorials/monitor.md +++ b/docs/_tutorials/monitor.md @@ -11,7 +11,7 @@ In this tutorial, we introduce the DeepSpeed Monitor and provide examples of its ## Overview -Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), and simple CSV files. +Monitoring model and system metrics during training is vital to ensure hardware resources are fully utilized. The DeepSpeed Monitor enables live logging of metrics through one or more monitoring backends such as PyTorch's [TensorBoard](https://pytorch.org/docs/1.8.0/tensorboard.html), [WandB](https://docs.wandb.ai/quickstart), [Comet](https://www.comet.com/site/?utm_source=deepseed&utm_medium=docs&utm_content=tutorial) and simple CSV files. 
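For quick reference, here is a minimal sketch of pushing a custom metric to whichever of these backends is enabled; the custom-monitoring API is walked through step by step in the Usage section below, and the config path and metric label used here are placeholders:

```python
# Minimal sketch of custom event logging with the DeepSpeed Monitor; assumes a
# ds_config.json that enables at least one backend (tensorboard/wandb/comet/csv_monitor).
from deepspeed.monitor.monitor import MonitorMaster
from deepspeed.runtime.config import DeepSpeedConfig

monitor = MonitorMaster(DeepSpeedConfig("ds_config.json").monitor_config)

# Each event is a 3-tuple: (label, scalar value, sample count). With a DeepSpeed
# engine available, use its `global_samples` attribute for the third field.
loss_value = 0.123   # placeholder scalar
global_samples = 0   # placeholder; normally model_engine.global_samples
monitor.write_events([("Train/Samples/train_loss", loss_value, global_samples)])
```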
Below is a live monitoring view for TensorBoard: @@ -21,16 +21,20 @@ Below is a live monitoring view for WandB: ![WandB Example Output](/assets/images/wandb_monitor.PNG){: .align-center} +Below is a live monitoring view for Comet: + +![CometML Example Output](/assets/images/comet_monitor.png){: .align-center} + ## Usage -The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics. +The DeepSpeed Monitor is configured within the deepspeed [configuration file](/docs/config-json/#monitoring-module). DeepSpeed will automatically monitor key training metrics, including those tracked with the `wall_clock_breakdown` configuration option. In addition, users can log their own custom events and metrics. - [Automatic Monitoring](#automatic-monitoring) - [Custom Monitoring](#custom-monitoring) ### Automatic Monitoring -When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module-tensorboard-wandb-csv). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module-tensorboard-wandb-csv) for details. +When using DeepSpeed for model training, the Monitor can be configured in the DeepSpeed [configuration file](/docs/config-json/#monitoring-module). No explicit API calls are needed to use the Monitor. The Monitor can be enabled by adding the following field to DeepSpeed's configuration json file. Refer to [Monitoring](/docs/config-json/#monitoring-module) for details. ```json { @@ -45,6 +49,11 @@ When using DeepSpeed for model training, the Monitor can be configured in the De "group": "my_group", "project": "my_project" } + "comet": { + "enabled": true, + "project": "my_project", + "experiment_name": "my_experiment" + } "csv_monitor": { "enabled": true, "output_path": "output/ds_logs/", @@ -72,7 +81,7 @@ The steps to create a custom monitor are as follows: \* Note - Some Monitor backends don't support mixed sample values. Be sure to use your DeepSpeed engine object's `global_samples` attribute in each 3-tuple -For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/microsoft/DeepSpeedExamples/tree/master/cifar) example: +For example usage, see the following modified [DeepSpeedExamples/cifar](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/cifar) example: ```python # Step 1: Import monitor (and DeepSpeed config, if needed) diff --git a/docs/_tutorials/onebit-adam.md b/docs/_tutorials/onebit-adam.md index 932bb355cf26..e24dc8f86554 100644 --- a/docs/_tutorials/onebit-adam.md +++ b/docs/_tutorials/onebit-adam.md @@ -33,7 +33,7 @@ If you don't already have a copy of the DeepSpeed repository, please clone it now and checkout the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples. 
```shell -git clone https://github.com/microsoft/DeepSpeed +git clone https://github.com/deepspeedai/DeepSpeed cd DeepSpeed git submodule update --init --recursive cd DeepSpeedExamples/ @@ -75,6 +75,12 @@ Alternatively, the standard mpirun launcher can also be used as follows: mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py] ``` +#### 1.2.3 Compressed implementation + +This backend abstracts the generic part of the one-bit optimizers and implements the accelerator-dependent part with the DeepSpeed custom op builder. To use this `CompressedBackend`, make sure that your current accelerator supports `PackbitsBuilder`, so that it can be loaded to perform the high-performance packing and unpacking between float and byte datatypes used by the one-bit algorithm. An example can be found in `Deepspeed/op_builder/xpu/packbits.py`. + +This approach does not require an NCCL- or MPI-based communication library. It automatically uses the default communication library selected by your accelerator in `deepspeed/comm`. + ### 1.3 1-bit Algorithm The detailed description of the 1-bit Algorithm can be seen from our [blog post](https://www.deepspeed.ai/2020/09/08/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888). @@ -106,10 +112,10 @@ Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_ `cuda_aware` is used for MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication. -(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`. +(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between the NCCL-based, MPI-based, and compressed implementations by setting `comm_backend_name` to "nccl", "mpi" or "compressed". When using the NCCL-based implementation, there is no need to set `cuda_aware`. #### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients -Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter have constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support of a momentum mask for users to specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask.
One thing to note is that we don't use momentum mask saved in checkpoints since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script. +Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training seq length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for row 129 to 512, because it only learns up to seq length 128 while the model supports up to seq length 512. Thus in 1-bit Adam v2 we added support for a momentum mask so that users can specify those params that have constant exact zeros in their gradients. See [example script](https://github.com/deepspeedai/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks), so you have to provide this mask every time in your training script. **Watch out!** 1-bit Adam relies on an compression error compensation mechanism to maintain the convergence speed at compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server error at each GPU are distinct, so in current implementation only rank 0's errors are saved in the checkpoint. Thus we have to reset the errors. If we want to save them correctly we need O(num_gpu*model_size) memory in order to gather all the error, which is a very large memory requirement. It's possible to save them in a distributed way, but it will make the checkpoint saving/loading much more complicated. 2) Even if we are able to save the compression errors correctly, you need to have the exact same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect the convergence. However, please avoid frequent checkpoint loading which could break the error compensation mechanism thus affect the convergence. @@ -130,7 +136,7 @@ You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [Hug ### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam -We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to NCCL-based implementation, MPI-based implementation on Ethernet systems, and MPI-based implementation on InfiniBand systems. For MPI-based implementation, we provide both example scripts when launching with deepspeed or mpirun. +We provide example scripts under [DeepSpeedExamples/training/BingBertSquad/1-bit_adam/](https://github.com/deepspeedai/DeepSpeedExamples/tree/master/training/BingBertSquad/1-bit_adam). There are 3 sets of scripts corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
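Tying the configuration notes above together, the `freeze_step`, `cuda_aware`, and `comm_backend_name` parameters typically live in the optimizer block of the DeepSpeed config passed to `deepspeed.initialize`. A minimal sketch, where all values are placeholders rather than tuned recommendations (the example scripts referenced above contain real settings):

```python
# Sketch of a DeepSpeed config fragment selecting 1-bit Adam; pass it (or the
# equivalent JSON file) to deepspeed.initialize as usual.
ds_config = {
    "train_batch_size": 32,           # placeholder
    "fp16": {"enabled": True},
    "optimizer": {
        "type": "OneBitAdam",
        "params": {
            "lr": 3e-5,               # placeholder
            "freeze_step": 400,       # warm-up steps with uncompressed Adam before 1-bit compression starts
            "cuda_aware": False,      # only meaningful for the MPI-based implementation on InfiniBand systems
            "comm_backend_name": "nccl",  # "nccl", "mpi", or "compressed" (see Section 1.2.3)
        },
    },
}
```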
More news
@@ -62,15 +66,15 @@ In line with Microsoft's mission to solve humanity's most pressing challenges, t ## DeepSpeed Library - The [DeepSpeed](https://github.com/microsoft/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for an easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). + The [DeepSpeed](https://github.com/deepspeedai/deepspeed) library implements and packages the innovations and technologies in DeepSpeed Training, Inference and Compression Pillars into a single easy-to-use, open-sourced repository. It allows for an easy composition of a multitude of features within a single training, inference or compression pipeline. The DeepSpeed Library is heavily adopted by the DL community, and has been used to enable some of the most powerful models (see [DeepSpeed Adoption](#deepspeed-adoption)). ## Model Implementations for Inference (MII) - [Model Implementations for Inference (MII)](https://github.com/microsoft/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. + [Model Implementations for Inference (MII)](https://github.com/deepspeedai/deepspeed-mii) is an open-sourced repository for making low-latency and high-throughput inference accessible to all data scientists by alleviating the need to apply complex system optimization techniques themselves. Out-of-box, MII offers support for thousands of widely used DL models, optimized using DeepSpeed-Inference, that can be deployed with a few lines of code, while achieving significant latency reduction compared to their vanilla open-sourced versions. ## DeepSpeed on Azure - DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/microsoft/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). + DeepSpeed users are diverse and have access to different environments. We recommend trying DeepSpeed on Azure as it is the simplest and easiest method. The recommended method to try DeepSpeed on Azure is through AzureML [recipes](https://github.com/Azure/azureml-examples/tree/main/python-sdk/workflows/train/deepspeed). The job submission and data preparation scripts have been made available [here](https://github.com/deepspeedai/Megatron-DeepSpeed/tree/main/examples_deepspeed/azureml). 
For more details on how to use DeepSpeed on Azure, please follow the [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/). # DeepSpeed Adoption @@ -90,8 +94,8 @@ DeepSpeed has been integrated with several different popular open-source DL fram | | Documentation | | ---------------------------------------------------------------------------------------------- | -------------------------------------------- | -| | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/main/main_classes/deepspeed) | -| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/main/en/deepspeed) | +| | [Transformers with DeepSpeed](https://huggingface.co/docs/transformers/deepspeed) | +| | [Accelerate with DeepSpeed](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) | | | [Lightning with DeepSpeed](https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.strategies.DeepSpeedStrategy.html) | | | [MosaicML with DeepSpeed](https://docs.mosaicml.com/en/latest/trainer/using_the_trainer.html?highlight=deepspeed#deepspeed-integration) | @@ -127,15 +131,15 @@ comments. 3. Minjia Zhang, Yuxiong He. (2020) Accelerating Training of Transformer-Based Language Models with Progressive Layer Dropping. [arXiv:2010.13369](https://arxiv.org/abs/2010.13369) and [NeurIPS 2020](https://proceedings.neurips.cc/paper/2020/hash/a1140a3d0df1c81e24ae954d935e8926-Abstract.html). 4. Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, Yuxiong He. (2021) ZeRO-Offload: Democratizing Billion-Scale Model Training. [arXiv:2101.06840](https://arxiv.org/abs/2101.06840) and [USENIX ATC 2021](https://www.usenix.org/conference/atc21/presentation/ren-jie). [[paper]](https://arxiv.org/abs/2101.06840) [[slides]](https://www.usenix.org/system/files/atc21_slides_ren-jie.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/) 5. Hanlin Tang, Shaoduo Gan, Ammar Ahmad Awan, Samyam Rajbhandari, Conglong Li, Xiangru Lian, Ji Liu, Ce Zhang, Yuxiong He. (2021) 1-bit Adam: Communication Efficient Large-Scale Training with Adam's Convergence Speed. [arXiv:2102.02888](https://arxiv.org/abs/2102.02888) and [ICML 2021](http://proceedings.mlr.press/v139/tang21a.html). -6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) +6. Samyam Rajbhandari, Olatunji Ruwase, Jeff Rasley, Shaden Smith, Yuxiong He. (2021) ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning. [arXiv:2104.07857](https://arxiv.org/abs/2104.07857) and [SC 2021](https://dl.acm.org/doi/abs/10.1145/3458817.3476205). [[paper]](https://arxiv.org/abs/2104.07857) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/SC21-ZeRO-Infinity.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/zero-infinity-and-deepspeed-unlocking-unprecedented-model-scale-for-deep-learning-training/) 7. Conglong Li, Ammar Ahmad Awan, Hanlin Tang, Samyam Rajbhandari, Yuxiong He. 
(2021) 1-bit LAMB: Communication Efficient Large-Scale Large-Batch Training with LAMB's Convergence Speed. [arXiv:2104.06069](https://arxiv.org/abs/2104.06069) and [HiPC 2022](https://hipc.org/advance-program/). 8. Conglong Li, Minjia Zhang, Yuxiong He. (2021) The Stability-Efficiency Dilemma: Investigating Sequence Length Warmup for Training GPT Models. [arXiv:2108.06084](https://arxiv.org/abs/2108.06084) and [NeurIPS 2022](https://openreview.net/forum?id=JpZ5du_Kdh). 9. Yucheng Lu, Conglong Li, Minjia Zhang, Christopher De Sa, Yuxiong He. (2022) Maximizing Communication Efficiency for Large-scale Training via 0/1 Adam. [arXiv:2202.06009](https://arxiv.org/abs/2202.06009). -10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) +10. Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He. (2022) DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale [arXiv:2201.05596](https://arxiv.org/abs/2201.05596) and [ICML 2022](https://proceedings.mlr.press/v162/rajbhandari22a.html). [[pdf]](https://arxiv.org/abs/2201.05596) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/ICML-5mins.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-advancing-moe-inference-and-training-to-power-next-generation-ai-scale/) 11. Shaden Smith, Mostofa Patwary, Brandon Norick, Patrick LeGresley, Samyam Rajbhandari, Jared Casper, Zhun Liu, Shrimai Prabhumoye, George Zerveas, Vijay Korthikanti, Elton Zhang, Rewon Child, Reza Yazdani Aminabadi, Julie Bernauer, Xia Song, Mohammad Shoeybi, Yuxiong He, Michael Houston, Saurabh Tiwary, Bryan Catanzaro. (2022) Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model [arXiv:2201.11990](https://arxiv.org/abs/2201.11990). 12. Xiaoxia Wu, Zhewei Yao, Minjia Zhang, Conglong Li, Yuxiong He. (2022) Extreme Compression for Pre-trained Transformers Made Simple and Efficient. [arXiv:2206.01859](https://arxiv.org/abs/2206.01859) and [NeurIPS 2022](https://openreview.net/forum?id=xNeAhc2CNAl). -13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) -14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. 
(2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/) +13. Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, Yuxiong He. (2022) ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. [arXiv:2206.01861](https://arxiv.org/abs/2206.01861) and [NeurIPS 2022](https://openreview.net/forum?id=f-fVCElZ-G1) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-compression-a-composable-library-for-extreme-compression-and-zero-cost-quantization/) +14. Reza Yazdani Aminabadi, Samyam Rajbhandari, Minjia Zhang, Ammar Ahmad Awan, Cheng Li, Du Li, Elton Zheng, Jeff Rasley, Shaden Smith, Olatunji Ruwase, Yuxiong He. (2022) DeepSpeed Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale. [arXiv:2207.00032](https://arxiv.org/abs/2207.00032) and [SC 2022](https://dl.acm.org/doi/abs/10.5555/3571885.3571946). [[paper]](https://arxiv.org/abs/2207.00032) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/sc22-ds-inference.pdf) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-accelerating-large-scale-model-inference-and-training-via-system-optimizations-and-compression/) 15. Zhewei Yao, Xiaoxia Wu, Conglong Li, Connor Holmes, Minjia Zhang, Cheng Li, Yuxiong He. (2022) Random-LTD: Random and Layerwise Token Dropping Brings Efficient Training for Large-scale Transformers. [arXiv:2211.11586](https://arxiv.org/abs/2211.11586). 16. Conglong Li, Zhewei Yao, Xiaoxia Wu, Minjia Zhang, Yuxiong He. (2022) DeepSpeed Data Efficiency: Improving Deep Learning Model Quality and Training Efficiency via Efficient Data Sampling and Routing. [arXiv:2212.03597](https://arxiv.org/abs/2212.03597) [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) 17. Xiaoxia Wu, Cheng Li, Reza Yazdani Aminabadi, Zhewei Yao, Yuxiong He. (2023) Understanding INT4 Quantization for Transformer Models: Latency Speedup, Composability, and Failure Cases. [arXiv:2301.12017](https://arxiv.org/abs/2301.12017) and [ICML2023](https://icml.cc/Conferences/2023). @@ -144,13 +148,15 @@ comments. 20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023. 21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023. 22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Xiaoxia Wu, Connor Holmes, Zhewei Yao, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. 
(2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209) and [ML for Sys Workshop at NeurIPS2023](http://mlforsystems.org/) [[blog]](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) -23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) +23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) 24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847) 25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320). -26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/microsoft/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) +26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782) and [ENLSP2023 Workshop at NeurIPS2023](https://neurips2023-enlsp.github.io/) [[slides]](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/assets/files/zeroquant_series.pdf) 27. Zhewei Yao, Xiaoxia Wu, Conglong Li, Minjia Zhang, Heyang Qin, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He. (2023) DeepSpeed-VisualChat: Multi-Round Multi-Image Interleave Chat via Multi-Modal Causal Attention [arXiv:2309.14327](https://arxiv.org/pdf/2309.14327.pdf) 28. Shuaiwen Leon Song, Bonnie Kruft, Minjia Zhang, Conglong Li, Shiyang Chen, Chengming Zhang, Masahiro Tanaka, Xiaoxia Wu, Jeff Rasley, Ammar Ahmad Awan, Connor Holmes, Martin Cai, Adam Ghanem, Zhongzhu Zhou, Yuxiong He, et al. (2023) DeepSpeed4Science Initiative: Enabling Large-Scale Scientific Discovery through Sophisticated AI System Technologies [arXiv:2310.04610](https://arxiv.org/abs/2310.04610) [[blog]](https://www.microsoft.com/en-us/research/blog/announcing-the-deepspeed4science-initiative-enabling-large-scale-scientific-discovery-through-sophisticated-ai-system-technologies/) 29. 
Zhewei Yao, Reza Yazdani Aminabadi, Stephen Youn, Xiaoxia Wu, Elton Zheng, Yuxiong He. (2023) ZeroQuant-HERO: Hardware-Enhanced Robust Optimized Post-Training Quantization Framework for W8A8 Transformers [arXiv:2310.17723](https://arxiv.org/abs/2310.17723) +30. Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Reza Yazdani Aminadabi, Shuaiwen Leon Song, Samyam Rajbhandari, Yuxiong He. (2024) [System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models](https://dl.acm.org/doi/10.1145/3662158.3662806) +31. Xinyu Lian, Sam Ade Jacobs, Lev Kurilenko, Masahiro Tanaka, Stas Bekman, Olatunji Ruwase, Minjia Zhang. (2024) Universal Checkpointing: Efficient and Flexible Checkpointing for Large Scale Distributed Training [arXiv:2406.18820](https://arxiv.org/abs/2406.18820) # Videos 1. DeepSpeed KDD 2020 Tutorial diff --git a/examples/README.md b/examples/README.md index 5dfc26c17613..c7ff01dcd2d4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -2,8 +2,8 @@ If you are looking for examples using DeepSpeed please see the following resources: -1. [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) -2. [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed) +1. [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) +2. [Megatron-DeepSpeed](https://github.com/deepspeedai/Megatron-DeepSpeed) 3. [DeepSpeed + AzureML](https://github.com/Azure/azureml-examples/tree/main/v1/python-sdk/workflows/train/deepspeed) -4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/main_classes/deepspeed) +4. [DeepSpeed + Hugging Face Transformers Integration](https://huggingface.co/docs/transformers/deepspeed) 5. [DeepSpeed + PyTorch Lightning](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.utilities.deepspeed.html) diff --git a/install.sh b/install.sh index 8d56afe40493..8be574c6ec1f 100755 --- a/install.sh +++ b/install.sh @@ -152,7 +152,7 @@ if [ ! 
-f $hostfile ]; then fi echo "Building deepspeed wheel" -python setup.py $VERBOSE bdist_wheel +python -m build $VERBOSE --wheel --no-isolation if [ "$local_only" == "1" ]; then echo "Installing deepspeed" diff --git a/op_builder/all_ops.py b/op_builder/all_ops.py index 9c41f35eaf1b..ff11ca180072 100644 --- a/op_builder/all_ops.py +++ b/op_builder/all_ops.py @@ -30,3 +30,4 @@ __op_builders__.append(builder) ALL_OPS = {op.name: op for op in __op_builders__ if op is not None} +accelerator_name = get_accelerator()._name diff --git a/op_builder/async_io.py b/op_builder/async_io.py index b55c821910b9..f29f88531a98 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -3,13 +3,14 @@ # DeepSpeed Team +import os import distutils.spawn import subprocess -from .builder import OpBuilder +from .builder import TorchCPUOpBuilder -class AsyncIOBuilder(OpBuilder): +class AsyncIOBuilder(TorchCPUOpBuilder): BUILD_VAR = "DS_BUILD_AIO" NAME = "async_io" @@ -19,44 +20,57 @@ def __init__(self): def absolute_name(self): return f'deepspeed.ops.aio.{self.NAME}_op' - def sources(self): - return [ - 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', - 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', - 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', - 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', + def lib_sources(self): + src_list = [ + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' ] + return src_list + + def sources(self): + return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp'] def include_paths(self): - return ['csrc/aio/py_lib', 'csrc/aio/common'] + import torch + if self.build_for_cpu: + CUDA_INCLUDE = [] + elif not self.is_rocm_pytorch(): + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + else: + CUDA_INCLUDE = [ + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), + os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), + ] + return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE def cxx_args(self): # -O0 for improved debugging, since performance is bound by I/O - CPU_ARCH = self.cpu_arch() - SIMD_WIDTH = self.simd_width() - import torch # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed + args = super().cxx_args() + import torch TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) - if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: - CPP_STD = '-std=c++17' - else: - CPP_STD = '-std=c++14' - return [ - '-g', - '-Wall', - '-O0', - CPP_STD, - '-shared', - '-fPIC', - '-Wno-reorder', - CPU_ARCH, - '-fopenmp', - SIMD_WIDTH, - '-laio', - ] + if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1): + args.remove('-std=c++17') + args.append('-std=c++14') + args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] + return args def extra_ldflags(self): - return ['-laio'] + if self.build_for_cpu: + return ['-fopenmp'] + + import 
torch.utils.cpp_extension + CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME + if CUDA_HOME is None: + ldflags = ['-laio'] # the ROCM case + else: + CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") + ldflags = [f'-L{CUDA_HOME}', f'-L{CUDA_LIB64}', '-laio', '-lcuda', '-lcudart'] + return ldflags def check_for_libaio_pkg(self): libs = dict( @@ -70,8 +84,8 @@ def check_for_libaio_pkg(self): flag, lib, tool = data path = distutils.spawn.find_executable(pkgmgr) if path is not None: - cmd = f"{pkgmgr} {flag} {lib}" - result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) if result.wait() == 0: found = True else: @@ -79,13 +93,13 @@ def check_for_libaio_pkg(self): break return found - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): # Check for the existence of libaio by using distutils # to compile and link a test program that calls io_submit, # which is a function provided by libaio that is used in the async_io op. # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS # respectively to specify the directories for libaio.h and libaio.so. - aio_compatible = self.has_function('io_pgetevents', ('aio', )) + aio_compatible = self.has_function('io_submit', ('aio', )) if verbose and not aio_compatible: self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") diff --git a/op_builder/builder.py b/op_builder/builder.py index 3613791c938d..9b721e110fcc 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -4,6 +4,7 @@ # DeepSpeed Team import os +import re import sys import time import importlib @@ -60,13 +61,20 @@ def installed_cuda_version(name=""): def get_default_compute_capabilities(): compute_caps = DEFAULT_COMPUTE_CAPABILITIES + # Update compute capability according to: https://en.wikipedia.org/wiki/CUDA#GPUs_supported import torch.utils.cpp_extension - if torch.utils.cpp_extension.CUDA_HOME is not None and installed_cuda_version()[0] >= 11: - if installed_cuda_version()[0] == 11 and installed_cuda_version()[1] == 0: - # Special treatment of CUDA 11.0 because compute_86 is not supported. 
- compute_caps += ";8.0" - else: - compute_caps += ";8.0;8.6" + if torch.utils.cpp_extension.CUDA_HOME is not None: + if installed_cuda_version()[0] == 11: + if installed_cuda_version()[1] >= 0: + compute_caps += ";8.0" + if installed_cuda_version()[1] >= 1: + compute_caps += ";8.6" + if installed_cuda_version()[1] >= 8: + compute_caps += ";9.0" + elif installed_cuda_version()[0] == 12: + compute_caps += ";8.0;8.6;9.0" + if installed_cuda_version()[1] >= 8: + compute_caps += ";10.0;12.0" return compute_caps @@ -75,7 +83,8 @@ def get_default_compute_capabilities(): cuda_minor_mismatch_ok = { 10: ["10.0", "10.1", "10.2"], 11: ["11.0", "11.1", "11.2", "11.3", "11.4", "11.5", "11.6", "11.7", "11.8"], - 12: ["12.0", "12.1", "12.2", "12.3"], + 12: ["12.0", "12.1", "12.2", "12.3", "12.4", "12.5", "12.6", + "12.8"], # There does not appear to be a CUDA Toolkit 12.7 } @@ -107,7 +116,10 @@ def assert_no_cuda_mismatch(name=""): class OpBuilder(ABC): _rocm_version = None + _rocm_gpu_arch = None + _rocm_wavefront_size = None _is_rocm_pytorch = None + _is_sycl_enabled = None _loaded_ops = {} def __init__(self, name): @@ -135,6 +147,9 @@ def sources(self): def hipify_extension(self): pass + def sycl_extension(self): + pass + @staticmethod def validate_torch_version(torch_info): install_torch_version = torch_info['version'] @@ -186,6 +201,22 @@ def is_rocm_pytorch(): OpBuilder._is_rocm_pytorch = _is_rocm_pytorch return OpBuilder._is_rocm_pytorch + @staticmethod + def is_sycl_enabled(): + if OpBuilder._is_sycl_enabled is not None: + return OpBuilder._is_sycl_enabled + + _is_sycl_enabled = False + try: + result = subprocess.run(["c2s", "--version"], capture_output=True) + except: + pass + else: + _is_sycl_enabled = True + + OpBuilder._is_sycl_enabled = _is_sycl_enabled + return OpBuilder._is_sycl_enabled + @staticmethod def installed_rocm_version(): if OpBuilder._rocm_version: @@ -193,22 +224,68 @@ def installed_rocm_version(): ROCM_MAJOR = '0' ROCM_MINOR = '0' + ROCM_VERSION_DEV_RAW = "" if OpBuilder.is_rocm_pytorch(): from torch.utils.cpp_extension import ROCM_HOME - rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version-dev") + rocm_ver_file = Path(ROCM_HOME).joinpath(".info/version") if rocm_ver_file.is_file(): with open(rocm_ver_file, 'r') as file: ROCM_VERSION_DEV_RAW = file.read() elif "rocm" in torch.__version__: ROCM_VERSION_DEV_RAW = torch.__version__.split("rocm")[1] + if ROCM_VERSION_DEV_RAW != "": + ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] + ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] else: + # Look in /usr/include/rocm-version.h + rocm_ver_file = Path("/usr/include/rocm_version.h") + if rocm_ver_file.is_file(): + with open(rocm_ver_file, 'r') as file: + for ln in file.readlines(): + if "#define ROCM_VERSION_MAJOR" in ln: + ROCM_MAJOR = re.findall(r'\S+', ln)[2] + elif "#define ROCM_VERSION_MINOR" in ln: + ROCM_MINOR = re.findall(r'\S+', ln)[2] + if ROCM_MAJOR == '0': assert False, "Could not detect ROCm version" - assert ROCM_VERSION_DEV_RAW != "", "Could not detect ROCm version" - ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0] - ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1] + OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR)) return OpBuilder._rocm_version + @staticmethod + def get_rocm_gpu_arch(): + if OpBuilder._rocm_gpu_arch: + return OpBuilder._rocm_gpu_arch + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_gpu_arch_cmd = str(rocm_info) + " | grep -o -m 1 'gfx.*'" + try: + result = 
subprocess.check_output(rocm_gpu_arch_cmd, shell=True) + rocm_gpu_arch = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_gpu_arch = "" + OpBuilder._rocm_gpu_arch = rocm_gpu_arch + return OpBuilder._rocm_gpu_arch + + @staticmethod + def get_rocm_wavefront_size(): + if OpBuilder._rocm_wavefront_size: + return OpBuilder._rocm_wavefront_size + + rocm_info = Path("/opt/rocm/bin/rocminfo") + if (not rocm_info.is_file()): + rocm_info = Path("rocminfo") + rocm_wavefront_size_cmd = str( + rocm_info) + " | grep -Eo -m1 'Wavefront Size:[[:space:]]+[0-9]+' | grep -Eo '[0-9]+'" + try: + result = subprocess.check_output(rocm_wavefront_size_cmd, shell=True) + rocm_wavefront_size = result.decode('utf-8').strip() + except subprocess.CalledProcessError: + rocm_wavefront_size = "32" + OpBuilder._rocm_wavefront_size = rocm_wavefront_size + return OpBuilder._rocm_wavefront_size + def include_paths(self): ''' Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed) @@ -227,7 +304,7 @@ def cxx_args(self): ''' return [] - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): ''' Check if all non-python dependencies are satisfied to build this op ''' @@ -236,7 +313,7 @@ def is_compatible(self, verbose=True): def extra_ldflags(self): return [] - def has_function(self, funcname, libraries, verbose=False): + def has_function(self, funcname, libraries, library_dirs=None, verbose=False): ''' Test for existence of a function within a tuple of libraries. @@ -292,7 +369,8 @@ def has_function(self, funcname, libraries, verbose=False): compiler.link_executable(objs, os.path.join(tempdir, 'a.out'), extra_preargs=self.strip_empty_entries(ldflags), - libraries=libraries) + libraries=libraries, + library_dirs=library_dirs) # Compile and link succeeded return True @@ -334,8 +412,8 @@ def cpu_arch(self): try: cpu_info = get_cpu_info() except Exception as e: - self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return "-march=native" @@ -345,10 +423,11 @@ def cpu_arch(self): return '-mcpu=native' return '-march=native' - def is_cuda_enable(self): + def get_cuda_compile_flag(self): try: - assert_no_cuda_mismatch(self.name) - return '-D__ENABLE_CUDA__' + if not self.is_rocm_pytorch(): + assert_no_cuda_mismatch(self.name) + return "-D__ENABLE_CUDA__" except MissingCUDAException: print(f"{WARNING} {self.name} cuda is missing or is incompatible with installed torch, " "only cpu ops can be compiled!") @@ -362,7 +441,7 @@ def _backup_cpuinfo(self): "to detect the CPU architecture. 
'lscpu' does not appear to exist on " "your system, will fall back to use -march=native and non-vectorized execution.") return None - result = subprocess.check_output('lscpu', shell=True) + result = subprocess.check_output(['lscpu']) result = result.decode('utf-8').strip().lower() cpu_info = {} @@ -392,8 +471,8 @@ def simd_width(self): try: cpu_info = get_cpu_info() except Exception as e: - self.warning(f"{self.name} attempted to use `py-cpuinfo` but failed (exception type: {type(e)}, {e}), " - "falling back to `lscpu` to get this information.") + self.warning(f"{self.name} attempted to use py-cpuinfo but failed (exception type: {type(e)}, {e}), " + "falling back to lscpu to get this information.") cpu_info = self._backup_cpuinfo() if cpu_info is None: return '-D__SCALAR__' @@ -412,7 +491,8 @@ def command_exists(self, cmd): cmds = [cmd] valid = False for cmd in cmds: - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + safe_cmd = ["bash", "-c", f"type {cmd}"] + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) valid = valid or result.wait() == 0 if not valid and len(cmds) > 1: @@ -433,9 +513,10 @@ def deepspeed_src_path(self, code_path): def builder(self): from torch.utils.cpp_extension import CppExtension + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] return CppExtension(name=self.absolute_name(), sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), + include_dirs=include_dirs, extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, extra_link_args=self.strip_empty_entries(self.extra_ldflags())) @@ -443,8 +524,9 @@ def load(self, verbose=True): if self.name in __class__._loaded_ops: return __class__._loaded_ops[self.name] - from deepspeed.git_version_info import installed_ops, torch_info - if installed_ops.get(self.name, False): + from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name + from deepspeed.accelerator import get_accelerator + if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: # Ensure the op we're about to load was compiled with the same # torch/cuda versions we are currently using at runtime. 
self.validate_torch_version(torch_info) @@ -474,8 +556,8 @@ def jit_load(self, verbose=True): from torch.utils.cpp_extension import load start_build = time.time() - sources = [self.deepspeed_src_path(path) for path in self.sources()] - extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()] + sources = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.sources()] + extra_include_paths = [os.path.abspath(self.deepspeed_src_path(path)) for path in self.include_paths()] # Torch will try and apply whatever CCs are in the arch list at compile time, # we have already set the intended targets ourselves we know that will be @@ -495,9 +577,12 @@ def jit_load(self, verbose=True): nvcc_args.append("-DBF16_AVAILABLE") nvcc_args.append("-U__CUDA_NO_BFLOAT16_OPERATORS__") nvcc_args.append("-U__CUDA_NO_BFLOAT162_OPERATORS__") + nvcc_args.append("-U__CUDA_NO_BFLOAT16_CONVERSIONS__") if self.is_rocm_pytorch(): cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() + cxx_args.append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) op_module = load(name=self.name, sources=self.strip_empty_entries(sources), @@ -534,8 +619,8 @@ def compute_capability_args(self, cross_compile_archs=None): - `TORCH_CUDA_ARCH_LIST` may use ; or whitespace separators. Examples: - TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6" pip install ... - TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6+PTX" pip install ... + TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6;9.0;10.0" pip install ... + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 9.0 10.0+PTX" pip install ... - `cross_compile_archs` uses ; separator. @@ -557,7 +642,7 @@ def compute_capability_args(self, cross_compile_archs=None): if cross_compile_archs_env is not None: if cross_compile_archs is not None: print( - f"{WARNING} env var `TORCH_CUDA_ARCH_LIST={cross_compile_archs_env}` overrides `cross_compile_archs={cross_compile_archs}`" + f"{WARNING} env var TORCH_CUDA_ARCH_LIST={cross_compile_archs_env} overrides cross_compile_archs={cross_compile_archs}" ) cross_compile_archs = cross_compile_archs_env.replace(' ', ';') else: @@ -573,9 +658,9 @@ def compute_capability_args(self, cross_compile_archs=None): args = [] self.enable_bf16 = True for cc in ccs: - num = cc[0] + cc[2] + num = cc[0] + cc[1].split('+')[0] args.append(f'-gencode=arch=compute_{num},code=sm_{num}') - if cc.endswith('+PTX'): + if cc[1].endswith('+PTX'): args.append(f'-gencode=arch=compute_{num},code=compute_{num}') if int(cc[0]) <= 7: @@ -588,7 +673,7 @@ def filter_ccs(self, ccs: List[str]): Prune any compute capabilities that are not compatible with the builder. Should log which CCs have been pruned. 
""" - return ccs + return [cc.split('.') for cc in ccs] def version_dependent_macros(self): # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 @@ -603,7 +688,7 @@ def version_dependent_macros(self): version_ge_1_5 = ['-DVERSION_GE_1_5'] return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): return super().is_compatible(verbose) def builder(self): @@ -618,20 +703,27 @@ def builder(self): from torch.utils.cpp_extension import CppExtension as ExtensionBuilder else: from torch.utils.cpp_extension import CUDAExtension as ExtensionBuilder - + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} if self.build_for_cpu else \ {'cxx': self.strip_empty_entries(self.cxx_args()), \ 'nvcc': self.strip_empty_entries(self.nvcc_args())} if not self.build_for_cpu and self.enable_bf16: compile_args['cxx'].append("-DBF16_AVAILABLE") + compile_args['nvcc'].append("-DBF16_AVAILABLE") if self.is_rocm_pytorch(): compile_args['cxx'].append("-D__HIP_PLATFORM_AMD__=1") + #cxx compiler args are required to compile cpp files + compile_args['cxx'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + #nvcc compiler args are required to compile hip files + compile_args['nvcc'].append('-DROCM_WAVEFRONT_SIZE=%s' % self.get_rocm_wavefront_size()) + if self.get_rocm_gpu_arch(): + os.environ["PYTORCH_ROCM_ARCH"] = self.get_rocm_gpu_arch() cuda_ext = ExtensionBuilder(name=self.absolute_name(), sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), + include_dirs=include_dirs, libraries=self.strip_empty_entries(self.libraries_args()), extra_compile_args=compile_args, extra_link_args=self.strip_empty_entries(self.extra_ldflags())) @@ -682,11 +774,25 @@ def nvcc_args(self): '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR ] else: - cuda_major, _ = installed_cuda_version() + try: + nvcc_threads = int(os.getenv("DS_NVCC_THREADS", "")) + if nvcc_threads <= 0: + raise ValueError("") + except ValueError: + nvcc_threads = min(os.cpu_count(), 8) + + cuda_major, cuda_minor = installed_cuda_version() + if cuda_major > 10: + if cuda_major == 12 and cuda_minor >= 5: + std_lib = '-std=c++20' + else: + std_lib = '-std=c++17' + else: + std_lib = '-std=c++14' args += [ - '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', - '-std=c++17' if cuda_major > 10 else '-std=c++14', '-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__' + '-allow-unsupported-compiler' if sys.platform == "win32" else '', '--use_fast_math', std_lib, + '-U__CUDA_NO_HALF_OPERATORS__', '-U__CUDA_NO_HALF_CONVERSIONS__', '-U__CUDA_NO_HALF2_OPERATORS__', + f'--threads={nvcc_threads}' ] if os.environ.get('DS_DEBUG_CUDA_BUILD', '0') == '1': args.append('--ptxas-options=-v') @@ -705,25 +811,32 @@ def libraries_args(self): class TorchCPUOpBuilder(CUDAOpBuilder): + def get_cuda_lib64_path(self): + import torch + if not self.is_rocm_pytorch(): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") + if not os.path.exists(CUDA_LIB64): + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib") + else: + CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + return CUDA_LIB64 + def extra_ldflags(self): if self.build_for_cpu: return ['-fopenmp'] if not 
self.is_rocm_pytorch(): - return ['-lcurand'] + ld_flags = ['-lcurand'] + if not self.build_for_cpu: + ld_flags.append(f'-L{self.get_cuda_lib64_path()}') + return ld_flags return [] def cxx_args(self): - import torch args = [] if not self.build_for_cpu: - if not self.is_rocm_pytorch(): - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") - if not os.path.exists(CUDA_LIB64): - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib") - else: - CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib") + CUDA_LIB64 = self.get_cuda_lib64_path() args += super().cxx_args() args += [ @@ -735,7 +848,7 @@ def cxx_args(self): CPU_ARCH = self.cpu_arch() SIMD_WIDTH = self.simd_width() - CUDA_ENABLE = self.is_cuda_enable() + CUDA_ENABLE = self.get_cuda_compile_flag() args += [ CPU_ARCH, '-fopenmp', diff --git a/op_builder/cpu/__init__.py b/op_builder/cpu/__init__.py index 0ecf2caf65ae..7084db8469f1 100644 --- a/op_builder/cpu/__init__.py +++ b/op_builder/cpu/__init__.py @@ -4,7 +4,8 @@ # DeepSpeed Team '''Copyright The Microsoft DeepSpeed Team''' -from .comm import CCLCommBuilder +from .comm import CCLCommBuilder, ShareMemCommBuilder from .fused_adam import FusedAdamBuilder from .cpu_adam import CPUAdamBuilder from .no_impl import NotImplementedBuilder +from .async_io import AsyncIOBuilder diff --git a/op_builder/cpu/async_io.py b/op_builder/cpu/async_io.py new file mode 100644 index 000000000000..493ef174566e --- /dev/null +++ b/op_builder/cpu/async_io.py @@ -0,0 +1,91 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import distutils.spawn +import subprocess + +from .builder import CPUOpBuilder + + +class AsyncIOBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def lib_sources(self): + src_list = [ + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + ] + return src_list + + def sources(self): + return self.lib_sources() + ['csrc/aio/py_lib/py_ds_aio.cpp'] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + # -O0 for improved debugging, since performance is bound by I/O + args = super().cxx_args() + import torch + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if not (TORCH_MAJOR >= 2 and TORCH_MINOR >= 1): + args.remove('-std=c++17') + args.append('-std=c++14') + args += ['-Wall', '-O0', '-shared', '-fPIC', '-Wno-reorder'] + return args + + def extra_ldflags(self): + return ['-laio', '-fopenmp'] + + def check_for_libaio_pkg(self): + libs = dict( + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + 
found = True + else: + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") + break + return found + + def is_compatible(self, verbose=False): + # Check for the existence of libaio by using distutils + # to compile and link a test program that calls io_submit, + # which is a function provided by libaio that is used in the async_io op. + # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS + # respectively to specify the directories for libaio.h and libaio.so. + aio_compatible = self.has_function('io_submit', ('aio', )) + if verbose and not aio_compatible: + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") + + # Check for the libaio package via known package managers + # to print suggestions on which package to install. + self.check_for_libaio_pkg() + + self.warning( + "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." + ) + return super().is_compatible(verbose) and aio_compatible diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py index f6a71c7d1971..d881842ad0b1 100644 --- a/op_builder/cpu/builder.py +++ b/op_builder/cpu/builder.py @@ -3,6 +3,8 @@ # DeepSpeed Team +import os + try: # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed # if successful this also means we're doing a local install and not JIT compile path @@ -16,19 +18,23 @@ class CPUOpBuilder(OpBuilder): def builder(self): from torch.utils.cpp_extension import CppExtension as ExtensionBuilder - + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} cpp_ext = ExtensionBuilder(name=self.absolute_name(), sources=self.strip_empty_entries(self.sources()), - include_dirs=self.strip_empty_entries(self.include_paths()), + include_dirs=include_dirs, libraries=self.strip_empty_entries(self.libraries_args()), extra_compile_args=compile_args) return cpp_ext def cxx_args(self): - return ['-O3', '-g', '-Wno-reorder'] + args = ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] + return args def libraries_args(self): return [] diff --git a/op_builder/cpu/comm.py b/op_builder/cpu/comm.py index ec908eb0622b..fec960b63b2e 100644 --- a/op_builder/cpu/comm.py +++ b/op_builder/cpu/comm.py @@ -19,7 +19,7 @@ def absolute_name(self): return f'deepspeed.ops.comm.{self.NAME}_op' def sources(self): - return ['csrc/cpu/comm/ccl.cpp'] + return ['csrc/cpu/comm/ccl.cpp', 'csrc/cpu/comm/shm.cpp'] def include_paths(self): includes = ['csrc/cpu/includes'] @@ -28,17 +28,44 @@ def include_paths(self): def cxx_args(self): return ['-O2', '-fopenmp'] - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): # TODO: add soft compatibility check for private binary release. # a soft check, as in we know it can be trivially changed. 
return super().is_compatible(verbose) def extra_ldflags(self): ccl_root_path = os.environ.get("CCL_ROOT") - if ccl_root_path == None: + if ccl_root_path is None: raise ValueError( "Didn't find CCL_ROOT, install oneCCL from https://github.com/oneapi-src/oneCCL and source its environment variable" ) return [] else: return ['-lccl', f'-L{ccl_root_path}/lib'] + + +class ShareMemCommBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_SHM_COMM" + NAME = "deepspeed_shm_comm" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def sources(self): + return ['csrc/cpu/comm/shm_interface.cpp', 'csrc/cpu/comm/shm.cpp'] + + def include_paths(self): + includes = ['csrc/cpu/includes'] + return includes + + def cxx_args(self): + return ['-O2', '-fopenmp'] + + def is_compatible(self, verbose=False): + # TODO: add soft compatibility check for private binary release. + # a soft check, as in we know it can be trivially changed. + return super().is_compatible(verbose) diff --git a/op_builder/cpu_adagrad.py b/op_builder/cpu_adagrad.py index d3f163f7464a..c05f71488950 100644 --- a/op_builder/cpu_adagrad.py +++ b/op_builder/cpu_adagrad.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,26 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.adagrad.{self.NAME}_op' def sources(self): - if self.build_for_cpu: - return ['csrc/adagrad/cpu_adagrad.cpp'] - - return ['csrc/adagrad/cpu_adagrad.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/adagrad/cpu_adagrad.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index 7c34c4ce43a1..7f4c0847a8c4 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,27 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.adam.{self.NAME}_op' def sources(self): - if self.build_for_cpu: - return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] - - return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] - return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/cpu_lion.py b/op_builder/cpu_lion.py index 5c16d10ebb44..9a60d99773b3 100644 --- a/op_builder/cpu_lion.py +++ b/op_builder/cpu_lion.py @@ -3,7 +3,6 @@ # DeepSpeed Team -import os from .builder import TorchCPUOpBuilder @@ -18,31 +17,11 @@ def absolute_name(self): return f'deepspeed.ops.lion.{self.NAME}_op' def sources(self): - if 
self.build_for_cpu: - return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp'] - - return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp', 'csrc/common/custom_cuda_kernel.cu'] + return ['csrc/lion/cpu_lion.cpp', 'csrc/lion/cpu_lion_impl.cpp'] def libraries_args(self): args = super().libraries_args() - if self.build_for_cpu: - return args - - if not self.is_rocm_pytorch(): - args += ['curand'] - return args def include_paths(self): - import torch - if self.build_for_cpu: - CUDA_INCLUDE = [] - elif not self.is_rocm_pytorch(): - CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] - else: - CUDA_INCLUDE = [ - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "rocrand"), - os.path.join(torch.utils.cpp_extension.ROCM_HOME, "include", "hiprand"), - ] - return ['csrc/includes'] + CUDA_INCLUDE + return ['csrc/includes'] diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py index 6e7721f94e01..7f68ccf87290 100644 --- a/op_builder/evoformer_attn.py +++ b/op_builder/evoformer_attn.py @@ -41,29 +41,50 @@ def nvcc_args(self): args.append(f"-DGPU_ARCH={major}{minor}") return args - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile kernels") return False if self.cutlass_path is None: - self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") + if verbose: + self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") return False - with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: - if '3.1.0' not in f.read(): - self.warning("Please use CUTLASS version >= 3.1.0") + if os.path.exists(f'{self.cutlass_path}/CHANGELOG.md'): + with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: + if '3.1.0' not in f.read(): + if verbose: + self.warning("Please use CUTLASS version >= 3.1.0") + return False + else: + # pip install nvidia-cutlass package + try: + import cutlass + except ImportError: + if verbose: + self.warning("Please pip install nvidia-cutlass if trying to pre-compile kernels") return False + cutlass_major, cutlass_minor = cutlass.__version__.split('.')[:2] + cutlass_compatible = (int(cutlass_major) >= 3 and int(cutlass_minor) >= 1) + if not cutlass_compatible: + if verbose: + self.warning("Please use CUTLASS version >= 3.1.0") + return False + cuda_okay = True if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda sys_cuda_major, _ = installed_cuda_version() torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda if cuda_capability < 7: - self.warning("Please use a GPU with compute capability >= 7.0") + if verbose: + self.warning("Please use a GPU with compute capability >= 7.0") cuda_okay = False if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("Please use CUDA 11+") + if verbose: + self.warning("Please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/op_builder/fp_quantizer.py b/op_builder/fp_quantizer.py new file mode 100644 index 000000000000..df4d967ea09a --- /dev/null +++ b/op_builder/fp_quantizer.py @@ -0,0 +1,120 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + from packaging import version as pkg_version +except ImportError: + pkg_version = None + +from .builder import CUDAOpBuilder, installed_cuda_version + + +class FPQuantizerBuilder(CUDAOpBuilder): + BUILD_VAR = "DS_BUILD_FP_QUANTIZER" + NAME = "fp_quantizer" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.fp_quantizer.{self.NAME}_op' + + def is_compatible(self, verbose=False): + try: + import torch + except ImportError: + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") + return False + + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + sys_cuda_major, _ = installed_cuda_version() + torch_cuda_major = int(torch.version.cuda.split('.')[0]) + cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda + if cuda_capability < 8: + if verbose: + self.warning("NVIDIA Inference is only supported on Ampere and newer architectures") + cuda_okay = False + if cuda_capability >= 8: + if torch_cuda_major < 11 or sys_cuda_major < 11: + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") + cuda_okay = False + + try: + import triton + except ImportError: + if verbose: + self.warning( + f"please install triton==2.3.0, 2.3.1 or 3.0.0 if you want to use the FP Quantizer Kernels") + return False + + # triton 2.3.{0,1} and 3.0.0 are ok. + allowed_versions = ("2.3", "3.0") + if pkg_version: + allowed = (pkg_version.parse(v) for v in allowed_versions) + installed_triton = pkg_version.parse(triton.__version__) + triton_mismatch = all(installed_triton.major != a.major or installed_triton.minor != a.minor + for a in allowed) + else: + installed_triton = triton.__version__ + major, minor, _ = installed_triton.split(".") + allowed = (v.split(".") for v in allowed_versions) + triton_mismatch = all(major != v[0] or minor != v[1] for v in allowed) + + if triton_mismatch: + if verbose: + self.warning( + f"FP Quantizer is using an untested triton version ({installed_triton}), only 2.3.{0,1} and 3.0.0 are known to be compatible with these kernels" + ) + return False + + return super().is_compatible(verbose) and cuda_okay + + def filter_ccs(self, ccs): + ccs_retained = [] + ccs_pruned = [] + for cc in [cc.split('.') for cc in ccs]: + if int(cc[0]) >= 8: + ccs_retained.append(cc) + else: + ccs_pruned.append(cc) + if len(ccs_pruned) > 0: + self.warning(f"Filtered compute capabilities {ccs_pruned}") + return ccs_retained + + def sources(self): + return [ + "csrc/fp_quantizer/fp_quantize.cu", + "csrc/fp_quantizer/fp_quantize.cpp", + ] + + def extra_ldflags(self): + if not self.is_rocm_pytorch(): + return ['-lcurand'] + else: + return [] + + def include_paths(self): + return ['csrc/fp_quantizer/includes', 'csrc/includes'] + + @staticmethod + def get_default_quant_dtype(): + import torch + return torch.uint8 + + @staticmethod + def get_quant_range(q_bits=None): + if q_bits == 8: + return 480 + elif q_bits == 6: + return 28. + elif q_bits == 12: + return 510. + else: + assert (0), \ + "Please specify the right quantization range for the selected precision!" diff --git a/op_builder/gds.py b/op_builder/gds.py new file mode 100644 index 000000000000..d815ccea387a --- /dev/null +++ b/op_builder/gds.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +from .async_io import AsyncIOBuilder + + +class GDSBuilder(AsyncIOBuilder): + BUILD_VAR = "DS_BUILD_GDS" + NAME = "gds" + + def __init__(self): + super().__init__() + + def absolute_name(self): + return f'deepspeed.ops.gds.{self.NAME}_op' + + def lib_sources(self): + src_list = ['csrc/gds/py_lib/deepspeed_py_gds_handle.cpp', 'csrc/gds/py_lib/deepspeed_gds_op.cpp'] + return super().lib_sources() + src_list + + def sources(self): + return self.lib_sources() + ['csrc/gds/py_lib/py_ds_gds.cpp'] + + def cxx_args(self): + return super().cxx_args() + ['-lcufile'] + + def include_paths(self): + import torch + CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")] + return ['csrc/aio/py_lib', 'csrc/aio/common'] + CUDA_INCLUDE + + def extra_ldflags(self): + return super().extra_ldflags() + ['-lcufile'] + + def is_compatible(self, verbose=False): + if self.is_rocm_pytorch(): + if verbose: + self.warning(f'{self.NAME} is not compatible with ROCM') + return False + + try: + import torch.utils.cpp_extension + except ImportError: + if verbose: + self.warning("Please install torch if trying to pre-compile GDS") + return False + + CUDA_HOME = torch.utils.cpp_extension.CUDA_HOME + if CUDA_HOME is None: + if verbose: + self.warning("Please install torch CUDA if trying to pre-compile GDS with CUDA") + return False + + CUDA_LIB64 = os.path.join(CUDA_HOME, "lib64") + gds_compatible = self.has_function(funcname="cuFileDriverOpen", + libraries=("cufile", ), + library_dirs=( + CUDA_HOME, + CUDA_LIB64, + ), + verbose=verbose) + + return gds_compatible and super().is_compatible(verbose) diff --git a/op_builder/hpu/__init__.py b/op_builder/hpu/__init__.py new file mode 100644 index 000000000000..5ad1b9a7f891 --- /dev/null +++ b/op_builder/hpu/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder +from .transformer_inference import InferenceBuilder +from .no_impl import NotImplementedBuilder diff --git a/op_builder/hpu/builder.py b/op_builder/hpu/builder.py new file mode 100644 index 000000000000..11e710a8ee48 --- /dev/null +++ b/op_builder/hpu/builder.py @@ -0,0 +1,38 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class CPUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + args = ['-O3', '-g', '-Wno-reorder'] + return args + + def libraries_args(self): + return [] diff --git a/op_builder/hpu/cpu_adam.py b/op_builder/hpu/cpu_adam.py new file mode 100644 index 000000000000..58eea2698ebb --- /dev/null +++ b/op_builder/hpu/cpu_adam.py @@ -0,0 +1,28 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class CPUAdamBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/hpu/fp_quantizer.py b/op_builder/hpu/fp_quantizer.py new file mode 100644 index 000000000000..c74affb55045 --- /dev/null +++ b/op_builder/hpu/fp_quantizer.py @@ -0,0 +1,86 @@ +# Copyright (c) 2024 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class FPQuantizerBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_FP_QUANTIZER" + NAME = "fp_quantizer" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.fp_quantizer.{self.NAME}_op' + + def sources(self): + return [] + + def load(self, verbose=True): + return FPQuantizer + + @staticmethod + def get_default_quant_dtype(): + return torch.float8_e4m3fn + + @staticmethod + def get_quant_range(q_bits=None): + import habana_frameworks.torch.utils.experimental as htexp + if htexp._get_device_type() == htexp.synDeviceType.synDeviceGaudi2: + dtype = torch.float8_e4m3fnuz + else: + dtype = torch.float8_e4m3fn + return torch.finfo(dtype).max + + +class FPQuantizer: + CUDA_IMPL = False + + @classmethod + def selective_dequantize(cls, val_q, scales, indexes, group_size, q_mantisa_bits, q_exponent_bits): + assert False, "Selective dequantize isn't implemented for HPU!" 
+ + @classmethod + def dequantize(cls, fp_out, input_q, scale, group_size, q_mantisa_bits, q_exponent_bits): + orig_shape = fp_out.shape + orig_dtype = fp_out.dtype + dequant_out = torch.ops.hpu.cast_from_fp8(input_q, (1.0 / scale), orig_dtype).view(orig_shape) + fp_out.copy_(dequant_out) + return fp_out + + @classmethod + def quantize(cls, out, val, scale, group_size, stochastic_rounding, q_bits, q_mantisa_bits): + assert q_bits == 8, "Quantize on HPU only supports quantization to FP8" + assert q_mantisa_bits == 3, "Quantize on HPU only supports q_mantissa_bits = 3" + assert out.dtype.is_floating_point, "Quantization on HPU is only to float dtypes" + + num_groups, group_size = out.shape + + # Reshape the tensor + val_reshaped = val.view(num_groups, group_size).float() + # Calculate the scale + max_vals = val_reshaped.abs().max(dim=1, keepdim=True)[0] + q_range = torch.finfo(out.dtype).max + tmp_scale = q_range / max_vals + scale.copy_(tmp_scale) + # Copy quantized + quant, _ = torch.ops.hpu.cast_to_fp8_v2(val_reshaped, scale, stochastic_rounding, dtype=out.dtype) + out.copy_(quant) + + return out + + @classmethod + def get_scales(cls, out, num_groups): + return out diff --git a/op_builder/hpu/fused_adam.py b/op_builder/hpu/fused_adam.py new file mode 100644 index 000000000000..5acb121668e3 --- /dev/null +++ b/op_builder/hpu/fused_adam.py @@ -0,0 +1,105 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + +try: + import torch + import math +except ImportError as e: + pass + + +class HPUFusedAdam: + htcore = None + is_lazy_mode = None + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + + if HPUFusedAdam.htcore is None: + from habana_frameworks.torch import core as htcore + from habana_frameworks.torch.utils.internal import is_lazy + HPUFusedAdam.htcore = htcore + HPUFusedAdam.is_lazy_mode = is_lazy() + + htcore = HPUFusedAdam.htcore + + htcore.step_closure._mark_step_if_lazy() + step_size = lr + if bias_correction: + bias_correction1 = 1.0 - pow(beta1, step) + bias_correction2 = 1.0 - pow(beta2, step) + step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 + + neg_step = -step_size + neg_step_t = (torch.tensor([neg_step], dtype=torch.float, + requires_grad=False).to(tensor_lists[1][0].dtype).to(tensor_lists[1][0].device, + non_blocking=True)) + + weight_decay = weight_decay if adam_w_mode else 0 + + # since lr is fed into the kernel as a tensor, perform the scalar multiplication of wd here + # NOTE: TODO if lr is updated every step, then we need to convert it to a tensor and + # perform weight decay unconditionally. 
+ modified_wd = 1.0 - weight_decay * lr + + if HPUFusedAdam.is_lazy_mode: + torch.ops.hpu.optimizer_adamw( + tensor_lists[0], + tensor_lists[1], + tensor_lists[2], + tensor_lists[3], + neg_step_t, + beta1, + beta2, + epsilon, + modified_wd, + ) + else: + modified_wd_t = (torch.tensor([modified_wd], dtype=torch.float, requires_grad=False).to( + tensor_lists[1][0].dtype).to(tensor_lists[1][0].device, non_blocking=True)) + torch.ops.hpu.optimizer_adamw( + tensor_lists[0], + tensor_lists[1], + tensor_lists[2], + tensor_lists[3], + neg_step_t, + beta1, + beta2, + epsilon, + modified_wd_t, + modified_wd != 1.0, + ) + + htcore.step_closure._mark_step_if_lazy() + + +class FusedAdamBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return HPUFusedAdam diff --git a/op_builder/hpu/no_impl.py b/op_builder/hpu/no_impl.py new file mode 100644 index 000000000000..140d65b48def --- /dev/null +++ b/op_builder/hpu/no_impl.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import CPUOpBuilder + + +class NotImplementedBuilder(CPUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op has not been implemented on the HPU backend.") + + def sources(self): + return [] diff --git a/op_builder/hpu/transformer_inference.py b/op_builder/hpu/transformer_inference.py new file mode 100644 index 000000000000..e397c99200ec --- /dev/null +++ b/op_builder/hpu/transformer_inference.py @@ -0,0 +1,39 @@ +# Copyright (c) 2023 Habana Labs, Ltd. an Intel Company +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 +import importlib + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class InferenceBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=self.NAME) + + def absolute_name(self): + return f"deepspeed.ops.transformer.inference.{self.NAME}_op" + + def sources(self): + return [] + + def load(self, verbose=True): + if self.name in __class__._loaded_ops: + return __class__._loaded_ops[self.name] + + from deepspeed.git_version_info import installed_ops # noqa: F401 + if installed_ops.get(self.name, False): + op_module = importlib.import_module(self.absolute_name()) + __class__._loaded_ops[self.name] = op_module + return op_module diff --git a/op_builder/inference_core_ops.py b/op_builder/inference_core_ops.py index 229b500bebda..b6665ebb7618 100755 --- a/op_builder/inference_core_ops.py +++ b/op_builder/inference_core_ops.py @@ -19,11 +19,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.inference.v2.kernels{self.NAME}' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -32,18 +33,20 @@ def is_compatible(self, verbose=True): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda if cuda_capability < 6: - self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") + if verbose: + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: @@ -60,13 +63,15 @@ def sources(self): sources = [ "inference/v2/kernels/core_ops/core_ops.cpp", "inference/v2/kernels/core_ops/bias_activations/bias_activation.cpp", - "inference/v2/kernels/core_ops/bias_activations/bias_activation.cu", + "inference/v2/kernels/core_ops/bias_activations/bias_activation_cuda.cu", "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cpp", - "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm.cu", + "inference/v2/kernels/core_ops/cuda_layer_norm/layer_norm_cuda.cu", "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cpp", - "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm.cu", + "inference/v2/kernels/core_ops/cuda_rms_norm/rms_norm_cuda.cu", "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cpp", - "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels.cu", + "inference/v2/kernels/core_ops/gated_activations/gated_activation_kernels_cuda.cu", + 
"inference/v2/kernels/core_ops/cuda_linear/linear_kernels.cpp", + "inference/v2/kernels/core_ops/cuda_linear/linear_kernels_cuda.cu", ] prefix = self.get_prefix() @@ -83,6 +88,7 @@ def include_paths(self): 'inference/v2/kernels/core_ops/cuda_layer_norm', 'inference/v2/kernels/core_ops/cuda_rms_norm', 'inference/v2/kernels/core_ops/gated_activations', + 'inference/v2/kernels/core_ops/cuda_linear', 'inference/v2/kernels/includes', ] diff --git a/op_builder/inference_cutlass_builder.py b/op_builder/inference_cutlass_builder.py index 51f7931d9435..a4a607288ca8 100644 --- a/op_builder/inference_cutlass_builder.py +++ b/op_builder/inference_cutlass_builder.py @@ -18,11 +18,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.inference.v2.kernels.cutlass_ops.{self.NAME}' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -31,18 +32,20 @@ def is_compatible(self, verbose=True): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda if cuda_capability < 6: - self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") + if verbose: + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 8: # Only support Ampere and newer ccs_retained.append(cc) diff --git a/op_builder/mlu/__init__.py b/op_builder/mlu/__init__.py new file mode 100644 index 000000000000..db12afbbf20e --- /dev/null +++ b/op_builder/mlu/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +# MLU related operators will be added in the future. +from .no_impl import NotImplementedBuilder +from .cpu_adagrad import CPUAdagradBuilder +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder diff --git a/op_builder/mlu/builder.py b/op_builder/mlu/builder.py new file mode 100644 index 000000000000..17b9723ffcc1 --- /dev/null +++ b/op_builder/mlu/builder.py @@ -0,0 +1,35 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class MLUOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/mlu/cpu_adagrad.py b/op_builder/mlu/cpu_adagrad.py new file mode 100644 index 000000000000..68b7bbe514ee --- /dev/null +++ b/op_builder/mlu/cpu_adagrad.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdagradBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" + NAME = "cpu_adagrad" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adagrad.{self.NAME}_op' + + def sources(self): + return ['csrc/adagrad/cpu_adagrad.cpp'] + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/cpu_adam.py b/op_builder/mlu/cpu_adam.py new file mode 100644 index 000000000000..b3c8e476bf39 --- /dev/null +++ b/op_builder/mlu/cpu_adam.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class CPUAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/mlu/fused_adam.py b/op_builder/mlu/fused_adam.py new file mode 100644 index 000000000000..2343e55568c4 --- /dev/null +++ b/op_builder/mlu/fused_adam.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + +try: + import torch +except ImportError as e: + pass + + +class MLUFusedAdam: + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + + torch.ops.torch_mlu.fused_adam(noop_flag_buffer, tensor_lists[0], tensor_lists[1], tensor_lists[2], + tensor_lists[3], lr, beta1, beta2, epsilon, step, adam_w_mode, bias_correction, + weight_decay) + + +class FusedAdamBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return MLUFusedAdam diff --git a/op_builder/mlu/no_impl.py b/op_builder/mlu/no_impl.py new file mode 100644 index 000000000000..375c148b4a5e --- /dev/null +++ b/op_builder/mlu/no_impl.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Copyright (c) 2024 Cambricon Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import MLUOpBuilder + + +class NotImplementedBuilder(MLUOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on MLU backend.") + + def sources(self): + return [] + + def cxx_args(self): + return [] + + def extra_ldflags(self): + return [] + + def include_paths(self): + return [] diff --git a/op_builder/npu/__init__.py b/op_builder/npu/__init__.py index 6ad9124d36f0..e512e25a2a29 100644 --- a/op_builder/npu/__init__.py +++ b/op_builder/npu/__init__.py @@ -5,7 +5,9 @@ '''Copyright The Microsoft DeepSpeed Team''' from .fused_adam import FusedAdamBuilder +from .async_io import AsyncIOBuilder from .no_impl import NotImplementedBuilder from .cpu_adam import CPUAdamBuilder from .cpu_adagrad import CPUAdagradBuilder from .cpu_lion import CPULionBuilder +from .inference import InferenceBuilder diff --git a/op_builder/npu/async_io.py b/op_builder/npu/async_io.py new file mode 100644 index 000000000000..76d495b8106f --- /dev/null +++ b/op_builder/npu/async_io.py @@ -0,0 +1,103 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import distutils.spawn +import subprocess + +from .builder import NPUOpBuilder + + +class AsyncIOBuilder(NPUOpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp' + ] + + def include_paths(self): + args = super().include_paths() + args += ['csrc/aio/py_lib', 'csrc/aio/common'] + return args + + def cxx_args(self): + args = super().cxx_args() + # -O0 for improved debugging, since performance is bound by I/O + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + import torch # Keep this import here to avoid errors when building DeepSpeed wheel without torch installed + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: + CPP_STD = '-std=c++17' + else: + CPP_STD = '-std=c++14' + return args + [ + '-g', + '-Wall', + '-O0', + CPP_STD, + '-shared', + '-fPIC', + '-Wno-reorder', + CPU_ARCH, + '-fopenmp', + SIMD_WIDTH, + '-laio', + ] + + def extra_ldflags(self): + args = super().extra_ldflags() + return args + ['-laio'] + + def check_for_libaio_pkg(self): + libs = dict( + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + found = True + else: + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") + break + return found + + def is_compatible(self, verbose=False): + # Check for the existence of libaio by using distutils + # to compile and link a test program that calls io_submit, + # which is a function provided by libaio that is used in the async_io op. + # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS + # respectively to specify the directories for libaio.h and libaio.so. + aio_compatible = self.has_function('io_pgetevents', ('aio', )) + if verbose and not aio_compatible: + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") + + # Check for the libaio package via known package managers + # to print suggestions on which package to install. + self.check_for_libaio_pkg() + + self.warning( + "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." 
+ ) + return super().is_compatible(verbose) and aio_compatible diff --git a/op_builder/npu/fused_adam.py b/op_builder/npu/fused_adam.py index fc1bc83c7cc7..d32103db7055 100644 --- a/op_builder/npu/fused_adam.py +++ b/op_builder/npu/fused_adam.py @@ -16,8 +16,8 @@ class NPUFusedAdam: @staticmethod def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, bias_correction, weight_decay, *args): - bias_correction1 = beta1**step - bias_correction2 = beta2**step + bias_correction1 = beta1**(step - 1) + bias_correction2 = beta2**(step - 1) # iteration group['params'] for i in range(len(tensor_lists[0])): diff --git a/op_builder/npu/inference.py b/op_builder/npu/inference.py new file mode 100644 index 000000000000..46f28c0d4011 --- /dev/null +++ b/op_builder/npu/inference.py @@ -0,0 +1,307 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from enum import IntEnum +from .builder import NPUOpBuilder + +try: + import torch + import torch_npu +except ImportError as e: + pass + + +class ActivationFuncType(IntEnum): + UNKNOWN = 0 + GELU = 1 + ReLU = 2 + GATED_GELU = 3 + GATED_SILU = 4 + + +class InferenceContext: + _workspace = None + + _seed = 42 + _curr_offset = 0 + _stream = 0 + _free_memory_size = 0 + _num_tokens = 1 + _attention_unfused_workspace_offset = 0 + _workSpaceSize = 0 + + workSpaceSize = 0 + kv_caches = None + + @staticmethod + def reset_tokens(initial_tokens=1): + InferenceContext._num_tokens = initial_tokens + + @staticmethod + def current_tokens(): + return InferenceContext._num_tokens + + @staticmethod + def GetWorkSpace(): + return InferenceContext._workspace + + +class NPUInference: + + @staticmethod + def layer_norm(inputs, gamma, beta, epsilon): + return torch.nn.functional.layer_norm(inputs, [inputs.shape[-1]], gamma, beta, eps=epsilon) + + @staticmethod + def _qkv_gemm(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + inp_norm = torch.nn.functional.layer_norm(inputs, (inputs.shape[2], ), gamma, beta, eps) + weight = weight.t() if transpose else weight + tmp = torch.matmul(inp_norm, weight) + if add_bias: + tmp += bias + output = [tmp, inp_norm] + return output + + @staticmethod + def qkv_gemm_fp16(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + return NPUInference._qkv_gemm(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose) + + @staticmethod + def qkv_gemm_bf16(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + return NPUInference._qkv_gemm(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose) + + @staticmethod + def qkv_gemm_fp32(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose): + return NPUInference._qkv_gemm(inputs, weight, q_scale, bias, gamma, beta, eps, add_bias, q_int8, transpose) + + @staticmethod + def _bias_add_transform_0213(vals, bias, hidden_dim, seq_length, seq_offset, heads, num_kv, rotary_dim, + rotate_half, rotate_every_two, rope_theta): + bsz, _, _ = vals.shape + q = vals[..., :hidden_dim].reshape(bsz, seq_length, heads, -1) + k = vals[..., hidden_dim:hidden_dim + num_kv * (hidden_dim // heads)].reshape(bsz, seq_length, num_kv, -1) + v = vals[..., hidden_dim + num_kv * (hidden_dim // heads):] + + if rotary_dim > 0 and rotate_every_two: + # sin, cos may use cache + seq_id = torch.arange(0, seq_length).to("npu") + inv_freq = torch.arange(0, rotary_dim, 2) / rotary_dim + inv_freq = 
inv_freq.to("npu") + inv_freq = 1.0 / torch.pow(rope_theta, inv_freq) + inv_freq = torch.outer(seq_id, inv_freq) + sin = inv_freq.sin() + cos = inv_freq.cos() + # shape: [bsz=1, seq_len, heads=1, rotary_dim] + sin = sin.view(-1, seq_length, 1, rotary_dim // 2).repeat_interleave(2, dim=-1) + cos = cos.view(-1, seq_length, 1, rotary_dim // 2).repeat_interleave(2, dim=-1) + + q_pos, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_pos, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + q_pos = torch_npu.npu_rotary_mul(q_pos, cos, sin) + q = torch.cat([q_pos, q_pass], dim=-1) + k_pos = torch_npu.npu_rotary_mul(k_pos, cos, sin) + k = torch.cat([k_pos, k_pass], dim=-1) + + output = q.reshape(bsz, seq_length, -1).contiguous() # [b, s, H] + k_cache = k.reshape(bsz, seq_length, heads, -1).transpose(1, 2).contiguous() # [b, n, s, d] + v_cache = v.reshape(bsz, seq_length, heads, -1).transpose(1, 2).contiguous() # [b, n, s, d] + return output, k_cache, v_cache + + @staticmethod + def _softmax_context(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, + norm_factor, triangular_masking, local_attention, window_size, no_masking, layer_id, + num_layers, alibi, rope_theta): + bsz, seq_len, k = query_key_value.size() + k = k // (heads + 2 * (num_kv if num_kv > 0 else heads)) + hidden_dim = heads * k + + is_promt = seq_len > 1 + if not InferenceContext.kv_caches: + InferenceContext.kv_caches = [[None, None] for _ in range(num_layers)] + if is_promt: + InferenceContext.reset_tokens(seq_len) + InferenceContext.kv_caches[layer_id] = [None, None] + + soft_len = InferenceContext.current_tokens() + workspace = InferenceContext.GetWorkSpace() + seq_offset = 0 if is_promt else soft_len - 1 + + q, k, v = NPUInference._bias_add_transform_0213(vals=query_key_value, + bias=None, + hidden_dim=hidden_dim, + seq_length=seq_len, + seq_offset=seq_offset, + heads=heads, + num_kv=num_kv if num_kv > 0 else heads, + rotary_dim=rotary_dim, + rotate_half=rotate_half, + rotate_every_two=rotate_every_two, + rope_theta=rope_theta) + + if not is_promt: + k_cache, v_cache = InferenceContext.kv_caches[layer_id] + if k_cache is not None: + k = torch.cat([k_cache, k], dim=2) + v = torch.cat([v_cache, v], dim=2) + InferenceContext.kv_caches[layer_id] = [k, v] + seq_len = k.shape[2] + + layer_scale = max(1, layer_id) if len(alibi.size()) > 1 else 1.0 + alpha = norm_factor * norm_factor / layer_scale + + output = torch_npu.npu_fusion_attention(q, + k.transpose(1, 2).reshape(bsz, seq_len, -1).contiguous(), + v.transpose(1, 2).reshape(bsz, seq_len, -1).contiguous(), + heads, + "BSH", + pse=None, + padding_mask=None, + atten_mask=attn_mask.bool(), + scale=alpha, + pre_tockens=65536, + next_tockens=65536, + keep_prob=1, + inner_precise=0)[0] + + return output, k, v + + @staticmethod + def softmax_context_fp16(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, + norm_factor, triangular_masking, local_attention, window_size, no_masking, layer_id, + num_layers, alibi, rope_theta): + return NPUInference._softmax_context(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, + heads, num_kv, norm_factor, triangular_masking, local_attention, + window_size, no_masking, layer_id, num_layers, alibi, rope_theta) + + @staticmethod + def softmax_context_bf16(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, + norm_factor, triangular_masking, local_attention, window_size, no_masking, layer_id, + num_layers, alibi, rope_theta): + 
return NPUInference._softmax_context(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, + heads, num_kv, norm_factor, triangular_masking, local_attention, + window_size, no_masking, layer_id, num_layers, alibi, rope_theta) + + @staticmethod + def softmax_context_fp32(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, heads, num_kv, + norm_factor, triangular_masking, local_attention, window_size, no_masking, layer_id, + num_layers, alibi, rope_theta): + return NPUInference._softmax_context(query_key_value, attn_mask, rotary_dim, rotate_half, rotate_every_two, + heads, num_kv, norm_factor, triangular_masking, local_attention, + window_size, no_masking, layer_id, num_layers, alibi, rope_theta) + + @staticmethod + def _vector_matmul(input, weight, async_op, q_scale, q_int8, transposed_mode): + if transposed_mode: + return torch.matmul(input, weight.t()) + return torch.matmul(input, weight) + + @staticmethod + def vector_matmul_fp16(input, weight, async_op, q_scale, q_int8, transposed_mode): + return NPUInference._vector_matmul(input, weight, async_op, q_scale, q_int8, transposed_mode) + + @staticmethod + def vector_matmul_bf16(input, weight, async_op, q_scale, q_int8, transposed_mode): + return NPUInference._vector_matmul(input, weight, async_op, q_scale, q_int8, transposed_mode) + + @staticmethod + def vector_matmul_fp32(input, weight, async_op, q_scale, q_int8, transposed_mode): + return NPUInference._vector_matmul(input, weight, async_op, q_scale, q_int8, transposed_mode) + + @staticmethod + def _mlp_gemm(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, + mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): + if mlp_after_attn: + residual_add = torch.nn.functional.layer_norm(input + residual + input_bias, (input.shape[-1], ), gamma, + beta, eps) + else: + residual_add = torch.nn.functional.layer_norm(input, (input.shape[-1], ), gamma, beta, eps) + + weight_interm = weight_interm.t() if transpose else weight_interm + tmp = torch.matmul(residual_add, weight_interm) + if mlp_act_func_type == ActivationFuncType.GELU: + tmp = torch.nn.functional.gelu(tmp + bias) + elif mlp_act_func_type == ActivationFuncType.ReLU: + tmp = torch.nn.functional.relu(tmp + bias) + else: + raise Exception('Unsupported ActivationFuncType {}'.format(mlp_act_func_type)) + output = torch.matmul(tmp, weight_out.t()) + return output, residual_add + + @staticmethod + def mlp_gemm_fp16(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, + mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): + return NPUInference._mlp_gemm(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, + pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, + mlp_act_func_type, transpose) + + @staticmethod + def mlp_gemm_bf16(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, + mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): + return NPUInference._mlp_gemm(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, + pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, + mlp_act_func_type, transpose) + + @staticmethod + def mlp_gemm_fp32(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, pre_layer_norm, + mlp_after_attn, interm_scale, out_scale, dtype, mlp_act_func_type, transpose): + return 
NPUInference._mlp_gemm(input, residual, input_bias, weight_interm, weight_out, bias, gamma, beta, eps, + pre_layer_norm, mlp_after_attn, interm_scale, out_scale, dtype, + mlp_act_func_type, transpose) + + @staticmethod + def _residual_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + if mlp_after_attn: + if pre_layer_norm: + tmp = (residual.float() + attention_output.float() + attention_bias.float() + + final_bias.float()) / mp_size + hidden_state.float() + else: + tmp = residual.float() + hidden_state.float() + final_bias.float() + else: + if add_bias: + residual += attention_bias.float() + tmp = hidden_state.float() + attention_output.float() + (residual.float() + final_bias.float()) / mp_size + + input_dtype = hidden_state.dtype + residual.set_(tmp.to(input_dtype)) + + @staticmethod + def residual_add_bias_fp16(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + return NPUInference._residual_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + mp_size, mlp_after_attn, add_bias, pre_layer_norm) + + @staticmethod + def residual_add_bias_bf16(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + return NPUInference._residual_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + mp_size, mlp_after_attn, add_bias, pre_layer_norm) + + @staticmethod + def residual_add_bias_fp32(hidden_state, residual, attention_output, attention_bias, final_bias, mp_size, + mlp_after_attn, add_bias, pre_layer_norm): + return NPUInference._residual_add_bias(hidden_state, residual, attention_output, attention_bias, final_bias, + mp_size, mlp_after_attn, add_bias, pre_layer_norm) + + +class InferenceBuilder(NPUOpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.transformer.inference.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return NPUInference diff --git a/op_builder/quantizer.py b/op_builder/quantizer.py index fd765b743de0..0b5348e5af96 100644 --- a/op_builder/quantizer.py +++ b/op_builder/quantizer.py @@ -32,4 +32,7 @@ def include_paths(self): return ['csrc/includes'] def extra_ldflags(self): - return ['-lcurand'] + if not self.is_rocm_pytorch(): + return ['-lcurand'] + else: + return [] diff --git a/op_builder/ragged_ops.py b/op_builder/ragged_ops.py index 13d71b476b5a..0df28cc2282a 100644 --- a/op_builder/ragged_ops.py +++ b/op_builder/ragged_ops.py @@ -19,11 +19,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.inference.v2.kernels.ragged_ops.{self.NAME}' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -32,18 +33,20 @@ def is_compatible(self, verbose=True): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda if cuda_capability < 6: - self.warning("NVIDIA Inference is only supported on Pascal and newer 
architectures") + if verbose: + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 8: # Blocked flash has a dependency on Ampere + newer ccs_retained.append(cc) @@ -63,18 +66,18 @@ def sources(self): "inference/v2/kernels/ragged_ops/atom_builder/atom_builder.cpp", "inference/v2/kernels/ragged_ops/blocked_flash/blocked_flash.cpp", "inference/v2/kernels/ragged_ops/embed/embed.cpp", - "inference/v2/kernels/ragged_ops/embed/embed.cu", + "inference/v2/kernels/ragged_ops/embed/embed_cuda.cu", "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cpp", - "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary.cu", + "inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary/blocked_kv_rotary_cuda.cu", "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cpp", - "inference/v2/kernels/ragged_ops/logits_gather/logits_gather.cu", + "inference/v2/kernels/ragged_ops/logits_gather/logits_gather_cuda.cu", "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cpp", - "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter.cu", + "inference/v2/kernels/ragged_ops/moe_scatter/moe_scatter_cuda.cu", "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cpp", - "inference/v2/kernels/ragged_ops/moe_gather/moe_gather.cu", + "inference/v2/kernels/ragged_ops/moe_gather/moe_gather_cuda.cu", "inference/v2/kernels/ragged_ops/ragged_helpers/ragged_kernel_helpers.cpp", - "inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cpp", - "inference/v2/kernels/ragged_ops/top_1_gating/top_1_gating.cu", + "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating.cpp", + "inference/v2/kernels/ragged_ops/top_k_gating/top_k_gating_cuda.cu", ] prefix = self.get_prefix() @@ -101,12 +104,13 @@ def include_paths(self): 'inference/v2/kernels/ragged_ops/atom_builder', 'inference/v2/kernels/ragged_ops/blocked_flash', 'inference/v2/kernels/ragged_ops/embed', + 'inference/v2/kernels/ragged_ops/includes', 'inference/v2/kernels/ragged_ops/linear_blocked_kv_rotary', 'inference/v2/kernels/ragged_ops/logits_gather', 'inference/v2/kernels/ragged_ops/moe_gather', 'inference/v2/kernels/ragged_ops/moe_scatter', 'inference/v2/kernels/ragged_ops/ragged_helpers', - 'inference/v2/kernels/ragged_ops/top_1_gating', + 'inference/v2/kernels/ragged_ops/top_k_gating', ] prefix = self.get_prefix() diff --git a/op_builder/ragged_utils.py b/op_builder/ragged_utils.py index 89450e1fd30d..208c9f833ebe 100755 --- a/op_builder/ragged_utils.py +++ b/op_builder/ragged_utils.py @@ -19,11 +19,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.inference.v2.{self.NAME}' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -32,18 +33,20 @@ def is_compatible(self, verbose=True): torch_cuda_major = 
int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major #ignore-cuda if cuda_capability < 6: - self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") + if verbose: + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: diff --git a/op_builder/sdaa/__init__.py b/op_builder/sdaa/__init__.py new file mode 100755 index 000000000000..2a6eb8bbfa2c --- /dev/null +++ b/op_builder/sdaa/__init__.py @@ -0,0 +1,36 @@ +# Copyright (c) Microsoft Corporation. + +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team +'''Copyright The Microsoft DeepSpeed Team''' + +from .no_impl import NotImplementedBuilder +from .cpu_adam import CPUAdamBuilder +from .fused_adam import FusedAdamBuilder diff --git a/op_builder/sdaa/builder.py b/op_builder/sdaa/builder.py new file mode 100755 index 000000000000..81f0e98c0768 --- /dev/null +++ b/op_builder/sdaa/builder.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. 
+# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +try: + # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class SDAAOpBuilder(OpBuilder): + + def builder(self): + from torch.utils.cpp_extension import CppExtension as ExtensionBuilder + + compile_args = {'cxx': self.strip_empty_entries(self.cxx_args())} + + cpp_ext = ExtensionBuilder(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), + extra_compile_args=compile_args) + + return cpp_ext + + def cxx_args(self): + return ['-O3', '-g', '-Wno-reorder'] + + def libraries_args(self): + return [] diff --git a/op_builder/sdaa/cpu_adam.py b/op_builder/sdaa/cpu_adam.py new file mode 100755 index 000000000000..b38a71a6275d --- /dev/null +++ b/op_builder/sdaa/cpu_adam.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. + +# DeepSpeed Team + +from .builder import SDAAOpBuilder + + +class CPUAdamBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/sdaa/fused_adam.py b/op_builder/sdaa/fused_adam.py new file mode 100755 index 000000000000..73a2dff41459 --- /dev/null +++ b/op_builder/sdaa/fused_adam.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. 
+ +# DeepSpeed Team + +from .builder import SDAAOpBuilder + +try: + import torch +except ImportError as e: + pass + + +class SDAAFusedAdam: + + @staticmethod + def multi_tensor_adam(chunk_size, noop_flag_buffer, tensor_lists, lr, beta1, beta2, epsilon, step, adam_w_mode, + bias_correction, weight_decay, *args): + g_tensor_lis, p_tensor_lis, m_tensor_lis, v_tensor_lis = tensor_lists + torch.ops.sdaa.fused_adam(g_tensor_lis, p_tensor_lis, m_tensor_lis, v_tensor_lis, [], beta1, beta2, epsilon, + lr, weight_decay, adam_w_mode, step, bias_correction) + + +class FusedAdamBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return [] + + def include_paths(self): + return [] + + def load(self, verbose=True): + return SDAAFusedAdam diff --git a/op_builder/sdaa/no_impl.py b/op_builder/sdaa/no_impl.py new file mode 100755 index 000000000000..10a8b8f48652 --- /dev/null +++ b/op_builder/sdaa/no_impl.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# BSD 3- Clause License Copyright (c) 2023, Tecorigin Co., Ltd. All rights +# reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY,OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY +# OF SUCH DAMAGE. 
+ +# DeepSpeed Team + +from .builder import SDAAOpBuilder + + +class NotImplementedBuilder(SDAAOpBuilder): + BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED" + NAME = "deepspeed_not_implemented" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.comm.{self.NAME}_op' + + def load(self, verbose=True): + raise ValueError("This op had not been implemented on SDAA backend.") + + def sources(self): + return [] + + def cxx_args(self): + return [] + + def extra_ldflags(self): + return [] + + def include_paths(self): + return [] diff --git a/op_builder/sparse_attn.py b/op_builder/sparse_attn.py index 188d257ff4ef..b8e5e8a6fd9a 100644 --- a/op_builder/sparse_attn.py +++ b/op_builder/sparse_attn.py @@ -27,45 +27,50 @@ def sources(self): def cxx_args(self): return ['-O2', '-fopenmp'] - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): # Check to see if llvm and cmake are installed since they are dependencies #required_commands = ['llvm-config|llvm-config-9', 'cmake'] #command_status = list(map(self.command_exists, required_commands)) #deps_compatible = all(command_status) if self.is_rocm_pytorch(): - self.warning(f'{self.NAME} is not compatible with ROCM') + if verbose: + self.warning(f'{self.NAME} is not compatible with ROCM') return False try: import torch except ImportError: - self.warning(f"unable to import torch, please install it first") + if verbose: + self.warning(f"unable to import torch, please install it first") return False # torch-cpu will not have a cuda version if torch.version.cuda is None: cuda_compatible = False - self.warning(f"{self.NAME} cuda is not available from torch") + if verbose: + self.warning(f"{self.NAME} cuda is not available from torch") else: major, minor = torch.version.cuda.split('.')[:2] cuda_compatible = (int(major) == 10 and int(minor) >= 1) or (int(major) >= 11) if not cuda_compatible: - self.warning(f"{self.NAME} requires CUDA version 10.1+") + if verbose: + self.warning(f"{self.NAME} requires CUDA version 10.1+") TORCH_MAJOR = int(torch.__version__.split('.')[0]) TORCH_MINOR = int(torch.__version__.split('.')[1]) torch_compatible = (TORCH_MAJOR == 1 and TORCH_MINOR >= 5) if not torch_compatible: - self.warning( - f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}') - + if verbose: + self.warning( + f'{self.NAME} requires a torch version >= 1.5 and < 2.0 but detected {TORCH_MAJOR}.{TORCH_MINOR}') try: import triton except ImportError: # auto-install of triton is broken on some systems, reverting to manual install for now - # see this issue: https://github.com/microsoft/DeepSpeed/issues/1710 - self.warning(f"please install triton==1.0.0 if you want to use sparse attention") + # see this issue: https://github.com/deepspeedai/DeepSpeed/issues/1710 + if verbose: + self.warning(f"please install triton==1.0.0 if you want to use sparse attention") return False if pkg_version: @@ -76,7 +81,9 @@ def is_compatible(self, verbose=True): triton_mismatch = installed_triton != "1.0.0" if triton_mismatch: - self.warning(f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible") + if verbose: + self.warning( + f"using untested triton version ({installed_triton}), only 1.0.0 is known to be compatible") return False return super().is_compatible(verbose) and torch_compatible and cuda_compatible diff --git a/op_builder/spatial_inference.py b/op_builder/spatial_inference.py 
index 59caf57f938d..d6c5fa661156 100644 --- a/op_builder/spatial_inference.py +++ b/op_builder/spatial_inference.py @@ -17,11 +17,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.ops.spatial.{self.NAME}_op' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -31,7 +32,8 @@ def is_compatible(self, verbose=True): cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay diff --git a/op_builder/transformer_inference.py b/op_builder/transformer_inference.py index 5ee902289448..642aed56a192 100755 --- a/op_builder/transformer_inference.py +++ b/op_builder/transformer_inference.py @@ -17,11 +17,12 @@ def __init__(self, name=None): def absolute_name(self): return f'deepspeed.ops.transformer.inference.{self.NAME}_op' - def is_compatible(self, verbose=True): + def is_compatible(self, verbose=False): try: import torch except ImportError: - self.warning("Please install torch if trying to pre-compile inference kernels") + if verbose: + self.warning("Please install torch if trying to pre-compile inference kernels") return False cuda_okay = True @@ -30,18 +31,20 @@ def is_compatible(self, verbose=True): torch_cuda_major = int(torch.version.cuda.split('.')[0]) cuda_capability = torch.cuda.get_device_properties(0).major if cuda_capability < 6: - self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") + if verbose: + self.warning("NVIDIA Inference is only supported on Pascal and newer architectures") cuda_okay = False if cuda_capability >= 8: if torch_cuda_major < 11 or sys_cuda_major < 11: - self.warning("On Ampere and higher architectures please use CUDA 11+") + if verbose: + self.warning("On Ampere and higher architectures please use CUDA 11+") cuda_okay = False return super().is_compatible(verbose) and cuda_okay def filter_ccs(self, ccs): ccs_retained = [] ccs_pruned = [] - for cc in ccs: + for cc in [cc.split('.') for cc in ccs]: if int(cc[0]) >= 6: ccs_retained.append(cc) else: diff --git a/op_builder/xpu/__init__.py b/op_builder/xpu/__init__.py new file mode 100755 index 000000000000..093f90b30234 --- /dev/null +++ b/op_builder/xpu/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .cpu_adam import CPUAdamBuilder +from .cpu_adagrad import CPUAdagradBuilder +from .fused_adam import FusedAdamBuilder +from .async_io import AsyncIOBuilder +from .inference import InferenceBuilder +from .flash_attn import FlashAttentionBuilder +from .no_impl import NotImplementedBuilder +from .packbits import PackbitsBuilder diff --git a/op_builder/xpu/async_io.py b/op_builder/xpu/async_io.py new file mode 100644 index 000000000000..2da963ae64aa --- /dev/null +++ b/op_builder/xpu/async_io.py @@ -0,0 +1,106 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import distutils.spawn +import subprocess + +from .builder import OpBuilder + + +class AsyncIOBuilder(OpBuilder): + BUILD_VAR = "DS_BUILD_AIO" + NAME = "async_io" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.aio.{self.NAME}_op' + + def sources(self): + return [ + 'csrc/aio/py_lib/deepspeed_py_copy.cpp', + 'csrc/aio/py_lib/py_ds_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio.cpp', + 'csrc/aio/py_lib/deepspeed_py_aio_handle.cpp', + 'csrc/aio/py_lib/deepspeed_aio_thread.cpp', + 'csrc/aio/common/deepspeed_aio_utils.cpp', + 'csrc/aio/common/deepspeed_aio_common.cpp', + 'csrc/aio/common/deepspeed_aio_types.cpp', + 'csrc/aio/py_lib/deepspeed_pin_tensor.cpp', + 'csrc/aio/py_lib/deepspeed_py_io_handle.cpp', + 'csrc/aio/py_lib/deepspeed_cpu_op.cpp', + 'csrc/aio/py_lib/deepspeed_aio_op_desc.cpp', + ] + + def include_paths(self): + return ['csrc/aio/py_lib', 'csrc/aio/common'] + + def cxx_args(self): + import torch + # -O0 for improved debugging, since performance is bound by I/O + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + TORCH_MAJOR, TORCH_MINOR = map(int, torch.__version__.split('.')[0:2]) + if TORCH_MAJOR >= 2 and TORCH_MINOR >= 1: + CPP_STD = '-std=c++17' + else: + CPP_STD = '-std=c++14' + return [ + '-g', + '-Wall', + '-O0', + CPP_STD, + '-shared', + '-fPIC', + '-Wno-reorder', + CPU_ARCH, + '-fopenmp', + SIMD_WIDTH, + '-laio', + ] + + def extra_ldflags(self): + return ['-laio'] + + def check_for_libaio_pkg(self): + libs = dict( + dpkg=["-l", "libaio-dev", "apt"], + pacman=["-Q", "libaio", "pacman"], + rpm=["-q", "libaio-devel", "yum"], + ) + + found = False + for pkgmgr, data in libs.items(): + flag, lib, tool = data + path = distutils.spawn.find_executable(pkgmgr) + if path is not None: + cmd = [pkgmgr, flag, lib] + result = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if result.wait() == 0: + found = True + else: + self.warning(f"{self.NAME}: please install the {lib} package with {tool}") + break + return found + + def is_compatible(self, verbose=False): + # Check for the existence of libaio by using distutils + # to compile and link a test program that calls io_submit, + # which is a function provided by libaio that is used in the async_io op. + # If needed, one can define -I and -L entries in CFLAGS and LDFLAGS + # respectively to specify the directories for libaio.h and libaio.so. + aio_compatible = self.has_function('io_pgetevents', ('aio', )) + if verbose and not aio_compatible: + self.warning(f"{self.NAME} requires the dev libaio .so object and headers but these were not found.") + + # Check for the libaio package via known package managers + # to print suggestions on which package to install. + self.check_for_libaio_pkg() + + self.warning( + "If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found." + ) + return super().is_compatible(verbose) and aio_compatible diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py new file mode 100644 index 000000000000..81b15f197f43 --- /dev/null +++ b/op_builder/xpu/builder.py @@ -0,0 +1,131 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import time +import importlib + +try: + # is op_builder from deepspeed or a 3p version? 
this should only succeed if it's deepspeed + # if successful this also means we're doing a local install and not JIT compile path + from op_builder import __deepspeed__ # noqa: F401 # type: ignore + from op_builder.builder import OpBuilder +except ImportError: + from deepspeed.ops.op_builder.builder import OpBuilder + + +class SYCLOpBuilder(OpBuilder): + + def builder(self): + try: + from intel_extension_for_pytorch.xpu.cpp_extension import DPCPPExtension + except ImportError: + from intel_extension_for_pytorch.xpu.utils import DPCPPExtension + include_dirs = [os.path.abspath(x) for x in self.strip_empty_entries(self.include_paths())] + print("dpcpp sources = {}".format(self.sources())) + dpcpp_ext = DPCPPExtension(name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=include_dirs, + extra_compile_args={ + 'cxx': self.strip_empty_entries(self.cxx_args()), + }, + extra_link_args=self.strip_empty_entries(self.fixed_aotflags())) + return dpcpp_ext + + def version_dependent_macros(self): + try: + from op_builder.builder import TORCH_MAJOR, TORCH_MINOR + except ImportError: + from deepspeed.ops.op_builder.builder import TORCH_MAJOR, TORCH_MINOR + # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456 + version_ge_1_1 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0): + version_ge_1_1 = ['-DVERSION_GE_1_1'] + version_ge_1_3 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2): + version_ge_1_3 = ['-DVERSION_GE_1_3'] + version_ge_1_5 = [] + if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): + version_ge_1_5 = ['-DVERSION_GE_1_5'] + return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + def cxx_args(self): + cxx_flags = [ + '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64', + '-fno-strict-aliasing' + ] + if os.environ.get('USE_MKL_GEMM'): + cxx_flags.append('-DUSE_MKL_GEMM') + return cxx_flags + + def extra_ldflags(self): + return [ + '-fPIC', '-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', + '-Xs "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode"', + '-Xs "-device pvc"', '-Wl,-export-dynamic' + ] + + def fixed_aotflags(self): + return [ + '-fsycl', '-fsycl-targets=spir64_gen', '-fsycl-max-parallel-link-jobs=8', '-Xs', + "-options -cl-poison-unsupported-fp64-kernels,cl-intel-enable-auto-large-GRF-mode", '-Xs', "-device pvc" + ] + + def load(self, verbose=True): + from deepspeed.git_version_info import installed_ops, torch_info, accelerator_name # noqa: F401 + from deepspeed.accelerator import get_accelerator + if installed_ops.get(self.name, False) and accelerator_name == get_accelerator()._name: + return importlib.import_module(self.absolute_name()) + else: + return self.jit_load(verbose) + + def jit_load(self, verbose=True): + if not self.is_compatible(verbose): + raise RuntimeError( + f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue. 
{self.error_log}" + ) + try: + import ninja # noqa: F401 + except ImportError: + raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + self.jit_mode = True + from intel_extension_for_pytorch.xpu.cpp_extension import load + + start_build = time.time() + # Recognize relative paths as absolute paths for jit load + + sources = [self.deepspeed_src_path(path) for path in self.sources()] + extra_include_paths = [self.deepspeed_src_path(path) for path in self.include_paths()] + + # Torch will try and apply whatever CCs are in the arch list at compile time, + # we have already set the intended targets ourselves we know that will be + # needed at runtime. This prevents CC collisions such as multiple __half + # implementations. Stash arch list to reset after build. + ''' + torch_arch_list = None + if "TORCH_CUDA_ARCH_LIST" in os.environ: + torch_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST") + os.environ["TORCH_CUDA_ARCH_LIST"] = "" + ''' + + op_module = load( + name=self.name, + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=self.strip_empty_entries(self.cxx_args()), + # extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), + verbose=verbose) + + build_duration = time.time() - start_build + if verbose: + print(f"Time to load {self.name} op: {build_duration} seconds") + ''' + # Reset arch list so we are not silently removing it for other possible use cases + if torch_arch_list: + os.environ["TORCH_CUDA_ARCH_LIST"] = torch_arch_list + ''' + return op_module diff --git a/op_builder/xpu/cpu_adagrad.py b/op_builder/xpu/cpu_adagrad.py new file mode 100644 index 000000000000..18f80848e1b8 --- /dev/null +++ b/op_builder/xpu/cpu_adagrad.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import SYCLOpBuilder + + +class CPUAdagradBuilder(SYCLOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAGRAD" + NAME = "cpu_adagrad" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adagrad.{self.NAME}_op' + + def sources(self): + return ['csrc/xpu/adagrad/cpu_adagrad.cpp', 'csrc/xpu/common/custom_cuda_kernel.dp.cpp'] + + def include_paths(self): + return ['csrc/xpu/includes'] diff --git a/op_builder/xpu/cpu_adam.py b/op_builder/xpu/cpu_adam.py new file mode 100644 index 000000000000..9d5fdcd3e8ab --- /dev/null +++ b/op_builder/xpu/cpu_adam.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from .builder import SYCLOpBuilder + + +class CPUAdamBuilder(SYCLOpBuilder): + BUILD_VAR = "DS_BUILD_CPU_ADAM" + NAME = "cpu_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/cpu_adam_impl.cpp'] + + def libraries_args(self): + args = super().libraries_args() + return args + + def include_paths(self): + return ['csrc/includes'] diff --git a/op_builder/xpu/flash_attn.py b/op_builder/xpu/flash_attn.py new file mode 100644 index 000000000000..c8c2674d5d27 --- /dev/null +++ b/op_builder/xpu/flash_attn.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +from .builder import SYCLOpBuilder + + +class FlashAttentionBuilderObject(): + + def __init__(self): + pass + + # general functions + def flash_attn_func_v2(self, q, k, v, dropout_p, softmax_scale, is_causal): + try: + import torch + import intel_extension_for_pytorch # noqa + return torch.nn.functional.scaled_dot_product_attention(q, + k, + v, + dropout_p=dropout_p, + is_causal=is_causal, + scale=softmax_scale) + except ImportError: + raise ImportError( + "Please install pytorch and intel_extension_for_pytorch to include scaled dot product attention.") + + +class FlashAttentionBuilder(SYCLOpBuilder): + BUILD_VAR = "DS_BUILD_FlashAttention" + NAME = "flash_attn" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.{self.NAME}_op' + + def sources(self): + return + + def include_paths(self): + return [] + + def extra_ldflags(self): + return [] + + def cxx_args(self): + return [] + + def load(self): + return FlashAttentionBuilderObject() diff --git a/op_builder/xpu/fused_adam.py b/op_builder/xpu/fused_adam.py new file mode 100644 index 000000000000..0e0f1a66f8e6 --- /dev/null +++ b/op_builder/xpu/fused_adam.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +from .builder import SYCLOpBuilder + + +class FusedAdamBuilder(SYCLOpBuilder): + BUILD_VAR = "DS_BUILD_FUSED_ADAM" + NAME = "fused_adam" + + def __init__(self): + super().__init__(name=self.NAME) + + def absolute_name(self): + return f'deepspeed.ops.adam.{self.NAME}_op' + + def sources(self): + return ['csrc/xpu/adam/fused_adam_frontend.cpp', 'csrc/xpu/adam/multi_tensor_adam.dp.cpp'] + + def include_paths(self): + return ['csrc/xpu/includes', 'csrc/xpu/adam'] + + def cxx_args(self): + args = super().cxx_args() + return args + self.version_dependent_macros() diff --git a/op_builder/xpu/inference.py b/op_builder/xpu/inference.py new file mode 100644 index 000000000000..a9ac4f84c2ca --- /dev/null +++ b/op_builder/xpu/inference.py @@ -0,0 +1,39 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team +from .builder import SYCLOpBuilder + + +class InferenceBuilder(SYCLOpBuilder): + BUILD_VAR = "DS_BUILD_TRANSFORMER_INFERENCE" + NAME = "transformer_inference" + + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) + + def absolute_name(self): + return f'deepspeed.ops.transformer.inference.{self.NAME}_op' + + def sources(self): + return + + def include_paths(self): + return [] + + def extra_ldflags(self): + return [] + + def cxx_args(self): + return [] + + def load(self): + try: + import intel_extension_for_pytorch + if hasattr(intel_extension_for_pytorch, "deepspeed"): + return intel_extension_for_pytorch.deepspeed.transformer_inference.transformer_inference + else: + return intel_extension_for_pytorch.xpu.deepspeed + except ImportError: + raise ImportError("Please install intel-extension-for-pytorch >= 2.1.30 to include DeepSpeed kernels.") diff --git a/op_builder/xpu/no_impl.py b/op_builder/xpu/no_impl.py new file mode 100644 index 000000000000..8b294f70c279 --- /dev/null +++ b/op_builder/xpu/no_impl.py @@ -0,0 +1,33 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from .builder import SYCLOpBuilder
+
+
+class NotImplementedBuilder(SYCLOpBuilder):
+    BUILD_VAR = "DS_BUILD_NOT_IMPLEMENTED"
+    NAME = "deepspeed_not_implemented"
+
+    def __init__(self, name=None):
+        name = self.NAME if name is None else name
+        super().__init__(name=name)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.{self.NAME}_op'
+
+    def load(self, verbose=True):
+        raise ValueError("This op has not been implemented on the XPU backend.")
+
+    def sources(self):
+        return []
+
+    def cxx_args(self):
+        return []
+
+    def extra_ldflags(self):
+        return []
+
+    def include_paths(self):
+        return []
diff --git a/op_builder/xpu/packbits.py b/op_builder/xpu/packbits.py
new file mode 100644
index 000000000000..cf5b5ebc59e4
--- /dev/null
+++ b/op_builder/xpu/packbits.py
@@ -0,0 +1,26 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+from .builder import SYCLOpBuilder
+
+
+class PackbitsBuilder(SYCLOpBuilder):
+    BUILD_VAR = "DS_BUILD_PACK_BITS"
+    NAME = "pack_bits"
+
+    def __init__(self):
+        super().__init__(name=self.NAME)
+
+    def absolute_name(self):
+        return f'deepspeed.ops.{self.NAME}_op'
+
+    def sources(self):
+        return ['csrc/xpu/packbits/packing.cpp']
+
+    def include_paths(self):
+        return ['csrc/xpu/includes']
+
+    def cxx_args(self):
+        args = super().cxx_args()
+        return args + self.version_dependent_macros()
diff --git a/release/release.sh b/release/release.sh
index a83fafcb9b1f..cc3ee2feae62 100644
--- a/release/release.sh
+++ b/release/release.sh
@@ -38,7 +38,7 @@ if [ $? != 0 ]; then
     exit 1
 fi
 
-DS_BUILD_STRING="" python setup.py sdist
+DS_BUILD_STRING="" python -m build --sdist
 
 if [ ! -f dist/deepspeed-${version}.tar.gz ]; then
     echo "prepared version does not match version given ($version), bump version first?"
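All of the SYCL builders above funnel through `SYCLOpBuilder.load()`, which JIT-compiles the files returned by `sources()`/`include_paths()` with `intel_extension_for_pytorch.xpu.cpp_extension.load` on first use. As a rough usage sketch, not taken from the patch and assuming the usual DeepSpeed convention that `deepspeed.ops.op_builder` re-exports the builder variants for the active accelerator:

```python
# Sketch only: consuming an op builder such as CPUAdamBuilder (or PackbitsBuilder).
# On an XPU setup the import is assumed to resolve to the SYCL variants added above;
# load() imports a pre-built extension if present, otherwise JIT-compiles the csrc sources.
from deepspeed.ops.op_builder import CPUAdamBuilder

builder = CPUAdamBuilder()
if builder.is_compatible():           # mirrors the per-op check that setup.py performs
    cpu_adam_module = builder.load()  # cached after the first JIT build
```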
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt index 105dd094f995..71ad3f0b262b 100644 --- a/requirements/requirements-dev.txt +++ b/requirements/requirements-dev.txt @@ -1,20 +1,21 @@ accelerate -clang-format==16.0.2 -coverage -deepspeed-kernels +clang-format==18.1.3 +comet_ml>=3.41.0 +deepspeed-kernels ; sys_platform == 'linux' docutils<0.18 future importlib-metadata>=4 mup -pre-commit>=2.20.0 -pytest +pre-commit>=3.2.0 +pytest>=7.2.0 pytest-forked pytest-randomly pytest-xdist +qtorch==0.3.0 recommonmark sphinx sphinx-rtd-theme tensorboard torchvision -transformers +transformers>=4.39.0 wandb diff --git a/requirements/requirements-inf.txt b/requirements/requirements-inf.txt index 848a7f7a485d..b7fd13787e8b 100644 --- a/requirements/requirements-inf.txt +++ b/requirements/requirements-inf.txt @@ -1,5 +1,7 @@ google lm-eval==0.3.0 protobuf -transformers -transformers[sentencepiece] +qtorch +safetensors +sentencepiece +transformers>=4.32.1 diff --git a/requirements/requirements-readthedocs.txt b/requirements/requirements-readthedocs.txt index 1a2ad18611e7..a48a47e4428d 100644 --- a/requirements/requirements-readthedocs.txt +++ b/requirements/requirements-readthedocs.txt @@ -1,10 +1,10 @@ -autodoc_pydantic +autodoc_pydantic>=2.0.0 docutils<0.18 hjson packaging psutil py-cpuinfo -pydantic<2.0.0 +pydantic>=2.0.0 recommonmark sphinx_rtd_theme torch diff --git a/requirements/requirements-sd.txt b/requirements/requirements-sd.txt index cb679ae3771d..0b2ce8c2b56f 100644 --- a/requirements/requirements-sd.txt +++ b/requirements/requirements-sd.txt @@ -1,2 +1,2 @@ -diffusers +diffusers>=0.25.0 triton>=2.1.0 diff --git a/requirements/requirements-triton.txt b/requirements/requirements-triton.txt index f43a7e19e242..3b382f83f2ae 100644 --- a/requirements/requirements-triton.txt +++ b/requirements/requirements-triton.txt @@ -1 +1 @@ -triton>=2.1.0 +triton==2.1.0 diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 80c9f9b3287a..1af4c69c5807 100755 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,10 +1,11 @@ +einops hjson +msgpack ninja numpy packaging>=20.0 psutil py-cpuinfo -pydantic -pynvml +pydantic>=2.0.0 torch tqdm diff --git a/scripts/check-extraindexurl.py b/scripts/check-extraindexurl.py new file mode 100755 index 000000000000..01b506dc939d --- /dev/null +++ b/scripts/check-extraindexurl.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from __future__ import annotations +'''Copyright The Microsoft DeepSpeed Team''' +""" +Checks each file in sys.argv for the string "--extra-index-url". +Modified from https://github.com/jlebar/pre-commit-hooks/blob/master/check_do_not_submit.py +""" + +import subprocess +import sys + + +def err(s: str) -> None: + print(s, file=sys.stderr) + + +print(*sys.argv[1:]) + +# There are many ways we could search for the string "--extra-index-url", but `git +# grep --no-index` is nice because +# - it's very fast (as compared to iterating over the file in Python) +# - we can reasonably assume it's available on all machines +# - unlike plain grep, which is slower and has different flags on MacOS versus +# Linux, git grep is always the same. 
+res = subprocess.run( + ["git", "grep", "-Hn", "--no-index", "-e", r"--extra-index-url", *sys.argv[1:]], + capture_output=True, +) +if res.returncode == 0: + err('Error: The string "--extra-index-url" was found.\nPlease replace all calls to --extra-index-url with "--index-url"' + ) + err(res.stdout.decode("utf-8")) + sys.exit(1) +elif res.returncode == 2: + err(f"Error invoking grep on {', '.join(sys.argv[1:])}:") + err(res.stderr.decode("utf-8")) + sys.exit(2) diff --git a/setup.py b/setup.py index d9aed9b47bd8..0ad54bb99403 100755 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ """ DeepSpeed library -To build wheel on Windows: -1. Install pytorch, such as pytorch 1.12 + cuda 11.6. +To build wheels on Windows: +1. Install pytorch, such as pytorch 2.3 + cuda 12.1. 2. Install visual cpp build tool. 3. Include cuda toolkit. 4. Launch cmd console with Administrator privilege for creating required symlink folders. @@ -18,13 +18,16 @@ The wheel will be located at: dist/*.whl """ +import pathlib import os +import shutil import sys import subprocess from setuptools import setup, find_packages from setuptools.command import egg_info import time import typing +import shlex torch_available = True try: @@ -35,9 +38,11 @@ 'Please visit https://pytorch.org/ to see how to properly install torch on your system.') from op_builder import get_default_compute_capabilities, OpBuilder -from op_builder.all_ops import ALL_OPS +from op_builder.all_ops import ALL_OPS, accelerator_name from op_builder.builder import installed_cuda_version +from accelerator import get_accelerator + # Fetch rocm state. is_rocm_pytorch = OpBuilder.is_rocm_pytorch() rocm_version = OpBuilder.installed_rocm_version() @@ -88,13 +93,17 @@ def get_env_if_set(key, default: typing.Any = ""): 'triton': fetch_requirements('requirements/requirements-triton.txt'), } +# Only install pynvml on nvidia gpus. +if torch_available and get_accelerator().device_name() == 'cuda' and not is_rocm_pytorch: + install_requires.append('nvidia-ml-py') + # Add specific cupy version to both onebit extension variants. -if torch_available and torch.cuda.is_available(): +if torch_available and get_accelerator().device_name() == 'cuda': cupy = None if is_rocm_pytorch: rocm_major, rocm_minor = rocm_version - # XXX cupy support for rocm 5 is not available yet. - if rocm_major <= 4: + # cupy support for rocm>5.0 is not available yet. + if (rocm_major == 5 and rocm_minor == 0) or rocm_major <= 4: cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}" else: cuda_major_ver, cuda_minor_ver = installed_cuda_version() @@ -118,8 +127,8 @@ def get_env_if_set(key, default: typing.Any = ""): # For any pre-installed ops force disable ninja. if torch_available: - from accelerator import get_accelerator - cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False) + use_ninja = is_env_set("DS_ENABLE_NINJA") + cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=use_ninja) if torch_available: TORCH_MAJOR = torch.__version__.split('.')[0] @@ -128,7 +137,7 @@ def get_env_if_set(key, default: typing.Any = ""): TORCH_MAJOR = "0" TORCH_MINOR = "0" -if torch_available and not torch.cuda.is_available(): +if torch_available and not get_accelerator().device_name() == 'cuda': # Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486. print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only " "you can ignore this message. 
Adding compute capability for Pascal, Volta, and Turing " @@ -149,10 +158,12 @@ def get_env_if_set(key, default: typing.Any = ""): def command_exists(cmd): if sys.platform == "win32": - result = subprocess.Popen(f'{cmd}', stdout=subprocess.PIPE, shell=True) + safe_cmd = shlex.split(f'{cmd}') + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) return result.wait() == 1 else: - result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) + safe_cmd = shlex.split(f"bash -c type {cmd}") + result = subprocess.Popen(safe_cmd, stdout=subprocess.PIPE) return result.wait() == 0 @@ -167,19 +178,16 @@ def op_enabled(op_name): return int(get_env_if_set(env_var, BUILD_OP_DEFAULT)) -compatible_ops = dict.fromkeys(ALL_OPS.keys(), False) install_ops = dict.fromkeys(ALL_OPS.keys(), False) for op_name, builder in ALL_OPS.items(): op_compatible = builder.is_compatible() - compatible_ops[op_name] = op_compatible - compatible_ops["deepspeed_not_implemented"] = False # If op is requested but not available, throw an error. if op_enabled(op_name) and not op_compatible: env_var = op_envvar(op_name) if not is_env_set(env_var): - builder.warning(f"One can disable {op_name} with {env_var}=0") - abort(f"Unable to pre-compile {op_name}") + builder.warning(f"Skip pre-compile of incompatible {op_name}; One can disable {op_name} with {env_var}=0") + continue # If op is compatible but install is not enabled (JIT mode). if is_rocm_pytorch and op_compatible and not op_enabled(op_name): @@ -194,13 +202,13 @@ def op_enabled(op_name): print(f'Install Ops={install_ops}') # Write out version/git info. -git_hash_cmd = "git rev-parse --short HEAD" -git_branch_cmd = "git rev-parse --abbrev-ref HEAD" +git_hash_cmd = shlex.split("bash -c \"git rev-parse --short HEAD\"") +git_branch_cmd = shlex.split("bash -c \"git rev-parse --abbrev-ref HEAD\"") if command_exists('git') and not is_env_set('DS_BUILD_STRING'): try: - result = subprocess.check_output(git_hash_cmd, shell=True) + result = subprocess.check_output(git_hash_cmd) git_hash = result.decode('utf-8').strip() - result = subprocess.check_output(git_branch_cmd, shell=True) + result = subprocess.check_output(git_branch_cmd) git_branch = result.decode('utf-8').strip() except subprocess.CalledProcessError: git_hash = "unknown" @@ -209,28 +217,23 @@ def op_enabled(op_name): git_hash = "unknown" git_branch = "unknown" - -def create_dir_symlink(src, dest): - if not os.path.islink(dest): - if os.path.exists(dest): - os.remove(dest) - assert not os.path.exists(dest) - os.symlink(src, dest) - - if sys.platform == "win32": - # This creates a symbolic links on Windows. - # It needs Administrator privilege to create symlinks on Windows. 
- create_dir_symlink('..\\..\\csrc', '.\\deepspeed\\ops\\csrc') - create_dir_symlink('..\\..\\op_builder', '.\\deepspeed\\ops\\op_builder') - create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator') + shutil.rmtree('.\\deepspeed\\ops\\csrc', ignore_errors=True) + pathlib.Path('.\\deepspeed\\ops\\csrc').unlink(missing_ok=True) + shutil.copytree('.\\csrc', '.\\deepspeed\\ops\\csrc', dirs_exist_ok=True) + shutil.rmtree('.\\deepspeed\\ops\\op_builder', ignore_errors=True) + pathlib.Path('.\\deepspeed\\ops\\op_builder').unlink(missing_ok=True) + shutil.copytree('.\\op_builder', '.\\deepspeed\\ops\\op_builder', dirs_exist_ok=True) + shutil.rmtree('.\\deepspeed\\accelerator', ignore_errors=True) + pathlib.Path('.\\deepspeed\\accelerator').unlink(missing_ok=True) + shutil.copytree('.\\accelerator', '.\\deepspeed\\accelerator', dirs_exist_ok=True) egg_info.manifest_maker.template = 'MANIFEST_win.in' # Parse the DeepSpeed version string from version.txt. version_str = open('version.txt', 'r').read().strip() # Build specifiers like .devX can be added at install time. Otherwise, add the git hash. -# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel. +# Example: `DS_BUILD_STRING=".dev20201022" python -m build --no-isolation`. # Building wheel for distribution, update version file. if is_env_set('DS_BUILD_STRING'): @@ -279,11 +282,10 @@ def create_dir_symlink(src, dest): fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_branch='{git_branch}'\n") fd.write(f"installed_ops={install_ops}\n") - fd.write(f"compatible_ops={compatible_ops}\n") + fd.write(f"accelerator_name='{accelerator_name}'\n") fd.write(f"torch_info={torch_info}\n") print(f'install_requires={install_requires}') -print(f'compatible_ops={compatible_ops}') print(f'ext_modules={ext_modules}') # Parse README.md to make long_description for PyPI page. 
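The subprocess changes above (and the matching test-harness changes further down in this patch) all follow the same pattern: replace `shell=True` string execution with an argument list produced by `shlex.split`. A minimal sketch of that pattern, using the git-hash command from the hunk above:

```python
# shlex turns the command string into an argv list so subprocess can run it
# without shell=True.
import shlex
import subprocess

cmd = shlex.split('bash -c "git rev-parse --short HEAD"')
# -> ['bash', '-c', 'git rev-parse --short HEAD']
git_hash = subprocess.check_output(cmd).decode("utf-8").strip()
```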
@@ -291,6 +293,14 @@ def create_dir_symlink(src, dest): with open(os.path.join(thisdir, 'README.md'), encoding='utf-8') as fin: readme_text = fin.read() +if sys.platform == "win32": + scripts = ['bin/deepspeed.bat', 'bin/ds', 'bin/ds_report.bat', 'bin/ds_report'] +else: + scripts = [ + 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr', + 'bin/ds_elastic', 'bin/ds_nvme_tune', 'bin/ds_io' + ] + start_time = time.time() setup(name='deepspeed', @@ -303,20 +313,17 @@ def create_dir_symlink(src, dest): url='http://deepspeed.ai', project_urls={ 'Documentation': 'https://deepspeed.readthedocs.io', - 'Source': 'https://github.com/microsoft/DeepSpeed', + 'Source': 'https://github.com/deepspeedai/DeepSpeed', }, install_requires=install_requires, extras_require=extras_require, packages=find_packages(include=['deepspeed', 'deepspeed.*']), include_package_data=True, - scripts=[ - 'bin/deepspeed', 'bin/deepspeed.pt', 'bin/ds', 'bin/ds_ssh', 'bin/ds_report', 'bin/ds_bench', 'bin/dsr', - 'bin/ds_elastic' - ], + scripts=scripts, classifiers=[ - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10' + 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12' ], license='Apache Software License 2.0', ext_modules=ext_modules, diff --git a/tests/model/BingBertSquad/BingBertSquad_test_common.py b/tests/model/BingBertSquad/BingBertSquad_test_common.py index ef42f85cc945..b47ddfe0c649 100755 --- a/tests/model/BingBertSquad/BingBertSquad_test_common.py +++ b/tests/model/BingBertSquad/BingBertSquad_test_common.py @@ -7,6 +7,7 @@ import subprocess import os import time +import shlex class BaseTestCase(unittest.TestCase): @@ -40,9 +41,9 @@ def ensure_directory_exists(self, filename): os.makedirs(dirname) def clean_test_env(self): - cmd = "dlts_ssh pkill -9 -f /usr/bin/python" + cmd = shlex.split("dlts_ssh pkill -9 -f /usr/bin/python") print(cmd) - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + subprocess.run(cmd, check=False, executable='/bin/bash') time.sleep(20) def run_BingBertSquad_test(self, test_config, output): @@ -50,8 +51,8 @@ def run_BingBertSquad_test(self, test_config, output): other_args = " " + test_config["other_args"] if "other_args" in test_config else " " cmd = "./run_BingBertSquad_sanity.sh -e 1 -g {0} {1} {2}".format(test_config["gpus"], other_args, ds_flag) - + cmd = shlex.split(cmd) self.ensure_directory_exists(output) with open(output, "w") as f: print(cmd) - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) + subprocess.run(cmd, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/model/BingBertSquad/run_BingBertSquad.sh b/tests/model/BingBertSquad/run_BingBertSquad.sh index fcfdf5e66361..4d06bb1230a4 100755 --- a/tests/model/BingBertSquad/run_BingBertSquad.sh +++ b/tests/model/BingBertSquad/run_BingBertSquad.sh @@ -93,7 +93,7 @@ done # Validate path to BingBertSquad script if [ -z "${BingBertSquad_DIR+x}" ]; then - export BingBertSquad_DIR=../../../../DeepSpeedExamples/BingBertSquad + export BingBertSquad_DIR=../../../../DeepSpeedExamples/training/BingBertSquad echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}" fi validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR" @@ -160,8 +160,11 @@ 
run_cmd="deepspeed.pt \ --master_port ${master_port} ${BingBertSquad_script} ${other_args} ${squad_args}" -echo ${run_cmd} -eval ${run_cmd} +# Sanitize input before running eval() +safe_cmd=$(printf '%q' "$run_cmd") + +echo ${safe_cmd} +eval ${safe_cmd} set +x diff --git a/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh b/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh index 1b49a37b783f..8b6ad942ba59 100755 --- a/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh +++ b/tests/model/BingBertSquad/run_BingBertSquad_sanity.sh @@ -94,7 +94,7 @@ done # Validate path to BingBertSquad script if [ -z "${BingBertSquad_DIR+x}" ]; then - export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad + export BingBertSquad_DIR=../../../DeepSpeedExamples/training/BingBertSquad echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}" fi validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR" diff --git a/tests/model/BingBertSquad/run_tests.sh b/tests/model/BingBertSquad/run_tests.sh index eef93ef98796..2a69fdf01c79 100755 --- a/tests/model/BingBertSquad/run_tests.sh +++ b/tests/model/BingBertSquad/run_tests.sh @@ -31,7 +31,7 @@ validate_folder() { # Validate path to BingBertSquad script if [ -z "${BingBertSquad_DIR+x}" ]; then - export BingBertSquad_DIR=../../../DeepSpeedExamples/BingBertSquad + export BingBertSquad_DIR=../../../DeepSpeedExamples/training/BingBertSquad echo "BingBertSquad_DIR environment variable not set; trying default: ${BingBertSquad_DIR}" fi validate_folder ${BingBertSquad_DIR} "BingBertSquad_DIR" diff --git a/tests/model/BingBertSquad/test_e2e_squad.py b/tests/model/BingBertSquad/test_e2e_squad.py index 9312dc67a193..9f03b89d0829 100644 --- a/tests/model/BingBertSquad/test_e2e_squad.py +++ b/tests/model/BingBertSquad/test_e2e_squad.py @@ -10,11 +10,11 @@ import pytest import json -sys.path.append("../../../DeepSpeedExamples/BingBertSquad") +sys.path.append("../../../DeepSpeedExamples/training/BingBertSquad") import evaluate as eval squad_dir = "/data/BingBertSquad" -base_dir = "../../../DeepSpeedExamples/BingBertSquad" +base_dir = "../../../DeepSpeedExamples/training/BingBertSquad" script_file_name = "run_squad_deepspeed.sh" model_file_name = "training_state_checkpoint_162.tar" diff --git a/tests/model/Megatron_GPT2/run_checkpoint_test.py b/tests/model/Megatron_GPT2/run_checkpoint_test.py index d97a28ff1ad5..824f8269a972 100755 --- a/tests/model/Megatron_GPT2/run_checkpoint_test.py +++ b/tests/model/Megatron_GPT2/run_checkpoint_test.py @@ -10,6 +10,7 @@ import subprocess import os import re +import shlex from .test_common import BaseTestCase LAYERS = 2 @@ -18,9 +19,9 @@ def remove_file(test_id, filename): - cmd = f"if [ -f {filename} ] ; then rm -v {filename}; fi" + cmd = shlex.split(f"if [ -f {filename} ] ; then rm -v {filename}; fi") print(f"{test_id} cmd: {cmd}") - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + subprocess.run(cmd, check=False, executable='/bin/bash') def grep_loss_from_file(file_name): @@ -451,9 +452,9 @@ def run_test(self, test_config, r_tol): checkpoint_name = test_config["checkpoint_name"] #---------------remove old checkpoint---------------# try: - cmd = f"rm -rf {checkpoint_name}" + cmd = shlex.split(f"rm -rf {checkpoint_name}") print(f"{self.id()} cmd: {cmd}") - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + subprocess.run(cmd, check=False, executable='/bin/bash') except: print("No old checkpoint") @@ -474,8 +475,8 @@ def run_test(self, test_config, 
r_tol): # remove previous test log try: - cmd = f"rm {base_file}" - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + cmd = shlex.split(f"rm {base_file}") + subprocess.run(cmd, check=False, executable='/bin/bash') except: print(f"{self.id()} No old logs") @@ -489,9 +490,9 @@ def run_test(self, test_config, r_tol): # set checkpoint load iteration try: - cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt" + cmd = shlex.split(f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt") print(f"{self.id()} running cmd: {cmd}") - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + subprocess.run(cmd, check=False, executable='/bin/bash') except: print(f"{self.id()} Failed to update the checkpoint iteration file") return False @@ -506,8 +507,8 @@ def run_test(self, test_config, r_tol): # remove previous test log try: - cmd = f"rm {test_file}" - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + cmd = shlex.split(f"rm {test_file}") + subprocess.run(cmd, check=False, executable='/bin/bash') except: print(f"{self.id()} no previous logs for") self.run_gpt2_test(test_config, test_file) diff --git a/tests/model/Megatron_GPT2/test_common.py b/tests/model/Megatron_GPT2/test_common.py index 1bcd891e31d5..4eb84ac7eeee 100755 --- a/tests/model/Megatron_GPT2/test_common.py +++ b/tests/model/Megatron_GPT2/test_common.py @@ -7,6 +7,7 @@ import subprocess import os import time +import shlex class BaseTestCase(unittest.TestCase): @@ -46,9 +47,9 @@ def ensure_directory_exists(self, filename): os.makedirs(dirname) def clean_test_env(self): - cmd = "dlts_ssh pkill -9 -f /usr/bin/python" + cmd = shlex.split("dlts_ssh pkill -9 -f /usr/bin/python") print(cmd) - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash') + subprocess.run(cmd, check=False, executable='/bin/bash') time.sleep(20) def run_gpt2_test(self, test_config, output): @@ -60,8 +61,8 @@ def run_gpt2_test(self, test_config, output): test_config["mp"], test_config["gpus"], test_config["nodes"], test_config["bs"], test_config["steps"], test_config["layers"], test_config["hidden_size"], test_config["seq_length"], test_config["heads"], ckpt_num, other_args, ds_flag) - + cmd = shlex.split(cmd) self.ensure_directory_exists(output) with open(output, "w") as f: print(cmd) - subprocess.run(cmd, shell=True, check=False, executable='/bin/bash', stdout=f, stderr=f) + subprocess.run(cmd, check=False, executable='/bin/bash', stdout=f, stderr=f) diff --git a/tests/onebit/README.md b/tests/onebit/README.md new file mode 100644 index 000000000000..d62c25421d00 --- /dev/null +++ b/tests/onebit/README.md @@ -0,0 +1,31 @@ +# One-Bit tests + +In this folder, you can test the functionality and performance of different backend for doing compressed allreduce, which is the main algorithm in one-bit optimizers like [One-Bit Adam](https://www.deepspeed.ai/tutorials/onebit-adam/), [One-Bit Lamb](https://www.deepspeed.ai/tutorials/onebit-lamb/) and [Zero-One Adam](https://www.deepspeed.ai/tutorials/zero-one-adam/). + +## How to run + +### NCCL and MPI backend + +Basically it requires your environment have relative communication backend installed, the NCCL backend of PyTorch distributed or Message Passing Interface (MPI) like MVAPICH2-GDR and OpenMPI. [Detailed Pre-requisites](https://www.deepspeed.ai/tutorials/zero-one-adam/#12-pre-requisites-for-01-adam). 
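As background for what these tests exercise, the sketch below (illustrative only, mirroring the `torch_sim` helper added in `test_compressed_backend.py` later in this patch) shows the sign-plus-scale compression with error feedback that compressed allreduce is built on:

```python
# Illustrative 1-bit compression step: keep only the sign of each element plus one
# scale per tensor, and carry the quantization error forward as local feedback.
import torch

a = torch.randn(1024)
scale = a.norm() / a.numel() ** 0.5   # single fp32 scale for the whole tensor
a_compressed = scale * a.sign()       # 1 bit of information per element + the scale
worker_error = a - a_compressed       # error feedback reused on the next iteration
```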
+ +To test accuracy and performance of NCCL backend: +```bash +python test_nccl_backend.py +python test_nccl_perf.py +``` +Similarly, for MPI backend: +```bash +python test_mpi_backend.py +python test_mpi_perf.py +``` + +### Compressed backend + +This backend provides an approach to abstract the generic part of one-bit optimizers and implements accelerator dependent part with DeepSpeed custom op builder. To use this `CompressedBackend` and test it, you should make sure that your current accelerator supports `PackbitsBuilder`, so that it could be loaded to do high performance packing and unpacking between float and Byte datatype. +An example can be found in `Deepspeed/op_builder/xpu/packbits.py`. + +The test usage is same as others: +```bash +python test_compressed_backend.py +python test_compressed_perf.py +``` diff --git a/tests/onebit/test_compressed_backend.py b/tests/onebit/test_compressed_backend.py new file mode 100644 index 000000000000..f6919a09a54b --- /dev/null +++ b/tests/onebit/test_compressed_backend.py @@ -0,0 +1,96 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import deepspeed.comm as dist +import numpy as np +import argparse +import deepspeed +import os + +from deepspeed.runtime.comm.compressed import CompressedBackend +from deepspeed.accelerator import get_accelerator + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() + +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) +args.local_rank = int(os.environ['LOCAL_RANK']) + +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) + +size = dist.get_world_size() +rank = dist.get_rank() + +backend = CompressedBackend() +local_rank = args.local_rank + + +# A simulated compression function using deepspeed.comm +def torch_sim(a): + a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + scale = a.norm() / np.sqrt(a.numel()) + a_compressed = scale * a_sign + a_sign = None + worker_error = a - a_compressed + dist.all_reduce(a_compressed) + a_compressed.mul_(1 / dist.get_world_size()) + a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) + a_list = torch.chunk(a_compressed, chunks=dist.get_world_size()) + server_scale = [chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list] + a_sign_list = torch.chunk(a_server_sign, dist.get_world_size()) + a_server_compressed = torch.cat([server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())]) + rank = dist.get_rank() + server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank] + get_accelerator().synchronize() + dist.barrier() + return a_server_compressed, worker_error, server_error + + +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +a_torch, worker_error_torch, server_error_torch = torch_sim(a) 
+get_accelerator().empty_cache() + +a_after = backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +print(a_torch.cpu()) +print(a_after.cpu()) + +threshold = 1e-6 +magnitude_threshold = 1e-6 +diff_mask = (a_after - a_torch) > threshold +diff_server_mask = torch.chunk(diff_mask, size)[rank] +mpi_server = torch.chunk(a_after, size)[rank] + server_error +torch_server = torch.chunk(a_torch, size)[rank] + server_error_torch + +test_correctness = True + +# If the number in the compensated_server_m is too small (e.g 1e-8), then calling sign() might be problematic +# The test would skip those numbers that are too small in compensated_server_m +if test_correctness: + if torch.sum(diff_server_mask) == 0: + print('Successfully passed the test for Compressed Backend at Rank {}'.format(rank)) + else: + check_mag_mask = mpi_server[diff_server_mask] > magnitude_threshold + if torch.sum(check_mag_mask) == 0: + print('Successfully passed the test for Compressed Backend at Rank {}'.format(rank)) + else: + print('Fails at {} of positions'.format(torch.sum(check_mag_mask))) diff --git a/tests/onebit/test_compressed_perf.py b/tests/onebit/test_compressed_perf.py new file mode 100644 index 000000000000..a686af0f6b8d --- /dev/null +++ b/tests/onebit/test_compressed_perf.py @@ -0,0 +1,97 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import deepspeed.comm as dist +import numpy as np +import argparse +import deepspeed +import os + +from deepspeed.runtime.comm.compressed import CompressedBackend +from deepspeed.utils.timer import SynchronizedWallClockTimer +from deepspeed.accelerator import get_accelerator +from statistics import mean + +timers = SynchronizedWallClockTimer() + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1) +args = parser.parse_args() + +deepspeed.init_distributed(dist_backend=get_accelerator().communication_backend_name()) +args.local_rank = int(os.environ['LOCAL_RANK']) + +get_accelerator().set_device(args.local_rank) +device = torch.device(get_accelerator().device_name(), args.local_rank) + +size = dist.get_world_size() +rank = dist.get_rank() + +backend = CompressedBackend() +local_rank = args.local_rank + +# Setting tensor_size (BERT-Large) +tensor_size = 300 * 2**20 +server_size = int(tensor_size / size) +if tensor_size % (8 * size) != 0: + right_tensor_size = tensor_size + (8 * size - (tensor_size % (8 * size))) +else: + right_tensor_size = tensor_size +right_server_size = right_tensor_size // size + +# Adding bias to the initialization of the gradient we are communicating +# In order to get rid of the case where some elements in the gradient are too small +a = (torch.rand(tensor_size, device=device) - 0.5) + 0.01 * rank + +worker_error = torch.zeros(right_tensor_size, device=device) +server_error = torch.zeros(right_server_size, device=device) + +warmup = 10 +iters = 10 + +# Warmup +for i in range(warmup): + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + +time_list = [] + +a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0) +scale = a.norm() / np.sqrt(a.numel()) +a_compressed = scale * a_sign + +print("Shape of the compressed buffer:", a_compressed.shape) if rank == 0 else None + +for i in range(iters): + timers('compressed_allreduce').start() + backend.compressed_allreduce(a, worker_error, server_error, local_rank) + #deepspeed.comm.all_reduce(a_compressed) + timers('compressed_allreduce').stop() + 
time_list.append(timers('compressed_allreduce').elapsed()) + +#timer_names = ['compressed_allreduce'] +#timers.log(names=timer_names, normalizer=1, memory_breakdown=None) + +places = 2 +convert = 1e3 +float_size = 4 + +if rank == 0: + for i in range(iters): + lat = time_list[i] + print("latency = ", lat * convert) + +minlat = round(min(time_list) * convert) +maxlat = round(max(time_list) * convert) +meanlat = round(mean(time_list) * convert, places) +print("min, max, and mean = {} ms, {} ms, {} ms".format(minlat, maxlat, meanlat)) if rank == 0 else None +#print("tensor shape", a.shape) +duration = meanlat / 1e3 +tput = ((tensor_size * 4) / duration) +print("algo throughput: %f Bytes/s, %f GB/s" % (tput, tput / 1e9)) if rank == 0 else None +size = tensor_size * 4 +n = dist.get_world_size() +busbw = (size / duration) * (2 * (n - 1) / n) +print("busbw: %f GB/s" % (busbw / 1e9)) if rank == 0 else None diff --git a/tests/perf/adam_test1.py b/tests/perf/adam_test1.py index b35477afb4fe..bde1d53e5179 100755 --- a/tests/perf/adam_test1.py +++ b/tests/perf/adam_test1.py @@ -6,12 +6,10 @@ import torch from deepspeed.ops.adam import DeepSpeedCPUAdam import time -from deepspeed.accelerator import get_accelerator device = 'cpu' model_size = 1 * 1024**3 param = torch.nn.Parameter(torch.ones(model_size, device=device)) -param_fp16 = torch.nn.Parameter(torch.ones(model_size, dtype=torch.half, device=get_accelerator().device_name(0))) optimizer = DeepSpeedCPUAdam([param]) #torch.set_num_threads(128) @@ -19,7 +17,7 @@ avg = 0 for i in range(100): start = time.time() - optimizer.step(fp16_param_groups=[param_fp16]) + optimizer.step() stop = time.time() avg += (stop - start) param.grad = torch.ones(model_size, device=device) * 2 diff --git a/tests/pytest.ini b/tests/pytest.ini index 8d043c8b3f9d..f841c47afc0c 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -1,12 +1,13 @@ [pytest] -addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion" +addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation" markers = sequential:Tests that need to be run sequentially inference:Inference model tests inference_ops:Individual inference operator tests - inference_v2: Inference tests for the v2 stack - inference_v2_ops: Op tests for the v2 stack + inference_v2:Inference tests for the v2 stack + inference_v2_ops:Op tests for the v2 stack seq_inference:Inference model tests to run sequentially nightly:Tests that should be run nightly world_size:Change world size of individual tests in a class stable_diffusion:Tests that run Stable Diffusion + evaluation:Tests that evaluate model correctness diff --git a/tests/torch_compile/ds_config_z2.json b/tests/torch_compile/ds_config_z2.json new file mode 100644 index 000000000000..30e1237c558c --- /dev/null +++ b/tests/torch_compile/ds_config_z2.json @@ -0,0 +1,40 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 
500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 2, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/ds_config_z3.json b/tests/torch_compile/ds_config_z3.json new file mode 100644 index 000000000000..361bc115eaee --- /dev/null +++ b/tests/torch_compile/ds_config_z3.json @@ -0,0 +1,41 @@ +{ + "train_batch_size": 8, + "steps_per_print": 2000, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.001, + "betas": [ + 0.8, + 0.999 + ], + "eps": 1e-8, + "weight_decay": 3e-7 + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": 0, + "warmup_max_lr": 0.001, + "warmup_num_steps": 1000 + } + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "bf16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 15 + }, + "wall_clock_breakdown": false, + "zero_optimization": { + "stage": 3, + "reduce_scatter": true, + "overlap_comm": false, + "contiguous_gradients": false + } +} diff --git a/tests/torch_compile/test_compile.py b/tests/torch_compile/test_compile.py new file mode 100644 index 000000000000..adbf6eaa947a --- /dev/null +++ b/tests/torch_compile/test_compile.py @@ -0,0 +1,98 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import argparse +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed import comm + +import torch +import intel_extension_for_pytorch # noqa: F401 # type: ignore +from torch.utils.data import Dataset, DataLoader + +torch._dynamo.config.cache_size_limit = 100 + + +def get_dynamo_stats(): + return torch._dynamo.utils.counters["graph_break"] + + +class RandomDataset(Dataset): + + def __init__(self, size, length): + self.len = length + self.data = torch.randn(length, size).to(torch.bfloat16) + + def __getitem__(self, index): + return self.data[index] + + def __len__(self): + return self.len + + +data_size = 1024 +data_length = 100 +rand_loader = DataLoader(dataset=RandomDataset(data_size, data_length), batch_size=1, shuffle=False) + + +class MyModule(torch.nn.Module): + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + self.fc0 = torch.nn.Linear(1024, 256, bias=False) + self.fc1 = torch.nn.Linear(256, 256, bias=False) + self.dropout = torch.nn.Dropout(0.5) + + def forward(self, data, residual): + output = residual + self.fc1(self.fc0(self.dropout(data))) * 0.5 + return output + + +model = MyModule() +params = model.parameters() + +parser = argparse.ArgumentParser() +parser.add_argument('--local_rank', type=int, default=-1, help='local rank passed from distributed launcher') +parser.add_argument('--deepspeed_config', + type=str, + default='ds_config_z3.json', + help='path to DeepSpeed configuration file') +cmd_args = parser.parse_args() + +# initialize the DeepSpeed engine +model_engine, optimizer, _, _ = deepspeed.initialize(args=cmd_args, model=model, model_parameters=params) +model_engine.compile() + +residual = torch.rand(256, 256, dtype=torch.float).to(get_accelerator().current_device_name()) + +start_stats = get_dynamo_stats() + +if comm.get_rank() == 0: + #print(dynamo_stats['graph_breaks']) + for item in start_stats.items(): + print(item) + +for step, batch in enumerate(rand_loader): + if step % 10 == 0 and comm.get_rank() == 0: + print(f'step={step}') + # forward() method + loss = 
model_engine(batch.to(get_accelerator().current_device_name()), residual).sum() + # runs backpropagation + model_engine.backward(loss) + # weight update + model_engine.step() + +dynamo_stats = get_dynamo_stats() + +if comm.get_rank() == 0: + # print break down of graph break stats with markdown, print in table format, start with reason, then count + # print a tag 'dynamo_output' before each line to allow post processing + print("dynamo_output | Reason | Count |") + print("dynamo_output | ------ | ----- |") + for item in dynamo_stats.items(): + # replace '|' in item[0] with a literal '|' to avoid mess with table format + item = (item[0].replace('|', r'\|'), item[1]) + print(f"dynamo_output | {item[0]} | {item[1]} |") + print(f"dynamo_output | Total | {sum(dynamo_stats.values())} |") diff --git a/tests/unit/accelerator/test_accelerator.py b/tests/unit/accelerator/test_accelerator.py new file mode 100644 index 000000000000..964cf2b24f4e --- /dev/null +++ b/tests/unit/accelerator/test_accelerator.py @@ -0,0 +1,59 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +import os +import sys +import importlib +import re + +import deepspeed + +DS_ACCEL_PATH = "deepspeed.accelerator" +IGNORE_FILES = ["abstract_accelerator.py", "real_accelerator.py"] + + +@pytest.fixture +def accel_class_name(module_name): + class_list = [] + mocked_modules = [] + + # Get the accelerator class name for a given module + while True: + try: + module = importlib.import_module(module_name) + break + except ModuleNotFoundError as e: + # If the environment is missing a module, mock it so we can still + # test importing the accelerator class + missing_module = re.search(r"\'(.*)\'", e.msg).group().strip("'") + sys.modules[missing_module] = lambda x: None + mocked_modules.append(missing_module) + for name in dir(module): + if name.endswith("_Accelerator"): + class_list.append(name) + + assert len(class_list) == 1, f"Multiple accelerator classes found in {module_name}" + + yield class_list[0] + + # Clean up mocked modules so as to not impact other tests + for module in mocked_modules: + del sys.modules[module] + + +@pytest.mark.parametrize( + "module_name", + [ + DS_ACCEL_PATH + "." 
+ f.rstrip(".py") for f in os.listdir(deepspeed.accelerator.__path__[0]) + if f.endswith("_accelerator.py") and f not in IGNORE_FILES + ], +) +def test_abstract_methods_defined(module_name, accel_class_name): + module = importlib.import_module(module_name) + accel_class = getattr(module, accel_class_name) + accel_class.__init__ = lambda self: None + _ = accel_class() diff --git a/tests/unit/alexnet_model.py b/tests/unit/alexnet_model.py index e3be2be4894d..6fe84edf4eda 100644 --- a/tests/unit/alexnet_model.py +++ b/tests/unit/alexnet_model.py @@ -11,8 +11,10 @@ import deepspeed import deepspeed.comm as dist import deepspeed.runtime.utils as ds_utils +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec +from .util import no_child_process_in_deepspeed_io class AlexNet(nn.Module): @@ -82,7 +84,7 @@ def cast_to_half(x): def cifar_trainset(fp16=False): torchvision = pytest.importorskip("torchvision", minversion="0.5.0") - import torchvision.transforms as transforms + from torchvision import transforms transform_list = [ transforms.ToTensor(), @@ -99,20 +101,25 @@ def cifar_trainset(fp16=False): dist.barrier() if local_rank != 0: dist.barrier() - data_root = os.getenv("TEST_DATA_DIR", "/tmp/") - trainset = torchvision.datasets.CIFAR10(root=os.path.join(data_root, "cifar10-data"), - train=True, - download=True, - transform=transform) + if os.getenv("CIFAR10_DATASET_PATH"): + data_root = os.getenv("CIFAR10_DATASET_PATH") + download = False + else: + data_root = os.path.join(os.getenv("TEST_DATA_DIR", "/tmp"), "cifar10-data") + download = True + trainset = torchvision.datasets.CIFAR10(root=data_root, train=True, download=download, transform=transform) if local_rank == 0: dist.barrier() return trainset def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True, seed=123): - with get_accelerator().random().fork_rng(devices=[get_accelerator().current_device_name()], - device_type=get_accelerator().device_name()): + if required_torch_version(min_version=2.1): + fork_kwargs = {"device_type": get_accelerator().device_name()} + else: + fork_kwargs = {} + with get_accelerator().random().fork_rng(devices=[get_accelerator().current_device_name()], **fork_kwargs): ds_utils.set_random_seed(seed) # disable dropout @@ -121,22 +128,11 @@ def train_cifar(model, config, num_steps=400, average_dp_losses=True, fp16=True, trainset = cifar_trainset(fp16=fp16) config['local_rank'] = dist.get_rank() - # deepspeed_io defaults to creating a dataloader that uses a - # multiprocessing pool. Our tests use pools and we cannot nest pools in - # python. Therefore we're injecting this kwarg to ensure that no pools - # are used in the dataloader. 
- old_method = deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io - - def new_method(*args, **kwargs): - kwargs["num_local_io_workers"] = 0 - return old_method(*args, **kwargs) - - deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = new_method - - engine, _, _, _ = deepspeed.initialize(config=config, - model=model, - model_parameters=[p for p in model.parameters()], - training_data=trainset) + with no_child_process_in_deepspeed_io(): + engine, _, _, _ = deepspeed.initialize(config=config, + model=model, + model_parameters=[p for p in model.parameters()], + training_data=trainset) losses = [] for step in range(num_steps): diff --git a/tests/unit/checkpoint/common.py b/tests/unit/checkpoint/common.py index d6dda2f14cbe..001c08f1a99f 100644 --- a/tests/unit/checkpoint/common.py +++ b/tests/unit/checkpoint/common.py @@ -14,6 +14,7 @@ from deepspeed.runtime.zero.stage3 import DeepSpeedZeroOptimizer_Stage3 from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from unit.common import preferred_dtype from unit.simple_model import * from unittest.mock import MagicMock, patch @@ -85,15 +86,33 @@ def compare_model_states(saved_model, loaded_model, compare_optimizer=True, load def compare_state_dicts(state0, state1, expected_mismatch_keys=[]): - for (k0, s0), (k1, s1) in zip(state0.items(), state1.items()): - assert k0 == k1, f'failure due to key mismatch {k0} != {k1}' - if k0 in expected_mismatch_keys: + key_set0 = set(k for k in state0.keys() if k not in expected_mismatch_keys) + key_set1 = set(k for k in state1.keys() if k not in expected_mismatch_keys) + assert key_set0 == key_set1, f'failure due to key mismatch {key_set0} != {key_set1}' + + for k in key_set0: + s0 = state0[k] + s1 = state1[k] + if k in expected_mismatch_keys: continue if isinstance(s0, torch.Tensor) and isinstance(s1, torch.Tensor): assert id(s0) != id(s1), f'Comparing optimizer state tensor against itself: {id(s0)} <====> {id(s1)}' assert torch.equal(s0.to('cpu'), s1.to('cpu')) else: - assert s0 == s1, f'failures with keys = {k0}, {k1}, values = {type(s0[0])} and {type(s1[0])}' + assert s0 == s1, f'failures with keys = {k}, {k}, values = {s0} and {s1}' + + +def compare_opt_state_dicts(state0, state1, expected_mismatch_keys=[]): + for param_group0, saved_param_group1 in zip(state0['param_groups'], state1['param_groups']): + compare_state_dicts(param_group0, saved_param_group1, expected_mismatch_keys) + + assert "state" in state0 + assert "state" in state1 + assert len([state0["state"].keys()]) == len([state1["state"].keys()]) + + for (k0, s0), (k1, s1) in zip(state0["state"].items(), state1["state"].items()): + assert k0 == k1, f'failure due to key mismatch {k0} != {k1}' + compare_state_dicts(s0, s1, expected_mismatch_keys) def compare_optimizer_states(saved_model, loaded_model, hidden_dim, fp16=True): @@ -150,13 +169,15 @@ def checkpoint_correctness_verification(config_dict, tmpdir, load_optimizer_states=False, load_lr_scheduler_states=False, - fp16=True, train_batch=False, base_optimizers=[None, None], empty_tag=False, seq_dataloader=False, - load_module_only=False): - dtype = torch.half if fp16 else torch.float32 + load_module_only=False, + dtype=None): + if dtype == None: + dtype = preferred_dtype() + ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0]) if seq_dataloader: @@ -197,7 +218,7 @@ def checkpoint_correctness_verification(config_dict, for root, _, files in os.walk(save_folder): for f in files: if "_expert_" in f and "_model_states" in 
f: - expert = torch.load(os.path.join(root, f)) + expert = torch.load(os.path.join(root, f), weights_only=False) needed, storages = 0, {} for name, tensor in expert.items(): needed += tensor.size().numel() @@ -228,7 +249,7 @@ def checkpoint_correctness_verification(config_dict, load_module_only=load_module_only) if load_optimizer_states: - compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16) + compare_optimizer_states(trained_model, loaded_model, hidden_dim, dtype == torch.float16) if load_lr_scheduler_states: compare_lr_scheduler_states(trained_model, loaded_model) diff --git a/tests/unit/checkpoint/test_convert_checkpoint.py b/tests/unit/checkpoint/test_convert_checkpoint.py new file mode 100644 index 000000000000..68fdecb32e16 --- /dev/null +++ b/tests/unit/checkpoint/test_convert_checkpoint.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import torch.nn as nn + +import deepspeed +from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict +from unit.common import DistributedTest + + +class ModelWithSharedWeights(nn.Module): + + def __init__(self): + super().__init__() + self.layer0 = nn.Linear(100, 100) + self.layer1 = nn.Linear(200, 200) + self.layer2 = nn.Linear(300, 300) + # tie layer 1 and layer 2 + self.layer1.weight = self.layer2.weight + + +class TestCheckpointConvert(DistributedTest): + world_size = 2 + + def test_convert_zero_checkpoint_to_fp32_state_dict(self, tmpdir): + config = { + "train_micro_batch_size_per_gpu": 2, + "zero_allow_untested_optimizer": True, + "zero_optimization": { + "stage": 3 + }, + } + model = ModelWithSharedWeights() + optimizer = torch.optim.Adam(model.parameters()) + + deepspeed_engine, _, _, _ = deepspeed.initialize( + config=config, + model=model, + optimizer=optimizer, + ) + ds_save_dir = tmpdir / "checkpoint_ds" + deepspeed_engine.save_checkpoint(ds_save_dir, tag="checkpoint") + + model = ModelWithSharedWeights() + + # save checkpoint + fp32_save_dir = tmpdir / "checkpoint_fp32" + convert_zero_checkpoint_to_fp32_state_dict(ds_save_dir, fp32_save_dir) + + # load state_dict from fp32 checkpoint + state_dict = torch.load(fp32_save_dir / 'pytorch_model.bin') + + # check shared tensor + assert id(state_dict['layer1.weight']) == id(state_dict['layer2.weight']) + + # load state_dict into model + model.load_state_dict(state_dict, strict=True) diff --git a/tests/unit/checkpoint/test_latest_checkpoint.py b/tests/unit/checkpoint/test_latest_checkpoint.py index 41ce2278680f..5d795c4dadcf 100644 --- a/tests/unit/checkpoint/test_latest_checkpoint.py +++ b/tests/unit/checkpoint/test_latest_checkpoint.py @@ -38,8 +38,8 @@ def test_existing_latest(self, tmpdir): tmpdir=tmpdir, load_optimizer_states=True, load_lr_scheduler_states=False, - fp16=False, - empty_tag=True) + empty_tag=True, + dtype=torch.float) def test_missing_latest(self, tmpdir): config_dict = { diff --git a/tests/unit/checkpoint/test_lr_scheduler.py b/tests/unit/checkpoint/test_lr_scheduler.py index c4c6773cd474..89c4dd1b49f7 100644 --- a/tests/unit/checkpoint/test_lr_scheduler.py +++ b/tests/unit/checkpoint/test_lr_scheduler.py @@ -5,6 +5,7 @@ import deepspeed from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest from unit.simple_model import * @@ -22,6 +23,8 @@ class TestLRSchedulerCheckpoint(DistributedTest): def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, 
use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") + if get_accelerator().device_name() == 'cpu': + pytest.skip("CPU accelerator does not support this test.") config_dict = { "train_batch_size": 2, @@ -35,9 +38,6 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload @@ -51,6 +51,10 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -71,6 +75,8 @@ def test_checkpoint_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") + if get_accelerator().device_name() == 'cpu': + pytest.skip("CPU accelerator does not support this test.") config_dict = { "train_batch_size": 2, @@ -81,9 +87,6 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): "lr": 1e-5 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload @@ -97,6 +100,10 @@ def test_checkpoint_no_lr_scheduler(self, tmpdir, zero_stage, use_cpu_offload): } }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_fp16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: diff --git a/tests/unit/checkpoint/test_mics_optimizer.py b/tests/unit/checkpoint/test_mics_optimizer.py index 3f853cd5c13a..9e56bf3446fa 100644 --- a/tests/unit/checkpoint/test_mics_optimizer.py +++ b/tests/unit/checkpoint/test_mics_optimizer.py @@ -8,7 +8,7 @@ import deepspeed -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * from unit.checkpoint.common import * diff --git a/tests/unit/checkpoint/test_moe_checkpoint.py b/tests/unit/checkpoint/test_moe_checkpoint.py index 0706b7327ce8..89878b5d8fa9 100644 --- a/tests/unit/checkpoint/test_moe_checkpoint.py +++ b/tests/unit/checkpoint/test_moe_checkpoint.py @@ -4,7 +4,7 @@ # DeepSpeed Team from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest from unit.simple_model import * @@ -33,10 +33,10 @@ def test_checkpoint_moe(self, tmpdir, ep_size): tmpdir=tmpdir, load_optimizer_states=True, load_lr_scheduler_states=False, - fp16=config_dict["fp16"]["enabled"], empty_tag=True, base_optimizers=optimizers, - seq_dataloader=True) + seq_dataloader=True, + dtype=torch.float16) @pytest.mark.parametrize("ep_size, load_optim_states", [(4, True), (4, False), (2, True), (2, False)]) def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): @@ -77,7 +77,7 @@ def test_checkpoint_moe_and_zero(self, tmpdir, ep_size, load_optim_states): tmpdir=tmpdir, load_optimizer_states=load_optim_states, load_lr_scheduler_states=False, - 
fp16=config_dict["fp16"]["enabled"], empty_tag=True, base_optimizers=optimizers, - seq_dataloader=True) + seq_dataloader=True, + dtype=torch.float16) diff --git a/tests/unit/checkpoint/test_other_optimizer.py b/tests/unit/checkpoint/test_other_optimizer.py index 9cb8c4286880..bcff7f5e3072 100644 --- a/tests/unit/checkpoint/test_other_optimizer.py +++ b/tests/unit/checkpoint/test_other_optimizer.py @@ -19,6 +19,8 @@ class TestOtherOptimizerCheckpoint(DistributedTest): @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], reason="lamb is not compatible") def test_checkpoint_unfused_optimizer(self, tmpdir): + #if not get_accelerator().is_fp16_supported(): + # pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -29,9 +31,6 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, "scheduler": { "type": "OneCycle", "params": { @@ -49,6 +48,10 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_fp16_supported(): + config_dict["bf16"] = {"enabled": True} args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -69,6 +72,8 @@ def test_checkpoint_unfused_optimizer(self, tmpdir): load_optimizer_states=False) def test_checkpoint_fused_optimizer(self, tmpdir): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -81,10 +86,11 @@ def test_checkpoint_fused_optimizer(self, tmpdir): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} args = args_from_dict(tmpdir, config_dict) hidden_dim = 10 @@ -129,4 +135,4 @@ def test_checkpoint_fp32_optimizer(self, tmpdir): models=models, hidden_dim=hidden_dim, tmpdir=tmpdir, - fp16=False) + dtype=torch.float32) diff --git a/tests/unit/checkpoint/test_pipeline.py b/tests/unit/checkpoint/test_pipeline.py index 99f1ba2ec433..c6c228ccada7 100644 --- a/tests/unit/checkpoint/test_pipeline.py +++ b/tests/unit/checkpoint/test_pipeline.py @@ -58,10 +58,10 @@ def test_checkpoint_pipe_engine(self, zero_stage, tmpdir): models=models, hidden_dim=models[0].hidden_dim, tmpdir=tmpdir, - fp16=config_dict['fp16']['enabled'], load_optimizer_states=True, load_lr_scheduler_states=True, - train_batch=True) + train_batch=True, + dtype=torch.float16 if zero_stage > 0 else torch.float32) @pytest.mark.parametrize( "base_topo,test_topo", diff --git a/tests/unit/checkpoint/test_universal_checkpoint.py b/tests/unit/checkpoint/test_universal_checkpoint.py new file mode 100644 index 000000000000..46d4294bdd0d --- /dev/null +++ b/tests/unit/checkpoint/test_universal_checkpoint.py @@ -0,0 +1,227 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import deepspeed +from types import SimpleNamespace +from torch.utils._pytree import tree_map + +from deepspeed.utils.torch import required_torch_version +from deepspeed.checkpoint import UNIVERSAL_CHECKPOINT_INFO +from deepspeed.checkpoint.ds_to_universal import main as convert_to_universal + +from unit.common import DistributedTest, DistributedFixture +from unit.simple_model import * +from unit.util import bf16_required_version_check + +from unit.checkpoint.common import compare_opt_state_dicts, compare_state_dicts + +import pytest +import deepspeed.comm as dist + + +def get_expected_mismatch_keys(): + # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to + # false positive mismatches in checkpoint state comparisons. + # Newer torch versions store tensor ids as 0, 1, 2, ... + return [] if required_torch_version(min_version=1.4) else ['params'] + + +def maybe_step(t): + return not torch.is_tensor(t) or (t.device.type == 'cpu' and t.numel() == 1) + + +def gather_opt_state(optimizer_state): + + def gather_tensor(t): + + if maybe_step(t): + return t + else: + buffer = [torch.zeros_like(t.flatten()) for _ in range(dist.get_world_size())] + dist.all_gather(buffer, t.flatten()) + return torch.cat(buffer) + + return tree_map(gather_tensor, optimizer_state) + + +def remove_pad_in_opt_state(optimizer_state, num_params): + + def remove_pad(t): + if maybe_step(t): + return t + else: + return t[:num_params] + + return tree_map(remove_pad, optimizer_state) + + +CP_TAG = "test_tag" + + +def init_ds_engine(model, ds_config, use_torch_adam): + + if use_torch_adam: + ds_optimizer = torch.optim.Adam(model.parameters(), lr=0.1) + del ds_config["optimizer"] + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, optimizer=ds_optimizer) + else: + model, _, _, _ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) + + return model + + +def train_save_convert(ds_config, hidden_dim, load_optim, use_torch_adam, dtype, tmpdir): + if dtype == torch.bfloat16 and not bf16_required_version_check(): + return + + test_step = 8 + + model = SimpleModel(hidden_dim) + model = init_ds_engine(model, ds_config, use_torch_adam) + data_loader = random_dataloader(model=model, + total_samples=test_step, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + if ds_config["zero_optimization"]["stage"] == 3: + model.optimizer._set_fp32_optimizer_param_groups() + sd = model.optimizer.optimizer.state_dict() if load_optim else None + model.optimizer._clear_fp32_optimizer_param_groups() + else: + sd = model.optimizer.optimizer.state_dict() if load_optim else None + + client_state = {} + client_state[UNIVERSAL_CHECKPOINT_INFO] = {} + client_state['iteration'] = test_step + model.save_checkpoint(tmpdir, tag=CP_TAG, client_state=client_state) + + cp_dir = os.path.join(tmpdir, CP_TAG) + univ_cp_dir = f"{cp_dir}_universal" + + args = SimpleNamespace(input_folder=cp_dir, + output_folder=univ_cp_dir, + num_extract_workers=1, + num_merge_workers=1, + keep_temp_folder=False, + strict=True, + inject_missing_state=False) + + dist.barrier() + if dist.get_rank() == 0: + convert_to_universal(args) + + model_state = model.state_dict() + optimizer_state = None + if load_optim: + if ds_config["zero_optimization"]["stage"] == 3: + model.optimizer._set_fp32_optimizer_param_groups() + optimizer_state = 
gather_opt_state(model.optimizer.optimizer.state_dict()) + model.optimizer._clear_fp32_optimizer_param_groups() + else: + optimizer_state = gather_opt_state(model.optimizer.optimizer.state_dict()) + + if dist.get_rank() == 0: + torch.save((model_state, optimizer_state), os.path.join(tmpdir, "baseline_state.pt")) + + dist.barrier() + model.destroy() + + +@pytest.fixture +def ds_config(zero_stage, dtype): + ds_config = { + "train_batch_size": 8, + "optimizer": { + "type": 'Adam' + }, + "zero_optimization": { + "stage": zero_stage, + } + } + if dtype == torch.float16: + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif dtype == torch.bfloat16: + ds_config["bf16"] = {"enabled": True} + return ds_config + + +class _baseline(DistributedFixture): + world_size = None + + def run(self, tmpdir, ds_config, zero_stage, dtype, load_optim, use_torch_adam): + hidden_dim = 10 + train_save_convert(ds_config, hidden_dim, load_optim, use_torch_adam, dtype, tmpdir) + + +class baseline_ws2(_baseline): + world_size = 2 + + +class baseline_ws4(_baseline): + world_size = 4 + + +@pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16, torch.float32]) +@pytest.mark.parametrize("zero_stage", [1, 3]) +@pytest.mark.parametrize("use_torch_adam", [False, True]) +@pytest.mark.parametrize("load_optim", [False, True]) +class TestZeROUniversalCheckpointDP(DistributedTest): + + def _run_test(self, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + if dtype == torch.bfloat16 and not bf16_required_version_check(): + pytest.skip( + " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" + ) + + hidden_dim = 10 + loaded_model_state, loaded_optimizer_state = torch.load(f"{tmpdir}/baseline_state.pt", weights_only=False) + + ds_config["checkpoint"] = {"load_universal": True} + univ_model = SimpleModel(hidden_dim) + univ_model = init_ds_engine(univ_model, ds_config, use_torch_adam) + univ_model.load_checkpoint(tmpdir, tag=f"{CP_TAG}_universal", load_optimizer_states=load_optim) + + model_state = univ_model.state_dict() + compare_state_dicts(model_state, loaded_model_state) + + if load_optim and ds_config["zero_optimization"]["stage"] != 3: + optimizer_state = gather_opt_state(univ_model.optimizer.optimizer.state_dict()) + # padding sizes may differ when dp sizes are different + param_count = sum(p.numel() for p in univ_model.parameters()) + optimizer_state = remove_pad_in_opt_state(optimizer_state, param_count) + loaded_optimizer_state = remove_pad_in_opt_state(loaded_optimizer_state, param_count) + + compare_opt_state_dicts(optimizer_state, loaded_optimizer_state, get_expected_mismatch_keys()) + + # Run training again to verify that the optimizer has necessary states + test_step = 8 + data_loader = random_dataloader(model=univ_model, + total_samples=test_step, + hidden_dim=hidden_dim, + device=univ_model.device, + dtype=dtype) + for batch in data_loader: + loss = univ_model(batch[0], batch[1]) + univ_model.backward(loss) + univ_model.step() + + univ_model.destroy() + + @pytest.mark.world_size(2) + def test_dp_world_size_2to2(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) + + @pytest.mark.world_size(2) + def test_dp_world_size_4to2(self, baseline_ws4, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) + + @pytest.mark.world_size(4) + def 
test_dp_world_size_2to4(self, baseline_ws2, tmpdir, dtype, ds_config, load_optim, use_torch_adam): + self._run_test(tmpdir, dtype, ds_config, load_optim, use_torch_adam) diff --git a/tests/unit/checkpoint/test_zero_optimizer.py b/tests/unit/checkpoint/test_zero_optimizer.py index f2237341ef68..44966b331d0f 100644 --- a/tests/unit/checkpoint/test_zero_optimizer.py +++ b/tests/unit/checkpoint/test_zero_optimizer.py @@ -8,7 +8,7 @@ from deepspeed.ops.op_builder import CPUAdamBuilder from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest, DistributedFixture from unit.simple_model import * @@ -28,15 +28,15 @@ def test_pipeline_checkpoint_loading(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, "pipeline_loading_checkpoint": True, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(): @@ -64,16 +64,16 @@ def test_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, adam_op "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "wall_clock_breakdown": True, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -104,14 +104,15 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage, use_cpu_offload, ada "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage, "cpu_offload": use_cpu_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -134,11 +135,11 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): "stage": zero_stage }, "zero_allow_untested_optimizer": True, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 models = [SimpleModel(hidden_dim=hidden_dim) for _ in range(2)] optimizers = [HybridStateOptimizer(model.parameters()) for model in models] @@ -152,19 +153,21 @@ def test_hybrid_optimizer_state(self, tmpdir, zero_stage): @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) def test_load_module_only(self, tmpdir, zero_stage): + if zero_stage == 0 and get_accelerator().device_name() == "cpu": + pytest.skip("CPU Accelerator does not support this test") config_dict = { "train_batch_size": 2, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif 
get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 if zero_stage == 3: @@ -185,15 +188,15 @@ def run(self, class_tmpdir, elastic_save, load_optim): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_save } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -221,15 +224,15 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_save } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 # torch 1.2.* stores raw tensor id numbers in checkpoint state which leads to @@ -240,13 +243,18 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l model, _, _, _ = deepspeed.initialize(config=ds_config, model=models[0], model_parameters=models[0].parameters()) - data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) + run_steps = 8 + data_loader = random_dataloader(model=model, + total_samples=run_steps, + hidden_dim=hidden_dim, + device=model.device) for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) model.backward(loss) model.step() if load_optim: - torch.save(model.optimizer.optimizer.state_dict(), os.path.join(tmpdir, 'opt-state-dict')) + opt_state_dict_file = f'opt-state-dict_rank{dist.get_rank()}' + torch.save(model.optimizer.optimizer.state_dict(), os.path.join(tmpdir, opt_state_dict_file)) model.save_checkpoint(tmpdir) ds_config["zero_optimization"]["elastic_checkpoint"] = elastic_load @@ -256,10 +264,9 @@ def test_elastic_checkpoint_fixed_dp(self, tmpdir, elastic_save, elastic_load, l model.load_checkpoint(tmpdir, load_optimizer_states=load_optim) if load_optim: - saved_sd = torch.load(os.path.join(tmpdir, 'opt-state-dict')) + saved_sd = torch.load(os.path.join(tmpdir, opt_state_dict_file), weights_only=False) curr_sd = model.optimizer.optimizer.state_dict() - for curr_param_group, saved_param_group in zip(curr_sd['param_groups'], saved_sd['param_groups']): - compare_state_dicts(curr_param_group, saved_param_group, expected_mismatch_keys) + compare_opt_state_dicts(curr_sd, saved_sd, expected_mismatch_keys) data_loader = random_dataloader(model=model, total_samples=8, hidden_dim=hidden_dim, device=model.device) for n, batch in enumerate(data_loader): @@ -274,15 +281,15 @@ def test_elastic_checkpoint_change_dp(self, ws4_model_checkpoint, class_tmpdir, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 2, "elastic_checkpoint": elastic_load } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -305,14 +312,14 @@ def test_immediate_save_load(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - 
"initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -325,30 +332,27 @@ def test_immediate_save_load(self, tmpdir, zero_stage): @pytest.mark.parametrize('zero_stage', [0, 1, 2, 3]) def test_load_immediate_save(self, tmpdir, zero_stage): + if zero_stage == 0 and get_accelerator().device_name() == "cpu": + pytest.skip("CPU Accelerator does not support this test") config_dict = { "train_batch_size": 4, "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) # 1. pretrain a model and save it - dtype = torch.half ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=1, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=dtype) + data_loader = random_dataloader(model=ds_model, total_samples=1, hidden_dim=hidden_dim, device=ds_model.device) for _, batch in enumerate(data_loader): loss = ds_model(batch[0], batch[1]) ds_model.backward(loss) @@ -371,10 +375,6 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, "stage3_gather_fp16_weights_on_model_save": True, @@ -383,6 +383,10 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): "train_micro_batch_size_per_gpu": 1, "train_batch_size": 4, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -391,11 +395,7 @@ def test_save_before_accum_grad_is_done(self, tmpdir, zero_stage): # So we config grad_accum=2 and step only once and save_16bit_model ds_model = create_deepspeed_model(config_dict=config_dict, model=model, base_optimizer=None) - data_loader = random_dataloader(model=ds_model, - total_samples=2, - hidden_dim=hidden_dim, - device=ds_model.device, - dtype=torch.half) + data_loader = random_dataloader(model=ds_model, total_samples=2, hidden_dim=hidden_dim, device=ds_model.device) batch = next(iter(data_loader)) loss = ds_model(batch[0], batch[1]) @@ -429,15 +429,15 @@ def test_load_optimizer_state(self, tmpdir, zero_stage): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "wall_clock_breakdown": True, "zero_optimization": { "stage": zero_stage } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -460,13 +460,14 @@ def test_not_load_optimizer_state(self, tmpdir, zero_stage): "weight_decay": 3e-7 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage } } + if 
get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -481,14 +482,14 @@ def test_load_module_only(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 with deepspeed.zero.Init(enabled=zero_stage == 3): @@ -504,14 +505,14 @@ def test_save_exclude_frozen_weights(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleFrozenModel(hidden_dim, empty_grad=False) @@ -522,7 +523,7 @@ def test_save_exclude_frozen_weights(self, tmpdir, zero_stage): all_ckpt_folder = os.path.join(tmpdir, 'all_params') ds_engine.save_checkpoint(all_ckpt_folder) all_params_ckpt_file = get_model_ckpt_name_for_rank(os.path.join(all_ckpt_folder, 'global_step0'), '00') - loaded_all_param_model = torch.load(all_params_ckpt_file)['module'] + loaded_all_param_model = torch.load(all_params_ckpt_file, weights_only=False)['module'] all_param_names = set([n for n, p in model.named_parameters()]) assert set(loaded_all_param_model.keys()) == all_param_names @@ -535,7 +536,7 @@ def test_save_exclude_frozen_weights(self, tmpdir, zero_stage): # Excluding frozen parameters should reduce checkpoint size assert os.path.getsize(all_params_ckpt_file) > os.path.getsize(trainable_ckpt_file) - loaded_trainable_param_model = torch.load(trainable_ckpt_file)['module'] + loaded_trainable_param_model = torch.load(trainable_ckpt_file, weights_only=False)['module'] frozen_param_names = set([n for n, p in model.named_parameters() if not p.requires_grad]) loaded_trainable_param_names = set(loaded_trainable_param_model.keys()) overlap_names = set.intersection(loaded_trainable_param_names, frozen_param_names) @@ -552,14 +553,14 @@ def test_save_exclude_custom_frozen_weights(self, tmpdir, zero_stage): "optimizer": { "type": 'Adam' }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleFrozenModel(hidden_dim, empty_grad=False) @@ -574,7 +575,7 @@ def test_save_exclude_custom_frozen_weights(self, tmpdir, zero_stage): custom_state_dict_ckpt_file = get_model_ckpt_name_for_rank( os.path.join(custom_state_dict_ckpt_folder, 'global_step0'), '00') - loaded_custom_state_dict_param_model = torch.load(custom_state_dict_ckpt_file)['module'] + loaded_custom_state_dict_param_model = torch.load(custom_state_dict_ckpt_file, weights_only=False)['module'] loaded_custom_state_dict_param_names = set(loaded_custom_state_dict_param_model.keys()) custom_state_dict_param_names = set([k for k, v in model.state_dict().items()]) @@ -617,7 +618,8 @@ 
def test_save_tensor_clone(self, tmpdir, zero_stage, use_cpu_device): clone_ckpt_file = os.path.join(tmpdir, 'clone_ckpt.pt') torch.save(clone_state_dict, clone_ckpt_file) - compare_state_dicts(torch.load(ref_ckpt_file), torch.load(clone_ckpt_file)) + compare_state_dicts(torch.load(ref_ckpt_file, weights_only=False), + torch.load(clone_ckpt_file, weights_only=False)) class TestZeRONonDistributed(DistributedTest): diff --git a/tests/unit/comm/test_dist.py b/tests/unit/comm/test_dist.py index a40805bf0a75..861ba5c7be1a 100644 --- a/tests/unit/comm/test_dist.py +++ b/tests/unit/comm/test_dist.py @@ -127,13 +127,22 @@ def test(self): assert torch.all(x == result) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16, torch.float16]) class TestDistInferenceAllReduce(DistributedTest): - world_size = 4 + device_count = get_accelerator().device_count() + if device_count >= 4: + world_size = [1, 2, 4] + elif device_count >= 2: + world_size = [1, 2] + else: + world_size = [1] - def test(self): + def test(self, dtype): x = torch.ones(1, 3).to(get_accelerator().device_name()) * (dist.get_rank() + 1) sum_of_ranks = (dist.get_world_size() * (dist.get_world_size() + 1)) // 2 result = torch.ones(1, 3).to(get_accelerator().device_name()) * sum_of_ranks + result = result.to(dtype) + x = x.to(dtype) dist.inference_all_reduce(x) assert torch.all(x == result) diff --git a/tests/unit/common.py b/tests/unit/common.py index cdeca54b01ee..1498b0400ee1 100644 --- a/tests/unit/common.py +++ b/tests/unit/common.py @@ -23,7 +23,7 @@ from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker # Worker timeout for tests that hang -DEEPSPEED_TEST_TIMEOUT = 600 +DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600')) def is_rocm_pytorch(): @@ -58,6 +58,20 @@ def get_master_port(base_port=29500, port_range_size=1000): raise IOError('no free ports') +def _get_cpu_socket_count(): + import shlex + p1 = subprocess.Popen(shlex.split("cat /proc/cpuinfo"), stdout=subprocess.PIPE) + p2 = subprocess.Popen(["grep", "physical id"], stdin=p1.stdout, stdout=subprocess.PIPE) + p1.stdout.close() + p3 = subprocess.Popen(shlex.split("sort -u"), stdin=p2.stdout, stdout=subprocess.PIPE) + p2.stdout.close() + p4 = subprocess.Popen(shlex.split("wc -l"), stdin=p3.stdout, stdout=subprocess.PIPE) + p3.stdout.close() + r = int(p4.communicate()[0]) + p4.stdout.close() + return r + + def set_accelerator_visible(): cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", None) xdist_worker_id = get_xdist_worker_id() @@ -81,16 +95,26 @@ def set_accelerator_visible(): match = re.search('Device Type.*GPU', line) if match: num_accelerators += 1 + elif get_accelerator().device_name() == 'hpu': + try: + hl_smi = subprocess.check_output(['hl-smi', "-L"]) + num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode()) + except FileNotFoundError: + sim_list = subprocess.check_output(['ls', '-1', '/dev/accel']) + num_accelerators = re.findall(r"accel(\d+)", sim_list.decode()) + num_accelerators = sorted(num_accelerators, key=int) + os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators) elif get_accelerator().device_name() == 'npu': npu_smi = subprocess.check_output(['npu-smi', 'info', '-l']) num_accelerators = int(npu_smi.decode('utf-8').strip().split('\n')[0].split(':')[1].strip()) else: assert get_accelerator().device_name() == 'cpu' - cpu_sockets = int( - subprocess.check_output('cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', shell=True)) - num_accelerators = cpu_sockets + 
num_accelerators = _get_cpu_socket_count() - cuda_visible = ",".join(map(str, range(num_accelerators))) + if isinstance(num_accelerators, list): + cuda_visible = ",".join(num_accelerators) + else: + cuda_visible = ",".join(map(str, range(num_accelerators))) # rotate list based on xdist worker id, example below # wid=0 -> ['0', '1', '2', '3'] @@ -113,6 +137,7 @@ class DistributedExec(ABC): set_dist_env = True requires_cuda_env = True reuse_dist_env = False + non_daemonic_procs = False _pool_cache = {} exec_timeout = DEEPSPEED_TEST_TIMEOUT @@ -120,16 +145,13 @@ class DistributedExec(ABC): def run(self): ... - def __call__(self, request=None): + def __call__(self, request): self._fixture_kwargs = self._get_fixture_kwargs(request, self.run) world_size = self.world_size if self.requires_cuda_env and not get_accelerator().is_available(): pytest.skip("only supported in accelerator environments.") - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) + self._launch_with_file_store(request, world_size) def _get_fixture_kwargs(self, request, func): if not request: @@ -145,18 +167,15 @@ def _get_fixture_kwargs(self, request, func): pass # test methods can have kwargs that are not fixtures return fixture_kwargs - def _launch_procs(self, num_procs): - # Verify we have enough accelerator devices to run this test - if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: - pytest.skip( - f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" - ) - - # Set start method to `forkserver` (or `fork`) - mp.set_start_method('forkserver', force=True) - + def _launch_daemonic_procs(self, num_procs, init_method): # Create process pool or use cached one master_port = None + + if get_accelerator().device_name() == 'hpu': + if self.reuse_dist_env: + print("Ignoring reuse_dist_env for hpu") + self.reuse_dist_env = False + if self.reuse_dist_env: if num_procs not in self._pool_cache: self._pool_cache[num_procs] = mp.Pool(processes=num_procs) @@ -167,7 +186,7 @@ def _launch_procs(self, num_procs): master_port = get_master_port() # Run the test - args = [(local_rank, num_procs, master_port) for local_rank in range(num_procs)] + args = [(local_rank, num_procs, master_port, init_method) for local_rank in range(num_procs)] skip_msgs_async = pool.starmap_async(self._dist_run, args) try: @@ -176,19 +195,93 @@ def _launch_procs(self, num_procs): # Shortcut to exit pytest in the case of a hanged test. 
This # usually means an environment error and the rest of tests will # hang (causing super long unit test runtimes) - pytest.exit("Test hanged, exiting", returncode=0) - - # Tear down distributed environment and close process pools - self._close_pool(pool, num_procs) + pytest.exit("Test hanged, exiting", returncode=1) + finally: + # Regardless of the outcome, ensure proper teardown + # Tear down distributed environment and close process pools + self._close_pool(pool, num_procs) # If we skipped a test, propagate that to this process if any(skip_msgs): assert len(set(skip_msgs)) == 1, "Multiple different skip messages received" pytest.skip(skip_msgs[0]) - def _dist_run(self, local_rank, num_procs, master_port): - skip_msg = '' - if not dist.is_initialized(): + def _launch_non_daemonic_procs(self, num_procs, init_method): + assert not self.reuse_dist_env, "Cannot reuse distributed environment with non-daemonic processes" + + master_port = get_master_port() + skip_msg = mp.Queue() # Allows forked processes to share pytest.skip reason + processes = [] + prev_start_method = mp.get_start_method() + mp.set_start_method('spawn', force=True) + for local_rank in range(num_procs): + p = mp.Process(target=self._dist_run, args=(local_rank, num_procs, master_port, init_method, skip_msg)) + p.start() + processes.append(p) + mp.set_start_method(prev_start_method, force=True) + + # Now loop and wait for a test to complete. The spin-wait here isn't a big + # deal because the number of processes will be O(#GPUs) << O(#CPUs). + any_done = False + start = time.time() + while (not any_done) and ((time.time() - start) < self.exec_timeout): + for p in processes: + if not p.is_alive(): + any_done = True + break + time.sleep(.1) # So we don't hog CPU + + # If we hit the timeout, then presume a test is hanged + if not any_done: + for p in processes: + p.terminate() + pytest.exit("Test hanged, exiting", returncode=1) + + # Wait for all other processes to complete + for p in processes: + p.join(self.exec_timeout) + + failed = [(rank, p) for rank, p in enumerate(processes) if p.exitcode != 0] + for rank, p in failed: + # If it still hasn't terminated, kill it because it hung. 
+ if p.exitcode is None: + p.terminate() + pytest.fail(f'Worker {rank} hung.', pytrace=False) + if p.exitcode < 0: + pytest.fail(f'Worker {rank} killed by signal {-p.exitcode}', pytrace=False) + if p.exitcode > 0: + pytest.fail(f'Worker {rank} exited with code {p.exitcode}', pytrace=False) + + if not skip_msg.empty(): + # This assumed all skip messages are the same, it may be useful to + # add a check here to assert all exit messages are equal + pytest.skip(skip_msg.get()) + + def _launch_procs(self, num_procs, init_method): + # Verify we have enough accelerator devices to run this test + if get_accelerator().is_available() and get_accelerator().device_count() < num_procs: + pytest.skip( + f"Skipping test because not enough GPUs are available: {num_procs} required, {get_accelerator().device_count()} available" + ) + + if get_accelerator().device_name() == 'xpu': + self.non_daemonic_procs = True + self.reuse_dist_env = False + + # Set start method to `forkserver` (or `fork`) + mp.set_start_method('forkserver', force=True) + + if self.non_daemonic_procs: + self._launch_non_daemonic_procs(num_procs, init_method) + else: + self._launch_daemonic_procs(num_procs, init_method) + + def _dist_run(self, local_rank, num_procs, master_port, init_method, skip_msg=""): + if dist.is_initialized(): + if get_accelerator().is_available(): + # local_rank might not match the rank in the previous run if you are reusing the environment + get_accelerator().set_device(dist.get_rank()) + else: """ Initialize deepspeed.comm and execute the user function. """ if self.set_dist_env: os.environ['MASTER_ADDR'] = '127.0.0.1' @@ -211,19 +304,41 @@ def _dist_run(self, local_rank, num_procs, master_port): get_accelerator().set_device(local_rank) if self.init_distributed: - deepspeed.init_distributed(dist_backend=self.backend) + deepspeed.init_distributed(dist_backend=self.backend, + init_method=init_method, + rank=local_rank, + world_size=num_procs) dist.barrier() try: self.run(**self._fixture_kwargs) except BaseException as e: if isinstance(e, Skipped): - skip_msg = e.msg + if self.non_daemonic_procs: + skip_msg.put(e.msg) + else: + skip_msg = e.msg else: raise e return skip_msg + def _launch_with_file_store(self, request, world_size): + tmpdir = request.getfixturevalue("tmpdir") + dist_file_store = tmpdir.join("dist_file_store") + assert not os.path.exists(dist_file_store) + init_method = f"file://{dist_file_store}" + + if isinstance(world_size, int): + world_size = [world_size] + for procs in world_size: + try: + self._launch_procs(procs, init_method) + finally: + if os.path.exists(dist_file_store): + os.remove(dist_file_store) + time.sleep(0.5) + def _dist_destroy(self): if (dist is not None) and dist.is_initialized(): dist.barrier() @@ -367,13 +482,9 @@ def __call__(self, request): world_size = mark.args[0] break else: - world_size = self.world_size + world_size = self._fixture_kwargs.get("world_size", self.world_size) - if isinstance(world_size, int): - world_size = [world_size] - for procs in world_size: - self._launch_procs(procs) - time.sleep(0.5) + self._launch_with_file_store(request, world_size) def _get_current_test_func(self, request): # DistributedTest subclasses may have multiple test methods @@ -384,3 +495,13 @@ def _get_current_test_func(self, request): def get_test_path(filename): curr_path = Path(__file__).parent return str(curr_path.joinpath(filename)) + + +# fp16 > bf16 > fp32 +def preferred_dtype(): + if get_accelerator().is_fp16_supported(): + return torch.float16 + elif 
get_accelerator().is_bf16_supported(): + return torch.bfloat16 + else: + return torch.float32 diff --git a/tests/unit/compression/test_compression.py b/tests/unit/compression/test_compression.py index c6e5031349cb..1802c09f33b5 100644 --- a/tests/unit/compression/test_compression.py +++ b/tests/unit/compression/test_compression.py @@ -14,7 +14,7 @@ from deepspeed.compression.basic_layer import LinearLayer_Compress, ColumnParallelLinear_Compress, RowParallelLinear_Compress from deepspeed.compression.helper import convert_conv1d_to_linear from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from unit.common import DistributedTest pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5), diff --git a/tests/unit/compression/test_dequantization.py b/tests/unit/compression/test_dequantization.py index 692f4cef97d7..8446904754b3 100644 --- a/tests/unit/compression/test_dequantization.py +++ b/tests/unit/compression/test_dequantization.py @@ -7,8 +7,9 @@ import os import torch +import pytest from unit.common import DistributedTest -from deepspeed.ops.op_builder import InferenceBuilder +import deepspeed from deepspeed.accelerator import get_accelerator @@ -18,7 +19,11 @@ def init(self): local_rank = int(os.getenv("LOCAL_RANK", "0")) self.device = torch.device(get_accelerator().device_name(local_rank)) - self.dequantize_func = InferenceBuilder().load().dequantize_fp16 + from deepspeed.ops.op_builder import InferenceBuilder + if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("InferenceBuilder is not implemented") + else: + self.dequantize_func = InferenceBuilder().load().dequantize_fp16 def run_dequantize_test(self, M, N, num_groups): weight = torch.randint(-255, 255, (M, N)).to(dtype=torch.int8, device=self.device) diff --git a/tests/unit/elasticity/test_elastic.py b/tests/unit/elasticity/test_elastic.py index a49ec595a420..1f7cbbbca214 100644 --- a/tests/unit/elasticity/test_elastic.py +++ b/tests/unit/elasticity/test_elastic.py @@ -9,10 +9,10 @@ from deepspeed.git_version_info import version as ds_version import os from unit.simple_model import SimpleModel -from deepspeed.ops.op_builder import FusedAdamBuilder +from deepspeed.ops.op_builder import FusedAdamBuilder, FusedLambBuilder if not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]: - pytest.skip("This op had not been implemented on this system.", allow_module_level=True) + pytest.skip("This op has not been implemented on this system.", allow_module_level=True) @pytest.fixture @@ -150,6 +150,8 @@ def test_proper_mbsz(ds_config): class TestNonElasticBatchParams(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test(self): config_dict = { "train_batch_size": 2, @@ -182,6 +184,8 @@ def test(self): class TestNonElasticBatchParamsWithOverride(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test(self): config_dict = { "train_batch_size": 2, @@ -213,6 +217,8 @@ def test(self): class TestElasticConfigChanged(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test(self): 
config_dict = { "train_batch_size": 2, diff --git a/tests/unit/hybrid_engine/test_he_lora.py b/tests/unit/hybrid_engine/test_he_lora.py index ea27239ed55e..5f53a237c340 100644 --- a/tests/unit/hybrid_engine/test_he_lora.py +++ b/tests/unit/hybrid_engine/test_he_lora.py @@ -15,6 +15,7 @@ import numpy.testing as npt from unit.common import DistributedTest from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -125,7 +126,8 @@ def get_model(self, model_name): model_config.dropout = 0.0 model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config) model = model.half() - model = model.to(f'cuda:{local_rank}') + device = get_accelerator().device_name() + model = model.to(f'{device}:{local_rank}') return model def get_tokenizer(self, model_name): @@ -190,7 +192,8 @@ def test_lora(self, batch_size, model_name, zero_stage, offload_device): model.train() batch = tokenizer(train_sentences, max_length=16, padding="max_length", truncation=True, return_tensors="pt") - batch = to_device(batch, f'cuda:{local_rank}') + device = get_accelerator().device_name() + batch = to_device(batch, f'{device}:{local_rank}') batch["labels"] = batch["input_ids"] outputs = model(**batch, use_cache=False) loss = outputs.loss diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py index 56df2b232d15..77b51fcd5814 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -11,7 +11,7 @@ from deepspeed.inference.quantization.quantization import _init_group_wise_weight_quantization from deepspeed.inference.quantization.utils import Quantizer, DeQuantizer from deepspeed.inference.quantization.layers import QuantizedLinear -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from transformers.models.opt.modeling_opt import OPTDecoderLayer from transformers import AutoConfig, OPTConfig, AutoModel import pytest @@ -55,7 +55,7 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int): def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): import deepspeed - from transformers.deepspeed import HfDeepSpeedConfig + from transformers.integrations.deepspeed import HfDeepSpeedConfig def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: GB = 1 << 30 @@ -172,7 +172,7 @@ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: b def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): import deepspeed - from transformers.deepspeed import HfDeepSpeedConfig + from transformers.integrations.deepspeed import HfDeepSpeedConfig def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: GB = 1 << 30 diff --git a/tests/unit/inference/test_checkpoint_sharding.py b/tests/unit/inference/test_checkpoint_sharding.py index 564b3fab6bf4..f1e37ee26536 100644 --- a/tests/unit/inference/test_checkpoint_sharding.py +++ b/tests/unit/inference/test_checkpoint_sharding.py @@ -14,6 +14,7 @@ from huggingface_hub import snapshot_download from transformers.utils import is_offline_mode from 
deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) @@ -44,6 +45,8 @@ def model_name(request): @pytest.fixture(params=[torch.float16, torch.int8], ids=["fp16", "int8"]) def dtype(request): + if request.param not in get_accelerator().supported_dtypes(): + pytest.skip(f"{request.param} not supported by {get_accelerator().device_name()}.") return request.param @@ -110,7 +113,7 @@ def write_checkpoints_json(model_name, class_tmpdir): cached_repo_dir = snapshot_download( model_name, local_files_only=is_offline_mode(), - cache_dir=os.getenv("TRANSFORMERS_CACHE", None), + cache_dir=os.getenv("HF_HOME", None), ignore_patterns=["*.safetensors", "*.msgpack", "*.h5"], ) file_list = [str(entry) for entry in Path(cached_repo_dir).rglob("*.[bp][it][n]") if entry.is_file()] diff --git a/tests/unit/inference/test_human_eval.py b/tests/unit/inference/test_human_eval.py new file mode 100644 index 000000000000..2525aeb5aa0e --- /dev/null +++ b/tests/unit/inference/test_human_eval.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import os +import torch +from deepspeed.accelerator import get_accelerator + + +@pytest.mark.evaluation +@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"]) +def test_human_eval(model_name): + import mii + import numpy + from transformers import pipeline + from human_eval.data import write_jsonl, read_problems + from human_eval.evaluation import evaluate_functional_correctness + + def generate_base_completion(pipe, problem_prompt: str) -> str: + return pipe(problem_prompt, do_sample=True)[0]["generated_text"] + + def generate_mii_completion(pipe, problem_prompt: str) -> str: + return pipe(problem_prompt, max_new_tokens=512)[0].generated_text + + def generate_samples(pipe, generation_function): + samples = [ + dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"])) + for task_id in problems for _ in range(num_samples_per_task) + ] + return samples + + # Loading Problems + problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz") + num_samples_per_task = 20 + + # Initializing HuggingFace Pipeline + local_rank = os.getenv("LOCAL_RANK", "0") + device = torch.device(get_accelerator().device_name(local_rank)) + base_pipe = pipeline(model=model_name, + device=torch.device(get_accelerator().device_name(local_rank)), + max_length=512, + return_full_text=False) + + # Generating Base Samples + base_samples = generate_samples(base_pipe, generate_base_completion) + + # Base Pipeline Teardown + del base_pipe + get_accelerator().empty_cache() + + # Initializing DeepSpeed-MII Pipeline + mii_pipe = mii.pipeline(model_name) + + # Generating MII Samples + mii_samples = generate_samples(mii_pipe, generate_mii_completion) + + # MII Pipeline Teardown + mii_pipe.destroy() + + # Writing Samples + write_jsonl("base_samples.jsonl", base_samples) + write_jsonl("mii_samples.jsonl", mii_samples) + + # Evaluating Samples + base_results = evaluate_functional_correctness("base_samples.jsonl") + mii_results = evaluate_functional_correctness("mii_samples.jsonl") + + # Executing Assertions + for key in base_results.keys(): + assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \ + f"Base result: {base_results[key]}, MII result: {mii_results[key]}, 
outside of rtol." diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py index 6b5588d8a1f7..df85ed232a2e 100644 --- a/tests/unit/inference/test_inference.py +++ b/tests/unit/inference/test_inference.py @@ -3,52 +3,63 @@ # DeepSpeed Team -import os -import time -import torch import pytest + import itertools +import pickle +import os +import time +import requests +import fcntl + +from dataclasses import dataclass +from typing import List + import deepspeed -from deepspeed.git_version_info import torch_info -from unit.common import DistributedTest +import torch + +from huggingface_hub import HfApi from packaging import version as pkg_version -from deepspeed.ops.op_builder import OpBuilder -from transformers import pipeline, AutoTokenizer +from torch import nn +from transformers import pipeline from transformers.models.t5.modeling_t5 import T5Block from transformers.models.roberta.modeling_roberta import RobertaLayer -from huggingface_hub import HfApi -from deepspeed.model_implementations import DeepSpeedTransformerInference -from torch import nn + from deepspeed.accelerator import get_accelerator +from deepspeed.git_version_info import torch_info +from deepspeed.model_implementations import DeepSpeedTransformerInference from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.op_builder import OpBuilder + +from unit.common import DistributedTest rocm_version = OpBuilder.installed_rocm_version() if rocm_version != (0, 0): pytest.skip("skip inference tests on rocm for now", allow_module_level=True) _bert_models = [ - "bert-base-cased", - "bert-base-uncased", - "bert-large-cased", - "bert-large-uncased", - "bert-base-multilingual-cased", - "bert-base-multilingual-uncased", + "google-bert/bert-base-cased", + "google-bert/bert-base-uncased", + "google-bert/bert-large-cased", + "google-bert/bert-large-uncased", + "google-bert/bert-base-multilingual-cased", + "google-bert/bert-base-multilingual-uncased", "deepset/minilm-uncased-squad2", "cross-encoder/ms-marco-MiniLM-L-12-v2", "dslim/bert-base-NER", - "bert-large-uncased-whole-word-masking-finetuned-squad", - "distilbert-base-cased-distilled-squad", + "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad", + "distilbert/distilbert-base-cased-distilled-squad", ] _roberta_models = [ - "roberta-large", - "roberta-base", + "FacebookAI/roberta-large", + "FacebookAI/roberta-base", "deepset/roberta-base-squad2", "j-hartmann/emotion-english-distilroberta-base", "Jean-Baptiste/roberta-large-ner-english", ] _gpt_models = [ - "gpt2", - "distilgpt2", + "openai-community/gpt2", + "distilbert/distilgpt2", "Norod78/hebrew-bad_wiki-gpt_neo-tiny", "EleutherAI/gpt-j-6b", "EleutherAI/pythia-70m-deduped", @@ -64,10 +75,73 @@ "text2text-generation", "summarization", "translation" ] + +@dataclass +class ModelInfo: + id: str + pipeline_tag: str + tags: List[str] + + +def _hf_model_list() -> List[ModelInfo]: + """ Caches HF model list to avoid repeated API calls """ + + cache_dir = os.getenv("HF_HOME", "~/.cache/huggingface") + cache_file_path = os.path.join(cache_dir, "DS_model_cache.pkl") + num_days = os.getenv("HF_CACHE_EXPIRY_DAYS", 1) + cache_expiration_seconds = num_days * 60 * 60 * 24 + + # Load or initialize the cache + model_data = {"cache_time": 0, "model_list": []} + if os.path.isfile(cache_file_path): + with open(cache_file_path, 'rb') as f: + try: + fcntl.flock(f, fcntl.LOCK_SH) + model_data = pickle.load(f) + except Exception as e: + print(f"Error loading cache file {cache_file_path}: {e}") 
+ finally: + fcntl.flock(f, fcntl.LOCK_UN) + + current_time = time.time() + + # Update the cache if it has expired + if ((model_data["cache_time"] + cache_expiration_seconds) < current_time) or os.getenv("FORCE_UPDATE_HF_CACHE", + default=False): + api = HfApi() + while True: + try: + model_list = [] + for model in _test_models: + model_list.extend(api.list_models(model_name=model)) + model_data["model_list"] = [ + ModelInfo(id=m.id, pipeline_tag=m.pipeline_tag, tags=m.tags) for m in model_list + ] + break # Exit the loop if the operation is successful + except requests.exceptions.HTTPError as e: + if e.response.status_code == 429: + print("Rate limit exceeded. Retrying in 60 seconds...") + time.sleep(60) + else: + raise # Re-raise the exception if it's not a 429 error + model_data["cache_time"] = current_time + + # Save the updated cache + os.makedirs(cache_dir, exist_ok=True) + with open(cache_file_path, 'wb') as f: + try: + fcntl.flock(f, fcntl.LOCK_EX) + pickle.dump(model_data, f) + finally: + fcntl.flock(f, fcntl.LOCK_UN) + + return model_data["model_list"] + + # Get a list of all models and mapping from task to supported models -_hf_models = list(HfApi().list_models()) -_hf_model_names = [m.modelId for m in _hf_models] -_hf_task_to_models = {task: [m.modelId for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} +_hf_models = _hf_model_list() +_hf_model_names = [m.id for m in _hf_models] +_hf_task_to_models = {task: [m.id for m in _hf_models if m.pipeline_tag == task] for task in _test_tasks} # Get all combinations of task:model to test _model_w_tasks = [(m, t) for m, t in itertools.product(*[_test_models, _test_tasks]) if m in _hf_task_to_models[t]] @@ -114,6 +188,11 @@ def enable_triton(request): return request.param +@pytest.fixture(params=[1, 2], ids=["ws1", "ws2"]) +def world_size(request): + return request.param + + """ Fixtures for running query """ @@ -227,6 +306,12 @@ def verify_injection(module): verify_injection(model) +# Used to Get Device name +def getDeviceId(local_rank): + device = torch.device(f"{get_accelerator().device_name(local_rank)}") + return device + + # Verify that test is valid def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): model, task = model_w_task @@ -246,17 +331,19 @@ def validate_test(model_w_task, dtype, enable_cuda_graph, enable_triton): msg = f"Not enough GPU memory to run {model} with dtype {dtype}" elif ("bloom" in model) and (dtype != torch.half): msg = f"Bloom models only support half precision, cannot use dtype {dtype}" - elif ("bert" not in model.lower()) and enable_cuda_graph: + elif (model not in _bert_models + _roberta_models) and enable_cuda_graph: msg = "Non bert/roberta models do no support CUDA Graph" elif enable_triton and not (dtype in [torch.half]): msg = "Triton is for fp16" elif enable_triton and not deepspeed.HAS_TRITON: msg = "triton needs to be installed for the test" - elif ("bert" not in model.lower()) and enable_triton: + elif (model not in _bert_models + _roberta_models) and enable_triton: msg = "Triton kernels do not support Non bert/roberta models yet" # These should be removed once we fix several inference tests failing - if model in ["EleutherAI/pythia-70m-deduped", "distilbert-base-cased-distilled-squad", "EleutherAI/gpt-j-6b"]: + if model in [ + "EleutherAI/pythia-70m-deduped", "distilbert/distilbert-base-cased-distilled-squad", "EleutherAI/gpt-j-6b" + ]: msg = "Test is currently broken" return msg @@ -280,6 +367,12 @@ def test( if invalid_test_msg: 
pytest.skip(invalid_test_msg) + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.") + + if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("This op has not been implemented on this system.", allow_module_level=True) + model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) @@ -388,7 +481,7 @@ def test( @pytest.mark.inference -@pytest.mark.parametrize("model_w_task", [("gpt2", "text-generation")], ids=["gpt2"]) +@pytest.mark.parametrize("model_w_task", [("openai-community/gpt2", "text-generation")], ids=["gpt2"]) class TestLowCpuMemUsage(DistributedTest): world_size = 1 @@ -405,8 +498,8 @@ def test( pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") local_rank = int(os.getenv("LOCAL_RANK", "0")) - - pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=local_rank, framework="pt") + device = getDeviceId(local_rank) + pipe = pipeline(task, model=model, model_kwargs={"low_cpu_mem_usage": True}, device=device, framework="pt") bs_output = pipe(query, **inf_kwargs) pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, @@ -419,46 +512,6 @@ def test( assert assert_fn(bs_output, ds_output) -@pytest.mark.seq_inference -@pytest.mark.parametrize("model_w_task", [("tiiuae/falcon-7b", "text-generation")], ids=["falcon"]) -class TestAutoTP(DistributedTest): - world_size = 1 - - def test( - self, - model_w_task, - query, - inf_kwargs, - assert_fn, - ): - # TODO: enable this test for H100 tests - pytest.skip("Not enough GPU memory for this on V100 runners") - model, task = model_w_task - dtype = torch.bfloat16 - local_rank = int(os.getenv("LOCAL_RANK", "0")) - - # We have to load these large models on CPU with pipeline because not - # enough GPU memory - tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True) - pipe = pipeline(task, - model=model, - tokenizer=tokenizer, - torch_dtype=dtype, - trust_remote_code=True, - device=torch.device("cpu"), - framework="pt") - #bs_output = pipe(query, **inf_kwargs) - - pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, replace_with_kernel_inject=False) - # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(get_accelerator().device_name(local_rank)) - ds_output = pipe(query, **inf_kwargs) - - #print(local_rank, "baseline", bs_output) - print(local_rank, "deepspeed", ds_output) - #assert assert_fn(bs_output, ds_output) - - @pytest.mark.seq_inference @pytest.mark.parametrize( "model_w_task, injection_policy", @@ -466,7 +519,7 @@ def test( (("google/t5-v1_1-small", "text2text-generation"), { T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo') }), - (("roberta-large", "fill-mask"), { + (("FacebookAI/roberta-large", "fill-mask"), { RobertaLayer: ('output.dense') }), ], ) @pytest.mark.parametrize("dtype", [torch.float], ids=["fp32"]) class TestInjectionPolicy(DistributedTest): - world_size = [1, 2] - def test( - self, - model_w_task, - injection_policy, - query, - inf_kwargs, - assert_fn, - dtype, - ): + def test(self, model_w_task, injection_policy, query, inf_kwargs, assert_fn, dtype, world_size): invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False) if invalid_test_msg: pytest.skip(invalid_test_msg) model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) - world_size =
int(os.getenv("WORLD_SIZE", "2")) - # We have to load these large models on CPU with pipeline because not - # enough GPU memory - pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + pipe = pipeline(task, + model=model, + device=torch.device(get_accelerator().device_name(local_rank)), + framework="pt") bs_output = pipe(query, **inf_kwargs) pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype, injection_policy=injection_policy) - # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(get_accelerator().device_name(local_rank)) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) @@ -512,6 +554,7 @@ def test( @pytest.mark.seq_inference +@pytest.mark.parametrize('keep_module_on_host', [True, False]) @pytest.mark.parametrize( "model_w_task", [("Helsinki-NLP/opus-mt-en-de", "translation"), ("Salesforce/codegen-350M-mono", "text-generation")], @@ -528,36 +571,43 @@ def test( inf_kwargs, assert_fn, dtype, + keep_module_on_host, ): invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False) if invalid_test_msg: pytest.skip(invalid_test_msg) - if dtype not in get_accelerator().supported_dtypes(): - pytest.skip(f"Acceleraor {get_accelerator().device_name()} does not support {dtype}.") - - # TODO: enable this test after torch 2.1 stable release - if dtype == torch.bfloat16 and model_w_task[0] == "Salesforce/codegen-350M-mono": - pytest.skip("Codegen model(bf16) need to use torch version > 2.0.") - model, task = model_w_task local_rank = int(os.getenv("LOCAL_RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "2")) - # We have to load these large models on CPU with pipeline because not - # enough GPU memory - pipe = pipeline(task, model=model, device=torch.device("cpu"), framework="pt") + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"Accelerator {get_accelerator().device_name()} does not support {dtype}.") + + if model == "Salesforce/codegen-350M-mono": + pytest.skip("Disable Codegen model due to slight result difference") + #TODO: re-enable this test once we have a fix for the slight result difference + + pipe = pipeline(task, + model=model, + device=torch.device(get_accelerator().device_name(local_rank)), + framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype) - # Switch device to GPU so that input tensors are not on CPU - pipe.device = torch.device(get_accelerator().device_name(local_rank)) + pipe.model = deepspeed.init_inference(pipe.model, + mp_size=world_size, + dtype=dtype, + keep_module_on_host=keep_module_on_host) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) print(local_rank, "deepspeed", ds_output) assert assert_fn(bs_output, ds_output) + if keep_module_on_host: + for name, param in model.named_parameters(): + assert param.device == torch.device('cpu'), f"keep_module_on_host is on but param {name} is not on cpu" + @pytest.mark.world_size(3) def test_odd_world_size( self, @@ -566,6 +616,7 @@ def test_odd_world_size( inf_kwargs, assert_fn, dtype, + keep_module_on_host, ): invalid_test_msg = validate_test(model_w_task, dtype, enable_cuda_graph=False, enable_triton=False) if invalid_test_msg: @@ -583,13 +634,20 @@ def test_odd_world_size( framework="pt") bs_output = pipe(query, **inf_kwargs) - pipe.model = deepspeed.init_inference(pipe.model, mp_size=world_size, dtype=dtype) + pipe.model =
deepspeed.init_inference(pipe.model, + mp_size=world_size, + dtype=dtype, + keep_module_on_host=keep_module_on_host) ds_output = pipe(query, **inf_kwargs) print(local_rank, "baseline", bs_output) print(local_rank, "deepspeed", ds_output) assert assert_fn(bs_output, ds_output) + if keep_module_on_host: + for name, param in model.named_parameters(): + assert param.device == torch.device('cpu'), f"keep_module_on_host is on but param {name} is not on cpu" + @pytest.mark.nightly @pytest.mark.parametrize( @@ -597,7 +655,7 @@ def test_odd_world_size( ( ["gpt2", "EleutherAI/gpt-neo-2.7B"], #["gpt2", "EleutherAI/gpt-j-6b"], # Causing OOM for this test - ["gpt2", "gpt2-xl"], + ["gpt2", "openai-community/gpt2-xl"], ), ) @pytest.mark.parametrize("task", ["lambada_standard"]) @@ -639,8 +697,15 @@ def no_pool_bootstrap_stderr(f, xs, iters): setattr(lm, model_family, getattr(lm, model_family).half().to(device)) lm._device = device else: - lm = lm_eval.models.get_model(model_family).create_from_arg_string( - f"pretrained={model_name}", {"device": get_accelerator().device_name()}) + if get_accelerator().device_name() == 'hpu': + #lm_eval not supporting HPU device, so get model with CPU and move it to HPU. + lm = lm_eval.models.get_model(model_family).create_from_arg_string(f"pretrained={model_name}", + {"device": "cpu"}) + setattr(lm, model_family, getattr(lm, model_family).to(device)) + lm._device = device + else: + lm = lm_eval.models.get_model(model_family).create_from_arg_string( + f"pretrained={model_name}", {"device": get_accelerator().device_name()}) get_accelerator().synchronize() start = time.time() diff --git a/tests/unit/inference/test_inference_config.py b/tests/unit/inference/test_inference_config.py index 375563abf65b..39d62d17372c 100644 --- a/tests/unit/inference/test_inference_config.py +++ b/tests/unit/inference/test_inference_config.py @@ -15,7 +15,7 @@ class TestInferenceConfig(DistributedTest): world_size = 1 def test_overlap_kwargs(self): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": torch.float32} kwargs = {"replace_with_kernel_inject": True} engine = deepspeed.init_inference(torch.nn.Module(), config=config, **kwargs) @@ -37,7 +37,7 @@ def test_kwargs_and_config(self): assert engine._config.dtype == kwargs["dtype"] def test_json_config(self, tmpdir): - config = {"replace_with_kernel_inject": True} + config = {"replace_with_kernel_inject": True, "dtype": "torch.float32"} config_json = create_config_from_dict(tmpdir, config) engine = deepspeed.init_inference(torch.nn.Module(), config=config_json) diff --git a/tests/unit/inference/test_model_profiling.py b/tests/unit/inference/test_model_profiling.py index 23e49f89025b..319055d0ea55 100644 --- a/tests/unit/inference/test_model_profiling.py +++ b/tests/unit/inference/test_model_profiling.py @@ -16,6 +16,9 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("This op had not been implemented on this system.", allow_module_level=True) +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) + @pytest.mark.inference @pytest.mark.parametrize("use_cuda_events", [True, False]) diff --git a/tests/unit/inference/test_stable_diffusion.py b/tests/unit/inference/test_stable_diffusion.py index ac39b7ab12fa..775a02c2e878 100644 --- a/tests/unit/inference/test_stable_diffusion.py +++ b/tests/unit/inference/test_stable_diffusion.py @@ -20,14 +20,14 @@ 
class TestStableDiffusion(DistributedTest): def test(self): from diffusers import DiffusionPipeline from image_similarity_measures.quality_metrics import rmse - generator = torch.Generator(device=get_accelerator().current_device()) + dev = get_accelerator().device_name() + generator = torch.Generator(device=dev) seed = 0xABEDABE7 generator.manual_seed(seed) prompt = "a dog on a rocket" model = "prompthero/midjourney-v4-diffusion" local_rank = int(os.getenv("LOCAL_RANK", "0")) - device = torch.device(f"cuda:{local_rank}") - + device = torch.device(f"{dev}:{local_rank}") pipe = DiffusionPipeline.from_pretrained(model, torch_dtype=torch.half) pipe = pipe.to(device) baseline_image = pipe(prompt, guidance_scale=7.5, generator=generator).images[0] diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py index 90fe26eb4490..5a99422ba9ff 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_kv_copy.py @@ -13,11 +13,11 @@ @pytest.mark.inference_v2_ops @pytest.mark.parametrize("n_tokens, history_size", [(1, 0), (17, 0), (33, 8), (63, 1)]) -def test_single_sequence_single_block(n_tokens: int, history_size: int): +@pytest.mark.parametrize("head_size", [64, 80, 96, 128]) +def test_single_sequence_single_block(n_tokens: int, history_size: int, head_size: int): """ Validate that the copy works correctly """ - head_size = 64 n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 @@ -46,11 +46,11 @@ def test_single_sequence_single_block(n_tokens: int, history_size: int): @pytest.mark.inference_v2_ops @pytest.mark.parametrize("n_tokens, history_size", [(128, 0), (177, 0), (169, 8), (117, 88)]) -def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int): +@pytest.mark.parametrize("head_size", [64, 80, 96, 128]) +def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, head_size: int): """ Validate that the copy works correctly """ - head_size = 64 n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 @@ -78,8 +78,8 @@ def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int): @pytest.mark.inference_v2_ops -def test_multi_sequence() -> None: - head_size = 64 +@pytest.mark.parametrize("head_size", [64, 80, 96, 128]) +def test_multi_sequence(head_size: int) -> None: n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py index 618c2d3b87ec..33dd0a4c2700 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_blocked_rotary_emb.py @@ -21,13 +21,19 @@ """ -def rotary_pos_embs(q: torch.Tensor, k: torch.Tensor, seq_descs: List[DSSequenceDescriptor], batch: RaggedBatchWrapper, - head_size: int): +def rotary_pos_embs(q: torch.Tensor, + k: torch.Tensor, + seq_descs: List[DSSequenceDescriptor], + batch: RaggedBatchWrapper, + head_size: int, + rotary_dim: int = -1) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + + rotary_dim = rotary_dim if rotary_dim >= 0 else head_size def make_cos_sin_emb(seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]: t = torch.arange(seq_len, dtype=torch.float32, device=get_accelerator().current_device()) inv_freq = (1.0 / (10000.0**(torch.arange( - 0, head_size, 2, dtype=torch.float32, device=get_accelerator().current_device()) / 
head_size))).half() + 0, rotary_dim, 2, dtype=torch.float32, device=get_accelerator().current_device()) / rotary_dim))).half() freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -57,11 +63,17 @@ def rotate_half(x: torch.Tensor) -> torch.Tensor: k_src = k[start_idx:start_idx + n_tokens].reshape(n_tokens, n_heads_kv, head_size).float() freq_start_offset = seq_desc.seen_tokens + q_src_rot = q_src[:, :, :rotary_dim] + k_src_rot = k_src[:, :, :rotary_dim] + cos_chunk = cos[range(freq_start_offset, freq_start_offset + n_tokens)] sin_chunk = sin[range(freq_start_offset, freq_start_offset + n_tokens)] - q_emb = q_src * cos_chunk + rotate_half(q_src) * sin_chunk - k_emb = k_src * cos_chunk + rotate_half(k_src) * sin_chunk + q_rot = q_src_rot * cos_chunk + rotate_half(q_src_rot) * sin_chunk + k_rot = k_src_rot * cos_chunk + rotate_half(k_src_rot) * sin_chunk + + q_emb = torch.cat((q_rot, q_src[:, :, rotary_dim:]), dim=-1) + k_emb = torch.cat((k_rot, k_src[:, :, rotary_dim:]), dim=-1) q_out[start_idx:start_idx + n_tokens] = q_emb.reshape(n_tokens, n_heads_q * head_size).to(q_out.dtype) k_out[start_idx:start_idx + n_tokens] = k_emb.reshape(n_tokens, n_heads_kv * head_size).to(k_out.dtype) @@ -72,11 +84,11 @@ def rotate_half(x: torch.Tensor) -> torch.Tensor: @pytest.mark.inference_v2_ops @pytest.mark.parametrize("n_tokens, history_size", [(1, 0), (17, 0), (33, 15), (1, 63)]) @pytest.mark.parametrize("trained_emb", [False, True]) -def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_emb: bool): +@pytest.mark.parametrize("head_size", [64, 80, 96]) +def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_emb: bool, head_size: int): """ Validate that the copy works correctly """ - head_size = 64 n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 @@ -106,7 +118,7 @@ def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_ copy_impl = BlockedTrainedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) copy_impl(kv_cache, qkv, batch, freqs) else: - copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) + copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16, head_size, 10000.0) copy_impl(kv_cache, qkv, batch) assert allclose(qkv[:, :head_size * n_heads_q], q_ref) @@ -116,11 +128,11 @@ def test_single_sequence_single_block(n_tokens: int, history_size: int, trained_ @pytest.mark.inference_v2_ops @pytest.mark.parametrize("n_tokens, history_size", [(128, 0), (177, 0), (169, 8), (117, 88)]) @pytest.mark.parametrize("trained_emb", [False, True]) -def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, trained_emb: bool): +@pytest.mark.parametrize("head_size", [64, 80, 96]) +def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, trained_emb: bool, head_size: int): """ Validate that the copy works correctly """ - head_size = 64 n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 @@ -150,7 +162,7 @@ def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, train copy_impl = BlockedTrainedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) copy_impl(kv_cache, qkv, batch, freqs) else: - copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) + copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16, head_size, 10000.0) copy_impl(kv_cache, qkv, batch) assert allclose(qkv[:, :head_size * n_heads_q], q_ref) @@ -159,8 
+171,8 @@ def test_single_sequence_multiple_blocks(n_tokens: int, history_size: int, train @pytest.mark.inference_v2_ops @pytest.mark.parametrize("trained_emb", [False, True]) -def test_multi_sequences(trained_emb: bool) -> None: - head_size = 64 +@pytest.mark.parametrize("head_size", [64, 80, 96]) +def test_multi_sequences(trained_emb: bool, head_size: int) -> None: n_heads_q = 16 n_heads_kv = 16 kv_block_size = 64 @@ -196,8 +208,51 @@ def test_multi_sequences(trained_emb: bool) -> None: copy_impl = BlockedTrainedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) copy_impl(kv_cache, qkv, batch, freqs) else: - copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16) + copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16, head_size, 10000.0) copy_impl(kv_cache, qkv, batch) assert allclose(qkv[:, :head_size * n_heads_q], q_ref) validate_kv_cache(kv_cache, k, v, seq_descs, batch, exact=False) + + +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize("head_size", [80, 96]) +def test_rotary_dim(head_size: int) -> None: + trained_emb = False + rotary_dim = 64 + n_heads_q = 16 + n_heads_kv = 16 + kv_block_size = 64 + device = get_accelerator().current_device() + + batch_config = [ + (128, 0), + (177, 0), + (169, 8), + (117, 88), + (1, 293), + (1, 733), + (1, 33), + ] + + batch, state_manager, seq_descs = build_batch_and_manager(batch_config, head_size, n_heads_kv, kv_block_size) + + qkv = torch.randn((batch.current_tokens, (n_heads_q + 2 * n_heads_kv) * head_size), + device=device, + dtype=torch.float16) + qkv_ref = qkv.clone() + + q = qkv_ref[:, :head_size * n_heads_q] + k = qkv_ref[:, head_size * n_heads_q:head_size * (n_heads_q + n_heads_kv)] + v = qkv_ref[:, head_size * (n_heads_q + n_heads_kv):] + + q_ref, k, freqs = rotary_pos_embs(q, k, seq_descs, batch, head_size, rotary_dim=rotary_dim) + freqs = freqs.half() + + kv_cache = state_manager.get_cache(0) + + copy_impl = BlockedRotaryEmbeddings(head_size, n_heads_q, n_heads_kv, torch.float16, rotary_dim, 10000.0) + copy_impl(kv_cache, qkv, batch) + + assert allclose(qkv[:, :head_size * n_heads_q], q_ref) + validate_kv_cache(kv_cache, k, v, seq_descs, batch, exact=False) diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py index 5fa375b49c19..3907fc3e3a4b 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_gather.py @@ -11,18 +11,28 @@ from deepspeed.inference.v2.kernels.ragged_ops import ( MoEGather, MoEScatter, - RaggedTop1Gating, + RaggedTopKGating, ) from .ragged_testing_utils import build_simple_batch """ -For simplicity's sake, these tests do rely on ``RaggedTop1Gating`` and +For simplicity's sake, these tests do rely on ``RaggedTopKGating`` and ``MoEScatter`` to produce correct inputs. If either of these kernels is broken these tests will fail, so double check the unit test results there before debugging here. 
""" +TEST_CASES = [ + # (n_tokens, n_experts, n_top_k) + (13, 64, 1), + (278, 64, 1), + (1977, 64, 1), + (13, 8, 2), + (278, 8, 2), + (1977, 8, 2), +] -def build_inputs(n_tokens, n_experts, do_padding): + +def build_inputs(n_tokens: int, n_experts: int, n_top_k: int, do_padding: bool): assert n_tokens <= 2048, "This test will break if n_tokens > 2048" @@ -39,22 +49,28 @@ def build_inputs(n_tokens, n_experts, do_padding): device=get_accelerator().current_device()).repeat_interleave(4096, dim=0).reshape( batch.tensor_toks, 4096).contiguous() - gate = RaggedTop1Gating(DtypeEnum.fp16) + gate = RaggedTopKGating(DtypeEnum.fp16) # Gating outputs expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - scores = torch.empty((batch.tensor_toks, ), dtype=torch.float32, device=get_accelerator().current_device()) - expert_assignment = torch.empty((batch.tensor_toks, ), + scores = torch.empty((batch.tensor_toks, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((batch.tensor_toks, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) - expert_offset = torch.empty((batch.tensor_toks, ), dtype=torch.int32, device=get_accelerator().current_device()) + expert_offset = torch.empty((batch.tensor_toks, n_top_k), + dtype=torch.int32, + device=get_accelerator().current_device()) gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) # Scatter outputs - moe_input = torch.empty((batch.tensor_toks, 4096), dtype=torch.float16, device=get_accelerator().current_device()) + moe_input = torch.empty((batch.tensor_toks * n_top_k, 4096), + dtype=torch.float16, + device=get_accelerator().current_device()) expert_cumsum = torch.empty((n_experts, ), dtype=torch.int64, device=get_accelerator().current_device()) - mapped_slots = torch.empty((batch.tensor_toks, ), dtype=torch.int32, device=get_accelerator().current_device()) + mapped_slots = torch.empty((batch.tensor_toks, n_top_k), + dtype=torch.int32, + device=get_accelerator().current_device()) scatter = MoEScatter(DtypeEnum.fp16, 4096) scatter(moe_input, expert_cumsum, mapped_slots, hidden_states, expert_counts, expert_assignment, expert_offset) @@ -63,11 +79,12 @@ def build_inputs(n_tokens, n_experts, do_padding): @pytest.mark.inference_v2_ops -@pytest.mark.parametrize("n_tokens, n_experts", [(13, 64), (278, 64), (1977, 64)]) -@pytest.mark.parametrize("do_padding", [True, False]) -def test_moe_gather(n_tokens, n_experts, do_padding): +@pytest.mark.parametrize("n_tokens, n_experts, n_top_k", TEST_CASES) +@pytest.mark.parametrize("do_padding", [False]) +def test_moe_gather(n_tokens: int, n_experts: int, n_top_k: int, do_padding: bool): + get_accelerator().manual_seed(0xC0FFEE) - batch, moe_input, scores, mapped_slots, expert_counts = build_inputs(n_tokens, n_experts, do_padding) + batch, moe_input, scores, mapped_slots, expert_counts = build_inputs(n_tokens, n_experts, n_top_k, do_padding) output = torch.randn((batch.tensor_toks, 4096), dtype=torch.float16, device=get_accelerator().current_device()) @@ -75,9 +92,31 @@ def test_moe_gather(n_tokens, n_experts, do_padding): gather(output, moe_input, scores, mapped_slots, expert_counts) for token_idx in range(n_tokens): + effective_score = scores[token_idx].sum().item() assert torch.equal( output[token_idx], torch.full((4096, ), - token_idx * scores[token_idx], + token_idx * effective_score, dtype=torch.float16, device=get_accelerator().current_device())) + + 
+@pytest.mark.inference_v2_ops +def test_moe_gather_normalize_scales(): + get_accelerator().manual_seed(0xC0FFEE) + + n_tokens = 72 + n_experts = 8 + n_top_k = 2 + do_padding = False + + batch, moe_input, scores, mapped_slots, expert_counts = build_inputs(n_tokens, n_experts, n_top_k, do_padding) + output = torch.randn((batch.tensor_toks, 4096), dtype=torch.float16, device=get_accelerator().current_device()) + + gather = MoEGather(DtypeEnum.fp16, 4096, normalize_scores=True) + gather(output, moe_input, scores, mapped_slots, expert_counts) + + for token_idx in range(n_tokens): + assert torch.equal( + output[token_idx], + torch.full((4096, ), token_idx, dtype=torch.float16, device=get_accelerator().current_device())) diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py index 4ca051410c1c..aae459f06a6f 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_moe_scatter.py @@ -8,19 +8,28 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_utils import DtypeEnum -from deepspeed.inference.v2.kernels.ragged_ops import MoEScatter, RaggedTop1Gating +from deepspeed.inference.v2.kernels.ragged_ops import MoEScatter, RaggedTopKGating from .ragged_testing_utils import build_simple_batch """ -For simplicity's sake, these tests do rely on ``RaggedTop1Gating`` to produce correct -inputs. If ``RaggedTop1Gating`` is broken, these tests will fail, so double check +For simplicity's sake, these tests do rely on ``RaggedTopKGating`` to produce correct +inputs. If ``RaggedTopKGating`` is broken, these tests will fail, so double check the unit test results there before debugging here. 
""" +TEST_CONFIGS = [ + (13, 64, 1), + (278, 64, 1), + (1977, 64, 1), + (13, 8, 2), + (278, 8, 2), + (1977, 8, 2), +] + @pytest.mark.inference_v2_ops -@pytest.mark.parametrize("n_tokens, n_experts", [(13, 64), (278, 64), (1977, 64)]) -@pytest.mark.parametrize("do_padding", [True, False]) -def test_moe_scatter(n_tokens, n_experts, do_padding): +@pytest.mark.parametrize("n_tokens, n_experts, n_top_k", TEST_CONFIGS) +@pytest.mark.parametrize("do_padding", [False, True]) +def test_moe_scatter(n_tokens, n_experts, n_top_k, do_padding): # Sequence composition shouldn't matter here batch = build_simple_batch([n_tokens], padding=do_padding) @@ -35,40 +44,52 @@ def test_moe_scatter(n_tokens, n_experts, do_padding): device=get_accelerator().current_device()).repeat_interleave(4096, dim=0).reshape( batch.tensor_toks, 4096).contiguous() - gate = RaggedTop1Gating(DtypeEnum.fp16) + gate = RaggedTopKGating(DtypeEnum.fp16) # Gating outputs expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - scores = torch.empty((batch.tensor_toks, ), dtype=torch.float32, device=get_accelerator().current_device()) - expert_assignment = torch.empty((batch.tensor_toks, ), + scores = torch.empty((batch.tensor_toks, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((batch.tensor_toks, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) - expert_offset = torch.empty((batch.tensor_toks, ), dtype=torch.int32, device=get_accelerator().current_device()) + expert_offset = torch.empty((batch.tensor_toks, n_top_k), + dtype=torch.int32, + device=get_accelerator().current_device()) gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) # Scatter outputs - moe_input = torch.empty((batch.tensor_toks, 4096), dtype=torch.float16, device=get_accelerator().current_device()) + moe_input = torch.empty((batch.tensor_toks * n_top_k, 4096), + dtype=torch.float16, + device=get_accelerator().current_device()) expert_cumsum = torch.empty((n_experts, ), dtype=torch.int64, device=get_accelerator().current_device()) - mapped_slots = torch.empty((batch.tensor_toks, ), dtype=torch.int32, device=get_accelerator().current_device()) + mapped_slots = torch.empty((batch.tensor_toks, n_top_k), + dtype=torch.int32, + device=get_accelerator().current_device()) scatter = MoEScatter(DtypeEnum.fp16, 4096) scatter(moe_input, expert_cumsum, mapped_slots, hidden_states, expert_counts, expert_assignment, expert_offset) + get_accelerator().synchronize() assert torch.equal(expert_cumsum, torch.cumsum(expert_counts, dim=0).to(torch.int64)) + if not do_padding: + assert torch.unique(mapped_slots).size(0) == n_top_k * n_tokens + for token_idx in range(batch.tensor_toks): if token_idx < n_tokens: - expert_idx = expert_assignment[token_idx].item() - if expert_idx == 0: - expert_cumsum_val = 0 - else: - expert_cumsum_val = expert_cumsum[expert_idx - 1] - offset = expert_offset[token_idx] - total_offset = offset + expert_cumsum_val - - assert total_offset == mapped_slots[token_idx].item() - assert torch.equal(moe_input[total_offset], hidden_states[token_idx]) + for k in range(n_top_k): + expert_idx = expert_assignment[token_idx][k].item() + if expert_idx == 0: + expert_cumsum_val = 0 + else: + expert_cumsum_val = expert_cumsum[expert_idx - 1] + offset = expert_offset[token_idx][k] + total_offset = offset + expert_cumsum_val + + assert total_offset == mapped_slots[token_idx][k].item() + assert 
torch.equal(moe_input[total_offset], hidden_states[token_idx]) else: - assert mapped_slots[token_idx].item() == -1 + for k in range(n_top_k): + assert mapped_slots[token_idx][k].item() == -1 - assert expert_cumsum[-1] == n_tokens + assert expert_cumsum[-1] == n_tokens * n_top_k diff --git a/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py b/tests/unit/inference/v2/kernels/ragged_ops/test_top_k_gating.py similarity index 51% rename from tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py rename to tests/unit/inference/v2/kernels/ragged_ops/test_top_k_gating.py index 6ff2508bf320..5fa0c8a079f0 100644 --- a/tests/unit/inference/v2/kernels/ragged_ops/test_top_1_gating.py +++ b/tests/unit/inference/v2/kernels/ragged_ops/test_top_k_gating.py @@ -9,9 +9,52 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.inference_utils import DtypeEnum -from deepspeed.inference.v2.kernels.ragged_ops import RaggedTop1Gating +from deepspeed.inference.v2.kernels.ragged_ops import RaggedTopKGating from .ragged_testing_utils import build_simple_batch -from ....v2.inference_test_utils import allclose +from ...inference_test_utils import allclose + + +def _top_k_gating_testing_helper(n_tokens: int, n_experts: int, n_top_k: int, seed: int = 0xC0FFEE) -> None: + + torch.manual_seed(seed) + logits = torch.randn((n_tokens, n_experts), dtype=torch.float16, device=get_accelerator().current_device()) + batch = build_simple_batch([n_tokens], padding=False) + gate = RaggedTopKGating(DtypeEnum.fp16) + + expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) + scores = torch.empty((n_tokens, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) + expert_offset = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) + + gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) + + ref_weights = F.softmax(logits, dim=-1, dtype=torch.float32) + ref_scores, ref_indices = torch.topk(ref_weights, n_top_k, dim=-1) + + assert allclose(scores, ref_scores), f"expected {ref_scores}, got {scores}" + assert torch.equal(expert_assignment, + ref_indices.to(torch.int32)), f"expected {ref_indices}, got {expert_assignment}" + assert expert_counts.sum( + ) == n_tokens * n_top_k, f"expected {n_tokens * n_top_k} tokens, got {expert_counts.sum()}" + + # Ensure that the expert offsets are unique + for i in range(n_experts): + expert_idxs = torch.where(expert_assignment == i, expert_offset, 0) + if expert_counts[i] > 0: + assert expert_idxs.unique().shape[0] == expert_counts[ + i], f"expected {expert_counts[i]} unique offsets, got {expert_idxs.unique().shape[0]}" + assert expert_idxs.max( + ) == expert_counts[i] - 1, f"expected max offset {expert_counts[i] - 1}, got {expert_idxs.max()}" + else: + # Should have all 0's so one unique value + assert expert_idxs.unique().shape[0] == 1 + assert expert_idxs.max() == 0 + + +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize('n_tokens', [1, 17, 32, 89, 433]) +def test_top_2_e_8_gating(n_tokens: int) -> None: + _top_k_gating_testing_helper(n_tokens=n_tokens, n_experts=8, n_top_k=2) def _test_single_mapping_helper(n_tokens: int, @@ -19,6 +62,8 @@ def _test_single_mapping_helper(n_tokens: int, assigned_expert: int, logit_fill: float = 0.0, match_fill: float = 1.0) -> None: + + n_top_k = 1 logits = 
torch.full((n_tokens, n_experts), logit_fill, dtype=torch.float16, @@ -26,12 +71,12 @@ def _test_single_mapping_helper(n_tokens: int, logits[:, assigned_expert] = match_fill - gate = RaggedTop1Gating(DtypeEnum.fp16) + gate = RaggedTopKGating(DtypeEnum.fp16) expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - scores = torch.empty((n_tokens, ), dtype=torch.float32, device=get_accelerator().current_device()) - expert_assignment = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) - expert_offset = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) + scores = torch.empty((n_tokens, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) + expert_offset = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) batch = build_simple_batch([n_tokens], padding=False) gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) @@ -39,7 +84,7 @@ def _test_single_mapping_helper(n_tokens: int, assert expert_counts[assigned_expert] == n_tokens assert torch.all(expert_assignment == assigned_expert) assert torch.unique(expert_offset).shape[0] == n_tokens - assert allclose(scores, F.softmax(logits.float(), dim=1)[:, assigned_expert]) + assert allclose(scores, F.softmax(logits.float(), dim=1)[:, assigned_expert].reshape(-1, n_top_k)) @pytest.mark.inference_v2_ops @@ -72,6 +117,7 @@ def test_determinism(): n_tokens = 512 n_experts = 64 + n_top_k = 1 logits = torch.zeros((n_tokens, n_experts), dtype=torch.float16, device=get_accelerator().current_device()) batch = build_simple_batch([n_tokens], padding=False) @@ -79,13 +125,15 @@ def test_determinism(): logits[:, 19] = 1.0 logits[:, 26] = 1.0 - gate = RaggedTop1Gating(DtypeEnum.fp16) + gate = RaggedTopKGating(DtypeEnum.fp16) for _ in range(1024): expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - scores = torch.empty((n_tokens, ), dtype=torch.float32, device=get_accelerator().current_device()) - expert_assignment = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) - expert_offset = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) + scores = torch.empty((n_tokens, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((n_tokens, n_top_k), + dtype=torch.int32, + device=get_accelerator().current_device()) + expert_offset = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) batch = build_simple_batch([n_tokens], padding=False) gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) @@ -94,7 +142,7 @@ def test_determinism(): assert expert_counts[26] == 0 assert torch.all(expert_assignment == 19) assert torch.unique(expert_offset).shape[0] == n_tokens - assert allclose(scores, F.softmax(logits.float(), dim=1)[:, 19]) + assert allclose(scores, F.softmax(logits.float(), dim=1)[:, 19].reshape(-1, 1)) @pytest.mark.inference_v2_ops @@ -105,16 +153,19 @@ def test_score_accuracy(n_tokens: int, n_experts: int) -> None: """ logits = torch.randn((n_tokens, n_experts), dtype=torch.float16, device=get_accelerator().current_device()) batch = build_simple_batch([n_tokens], padding=False) + n_top_k = 1 - gate = 
RaggedTop1Gating(DtypeEnum.fp16) + gate = RaggedTopKGating(DtypeEnum.fp16) expert_counts = torch.zeros((n_experts, ), dtype=torch.int32, device=get_accelerator().current_device()) - scores = torch.empty((n_tokens, ), dtype=torch.float32, device=get_accelerator().current_device()) - expert_assignment = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) - expert_offset = torch.empty((n_tokens, ), dtype=torch.int32, device=get_accelerator().current_device()) + scores = torch.empty((n_tokens, n_top_k), dtype=torch.float32, device=get_accelerator().current_device()) + expert_assignment = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) + expert_offset = torch.empty((n_tokens, n_top_k), dtype=torch.int32, device=get_accelerator().current_device()) ref_scores = F.softmax(logits.float(), dim=1).max(dim=1).values + ref_scores = ref_scores.reshape(-1, 1) gate(expert_counts, scores, expert_assignment, expert_offset, logits, batch) + assert allclose(scores, ref_scores) assert expert_counts.sum() == n_tokens diff --git a/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py b/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py index 260236562ee9..06ff9047d648 100644 --- a/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py +++ b/tests/unit/inference/v2/model_implementations/parameters/test_parameter_list.py @@ -26,7 +26,7 @@ def __init__(self, experts_per_rank: int) -> None: self._num_experts = experts_per_rank @property - def num_experts(self) -> int: + def n_experts(self) -> int: return self._num_experts @on_device diff --git a/tests/unit/inference/v2/modules/test_blocked_attn.py b/tests/unit/inference/v2/modules/test_blocked_attn.py index 215ad64636b1..6556aa460a44 100644 --- a/tests/unit/inference/v2/modules/test_blocked_attn.py +++ b/tests/unit/inference/v2/modules/test_blocked_attn.py @@ -12,7 +12,7 @@ from deepspeed.accelerator import get_accelerator from deepspeed.inference.v2.modules import ConfigBundle -from deepspeed.inference.v2.modules.configs import DSSelfAttentionConfig, PositionalEmbeddingType +from deepspeed.inference.v2.modules.configs import DSSelfAttentionConfig, PositionalEmbeddingType, RotateHalfConfig from deepspeed.inference.v2.modules.interfaces import DSSelfAttentionRegistry, DSSelfAttentionBase from ..kernels.ragged_ops.ragged_testing_utils import build_batch_and_manager @@ -37,13 +37,10 @@ def _blocked_flash_testing_helper(head_size: int, """ if trained_freqs is None: embed_type = PositionalEmbeddingType.none - embed_args = {} + embed_args = None else: embed_type = PositionalEmbeddingType.rotate_half - if trained_freqs: - embed_args = {'trained_freqs': True} - else: - embed_args = {'trained_freqs': False} + embed_args = RotateHalfConfig(use_trained_freqs=trained_freqs) attn_config = DSSelfAttentionConfig(max_tokens=2048, n_heads_q=n_heads_q, @@ -51,7 +48,7 @@ def _blocked_flash_testing_helper(head_size: int, head_size=head_size, max_sequences=32, positional_embedding_type=embed_type, - positional_embedding_args=embed_args) + positional_embedding_config=embed_args) config = ConfigBundle(name='dense_blocked_attention', config=attn_config) attn_module: DSSelfAttentionBase = DSSelfAttentionRegistry.instantiate_config(config) diff --git a/tests/unit/inference/v2/modules/test_cutlass_moe.py b/tests/unit/inference/v2/modules/test_cutlass_moe.py index e21170c9ed8f..b14ba127c6be 100644 --- 
a/tests/unit/inference/v2/modules/test_cutlass_moe.py +++ b/tests/unit/inference/v2/modules/test_cutlass_moe.py @@ -212,3 +212,117 @@ def test_in_out_channels(in_channels: int, out_channels: int) -> None: dtype=DtypeEnum.fp16, activation_type=ActivationType.IDENTITY, use_bias=True) + + +def _mixtral_moe_baseline(hidden_states: torch.Tensor, + gate_weight: torch.Tensor, + mlp_w1: torch.Tensor, + mlp_w2: torch.Tensor, + mlp_w3: torch.Tensor, + force_float: bool = False) -> torch.Tensor: + """ + Baseline implementation for mixtral MoE module. + + Based on transformers implementation: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/modeling_mixtral.py + """ + output_dtype = hidden_states.dtype + if force_float: + hidden_states = hidden_states.float() + gate_weight = gate_weight.float() + mlp_w1 = mlp_w1.float() + mlp_w2 = mlp_w2.float() + mlp_w3 = mlp_w3.float() + + router_logits = torch.nn.functional.linear(hidden_states, gate_weight) + routing_weights = torch.nn.functional.softmax(router_logits, dim=-1, dtype=torch.float) + routing_weights, selected_experts = routing_weights.topk(k=2, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + + # NOTE(cmikeh2): This is a difference implementation, ours will preserve the original scale + # as float32 and perform in-kernel fused FP16->FP32->FP16 conversion. + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros_like(hidden_states) + + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=gate_weight.shape[0]).permute(2, 1, 0) + get_accelerator().synchronize() + + for expert_idx in range(gate_weight.shape[0]): + exp_mlp_w1 = mlp_w1[expert_idx] + exp_mlp_w2 = mlp_w2[expert_idx] + exp_mlp_w3 = mlp_w3[expert_idx] + + idx, top_x = torch.where(expert_mask[expert_idx]) + + if top_x.shape[0] == 0: + continue + + top_x_list = top_x.tolist() + idx_list = idx.tolist() + + current_state = hidden_states[top_x_list] + + linear = torch.nn.functional.linear + intermediate = torch.nn.functional.silu(linear(current_state, exp_mlp_w1)) * linear(current_state, exp_mlp_w3) + output = linear(intermediate, exp_mlp_w2) * routing_weights[top_x_list, idx_list].unsqueeze(-1) + final_hidden_states.index_add_(0, top_x, output.to(final_hidden_states.dtype)) + + return final_hidden_states.to(output_dtype) + + +@pytest.mark.inference_v2_ops +def test_mixtral_moe_config(): + + experts = 8 + n_top_k = 2 + in_channels = 4096 + intermediate_dim = 2048 + dtype = DtypeEnum.bf16 + + # Parameters + gate_weight = torch.randn( + (experts, in_channels), dtype=dtype.value, device=get_accelerator().current_device()) * .1 + + mlp_w1 = torch.randn( + (experts, intermediate_dim, in_channels), dtype=dtype.value, device=get_accelerator().current_device()) * .1 + mlp_w3 = torch.randn( + (experts, intermediate_dim, in_channels), dtype=dtype.value, device=get_accelerator().current_device()) * .1 + mlp_w2 = torch.randn( + (experts, in_channels, intermediate_dim), dtype=dtype.value, device=get_accelerator().current_device()) * .1 + + n_tokens = 256 + hidden_states = torch.randn( + (n_tokens, in_channels), dtype=dtype.value, device=get_accelerator().current_device()) * .1 + + baseline = _mixtral_moe_baseline(hidden_states, gate_weight, mlp_w1, mlp_w2, mlp_w3) + + mlp_w13_fused = torch.cat([mlp_w1, mlp_w3], dim=-1).reshape(experts, 2 * intermediate_dim, in_channels) + + config = DSMoEConfig(max_tokens=4096, + model_dim=in_channels, + intermediate_features=intermediate_dim, + 
n_experts=experts, + activation=ActivationType.SiGLU, + input_dtype=dtype, + output_dtype=dtype, + top_k=n_top_k, + normalize_scores=True) + + implementation_config = {"weight_dtype": DtypeEnum(dtype)} + + bundle = ConfigBundle(name='cutlass_multi_gemm_moe', config=config, implementation_config=implementation_config) + moe_module = DSMoERegistry.instantiate_config(bundle) + + batch = build_simple_batch([n_tokens]) + + gate_ds = moe_module.transform_gate_param(gate_weight) + mlp_w1_ds = moe_module.transform_moe_mlp_1_param(mlp_w13_fused) + mlp_w2_ds = moe_module.transform_moe_mlp_2_param(mlp_w2) + + output = moe_module(hidden_states, batch, gate_ds, mlp_w1_ds, mlp_w2_ds) + + # NOTE(cmikeh2): These are higher than the other tests for reasons that aren't quite + # clear to me. My best guess is that the SiGLU activation is causing larger numerical + # divergence. The thresholds chosen here is based on the observed error between the + # float and bfloat16 reference implementations. + assert allclose(output, baseline.to(dtype.value), tolerances=(5e-2, 5e-2)) diff --git a/tests/unit/inference/v2/modules/test_quantized_linear_module.py b/tests/unit/inference/v2/modules/test_quantized_linear_module.py new file mode 100644 index 000000000000..050f21c3bf3a --- /dev/null +++ b/tests/unit/inference/v2/modules/test_quantized_linear_module.py @@ -0,0 +1,183 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from typing import Optional + +import pytest +import torch + +from deepspeed.accelerator import get_accelerator +from deepspeed.inference.v2.inference_utils import ActivationType, DtypeEnum, is_gated +from deepspeed.inference.v2.modules import ConfigBundle +from deepspeed.inference.v2.modules.configs import DSLinearConfig +from deepspeed.inference.v2.modules.interfaces import DSLinearRegistry +from ...v2.inference_test_utils import allclose + + +def reference_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], + act_type: ActivationType) -> torch.Tensor: + dtype = hidden_states.dtype + out_states = torch.nn.functional.linear(hidden_states, weight, bias) + out_states.float() + + if is_gated(act_type): + act_func_map = { + ActivationType.ReGLU: torch.nn.functional.relu, + ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"), + ActivationType.SiGLU: torch.nn.functional.silu, + } + + act_act = out_states[..., ::2] + act_linear = out_states[..., 1::2] + + act_act = act_func_map[act_type](act_act) + out_states = act_act * act_linear + else: + act_func_map = { + ActivationType.RELU: torch.nn.functional.relu, + ActivationType.GELU: torch.nn.functional.gelu, + ActivationType.SILU: torch.nn.functional.silu, + ActivationType.IDENTITY: lambda x: x, + } + + out_states = act_func_map[act_type](out_states) + return out_states.to(dtype) + + +def _fp6_quant_dequant_weights(weight: torch.Tensor) -> torch.Tensor: + from deepspeed.inference.v2.modules.implementations.linear.quantized_linear import fp_quantize + weight_quantized_fake_fp6, scales = fp_quantize(weight, num_bits=6, exp_bits=3) + return weight_quantized_fake_fp6 * scales + + +def quant_dequant_implementation(hidden_states: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor], + act_type: ActivationType) -> torch.Tensor: + dtype = hidden_states.dtype + weight_dequantized = _fp6_quant_dequant_weights(weight) + out_states = torch.nn.functional.linear(hidden_states, weight_dequantized, bias) + out_states.float() + + if 
is_gated(act_type): + act_func_map = { + ActivationType.ReGLU: torch.nn.functional.relu, + ActivationType.GEGLU: lambda x: torch.nn.functional.gelu(x, approximate="tanh"), + ActivationType.SiGLU: torch.nn.functional.silu, + } + + act_act = out_states[..., ::2] + act_linear = out_states[..., 1::2] + + act_act = act_func_map[act_type](act_act) + out_states = act_act * act_linear + else: + act_func_map = { + ActivationType.RELU: torch.nn.functional.relu, + ActivationType.GELU: torch.nn.functional.gelu, + ActivationType.SILU: torch.nn.functional.silu, + ActivationType.IDENTITY: lambda x: x, + } + + out_states = act_func_map[act_type](out_states) + return out_states.to(dtype) + + +def _fp6_quantized_linear_helper(tokens: int, + in_channels: int, + out_channels: int, + dtype: DtypeEnum, + act_fn: ActivationType, + use_bias: bool = True, + expect_failure: bool = False) -> None: + # The current FP6 kernel only supports NVIDIA Ampere GPUs. + if not 'cuda' in get_accelerator().current_device_name(): + return + major, _ = torch.cuda.get_device_capability() #ignore-cuda + if major != 8: + return + + # Input vals + hidden_states = torch.randn( + (tokens, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + + weight_out_channels = 2 * \ + out_channels if is_gated(act_fn) else out_channels + weight = torch.randn( + (weight_out_channels, in_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + if use_bias: + bias = torch.randn( + (weight_out_channels), dtype=dtype.value, device=get_accelerator().current_device_name()) * .01 + else: + bias = None + + # quantize and dequantize output + ref_quant_dequant_output = quant_dequant_implementation(hidden_states, weight, bias, act_fn) + + linear_config = DSLinearConfig(max_tokens=2048, + in_channels=in_channels, + out_channels=out_channels, + activation=act_fn, + input_dtype=dtype, + output_dtype=dtype) + bundle = ConfigBundle(name='quantized_wf6af16_linear', config=linear_config) + fp6_linear_module = DSLinearRegistry.instantiate_config(bundle) + weight_fp6 = fp6_linear_module.transform_param(weight.clone().cpu()).to(get_accelerator().current_device_name()) + + if expect_failure: + with pytest.raises(ValueError) as excinfo: + ds_output = fp6_linear_module(hidden_states, weight_fp6, bias) + assert "The out and in channel should be multiple of 256 and 64 respectively." in str(excinfo.value) + else: + ds_output = fp6_linear_module(hidden_states, weight_fp6, bias) + # The current FP6 kernel uses FP16 Tensor Core. + tolerances = (3e-2, 2e-3) # tolerances for fp16 + + # Check DeepSpeed implementation + assert allclose(ds_output, ref_quant_dequant_output, tolerances=tolerances) + + +all_acts = [ + ActivationType.RELU, + ActivationType.GELU, + ActivationType.SILU, + ActivationType.GEGLU, + ActivationType.ReGLU, + ActivationType.SiGLU, +] +all_tokens = [37] +all_in_out_channels = [ + (4096, 4096), +] + + +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize("tokens", all_tokens) +@pytest.mark.parametrize("in_channels, out_channels", all_in_out_channels) +@pytest.mark.parametrize("act_fn", all_acts) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_fp6_quantized_linear_act_fn(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType, + use_bias: bool) -> None: + _fp6_quantized_linear_helper(tokens=tokens, + in_channels=in_channels, + out_channels=out_channels, + dtype=DtypeEnum.fp16, + act_fn=act_fn, + use_bias=use_bias) + + +# Other shapes, not supported by FP6 kernels. 
Will raise ValueError. +@pytest.mark.inference_v2_ops +@pytest.mark.parametrize("tokens", all_tokens) +@pytest.mark.parametrize("in_channels, out_channels", [(4608, 1728)]) +@pytest.mark.parametrize("act_fn", all_acts) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_fp6_quantized_linear_act_fn_fail(tokens: int, in_channels: int, out_channels: int, act_fn: ActivationType, + use_bias: bool) -> None: + _fp6_quantized_linear_helper(tokens=tokens, + in_channels=in_channels, + out_channels=out_channels, + dtype=DtypeEnum.fp16, + act_fn=act_fn, + use_bias=use_bias, + expect_failure=True) diff --git a/tests/unit/launcher/test_ds_arguments.py b/tests/unit/launcher/test_ds_arguments.py index a2d06e7601ab..ee6d4ce6b7be 100644 --- a/tests/unit/launcher/test_ds_arguments.py +++ b/tests/unit/launcher/test_ds_arguments.py @@ -40,7 +40,7 @@ def test_no_ds_arguments(): assert args.deepspeed == False assert hasattr(args, 'deepspeed_config') - assert args.deepspeed_config == None + assert args.deepspeed_config is None def test_no_ds_enable_argument(): @@ -74,7 +74,7 @@ def test_no_ds_config_argument(): assert args.deepspeed == True assert hasattr(args, 'deepspeed_config') - assert args.deepspeed_config == None + assert args.deepspeed_config is None def test_no_ds_parser(): diff --git a/tests/unit/launcher/test_multinode_runner.py b/tests/unit/launcher/test_multinode_runner.py index ec0459ab0a6f..801d4223afce 100644 --- a/tests/unit/launcher/test_multinode_runner.py +++ b/tests/unit/launcher/test_multinode_runner.py @@ -19,6 +19,14 @@ def runner_info(): return env, hosts, world_info, args +@pytest.fixture +def mock_mpi_env(monkeypatch): + # Provide the 3 required MPI variables: + monkeypatch.setenv('OMPI_COMM_WORLD_LOCAL_RANK', '0') + monkeypatch.setenv('OMPI_COMM_WORLD_RANK', '0') + monkeypatch.setenv('OMPI_COMM_WORLD_SIZE', '1') + + def test_pdsh_runner(runner_info): env, resource_pool, world_info, args = runner_info runner = mnrunner.PDSHRunner(args, world_info) @@ -27,11 +35,85 @@ def test_pdsh_runner(runner_info): assert env['PDSH_RCMD_TYPE'] == 'ssh' -def test_openmpi_runner(runner_info): +def test_openmpi_runner(runner_info, mock_mpi_env): env, resource_pool, world_info, args = runner_info runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) cmd = runner.get_cmd(env, resource_pool) assert cmd[0] == 'mpirun' + assert 'eth0' in cmd + + +def test_btl_nic_openmpi_runner(runner_info, mock_mpi_env): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '-mca btl_tcp_if_include eth1', 'test_launcher.py']) + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd + + +def test_btl_nic_two_dashes_openmpi_runner(runner_info, mock_mpi_env): + env, resource_pool, world_info, _ = runner_info + args = parse_args(['--launcher_arg', '--mca btl_tcp_if_include eth1', 'test_launcher.py']) + runner = mnrunner.OpenMPIRunner(args, world_info, resource_pool) + cmd = runner.get_cmd(env, resource_pool) + assert 'eth0' not in cmd + assert 'eth1' in cmd + + +def test_setup_mpi_environment_success(): + """Test that _setup_mpi_environment correctly sets environment variables when MPI variables exist.""" + os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = '0' + os.environ['OMPI_COMM_WORLD_RANK'] = '1' + os.environ['OMPI_COMM_WORLD_SIZE'] = '2' + + args = parse_args(['--launcher_arg', '--mca btl_tcp_if_include eth1', 'test_launcher.py']) + + runner = mnrunner.OpenMPIRunner(args, 
None, None) + # Set up the MPI environment + runner._setup_mpi_environment() + + assert os.environ['LOCAL_RANK'] == '0' + assert os.environ['RANK'] == '1' + assert os.environ['WORLD_SIZE'] == '2' + + # Clean up environment + del os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + del os.environ['OMPI_COMM_WORLD_RANK'] + del os.environ['OMPI_COMM_WORLD_SIZE'] + del os.environ['LOCAL_RANK'] + del os.environ['RANK'] + del os.environ['WORLD_SIZE'] + + +def test_setup_mpi_environment_missing_variables(): + """Test that _setup_mpi_environment raises an EnvironmentError when MPI variables are missing.""" + + # Clear relevant environment variables + os.environ.pop('OMPI_COMM_WORLD_LOCAL_RANK', None) + os.environ.pop('OMPI_COMM_WORLD_RANK', None) + os.environ.pop('OMPI_COMM_WORLD_SIZE', None) + + args = parse_args(['--launcher_arg', '--mca btl_tcp_if_include eth1', 'test_launcher.py']) + + with pytest.raises(EnvironmentError, match="MPI environment variables are not set"): + mnrunner.OpenMPIRunner(args, None, None) + + +def test_setup_mpi_environment_fail(): + """Test that _setup_mpi_environment fails if only partial MPI variables are provided.""" + os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] = '0' + os.environ.pop('OMPI_COMM_WORLD_RANK', None) # missing variable + os.environ['OMPI_COMM_WORLD_SIZE'] = '2' + + args = parse_args(['--launcher_arg', '--mca btl_tcp_if_include eth1', 'test_launcher.py']) + + with pytest.raises(EnvironmentError, match="MPI environment variables are not set"): + runner = mnrunner.OpenMPIRunner(args, None, None) + + # Clean up environment + del os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + del os.environ['OMPI_COMM_WORLD_SIZE'] def test_mpich_runner(runner_info): diff --git a/tests/unit/launcher/test_user_args.py b/tests/unit/launcher/test_user_args.py new file mode 100644 index 000000000000..b86be4dfe74c --- /dev/null +++ b/tests/unit/launcher/test_user_args.py @@ -0,0 +1,66 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import subprocess + +from deepspeed.accelerator import get_accelerator + +if not get_accelerator().is_available(): + pytest.skip("only supported in accelerator environments.", allow_module_level=True) + +user_arg_test_script = """import argparse +parser = argparse.ArgumentParser() +parser.add_argument("--prompt", type=str) +parser.add_argument("--local_rank", type=int, default=0) +parser.add_argument("--world_size", type=int, default=1) +args = parser.parse_args() +print("ARG PARSE SUCCESS") +""" + + +@pytest.fixture(scope="function") +def user_script_fp(tmpdir): + script_fp = tmpdir.join("user_arg_test.py") + with open(script_fp, "w") as f: + f.write(user_arg_test_script) + return script_fp + + +@pytest.fixture(scope="function") +def cmd(user_script_fp, prompt, multi_node): + if multi_node: + cmd = ("deepspeed", "--force_multi", "--num_nodes", "1", "--num_gpus", "1", user_script_fp, "--prompt", prompt) + else: + cmd = ("deepspeed", "--num_nodes", "1", "--num_gpus", "1", user_script_fp, "--prompt", prompt) + return cmd + + +@pytest.mark.parametrize("prompt", [ + '''"I am 6' tall"''', """'I am 72" tall'""", """'"translate English to Romanian: "'""", + '''I'm going to tell them "DeepSpeed is the best"''' +]) +@pytest.mark.parametrize("multi_node", [True, False]) +def test_user_args(cmd, multi_node): + if multi_node and get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}" + + +def test_bash_string_args(tmpdir, user_script_fp): + bash_script = f""" + ARGS="--prompt 'DeepSpeed is the best'" + echo ${{ARGS}}|xargs deepspeed --num_nodes 1 --num_gpus 1 {user_script_fp} + """ + + bash_fp = tmpdir.join("bash_script.sh") + with open(bash_fp, "w") as f: + f.write(bash_script) + + p = subprocess.Popen(["bash", bash_fp], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}" diff --git a/tests/unit/linear/test_ctx.py b/tests/unit/linear/test_ctx.py new file mode 100644 index 000000000000..e03d13fd6ce2 --- /dev/null +++ b/tests/unit/linear/test_ctx.py @@ -0,0 +1,106 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import torch +import deepspeed +import pytest +from unit.common import DistributedTest + +import deepspeed.comm as dist +from deepspeed.linear import LoRAConfig, init_lora +from deepspeed.linear.optimized_linear import LoRAOptimizedLinear +from unit.simple_model import random_dataloader, SimpleModel + +try: + import transformers +except ImportError: + transformers = None + +if transformers is None: + pytest.skip("transformers is required for this test", allow_module_level=True) + + +def injection_assert(model): + # pick out random linear that should have been replaced and initialized + q_proj = model.model.layers[1].self_attn.q_proj + + assert isinstance(q_proj, LoRAOptimizedLinear), "injection did not happen" + assert q_proj._initialized, "lora was not initialized properly" + assert isinstance(q_proj.lora_weight_1, torch.nn.Linear) + assert isinstance(q_proj.lora_weight_2, torch.nn.Linear) + + +class TestEngine(DistributedTest): + world_size = 2 + + def test_model(self): + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2) + quant_config = None + hidden_dim = 64 + nlayers = 4 + + with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config): + model = SimpleModel(hidden_dim=hidden_dim, nlayers=nlayers) + + init_lora(model) + + model_norms = [model.linears[i].weight.norm().item() for i in range(nlayers)] + + ds_config = { + "train_batch_size": 2, + "steps_per_print": 1, + "bf16": { + "enabled": True + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": 1 + } + } + model, *_ = deepspeed.initialize(config=ds_config, model=model, model_parameters=model.parameters()) + + engine_norms = [model.module.linears[i].weight.norm().item() for i in range(nlayers)] + + # Ensure that sharded weights are not broadcast during engine init + assert engine_norms == model_norms, f"{dist.get_rank()=} base weight norms are not the same after engine init, {engine_norms=} != {model_norms=}" + + data_loader = random_dataloader(model=model, + total_samples=50, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.bfloat16) + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +class TestInitTransformers(DistributedTest): + world_size = 2 + + def test_pretrained_init(self): + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2) + quant_config = None + + with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config): + model = transformers.AutoModelForCausalLM.from_pretrained("llamafactory/tiny-random-Llama-3") + + injection_assert(model) + + def test_config_init(self): + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=2) + quant_config = None + + config = transformers.AutoConfig.from_pretrained("llamafactory/tiny-random-Llama-3") + + with deepspeed.linear.Init(lora_config=lora_config, quant_config=quant_config): + model = transformers.AutoModelForCausalLM.from_config(config) + + injection_assert(model) diff --git a/tests/unit/linear/test_linear.py b/tests/unit/linear/test_linear.py new file mode 100644 index 000000000000..2058791dba4a --- /dev/null +++ b/tests/unit/linear/test_linear.py @@ -0,0 +1,123 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed +import deepspeed.comm as dist + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear import OptimizedLinear, LoRAConfig, QuantizationConfig +from unit.common import DistributedTest + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestBasicLinear(DistributedTest): + world_size = 2 + + def test(self): + lora_config = None + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 1 # Number of samples in a batch + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + + dummy_input = torch.rand(batch_size, input_features, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2]) +class TestLoRALinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding): + rank = dist.get_rank() + quantization_config = None + + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, lora_alpha=16, base_weight_sharding=base_weight_sharding) + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + if rank == 0: + for n, p in linear_layer.named_parameters(): + print(f"{n}, {p.shape}") + + dummy_input = torch.rand(batch_size, input_features, device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("q_bits", [8, 6]) +class TestQuantLinear(DistributedTest): + world_size = 2 + + def test(self, q_bits): + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = None + quantization_config = QuantizationConfig(q_bits=q_bits) + quantization_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) + + +@pytest.mark.parametrize("base_weight_sharding", [1, 2], ids=['bws1', 'bws2']) +@pytest.mark.parametrize("q_bits", [8, 6], ids=['qbit8', 'qbit6']) +class TestOptimizedLinear(DistributedTest): + world_size = 2 + + def test(self, base_weight_sharding, q_bits): + input_features = 64 # Number of input features + output_features = 64 # Number of output features + batch_size = 5 # Number of samples in a batch + + lora_config = LoRAConfig(lora_r=16, 
lora_alpha=16, base_weight_sharding=base_weight_sharding) + quantization_config = QuantizationConfig(q_bits=q_bits) + quantization_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + + linear_layer = OptimizedLinear(input_dim=input_features, + output_dim=output_features, + lora_config=lora_config, + quantization_config=quantization_config, + dtype=torch.bfloat16) + device = get_accelerator().current_device_name() + linear_layer = linear_layer.to(device) + dummy_input = torch.rand([batch_size, input_features], device=device, dtype=torch.bfloat16) + output = linear_layer(dummy_input) + assert output.shape == (batch_size, output_features) diff --git a/tests/unit/linear/test_quant_param.py b/tests/unit/linear/test_quant_param.py new file mode 100644 index 000000000000..283d81b4bf36 --- /dev/null +++ b/tests/unit/linear/test_quant_param.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.accelerator import get_accelerator +from deepspeed.linear.quantization import QuantizedParameter +from deepspeed.linear.config import QuantizationConfig + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +from unit.common import DistributedTest + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + + +class TestQuantParam(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize('dtype', [torch.half, torch.float]) + def test_unsupported_dtypes(self, dtype): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device='cpu', dtype=dtype) + qp = QuantizedParameter(data) + with pytest.raises(AssertionError): + qp.to(device) + + def test_requires_grad(self): + data = torch.rand(5, 5, dtype=torch.bfloat16) + with pytest.raises(ValueError): + QuantizedParameter(data, requires_grad=True) + + def test_move_to_accelerator(self): + device = get_accelerator().current_device() + data = torch.rand(5, 5, device='cpu', dtype=torch.bfloat16) + quantization_config = QuantizationConfig() + quantization_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + qp = QuantizedParameter(data, quantization_config=quantization_config) + assert qp.device == torch.device('cpu') + qp = qp.to(get_accelerator().current_device_name()) + assert qp.device == torch.device(device) + assert qp.dtype == quantization_config.q_dtype + + def test_hf_clone(self): + device = get_accelerator().current_device_name() + data = torch.rand(5, 5, device=device, dtype=torch.bfloat16) + + quantization_config = QuantizationConfig(q_bits=6) + qp = QuantizedParameter(data, quantization_config=quantization_config) + + # should be able to clone parameter via dict, HF expects this to work + qp_copy = QuantizedParameter(qp.data, **qp.__dict__) + + assert all(qp.data == qp_copy.data) + assert qp.quantization_config == qp_copy.quantization_config diff --git a/tests/unit/model_parallelism/test_autotp_training.py b/tests/unit/model_parallelism/test_autotp_training.py new file mode 100644 index 000000000000..7680b28ce6b5 --- /dev/null +++ b/tests/unit/model_parallelism/test_autotp_training.py @@ -0,0 +1,574 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import deepspeed.comm as dist +import torch +import math +from copy import deepcopy + +from unit.common import DistributedTest, preferred_dtype +import deepspeed +from deepspeed.accelerator import get_accelerator +from unit.simple_model import SimpleModel, random_dataloader +from deepspeed.utils import groups +from contextlib import contextmanager +from torch import nn +from deepspeed.module_inject.layers import LinearAllreduce, LinearLayer, set_autotp_mode +from unit.checkpoint.common import compare_lr_scheduler_states, compare_optimizer_states +import os + + +def skip_on_device(): + if get_accelerator().device_name() == 'xpu': + pytest.skip(f"XPU requires a higher version for test") + + +class SequentialLinearModel(torch.nn.Module): + + def __init__(self, hidden_dim, empty_grad=False, nlayers=1): + super(SequentialLinearModel, self).__init__() + self.linears = torch.nn.ModuleList( + [torch.nn.Linear(hidden_dim, hidden_dim, bias=None) for i in range(nlayers)]) + if empty_grad: + self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=None) + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + self.empty_grad = empty_grad + + def forward(self, x, y): + if len(self.linears) == 1: + x = self.linears[0](x) + else: + for i, l in enumerate(self.linears): + x = self.linears[i](x) + return self.cross_entropy_loss(x, y) + + +@contextmanager +def should_assert_with_msg(expected_message): + try: + yield + except AssertionError as e: + if dist.get_rank() == 0: + print(expected_message) + print(str(e)) + if str(e) == expected_message: + pass + else: + raise e + + +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpParallelStates(DistributedTest): + world_size = 4 + + def test(self, tp_size: int): + skip_on_device() + set_autotp_mode(training=True) + + dp_size = 4 / tp_size + hidden_dim = 128 + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0 + } + } + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + assert groups.get_tensor_model_parallel_world_size() == tp_size + assert groups.get_data_parallel_world_size() == dp_size + + +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpDataloaderCorrectness(DistributedTest): + world_size = 4 + reuse_dist_env = True + + def test(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=3, + hidden_dim=hidden_dim, + device=model.device, + dtype=preferred_dtype()) + dist.barrier() + with should_assert_with_msg( + "Data inconsistency within the TP group. Please check the Dataloader implementation to ensure consistency." 
+ ): + for batch in data_loader: + # batch[0].requires_grad = requires_grad + batch[0] += dist.get_rank() + model(batch[0], batch[1]) + + model = SimpleModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=3, + hidden_dim=hidden_dim, + device=model.device, + dtype=preferred_dtype()) + for batch in data_loader: + dist.broadcast(batch[0], + src=groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + dist.broadcast(batch[1], + src=groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + model(batch[0], batch[1]) + + +def process_linear_layer(hidden_dim, input): + torch.manual_seed(42) + torch_linear = nn.Linear(hidden_dim, + hidden_dim, + dtype=preferred_dtype(), + device=get_accelerator().current_device(), + bias=None) + torch_out = torch_linear(input) + torch_loss = torch_out.sum() + torch_loss.backward() + return torch_linear, torch_out + + +@pytest.mark.sequential +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestTpLayerFwdBwd(DistributedTest): + world_size = 4 + reuse_dist_env = True + + def testRowParallel(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + batch_size_per_device = 1 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + input = torch.randn(batch_size_per_device, + hidden_dim, + dtype=preferred_dtype(), + requires_grad=True, + device=get_accelerator().current_device()) + + dist.broadcast(input, + groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + + torch_linear, torch_out = process_linear_layer(hidden_dim, input) + linear = LinearAllreduce(deepcopy(torch_linear), groups.get_tensor_model_parallel_group()) + + input_ = torch.chunk(input, tp_size, dim=-1)[groups.get_tensor_model_parallel_rank()] + out = linear(input_.to(get_accelerator().current_device())) + loss = out.sum() + loss.backward() + + torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=1)[groups.get_tensor_model_parallel_rank()] + assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3) + assert torch.allclose(out, torch_out.to(get_accelerator().current_device()), atol=1e-3) + + def testColumnParallel(self, tp_size: int): + skip_on_device() + hidden_dim = 128 + batch_size_per_device = 1 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = 
deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + input = torch.randn(batch_size_per_device, + hidden_dim, + dtype=preferred_dtype(), + requires_grad=True, + device=get_accelerator().current_device()) + dist.broadcast(input, + groups.get_tensor_model_parallel_src_rank(), + group=groups.get_tensor_model_parallel_group()) + + torch_linear, torch_out = process_linear_layer(hidden_dim, input) + + linear = LinearLayer(deepcopy(torch_linear), groups.get_tensor_model_parallel_group()) + + out = linear(input.to(get_accelerator().current_device())) + loss = out.sum() + loss.backward() + + cur_device_out = torch.chunk(torch_out, tp_size, dim=-1)[groups.get_tensor_model_parallel_rank()] + torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=0)[groups.get_tensor_model_parallel_rank()] + assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3) + assert torch.allclose(cur_device_out.to(get_accelerator().current_device()).contiguous(), + out.contiguous(), + atol=1e-3) + + +@pytest.mark.sequential +class TestParamsGather(DistributedTest): + world_size = 4 + reuse_dist_env = True + + @pytest.mark.parametrize("layer_type", ["linear", "linearallreduce"]) + def test(self, layer_type): + skip_on_device() + tp_size = 4 + hidden_dim = 128 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + torch.manual_seed(42) + model = SequentialLinearModel(hidden_dim=hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + + torch_linear = nn.Linear(hidden_dim, hidden_dim, dtype=preferred_dtype(), device="cpu", bias=None) + total_params = sum(p.numel() for p in torch_linear.parameters()) + + tp_layer = None + if layer_type == "linear": + tp_layer = LinearLayer(torch_linear, groups.get_tensor_model_parallel_group()) + elif layer_type == "linearallreduce": + tp_layer = LinearAllreduce(torch_linear, groups.get_tensor_model_parallel_group()) + else: + raise ValueError(f"Invalid linear type: {config_dict['linear_type']}") + + tp_params = sum(p.numel() for p in tp_layer.parameters()) + + assert total_params // tp_size == tp_params + for name, param in tp_layer.named_parameters(recurse=False): + param.gather_params([param]) + + is_same_weights = all( + torch.equal(param1, param2) for param1, param2 in zip(tp_layer.parameters(), torch_linear.parameters())) + + assert is_same_weights + + params1 = sum(p.numel() for p in tp_layer.parameters()) + assert total_params == params1 + + for name, param in tp_layer.named_parameters(recurse=False): + param._tp_partition([param]) + + tp_params2 = sum(p.numel() for p in tp_layer.parameters()) + + assert total_params // tp_size == tp_params2 + + +def dummy_init_engine(config): + # This is a dummy initialization function for the DeepSpeed engine. + # We only need to use the config to initialize the distributed settings for the test. 
+ model = SequentialLinearModel(hidden_dim=8) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config) + + +def prepare_tp_model(hidden_dim, nlayers, linear_indices, allreduce_indices, group, return_global_copy=False): + model = SequentialLinearModel(hidden_dim=hidden_dim, nlayers=nlayers).to(preferred_dtype()) + base_model = None + if return_global_copy: + base_model = deepcopy(model) + for i in linear_indices: + layer = LinearLayer(model.linears[i], group) + model.linears[i] = layer + + for i in allreduce_indices: + layer = LinearAllreduce(model.linears[i], group) + model.linears[i] = layer + + return model, base_model + + +@pytest.mark.parametrize("zero_stage", [0, 1, 2]) +@pytest.mark.parametrize("tp_size", [2, 4]) +class TestSave(DistributedTest): + + world_size = 4 + reuse_dist_env = True + + def test_save_original_weight(self, tp_size: int, zero_stage: int): + skip_on_device() + hidden_dim = 64 + set_autotp_mode(training=True) + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "tensor_parallel": { + "autotp_size": tp_size + }, + "zero_optimization": { + "stage": zero_stage, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + dummy_init_engine(config_dict) + torch.manual_seed(42) + + model, base_model = prepare_tp_model(hidden_dim, + 8, [2, 5], [3, 6], + groups.get_tensor_model_parallel_group(), + return_global_copy=True) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + + cur_params_numel = sum(p.numel() for p in model.parameters()) + base_params_numel = sum(p.numel() for p in base_model.parameters()) + assert cur_params_numel < base_params_numel + + tp_state_dict = model._consolidated_16bit_state_dict() + + def compare_state_dicts(state_dict1, state_dict2): + if state_dict1.keys() != state_dict2.keys(): + print("The state_dicts have different keys!") + return False + + for key in state_dict1: + if not torch.allclose(state_dict1[key], state_dict2[key], atol=1e-3): + assert state_dict1[key].device == "cpu" + print(f"Parameters for {key} are different!") + return False + + return True + + base_state_dict = base_model.state_dict() + if dist.get_rank() == 0: + # we should consider the case when zero3 is used in the future. + assert compare_state_dicts(base_state_dict, tp_state_dict), f"State_dict is not the same!" 
+        else:
+            assert tp_state_dict is None, f"only rank0 should have the state_dict"
+
+    def test_ckpt_save(self, tmpdir, tp_size: int, zero_stage: int):
+        skip_on_device()
+        hidden_dim = 64
+        set_autotp_mode(training=True)
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-3
+                }
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+            },
+            "tensor_parallel": {
+                "autotp_size": tp_size
+            },
+            "scheduler": {
+                "type": "WarmupLR",
+                "params": {
+                    "warmup_min_lr": 0,
+                    "warmup_max_lr": 0.001,
+                    "warmup_num_steps": 1000
+                }
+            }
+        }
+
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True}
+        elif preferred_dtype() is torch.bfloat16:
+            config_dict["bf16"] = {"enabled": True}
+
+        dummy_init_engine(config_dict)
+
+        trained_model, _ = prepare_tp_model(hidden_dim, 8, [2, 5], [3, 6], groups.get_tensor_model_parallel_group())
+        loaded_model, _ = prepare_tp_model(hidden_dim, 8, [2, 5], [3, 6], groups.get_tensor_model_parallel_group())
+
+        trained_model, _, _, _ = deepspeed.initialize(model=trained_model,
+                                                      model_parameters=trained_model.parameters(),
+                                                      config=config_dict)
+        torch.manual_seed(42)
+
+        data_loader = random_dataloader(model=trained_model,
+                                        total_samples=3,
+                                        hidden_dim=hidden_dim,
+                                        device=trained_model.device,
+                                        dtype=preferred_dtype())
+        ckpt_path = os.path.join(tmpdir, 'tp_saved_checkpoint')
+        for i, batch in enumerate(data_loader):
+            batch[0].requires_grad = True
+            loss = trained_model(batch[0], batch[1])
+            loss = loss
+            trained_model.backward(loss)
+            trained_model.step()
+        trained_model.save_checkpoint(ckpt_path)
+
+        loaded_model, _, _, _ = deepspeed.initialize(model=loaded_model,
+                                                     model_parameters=loaded_model.parameters(),
+                                                     config=config_dict)
+        loaded_model.load_checkpoint(ckpt_path, load_optimizer_states=True, load_lr_scheduler_states=True)
+        compare_optimizer_states(trained_model, loaded_model, hidden_dim, fp16=(preferred_dtype() == torch.float16))
+        compare_lr_scheduler_states(trained_model, loaded_model)
+
+
+@pytest.mark.parametrize("zero_stage", [0, 1, 2])
+@pytest.mark.parametrize("tp_size", [2, 4])
+class TestTpGradNorm(DistributedTest):
+
+    world_size = 4
+    reuse_dist_env = True
+
+    def test(self, tp_size: int, zero_stage: int):
+        skip_on_device()
+        hidden_dim = 64
+        set_autotp_mode(training=True)
+        config_dict = {
+            "train_micro_batch_size_per_gpu": 1,
+            "steps_per_print": 1,
+            "optimizer": {
+                "type": "Adam",
+                "params": {
+                    "lr": 1e-6
+                }
+            },
+            "tensor_parallel": {
+                "autotp_size": tp_size
+            },
+            "zero_optimization": {
+                "stage": zero_stage,
+            }
+        }
+        if preferred_dtype() is torch.float16:
+            config_dict["fp16"] = {"enabled": True}
+        elif preferred_dtype() is torch.bfloat16:
+            if zero_stage == 0:
+                pytest.skip(
+                    "This test has overflow data and needs an overflow-skip mechanism implemented in BF16_optimizer"
+                )
+            config_dict["bf16"] = {"enabled": True}
+
+        torch.manual_seed(42)
+
+        dummy_init_engine(config=config_dict)
+        tp_model, base_model = prepare_tp_model(hidden_dim,
+                                                8, [2, 5], [3, 6],
+                                                groups.get_tensor_model_parallel_group(),
+                                                return_global_copy=True)
+
+        base_model, base_optimizer, _, _ = deepspeed.initialize(model=base_model,
+                                                                model_parameters=base_model.parameters(),
+                                                                config=config_dict)
+        data_loader = random_dataloader(model=base_model,
+                                        total_samples=20,
+                                        hidden_dim=hidden_dim,
+                                        device=base_model.device,
+                                        dtype=preferred_dtype())
+
+        for i, batch in enumerate(data_loader):
+            batch[0].requires_grad = True
+            loss =
base_model(batch[0], batch[1]) + loss = loss + base_model.backward(loss) + base_model.step() + + base_norm = base_optimizer._global_grad_norm + + base_model.destroy() + + tp_model, tp_optimizer, _, _ = deepspeed.initialize(model=tp_model, + model_parameters=tp_model.parameters(), + config=config_dict) + for i, batch in enumerate(data_loader): + batch[0].requires_grad = True + loss = tp_model(batch[0], batch[1]) + loss = loss + tp_model.backward(loss) + tp_model.step() + + tp_norm = tp_optimizer._global_grad_norm + + assert math.isclose(base_norm, tp_norm, abs_tol=1e-3) + tp_params_numel = sum(p.numel() for p in tp_model.parameters()) + base_params_numel = sum(p.numel() for p in base_model.parameters()) + assert tp_params_numel < base_params_numel diff --git a/tests/unit/model_parallelism/test_configurable_parallel_mp.py b/tests/unit/model_parallelism/test_configurable_parallel_mp.py index 824ecea5f144..a7b0d3431ee9 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_mp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_mp.py @@ -13,7 +13,7 @@ from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest, DistributedFixture from unit.megatron_model import get_gpt2_model, get_megatron_version -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') @@ -170,7 +170,7 @@ def test(self, baseline_mp2, inputs, class_tmpdir): test = model(inputs[0].to(device_name), inputs[1].to(device_name), inputs[2].to(device_name)) if dist.get_rank() == 0: load_path = os.path.join(class_tmpdir, "output.pt") - baseline = torch.load(load_path) + baseline = torch.load(load_path, weights_only=False) test = test.cpu() assert torch.allclose( baseline, test, diff --git a/tests/unit/model_parallelism/test_configurable_parallel_pp.py b/tests/unit/model_parallelism/test_configurable_parallel_pp.py index b500b9d857a5..df469044e186 100644 --- a/tests/unit/model_parallelism/test_configurable_parallel_pp.py +++ b/tests/unit/model_parallelism/test_configurable_parallel_pp.py @@ -15,7 +15,7 @@ from unit.megatron_model import MockGPT2ModelPipe as GPT2ModelPipe from deepspeed.utils import RepeatingLoader from deepspeed.accelerator import get_accelerator -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version pytestmark = pytest.mark.skipif(not required_torch_version(min_version=1.5, max_version=1.13), reason='Megatron-LM package requires Pytorch version >=1.5 and <=1.13') @@ -225,7 +225,7 @@ def _test(self, inputs, class_tmpdir, checkpoint_tag, mp_size, pp_size, mp_resiz assert torch.is_tensor(test[0][0]) test = test[0][0].cpu() load_path = os.path.join(class_tmpdir, f"output-{checkpoint_tag}.pt") - baseline = torch.load(load_path) + baseline = torch.load(load_path, weights_only=False) assert torch.allclose( baseline, test, atol=1e-03), f"Baseline output {baseline} is not equal to save-then-load output {test}" diff --git a/tests/unit/moe/test_moe.py b/tests/unit/moe/test_moe.py index 310a0df16381..c67a907c6785 100644 --- a/tests/unit/moe/test_moe.py +++ b/tests/unit/moe/test_moe.py @@ -7,10 +7,50 @@ import deepspeed import pytest import gc +import random from unit.common import DistributedTest from unit.simple_model import SimplePRMoEModel, SimpleMoEModel, sequence_dataloader +import 
deepspeed.comm as dist +from deepspeed import get_accelerator +from deepspeed.moe.sharded_moe import top1gating, topkgating from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer, is_moe_param -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version + + +@pytest.mark.parametrize("zero_stage", [0, 1, 2]) +class TestSimpleMoE(DistributedTest): + world_size = 2 + + def test(self, zero_stage): + if not required_torch_version(min_version=1.8): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "fp16": { + "enabled": True + }, + "zero_optimization": { + "stage": zero_stage + } + } + # should automatically create moe param groups in deepspeed backend + hidden_dim = 16 + model = SimpleMoEModel(hidden_dim=hidden_dim, ep_size=1) + model, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model) + data_loader = sequence_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() @pytest.mark.parametrize("ep_size", [2, 4]) @@ -53,7 +93,8 @@ def strict_average_tensor(tensor): process_group = optimizer.dp_process_group curr_size = 0 pg_offsets = [] - for i, param, param_id in optimizer.params_in_ipg_bucket: + for i, param_idx, param_id in optimizer.params_in_ipg_bucket: + param = optimizer.bit16_groups[i][param_idx] process_group = optimizer.dp_process_group if optimizer.ipg_bucket_has_moe_params: process_group = optimizer.expert_dp_process_group[param.group_name] if is_moe_param( @@ -132,3 +173,181 @@ def test(self, ep_size, use_residual): loss = model(batch[0], batch[1]) model.backward(loss) model.step() + + +class TestTopk(DistributedTest): + world_size = 2 + + def test(self): + device = get_accelerator().current_device_name() + if dist.get_rank() == 0: + logits = torch.rand(2, 2, device=device) + elif dist.get_rank() == 1: + logits = torch.rand(10, 2, device=device) + + output = top1gating(logits=logits, + capacity_factor=1, + min_capacity=0, + used_token=None, + noisy_gate_policy=None, + drop_tokens=False, + use_rts=True, + use_tutel=False) + + +class TestTopkGate(DistributedTest): + + def test(self): + + def check_equal(logits, cap, sparse_truth, res): + m, n = logits.shape + dispatch_mask_truth = torch.zeros(m, n, cap) + i, j, k = sparse_truth.t() + dispatch_mask_truth[i, j, k] = 1 + assert (torch.equal(dispatch_mask_truth, res)) + + #s=4 e=4 topk=2 cap=2(s*topk/e) + logits = torch.tensor([[0.11, 0.2, 0.1, 0.3], [0.3, 0.4, 0.11, 0.1], [0.11, 0.1, 0.6, 0.5], + [0.1, 0.11, 0.7, 0.8]]) + logits *= dist.get_rank() + 1 + probs_dispatch_res = topkgating(logits, 2, 1, min_capacity=1, drop_policy='probs')[2] + probs_sec_sparse = torch.tensor([[0, 1, 0], [1, 0, 0], [1, 1, 1], [2, 2, 0], [2, 3, 0], [3, 2, 1], [3, 3, 1]]) + check_equal(logits, 2, probs_sec_sparse, probs_dispatch_res) + + position_sec_sparse = torch.tensor([[0, 1, 0], [0, 3, 0], [1, 0, 0], [1, 1, 1], [2, 2, 0], [2, 3, 1], + [3, 2, 1]]) + position_dispatch_res = topkgating(logits, 2, 1, min_capacity=1, drop_policy='position')[2] + check_equal(logits, 2, position_sec_sparse, position_dispatch_res) + + #s=4 e=6 topk=3 cap=2(s*topk/e) + logits2 = torch.tensor([[0.5858, 0.4801, 0.6269, 0.5397, 0.9722, 0.7034], + 
[0.5445, 0.6332, 0.4519, 0.6308, 0.0519, 0.6450], + [0.4874, 0.8110, 0.7467, 0.8474, 0.0277, 0.3068], + [0.8570, 0.6714, 0.5310, 0.3274, 0.4836, 0.9892]]) + logits2 *= dist.get_rank() + 1 + + #top3 full mask #prob_mask #postion_mask + #0 0 1 0 1 1 #0 0 1 0 1 1 #0 0 1 0 1 1 + #0 1 0 1 0 1 #0 0 0 1 0 0 #0 1 0 1 0 1 + #0 1 1 1 0 0 #0 1 1 1 0 0 #0 1 1 1 0 0 + #1 1 0 0 0 1 #1 1 0 0 0 1 #1 0 0 0 0 0 + probs_dispatch_res = topkgating(logits2, 3, 1, min_capacity=1, drop_policy='probs')[2] + probs_sec_sparse = torch.tensor([[0, 2, 0], [0, 4, 0], [0, 5, 0], [1, 3, 0], [2, 1, 0], [2, 2, 1], [2, 3, 1], + [3, 0, 0], [3, 1, 1], [3, 5, 1]]) + check_equal(logits2, 2, probs_sec_sparse, probs_dispatch_res) + + position_sec_sparse = torch.tensor([[0, 2, 0], [0, 4, 0], [0, 5, 0], [1, 1, 0], [1, 3, 0], [1, 5, 1], + [2, 1, 1], [2, 2, 1], [2, 3, 1], [3, 0, 0]]) + position_dispatch_res = topkgating(logits2, 3, 1, min_capacity=1, drop_policy='position')[2] + check_equal(logits2, 2, position_sec_sparse, position_dispatch_res) + + +class TestExpertWeightGradWithZero(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize("zero_stage", [0, 1, 2]) + def test(self, zero_stage): + + if not required_torch_version(min_version=1.8): + pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") + + def seed_everything(seed=11): + random.seed(seed) + torch.manual_seed(seed) + get_accelerator().manual_seed(seed) + get_accelerator().manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + def get_state_dict_ep2(state_dict): + """ + convert state_dict from EP=1 to EP=2 + """ + rank = int(deepspeed.comm.get_rank()) + ep_state_dict = dict() + dst_sub_key = f"deepspeed_moe.experts.deepspeed_experts.0" + src_sub_key = f"deepspeed_moe.experts.deepspeed_experts.{rank}" + for moe_layer in ["moe_1", "moe_2"]: + for mlp_in_moe in [0, 1]: + dst_key = f"{moe_layer}.{dst_sub_key}.{mlp_in_moe}" + src_key = f"{moe_layer}.{src_sub_key}.{mlp_in_moe}" + ep_state_dict[f"{dst_key}.weight"] = state_dict[f"{src_key}.weight"].detach().clone() + ep_state_dict[f"{dst_key}.bias"] = state_dict[f"{src_key}.bias"].detach().clone() + + for key in state_dict.keys(): + if "deepspeed_moe.experts.deepspeed_experts" not in key: + ep_state_dict[key] = state_dict[key].detach().clone() + return ep_state_dict + + def get_models(hidden_dim): + model_ep1 = SimpleMoEModel(hidden_dim=hidden_dim, num_experts=2, ep_size=1, use_rts=False) + model_ep2 = SimpleMoEModel(hidden_dim=hidden_dim, num_experts=2, ep_size=2, use_rts=False) + + state_dict_ep1 = model_ep1.state_dict() + state_dict_ep2 = get_state_dict_ep2(state_dict_ep1) + model_ep2.load_state_dict(state_dict_ep2) + + model_ep1, _, _, _ = deepspeed.initialize(config=config_dict, model=model_ep1) + model_ep2, _, _, _ = deepspeed.initialize(config=config_dict, model=model_ep2) + + return model_ep1, model_ep2 + + def extract_expert_grad(model, expert_id): + + def _get_weight_bias(experts): + return ([deepspeed.utils.safe_get_full_grad(expert[0].weight) + for expert in experts][expert_id].detach().clone(), + [deepspeed.utils.safe_get_full_grad(expert[0].bias) + for expert in experts][expert_id].detach().clone(), + [deepspeed.utils.safe_get_full_grad(expert[1].weight) + for expert in experts][expert_id].detach().clone(), + [deepspeed.utils.safe_get_full_grad(expert[1].bias) + for expert in experts][expert_id].detach().clone()) + + return (*_get_weight_bias(model.moe_1.deepspeed_moe.experts.deepspeed_experts), + 
*_get_weight_bias(model.moe_2.deepspeed_moe.experts.deepspeed_experts)) + + seed_everything() + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.1, + } + }, + "zero_optimization": { + "stage": zero_stage + } + } + + hidden_dim = 4 + total_samples = 2 + rank = deepspeed.comm.get_rank() + model_ep1, model_ep2 = get_models(hidden_dim) + + data_loader = sequence_dataloader(model=model_ep1, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model_ep1.device, + dtype=torch.float32) + expert_weight_grad_ep1 = [] + expert_weight_grad_ep2 = [] + for batch in data_loader: + loss_ep1 = model_ep1(batch[0], batch[1]) + loss_ep2 = model_ep2(batch[0], batch[1]) + + model_ep1.backward(loss_ep1) + model_ep2.backward(loss_ep2) + + expert_weight_grad_ep1.extend(extract_expert_grad(model_ep1, rank)) + expert_weight_grad_ep2.extend(extract_expert_grad(model_ep2, 0)) + + model_ep1.step() + model_ep2.step() + + assert len(expert_weight_grad_ep1) == len(expert_weight_grad_ep2) + for grad_from_ep1, grad_from_ep2 in zip(expert_weight_grad_ep1, expert_weight_grad_ep2): + assert torch.allclose(grad_from_ep1, grad_from_ep2, atol=0, rtol=1e-4) diff --git a/tests/unit/moe/test_moe_tp.py b/tests/unit/moe/test_moe_tp.py index 0069c674690c..eb4668015c01 100644 --- a/tests/unit/moe/test_moe_tp.py +++ b/tests/unit/moe/test_moe_tp.py @@ -7,7 +7,7 @@ import deepspeed import pytest from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.moe.layer import MoE diff --git a/tests/unit/monitor/test_monitor.py b/tests/unit/monitor/test_monitor.py index 3e04bebfb6c1..d4b3cf43921d 100644 --- a/tests/unit/monitor/test_monitor.py +++ b/tests/unit/monitor/test_monitor.py @@ -7,10 +7,14 @@ from deepspeed.monitor.wandb import WandbMonitor from deepspeed.monitor.csv_monitor import csvMonitor from deepspeed.monitor.config import DeepSpeedMonitorConfig +from deepspeed.monitor.comet import CometMonitor from unit.common import DistributedTest +from unittest.mock import Mock, patch from deepspeed.runtime.config import DeepSpeedConfig +import deepspeed.comm as dist + class TestTensorBoard(DistributedTest): world_size = 2 @@ -97,3 +101,66 @@ def test_empty_csv_monitor(self): assert csv_monitor.enabled == defaults.enabled assert csv_monitor.output_path == defaults.output_path assert csv_monitor.job_name == defaults.job_name + + +class TestCometMonitor(DistributedTest): + world_size = 2 + + def test_comet_monitor(self): + import comet_ml + mock_experiment = Mock() + mock_start = Mock(return_value=mock_experiment) + + config_dict = { + "train_batch_size": 2, + "comet": { + "enabled": True, + "samples_log_interval": 42, + "workspace": "some-workspace", + "project": "some-project", + "api_key": "some-api-key", + "experiment_name": "some-experiment-name", + "experiment_key": "some-experiment-key", + "mode": "get_or_create", + "online": True + } + } + + ds_config = DeepSpeedConfig(config_dict) + + with patch.object(comet_ml, "start", mock_start): + comet_monitor = CometMonitor(ds_config.monitor_config.comet) + + assert comet_monitor.enabled is True + assert comet_monitor.samples_log_interval == 42 + + # experiment should be initialized via comet_ml.start only if rank == 0 + if dist.get_rank() == 0: + mock_start.assert_called_once_with( + api_key="some-api-key", + project="some-project", + workspace="some-workspace", + 
experiment_key="some-experiment-key", + mode="get_or_create", + online=True, + ) + + mock_experiment.set_name.assert_called_once_with("some-experiment-name") + assert comet_monitor.experiment is mock_experiment + else: + mock_start.assert_not_called() + + def test_empty_comet(self): + import comet_ml + mock_start = Mock() + + config_dict = {"train_batch_size": 2, "comet": {}} + ds_config = DeepSpeedConfig(config_dict) + + with patch.object(comet_ml, "start", mock_start): + comet_monitor = CometMonitor(ds_config.monitor_config.comet) + + defaults = DeepSpeedMonitorConfig().comet + assert comet_monitor.enabled == defaults.enabled + assert comet_monitor.samples_log_interval == defaults.samples_log_interval + mock_start.assert_not_called() diff --git a/tests/unit/multi_output_model.py b/tests/unit/multi_output_model.py index e84215fb4e95..d7a5f9a46b97 100644 --- a/tests/unit/multi_output_model.py +++ b/tests/unit/multi_output_model.py @@ -4,6 +4,7 @@ # DeepSpeed Team import torch +from .common import preferred_dtype class MultiOutputModel(torch.nn.Module): @@ -28,8 +29,11 @@ def multi_output_dataloader(model, total_samples, hidden_dim, device, inputs, ta batch_size = model.train_micro_batch_size_per_gpu() train_data = [ - torch.full(size=(total_samples, hidden_dim), fill_value=x, device=device, dtype=torch.half, requires_grad=True) - for x in inputs + torch.full(size=(total_samples, hidden_dim), + fill_value=x, + device=device, + dtype=preferred_dtype(), + requires_grad=True) for x in inputs ] train_label = [torch.empty(total_samples, device=device, dtype=torch.long).fill_(y) for y in targets] diff --git a/tests/unit/ops/accelerators/test_accelerator_backward.py b/tests/unit/ops/accelerators/test_accelerator_backward.py index 43f7b471e2ae..4b1b392e933a 100644 --- a/tests/unit/ops/accelerators/test_accelerator_backward.py +++ b/tests/unit/ops/accelerators/test_accelerator_backward.py @@ -9,17 +9,15 @@ import random import copy import os +import deepspeed from torch import nn from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from deepspeed.accelerator import get_accelerator from unit.modeling import BertConfig, BertLayerNorm, BertEncoder as BertEncoderPostln from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.common import DistributedTest, is_rocm_pytorch +from deepspeed.ops.op_builder import TransformerBuilder -#if not deepspeed.ops.__installed_ops__['transformer']: -#pytest.skip( -# "transformer kernels are temporarily disabled because of unexplained failures", -# allow_module_level=True) if torch.half not in get_accelerator().supported_dtypes(): pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) @@ -261,6 +259,8 @@ class TestCUDABackward(DistributedTest): #This is to flush denorms in forward pass. Please refer to https://github.com/pytorch/pytorch/blob/main/docs/source/notes/numerical_accuracy.rst#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices os.environ['ROCBLAS_INTERNAL_FP16_ALT_IMPL'] = '1' + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[TransformerBuilder.NAME], + reason="TransformerBuilder has not been implemented on this system.") def test_backward(self, is_preln, use_fp16, batch_size, hidden_size, seq_len, heads, num_layers, atol): # Only run fp16 test cases on devices with FP16 capability. 
if not get_accelerator().is_fp16_supported() and (use_fp16 is True or is_preln is False): diff --git a/tests/unit/ops/accelerators/test_accelerator_forward.py b/tests/unit/ops/accelerators/test_accelerator_forward.py index ee9464f63aa1..e2f4ac177f1b 100644 --- a/tests/unit/ops/accelerators/test_accelerator_forward.py +++ b/tests/unit/ops/accelerators/test_accelerator_forward.py @@ -8,12 +8,14 @@ import pytest import random import copy +import deepspeed from torch import nn from unit.modelingpreln import BertEncoder as BertEncoderPreln from unit.modeling import BertLayerNorm, BertConfig, BertEncoder as BertEncoderPostln from deepspeed import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig from deepspeed.accelerator import get_accelerator from unit.common import DistributedTest +from deepspeed.ops.op_builder import TransformerBuilder if torch.half not in get_accelerator().supported_dtypes(): pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) @@ -260,6 +262,8 @@ def test_forward(self, batch_size, hidden_size, seq_len, heads, num_layers, is_p class TestCUDAForwardSmallBatchSize(DistributedTest): world_size = 1 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[TransformerBuilder.NAME], + reason="TransformerBuilder has not been implemented on this system.") def test_forward_with_small_bsz(self, batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16): # Only run fp16 test cases on devices with FP16 capability. diff --git a/tests/unit/ops/adagrad/test_cpu_adagrad.py b/tests/unit/ops/adagrad/test_cpu_adagrad.py index 99e934e2efda..0c675ecd6a85 100644 --- a/tests/unit/ops/adagrad/test_cpu_adagrad.py +++ b/tests/unit/ops/adagrad/test_cpu_adagrad.py @@ -18,8 +18,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() if verbose: print("x = {}".format(x.flatten())) print("y = {}".format(y.flatten())) diff --git a/tests/unit/ops/adam/test_cpu_adam.py b/tests/unit/ops/adam/test_cpu_adam.py index 9a6ff6689446..851485440428 100644 --- a/tests/unit/ops/adam/test_cpu_adam.py +++ b/tests/unit/ops/adam/test_cpu_adam.py @@ -11,7 +11,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import FusedAdam -from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder from unit.common import DistributedTest if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: @@ -21,8 +21,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -43,7 +43,7 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [ (64), @@ -62,7 +62,12 @@ class TestCPUAdam(DistributedTest): set_dist_env = False @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + @pytest.mark.skipif(not 
deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test_fused_adam_equal(self, dtype, model_size): + if dtype not in get_accelerator().supported_dtypes(): + pytest.skip(f"dtype {dtype} not supported in current accelerator") + if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") @@ -89,6 +94,8 @@ def test_fused_adam_equal(self, dtype, model_size): def test_torch_adamw_equal(self, dtype, model_size): if get_accelerator().is_available(): + if dtype == torch.half: + pytest.skip("torch.optim.AdamW with half precision inf/nan output.") if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") ref_param_device = get_accelerator().device_name() @@ -97,20 +104,20 @@ def test_torch_adamw_equal(self, dtype, model_size): pytest.skip("torch.optim.AdamW with half precision only supported in CUDA environments.") ref_param_device = 'cpu' - from deepspeed.ops.adam import DeepSpeedCPUAdam + from deepspeed.ops.adam import DeepSpeedCPUAdam - cpu_data = torch.randn(model_size, device='cpu').to(dtype) - cpu_param = torch.nn.Parameter(cpu_data) - ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device)) + cpu_data = torch.randn(model_size, device='cpu').to(dtype) + cpu_param = torch.nn.Parameter(cpu_data) + ref_param = torch.nn.Parameter(cpu_data.to(ref_param_device)) - cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) - ref_optimizer = torch.optim.AdamW([ref_param]) + cpu_optimizer = DeepSpeedCPUAdam([cpu_param]) + ref_optimizer = torch.optim.AdamW([ref_param]) - _compare_optimizers(model_size=model_size, - param1=cpu_param, - optimizer1=cpu_optimizer, - param2=ref_param, - optimizer2=ref_optimizer) + _compare_optimizers(model_size=model_size, + param1=cpu_param, + optimizer1=cpu_optimizer, + param2=ref_param, + optimizer2=ref_optimizer) class TestCPUAdamGPUError(DistributedTest): diff --git a/tests/unit/ops/adam/test_hybrid_adam.py b/tests/unit/ops/adam/test_hybrid_adam.py index c7ef4890b322..652090d5b9d5 100644 --- a/tests/unit/ops/adam/test_hybrid_adam.py +++ b/tests/unit/ops/adam/test_hybrid_adam.py @@ -12,7 +12,7 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.adam import FusedAdam, DeepSpeedCPUAdam -from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.ops.op_builder import CPUAdamBuilder, FusedAdamBuilder from unit.common import DistributedTest if not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: @@ -22,8 +22,8 @@ def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -32,7 +32,7 @@ def check_equal(first, second, atol=1e-2, verbose=False): np.testing.assert_allclose(x, y, err_msg="param-update mismatch!", atol=atol) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [8, 16]) class TestHybridAdam(DistributedTest): world_size = 1 @@ -43,6 +43,8 @@ class TestHybridAdam(DistributedTest): set_dist_env = False @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + @pytest.mark.skipif(not 
deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test_hybrid_adam_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-adam with half precision not supported on AMD CPUs") diff --git a/tests/unit/ops/aio/test_aio.py b/tests/unit/ops/aio/test_aio.py index f6d175ce67bc..1aa5f647a8aa 100644 --- a/tests/unit/ops/aio/test_aio.py +++ b/tests/unit/ops/aio/test_aio.py @@ -23,12 +23,10 @@ pytest.skip('Skip tests since async-io is not compatible', allow_module_level=True) -def _skip_for_invalid_environment(use_cuda_device=True, use_cuda_pinned_tensor=True): - if not get_accelerator().is_available(): - if use_cuda_device: - pytest.skip("GPU tensors only supported in CUDA environments.") +def _skip_for_invalid_environment(use_cuda_pinned_tensor=True): + if get_accelerator().device_name() != 'cuda': if use_cuda_pinned_tensor: - pytest.skip("CUDA-pinned tensors only supported in CUDA environments.") + pytest.skip("torch.pin_memory is only supported in CUDA environments.") def _get_local_rank(): @@ -37,28 +35,33 @@ def _get_local_rank(): return 0 -def _do_ref_write(tmpdir, index=0): +def _do_ref_write(tmpdir, index=0, file_size=IO_SIZE): file_suffix = f'{_get_local_rank()}_{index}' ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') - ref_buffer = os.urandom(IO_SIZE) + ref_buffer = os.urandom(file_size) with open(ref_file, 'wb') as f: f.write(ref_buffer) return ref_file, ref_buffer +def _get_file_path(tmpdir, file_prefix, index=0): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'{file_prefix}_{file_suffix}.pt') + + def _get_test_write_file(tmpdir, index): file_suffix = f'{_get_local_rank()}_{index}' return os.path.join(tmpdir, f'_aio_write_random_{file_suffix}.pt') -def _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer, index=0): +def _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer, index=0): test_file = _get_test_write_file(tmpdir, index) test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) return test_file, test_buffer -def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, index=0): +def _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, aio_handle=None, index=0): test_file = _get_test_write_file(tmpdir, index) if aio_handle is None: test_buffer = get_accelerator().pin_memory(torch.ByteTensor(list(ref_buffer))) @@ -73,7 +76,7 @@ def _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, aio_handle=None, ind def _validate_handle_state(handle, single_submit, overlap_events): assert handle.get_single_submit() == single_submit assert handle.get_overlap_events() == overlap_events - assert handle.get_thread_count() == IO_PARALLEL + assert handle.get_intra_op_parallelism() == IO_PARALLEL assert handle.get_block_size() == BLOCK_SIZE assert handle.get_queue_depth() == QUEUE_DEPTH @@ -89,12 +92,15 @@ class TestRead(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, 
overlap_events, IO_PARALLEL) - if use_cuda_pinned_tensor: + if use_unpinned_tensor: + aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) else: aio_buffer = h.new_cpu_locked_tensor(IO_SIZE, torch.empty(0, dtype=torch.uint8)) @@ -102,7 +108,7 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.sync_pread(aio_buffer, ref_file) + read_status = h.sync_pread(aio_buffer, ref_file, 0) assert read_status == 1 with open(ref_file, 'rb') as f: @@ -112,14 +118,14 @@ def test_parallel_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, over if not use_cuda_pinned_tensor: h.free_cpu_locked_tensor(aio_buffer) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) use_cpu_locked_tensor = False h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) - if cuda_device: + if use_unpinned_tensor: aio_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) elif use_cuda_pinned_tensor: aio_buffer = get_accelerator().pin_memory(torch.empty(IO_SIZE, dtype=torch.uint8, device='cpu')) @@ -130,7 +136,7 @@ def test_async_read(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap _validate_handle_state(h, single_submit, overlap_events) ref_file, _ = _do_ref_write(tmpdir) - read_status = h.async_pread(aio_buffer, ref_file) + read_status = h.async_pread(aio_buffer, ref_file, 0) assert read_status == 0 wait_status = h.wait() @@ -155,20 +161,23 @@ class TestWrite(DistributedTest): init_distributed = False set_dist_env = False - def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events): - _skip_for_invalid_environment(use_cuda_device=False, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) if use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) _validate_handle_state(h, single_submit, overlap_events) - write_status = h.sync_pwrite(aio_buffer, aio_file) + write_status = h.sync_pwrite(aio_buffer, aio_file, 
0) assert write_status == 1 if not use_cuda_pinned_tensor: @@ -179,25 +188,25 @@ def test_parallel_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, ove filecmp.clear_cache() assert filecmp.cmp(ref_file, aio_file, shallow=False) - @pytest.mark.parametrize("cuda_device", [True, False]) - def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + @pytest.mark.parametrize("use_unpinned_tensor", [True, False]) + def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overlap_events, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_file, ref_buffer = _do_ref_write(tmpdir) h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: - aio_file, aio_buffer = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffer) + if use_unpinned_tensor: + aio_file, aio_buffer = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffer) elif use_cuda_pinned_tensor: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer) else: - aio_file, aio_buffer = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffer, h) + aio_file, aio_buffer = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffer, h) use_cpu_locked_tensor = True _validate_handle_state(h, single_submit, overlap_events) - write_status = h.async_pwrite(aio_buffer, aio_file) + write_status = h.async_pwrite(aio_buffer, aio_file, 0) assert write_status == 0 wait_status = h.wait() @@ -214,7 +223,7 @@ def test_async_write(self, tmpdir, use_cuda_pinned_tensor, single_submit, overla @pytest.mark.sequential @pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) -@pytest.mark.parametrize("cuda_device", [True, False]) +@pytest.mark.parametrize("use_unpinned_tensor", [True, False]) class TestAsyncQueue(DistributedTest): world_size = 1 requires_cuda_env = False @@ -223,8 +232,8 @@ class TestAsyncQueue(DistributedTest): set_dist_env = False @pytest.mark.parametrize("async_queue", [2, 3]) - def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] for i in range(async_queue): @@ -236,7 +245,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) use_cpu_locked_tensor = False - if cuda_device: + if use_unpinned_tensor: aio_buffers = [ torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) @@ -254,7 +263,7 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, cuda_device): _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pread(aio_buffers[i], ref_files[i]) + read_status = h.async_pread(aio_buffers[i], ref_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -270,8 +279,8 @@ def test_read(self, tmpdir, async_queue, use_cuda_pinned_tensor, 
cuda_device): h.free_cpu_locked_tensor(t) @pytest.mark.parametrize("async_queue", [2, 3]) - def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): - _skip_for_invalid_environment(use_cuda_device=cuda_device, use_cuda_pinned_tensor=use_cuda_pinned_tensor) + def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, use_unpinned_tensor): + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) ref_files = [] ref_buffers = [] @@ -287,21 +296,21 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): aio_files = [] aio_buffers = [] for i in range(async_queue): - if cuda_device: - f, buf = _get_test_write_file_and_cuda_buffer(tmpdir, ref_buffers[i], i) + if use_unpinned_tensor: + f, buf = _get_test_write_file_and_unpinned_tensor(tmpdir, ref_buffers[i], i) elif use_cuda_pinned_tensor: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], None, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], None, i) else: - f, buf = _get_test_write_file_and_cpu_buffer(tmpdir, ref_buffers[i], h, i) + f, buf = _get_test_write_file_and_pinned_tensor(tmpdir, ref_buffers[i], h, i) aio_files.append(f) aio_buffers.append(buf) - use_cpu_locked_tensor = not (cuda_device or use_cuda_pinned_tensor) + use_cpu_locked_tensor = not (use_unpinned_tensor or use_cuda_pinned_tensor) _validate_handle_state(h, single_submit, overlap_events) for i in range(async_queue): - read_status = h.async_pwrite(aio_buffers[i], aio_files[i]) + read_status = h.async_pwrite(aio_buffers[i], aio_files[i], 0) assert read_status == 0 wait_status = h.wait() @@ -316,3 +325,79 @@ def test_write(self, tmpdir, use_cuda_pinned_tensor, async_queue, cuda_device): filecmp.clear_cache() assert filecmp.cmp(ref_files[i], aio_files[i], shallow=False) + + +@pytest.mark.parametrize("use_cuda_pinned_tensor", [True, False]) +@pytest.mark.parametrize('file_partitions', [[1, 1, 1], [1, 1, 2], [1, 2, 1], [2, 1, 1]]) +class TestAsyncFileOffset(DistributedTest): + world_size = 1 + + def test_offset_write(self, tmpdir, file_partitions, use_cuda_pinned_tensor): + + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) + ref_file = _get_file_path(tmpdir, '_py_random') + aio_file = _get_file_path(tmpdir, '_aio_random') + partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + if use_cuda_pinned_tensor: + data_buffer = torch.ByteTensor(list(os.urandom(file_size))).pin_memory() + else: + data_buffer = h.new_cpu_locked_tensor(file_size, torch.empty(0, dtype=torch.uint8)) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + ref_fd = open(ref_file, 'wb') + for i in range(len(file_partitions)): + src_buffer = torch.narrow(data_buffer, 0, file_offsets[i], file_partitions[i] * partition_unit_size) + + ref_fd.write(src_buffer.numpy().tobytes()) + ref_fd.flush() + + assert 1 == h.sync_pwrite(buffer=src_buffer, filename=aio_file, file_offset=file_offsets[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + ref_fd.close() + + if not use_cuda_pinned_tensor: + h.free_cpu_locked_tensor(data_buffer) + + def test_offset_read(self, tmpdir, file_partitions, use_cuda_pinned_tensor): + + _skip_for_invalid_environment(use_cuda_pinned_tensor=use_cuda_pinned_tensor) 
+ partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + ref_file, _ = _do_ref_write(tmpdir, 0, file_size) + h = AsyncIOBuilder().load().aio_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + if use_cuda_pinned_tensor: + data_buffer = torch.zeros(file_size, dtype=torch.uint8, device='cpu').pin_memory() + else: + data_buffer = h.new_cpu_locked_tensor(file_size, torch.empty(0, dtype=torch.uint8)) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + with open(ref_file, 'rb') as ref_fd: + for i in range(len(file_partitions)): + ref_fd.seek(file_offsets[i]) + bytes_to_read = file_partitions[i] * partition_unit_size + ref_buf = list(ref_fd.read(bytes_to_read)) + + dst_tensor = torch.narrow(data_buffer, 0, 0, bytes_to_read) + assert 1 == h.sync_pread(dst_tensor, ref_file, file_offsets[i]) + assert dst_tensor.tolist() == ref_buf + + if not use_cuda_pinned_tensor: + h.free_cpu_locked_tensor(data_buffer) diff --git a/tests/unit/ops/aio/test_gds.py b/tests/unit/ops/aio/test_gds.py new file mode 100644 index 000000000000..d97eff452eb5 --- /dev/null +++ b/tests/unit/ops/aio/test_gds.py @@ -0,0 +1,341 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import os +import filecmp +import torch +import deepspeed +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import GDSBuilder +from unit.common import DistributedTest + +KILO_BYTE = 1024 * 256 +BLOCK_SIZE = KILO_BYTE +QUEUE_DEPTH = 2 +IO_SIZE = 4 * BLOCK_SIZE +IO_PARALLEL = 2 + +if not deepspeed.ops.__compatible_ops__[GDSBuilder.NAME]: + pytest.skip('Skip tests since gds is not compatible', allow_module_level=True) + + +def _get_local_rank(): + if get_accelerator().is_available(): + return dist.get_rank() + return 0 + + +def _do_ref_write(tmpdir, index=0, file_size=IO_SIZE): + file_suffix = f'{_get_local_rank()}_{index}' + ref_file = os.path.join(tmpdir, f'_py_random_{file_suffix}.pt') + ref_buffer = os.urandom(file_size) + with open(ref_file, 'wb') as f: + f.write(ref_buffer) + + return ref_file, ref_buffer + + +def _get_file_path(tmpdir, file_prefix, index=0): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'{file_prefix}_{file_suffix}.pt') + + +def _get_test_write_file(tmpdir, index): + file_suffix = f'{_get_local_rank()}_{index}' + return os.path.join(tmpdir, f'_gds_write_random_{file_suffix}.pt') + + +def _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, gds_handle, index=0): + test_file = _get_test_write_file(tmpdir, index) + test_buffer = get_accelerator().ByteTensor(list(ref_buffer)) + gds_handle.pin_device_tensor(test_buffer) + return test_file, test_buffer + + +def _validate_handle_state(handle, single_submit, overlap_events): + assert handle.get_single_submit() == single_submit + assert handle.get_overlap_events() == overlap_events + assert handle.get_intra_op_parallelism() == IO_PARALLEL + assert handle.get_block_size() == BLOCK_SIZE + assert handle.get_queue_depth() == QUEUE_DEPTH + + +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestRead(DistributedTest): + world_size = 1 + reuse_dist_env = True + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_parallel_read(self, tmpdir, 
single_submit, overlap_events): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + _validate_handle_state(h, single_submit, overlap_events) + + ref_file, _ = _do_ref_write(tmpdir) + read_status = h.sync_pread(gds_buffer, ref_file, 0) + assert read_status == 1 + + with open(ref_file, 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffer.tolist() + + h.unpin_device_tensor(gds_buffer) + + def test_async_read(self, tmpdir, single_submit, overlap_events): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + _validate_handle_state(h, single_submit, overlap_events) + + ref_file, _ = _do_ref_write(tmpdir) + read_status = h.async_pread(gds_buffer, ref_file, 0) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + with open(ref_file, 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffer.tolist() + + h.unpin_device_tensor(gds_buffer) + + +@pytest.mark.parametrize("single_submit", [True, False]) +@pytest.mark.parametrize("overlap_events", [True, False]) +class TestWrite(DistributedTest): + world_size = 1 + reuse_dist_env = True + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_parallel_write(self, tmpdir, single_submit, overlap_events): + + ref_file, ref_buffer = _do_ref_write(tmpdir) + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h) + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.sync_pwrite(gds_buffer, gds_file, 0) + assert write_status == 1 + + h.unpin_device_tensor(gds_buffer) + + assert os.path.isfile(gds_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, gds_file, shallow=False) + + def test_async_write(self, tmpdir, single_submit, overlap_events): + ref_file, ref_buffer = _do_ref_write(tmpdir) + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + gds_file, gds_buffer = _get_test_write_file_and_device_buffer(tmpdir, ref_buffer, h) + + _validate_handle_state(h, single_submit, overlap_events) + + write_status = h.async_pwrite(gds_buffer, gds_file, 0) + assert write_status == 0 + + wait_status = h.wait() + assert wait_status == 1 + + h.unpin_device_tensor(gds_buffer) + + assert os.path.isfile(gds_file) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, gds_file, shallow=False) + + +@pytest.mark.sequential +class TestAsyncQueue(DistributedTest): + world_size = 1 + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + @pytest.mark.parametrize("async_queue", [2, 3]) + def test_read(self, tmpdir, async_queue): + + ref_files = [] + for i in range(async_queue): + f, _ = _do_ref_write(tmpdir, i) + ref_files.append(f) + + single_submit = True + overlap_events = True + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_buffers = [ + torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) for _ in range(async_queue) + ] + for buf in 
gds_buffers: + h.pin_device_tensor(buf) + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pread(gds_buffers[i], ref_files[i], 0) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + for i in range(async_queue): + with open(ref_files[i], 'rb') as f: + ref_buffer = list(f.read()) + assert ref_buffer == gds_buffers[i].tolist() + + for t in gds_buffers: + h.unpin_device_tensor(t) + + @pytest.mark.parametrize("async_queue", [2, 3]) + def test_write(self, tmpdir, async_queue): + ref_files = [] + ref_buffers = [] + for i in range(async_queue): + f, buf = _do_ref_write(tmpdir, i) + ref_files.append(f) + ref_buffers.append(buf) + + single_submit = True + overlap_events = True + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, single_submit, overlap_events, IO_PARALLEL) + + gds_files = [] + gds_buffers = [] + for i in range(async_queue): + f, buf = _get_test_write_file_and_device_buffer(tmpdir, ref_buffers[i], h, i) + gds_files.append(f) + gds_buffers.append(buf) + + _validate_handle_state(h, single_submit, overlap_events) + + for i in range(async_queue): + read_status = h.async_pwrite(gds_buffers[i], gds_files[i], 0) + assert read_status == 0 + + wait_status = h.wait() + assert wait_status == async_queue + + for t in gds_buffers: + h.unpin_device_tensor(t) + + for i in range(async_queue): + assert os.path.isfile(gds_files[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_files[i], gds_files[i], shallow=False) + + +@pytest.mark.parametrize("use_new_api", [True, False]) +class TestLockDeviceTensor(DistributedTest): + world_size = 2 + reuse_dist_env = True + if not get_accelerator().is_available(): + init_distributed = False + set_dist_env = False + + def test_pin_device_tensor(self, use_new_api): + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + unpinned_buffer = torch.empty(IO_SIZE, dtype=torch.uint8, device=get_accelerator().device_name()) + if use_new_api: + pinned_buffer = h.new_pinned_device_tensor(unpinned_buffer.numel(), unpinned_buffer) + else: + pinned_buffer = torch.empty_like(unpinned_buffer) + h.pin_device_tensor(pinned_buffer) + + assert unpinned_buffer.device == pinned_buffer.device + assert unpinned_buffer.dtype == pinned_buffer.dtype + assert unpinned_buffer.numel() == pinned_buffer.numel() + + if use_new_api: + h.free_pinned_device_tensor(pinned_buffer) + else: + h.unpin_device_tensor(pinned_buffer) + + +@pytest.mark.parametrize('file_partitions', [[1, 1, 1], [1, 1, 2], [1, 2, 1], [2, 1, 1]]) +class TestAsyncFileOffset(DistributedTest): + world_size = 1 + + def test_offset_write(self, tmpdir, file_partitions): + ref_file = _get_file_path(tmpdir, '_py_random') + aio_file = _get_file_path(tmpdir, '_aio_random') + partition_unit_size = IO_SIZE + file_size = sum(file_partitions) * partition_unit_size + + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + gds_buffer = torch.empty(file_size, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + ref_fd = open(ref_file, 'wb') + for i in range(len(file_partitions)): + src_buffer = torch.narrow(gds_buffer, 0, file_offsets[i], + file_partitions[i] * partition_unit_size).to(device='cpu') + + 
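            # src_buffer is a CPU copy of one partition of the pinned device buffer; it is
            # written twice below: once with plain Python I/O to build the reference file and
            # once through sync_pwrite at the same byte offset of aio_file, so the two files
            # can be compared bit-for-bit at the end of the loop.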
ref_fd.write(src_buffer.numpy().tobytes()) + ref_fd.flush() + + assert 1 == h.sync_pwrite(buffer=src_buffer, filename=aio_file, file_offset=file_offsets[i]) + + filecmp.clear_cache() + assert filecmp.cmp(ref_file, aio_file, shallow=False) + + ref_fd.close() + + h.unpin_device_tensor(gds_buffer) + + def test_offset_read(self, tmpdir, file_partitions): + partition_unit_size = BLOCK_SIZE + file_size = sum(file_partitions) * partition_unit_size + ref_file, _ = _do_ref_write(tmpdir, 0, file_size) + h = GDSBuilder().load().gds_handle(BLOCK_SIZE, QUEUE_DEPTH, True, True, IO_PARALLEL) + + gds_buffer = torch.empty(file_size, dtype=torch.uint8, device=get_accelerator().device_name()) + h.pin_device_tensor(gds_buffer) + + file_offsets = [] + next_offset = 0 + for i in range(len(file_partitions)): + file_offsets.append(next_offset) + next_offset += file_partitions[i] * partition_unit_size + + with open(ref_file, 'rb') as ref_fd: + for i in range(len(file_partitions)): + ref_fd.seek(file_offsets[i]) + bytes_to_read = file_partitions[i] * partition_unit_size + ref_buf = list(ref_fd.read(bytes_to_read)) + + dst_tensor = torch.narrow(gds_buffer, 0, 0, bytes_to_read) + assert 1 == h.sync_pread(dst_tensor, ref_file, file_offsets[i]) + assert dst_tensor.tolist() == ref_buf + + h.unpin_device_tensor(gds_buffer) diff --git a/tests/unit/ops/fp_quantizer/test_fp8_gemm.py b/tests/unit/ops/fp_quantizer/test_fp8_gemm.py new file mode 100644 index 000000000000..d068a05b77bb --- /dev/null +++ b/tests/unit/ops/fp_quantizer/test_fp8_gemm.py @@ -0,0 +1,46 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed + +from deepspeed.ops.op_builder import FPQuantizerBuilder + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + +from deepspeed.ops.fp_quantizer import FP_Quantize, matmul_fp8 + +from deepspeed import get_accelerator +from deepspeed.linear import QuantizationConfig + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +@pytest.mark.parametrize("q_bits", [8], ids=[ + "qbits8", +]) +@pytest.mark.parametrize("M", [1, 2, 4, 8, 32, 64, 128, 256, 512, 1024, 2048]) +def test_fp_quant(dtype, q_bits, M): + device_name = get_accelerator().device_name() + quantization_group_size = 128 + quant_config = QuantizationConfig(q_dtype=FPQuantizerBuilder.get_default_quant_dtype(), + group_size=quantization_group_size) + fpq = FP_Quantize(quantization_config=quant_config) + + N = 8192 + H = 4096 + + x = torch.randn(M, H, dtype=dtype, device=device_name) + weight_bf16 = torch.randn(H, N, dtype=dtype, device=device_name) + + weight, _ = fpq.quantize(weight_bf16.data, q_bits=q_bits, return_meta_tensor=True) + scale = fpq.get_scales() + out = matmul_fp8(x, weight, scale, quantization_group_size, fpq) + + out_q = torch.matmul(x, fpq.dequantize(weight, scale=fpq.scale)) + + error = ((out - out_q).abs() / (out.abs() + 1e-5)).sum() / out.numel() + assert 0.004 > error, f"failed on batch-size {M} with error {error}" diff --git a/tests/unit/ops/fp_quantizer/test_fp_quant.py b/tests/unit/ops/fp_quantizer/test_fp_quant.py new file mode 100644 index 000000000000..e9baf016310e --- /dev/null +++ b/tests/unit/ops/fp_quantizer/test_fp_quant.py @@ -0,0 +1,134 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +from deepspeed.linear import QuantizationConfig + +import deepspeed + +from deepspeed.ops.fp_quantizer import FP_Quantize +from deepspeed.ops.op_builder import FPQuantizerBuilder +from deepspeed.accelerator import get_accelerator + +if not deepspeed.ops.__compatible_ops__[FPQuantizerBuilder.NAME]: + pytest.skip("FPQuantizer op is not available on this system", allow_module_level=True) + +# warning: this import silently JIT builds a set of kernels and may take a minute +from qtorch.quant import float_quantize + + +def qtorch_quantize(input, exp_bits=4, man_bits=3, rounding="nearest", group_size=1024): + ori_dt = input.dtype + ori_shape = input.shape + last_dim = group_size + input = input.view(-1, last_dim) + + q_bits = exp_bits + man_bits + 1 + q_range = FPQuantizerBuilder.get_quant_range(q_bits) + input_to_float = input.float() + input_max = input_to_float.abs().amax(dim=-1, keepdim=True) + + return ((float_quantize(input_to_float / input_max * q_range, exp_bits, man_bits, rounding=rounding) * \ + input_max / q_range).to(ori_dt)).reshape(ori_shape) + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +def test_fp_quant_meta(dtype): + device_name = get_accelerator().device_name() + group_size = 128 + q_bits = 8 + exp_bits = 4 + man_bits = 3 + + quant_config = QuantizationConfig() + quant_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + quant_config.group_size = group_size + fpq = FP_Quantize(quantization_config=quant_config) + + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype) + + ds_x = x.clone().to(device_name) + x_quantized, meta_tensor = fpq.quantize(ds_x, q_bits=q_bits, return_meta_tensor=True) + x_dequantized = fpq.dequantize(x_quantized, q_bits=q_bits, scale=meta_tensor) + + qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=group_size) + qtorch_error = (qtorch_out - x).abs().sum() / x.numel() + ds_error = (x_dequantized - x).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +def test_fp_quant_selective(dtype): + group_size = 128 + q_bits = 8 + exp_bits = 4 + man_bits = 3 + + device_name = get_accelerator().device_name() + + quant_config = QuantizationConfig() + quant_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + quant_config.group_size = group_size + fpq = FP_Quantize(quantization_config=quant_config) + + indexes = torch.zeros(2, dtype=torch.int32, device=device_name) + indexes[0] = 1 + indexes[1] = 3 + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype, device=device_name) + + x = x.reshape(4, 1, x.shape[-1]) + ds_x = x.clone() + x_quantized = fpq.quantize(ds_x, q_bits=q_bits) + x_dequantized = fpq.selective_dequantize(x_quantized, indexes, q_bits=q_bits) + + qtorch_out = qtorch_quantize(x.index_select(0, indexes), + exp_bits=exp_bits, + man_bits=man_bits, + group_size=group_size) + qtorch_error = (qtorch_out - x.index_select(0, indexes)).abs().sum() / x.numel() + ds_error = (x_dequantized - x.index_select(0, indexes)).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" + + +@pytest.mark.parametrize("dtype", [torch.bfloat16], ids=["bf16"]) +@pytest.mark.parametrize("q_bits", [8, 6, 12], ids=["qbits8", "qbits6", "qbits12"]) +def test_fp_quant(dtype, q_bits): + device_name = 
get_accelerator().device_name() + + quant_config = QuantizationConfig() + quant_config.q_dtype = FPQuantizerBuilder.get_default_quant_dtype() + quant_config.group_size = 128 + fpq = FP_Quantize(quantization_config=quant_config) + + for i in range(10): + x = torch.rand(4, 1024, dtype=dtype) + + ds_x = x.clone().to(device_name) + x_quantized = fpq.quantize(ds_x, q_bits=q_bits) + x_dequantized = fpq.dequantize(x_quantized, q_bits=q_bits) + + if q_bits == 8: + exp_bits = 4 + man_bits = 3 + elif q_bits == 6: + exp_bits = 3 + man_bits = 2 + elif q_bits == 12: + exp_bits = 4 + man_bits = 7 + else: + raise ValueError(f"unknown {q_bits=}") + + qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=quant_config.group_size) + + qtorch_error = (qtorch_out - x).abs().sum() / x.numel() + ds_error = (x_dequantized - x).abs().sum() / x.numel() + + assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}" diff --git a/tests/unit/ops/lion/test_cpu_lion.py b/tests/unit/ops/lion/test_cpu_lion.py index 61a069af3257..dce027e286fb 100644 --- a/tests/unit/ops/lion/test_cpu_lion.py +++ b/tests/unit/ops/lion/test_cpu_lion.py @@ -14,15 +14,12 @@ from deepspeed.ops.op_builder import CPULionBuilder from unit.common import DistributedTest -if not deepspeed.ops.__compatible_ops__[CPULionBuilder.NAME]: - pytest.skip("cpu-lion is not compatible", allow_module_level=True) - pytest.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower() def check_equal(first, second, atol=1e-2, verbose=False): - x = first.detach().numpy() - y = second.detach().numpy() + x = first.detach().float().numpy() + y = second.detach().float().numpy() print("ATOL", atol) if verbose: print("x = {}".format(x.flatten())) @@ -43,7 +40,7 @@ def _compare_optimizers(model_size, param1, optimizer1, param2, optimizer2): check_equal(param1.float().norm(), param2.float().cpu().norm(), atol=tolerance, verbose=True) -@pytest.mark.parametrize('dtype', [torch.half, torch.float], ids=["fp16", "fp32"]) +@pytest.mark.parametrize('dtype', [torch.half, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) @pytest.mark.parametrize('model_size', [ (64), @@ -62,6 +59,8 @@ class TestCPULion(DistributedTest): set_dist_env = False @pytest.mark.skipif(not get_accelerator().is_available(), reason="only supported in CUDA environments.") + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[CPULionBuilder.NAME], + reason="CPULionBuilder has not been implemented on this system.") def test_fused_lion_equal(self, dtype, model_size): if ("amd" in pytest.cpu_vendor) and (dtype == torch.half): pytest.skip("cpu-lion with half precision not supported on AMD CPUs") @@ -84,6 +83,8 @@ def test_fused_lion_equal(self, dtype, model_size): class TestCPULionGPUError(DistributedTest): + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[CPULionBuilder.NAME], + reason="CPULionBuilder has not been implemented on this system.") def test_cpu_lion_gpu_error(self): model_size = 64 from deepspeed.ops.lion import DeepSpeedCPULion diff --git a/tests/unit/ops/lion/test_lion.py b/tests/unit/ops/lion/test_lion.py index b2c3ac2f52df..507ff72ea51a 100644 --- a/tests/unit/ops/lion/test_lion.py +++ b/tests/unit/ops/lion/test_lion.py @@ -12,6 +12,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import CPULionBuilder if torch.half not in get_accelerator().supported_dtypes(): pytest.skip(f"fp16 not supported, valid dtype: 
{get_accelerator().supported_dtypes()}", allow_module_level=True) @@ -27,6 +28,7 @@ class TestLionConfigs(DistributedTest): world_size = 1 reuse_dist_env = True + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[CPULionBuilder.NAME], reason="CPULionBuilder has not been implemented on this system.") def test(self, optimizer, zero_offload, diff --git a/tests/unit/ops/transformer/inference/inference_test_utils.py b/tests/unit/ops/transformer/inference/inference_test_utils.py index 9c7b428c0e68..d63c51267e51 100644 --- a/tests/unit/ops/transformer/inference/inference_test_utils.py +++ b/tests/unit/ops/transformer/inference/inference_test_utils.py @@ -3,6 +3,8 @@ # DeepSpeed Team +from typing import Tuple + import torch from deepspeed.accelerator import get_accelerator @@ -23,10 +25,10 @@ def get_tolerances(): DTYPES = None -def get_dtypes(): +def get_dtypes(include_float=True): global DTYPES if DTYPES is None: - DTYPES = [torch.float16, torch.float32] + DTYPES = [torch.float16, torch.float32] if include_float else [torch.float16] try: if get_accelerator().is_bf16_supported(): DTYPES.append(torch.bfloat16) @@ -35,31 +37,10 @@ def get_dtypes(): return DTYPES -def allclose(x, y): +def allclose(x, y, tolerances: Tuple[int, int] = None): assert x.dtype == y.dtype - rtol, atol = get_tolerances()[x.dtype] + if tolerances is None: + rtol, atol = get_tolerances()[x.dtype] + else: + rtol, atol = tolerances return torch.allclose(x, y, rtol=rtol, atol=atol) - - -def assert_almost_equal(x, y, decimal=2, err_msg=''): - import numpy.testing as npt - if isinstance(x, torch.Tensor): - if x.dtype == torch.bfloat16: - x = x.float() - x = x.cpu().detach().numpy() - if isinstance(y, torch.Tensor): - if y.dtype == torch.bfloat16: - y = y.float() - y = y.cpu().detach().numpy() - npt.assert_array_almost_equal(x, y, err_msg=err_msg, decimal=decimal) - - -def max_diff(a, b): - a = a.to(torch.float32).flatten() - b = b.to(torch.float32).flatten() - diff = torch.abs(a - b) - max_diff_indices = torch.argsort(diff)[-1] - print("Max difference indices:", max_diff_indices) - print("Max difference values:", diff[max_diff_indices]) - print(f"{a[max_diff_indices]} vs {b[max_diff_indices]}") - return max_diff_indices diff --git a/tests/unit/ops/transformer/inference/test_attention.py b/tests/unit/ops/transformer/inference/test_attention.py index 13abe8b915c7..cae201d747a3 100644 --- a/tests/unit/ops/transformer/inference/test_attention.py +++ b/tests/unit/ops/transformer/inference/test_attention.py @@ -7,7 +7,7 @@ import torch import deepspeed from deepspeed.accelerator import get_accelerator -from .inference_test_utils import assert_almost_equal +from .inference_test_utils import allclose # reference timplementation @@ -27,27 +27,27 @@ def ref_torch_attention(q, k, v, mask, sm_scale): @pytest.mark.parametrize("causal", [True, False]) @pytest.mark.parametrize("use_flash", [True, False]) def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float16): - if not deepspeed.HAS_TRITON: - pytest.skip("triton has to be installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") minus_inf = -65504.0 - + dev = deepspeed.accelerator.get_accelerator().device_name() # skip autotune in testing from deepspeed.ops.transformer.inference.triton.matmul_ext import fp16_matmul fp16_matmul.skip_autotune() from deepspeed.ops.transformer.inference.triton.attention import _triton_attention, _triton_packed_flash torch.manual_seed(20) - q = 
torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - k = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) - v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5) + q = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + k = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) + v = torch.empty((BATCH, H, N_CTX, D_HEAD), dtype=dtype, device=dev).normal_(mean=0, std=.5) sm_scale = 0.3 # reference implementation p = torch.matmul(q, k.transpose(2, 3)) * sm_scale score = p - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") - M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda")) + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) + M = torch.tril(torch.ones((N_CTX, N_CTX), device=dev)) if causal: for z in range(BATCH): for h in range(H): @@ -58,7 +58,7 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float context = ref_out # adjust it to expected tensor format and run test - qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device='cuda', requires_grad=False) + qkv = torch.randn((BATCH, N_CTX, 3 * H * D_HEAD), dtype=dtype, device=dev, requires_grad=False) qkv[:, :, :H * D_HEAD] = q.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 1 * H * D_HEAD:2 * H * D_HEAD] = k.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) qkv[:, :, 2 * H * D_HEAD:] = v.permute(0, 2, 1, 3).contiguous().reshape((BATCH, N_CTX, H * D_HEAD)) @@ -66,12 +66,12 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float if use_flash: if not get_accelerator().is_triton_supported(): pytest.skip("triton flash attention is supported when the compute capability > 8.0") - triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device="cuda") + triton_mask = torch.zeros((BATCH, 1, 1, N_CTX), dtype=dtype, device=dev) if not causal: - lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device='cuda') + lengths = torch.randint(N_CTX - 8, N_CTX, (BATCH, 1), device=dev) for i, l in enumerate(lengths): triton_mask[i, ..., l:] = minus_inf - mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device="cuda") + mask = torch.zeros((BATCH, H, N_CTX, N_CTX), dtype=dtype, device=dev) for b in range(BATCH): mask[b, :, :, lengths[b]:] = minus_inf ref_out = ref_torch_attention(q, k, v, mask, sm_scale) @@ -88,4 +88,4 @@ def test_attention(BATCH, H, N_CTX, D_HEAD, causal, use_flash, dtype=torch.float use_triton_flash=False, use_ds_attention=False) tri_out = tri_out.reshape((BATCH, N_CTX, H, D_HEAD)).permute(0, 2, 1, 3) - assert_almost_equal(ref_out, tri_out) + assert (allclose(ref_out, tri_out)) diff --git a/tests/unit/ops/transformer/inference/test_bias_add.py b/tests/unit/ops/transformer/inference/test_bias_add.py index 843c9b889c2b..eb283924f73c 100644 --- a/tests/unit/ops/transformer/inference/test_bias_add.py +++ b/tests/unit/ops/transformer/inference/test_bias_add.py @@ -8,29 +8,21 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_add import BiasAddOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference 
ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def run_bias_add_reference(activations, bias): return activations + bias def run_bias_add_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_add_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_add_bf16(activations, bias) - else: - return inference_module.bias_add_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasAddOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py index d5ab13964974..c995d2a8c46d 100644 --- a/tests/unit/ops/transformer/inference/test_bias_geglu.py +++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py @@ -8,15 +8,13 @@ import deepspeed from deepspeed.ops.op_builder import InferenceBuilder from deepspeed.accelerator import get_accelerator +from deepspeed.ops.transformer.inference.op_binding.gated_activation import GatedActivationOp from deepspeed.utils.types import ActivationFuncType from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def run_bias_geglu_reference(activations, bias): # Expected behavior is that of casting to float32 internally @@ -27,10 +25,7 @@ def run_bias_geglu_reference(activations, bias): def run_bias_geglu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_GELU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_GELU) @pytest.mark.inference_ops @@ -56,17 +51,14 @@ def run_gated_silu_reference(activations, bias): def run_gated_silu_ds(activation, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.gated_activation(activation, bias, ActivationFuncType.GATED_SILU) + return GatedActivationOp()(activation, bias, ActivationFuncType.GATED_SILU) @pytest.mark.inference_ops @pytest.mark.parametrize("batch", [1, 2]) @pytest.mark.parametrize("sequence", [1, 128, 255]) @pytest.mark.parametrize("channels", [512, 1232, 4096]) -@pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) +@pytest.mark.parametrize("dtype", get_dtypes()) def test_gated_silu(batch, sequence, channels, dtype): activation = torch.randn((batch, sequence, channels * 2), dtype=dtype, device=get_accelerator().device_name()) bias = torch.randn((channels * 2), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py index fd82da51380c..f0a09245e890 100644 --- a/tests/unit/ops/transformer/inference/test_bias_gelu.py +++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py @@ -8,15 +8,14 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from 
deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp +from deepspeed.utils.torch import required_torch_version from .inference_test_utils import allclose, get_dtypes -from packaging import version as pkg_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def run_bias_gelu_reference(activations, bias): # Expected behavior is that of casting to float32 internally and using the tanh approximation @@ -25,15 +24,8 @@ def run_bias_gelu_reference(activations, bias): def run_bias_gelu_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_gelu_bf16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops @@ -42,7 +34,7 @@ def run_bias_gelu_ds(activations, bias): @pytest.mark.parametrize("channels", [512, 1232, 4096]) @pytest.mark.parametrize("dtype", get_dtypes()) def test_bias_gelu(batch, sequence, channels, dtype): - if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"): + if not required_torch_version(min_version=1.12): pytest.skip("gelu implementation matches only after torch 1.12") activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name()) diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py index 881af78e92cf..69078f9f7646 100644 --- a/tests/unit/ops/transformer/inference/test_bias_relu.py +++ b/tests/unit/ops/transformer/inference/test_bias_relu.py @@ -8,14 +8,13 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_relu import BiasReluOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def run_bias_relu_reference(activations, bias): # Expected behavior is that of casting to float32 internally @@ -23,15 +22,8 @@ def run_bias_relu_reference(activations, bias): def run_bias_relu_ds(activations, bias): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_relu_fp16(activations, bias) - elif activations.dtype == torch.bfloat16: - return inference_module.bias_relu_bf16(activations, bias) - else: - return inference_module.bias_relu_fp32(activations, bias) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasReluOp(config)(activations, bias) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py index de924848bfb4..a58abfdb100c 100644 --- a/tests/unit/ops/transformer/inference/test_gelu.py +++ b/tests/unit/ops/transformer/inference/test_gelu.py @@ -7,13 
+7,13 @@ import torch import deepspeed from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp +from deepspeed.utils.torch import required_torch_version if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def allclose(x, y): assert x.dtype == y.dtype @@ -22,14 +22,11 @@ def allclose(x, y): def version_appropriate_gelu(activations): - global torch_minor_version - if torch_minor_version is None: - torch_minor_version = int(torch.__version__.split('.')[1]) - # If torch version = 1.12 - if torch_minor_version < 12: - return torch.nn.functional.gelu(activations) - else: + # gelu behavior changes (correctly) in torch 1.12 + if required_torch_version(min_version=1.12): return torch.nn.functional.gelu(activations, approximate='tanh') + else: + return torch.nn.functional.gelu(activations) def run_gelu_reference(activations): @@ -42,15 +39,11 @@ def run_gelu_ds(activations, use_triton_ops=False): from deepspeed.ops.transformer.inference.triton import gelu return gelu(activations) + device = deepspeed.accelerator.get_accelerator().device_name() channels = activations.shape[-1] - bias = torch.zeros((channels), dtype=activations.dtype, device='cuda') - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - if activations.dtype == torch.float16: - return inference_module.bias_gelu_fp16(activations, bias) - else: - return inference_module.bias_gelu_fp32(activations, bias) + bias = torch.zeros((channels), dtype=activations.dtype, device=device) + config = DeepSpeedInferenceConfig(dtype=activations.dtype) + return BiasGeluOp(config)(activations, bias) @pytest.mark.inference_ops @@ -60,11 +53,12 @@ def run_gelu_ds(activations, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16]) @pytest.mark.parametrize("use_triton_ops", [True, False]) def test_gelu(batch, sequence, channels, dtype, use_triton_ops): - activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + device = deepspeed.accelerator.get_accelerator().device_name() + activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) activations_ref = activations_ds.clone().detach() - if not deepspeed.HAS_TRITON and use_triton_ops: - pytest.skip("triton has to be installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") ds_out = run_gelu_ds(activations_ds, use_triton_ops) ref_out = run_gelu_reference(activations_ref) assert (allclose(ds_out, ref_out)) diff --git a/tests/unit/ops/transformer/inference/test_layer_norm.py b/tests/unit/ops/transformer/inference/test_layer_norm.py index 711a35213015..4a84add16046 100644 --- a/tests/unit/ops/transformer/inference/test_layer_norm.py +++ b/tests/unit/ops/transformer/inference/test_layer_norm.py @@ -8,7 +8,8 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder -from .inference_test_utils import allclose, get_dtypes, assert_almost_equal +from deepspeed.ops.transformer.inference.op_binding.layer_norm import LayerNormOp +from .inference_test_utils import allclose, get_dtypes try: import triton # noqa: F401 # type: ignore from 
deepspeed.ops.transformer.inference.triton import ( @@ -21,8 +22,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): vals_f = vals.to(torch.float32) @@ -32,10 +31,7 @@ def ref_implementation(vals, gamma, beta, epsilon, channels, dtype): def ds_implementation(vals, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm(vals, gamma, beta, epsilon) + return LayerNormOp()(vals, gamma, beta, epsilon) def ds_triton_implementation(vals, gamma, beta, epsilon): @@ -49,8 +45,8 @@ def ds_triton_implementation(vals, gamma, beta, epsilon): @pytest.mark.parametrize("dtype", get_dtypes()) @pytest.mark.parametrize("use_triton_ops", [False, True]) def test_layer_norm(batch, seq_len, channels, dtype, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops: - pytest.skip("triton has to be installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) gamma = torch.randn((channels), dtype=dtype, device=get_accelerator().current_device_name()) @@ -83,10 +79,7 @@ def residual_ref_implementation(vals, bias, res, gamma, beta, epsilon, channels, def residual_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module._layer_norm_residual(vals, bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual(vals, bias, res, gamma, beta, epsilon) def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): @@ -100,8 +93,8 @@ def residual_ds_triton_implementation(vals, bias, res, gamma, beta, epsilon): @pytest.mark.parametrize("dtype", get_dtypes()) @pytest.mark.parametrize("use_triton_ops", [False, True]) def test_layer_norm_residual(batch, seq_len, channels, dtype, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops: - pytest.skip("triton has to be installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") vals = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) residual = torch.randn((batch, seq_len, channels), dtype=dtype, device=get_accelerator().current_device_name()) @@ -137,10 +130,7 @@ def residual_store_ref_implementation(vals, bias, res, gamma, beta, epsilon, cha def residual_store_ds_implementation(vals, bias, res, gamma, beta, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) + return LayerNormOp.layer_norm_residual_store_pre_ln_res(vals, bias, res, gamma, beta, epsilon) @pytest.mark.inference_ops @@ -173,21 +163,22 @@ def test_layer_norm_residual_store_pre_ln_res(batch, seq_len, channels, dtype): @pytest.mark.parametrize("residual", [True, False]) @pytest.mark.parametrize("input_bias", [True, False]) def test_triton_layer_norm(M, N, dtype, residual, input_bias, eps=1e-5, device='cuda'): - if not deepspeed.HAS_TRITON: - pytest.skip("triton has to be 
installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") + dev = get_accelerator().device_name() torch.manual_seed(0) # create data x_shape = (M, N) w_shape = (x_shape[-1], ) - weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x_bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=False) - x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda') + weight = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x_bias = torch.rand(w_shape, dtype=dtype, device=dev, requires_grad=False) + x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device=dev) dy = .1 * torch.randn_like(x) if residual: - res = torch.rand(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.rand(x_shape, dtype=dtype, device=dev, requires_grad=False) else: - res = torch.zeros(x_shape, dtype=dtype, device='cuda', requires_grad=False) + res = torch.zeros(x_shape, dtype=dtype, device=dev, requires_grad=False) x.requires_grad_(True) # forward pass if residual or input_bias: @@ -197,4 +188,4 @@ def test_triton_layer_norm(M, N, dtype, residual, input_bias, eps=1e-5, device=' y_ref = torch.nn.functional.layer_norm(x + res + (x_bias if input_bias else 0), w_shape, weight, bias, eps).to(dtype) # compare - assert_almost_equal(y_tri, y_ref) + assert (allclose(y_tri, y_ref)) diff --git a/tests/unit/ops/transformer/inference/test_matmul.py b/tests/unit/ops/transformer/inference/test_matmul.py index 804a85750a3a..6f5173bbc827 100644 --- a/tests/unit/ops/transformer/inference/test_matmul.py +++ b/tests/unit/ops/transformer/inference/test_matmul.py @@ -11,9 +11,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def allclose(x, y): assert x.dtype == y.dtype @@ -42,8 +39,8 @@ def run_matmul_ds(a, b, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16]) @pytest.mark.parametrize("use_triton_ops", [True]) def test_matmul_4d(B, H, M, K, N, dtype, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops: - pytest.skip("triton has to be installed for the test") + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") # skip autotune in testing from deepspeed.ops.transformer.inference.triton.matmul_ext import fp16_matmul diff --git a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py index e1c8127a83ac..dcf9f16baaf1 100644 --- a/tests/unit/ops/transformer/inference/test_moe_res_matmult.py +++ b/tests/unit/ops/transformer/inference/test_moe_res_matmult.py @@ -8,24 +8,20 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer.inference.op_binding.moe_res_matmul import MoEResMatmulOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def run_moe_res_matmul_reference(residual, coef1, coef2, output): return residual * coef1 + output * coef2 
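# A minimal, self-contained sketch of the op-binding call pattern these tests migrate to,
# replacing direct calls on InferenceBuilder().load() with op classes such as MoEResMatmulOp
# below or BiasGeluOp in test_bias_gelu.py. The tensor shapes and dtype here are illustrative,
# and the snippet assumes the inference ops are compatible on the running system.
import torch
from deepspeed.accelerator import get_accelerator
from deepspeed.ops.transformer import DeepSpeedInferenceConfig
from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp

device = get_accelerator().device_name()
activations = torch.randn((1, 128, 512), dtype=torch.float16, device=device)
bias = torch.randn((512, ), dtype=torch.float16, device=device)

config = DeepSpeedInferenceConfig(dtype=activations.dtype)  # dtype is taken from the input tensor
ds_out = BiasGeluOp(config)(activations, bias)              # runs the bias+GeLU inference kernel for that dtype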
def run_moe_res_matmul_ds(residual, coef, output): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() coef_t = coef.transpose(-1, -2).contiguous() - return inference_module.moe_res_matmul(residual, coef_t, output) + return MoEResMatmulOp()(residual, coef_t, output) @pytest.mark.inference_ops diff --git a/tests/unit/ops/transformer/inference/test_residual_add.py b/tests/unit/ops/transformer/inference/test_residual_add.py index c2952f74ff2d..cab64d1d0555 100644 --- a/tests/unit/ops/transformer/inference/test_residual_add.py +++ b/tests/unit/ops/transformer/inference/test_residual_add.py @@ -8,6 +8,8 @@ import deepspeed from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.ops.transformer import DeepSpeedInferenceConfig +from deepspeed.ops.transformer.inference.op_binding import ResidualAddOp from .inference_test_utils import get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: @@ -36,11 +38,6 @@ def allclose(x, y): return torch.allclose(x, y, rtol=rtol, atol=atol) -@pytest.fixture(scope="module") -def inference_module(): - return InferenceBuilder().load() - - def res_add_bias_ref(hidden_state, residual, attn_output, attn_bias, final_bias, mp_size=1, pre_attn_norm=True): if pre_attn_norm: hidden_state += (residual + final_bias + attn_output + attn_bias) / mp_size @@ -75,10 +72,10 @@ def run_residual_add_reference(hidden_state, residual, attn_output, attn_bias, f @pytest.mark.parametrize("mp_size", [1, 2]) @pytest.mark.parametrize("pre_attn_norm", [True, False]) @pytest.mark.parametrize("use_triton_ops", [True, False]) -def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, - pre_attn_norm, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops and dtype == torch.float16: - pytest.skip("triton has to be installed for the test") +def test_residual_add(batch, sequence, hidden_dim, dtype, mlp_after_attn, add_bias, mp_size, pre_attn_norm, + use_triton_ops): + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") ds_out = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) residual = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) attn_output = torch.randn((batch, sequence, hidden_dim), dtype=dtype, device=get_accelerator().device_name()) @@ -96,19 +93,9 @@ def test_residual_add(inference_module, batch, sequence, hidden_dim, dtype, mlp_ if use_triton_ops: from deepspeed.ops.transformer.inference.triton import residual_add_bias ds_out = residual_add_bias(*res_add_args) - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - elif dtype == torch.bfloat16: - ds_out = inference_module.residual_add_bias_bf16(*res_add_args) else: - if dtype == torch.float16: - ds_out = inference_module.residual_add_bias_fp16(*res_add_args) - elif dtype == torch.float32: - ds_out = inference_module.residual_add_bias_fp32(*res_add_args) - else: - raise ValueError(f"Unsupported dtype: {dtype}") + config = DeepSpeedInferenceConfig(dtype=dtype) + ds_out = ResidualAddOp(config).residual_add_func(*res_add_args) if not allclose(ds_out, ref_out): print((ds_out - ref_out).abs().max()) diff --git 
a/tests/unit/ops/transformer/inference/test_rms_norm.py b/tests/unit/ops/transformer/inference/test_rms_norm.py index 508a40e12e8d..fde9c9510771 100644 --- a/tests/unit/ops/transformer/inference/test_rms_norm.py +++ b/tests/unit/ops/transformer/inference/test_rms_norm.py @@ -8,13 +8,13 @@ import pytest from deepspeed.accelerator import get_accelerator from deepspeed.ops.op_builder import InferenceBuilder # type: ignore +from deepspeed.ops.transformer.inference.op_binding.pre_rms_norm import PreRMSNormOp +from deepspeed.ops.transformer.inference.op_binding.rms_norm import RMSNormOp from .inference_test_utils import allclose, get_dtypes if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None - def ref_implementation(vals, gamma, epsilon): variance = vals.to(torch.float32).pow(2).mean(-1, keepdim=True) @@ -27,10 +27,7 @@ def ref_implementation(vals, gamma, epsilon): def ds_implementation(vals, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.rms_norm(vals, gamma, epsilon) + return RMSNormOp()(vals, gamma, epsilon) @pytest.mark.inference_ops @@ -51,10 +48,7 @@ def test_rms_norm(batch, seq_len, channels, dtype): def pre_ds_implementation(vals, residual, gamma, epsilon): - global inference_module - if inference_module is None: - inference_module = InferenceBuilder().load() - return inference_module.pre_rms_norm(vals, residual, gamma, epsilon) + return PreRMSNormOp()(vals, residual, gamma, epsilon) def pre_ref_implementation(vals, residual, gamma, epsilon): diff --git a/tests/unit/ops/transformer/inference/test_rope.py b/tests/unit/ops/transformer/inference/test_rope.py new file mode 100644 index 000000000000..1f0ca0578e04 --- /dev/null +++ b/tests/unit/ops/transformer/inference/test_rope.py @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import deepspeed +from deepspeed.ops.op_builder import InferenceBuilder +from deepspeed.accelerator import get_accelerator + +if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: + pytest.skip("Inference ops are not available on this system", allow_module_level=True) + + +@pytest.mark.inference_ops +@pytest.mark.parametrize("num_heads", [64, 32, 16, 8]) +def test_rope_warp_size_alignment(num_heads): + if get_accelerator().device_name() != "cuda": + pytest.skip("This test runs only on GPU") + + batch = 1 + head = 8 + seq_len = 1024 + head_dim = 32 + rotary_dim = 32 + offset = 8 + rotate_half = False + rope_theta = 2 + + cuda0 = torch.device('cuda:0') + query = torch.randn(batch, head, seq_len, head_dim, device=cuda0) + key = torch.randn(batch, head, seq_len, head_dim, device=cuda0) + + inference = InferenceBuilder().load() + # For num_heads values of 64, 32, 16, 8 + # corresponding threads_per_head (defined in apply_rotary_pos_emb.cu) values are 4, 8, 16, 32 + inference.apply_rotary_pos_emb(query, key, rotary_dim, offset, num_heads, rotate_half, rope_theta) diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py index 76046f31e01a..83785ac38ebb 100644 --- a/tests/unit/ops/transformer/inference/test_softmax.py +++ b/tests/unit/ops/transformer/inference/test_softmax.py @@ -11,9 +11,6 @@ if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]: pytest.skip("Inference ops are not available on this system", allow_module_level=True) -inference_module = None -torch_minor_version = None - def allclose(x, y): assert x.dtype == y.dtype @@ -41,9 +38,11 @@ def run_softmax_ds(input, use_triton_ops=False): @pytest.mark.parametrize("dtype", [torch.float16, torch.float32]) @pytest.mark.parametrize("use_triton_ops", [True]) def test_softmax(batch, sequence, channels, dtype, use_triton_ops): - if not deepspeed.HAS_TRITON and use_triton_ops: - pytest.skip("triton has to be installed for the test") - input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device='cuda') + if not deepspeed.get_accelerator().is_triton_supported(): + pytest.skip("triton is not supported on this system") + + device = deepspeed.accelerator.get_accelerator().device_name() + input_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=device) input_ref = input_ds.clone().detach() ds_out = run_softmax_ds(input_ds, use_triton_ops) diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py index 05c6a82ef55a..2a8a4b9b7d82 100644 --- a/tests/unit/pipe/test_pipe_module.py +++ b/tests/unit/pipe/test_pipe_module.py @@ -60,9 +60,12 @@ def batch_input(): class TestPipeModuleSequential(DistributedTest): world_size = 2 + # needs to be set for torch.compile: running torch.compile with daemonic process causes an error + non_daemonic_procs = True @pytest.mark.parametrize("activation_checkpoints", [False, True]) - def test(self, sequential_model, simple_config, batch_input, activation_checkpoints): + @pytest.mark.parametrize("use_compile", [False, True]) + def test(self, sequential_model, simple_config, batch_input, activation_checkpoints, use_compile): base_model = copy.deepcopy(sequential_model) base_input = batch_input.clone().detach() base_output = base_model(base_input) @@ -71,7 +74,8 @@ def test(self, sequential_model, simple_config, batch_input, activation_checkpoi pipe_model = copy.deepcopy(sequential_model) pipe_model = 
PipelineModule(layers=pipe_model, num_stages=2) - + if (use_compile): + pipe_model.compile() # Ensure all parameters are accounted for. my_params = sum(p.numel() for p in pipe_model.parameters()) total_pipe_params = torch.LongTensor([my_params]).to(get_accelerator().device_name()) diff --git a/tests/unit/profiling/flops_profiler/test_flops_profiler.py b/tests/unit/profiling/flops_profiler/test_flops_profiler.py index bbcb01b489f4..c72deecf287f 100644 --- a/tests/unit/profiling/flops_profiler/test_flops_profiler.py +++ b/tests/unit/profiling/flops_profiler/test_flops_profiler.py @@ -9,7 +9,7 @@ from deepspeed.profiling.flops_profiler import get_model_profile from unit.simple_model import SimpleModel, random_dataloader from unit.common import DistributedTest -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator if torch.half not in get_accelerator().supported_dtypes(): diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py index 0232457a4f9c..dd3bcd7fb6bd 100644 --- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py +++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py @@ -8,6 +8,7 @@ import pytest import torch import deepspeed +from deepspeed.pipe import PipelineModule, LayerSpec from deepspeed.accelerator import get_accelerator from copy import deepcopy from unit.common import DistributedTest @@ -62,6 +63,8 @@ def _match_outputs(ref, tgt): def _test_activation_checkpoint(module, *inputs): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") # Move to device module.to(get_accelerator().device_name()) @@ -82,6 +85,8 @@ def _test_activation_checkpoint(module, *inputs): def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") # Move to device module.to(get_accelerator().device_name()) @@ -255,3 +260,52 @@ def test_ckpt_non_tensor_output_ordering(self, non_tensor_output): else: ordering += [torch.is_tensor(non_tensor_output)] _test_activation_checkpoint_ordering(module, ordering, inputs) + + +class TestCheckpointableLayersConfig(DistributedTest): + world_size = 1 + + def test_gpt2_checkpointable_layers(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") + + # Create a simple topology for testing + from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology + topo = PipeModelDataParallelTopology(num_pp=1, num_mp=1, num_dp=1) + + # Create test classes that we want to checkpoint + class TestTransformerLayer(torch.nn.Module): + + def forward(self, x): + return x + + class ParallelTransformerLayerPipe(TestTransformerLayer): + pass + + class GMLPBlock(TestTransformerLayer): + pass + + # Create a mock GPT2 model with different layer types + class TestGPT2ModelPipe(PipelineModule): + + def __init__(self): + self.layers_spec = [ + LayerSpec(ParallelTransformerLayerPipe), + LayerSpec(GMLPBlock), + LayerSpec(torch.nn.Linear, 10, 10), # Should not be checkpointed + ] + + super().__init__(layers=self.layers_spec, + topology=topo, + checkpointable_layers=["GMLPBlock", "ParallelTransformerLayerPipe"]) + + model = TestGPT2ModelPipe() + 
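        # layers_spec is kept on the instance so the specs can be re-built below and passed to
        # _is_checkpointable; only layers whose class names appear in checkpointable_layers
        # should be reported as checkpointable.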
model.to(get_accelerator().device_name()) + + # Build layers manually for testing + layers = [spec.build() for spec in model.layers_spec] + + # Test that _is_checkpointable returns correct values + assert model._is_checkpointable([layers[0]]) == True # ParallelTransformerLayerPipe + assert model._is_checkpointable([layers[1]]) == True # GMLPBlock + assert model._is_checkpointable([layers[2]]) == False # Linear layer diff --git a/tests/unit/runtime/comm/test_coalesced_collectives.py b/tests/unit/runtime/comm/test_coalesced_collectives.py index 8e736c1eaaa6..2d5db192f2ca 100644 --- a/tests/unit/runtime/comm/test_coalesced_collectives.py +++ b/tests/unit/runtime/comm/test_coalesced_collectives.py @@ -7,9 +7,11 @@ """ import torch +import deepspeed import deepspeed.comm as dist -from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced +from deepspeed.runtime.comm.coalesced_collectives import reduce_scatter_coalesced, all_to_all_quant_reduce from deepspeed.accelerator import get_accelerator +import pytest from unit.common import DistributedTest @@ -59,3 +61,101 @@ def test(self): assert torch.allclose(output, torch.zeros_like(output)) elif dist.get_rank() == 1: assert output.shape == (0, ) + + +# Currently we cannot test all_to_all_quant_reduce in non-fallback cases because we don't support multinodes tests. +class TestAllToAllQuantReduceFallback(DistributedTest): + world_size = 2 + + def test_1d_tensor(self): + # case 1: 1D tensor + input = torch.zeros((10, ), dtype=torch.half, device=get_accelerator().current_device_name()) + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") + output = all_to_all_quant_reduce([input], {})[0] + + if dist.get_rank() == 0: + assert output.shape == (5, ) + assert torch.allclose(output, torch.zeros_like(output)) + elif dist.get_rank() == 1: + assert output.shape == (5, ) + assert torch.allclose(output, torch.zeros_like(output)) + + def test_non_divisible(self): + # case 2: tensor size not divisible by global_world_size + input = torch.zeros((7, 7), dtype=torch.half, device=get_accelerator().current_device_name()) + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") + output = all_to_all_quant_reduce([input], {})[0] + + if dist.get_rank() == 0: + assert output.shape == (25, ) + assert torch.allclose(output, torch.zeros_like(output)) + elif dist.get_rank() == 1: + assert output.shape == (24, ) + assert torch.allclose(output, torch.zeros_like(output)) + + +class TestLocoQuantized(DistributedTest): + + world_size = 1 + + @pytest.mark.parametrize("num_bits", [4, 8]) + @pytest.mark.parametrize("tensor_size", [(16, 16), (64, 64)]) + @pytest.mark.parametrize("devices_per_node", [4, 8]) + def test_loco_quantized_reduction(self, num_bits, tensor_size, devices_per_node): + from deepspeed.ops.op_builder import QuantizerBuilder + if not deepspeed.ops.__compatible_ops__[QuantizerBuilder.NAME]: + pytest.skip("QuantizerBuilder is not implemented") + + quantizer_module = QuantizerBuilder().load() + + tensor = torch.randn(tensor_size, device='cuda', dtype=torch.half) + + num_nodes = 2 # Fake world size + total_elements = tensor.numel() + total_devices = devices_per_node * num_nodes + num_groups = max(tensor.shape[0], tensor.shape[1], total_devices) + + # Initialize error_feedback tensor + error_feedback = 
torch.randn(tensor_size, device=tensor.device, dtype=tensor.dtype) + error_feedback_ori = error_feedback.clone() + # Swizzle the original tensor + tensor_reshaped = tensor.reshape(num_nodes, devices_per_node, total_elements // total_devices) + swizzled_tensor = tensor_reshaped.permute(1, 0, 2).reshape(tensor.size()) + + # Perform loco_swizzle_quant + output, scales = quantizer_module.loco_swizzle_quant(tensor, error_feedback, 0.0, num_groups, num_bits, + quantizer_module.Symmetric, 1, num_nodes, + devices_per_node) + + # Compare swizzled_tensor with the output of loco_swizzle_quant + dequantized = quantizer_module.dequantize(output, scales, scales.numel(), num_bits, + quantizer_module.Symmetric).view(tensor.size()) + + assert torch.allclose(swizzled_tensor + error_feedback_ori, dequantized + error_feedback) + + # Calculate elements per group and groups per partition + elements_per_group = total_elements // num_groups + groups_per_partition = num_groups // devices_per_node + + # Reshape dequantized data to match the grouping in loco_quantized_reduction + dequantized_reshaped = dequantized.view(devices_per_node, groups_per_partition, elements_per_group) + + # Perform reduction across devices_per_node dimension + reduced_dequantized = dequantized_reshaped.cumsum(dim=0)[-1] + # Initialize error_feedback tensor + error_feedback = torch.randn(reduced_dequantized.shape, device=tensor.device, dtype=dequantized.dtype) + error_feedback_ori = error_feedback.clone() + + # perform loco_quantized_reduction + output, scales = quantizer_module.loco_quantized_reduction(output, scales, error_feedback, 0.0, num_groups, + num_groups // devices_per_node, num_bits, + quantizer_module.Symmetric, devices_per_node) + + dequantized_reduced = quantizer_module.dequantize(output, scales, scales.numel(), num_bits, + quantizer_module.Symmetric).view(error_feedback.size()) + + assert torch.allclose(reduced_dequantized + error_feedback_ori, dequantized_reduced + error_feedback) diff --git a/tests/unit/runtime/compile/test_compile_zero.py b/tests/unit/runtime/compile/test_compile_zero.py new file mode 100644 index 000000000000..ca80eef8b31e --- /dev/null +++ b/tests/unit/runtime/compile/test_compile_zero.py @@ -0,0 +1,68 @@ +# Copyright (c) Microsoft Corporation. 
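# A minimal sketch of the pattern the new compile tests exercise: build a
# DeepSpeed engine from a ZeRO config and call engine.compile() to apply
# torch.compile to the wrapped module before training. The SimpleNet class,
# batch shapes, and hyperparameters below are illustrative assumptions, not
# taken from this patch; engine.compile() needs PyTorch >= 2.1 (see the
# pytestmark below), and the script is assumed to run under the deepspeed or
# torchrun launcher.

import torch
import deepspeed


class SimpleNet(torch.nn.Module):  # stand-in model, not from this patch

    def __init__(self, dim=10):
        super().__init__()
        self.linear = torch.nn.Linear(dim, dim)
        self.loss = torch.nn.MSELoss()

    def forward(self, x, y):
        return self.loss(self.linear(x), y)


ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "zero_optimization": {"stage": 3},
    "bf16": {"enabled": True},
}

model = SimpleNet()
engine, _, _, _ = deepspeed.initialize(config=ds_config,
                                       model=model,
                                       model_parameters=model.parameters())
engine.compile()  # wrap the underlying module with torch.compile

x = torch.randn(1, 10, device=engine.device, dtype=torch.bfloat16)
y = torch.randn_like(x)
loss = engine(x, y)
engine.backward(loss)
engine.step()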
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch + +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.utils.torch import required_torch_version +from deepspeed.accelerator import get_accelerator + +from unit.runtime.compile.util import compare_loss +from unit.common import DistributedTest +from unit.util import bf16_required_version_check, skip_on_arch + +pytestmark = pytest.mark.skipif(not required_torch_version(min_version=2.1), + reason="Compile tests requires Pytorch version 2.1 or above") + + +class TestZeRO(DistributedTest): + world_size = 2 + non_daemonic_procs = True + + @pytest.mark.parametrize('dtype', [torch.bfloat16, torch.float16, torch.float32]) + @pytest.mark.parametrize('zero_stage', [1, 2, 3]) + @pytest.mark.parametrize('offload_device', [OffloadDeviceEnum.none, OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme]) + def test_compile_zero(self, tmpdir, zero_stage, dtype, offload_device): + if dtype == torch.bfloat16: + skip_on_arch(min_arch=8) + if dtype == torch.bfloat16 and not bf16_required_version_check(): + pytest.skip( + "DeepSpeed BFloat16 tests need NCCL >= 2.10.3, CUDA >=11.0, and HW support for BFloat16 to run correctly" + ) + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU does not support this test yet") + + if offload_device == OffloadDeviceEnum.nvme: + if zero_stage != 3: + pytest.skip(f"Nvme offload not supported for zero stage {zero_stage}") + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": zero_stage, + } + } + + if offload_device == OffloadDeviceEnum.cpu: + config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} + elif offload_device == OffloadDeviceEnum.nvme: + config_dict["zero_optimization"]["offload_optimizer"] = { + "device": offload_device, + "nvme_path": str(tmpdir) + } + if dtype == torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif dtype == torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + compare_loss(self, config_dict, dtype) diff --git a/tests/unit/runtime/compile/util.py b/tests/unit/runtime/compile/util.py new file mode 100644 index 000000000000..d53886a81429 --- /dev/null +++ b/tests/unit/runtime/compile/util.py @@ -0,0 +1,124 @@ +# Copyright (c) Microsoft Corporation. 
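# This helper module gives the compile tests a reproducible baseline: the
# EnableDeterminism context manager below seeds Python, NumPy, and the
# accelerator RNGs (offset by LOCAL_RANK so each rank draws a distinct but
# repeatable stream), sets CUDA_LAUNCH_BLOCKING and CUBLAS_WORKSPACE_CONFIG,
# enables torch.use_deterministic_algorithms(True), and restores the previous
# state on exit. compare_loss() then trains an eager baseline engine and a
# compiled target engine on identical data and asserts that their losses and
# parameters stay within loose tolerances.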
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import random +import os +import numpy as np +from copy import deepcopy + +import torch + +import deepspeed +from deepspeed.accelerator import get_accelerator +from deepspeed.runtime.zero import GatheredParameters + +from unit.simple_model import SimpleModel +from typing import Callable, Any + + +class EnableDeterminism: + + def __init__(self, seed: int): + local_rank = int(os.getenv("LOCAL_RANK", "0")) + + self.seed = seed + local_rank + self.saved_random_state = None + self.saved_np_random_state = None + self.saved_cuda_launch_blocking = None + self.saved_cublas_workspace_config = None + self.saved_deterministic_algorithms = None + + def __enter__(self): + self.saved_random_state = random.getstate() + self.saved_np_random_state = np.random.get_state() + self.saved_acc_rng_state = get_accelerator().get_rng_state() + self.saved_cuda_launch_blocking = os.environ.get("CUDA_LAUNCH_BLOCKING", "") + self.saved_cublas_workspace_config = os.environ.get("CUBLAS_WORKSPACE_CONFIG", "") + self.saved_deterministic_algorithms = torch.are_deterministic_algorithms_enabled() + + random.seed(self.seed) + np.random.seed(self.seed) + get_accelerator().manual_seed(self.seed) + get_accelerator().manual_seed_all(self.seed) + + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + torch.use_deterministic_algorithms(True) + + def __exit__(self, type, value, traceback): + random.setstate(self.saved_random_state) + np.random.set_state(self.saved_np_random_state) + get_accelerator().set_rng_state(self.saved_acc_rng_state) + os.environ["CUDA_LAUNCH_BLOCKING"] = self.saved_cuda_launch_blocking + os.environ["CUBLAS_WORKSPACE_CONFIG"] = self.saved_cublas_workspace_config + torch.use_deterministic_algorithms(self.saved_deterministic_algorithms) + + +def enable_determinism(seed: int): + + def decorator(func: Callable) -> Callable: + + def wrapper(*args: Any, **kwargs: Any): + with EnableDeterminism(seed): + return func(*args, **kwargs) + + return wrapper + + return decorator + + +@enable_determinism(123) +def compare_loss(self, config, dtype): + iteration = 5 + hidden_dim = 10 + RTOL = 5e-1 + ATOL = 1e-2 + + device = torch.device(get_accelerator().current_device_name()) + model = SimpleModel(hidden_dim) + + i = get_accelerator().current_device() + baseline_model = deepcopy(model) + baseline_config = deepcopy(config) + baseline_config["zero_optimization"]["stage"] = 0 + baseline_config["zero_optimization"]["offload_optimizer"] = {} + baseline_engine, baseline_optimizer, _, _ = deepspeed.initialize(config=baseline_config, + model=baseline_model, + model_parameters=baseline_model.parameters()) + + if config["zero_optimization"]["stage"] == 3: + with deepspeed.zero.Init(config_dict_or_path=config): + target_model = SimpleModel(hidden_dim) + with GatheredParameters(target_model.parameters(), modifier_rank=0): + for p1, p2 in zip(target_model.parameters(), model.parameters()): + p1.data.copy_(p2.data) + else: + target_model = deepcopy(model) + + target_engine, target_optimizer, _, _ = deepspeed.initialize(config=config, + model=target_model, + model_parameters=target_model.parameters()) + target_engine.compile() + + train_batch_size = config["train_micro_batch_size_per_gpu"] + + xs = [torch.randn(train_batch_size, hidden_dim, device=device, dtype=dtype) for _ in range(iteration)] + ys = [torch.randn_like(x) for x in xs] + + for x, y in zip(xs, ys): + baseline_loss = baseline_engine(x, y) + target_loss = target_engine(x, y) + + 
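        # The compiled engine is expected to track the eager baseline step for
        # step; losses here, and gathered parameters after training, are
        # compared with loose RTOL/ATOL to absorb numeric drift from compilation.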
assert torch.allclose(baseline_loss, target_loss, rtol=RTOL, atol=ATOL) + + baseline_engine.backward(baseline_loss) + target_engine.backward(target_loss) + + baseline_optimizer.step() + target_optimizer.step() + + with GatheredParameters(target_engine.parameters()): + for p1, p2 in zip(baseline_engine.parameters(), target_engine.parameters()): + assert torch.allclose(p1.to(dtype), p2, rtol=RTOL, atol=ATOL) diff --git a/tests/unit/runtime/half_precision/onebit/test_onebit.py b/tests/unit/runtime/half_precision/onebit/test_onebit.py index ba795a853be0..1be2d73ef4c0 100644 --- a/tests/unit/runtime/half_precision/onebit/test_onebit.py +++ b/tests/unit/runtime/half_precision/onebit/test_onebit.py @@ -17,7 +17,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from unit.alexnet_model import AlexNetPipe, train_cifar -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator PipeTopo = PipeDataParallelTopology @@ -33,12 +33,18 @@ pytest.skip("NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5", allow_module_level=True) +if get_accelerator().device_name() == 'hpu': + pytest.skip("1-bit compression is not supported by HPU.", allow_module_level=True) + @pytest.mark.parametrize("dtype", [torch.float32, torch.float16], ids=["fp32", "fp16"]) class TestOneBitAdamBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -80,6 +86,8 @@ class TestOneBitAdamExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -144,6 +152,8 @@ class TestOneBitAdamCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -293,6 +303,8 @@ def test(self, tmpdir): assert optimizer_3.optimizer.adam_freeze_key is False def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -343,6 +355,8 @@ class TestOneBitAdamFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, "grandient_accumulation_steps": 1, @@ -388,6 +402,8 @@ class TestZeroOneAdamBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -432,6 +448,8 @@ class TestZeroOneAdamExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -499,6 +517,8 @@ class TestZeroOneAdamCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -647,6 +667,8 @@ def test(self, tmpdir): assert 
"server_error" not in v, f"Incorrect server error" def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -700,6 +722,8 @@ class TestZeroOneAdamFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, "grandient_accumulation_steps": 1, @@ -748,6 +772,8 @@ class TestOneBitLambBasic(DistributedTest): world_size = 2 def test(self, dtype): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -795,6 +821,8 @@ class TestOneBitLampExpAvgMask(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -864,6 +892,8 @@ class TestOneBitLambCheckpointing(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -1030,6 +1060,8 @@ def test(self, tmpdir): assert optimizer_3.optimizer.lamb_freeze_key is False def test_overflow(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -1086,6 +1118,8 @@ class TestOneBitLambFP16Pipeline(DistributedTest): world_size = 4 def test(self, topo_config): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 4, "grandient_accumulation_steps": 1, @@ -1131,6 +1165,8 @@ class TestCompressedAllReduceBasic(DistributedTest): world_size = 2 def test(self, tmpdir): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") from deepspeed.runtime.comm.nccl import NcclBackend size = dist.get_world_size() diff --git a/tests/unit/runtime/half_precision/test_bf16.py b/tests/unit/runtime/half_precision/test_bf16.py index 3f551fb0fd4a..0af14abc3be5 100644 --- a/tests/unit/runtime/half_precision/test_bf16.py +++ b/tests/unit/runtime/half_precision/test_bf16.py @@ -12,6 +12,7 @@ from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader from unit.util import bf16_required_version_check from deepspeed import comm as dist +from deepspeed.accelerator import get_accelerator class TestAdamBF16ZeroOneCycleCompatibility(DistributedTest): @@ -287,8 +288,8 @@ def test(self, stage=2): model.step() -@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bfp16", "fp32"]) -@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bfp16", "default"]) +@pytest.mark.parametrize("comp_type", [torch.float16, torch.bfloat16, torch.float], ids=["fp16", "bf16", "fp32"]) +@pytest.mark.parametrize("comm_type", [torch.float16, torch.bfloat16, None], ids=["fp16", "bf16", "default"]) class TestZeroDtypeCocktail(DistributedTest): world_size = 2 @@ -299,7 +300,11 @@ def test(self, comp_type, comm_type): " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) - type_str = {torch.float16: "fp16", torch.bfloat16: "bfp16"} + if comp_type == torch.float16 or comm_type == torch.float16: + if not 
get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + + type_str = {torch.float16: "fp16", torch.bfloat16: "bf16"} config_dict = { "train_micro_batch_size_per_gpu": 2, diff --git a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py index 2a58fd6b4a57..4b263172261c 100644 --- a/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py +++ b/tests/unit/runtime/half_precision/test_dynamic_loss_scale.py @@ -5,9 +5,12 @@ import torch import deepspeed +from deepspeed.accelerator import get_accelerator +import pytest import numpy as np from unit.common import DistributedTest from unit.simple_model import SimpleModel +from deepspeed.ops.op_builder import FusedLambBuilder def run_model_step(model, gradient_list): @@ -22,6 +25,9 @@ class TestFused(DistributedTest): world_size = 1 def test_no_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -57,6 +63,8 @@ def test_no_overflow(self): expected_loss_scale *= 2 def test_all_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -90,6 +98,8 @@ def test_all_overflow(self): assert optim.cur_iter == (i + 1) def test_some_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -143,10 +153,14 @@ def test_some_overflow(self): assert optim.cur_iter == expected_iteration +@pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") class TestUnfused(DistributedTest): world_size = 1 def test_no_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -181,6 +195,8 @@ def test_no_overflow(self): expected_loss_scale *= 2 def test_all_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -217,6 +233,8 @@ def test_all_overflow(self): assert optim.cur_iter == (i + 1) def test_some_overflow(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, diff --git a/tests/unit/runtime/half_precision/test_fp16.py b/tests/unit/runtime/half_precision/test_fp16.py index 3d5e18b46502..dba15a969459 100644 --- a/tests/unit/runtime/half_precision/test_fp16.py +++ b/tests/unit/runtime/half_precision/test_fp16.py @@ -10,9 +10,10 @@ from deepspeed.ops.adam import FusedAdam from unit.common import DistributedTest from unit.simple_model import SimpleModel, SimpleOptimizer, random_dataloader, SimpleMoEModel, sequence_dataloader -from deepspeed.runtime.utils import required_torch_version +from deepspeed.utils.torch import required_torch_version from deepspeed.accelerator import get_accelerator -from deepspeed.ops.op_builder import CPUAdamBuilder +from deepspeed.ops.op_builder import CPUAdamBuilder, FusedLambBuilder +from deepspeed.moe.utils import split_params_into_different_moe_groups_for_optimizer try: from apex import amp # noqa: F401 # type: ignore @@ -21,11 +22,18 @@ _amp_available = False amp_available = pytest.mark.skipif(not _amp_available, 
reason="apex/amp is not installed") +if torch.half not in get_accelerator().supported_dtypes(): + pytest.skip(f"fp16 not supported, valid dtype: {get_accelerator().supported_dtypes()}", allow_module_level=True) + class TestLambFP32GradClip(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -55,7 +63,11 @@ def test(self): class TestLambFP16(DistributedTest): world_size = 2 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test__basic(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -80,7 +92,11 @@ def test__basic(self): model.backward(loss) model.step() + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test_empty_grad(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -143,6 +159,8 @@ class TestAdamwFP16Basic(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 @@ -160,6 +178,8 @@ class TestFP16OptimizerForMoE(DistributedTest): world_size = 2 def test_unfused_gradnorm(self, monkeypatch): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -188,6 +208,8 @@ def mock_unscale_and_clip_grads(total_norm, apply_scale=True): engine.step() def test_fused_gradnorm(self, monkeypatch): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or higher to run correctly") @@ -203,8 +225,10 @@ def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True) # initialize MoE model = SimpleMoEModel(hidden_dim, ep_size=2) + param_group = {'params': [p for p in model.parameters()], 'name': 'random-unique-name'} + params = split_params_into_different_moe_groups_for_optimizer(param_group) # optimizer = torch.optim.AdamW(params=model.parameters()) - optimizer = FusedAdam(params=model.parameters()) + optimizer = FusedAdam(params=params) engine, optimizer, _, _ = deepspeed.initialize(config=config_dict, model=model, optimizer=optimizer, @@ -217,7 +241,11 @@ def mock_unscale_and_clip_grads(grads_groups_flat, total_norm, apply_scale=True) engine.step() @pytest.mark.parametrize("fused_lamb_legacy", [(False), (True)]) + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system.") def test_lamb_gradnorm(self, monkeypatch, fused_lamb_legacy: bool): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if not required_torch_version(min_version=1.8): pytest.skip("DeepSpeed MoE tests need torch 1.8 or 
higher to run correctly") @@ -262,6 +290,8 @@ class TestAdamwFP16EmptyGrad(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 1, "steps_per_print": 1, "fp16": {"enabled": True}} hidden_dim = 10 @@ -281,6 +311,8 @@ class TestAdamFP16ZeroOneCycleCompatibility(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -332,6 +364,8 @@ class TestZeroStaticScale(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload, hidden_dim=4): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -375,6 +409,8 @@ class TestZeroAllowUntestedOptimizer(DistributedTest): world_size = 1 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -408,6 +444,8 @@ class TestZeroEmptyPartition(DistributedTest): world_size = 3 def test(self, zero_stage, use_cpu_offload): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") if use_cpu_offload and not deepspeed.ops.__compatible_ops__[CPUAdamBuilder.NAME]: pytest.skip("cpu-adam is not compatible") @@ -454,6 +492,8 @@ class TestAmp(DistributedTest): world_size = 2 def test_adam_basic(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = {"train_batch_size": 2, "steps_per_print": 1, "amp": {"enabled": True}} hidden_dim = 10 @@ -466,7 +506,11 @@ def test_adam_basic(self): model.backward(loss) model.step() + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedLambBuilder.NAME], + reason="FusedLambBuilder has not been implemented on this system") def test_lamb_basic(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -492,6 +536,8 @@ def test_lamb_basic(self): model.step() def test_adam_O2(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -518,6 +564,8 @@ def test_adam_O2(self): model.step() def test_adam_O2_empty_grad(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -550,6 +598,8 @@ class TestZeroSupportedClientOptimizer(DistributedTest): world_size = 1 def test(self, zero_stage, optimizer_constructor): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -571,6 +621,8 @@ class TestZero2ReduceScatterOff(DistributedTest): world_size = 2 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -610,6 +662,8 @@ class TestFP16AdamTypes(DistributedTest): world_size = 1 def test(self, adam_type, torch_impl): + if not 
get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -642,6 +696,8 @@ class TestZero3LazyScatter(DistributedTest): world_size = 1 def test(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, @@ -677,6 +733,8 @@ class TestZeroEmptyGrad(DistributedTest): world_size = 1 def test(self, stage): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict = { "train_batch_size": 1, "steps_per_print": 1, diff --git a/tests/unit/runtime/pipe/test_pipe.py b/tests/unit/runtime/pipe/test_pipe.py index 88e26290b650..f198762c5fcc 100644 --- a/tests/unit/runtime/pipe/test_pipe.py +++ b/tests/unit/runtime/pipe/test_pipe.py @@ -7,12 +7,15 @@ import torch.nn as nn import pytest +import torch + +import deepspeed import deepspeed.comm as dist from deepspeed.runtime.pipe.topology import PipeDataParallelTopology from deepspeed.runtime.pipe.module import PipelineModule from unit.alexnet_model import AlexNetPipe, train_cifar from unit.common import DistributedTest -from unit.util import skip_on_arch +from unit.util import skip_on_arch, no_child_process_in_deepspeed_io PipeTopo = PipeDataParallelTopology @@ -155,3 +158,95 @@ def test_pipe_use_reentrant(self, topo_config): # the following check could passed on higher version docker: nvcr.io/nvidia/pytorch:23.07-py3(torch2.1.0 cuda12.1) # Check if models have same weights after training # self._check_model_params_equal(base_model, test_model) + + +class DynamicShapeTestLayer(nn.Module): + + def __init__(self, hidden_size): + super().__init__() + self.fc = nn.Linear(hidden_size, hidden_size) + self.shapes = set() + + def forward(self, x): + self.shapes.add(x.shape) + y = self.fc(x) + return y + + +class DynamicShapeTestModel(nn.Module): + + def __init__(self, n_layers, hidden_size): + super().__init__() + self.layers = nn.ModuleList([DynamicShapeTestLayer(hidden_size) for _ in range(n_layers)]) + + +@pytest.mark.parametrize('topo_config', [ + { + "num_pp": 1, + "num_dp": 4 + }, + { + "num_pp": 2, + "num_dp": 2 + }, + { + "num_pp": 4, + "num_dp": 1 + }, +]) +class TestPipeDynamicShape(DistributedTest): + world_size = 4 + + def test_pipe_base(self, topo_config): + """This test checks if the pipeline engine can handle dynamic shapes correctly. + We pass inputs of different shapes to the pipeline engine. 
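        Because the PipelineModule is built with dynamic_shape=True, the engine
        must handle a different activation shape on every batch rather than
        assuming the shape seen on the first one.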
+ """ + + n_iter = 10 + n_layers = 4 + n_samples = 1024 + batch_size = 4 + channel_dims = [8, 16, 32, 64] + hidden_size = 16 + + topo = PipeTopo(**topo_config) + + model = DynamicShapeTestModel(n_layers, hidden_size) + model = PipelineModule(layers=model.layers, topology=topo, loss_fn=nn.MSELoss(), dynamic_shape=True) + + # Each batch has different channel dim but we use the same channel dim in the same batch + xs = [ + torch.randn(channel_dims[(i // batch_size) % len(channel_dims)], hidden_size, dtype=torch.float32) + for i in range(n_samples) + ] + ys = [torch.randn_like(x) for x in xs] + + class CustomDataset(torch.utils.data.Dataset): + + def __init__(self, xs, ys): + self.xs = xs + self.ys = ys + + def __len__(self): + return len(self.xs) + + def __getitem__(self, idx): + return self.xs[idx], self.ys[idx] + + dataset = CustomDataset(xs, ys) + + config_dict["train_batch_size"] = batch_size + + with no_child_process_in_deepspeed_io(): + engine, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=[p for p in model.parameters()], + training_data=dataset) + + for _ in range(n_iter): + _ = engine.train_batch() + + # Check if all layers have seen different shapes + for layer in model.modules(): + if isinstance(layer, DynamicShapeTestLayer): + assert len(layer.shapes) > 1 diff --git a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py index 92da2257bdb0..badd0bcee549 100644 --- a/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py +++ b/tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py @@ -4,9 +4,14 @@ # DeepSpeed Team import torch +import pytest import deepspeed from unit.common import DistributedTest from unit.util import skip_on_arch +from deepspeed.accelerator import get_accelerator + +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) class Model(torch.nn.Module): diff --git a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py index 0689adc08670..6338a16b8dbb 100644 --- a/tests/unit/runtime/sparse_tensor/test_sparse_grads.py +++ b/tests/unit/runtime/sparse_tensor/test_sparse_grads.py @@ -4,11 +4,15 @@ # DeepSpeed Team import torch +import pytest import deepspeed from unit.common import DistributedTest - +from deepspeed.accelerator import get_accelerator import deepspeed.utils.groups as groups +if get_accelerator().device_name() == 'hpu': + pytest.skip("sparse_gradients not supported by HPU.", allow_module_level=True) + class Model(torch.nn.Module): diff --git a/tests/unit/runtime/test_data_efficiency.py b/tests/unit/runtime/test_data_efficiency.py index b9bd9c3aa56e..87fb49aad830 100644 --- a/tests/unit/runtime/test_data_efficiency.py +++ b/tests/unit/runtime/test_data_efficiency.py @@ -7,6 +7,7 @@ import os import deepspeed from deepspeed.accelerator import get_accelerator +import pytest from unit.common import DistributedTest from unit.simple_model import Curriculum_SimpleModel, SimpleModel, random_dataloader, random_dataset @@ -53,6 +54,8 @@ class TestDataEfficiency(DistributedTest): world_size = 2 def test_curriculum_learning(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -64,11 +67,6 @@ def test_curriculum_learning(self): } }, "gradient_clipping": 1.0, - "fp16": { - 
"enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "data_efficiency": { "enabled": True, "seed": 1234, @@ -98,6 +96,10 @@ def test_curriculum_learning(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} def data_post_process(data, data_sampler_state_dict): assert 'dummy_metric' in data_sampler_state_dict['current_difficulties'] @@ -105,7 +107,7 @@ def data_post_process(data, data_sampler_state_dict): hidden_dim = 10 model = SimpleModel(hidden_dim) - dataset = random_dataset(20, hidden_dim, torch.device('cpu'), dtype=torch.half) + dataset = random_dataset(20, hidden_dim, torch.device('cpu')) model, _, data_loader, _ = deepspeed.initialize(config=config_dict, model=model, training_data=dataset, @@ -128,6 +130,8 @@ class TestLegacyCurriculumScheduler(DistributedTest): world_size = 2 def test_fixed_discrete(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -139,11 +143,6 @@ def test_fixed_discrete(self): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "curriculum_learning": { "enabled": True, "curriculum_type": "seqlen", @@ -156,6 +155,10 @@ def test_fixed_discrete(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 ground_truths = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3, 7: 4, 8: 4} @@ -172,6 +175,8 @@ def test_fixed_discrete(self): assert seqlen == true_seqlen, f"Incorrect curriculum schedule" def test_fixed_linear(self): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 2, "steps_per_print": 1, @@ -183,11 +188,6 @@ def test_fixed_linear(self): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True, - "loss_scale": 0, - "initial_scale_power": 16 - }, "curriculum_learning": { "enabled": True, "curriculum_type": "seqlen", @@ -200,6 +200,10 @@ def test_fixed_linear(self): } } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "loss_scale": 0, "initial_scale_power": 16} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 ground_truths = {1: 2, 2: 4, 3: 4, 4: 6, 5: 6, 6: 8, 7: 8, 8: 10, 9: 10, 10: 10} diff --git a/tests/unit/runtime/test_ds_config_dict.py b/tests/unit/runtime/test_ds_config_dict.py index 6cd01644fad5..d06b35e208fe 100644 --- a/tests/unit/runtime/test_ds_config_dict.py +++ b/tests/unit/runtime/test_ds_config_dict.py @@ -47,9 +47,6 @@ def base_config(): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } return config_dict @@ -70,13 +67,11 @@ def _batch_assert(status, ds_config, batch, micro_batch, gas, success): if not success: assert not status - print("Failed but All is well") return assert ds_config.train_batch_size == batch assert ds_config.train_micro_batch_size_per_gpu == micro_batch assert ds_config.gradient_accumulation_steps == gas - print("All is well") #Tests different batch config provided in deepspeed json file @@ -90,7 +85,7 @@ class TestBatchConfig(DistributedTest): def test(self, num_ranks, batch, 
micro_batch, gas, success): assert dist.get_world_size() == num_ranks, \ - 'The test assumes a world size of f{num_ranks}' + f'The test assumes a world size of {num_ranks}' ds_batch_config = get_test_path('ds_batch_config.json') ds_config = DeepSpeedConfig(ds_batch_config) @@ -163,11 +158,19 @@ class TestConfigLoad(DistributedTest): world_size = 1 def test_dict(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) model, _, _, _ = deepspeed.initialize(config=base_config, model=model, model_parameters=model.parameters()) def test_json(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: json.dump(base_config, fp) @@ -176,6 +179,10 @@ def test_json(self, base_config, tmpdir): model, _, _, _ = deepspeed.initialize(config=config_path, model=model, model_parameters=model.parameters()) def test_hjson(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = os.path.join(tmpdir, "config.json") with open(config_path, 'w') as fp: hjson.dump(base_config, fp) @@ -188,6 +195,10 @@ class TestDeprecatedDeepScaleConfig(DistributedTest): world_size = 1 def test(self, base_config, tmpdir): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} config_path = create_config_from_dict(tmpdir, base_config) parser = argparse.ArgumentParser() args = parser.parse_args(args='') @@ -209,6 +220,10 @@ class TestDistInit(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -227,6 +242,12 @@ class TestInitNoOptimizer(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} + if get_accelerator().device_name() == "cpu": + pytest.skip("This test timeout with CPU accelerator") del base_config["optimizer"] hidden_dim = 10 @@ -246,6 +267,10 @@ class TestArgs(DistributedTest): world_size = 1 def test_none_args(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(args=None, model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) @@ -253,6 +278,10 @@ def test_none_args(self, base_config): loss = model(batch[0], batch[1]) def test_no_args(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = 
SimpleModel(hidden_dim=10) model, _, _, _ = deepspeed.initialize(model=model, config=base_config) data_loader = random_dataloader(model=model, total_samples=5, hidden_dim=10, device=model.device) @@ -264,6 +293,10 @@ class TestNoModel(DistributedTest): world_size = 1 def test(self, base_config): + if get_accelerator().is_fp16_supported(): + base_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + base_config["bf16"] = {"enabled": True} model = SimpleModel(hidden_dim=10) with pytest.raises(AssertionError): model, _, _, _ = deepspeed.initialize(model=None, config=base_config) diff --git a/tests/unit/runtime/test_ds_config_model.py b/tests/unit/runtime/test_ds_config_model.py index 87ea747cf423..4d184b2858a8 100644 --- a/tests/unit/runtime/test_ds_config_model.py +++ b/tests/unit/runtime/test_ds_config_model.py @@ -4,18 +4,25 @@ # DeepSpeed Team import pytest -import os import json -from typing import List -from deepspeed.pydantic_v1 import Field, ValidationError +import os +from typing import List, Optional + +from pydantic import Field, ValidationError + from deepspeed.runtime import config as ds_config from deepspeed.runtime.config_utils import DeepSpeedConfigModel class SimpleConf(DeepSpeedConfigModel): param_1: int = 0 - param_2_old: str = Field(None, deprecated=True, new_param="param_2", new_param_fn=(lambda x: [x])) - param_2: List[str] = None + param_2_old: Optional[str] = Field(None, + json_schema_extra={ + "deprecated": True, + "new_param": "param_2", + "new_param_fn": (lambda x: [x]) + }) + param_2: Optional[List[str]] = None param_3: int = Field(0, alias="param_3_alias") diff --git a/tests/unit/runtime/test_ds_initialize.py b/tests/unit/runtime/test_ds_initialize.py index 8ec9f05a0a17..a30f81cedde9 100644 --- a/tests/unit/runtime/test_ds_initialize.py +++ b/tests/unit/runtime/test_ds_initialize.py @@ -17,7 +17,10 @@ from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.lr_schedules import WARMUP_LR, WarmupLR from deepspeed.runtime.config import ADAM_OPTIMIZER -from deepspeed.runtime.utils import see_memory_usage, required_torch_version +from deepspeed.runtime.utils import see_memory_usage +from deepspeed.utils.torch import required_torch_version +from deepspeed.accelerator import get_accelerator +from deepspeed.ops.op_builder import FusedAdamBuilder @pytest.mark.parametrize('zero_stage', [0, 3]) @@ -30,9 +33,6 @@ def test(self, zero_stage): ds_config = { 'train_batch_size': self.world_size, - 'fp16': { - 'enabled': True - }, 'zero_optimization': { "stage": zero_stage, "offload_param": { @@ -40,6 +40,10 @@ def test(self, zero_stage): } } } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} # 20B test #hidden_dim = 16 * 1024 hidden_dim = 4 @@ -49,11 +53,7 @@ def test(self, zero_stage): see_memory_usage('pre-init', force=True) model, _, _, _ = deepspeed.initialize(model=model, config=ds_config) see_memory_usage('post-init', force=True) - data_loader = random_dataloader(model=model, - total_samples=50, - hidden_dim=hidden_dim, - device=model.device, - dtype=torch.half) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) for batch in data_loader: model(batch[0], batch[1]) see_memory_usage('post-fwds', force=True) @@ -68,6 +68,9 @@ def test(self, optimizer_type): def _optimizer_callable(params) -> Optimizer: return AdamW(params=params) + if (optimizer_type is None) and (not 
deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME]): + pytest.skip("FusedAdam is not compatible") + hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -96,6 +99,8 @@ def _optimizer_callable(params) -> Optimizer: class TestConfigOptimizer(DistributedTest): world_size = 1 + @pytest.mark.skipif(not deepspeed.ops.__compatible_ops__[FusedAdamBuilder.NAME], + reason="FusedAdam is not compatible") def test(self, client_parameters): ds_config = {"train_batch_size": 1, "optimizer": {"type": "Adam", "params": {"lr": 0.001}}} @@ -120,6 +125,9 @@ class TestOptimizerImplementation(DistributedTest): reuse_dist_env = True def test(self, optimizer_extension, model_dtype, grad_accum_dtype): + if not get_accelerator().is_fp16_supported(): + if model_dtype == 'fp16' or grad_accum_dtype == 'fp16': + pytest.skip("fp16 is not supported") if optimizer_extension == 'zero1': zero_stage = 1 elif optimizer_extension == 'zero2': @@ -297,3 +305,132 @@ def _lr_scheduler_callable(optimizer) -> _LRScheduler: assert ds_lr_scheduler == client_scheduler else: assert isinstance(ds_lr_scheduler, LambdaLR) + + +@pytest.mark.parametrize("scheduler_type", [None, _LRScheduler, Callable]) +class TestClientLrSchedulerInit(DistributedTest): + world_size = 1 + + def test_same_lrscheler_and_callable(self, scheduler_type): + """ + Expect behavior + + if lr scheduler is defined in code and passed into initialize as arg, + it will be used even this is a lr scheduler has been defined in config. + + Initialize lr scheduler from config when no lr scheduler is defined in code. + """ + + def _my_lambda(epoch): + return epoch // 10 + + def _lr_scheduler_callable(optimizer) -> _LRScheduler: + return LambdaLR(optimizer, _my_lambda) + + config_dict = {'train_batch_size': 1} + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + if scheduler_type is None: + config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} + client_scheduler = None + elif scheduler_type == _LRScheduler: + client_scheduler = LambdaLR(client_optimizer, _my_lambda) + else: + client_scheduler = _lr_scheduler_callable + + _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + if scheduler_type is None: + # in this case, we initialize from config + assert not isinstance(ds_lr_scheduler, LambdaLR) + assert isinstance(ds_lr_scheduler, WarmupLR) + else: + # in this case, we initialize from passed-in scheduler + assert isinstance(ds_lr_scheduler, LambdaLR) + assert not isinstance(ds_lr_scheduler, WarmupLR) + + def test_diff_lrscheler_and_callable(self, scheduler_type): + """ + In this test, + the LambdaLR will be used for lrscheduler type + and the StepLR will be used for callable type + """ + + from torch.optim.lr_scheduler import StepLR + + def _my_lambda(epoch): + return epoch // 10 + + def _lr_scheduler_callable(optimizer) -> _LRScheduler: + return StepLR(optimizer, step_size=30) + + config_dict = {'train_batch_size': 1} + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + if scheduler_type is None: + config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}} + client_scheduler = None + elif scheduler_type == _LRScheduler: + client_scheduler = LambdaLR(client_optimizer, _my_lambda) + else: + client_scheduler = _lr_scheduler_callable + + _, _, _, ds_lr_scheduler = 
deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + if scheduler_type is None: + assert isinstance(ds_lr_scheduler, WarmupLR) + elif scheduler_type == _LRScheduler: + assert isinstance(ds_lr_scheduler, LambdaLR) + else: + # callable + assert isinstance(ds_lr_scheduler, StepLR) + + def test_diff_lrscheler_and_callable_onecyclelr_steplr(self, scheduler_type): + + from deepspeed.runtime.lr_schedules import OneCycle, ONE_CYCLE, CYCLE_MIN_LR, CYCLE_MAX_LR + from torch.optim.lr_scheduler import OneCycleLR, StepLR + + def _lr_scheduler_callable(optimizer) -> _LRScheduler: + return OneCycleLR(optimizer, max_lr=0.01, total_steps=200) + + config_dict = {'train_batch_size': 1} + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + client_optimizer = torch.optim.Adam(model.parameters(), lr=0.01) + + if scheduler_type is None: + config_dict['scheduler'] = {'type': ONE_CYCLE, 'params': {CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0.1}} + client_scheduler = None + elif scheduler_type == _LRScheduler: + client_scheduler = StepLR(client_optimizer, step_size=30) + else: + client_scheduler = _lr_scheduler_callable + + _, _, _, ds_lr_scheduler = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=list(model.parameters()), + optimizer=client_optimizer, + lr_scheduler=client_scheduler) + if scheduler_type is None: + assert isinstance(ds_lr_scheduler, OneCycle) + elif scheduler_type == _LRScheduler: + assert isinstance(ds_lr_scheduler, StepLR) + else: + # callable + assert isinstance(ds_lr_scheduler, OneCycleLR) diff --git a/tests/unit/runtime/test_lr_schedulers.py b/tests/unit/runtime/test_lr_schedulers.py index bcfc485f2b8f..47734c0cd864 100644 --- a/tests/unit/runtime/test_lr_schedulers.py +++ b/tests/unit/runtime/test_lr_schedulers.py @@ -37,6 +37,9 @@ def _verify_staircase_increase(values, step_size): (WARMUP_DECAY_LR, { WARMUP_NUM_STEPS: 10, TOTAL_NUM_STEPS: 20 + }), (WARMUP_COSINE_LR, { + WARMUP_NUM_STEPS: 10, + TOTAL_NUM_STEPS: 20 }), (ONE_CYCLE, { CYCLE_MIN_LR: 0, CYCLE_MAX_LR: 0.1 @@ -71,6 +74,11 @@ def test(self, scheduler_type, params): hidden_dim=hidden_dim, device=model.device, dtype=torch.float) + + true_lrs = lr_scheduler.get_lr() + for group, true_lr in zip(model.optimizer.param_groups, true_lrs): + assert group['lr'] == true_lr, f"True lr {true_lr}, optimizer lr {group['lr']}" + for n, batch in enumerate(data_loader): # get lr before training starts lr_scheduler.get_lr() diff --git a/tests/unit/runtime/test_multi_output_model.py b/tests/unit/runtime/test_multi_output_model.py index d9aba419b158..cda0d4f054d3 100644 --- a/tests/unit/runtime/test_multi_output_model.py +++ b/tests/unit/runtime/test_multi_output_model.py @@ -5,8 +5,9 @@ import torch import deepspeed +from deepspeed.accelerator import get_accelerator from pytest import approx -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.multi_output_model import MultiOutputModel, multi_output_dataloader @@ -28,10 +29,11 @@ def test(self, tmpdir): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 weight_value = 0.1 @@ -53,7 +55,7 @@ def test(self, tmpdir): inputs, targets = batch[:midpoint], batch[midpoint:] loss_tuple = model(inputs, targets) - expected_loss = 
torch.tensor(2.302734375, dtype=torch.half, device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=preferred_dtype(), device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) assert loss.item() == approx(expected_loss.item()) @@ -84,10 +86,11 @@ def test(self, tmpdir): "lr": 0.00015 } }, - "fp16": { - "enabled": True - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 weight_value = 0.1 @@ -111,7 +114,7 @@ def test(self, tmpdir): loss_tuple = model(inputs, targets) assert len(loss_tuple) == 3 - expected_loss = torch.tensor(2.302734375, dtype=torch.half, device=model.device) + expected_loss = torch.tensor(2.302734375, dtype=preferred_dtype(), device=model.device) for loss in loss_tuple: assert loss.shape == torch.Size([]) diff --git a/tests/unit/runtime/test_mup_optimizers.py b/tests/unit/runtime/test_mup_optimizers.py index ebecf73d416f..7666fa9d1c1f 100644 --- a/tests/unit/runtime/test_mup_optimizers.py +++ b/tests/unit/runtime/test_mup_optimizers.py @@ -10,6 +10,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, random_dataloader from mup.shape import set_base_shapes +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize("optimizer, expected_opt_class", [("MuAdam", torch.optim.Adam), @@ -31,14 +32,15 @@ def test(self, optimizer, expected_opt_class, zero_offload): } }, "gradient_clipping": 1.0, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 2, "cpu_offload": zero_offload } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) set_base_shapes(model, None) diff --git a/tests/unit/runtime/test_no_sync_ctxt.py b/tests/unit/runtime/test_no_sync_ctxt.py new file mode 100644 index 000000000000..8c6497013809 --- /dev/null +++ b/tests/unit/runtime/test_no_sync_ctxt.py @@ -0,0 +1,197 @@ +# Copyright (c) Microsoft Corporation. 
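# A minimal sketch of the engine.no_sync() API covered by this new test file,
# assuming a distributed launch (e.g. the deepspeed launcher) and using a toy
# torch.nn.Linear model plus random data as illustrative stand-ins. Inside the
# context, backward() skips gradient reduction so micro-batch gradients only
# accumulate locally; the context is rejected for ZeRO stages 2 and 3, cannot
# be re-entered, and engine.step() must be called outside of it.

import torch
import torch.nn.functional as F
import deepspeed

model = torch.nn.Linear(8, 8)
engine, _, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config={
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 4,
        "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
        "zero_optimization": {"stage": 1},  # no_sync is incompatible with stages 2 and 3
    },
)

for step in range(8):
    x = torch.randn(1, 8, device=engine.device)
    y = torch.randn(1, 8, device=engine.device)
    boundary = (step + 1) % 4 == 0
    if not boundary:
        with engine.no_sync():  # accumulate this micro-batch's gradients locally
            loss = F.mse_loss(engine(x), y)
            engine.backward(loss)
    else:
        loss = F.mse_loss(engine(x), y)  # gradients are reduced on the boundary micro-batch
        engine.backward(loss)
        engine.step()  # calling step() inside no_sync() raises an AssertionError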
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +from contextlib import nullcontext +import torch + +from unit.simple_model import SimpleModel, random_dataloader +from unit.common import DistributedTest + +import deepspeed +import deepspeed.comm as dist +from deepspeed.utils import safe_get_full_grad + + +class TestNoSyncCtxt(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) + @pytest.mark.parametrize("zero_stage", [0, 1, 2, 3]) + def test_zero_stage(self, zero_stage, dtype): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "zero_optimization": { + "stage": zero_stage, + }, + } + + invalid_cfg = zero_stage > 1 + if dtype == torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + elif dtype == torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + + hidden_dim = 64 + total_samples = 32 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + + with pytest.raises(AssertionError) if invalid_cfg else nullcontext() as assertinfo: + with model.no_sync(): + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + if invalid_cfg: + assert ("no_sync context manager is incompatible" in str(assertinfo)) + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) + @pytest.mark.parametrize("zero_stage", [0, 1]) + def test_engine_step(self, zero_stage, dtype): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "zero_optimization": { + "stage": zero_stage, + }, + } + + if dtype == torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + elif dtype == torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + + hidden_dim = 64 + total_samples = 32 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + + with model.no_sync(): + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + with pytest.raises(AssertionError) as assertinfo: + model.step() + assert ("It is illegal to call Engine.step() inside no_sync context manager" in str(assertinfo)) + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16, torch.float32]) + @pytest.mark.parametrize("zero_stage", [0, 1]) + def test_multiple_ctxts(self, zero_stage, dtype): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "zero_optimization": { + "stage": zero_stage, + }, + } + + if dtype == torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + elif dtype == torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + + hidden_dim = 64 + total_samples 
= 32 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=total_samples, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + + param_list = list(model.parameters()) + first_losses = [] + first_grad_norms = [] + with model.no_sync(): + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + first_losses.append(loss.item()) + model.backward(loss) + grad_norm = sum([safe_get_full_grad(p).norm() for p in param_list]) + first_grad_norms.append(grad_norm.item()) + + second_losses = [] + second_grad_norms = [] + + model.zero_grad() + with model.no_sync(): + for _, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + second_losses.append(loss.item()) + model.backward(loss) + grad_norm = sum([safe_get_full_grad(p).norm() for p in param_list]) + second_grad_norms.append(grad_norm.item()) + + assert len(first_losses) == len(second_losses) + for x, y in zip(first_losses, second_losses): + assert x == y + + assert len(first_grad_norms) == len(second_grad_norms) + for x, y in zip(first_grad_norms, second_grad_norms): + assert x == y + + def test_reentry(self): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "zero_optimization": { + "stage": 1, + }, + } + + hidden_dim = 64 + model = SimpleModel(hidden_dim) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + dist.barrier() + + with model.no_sync(): + with pytest.raises(AssertionError) as assertinfo: + with model.no_sync(): + pass + assert ("no_sync context manager reentry is unsupported" in str(assertinfo)) diff --git a/tests/unit/runtime/test_pld.py b/tests/unit/runtime/test_pld.py index 1f602db73b2f..f6da992d5e11 100644 --- a/tests/unit/runtime/test_pld.py +++ b/tests/unit/runtime/test_pld.py @@ -10,6 +10,7 @@ from unit.common import DistributedTest from unit.simple_model import SimpleModel, PLD_SimpleModel, random_dataloader +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize('theta', [0, 0.1, 0.9, 1.0]) @@ -39,15 +40,16 @@ def test_pld_model(self, theta): "lr": 0.0001 } }, - "fp16": { - "enabled": True - }, "progressive_layer_drop": { "enabled": True, "theta": theta, "gamma": gamma } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = PLD_SimpleModel(hidden_dim, empty_grad=False) @@ -80,15 +82,16 @@ def test_non_pld_model(self): "lr": 0.0001 } }, - "fp16": { - "enabled": True - }, "progressive_layer_drop": { "enabled": True, "theta": theta, "gamma": gamma } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim, empty_grad=False) diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py index 5d8478b249be..6fdeb2074246 100644 --- a/tests/unit/runtime/test_runtime_utils.py +++ b/tests/unit/runtime/test_runtime_utils.py @@ -26,10 +26,10 @@ def test_call_to_str(): assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)' -class TestClibGradNorm(DistributedTest): 
+class TestClipGradNorm(DistributedTest): world_size = 2 - def test(self): + def test_gather(self): param1 = torch.nn.Parameter(torch.Tensor([0])) param1.grad = torch.Tensor([1]) param2 = torch.nn.Parameter(torch.Tensor([0])) @@ -50,6 +50,27 @@ def test(self): assert gathered_norm[0] == gathered_norm[1], "norm at rank 0 does not match the norm at rank 1" + def test_clipped_val(self): + max_norm = 0.1 + + def test_params(): + param1 = torch.nn.Parameter(torch.Tensor([0])) + param1.grad = torch.Tensor([1]) + param2 = torch.nn.Parameter(torch.Tensor([0])) + param2.grad = torch.Tensor([1]) + return [param1, param2] + + # This assumes gradients are same on all the ranks and doesn't consider multiple ranks + params_expected = test_params() + torch.nn.utils.clip_grad_norm_(params_expected, max_norm) + + params_actual = test_params() + ds_utils.clip_grad_norm_(params_actual, max_norm=max_norm) + + # This can be allclose + assert torch.equal(params_expected[0].grad, params_actual[0].grad) + assert torch.equal(params_expected[1].grad, params_actual[1].grad) + @pytest.mark.parametrize("check_using_norm", [(False), (True)]) class TestCheckOverflow(DistributedTest): diff --git a/tests/unit/runtime/utils/test_partition.py b/tests/unit/runtime/utils/test_partition.py index e7085ee2c4bd..8f7768d0d730 100644 --- a/tests/unit/runtime/utils/test_partition.py +++ b/tests/unit/runtime/utils/test_partition.py @@ -22,7 +22,6 @@ class TestPartitionedTensor(DistributedTest): def test(self): world = dist.get_world_size() - rank = dist.get_rank() group = dist.new_group(ranks=list(range(world))) @@ -40,12 +39,32 @@ def test(self): assert torch.equal(full, reconstructed) +class TestPartitionedTensorUnEven(DistributedTest): + world_size = 4 + + def test(self): + world = dist.get_world_size() + + group = dist.new_group(ranks=list(range(world))) + + rows = world * 4 - 1 + cols = world + 1 + + full = torch.rand(rows, cols).to(get_accelerator().device_name()) + dist.broadcast(full, src=0, group=group) + part = PartitionedTensor(full, group=group) + + assert len(part.local_size()) == 1 + + reconstructed = part.full() + assert torch.equal(full, reconstructed) + + class TestPartitionedTensorMeta(DistributedTest): world_size = 4 def test(self): world = dist.get_world_size() - rank = dist.get_rank() group = dist.new_group(ranks=list(range(world))) diff --git a/tests/unit/runtime/zero/test_ignore_unused_parameters.py b/tests/unit/runtime/zero/test_ignore_unused_parameters.py index aade488fde42..b1d341486e55 100644 --- a/tests/unit/runtime/zero/test_ignore_unused_parameters.py +++ b/tests/unit/runtime/zero/test_ignore_unused_parameters.py @@ -9,6 +9,7 @@ from deepspeed.ops.op_builder import CPUAdamBuilder import deepspeed +from deepspeed.accelerator import get_accelerator @pytest.mark.parametrize('ignore_unused_parameters', [False, True]) @@ -36,11 +37,11 @@ def test(self, ignore_unused_parameters): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + else: + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = UnusedParametersModel(hidden_dim=hidden_dim) diff --git a/tests/unit/runtime/zero/test_nvme_checkpointing.py b/tests/unit/runtime/zero/test_nvme_checkpointing.py new file mode 100644 index 000000000000..01a75aa64b4e --- /dev/null +++ b/tests/unit/runtime/zero/test_nvme_checkpointing.py @@ -0,0 +1,146 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import os +import pytest +import deepspeed.comm as dist +import torch + +from unit.common import DistributedTest +from unit.simple_model import random_dataloader, SimpleModel + +import deepspeed +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum +from deepspeed.runtime.zero.partition_parameters import Init +from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator + + +class TestNVMeCheckpointing(DistributedTest): + world_size = 1 + + @pytest.mark.parametrize('param_offload_device, optim_offload_device', + [(OffloadDeviceEnum.none, OffloadDeviceEnum.nvme), + (OffloadDeviceEnum.cpu, OffloadDeviceEnum.nvme), + (OffloadDeviceEnum.nvme, OffloadDeviceEnum.none), + (OffloadDeviceEnum.nvme, OffloadDeviceEnum.cpu), + (OffloadDeviceEnum.nvme, OffloadDeviceEnum.nvme)]) + def test_nvme_checkpointing(self, tmpdir, param_offload_device, optim_offload_device): + zero_dir, ckpt_dir = os.path.join(tmpdir, "zero"), os.path.join(tmpdir, "checkpoint") + + first_stage_steps, second_stage_steps = 2, 2 + + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") + + if not deepspeed.ops.__compatible_ops__[AsyncIOBuilder.NAME]: + pytest.skip('Skip tests since async-io is not compatible') + + torch.manual_seed(123) + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015, + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": param_offload_device, + "nvme_path": str(zero_dir) + }, + "offload_optimizer": { + "device": optim_offload_device, + "nvme_path": str(zero_dir) + }, + "sub_group_size": 100, + "stage3_max_live_parameters": 100, + "stage3_param_persistence_threshold": 0, + }, + "aio": { + "block_size": 1048576 # Minimum AIO bytes, anything smaller than this will not be offloaded + } + } + + hidden_dim, nlayers = 2048, 2 + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, nlayers=nlayers, empty_grad=False) + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + model.empty_partition_cache() + + assert first_stage_steps > 0 + + data_loader = random_dataloader(model=model, + total_samples=first_stage_steps, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float16) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + dist.barrier() + model.save_checkpoint(ckpt_dir) + + if second_stage_steps > 0: + second_stage_batches = list( + random_dataloader(model=model, + total_samples=second_stage_steps, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float16)) + dist.barrier() + for n, batch in enumerate(second_stage_batches): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + dist.barrier() + + final_batch = next( + iter( + random_dataloader(model=model, + total_samples=1, + hidden_dim=hidden_dim, + device=model.device, + dtype=torch.float16))) + dist.barrier() + loss_before = float(model(final_batch[0], final_batch[1])) + + # Needed in ZeRO 3. Not doing so can give memory leak + model.destroy() + + # TODO: This should be on the engine? There needs to be a better way. 
+ Init.param_id = 0 + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, nlayers=nlayers, empty_grad=False) + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + + model.load_checkpoint(ckpt_dir) + + if second_stage_steps > 0: + dist.barrier() + for n, batch in enumerate(second_stage_batches): + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + dist.barrier() + + dist.barrier() + loss_after = float(model(final_batch[0], final_batch[1])) + + assert loss_before == loss_after diff --git a/tests/unit/runtime/zero/test_offload_states.py b/tests/unit/runtime/zero/test_offload_states.py new file mode 100644 index 000000000000..44bff480e27b --- /dev/null +++ b/tests/unit/runtime/zero/test_offload_states.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest + +import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator +import torch + +from unit.common import DistributedTest +from unit.simple_model import random_dataloader, SimpleModel + +import deepspeed +from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum, OffloadStateTypeEnum +from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_optimizer_state +from deepspeed.runtime.zero.offload_states import get_state_devices + + +def validate_device(model, device: torch.device, include) -> None: + + def compare_device(state) -> bool: + devices = get_state_devices(model, state) + return len(devices) == 1 and device in devices + + for state in OffloadStateTypeEnum: + if include is None or state in include: + if state == OffloadStateTypeEnum.contiguous_grad_buffer and device == torch.device("cpu"): + assert len(get_state_devices(model, + state)) == 0, f"State {state} must be removed after offload_states()" + else: + assert compare_device(state), f"State {state} is not on device {device}" + + +def run_model(model, param_groups, config_dict, hidden_dim, dtype, include, pin_memory, non_blocking): + # Currently we only support OffloadDeviceEnum.cpu + offload_device = OffloadDeviceEnum.cpu + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=param_groups, config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + hp_params_expected = [safe_get_local_fp32_param(p).clone() for p in model.parameters()] + lp_params_expected = [p.ds_tensor.clone() for p in model.parameters()] + lp_grads_expected = model.optimizer.grad_partitions_flat_buffer.clone() + adam_exp_avg_expected = [safe_get_local_optimizer_state(p, "exp_avg").clone() for p in model.parameters()] + adam_exp_avg_sq = [safe_get_local_optimizer_state(p, "exp_avg_sq").clone() for p in model.parameters()] + + # Start offloading + alloc_before_offload = get_accelerator().memory_allocated() + model.offload_states(include=include, device=offload_device, pin_memory=pin_memory, non_blocking=non_blocking) + alloc_after_offload = get_accelerator().memory_allocated() + assert alloc_after_offload < alloc_before_offload, f"Allocated memory should decrease after offload" + + validate_device(model, torch.device(offload_device.value), include) + + # Reload states + model.reload_states() + assert alloc_after_offload < 
get_accelerator().memory_allocated( + ), f"Allocated memory should increase after offload back" + + # Verify restored states + hp_param_restored = [safe_get_local_fp32_param(p) for p in model.parameters()] + for hp_param_expected, hp_param_restored in zip(hp_params_expected, hp_param_restored): + assert torch.equal(hp_param_expected, hp_param_restored) + + lp_param_restored = [p.ds_tensor for p in model.parameters()] + + for lp_param_expected, lp_param_restored in zip(lp_params_expected, lp_param_restored): + assert torch.equal(lp_param_expected, lp_param_restored) + + assert torch.equal(lp_grads_expected, model.optimizer.grad_partitions_flat_buffer) + + adam_exp_avg_restored = [safe_get_local_optimizer_state(p, "exp_avg") for p in model.parameters()] + for adam_exp_avg_expected, adam_exp_avg_restored in zip(adam_exp_avg_expected, adam_exp_avg_restored): + assert torch.equal(adam_exp_avg_expected, adam_exp_avg_restored) + + adam_exp_avg_sq_restored = [safe_get_local_optimizer_state(p, "exp_avg_sq") for p in model.parameters()] + for adam_exp_avg_sq_expected, adam_exp_avg_sq_restored in zip(adam_exp_avg_sq, adam_exp_avg_sq_restored): + assert torch.equal(adam_exp_avg_sq_expected, adam_exp_avg_sq_restored) + + validate_device(model, torch.device(get_accelerator().current_device_name()), include) + + # Needed in ZeRO 3. Not doing so can give memory leak + model.destroy() + + +@pytest.mark.parametrize("included_state", [ + OffloadStateTypeEnum.hp_params, OffloadStateTypeEnum.lp_params, OffloadStateTypeEnum.optim_states, + OffloadStateTypeEnum.lp_grads, OffloadStateTypeEnum.contiguous_grad_buffer, None +]) +@pytest.mark.parametrize("pin_memory", [False, True]) +@pytest.mark.parametrize("non_blocking", [False, True]) +class TestOffloadStates(DistributedTest): + # Need multiple gpus to test possible hanging + world_size = 2 + + def test_offload_states(self, included_state, pin_memory, non_blocking): + hidden_dim = 1024 + + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "zero_optimization": { + "stage": 3, + } + } + config_dict["bf16"] = {"enabled": True} + + with deepspeed.zero.Init(config_dict_or_path=config_dict): + model = SimpleModel(hidden_dim, nlayers=4) + + param_groups = [{ + "params": [p for n, p in model.named_parameters() if not 'bias' in n], + "weight_decay": 0.1 + }, { + "params": [p for n, p in model.named_parameters() if 'bias' in n], + "weight_decay": 0.0 + }] + include = None if included_state is None else [included_state] + run_model(model, param_groups, config_dict, hidden_dim, torch.bfloat16, include, pin_memory, non_blocking) diff --git a/tests/unit/runtime/zero/test_unwrap_model.py b/tests/unit/runtime/zero/test_unwrap_model.py new file mode 100644 index 000000000000..d75519b67f68 --- /dev/null +++ b/tests/unit/runtime/zero/test_unwrap_model.py @@ -0,0 +1,67 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import deepspeed +from deepspeed.runtime.zero import unwrap_model_for_generation +from deepspeed.accelerator import get_accelerator + +from unit.common import DistributedTest +from unit.simple_model import SimpleModel + +config = { + "train_batch_size": 2, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 0.00015 + } + }, + "zero_optimization": { + "stage": 3, + "stage3_param_persistence_threshold": 1, + "offload_param": { + "device": "cpu", + "pin_memory": True + } + } +} + +if get_accelerator().is_fp16_supported(): + config["fp16"] = {"enabled": True, "loss_scale": 138.} +elif get_accelerator().is_bf16_supported(): + config["bf16"] = {"enabled": True} + + +class TestUnwrapModel(DistributedTest): + # gather across more than 1 gpu + world_size = 2 + + def test(self): + + def hooks_exist(engine): + if engine.optimizer is not None and hasattr(engine.optimizer, "parameter_offload"): + optimizer_offload = engine.optimizer.parameter_offload + elif engine.optimizer is not None: + optimizer_offload = engine.optimizer + + hooks = 0 + for hook in optimizer_offload.forward_hooks: + hooks += 1 + if hooks > 0: + return True + return False + + model = SimpleModel(hidden_dim=100) + engine, _, _, _ = deepspeed.initialize(args=None, model=model, config=config) + + with unwrap_model_for_generation(engine): + # assert no hooks + assert not hooks_exist(engine) + # assert parameters gathered + assert model.linears[0].weight.numel() != 0, "GatheredParameters should give a non-0-sized tensor" + + # assert hooks + assert hooks_exist(engine) diff --git a/tests/unit/runtime/zero/test_zero.py b/tests/unit/runtime/zero/test_zero.py index db81a0578160..2ae2755086f8 100644 --- a/tests/unit/runtime/zero/test_zero.py +++ b/tests/unit/runtime/zero/test_zero.py @@ -14,8 +14,9 @@ from torch.nn.modules.container import ModuleList from torch.nn.modules.loss import L1Loss from torch.nn.parameter import Parameter +from torch.nn.utils import skip_init -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import SimpleModel, random_dataloader import deepspeed @@ -70,11 +71,11 @@ def test(self, zero_stage): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -84,12 +85,19 @@ def test(self, zero_stage): run_unbalanced_gradients(model, data_loader) -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 +# testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227 +@pytest.mark.parametrize("mics_enabled", [True, False]) class TestZero3RepeatForwardLoop(DistributedTest): world_size = 1 - def test(self, zero_stage=3): + def test(self, mics_enabled, zero_stage=3): + if mics_enabled and get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") # force all params to be partitioned by forcing threshold=0 + mics_shard_size = -1 + if mics_enabled: + mics_shard_size = self.world_size + config_dict = { "train_micro_batch_size_per_gpu": 2, "gradient_accumulation_steps": 2, @@ -97,6 +105,7 @@ def test(self, zero_stage=3): "zero_optimization": { "stage": zero_stage, "stage3_param_persistence_threshold": 0, + "mics_shard_size": 
mics_shard_size, }, "optimizer": { "type": "Adam", @@ -104,11 +113,11 @@ def test(self, zero_stage=3): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 class AlbertLikeModel(torch.nn.Module): @@ -135,8 +144,8 @@ def forward(self, x, y): model.step() -# testing the fix https://github.com/microsoft/DeepSpeed/pull/1227 -# also reproduces the https://github.com/microsoft/DeepSpeed/pull/1372 +# testing the fix https://github.com/deepspeedai/DeepSpeed/pull/1227 +# also reproduces the https://github.com/deepspeedai/DeepSpeed/pull/1372 @pytest.mark.parametrize("zero_stage", [2, 3]) @pytest.mark.parametrize("freeze_params", [True, False]) class TestZeroToFP32(DistributedTest): @@ -159,17 +168,17 @@ def test_1_param_group(self, tmpdir, zero_stage, freeze_params): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} class MyModel(torch.nn.Module): def __init__(self, hidden_dim, n_layers, freeze_params): super().__init__() - # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that + # to reproduce https://github.com/deepspeedai/DeepSpeed/pull/1372 it is important that # the number of total elements is uneven: # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total self.ll = torch.nn.ModuleList(torch.nn.Linear(hidden_dim, hidden_dim) for i in range(n_layers)) @@ -253,11 +262,11 @@ def test_2_param_groups(self, tmpdir, zero_stage, freeze_params): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} class MyModel(torch.nn.Module): @@ -359,11 +368,11 @@ def test(self, allgather_bucket_size, zero_stage=2): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -394,11 +403,11 @@ def test(self, zero_stage=2): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim) @@ -618,6 +627,8 @@ def test_param_persistence_threshold(self, param_persistence_threshold): @pytest.mark.parametrize("fp16_enabled", [True, False]) def test_fp16_enabled(self, fp16_enabled): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") self._test(fp16_enabled=fp16_enabled) @pytest.mark.parametrize("contiguous_gradients", [True, False]) @@ -683,11 +694,11 @@ def _test( "lr": 1.0 } }, - "fp16": { - "enabled": fp16_enabled, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + cfg["fp16"] = {"enabled": True, 
"loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + cfg["bf16"] = {"enabled": True} if offload_optimizer: cfg["zero_optimization"]["offload_optimizer"] = { @@ -852,11 +863,11 @@ def forward(self, x: Tensor) -> Tensor: "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_config["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_config["bf16"] = {"enabled": True} with deepspeed.zero.Init(mem_efficient_linear=False, enabled=init_context_manager): model = LargeParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_config) @@ -931,24 +942,24 @@ def forward(self, x: Tensor) -> Tensor: "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_cfg["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_cfg["bf16"] = {"enabled": True} with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=init_context_manager): model = ManyParamModel() ds_engine = _ds_initialize_for_param_partitioning_testing(model, ds_cfg) + dtype = preferred_dtype() for _ in range(3): # test multiple iterations to cover prefetching - activations: List[Tensor] = ds_engine( - torch.ones((param_sz, ), dtype=torch.float16, device=ds_engine.device)) + activations: List[Tensor] = ds_engine(torch.ones((param_sz, ), dtype=dtype, device=ds_engine.device)) assert len(activations) == n_layers partition_sz = math.ceil(param_sz / self.world_size) - expected_activations = torch.empty(param_sz, dtype=torch.float16, device=ds_engine.device) + expected_activations = torch.empty(param_sz, dtype=dtype, device=ds_engine.device) for start_idx in range(0, param_sz, partition_sz): expected_activations[start_idx:start_idx + partition_sz] = dist.get_rank() @@ -1000,11 +1011,11 @@ def __init_weights(self, module): "lr": 1.0 } }, - "fp16": { - "enabled": True, - "loss_scale": 1.0, - }, } + if get_accelerator().is_fp16_supported(): + ds_cfg["fp16"] = {"enabled": True, "loss_scale": 1.0} + elif get_accelerator().is_bf16_supported(): + ds_cfg["bf16"] = {"enabled": True} with deepspeed.zero.Init(config=ds_cfg, mem_efficient_linear=False, enabled=True): model = ModelWhereParentInitializesChildWeights() @@ -1187,6 +1198,83 @@ def create_tensor(vals): _assert_partition_status(ds_engine, {ZeroParamStatus.NOT_AVAILABLE}) +class TestParamPartitioningSkipInit(DistributedTest): + world_size = 2 + + def test(self): + config_dict = { + "train_batch_size": 4, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-4 + } + }, + "zero_optimization": { + "stage": 3 + }, + } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} + hidden_dim = 10 + + class SubModel(torch.nn.Module): + + def __init__(self, input_size, output_size, dropout_prob=0.5, device=None): + super(SubModel, self).__init__() + self.linear = torch.nn.Linear(input_size, output_size, device=device) + self.dropout = torch.nn.Dropout(dropout_prob) + self.module_list = torch.nn.ModuleList([torch.nn.Linear(input_size, output_size, device=device)]) + + def forward(self, x): + x = self.linear(x) + x = self.dropout(x) + x = self.module_list[0](x) + return x + + class MyModel(torch.nn.Module): + + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = 
skip_init(Linear, hidden_dim, hidden_dim) + self.l2 = skip_init(SubModel, hidden_dim, hidden_dim) + self.l3 = torch.nn.Linear(hidden_dim, hidden_dim) + self.cel = torch.nn.CrossEntropyLoss() + self.l4 = skip_init(SubModel, hidden_dim, hidden_dim) + + def forward(self, x, y): + x = self.l1(x) + x = self.l2(x) + x = self.l3(x) + x = self.l4(x) + loss = self.cel(x, y) + val = [x, loss] + return val + + with deepspeed.zero.Init(config=config_dict): + model = MyModel(hidden_dim) + world_size = dist.get_world_size() + ds_tensor_numel = math.ceil(hidden_dim * hidden_dim / world_size) + assert model.l1.weight.ds_tensor.numel() == ds_tensor_numel + assert model.l2.linear.weight.ds_tensor.numel() == ds_tensor_numel + assert model.l2.module_list[0].weight.ds_tensor.numel() == ds_tensor_numel + assert model.l3.weight.ds_tensor.numel() == ds_tensor_numel + assert model.l4.linear.weight.ds_tensor.numel() == ds_tensor_numel + assert model.l4.module_list[0].weight.ds_tensor.numel() == ds_tensor_numel + + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=50, hidden_dim=hidden_dim, device=model.device) + dist.barrier() + for n, batch in enumerate(data_loader): + loss = model(batch[0], batch[1]) + loss = loss[1] + model.backward(loss) + model.step() + + class TestZeroOffloadStage1(DistributedTest): world_size = 2 @@ -1201,9 +1289,6 @@ def test(self): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 1, "offload_optimizer": { @@ -1211,6 +1296,10 @@ def test(self): } }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -1228,6 +1317,8 @@ class TestZero3DictFwd(DistributedTest): world_size = 1 def test(self, return_type): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet") config_dict = { "train_batch_size": 4, "steps_per_print": 1, @@ -1237,13 +1328,14 @@ def test(self, return_type): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 3 }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 class MyModel(torch.nn.Module): @@ -1287,6 +1379,11 @@ class TestZeroAdamOptimizerStepCount(DistributedTest): world_size = 1 def test(self, zero_stage): + # We verify three conditions: + # 1. global_steps starts at 0 + # 2. All subgroups have the same step count + # 3.
The global step count is the same as the step count of the first subgroup + # force all params to be partitioned by forcing threshold=0 config_dict = { "train_micro_batch_size_per_gpu": 2, @@ -1303,11 +1400,11 @@ def test(self, zero_stage): "lr": 1e-3 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 4 model = SimpleModel(hidden_dim=hidden_dim, nlayers=12) @@ -1316,24 +1413,31 @@ def test(self, zero_stage): model_parameters=model.parameters()) data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) - for i, batch in enumerate(data_loader): + assert model.global_steps == 0 + + for batch in data_loader: loss = model(batch[0], batch[1]) model.backward(loss) + + is_gradient_accumulation_boundary = model.is_gradient_accumulation_boundary() model.step() - step_counts = [] - if zero_stage == 3: - for sub_group_id, _ in enumerate(optimizer.fp16_groups): - fp32_param = optimizer.fp32_partitioned_groups_flat[sub_group_id] - state = optimizer.optimizer.state[fp32_param] - step_counts.append(state["step"]) - assert all(step == step_counts[0] for step in step_counts) - elif zero_stage == 1 or zero_stage == 2: - for param_group in optimizer.optimizer.param_groups: - for param in param_group["params"]: - state = optimizer.optimizer.state[param] + if is_gradient_accumulation_boundary: + step_counts = [] + + if zero_stage == 3: + for sub_group_id, _ in enumerate(optimizer.fp16_groups): + fp32_param = optimizer.fp32_partitioned_groups_flat[sub_group_id] + state = optimizer.optimizer.state[fp32_param] step_counts.append(state["step"]) + elif zero_stage == 1 or zero_stage == 2: + for param_group in optimizer.optimizer.param_groups: + for param in param_group["params"]: + state = optimizer.optimizer.state[param] + step_counts.append(state["step"]) + assert all(step == step_counts[0] for step in step_counts) + assert model.global_steps == step_counts[0] @pytest.mark.parametrize("zero_stage", [1, 2, 3]) @@ -1350,13 +1454,14 @@ def test(self, zero_stage): "lr": 1e-4 } }, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": zero_stage }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 class MyModel(torch.nn.Module): @@ -1402,9 +1507,6 @@ def test(self, force_ds_optim): "train_batch_size": 4, "gradient_accumulation_steps": 2, "steps_per_print": 1, - "fp16": { - "enabled": True - }, "zero_optimization": { "stage": 1, "offload_optimizer": { @@ -1413,6 +1515,10 @@ def test(self, force_ds_optim): }, "zero_force_ds_cpu_optimizer": force_ds_optim, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} hidden_dim = 10 model = SimpleModel(hidden_dim) @@ -1434,15 +1540,15 @@ def test_training_partition_cache(self, training): hidden_dim = 10 config_dict = { "train_batch_size": 2, - "fp16": { - "enabled": True, - "initial_scale_power": 8 - }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": hidden_dim, }, } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif 
get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} if training: config_dict["optimizer"] = {"type": "Adam"} @@ -1451,13 +1557,11 @@ def test_training_partition_cache(self, training): model, _, _, _ = deepspeed.initialize(model=model, config=config_dict) - dtype = torch.half data_loader = random_dataloader( model=model, total_samples=6, hidden_dim=hidden_dim, device=model.device, - dtype=dtype, ) for _, batch in enumerate(data_loader): @@ -1481,6 +1585,8 @@ class TestEmptyParameterGroup(DistributedTest): world_size = 1 def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_group): + if dtype == torch.float16 and not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") model = SimpleModel(hidden_dim=4, nlayers=4) param_groups = [ { @@ -1509,7 +1615,7 @@ def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_grou } if use_client_optimizer: - optimizer = deepspeed.ops.adam.FusedAdam(param_groups, lr=0.1) + optimizer = torch.optim.AdamW(param_groups, lr=0.1) model_parameters = model.parameters() else: config_dict["optimizer"] = {"type": "adamw"} @@ -1522,3 +1628,82 @@ def test_empty_param_groups(self, dtype, use_client_optimizer, empty_weight_grou optimizer=optimizer, config=config_dict, ) + + +class TestZero3SwitchModes(DistributedTest): + world_size = 2 + + @pytest.mark.parametrize("prefetch_ratio", [0.0, 0.5, 1.0]) + def test(self, prefetch_ratio, zero_stage=3): + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + + prefetch_bucket_size = int(sum([p.numel() for p in model.parameters(recurse=True)]) * prefetch_ratio) + config_dict = { + "train_micro_batch_size_per_gpu": 2, + "gradient_accumulation_steps": 2, + "zero_optimization": { + "stage": zero_stage, + "stage3_prefetch_bucket_size": prefetch_bucket_size + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + "fp16": { + "enabled": True, + "initial_scale_power": 8 + } + } + + model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters()) + data_loader = random_dataloader(model=model, total_samples=16, hidden_dim=hidden_dim, device=model.device) + + for _ in range(3): + model.train() + for batch in data_loader: + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + model.eval() + with torch.no_grad(): + for batch in data_loader: + loss = model(batch[0], batch[1]) + + +# Avoid overwriting client module id +# https://github.com/deepspeedai/DeepSpeed/issues/6772 +class TestZero3ClientModuleID(DistributedTest): + world_size = 2 + + def test_client_module_id(self): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + }, + "zero_optimization": { + "stage": 3 + }, + } + + class MyModel(torch.nn.Module): + + def __init__(self): + super().__init__() + self.id = 3 # ID arbitrary client usage, e.g. 
GPU placement + self.fc = Linear(128, 128) + + def forward(self, x): + return self.fc(x) + + model = MyModel() + pre_init_m_id = model.id + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + post_init_m_id = model.id + assert pre_init_m_id == post_init_m_id diff --git a/tests/unit/runtime/zero/test_zero_config.py b/tests/unit/runtime/zero/test_zero_config.py index db9fd6516034..8b20eca8c7d2 100644 --- a/tests/unit/runtime/zero/test_zero_config.py +++ b/tests/unit/runtime/zero/test_zero_config.py @@ -48,12 +48,12 @@ def test_zero_config_overlapcomm(): def test_zero_config_offload_configs(): config = DeepSpeedZeroConfig() - assert config.offload_param == None - assert config.offload_optimizer == None + assert config.offload_param is None + assert config.offload_optimizer is None config = DeepSpeedZeroConfig(**{"offload_param": None, "offload_optimizer": None}) - assert config.offload_param == None - assert config.offload_optimizer == None + assert config.offload_param is None + assert config.offload_optimizer is None config = DeepSpeedZeroConfig(**{"offload_param": {}, "offload_optimizer": {}}) assert isinstance(config.offload_param, DeepSpeedZeroOffloadParamConfig) diff --git a/tests/unit/runtime/zero/test_zero_context.py b/tests/unit/runtime/zero/test_zero_context.py index aabe7f0b7f15..1d4fcd60022c 100644 --- a/tests/unit/runtime/zero/test_zero_context.py +++ b/tests/unit/runtime/zero/test_zero_context.py @@ -6,11 +6,13 @@ from types import SimpleNamespace import torch +import pytest import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus, partitioned_param_data_shape import deepspeed.comm as dist +from deepspeed.accelerator import get_accelerator -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import SimpleModel from utils import setup_serial_env @@ -47,16 +49,17 @@ def forward(self, x): "lr": 0.00015 } }, - "fp16": { - "enabled": True, - "loss_scale": 138. 
- }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": 1, } } +if get_accelerator().is_fp16_supported(): + config["fp16"] = {"enabled": True, "loss_scale": 138.} +elif get_accelerator().is_bf16_supported(): + config["bf16"] = {"enabled": True} + class TestZeroGatheredParametersFree(DistributedTest): world_size = 1 @@ -81,6 +84,29 @@ def __init__(self, hidden_dim): assert model.l1.weight.numel() == 0, "outside of GatheredParameters the param should go back to be 0-sized" +class TestMiCSGatheredParametersFree(DistributedTest): + world_size = 1 + + def test(self): + config_dict = {"train_batch_size": 1, "zero_optimization": {"stage": 3, "mics_shard_size": 1}} + hidden_dim = 10 + + class MyModel(torch.nn.Module): + + def __init__(self, hidden_dim): + super(MyModel, self).__init__() + self.l1 = torch.nn.Linear(hidden_dim, hidden_dim) + + with deepspeed.zero.MiCS_Init(config_dict_or_path=config_dict): + model = MyModel(hidden_dim) + + with deepspeed.zero.GatheredParameters(list(model.parameters())): + assert model.l1.weight.numel() != 0, "GatheredParameters should give a non-0-sized tensor" + + # on exit from `GatheredParameters` the gathered params should be freed and not leak memory + assert model.l1.weight.numel() == 0, "outside of GatheredParameters the param should go back to be 0-sized" + + class TestSerialContext(DistributedTest): world_size = 1 init_distributed = False @@ -101,6 +127,8 @@ def test_scattered_init_dist(self): assert dist.is_initialized() def test_scatter_halftype(self): + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") setup_serial_env() with deepspeed.zero.Init(): @@ -190,9 +218,9 @@ def test_throughput_calculation(self): engine.tput_timer.stop(global_step=global_step) duration = engine.tput_timer.end_time - engine.tput_timer.start_time # step elapsed time is reset after gradient accumulation steps - assert engine.tput_timer.step_elapsed_time == ( - 0 if engine.tput_timer.global_step_count != engine.tput_timer.start_step else current_duration + - duration) + assert engine.tput_timer.step_elapsed_time == (0 if engine.tput_timer.global_step_count + != engine.tput_timer.start_step else current_duration + + duration) assert engine.tput_timer.total_elapsed_time == total_duration + duration def test_ext_param_getattr(self): @@ -225,7 +253,7 @@ def forward(self, input): with deepspeed.zero.GatheredParameters(net.linear1.weight): assert net.linear1.weight.numel() == net.dim**2 - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) engine.backward(loss) engine.step() diff --git a/tests/unit/runtime/zero/test_zero_context_ancestry.py b/tests/unit/runtime/zero/test_zero_context_ancestry.py index 21955f5df152..77a8744ab5bc 100644 --- a/tests/unit/runtime/zero/test_zero_context_ancestry.py +++ b/tests/unit/runtime/zero/test_zero_context_ancestry.py @@ -32,7 +32,7 @@ # test that sub-classes get params that aren't prematurely partitioned and thus requiring gathering -# fixed by https://github.com/microsoft/DeepSpeed/pull/1202 +# fixed by https://github.com/deepspeedai/DeepSpeed/pull/1202 class GrandPa(torch.nn.Module): def __init__(self, *args): diff --git a/tests/unit/runtime/zero/test_zero_context_return.py b/tests/unit/runtime/zero/test_zero_context_return.py index 874a8ea3b676..9d49b6d3ba88 100644 --- a/tests/unit/runtime/zero/test_zero_context_return.py +++ b/tests/unit/runtime/zero/test_zero_context_return.py @@ -8,9 +8,10 @@ import 
pytest import deepspeed from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus +from deepspeed.accelerator import get_accelerator from utils import setup_serial_env -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype class DanglingBias(torch.nn.Linear): @@ -119,16 +120,17 @@ def forward(self, input): "lr": 0.00015 } }, - "fp16": { - "enabled": True, - "loss_scale": 138. - }, "zero_optimization": { "stage": 3, "stage3_param_persistence_threshold": 1, } } +if get_accelerator().is_fp16_supported(): + config["fp16"] = {"enabled": True, "loss_scale": 138.} +elif get_accelerator().is_bf16_supported(): + config["bf16"] = {"enabled": True} + class TestReturnParam(DistributedTest): world_size = 1 @@ -142,7 +144,7 @@ def test_ext_param_return(self): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) engine.backward(loss) engine.step() @@ -158,7 +160,7 @@ def test_ext_param_returnobj(self): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(5): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) assert len(net._external_params) == 1 assert len(net.dangler._external_params) == 0 @@ -176,7 +178,7 @@ def test_stage_3_output_type(self, output_type): engine, _, _, _ = deepspeed.initialize(args=args, model=net, model_parameters=net.parameters(), config=config) for _ in range(1): - input = torch.rand(net.dim).to(engine.device).half() + input = torch.rand(net.dim).to(engine.device).to(preferred_dtype()) loss = engine(input) if loss is not None: if isinstance(loss, dict): diff --git a/tests/unit/runtime/zero/test_zero_leaf_module.py b/tests/unit/runtime/zero/test_zero_leaf_module.py new file mode 100644 index 000000000000..74c709883645 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_leaf_module.py @@ -0,0 +1,262 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import deepspeed.comm as dist +import torch + +from unit.common import DistributedTest, preferred_dtype +from unit.simple_model import random_dataloader + +import deepspeed +from deepspeed.utils import set_z3_leaf_modules, unset_z3_leaf_modules, get_z3_leaf_modules, z3_leaf_module +from deepspeed.accelerator import get_accelerator +from torch import nn +import time + + +class ChooseModuleByCounter(torch.nn.Module): + + def __init__(self, hidden_dim): + super(ChooseModuleByCounter, self).__init__() + self.linears = torch.nn.ModuleList( + [torch.nn.Linear(hidden_dim, hidden_dim, bias=False), + torch.nn.Linear(hidden_dim, hidden_dim, bias=False)]) + self.act = torch.nn.ReLU() + self.cel = torch.nn.CrossEntropyLoss() + self.counter = 0 + + def forward(self, x, y): + # This fails without setting this module as a leaf module. + # See the comment in `set_z3_leaf_modules()`. 
+ x = self.linears[self.counter % len(self.linears)](x) + x = self.act(x) + loss = self.cel(x, y) + self.counter += 1 + return x, loss + + +class ChooseModuleByRankModel(torch.nn.Module): + + def __init__(self, hidden_dim): + super(ChooseModuleByRankModel, self).__init__() + self.linears = torch.nn.ModuleList( + [torch.nn.Linear(hidden_dim, hidden_dim, bias=False), + torch.nn.Linear(hidden_dim, hidden_dim, bias=False)]) + self.act = torch.nn.ReLU() + self.cel = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + # Each rank runs only one of the linear layers + x = self.linears[dist.get_rank() % len(self.linears)](x) + x = self.act(x) + loss = self.cel(x, y) + return x, loss + + +class MLPBlock(nn.Module): + + def __init__(self, hidden_dim): + super(MLPBlock, self).__init__() + self.gate_proj = nn.Linear(hidden_dim, hidden_dim, bias=False) + self.up_proj = nn.Linear(hidden_dim, hidden_dim, bias=False) + self.down_proj = nn.Linear(hidden_dim, hidden_dim, bias=False) + self.act_fn = nn.GELU() + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +class FineGrainedBlock(nn.Module): + + def __init__(self, hidden_dim, num_block): + super(FineGrainedBlock, self).__init__() + self.num_block = num_block + self.mlp_layers = torch.nn.ModuleList([MLPBlock(hidden_dim=hidden_dim) for _ in range(self.num_block)]) + + def forward(self, x): + for i in range(self.num_block): + x = self.mlp_layers[i](x) + return x + + +class modelWithFineGrainedBlock(nn.Module): + + def __init__(self, hidden_dim, num_block): + super(modelWithFineGrainedBlock, self).__init__() + self.coarse_grained_layer1 = nn.Linear(hidden_dim, 8 * hidden_dim) + self.coarse_grained_layer2 = nn.Linear(8 * hidden_dim, hidden_dim) + self.fine_grained_layer = FineGrainedBlock(hidden_dim, num_block) + self.cel = torch.nn.CrossEntropyLoss() + + def forward(self, x, y): + x = self.coarse_grained_layer1(x) + x = self.coarse_grained_layer2(x) + x = self.fine_grained_layer(x) + loss = self.cel(x, y) + return x, loss + + +def run_model(model, config_dict, hidden_dim, dtype, requires_grad): + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, + total_samples=10, + hidden_dim=hidden_dim, + device=model.device, + dtype=dtype) + dist.barrier() + for batch in data_loader: + batch[0].requires_grad = requires_grad + loss = model(batch[0], batch[1]) + loss = loss[1] + model.backward(loss) + model.step() + + # Needed in ZeRO 3. 
Not doing so can give memory leak + model.destroy() + + +class TestSetZ3LeafModule(DistributedTest): + # Need multiple gpus to test possible hanging + world_size = 2 + reuse_dist_env = True + + def _test_set_z3_leaf_modules(self, cls, requires_grad): + hidden_dim = 128 + + # `stage3_max_reuse_distance` is set to 0 to cause an error if the module is not set as a leaf module + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": hidden_dim**2, + "stage3_param_persistence_threshold": 0, + "stage3_max_reuse_distance": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + model = cls(hidden_dim) + + assert not z3_leaf_module(model) + set_z3_leaf_modules(model, [cls]) + assert z3_leaf_module(model) + + run_model(model, config_dict, hidden_dim, preferred_dtype(), requires_grad) + + def test_choose_module_by_counter(self): + self._test_set_z3_leaf_modules(ChooseModuleByCounter, True) + + def test_choose_module_by_rank(self): + self._test_set_z3_leaf_modules(ChooseModuleByRankModel, True) + + def test_no_grad_input_error(self): + try: + self._test_set_z3_leaf_modules(ChooseModuleByCounter, False) + raise AssertionError( + "Expected RuntimeError: inputs with requires_grad=False is not supported for a leaf module") + except RuntimeError as e: + pass + + def test_set_unset_leaf_modules(self): + hidden_dim = 128 + model = ChooseModuleByCounter(hidden_dim) + assert len(set_z3_leaf_modules(model, [torch.nn.ModuleList])) == 1, \ + "Expected only one module to be set as a leaf module" + assert len(get_z3_leaf_modules(model)) == 1, "Expected there is only one leaf module" + + assert len(unset_z3_leaf_modules(model, [torch.nn.ModuleList])) == 1, \ + "Expected only one module to be unset as a leaf module" + assert len(get_z3_leaf_modules(model)) == 0, "Expected there is no leaf module" + + def test_set_no_match_class(self): + hidden_dim = 128 + model = ChooseModuleByCounter(hidden_dim) + try: + set_z3_leaf_modules(model, [torch.nn.Conv2d]) + raise AssertionError("Expected error that no module is set as a leaf module") + except ValueError as e: + pass + + +@pytest.mark.parametrize("module_granularity_threshold", [0, 100, 12100, 10000000]) +class TestZ3LeafOptimization(DistributedTest): + world_size = 2 + reuse_dist_env = True + + def test_finegrained_optimization(self, module_granularity_threshold: int): + hidden_dim = 128 + num_block = 16 + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "steps_per_print": 1, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-6 + } + }, + "zero_optimization": { + "stage": 3, + "stage3_prefetch_bucket_size": hidden_dim**2, + "stage3_param_persistence_threshold": 0, + "stage3_max_reuse_distance": 0, + } + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + + def bench_loss_and_time(config): + warm_up_step = 10 + model = modelWithFineGrainedBlock(hidden_dim, num_block) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config) + data_loader = random_dataloader(model=model, + total_samples=20, + hidden_dim=hidden_dim, + device=model.device, + dtype=preferred_dtype()) + dist.barrier() + loss_list = [] + + 
for i, batch in enumerate(data_loader): + if i == warm_up_step: + dist.barrier() + get_accelerator().synchronize() + start_time = time.time() + batch[0].requires_grad = True + loss = model(batch[0], batch[1]) + loss = loss[1] + loss_list.append(loss) + model.backward(loss) + model.step() + get_accelerator().synchronize() + end_time = time.time() + duration = end_time - start_time + model.destroy() + return loss_list, duration + + baseline_loss_list, baseline_exec_time = bench_loss_and_time(config_dict) + + config_dict["zero_optimization"]["stage3_module_granularity_threshold"] = module_granularity_threshold + loss, duration = bench_loss_and_time(config_dict) + + if dist.get_rank() == 0: + print(f"baseline exec time:", baseline_exec_time) + print( + f"finegrained optimization exec time: {duration}, granularity threshold: {module_granularity_threshold} " + ) + assert baseline_loss_list == loss, f"incorrect loss value with threshold:{module_granularity_threshold}" diff --git a/tests/unit/runtime/zero/test_zero_multiple_run.py b/tests/unit/runtime/zero/test_zero_multiple_run.py new file mode 100644 index 000000000000..d4eb3a578cc9 --- /dev/null +++ b/tests/unit/runtime/zero/test_zero_multiple_run.py @@ -0,0 +1,53 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import deepspeed +import torch +from unit.common import DistributedTest, preferred_dtype +from unit.simple_model import SimpleModel, random_dataloader + + +class TestZ3MultipleModelCall(DistributedTest): + world_size = 1 + + def test_z3_multiple_model_call(self): + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 1, + "steps_per_print": 1, + "zero_optimization": { + "stage": 3 + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-3 + } + }, + } + if preferred_dtype() is torch.float16: + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} + elif preferred_dtype() is torch.bfloat16: + config_dict["bf16"] = {"enabled": True} + hidden_dim, nlayers = 2048, 3 + model = SimpleModel(hidden_dim=hidden_dim, nlayers=nlayers) + model_engine, _, _, _ = deepspeed.initialize(config=config_dict, + model=model, + model_parameters=model.parameters()) + data_loader = iter( + random_dataloader(model=model_engine, total_samples=10, hidden_dim=hidden_dim, device=model_engine.device)) + + for n, batch in enumerate(data_loader): + loss1 = model_engine(batch[0], batch[1]) + with torch.no_grad(): + loss2 = model_engine(batch[0], batch[1]) + loss = loss1 + loss2 + model_engine.backward(loss) + for name, submodule in model_engine.module.linears._modules.items(): + assert hasattr(submodule, "ds_grads_remaining"), \ + f"linears.{name} does not have variable ds_grads_remaining" + assert submodule.ds_grads_remaining == 0, \ + f"ds_grads_remaining of linears.{name} is not 0 ({submodule.ds_grads_remaining})" + model_engine.step() diff --git a/tests/unit/runtime/zero/test_zero_nesting_init.py b/tests/unit/runtime/zero/test_zero_nesting_init.py index 143e7e997b13..15d82fd8be00 100644 --- a/tests/unit/runtime/zero/test_zero_nesting_init.py +++ b/tests/unit/runtime/zero/test_zero_nesting_init.py @@ -8,7 +8,7 @@ from unit.common import DistributedTest from transformers import VisionEncoderDecoderModel -from transformers.deepspeed import HfDeepSpeedConfig +from transformers.integrations.deepspeed import HfDeepSpeedConfig import deepspeed diff --git a/tests/unit/runtime/zero/test_zero_offloadpp.py b/tests/unit/runtime/zero/test_zero_offloadpp.py index
c376686f8052..8ae99e2237e2 100644 --- a/tests/unit/runtime/zero/test_zero_offloadpp.py +++ b/tests/unit/runtime/zero/test_zero_offloadpp.py @@ -39,9 +39,11 @@ class TestZeroPartialOffloadConfigSweep(DistributedTest): world_size = 4 def test(self, h_dim: int, n_layers: int) -> None: + config_dict = { "train_batch_size": 256, "steps_per_print": 1, + "gradient_clipping": 1.0, "optimizer": { "type": "Adam", "params": { diff --git a/tests/unit/runtime/zero/test_zero_tensor_fragment.py b/tests/unit/runtime/zero/test_zero_tensor_fragment.py index e50b03035bad..2e3a652668ed 100644 --- a/tests/unit/runtime/zero/test_zero_tensor_fragment.py +++ b/tests/unit/runtime/zero/test_zero_tensor_fragment.py @@ -7,52 +7,45 @@ import deepspeed.comm as dist import torch -from unit.common import DistributedTest +from unit.common import DistributedTest, preferred_dtype from unit.simple_model import random_dataloader, SimpleModel from unit.util import bf16_required_version_check import deepspeed from deepspeed.utils import safe_get_full_fp32_param, safe_get_full_grad, safe_get_full_optimizer_state -from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state +from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_grad, safe_set_full_optimizer_state from deepspeed.utils import safe_get_local_fp32_param, safe_get_local_grad, safe_get_local_optimizer_state -from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state +from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_grad, safe_set_local_optimizer_state from deepspeed.runtime.zero.offload_config import OffloadDeviceEnum from deepspeed.ops.aio import AsyncIOBuilder +from deepspeed.accelerator import get_accelerator WEIGHT_KEY = 'weight' FIRST_ORDER_KEY = 'exp_avg' SECOND_ORDER_KEY = 'exp_avg_sq' +GRADIENT_KEY = 'gradient' -def validate_full_tensors(model): +def validate_tensor(model, api_type, opt_states): + assert api_type in ["full", "local"] for _, lp in model.named_parameters(): - hp = safe_get_full_fp32_param(lp) - exp_avg = safe_get_full_optimizer_state(lp, 'exp_avg') - exp_avg_sq = safe_get_full_optimizer_state(lp, 'exp_avg_sq') - hp_grad = safe_get_full_grad(lp) - param_list = [hp, hp_grad, exp_avg, exp_avg_sq] - if lp.requires_grad: - assert all([p is not None for p in param_list]) + param_list = [] + if opt_states: + param_list.append( + safe_get_full_optimizer_state(lp, 'exp_avg') if api_type == + "full" else safe_get_local_optimizer_state(lp, 'exp_avg')) + param_list.append( + safe_get_full_optimizer_state(lp, 'exp_avg_sq') if api_type == + "full" else safe_get_local_optimizer_state(lp, 'exp_avg_sq')) else: - assert all([p is None for p in param_list]) - - -def validate_local_tensors(model): - for _, lp in model.named_parameters(): - hp = safe_get_local_fp32_param(lp) - exp_avg = safe_get_local_optimizer_state(lp, 'exp_avg') - exp_avg_sq = safe_get_local_optimizer_state(lp, 'exp_avg_sq') - hp_grad = safe_get_local_grad(lp) - param_list = [hp, hp_grad, exp_avg, exp_avg_sq] + param_list.append(safe_get_full_fp32_param(lp) if api_type == "full" else safe_get_local_fp32_param(lp)) + param_list.append(safe_get_full_grad(lp) if api_type == "full" else safe_get_local_grad(lp)) if lp.requires_grad: assert all([p is not None for p in param_list]) else: assert all([p is None for p in param_list]) -validate_funcs_mapping = {"full": validate_full_tensors, "local": validate_local_tensors} - - class MyModel(torch.nn.Module): def __init__(self, hidden_dim, frozen_weights): @@ -71,12 +64,10 
@@ def forward(self, x, y): for l in self.linears: x = l(x) x = self.act(x) - loss = self.cel(x, y) - val = (x, loss) - return val + return self.cel(x, y) -def run_fragmented_model(model, config_dict, hidden_dim, dtype, validate_func): +def run_fragmented_model(model, config_dict, hidden_dim, dtype, validate_after_bwd, validate_after_step): model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) data_loader = random_dataloader(model=model, total_samples=10, @@ -86,10 +77,10 @@ def run_fragmented_model(model, config_dict, hidden_dim, dtype, validate_func): dist.barrier() for n, batch in enumerate(data_loader): loss = model(batch[0], batch[1]) - loss = loss[1] model.backward(loss) - validate_func(model) + validate_after_bwd(model) model.step() + validate_after_step(model) # Needed in ZeRO 3. Not doing so can give memory leak model.destroy() @@ -123,14 +114,14 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz "lr": 1e-6 } }, - "fp16": { - "enabled": True, - "initial_scale_power": 2 - }, "zero_optimization": { "stage": zero_stage, } } + if get_accelerator().is_fp16_supported(): + config_dict["fp16"] = {"enabled": True, "initial_scale_power": 2} + elif get_accelerator().is_bf16_supported(): + config_dict["bf16"] = {"enabled": True} if offload_device == OffloadDeviceEnum.cpu: config_dict["zero_optimization"]["offload_optimizer"] = {"device": offload_device} @@ -147,15 +138,19 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, froz else: model = MyModel(hidden_dim, frozen_weights) - validate_func = validate_funcs_mapping[api_type] + validate_after_bwd = lambda model: validate_tensor(model, api_type, opt_states=False) + validate_after_step = lambda model: validate_tensor(model, api_type, opt_states=True) - run_fragmented_model(model, config_dict, hidden_dim, torch.float16, validate_func) + run_fragmented_model(model, config_dict, hidden_dim, preferred_dtype(), validate_after_bwd, + validate_after_step) def test_bf16_fragments(self, frozen_weights): + if get_accelerator().device_name() == "cpu": + pytest.skip("CPU accelerator does not support this test yet.") if frozen_weights: pytest.skip("TODO: Frozen weights not currently supported by BF16 Optimizer") - if not bf16_required_version_check(accelerator_check=False): + if not bf16_required_version_check(): pytest.skip( " DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, CUDA > =11.0 and HW support for BFloat16 to run correctly" ) @@ -178,16 +173,22 @@ def test_bf16_fragments(self, frozen_weights): hidden_dim = 128 model = MyModel(hidden_dim, frozen_weights) - run_fragmented_model(model, config_dict, hidden_dim, torch.bfloat16, validate_full_tensors) + + api_type = "full" + validate_after_bwd = lambda model: validate_tensor(model, api_type, opt_states=False) + validate_after_step = lambda model: validate_tensor(model, api_type, opt_states=True) + + run_fragmented_model(model, config_dict, hidden_dim, torch.bfloat16, validate_after_bwd, validate_after_step) -def create_random_values(model, key_list, group, use_cuda=True): +def create_random_values(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_shape if hasattr(lp, 'ds_id') else lp.shape param_values[n] = {} for key in key_list: - rand_value = torch.rand(param_shape, dtype=torch.float32, device=model.device) + dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = 
torch.rand(param_shape, dtype=dtype, device=model.device) dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = rand_value return param_values @@ -196,7 +197,9 @@ def create_random_values(model, key_list, group, use_cuda=True): def set_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_full_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_full_fp32_param(lp, value_tensor) else: safe_set_full_optimizer_state(lp, value_tensor, key) @@ -205,21 +208,25 @@ def set_param_values_with_dict(model, value_dict): def validate_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_full_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_full_fp32_param(lp) else: actual_tensor = safe_get_full_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) -def create_random_values_for_local(model, key_list, group, use_cuda=True): +def create_random_values_for_local(model, key_list, group, grad_dtype, use_cuda=True): param_values = {} for n, lp in model.named_parameters(): param_shape = lp.ds_tensor.shape param_values[n] = {} for key in key_list: device = model.device if use_cuda else "cpu" - rand_value = torch.rand(param_shape, dtype=torch.float32, device=device) + dtype = grad_dtype if key == GRADIENT_KEY else torch.float32 + rand_value = torch.rand(param_shape, dtype=dtype, device=device) # dist.broadcast(rand_value, src=0, group=group) param_values[n][key] = rand_value return param_values @@ -229,7 +236,9 @@ def set_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, value_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + safe_set_local_grad(lp, value_tensor) + elif key == WEIGHT_KEY: safe_set_local_fp32_param(lp, value_tensor) else: safe_set_local_optimizer_state(lp, value_tensor, key) @@ -238,10 +247,13 @@ def set_local_param_values_with_dict(model, value_dict): def validate_local_param_values_with_dict(model, value_dict): for n, lp in model.named_parameters(): for key, expected_tensor in value_dict[n].items(): - if key == WEIGHT_KEY: + if key == GRADIENT_KEY: + actual_tensor = safe_get_local_grad(lp) + elif key == WEIGHT_KEY: actual_tensor = safe_get_local_fp32_param(lp) else: actual_tensor = safe_get_local_optimizer_state(lp, key) + assert torch.equal(expected_tensor, actual_tensor) @@ -307,6 +319,8 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp } if dtype == torch.float16: + if not get_accelerator().is_fp16_supported(): + pytest.skip("fp16 is not supported") config_dict["fp16"] = {"enabled": True, "initial_scale_power": 8} elif dtype == torch.bfloat16: config_dict["bf16"] = {"enabled": True} @@ -315,23 +329,29 @@ def test_zero_fragments(self, tmpdir, api_type, zero_stage, offload_device, dtyp if zero_stage == 3: config_dict["zero_optimization"]["param_persistence_threshold"] = hidden_dim with deepspeed.zero.Init(config_dict_or_path=config_dict): - model = SimpleModel(hidden_dim, nlayers=4) + model = SimpleModel(hidden_dim) else: - model = SimpleModel(hidden_dim, nlayers=4) + model = SimpleModel(hidden_dim) - model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) world = dist.get_world_size() 
group = dist.new_group(ranks=list(range(world))) dist.barrier() - optim_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] - helper_funcs = helper_funcs_mapping[api_type] - optim_state_values = helper_funcs["create_random_values"](model, - optim_keys, - group, - use_cuda=offload_device == OffloadDeviceEnum.none) - helper_funcs["set_param_values_with_dict"](model, optim_state_values) - helper_funcs["validate_param_values_with_dict"](model, optim_state_values) - - # Needed in ZeRO 3. Not doing so can leak memory. - model.destroy() + + def after_bwd_validate_func(model): + state_keys = [WEIGHT_KEY, GRADIENT_KEY] + helper_funcs = helper_funcs_mapping[api_type] + optim_state_values = helper_funcs["create_random_values"]( + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) + helper_funcs["set_param_values_with_dict"](model, optim_state_values) + helper_funcs["validate_param_values_with_dict"](model, optim_state_values) + + def after_step_validate_func(model): + state_keys = [WEIGHT_KEY, FIRST_ORDER_KEY, SECOND_ORDER_KEY] + helper_funcs = helper_funcs_mapping[api_type] + optim_state_values = helper_funcs["create_random_values"]( + model, state_keys, group, grad_dtype=dtype, use_cuda=offload_device == OffloadDeviceEnum.none) + helper_funcs["set_param_values_with_dict"](model, optim_state_values) + helper_funcs["validate_param_values_with_dict"](model, optim_state_values) + + run_fragmented_model(model, config_dict, hidden_dim, dtype, after_bwd_validate_func, after_step_validate_func) diff --git a/tests/unit/runtime/zero/test_zeropp.py b/tests/unit/runtime/zero/test_zeropp.py index 27ec7269afc6..7a05c2a8001b 100644 --- a/tests/unit/runtime/zero/test_zeropp.py +++ b/tests/unit/runtime/zero/test_zeropp.py @@ -14,6 +14,12 @@ from deepspeed.runtime.zero.config import DeepSpeedZeroConfig import torch.nn as nn +import torch + +from transformers import AutoModelForCausalLM, AutoTokenizer +from torch.utils.data import DataLoader + +import numpy as np class NNModel(nn.Module): @@ -40,9 +46,16 @@ def _assert_no_secondary_tensor_group(model: Module) -> None: assert param.ds_zero_param_process_group is None -def _assert_secondary_tensor_size(model: Module) -> None: +def _check_secondary_tensor_existence(model: Module) -> None: for _, param in model.named_parameters(): - assert param.ds_secondary_tensor is not None + if param.ds_secondary_tensor is not None: + return True + return False + + +def _assert_secondary_tensor_size(model: Module) -> None: + for name, param in model.named_parameters(): + assert param.ds_secondary_tensor is not None, f"param {param.ds_id}:{name} does not have secondary tensor" assert param.ds_secondary_tensor.size()[0] % param.ds_tensor.size()[0] == 0 @@ -50,7 +63,7 @@ def _assert_secondary_tensor_size(model: Module) -> None: #Assert when zpg=1 that secondary group and tensors are invalid @pytest.mark.sequential @pytest.mark.parametrize("h_dim", [1024]) -@pytest.mark.parametrize("n_layers", [4, 9]) +@pytest.mark.parametrize("n_layers", [9]) @pytest.mark.parametrize("zpg", [1, 2, 4]) class TestZeroPPConfigSweep(DistributedTest): world_size = 4 @@ -92,3 +105,172 @@ def test(self, h_dim: int, n_layers: int, zpg: int) -> None: loss = model(batch[0], batch[1]) model.backward(loss) model.step() + + def test_eval(self, h_dim: int, n_layers: int, zpg: int) -> None: + # in this test case, we are testing that hpz should be enabled when eval mode is on + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + 
"stage3_max_reuse_distance": 0, + "zero_hpz_partition_size": zpg, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. + } + }, + "fp16": { + "enabled": True, + "loss_scale": 1., + } + } + + model = NNModel(h_dim, n_layers) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) + dist.barrier() + if zpg == 1: + _assert_no_secondary_tensor_group(model) + + for n, batch in enumerate(data_loader): + if zpg != 1: + # here we check that the hpz is enabled when the previous iteration does not update the model + _assert_secondary_tensor_size(model) + with torch.no_grad(): + loss = model(batch[0], batch[1]) + + def test_gradient_accumulation(self, h_dim: int, n_layers: int, zpg: int) -> None: + # in this test case, we are testing that hpz should be enabled for the intermediate gradient accumulation steps + # In this test, we should disable loss_scale + config_dict = { + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 3, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "zero_hpz_partition_size": zpg, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1. + } + }, + "fp16": { + "enabled": True, + "loss_scale": 0., + } + } + + model = NNModel(h_dim, n_layers) + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + data_loader = random_dataloader(model=model, total_samples=20, hidden_dim=h_dim, device=model.device) + dist.barrier() + if zpg == 1: + _assert_no_secondary_tensor_group(model) + + for n, batch in enumerate(data_loader): + if n == 0 and zpg != 1: + _assert_secondary_tensor_size(model) + # here we cannot assert that secondary tensor does not exist because the gradient is likely overflowed as we use random data + if n > 0 and n % 3 != 0 and zpg != 1: + # if the previous iteration does not update the model, then the hpz should be enabled + assert _check_secondary_tensor_existence(model), f"n={n}" + loss = model(batch[0], batch[1]) + model.backward(loss) + model.step() + + +@pytest.mark.nightly +@pytest.mark.parametrize("model_name", ["gpt2"]) +class TestZeroPPConvergence(DistributedTest): + world_size = 4 + + def load_and_prepare_data(self, model_name): + """Load model, tokenizer and dataset, and prepare data loader.""" + from datasets import load_dataset + + # Load model and tokenizer + model = AutoModelForCausalLM.from_pretrained(model_name) + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + # Load and tokenize dataset + dataset = load_dataset("wikitext", 'wikitext-103-raw-v1', split='train[:1%]').filter(lambda x: x["text"]) + + def tokenize_function(examples): + # Tokenize and ensure 'labels' are the same as 'input_ids' + tokenized_output = tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors='pt') + tokenized_output["labels"] = tokenized_output["input_ids"].clone() + return tokenized_output + + tokenized_dataset = dataset.map(tokenize_function, batched=True) + tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels']) + + # Create data loader + data_loader = DataLoader(tokenized_dataset, batch_size=1, shuffle=False) + return model, data_loader + + def get_loss(self, model, data_loader, 
config_dict, step=500): + """Train the model and calculate average loss.""" + # Initialize DeepSpeed + model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict) + dist.barrier() + model.train() + + # Training loop + losses = [] + for n, batch in enumerate(data_loader): + if n >= step: + break + batch = {k: v.to(model.device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss + model.backward(loss) + model.step() + losses.append(loss.item()) + + return np.nanmean(losses[-100:]) + + def get_config_dict(self, use_quantized_weights=False, use_hpz=False): + """Generate the configuration dictionary for DeepSpeed.""" + config = { + "train_micro_batch_size_per_gpu": 1, + "zero_optimization": { + "stage": 3, + "stage3_max_reuse_distance": 0, + "contiguous_gradients": True, + "overlap_comm": True, + }, + "optimizer": { + "type": "Adam", + "params": { + "lr": 1e-5 + } + }, + "fp16": { + "enabled": True + } + } + if use_quantized_weights: + config["zero_optimization"]["zero_quantized_weights"] = True + if use_hpz: + config["zero_optimization"]["zero_hpz_partition_size"] = self.world_size // 2 + return config + + def test(self, model_name): + torch.manual_seed(0) + model, data_loader = self.load_and_prepare_data(model_name) + zeropp_loss = self.get_loss(model, data_loader, self.get_config_dict(use_quantized_weights=True, use_hpz=True)) + model, data_loader = self.load_and_prepare_data(model_name) + baseline_loss = self.get_loss(model, data_loader, self.get_config_dict()) + + # Output and assert + print(f"zeropp_loss={zeropp_loss}, baseline_loss={baseline_loss}") + assert zeropp_loss < baseline_loss * 1.1, f"zeropp_loss={zeropp_loss}, baseline_loss={baseline_loss}" diff --git a/tests/unit/sequence_parallelism/test_ulysses.py b/tests/unit/sequence_parallelism/test_ulysses.py new file mode 100644 index 000000000000..bd20900a3d1e --- /dev/null +++ b/tests/unit/sequence_parallelism/test_ulysses.py @@ -0,0 +1,257 @@ +# Copyright (c) Microsoft Corporation. 
+# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +import pytest +import torch +import torch.nn.functional as F +import deepspeed.comm as dist +from deepspeed import initialize +from transformers import AutoModel +from unit.common import DistributedTest +from deepspeed.sequence.layer import _SeqAllToAll +from deepspeed.sequence.fpdt_layer import _FPDTGPUOffloadingAttentionImpl_, FPDT_InputConstruct +from unit.util import skip_on_arch +from unit.simple_model import * +from deepspeed.utils import groups +from deepspeed.module_inject.tp_shard import get_shard_size_list +#Use mesh device to create data and sequence parallel group + + +class TestUlyssesUtils(DistributedTest): + world_size = 4 + + def test_mesh_device_creation(self) -> None: + skip_on_arch(min_arch=8) + model = AutoModel.from_pretrained('bert-base-uncased') + sp_size = 2 + dp_size = 2 + ds_engine, _, _, _ = initialize( + model=model, + config_params={ + "train_batch_size": 8, + "data_parallel_size": dp_size, + "sequence_parallel_size": sp_size + }, + ) + assert ds_engine.seq_parallel_group is not None + assert ds_engine.data_parallel_group is not None + assert dist.get_world_size(group=ds_engine.seq_parallel_group) == sp_size + assert dist.get_world_size(group=ds_engine.data_parallel_group) == dp_size + assert dist.get_world_size() == sp_size * dp_size + + +#Sweep b,s,h,d to test all2all consistency +@pytest.mark.parametrize("d0", [2, 4]) #batch or sequence dimension +@pytest.mark.parametrize("d1", [4, 8]) #batch or sequence dimension +@pytest.mark.parametrize("num_heads", [4, 8]) +@pytest.mark.parametrize("head_dim", [16, 32]) +class TestUlyssesAll2All(DistributedTest): + world_size = 4 + + def test_alltoall_output_consistency(self, d0: int, d1: int, head_dim: int, num_heads: int) -> None: + skip_on_arch(min_arch=8) + model = AutoModel.from_pretrained('bert-base-uncased') + ds_engine, _, _, _ = initialize(model=model, config_params={"train_batch_size": 8}, mesh_param=(2, 2)) + #4D tensor : b,s,h,d or s,b,h,d + input_tensor = torch.randn(d0, d1, num_heads, head_dim, device=ds_engine.device) + scatter_idx = 2 + batch_dim_idx = 0 + outputs = [] + seq_dims = [0] #seq first API + #TODO: Add support for batch first (that seq_dims=[0,1]) after PR for bs>1 issue with batch first is fixed + ## See discussion in : https://github.com/deepspeedai/DeepSpeed/issues/5808 + for seq_dim in seq_dims: + gather_idx = seq_dim + #first all2all: sequence parallel to head parallel + s2h_tensor = _SeqAllToAll.apply(ds_engine.seq_parallel_group, input_tensor, scatter_idx, gather_idx, + batch_dim_idx) + + #No op + # second all2all: head parallel to sequence parallel + h2s_tensor = _SeqAllToAll.apply(ds_engine.seq_parallel_group, s2h_tensor, gather_idx, scatter_idx, + batch_dim_idx) + print( + f'[{dist.get_rank()}] s={seq_dim} input: {input_tensor.shape} s2h: {s2h_tensor.shape} h2s_tensor: {h2s_tensor.shape}' + ) + outputs.append(h2s_tensor) + + # Check outputs are the same as input + for i in range(1, len(outputs)): + assert torch.allclose(input_tensor, outputs[i]), f"Outputs differ for sequence dim {seq_dims[i]}" + + +@pytest.mark.parametrize("d0", [2, 4]) #batch or sequence dimension +@pytest.mark.parametrize("d1", [4, 8]) #batch or sequence dimension +@pytest.mark.parametrize("num_heads", [3, 7]) +@pytest.mark.parametrize("head_dim", [16]) +class TestUlyssesAll2All_odd(DistributedTest): + world_size = 4 + + def test_alltoall_output_consistency(self, d0: int, d1: int, head_dim: int, num_heads: int) -> None: + + data_parallel_size = 2 + 
seq_parallel_size = self.world_size // data_parallel_size + skip_on_arch(min_arch=8) + + def seq_batch_heads_hash(d0, d1, h, offset_d0=0, offset_d1=0, offset_h=0): + d0 += offset_d0 + d1 += offset_d1 + h += offset_h + return d0 * 10 + h + d1 * 0.1 + + hidden_dim = 10 + model = SimpleModel(hidden_dim) + ds_engine, _, _, _ = initialize(model=model, + config_params={"train_batch_size": 8}, + mesh_param=(data_parallel_size, seq_parallel_size)) + + scatter_idx = 2 + outputs = [] + inputs = [] + batch_dims = [0, 1] + seq_dims = [1, 0] + + for idx, seq_dim in enumerate(seq_dims): + gather_idx = seq_dim + batch_dim_idx = batch_dims[idx] + + #4D tensor : b,s,h,d or s,b,h,d + #create a hash tensor from pos_id, head_id, and batch_id + d0_indices = torch.arange(d0).reshape(-1, 1, 1, 1) + d1_indices = torch.arange(d1).reshape(1, -1, 1, 1) + h_indices = torch.arange(num_heads).reshape(1, 1, -1, 1) + input_tensor = torch.randn(d0, d1, num_heads, head_dim, device=ds_engine.device) + if batch_dim_idx == 1: #seq_len_dim : 0(d0) + input_tensor[:] = seq_batch_heads_hash(d0_indices, d1_indices, h_indices, + d0 * groups._get_sequence_parallel_rank(), 0) + elif batch_dim_idx == 0: #seq_len_dim : 1(d1) + input_tensor[:] = seq_batch_heads_hash(d0_indices, d1_indices, h_indices, 0, + d1 * groups._get_sequence_parallel_rank()) + inputs.append(input_tensor) + + ### first all2all: sequence parallel to head parallel + s2h_tensor = _SeqAllToAll.apply(ds_engine.seq_parallel_group, input_tensor, scatter_idx, gather_idx, + batch_dim_idx) + + # s2h_tensor check for the first all2all: compare with the expected ground truth + d0_indices = torch.arange(s2h_tensor.shape[0]).reshape(-1, 1, 1, 1) + d1_indices = torch.arange(s2h_tensor.shape[1]).reshape(1, -1, 1, 1) + h_indices = torch.arange(s2h_tensor.shape[2]).reshape(1, 1, -1, 1) + shard_list = get_shard_size_list(num_heads, groups._get_sequence_parallel_world_size()) + head_offset = sum(shard_list[:groups._get_sequence_parallel_rank()]) + s2h_truth = torch.zeros_like(s2h_tensor) + s2h_truth[:] = seq_batch_heads_hash(d0_indices, d1_indices, h_indices, 0, 0, head_offset) + + assert torch.allclose(s2h_truth, + s2h_tensor), f"s2h_tensor differs from the expected for sequence dim: {seq_dim}" + #No op + ### second all2all: head parallel to sequence parallel + h2s_tensor = _SeqAllToAll.apply(ds_engine.seq_parallel_group, s2h_tensor, gather_idx, scatter_idx, + batch_dim_idx) + print( + f'[{dist.get_rank()}] s={seq_dim} input: {input_tensor.shape} s2h: {s2h_tensor.shape} h2s_tensor: {h2s_tensor.shape}' + ) + outputs.append(h2s_tensor) + + # Check outputs for the second all2all + for i in range(0, len(outputs)): + assert torch.allclose(inputs[i], + outputs[i]), f"[{dist.get_rank()}]Outputs differ for sequence dim {seq_dims[i]}" + + +@pytest.mark.parametrize("d0", [4, 1]) #batch dimension +@pytest.mark.parametrize("d1", [2048, 8192]) #sequence dimension +@pytest.mark.parametrize("chunk_size", [128, 256]) #size of chunk +@pytest.mark.parametrize("num_heads", [8, 4]) +@pytest.mark.parametrize("head_dim", [32]) +class TestFPDTAttention(DistributedTest): + + def test_FPDT_attention_offloading_output_consistency(self, d0: int, d1: int, chunk_size: int, head_dim: int, + num_heads: int) -> None: + skip_on_arch(min_arch=8) + world_size = 2 + + try: + from flash_attn.flash_attn_interface import _flash_attn_forward, _flash_attn_backward + except ImportError: + _flash_attn_forward = None + _flash_attn_backward = None + + if _flash_attn_forward is None or _flash_attn_backward is None: + 
pytest.skip("Flash Attention is not available.") + + model = AutoModel.from_pretrained('bert-base-uncased') + ds_engine, _, _, _ = initialize( + model=model, + config_params={ + "train_batch_size": 8, + "data_parallel_size": 1, + "sequence_parallel_size": world_size + }, + ) + #3D tensor : l, b, d + dim = head_dim * num_heads + + seed = 42 + torch.manual_seed(seed) + get_accelerator().manual_seed_all(seed) + + input_tensor = torch.randn(d1, d0, dim, device=ds_engine.device, dtype=torch.half) # l, b, d + spg = ds_engine.seq_parallel_group + + dist.broadcast(input_tensor, src=0, group=spg) + + class args: + + def __init__(self): + self.ds_sequence_parallel_fpdt_chunk_size = chunk_size + + fpdt_input_tensor = FPDT_InputConstruct(input_tensor.permute(1, 0, 2), None, None, None, None, args(), + world_size, dist.get_rank()).generate()[0].permute(1, 0, 2) + + if dist.get_rank() == 0: + qkv_linear_weight = torch.nn.Parameter( + torch.empty(dim + 2 * dim, dim, device=dist.get_rank(), dtype=torch.half)) + torch.nn.init.normal_(qkv_linear_weight, mean=0.0, std=0.02) + + qkv_linear_bias = torch.nn.Parameter(torch.empty(dim + 2 * dim, device=dist.get_rank(), dtype=torch.half)) + torch.nn.init.normal_(qkv_linear_bias, mean=0.0, std=0.02) + else: + qkv_linear_weight = torch.nn.Parameter( + torch.empty(dim + 2 * dim, dim, device=dist.get_rank(), dtype=torch.half)) + qkv_linear_bias = torch.nn.Parameter(torch.empty(dim + 2 * dim, device=dist.get_rank(), dtype=torch.half)) + + dist.broadcast(qkv_linear_weight, src=0, group=spg) + dist.broadcast(qkv_linear_bias, src=0, group=spg) + + num_chunks_attn = fpdt_input_tensor.shape[0] * dist.get_world_size(spg) // chunk_size + fpdt_output = _FPDTGPUOffloadingAttentionImpl_.apply(fpdt_input_tensor, None, None, None, spg, 2, 0, dim, dim, + head_dim, dim, qkv_linear_weight, qkv_linear_bias, 0, + num_chunks_attn, True) + + # baseline + qkv = torch.matmul(input_tensor, qkv_linear_weight.t()) + qkv_linear_bias + q = qkv[:, :, :dim].contiguous().reshape(qkv.shape[0], qkv.shape[1], -1, head_dim).permute(1, 2, 0, + 3).contiguous() + k = qkv[:, :, dim:dim * 2].contiguous().reshape(qkv.shape[0], qkv.shape[1], -1, + head_dim).permute(1, 2, 0, 3).contiguous() + v = qkv[:, :, dim * 2:dim * 3].contiguous().reshape(qkv.shape[0], qkv.shape[1], -1, + head_dim).permute(1, 2, 0, + 3).contiguous() # b, nhead, l, d + + scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(dim, dtype=torch.half)) + + causal_mask = torch.triu(torch.ones(d1, d1, device=ds_engine.device), diagonal=1).bool() + causal_mask = causal_mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(causal_mask, float('-inf')) + attn_weights = F.softmax(scores, dim=-1) + output = torch.matmul(attn_weights, v).permute(0, 2, 1, 3) + + baseline_output_shuffled = FPDT_InputConstruct(output, None, None, None, None, args(), world_size, + dist.get_rank()).generate()[0] # b, l, n, d + + assert torch.allclose( + fpdt_output, baseline_output_shuffled, rtol=0.01, atol=0.1 + ), f"rank {dist.get_rank()}, sp size: {dist.get_world_size(spg)}, input_tensor: {input_tensor.shape}, fpdt_input_tensor: {fpdt_input_tensor.shape}, fpdt_output: {fpdt_output.shape}, baseline_output_shuffled: {baseline_output_shuffled.shape},{torch.max(torch.abs(fpdt_output - baseline_output_shuffled))}" diff --git a/tests/unit/simple_model.py b/tests/unit/simple_model.py index 01ce3d2fe4c9..a5538a8c6e68 100644 --- a/tests/unit/simple_model.py +++ b/tests/unit/simple_model.py @@ -14,6 +14,7 @@ from deepspeed.accelerator import 
get_accelerator import deepspeed.comm as dist +from .common import preferred_dtype class SimpleModel(torch.nn.Module): @@ -78,7 +79,7 @@ def forward(self, x, y, **kwargs): class SimpleMoEModel(torch.nn.Module): - def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False): + def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False, use_rts=True): super(SimpleMoEModel, self).__init__() self.linear1 = torch.nn.Linear(hidden_dim, hidden_dim) expert = torch.nn.Sequential(torch.nn.Linear(hidden_dim, hidden_dim), torch.nn.Linear(hidden_dim, hidden_dim)) @@ -88,7 +89,8 @@ def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False): ep_size=ep_size, use_residual=use_residual, num_experts=num_experts, - k=1) + k=1, + use_rts=use_rts) # interleaving MoE modules with dense to create an opportunity # for gradients to be merged in ZeRO stage 2 average_tensor reduce bucket self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim) @@ -97,7 +99,8 @@ def __init__(self, hidden_dim, num_experts=4, ep_size=1, use_residual=False): ep_size=ep_size, use_residual=use_residual, num_experts=num_experts, - k=1) + k=1, + use_rts=use_rts) self.linear3 = torch.nn.Linear(hidden_dim, hidden_dim) self.cross_entropy_loss = torch.nn.CrossEntropyLoss() @@ -262,21 +265,21 @@ def forward(self, x, y, **kwargs): return hidden_dim -def random_dataset(total_samples, hidden_dim, device, dtype=torch.half): +def random_dataset(total_samples, hidden_dim, device, dtype=preferred_dtype()): train_data = torch.randn(total_samples, hidden_dim, device=device, dtype=dtype) train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) train_dataset = torch.utils.data.TensorDataset(train_data, train_label) return train_dataset -def random_dataloader(model, total_samples, hidden_dim, device, dtype=torch.half): +def random_dataloader(model, total_samples, hidden_dim, device, dtype=preferred_dtype()): batch_size = model.train_micro_batch_size_per_gpu() train_dataset = random_dataset(total_samples, hidden_dim, device, dtype=dtype) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size) return train_loader -def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=torch.half): +def sequence_dataloader(model, total_samples, hidden_dim, device, seq_len: int = 32, dtype=preferred_dtype()): batch_size = model.train_micro_batch_size_per_gpu() train_data = torch.randn(total_samples, seq_len, hidden_dim, device=device, dtype=dtype) train_label = torch.empty(total_samples, dtype=torch.long, device=device).random_(hidden_dim) diff --git a/tests/unit/util.py b/tests/unit/util.py index 13eab3ef3a72..dba29ed27a4c 100644 --- a/tests/unit/util.py +++ b/tests/unit/util.py @@ -5,9 +5,10 @@ import pytest import torch + +import deepspeed from deepspeed.accelerator import get_accelerator, is_current_accelerator_supported from deepspeed.git_version_info import torch_info -from packaging import version as pkg_version def skip_on_arch(min_arch=7): @@ -39,7 +40,7 @@ def bf16_required_version_check(accelerator_check=True): # Sometimes bf16 tests are runnable even if not natively supported by accelerator if accelerator_check: - accelerator_pass = torch_info['bf16_support'] + accelerator_pass = get_accelerator().is_bf16_supported() else: accelerator_pass = True @@ -47,32 +48,43 @@ def bf16_required_version_check(accelerator_check=True): cuda_version_available = CUDA_MAJOR >= 11 nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 
and NCCL_MINOR >= 10) npu_available = get_accelerator().device_name() == 'npu' + hpu_available = get_accelerator().device_name() == 'hpu' + xpu_available = get_accelerator().device_name() == 'xpu' if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass: return True elif npu_available: return True + elif hpu_available: + return True + elif xpu_available: + return True else: return False -def required_torch_version(min_version=None, max_version=None): - assert min_version or max_version, "Must provide a min_version or max_version argument" - - torch_version = pkg_version.parse(torch.__version__) - - if min_version and pkg_version.parse(str(min_version)) > torch_version: - return False - - if max_version and pkg_version.parse(str(max_version)) < torch_version: - return False - - return True - - def required_amp_check(): from importlib.util import find_spec if find_spec('apex') is None: return False else: return True + + +class no_child_process_in_deepspeed_io: + + def __enter__(self): + # deepspeed_io defaults to creating a dataloader that uses a + # multiprocessing pool. Our tests use pools and we cannot nest pools in + # python. Therefore we're injecting this kwarg to ensure that no pools + # are used in the dataloader. + self.old_method = deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io + + def new_method(*args, **kwargs): + kwargs["num_local_io_workers"] = 0 + return self.old_method(*args, **kwargs) + + deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = new_method + + def __exit__(self, *_): + deepspeed.runtime.engine.DeepSpeedEngine.deepspeed_io = self.old_method diff --git a/tests/unit/utils/test_groups.py b/tests/unit/utils/test_groups.py index d8f12be4f3c6..5cd35baf3510 100644 --- a/tests/unit/utils/test_groups.py +++ b/tests/unit/utils/test_groups.py @@ -18,7 +18,7 @@ def test_get_expert_parallel_ranks(): expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15] """ expert_parallel_groups, expert_data_parallel_groups = _get_expert_parallel_ranks(world_size=16, - model_parallel_size_=2, + tensor_parallel_size_=2, expert_parallel_size_=4) assert expert_parallel_groups == [ [0, 2, 4, 6], diff --git a/tests/unit/utils/test_partition_balanced.py b/tests/unit/utils/test_partition_balanced.py new file mode 100644 index 000000000000..e7285e478c53 --- /dev/null +++ b/tests/unit/utils/test_partition_balanced.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +from deepspeed.runtime import utils as ds_utils + + +def check_partition(weights, num_parts, target_diff): + result = ds_utils.partition_balanced(weights=weights, num_parts=num_parts) + + parts_sum = [] + for b, e in zip(result[:-1], result[1:]): + parts_sum.append(sum(weights[b:e])) + + assert max(parts_sum) - min( + parts_sum + ) == target_diff, f"ds_utils.partition_balanced(weights={weights}, num_parts={num_parts}) return {result}" + + +def test_partition_balanced(): + check_partition([1, 2, 1], 4, target_diff=2) + check_partition([1, 1, 1, 1], 4, target_diff=0) + check_partition([1, 1, 1, 1, 1], 4, target_diff=1) + check_partition([1, 1, 1, 1, 0, 1], 4, target_diff=1) diff --git a/version.txt b/version.txt index e01e0ddd8e8b..19270385eaf7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.12.4 +0.16.5
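
Note (illustrative, not part of the patch): the `test_zero_tensor_fragment.py` changes above split fragment validation into an after-backward phase (fp32 weights and gradients, including the new `safe_set_full_grad`/`safe_set_local_grad` setters) and an after-step phase (optimizer states). A minimal stand-alone sketch of that calling pattern follows, using only the `deepspeed.utils` functions imported in the diff; the toy `Linear` model, the config values, and the single-GPU launch are assumptions, and the gradient tensor passed to the setter is created in the training dtype, mirroring `create_random_values` in the updated test.

```python
# Illustrative sketch only -- not part of this patch.
# Assumes a single-GPU run under the deepspeed launcher; model and config are toy values.
import torch
import deepspeed
from deepspeed.utils import (safe_get_full_fp32_param, safe_get_full_grad,
                             safe_get_full_optimizer_state, safe_set_full_grad)

config = {
    "train_micro_batch_size_per_gpu": 1,
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
    "zero_optimization": {"stage": 1},
    "bf16": {"enabled": True},
}

model = torch.nn.Linear(64, 64)
engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=config)

x = torch.randn(1, 64, dtype=torch.bfloat16, device=engine.device)
loss = engine(x).float().pow(2).mean()
engine.backward(loss)

# After backward(): master weights and full gradients are readable, and the
# gradient can be overwritten before the optimizer step.
for name, p in engine.module.named_parameters():
    hp = safe_get_full_fp32_param(p)      # fp32 copy of the parameter
    grad = safe_get_full_grad(p)          # full gradient for this parameter
    new_grad = torch.zeros(p.shape, dtype=torch.bfloat16, device=engine.device)
    safe_set_full_grad(p, new_grad)       # e.g. zero the gradient in place
    print(name, hp.shape, grad.shape)

engine.step()

# After step(): optimizer states (Adam moments here) are populated and readable.
for name, p in engine.module.named_parameters():
    exp_avg = safe_get_full_optimizer_state(p, "exp_avg")
    exp_avg_sq = safe_get_full_optimizer_state(p, "exp_avg_sq")
    print(name, exp_avg.shape, exp_avg_sq.shape)
```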