diff --git a/megatron/legacy/fused_kernels/__init__.py b/megatron/legacy/fused_kernels/__init__.py
deleted file mode 100644
index 87cceac3e35..00000000000
--- a/megatron/legacy/fused_kernels/__init__.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-
-import os
-import pathlib
-import subprocess
-
-from torch.utils import cpp_extension
-
-# Setting this param to a list has a problem of generating different
-# compilation commands (with different order of architectures) and
-# leading to recompilation of fused kernels. Set it to empty string
-# to avoid recompilation and assign arch flags explicitly in
-# extra_cuda_cflags below
-os.environ["TORCH_CUDA_ARCH_LIST"] = ""
-
-
-def load(args):
-
-    # Check if cuda 11 is installed for compute capability 8.0
-    cc_flag = []
-    _, bare_metal_major, bare_metal_minor = _get_cuda_bare_metal_version(
-        cpp_extension.CUDA_HOME
-    )
-    if int(bare_metal_major) >= 11:
-        cc_flag.append('-gencode')
-        cc_flag.append('arch=compute_80,code=sm_80')
-        if int(bare_metal_minor) >= 8:
-            cc_flag.append('-gencode')
-            cc_flag.append('arch=compute_90,code=sm_90')
-
-    # Build path
-    srcpath = pathlib.Path(__file__).parent.absolute()
-    buildpath = srcpath / "build"
-    _create_build_dir(buildpath)
-
-    # Helper function to build the kernels.
-    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
-        return cpp_extension.load(
-            name=name,
-            sources=sources,
-            build_directory=buildpath,
-            extra_cflags=[
-                "-O3",
-            ],
-            extra_cuda_cflags=[
-                "-O3",
-                "-gencode",
-                "arch=compute_70,code=sm_70",
-                "--use_fast_math",
-            ]
-            + extra_cuda_flags
-            + cc_flag,
-            verbose=(args.rank == 0),
-        )
-
-
-def _get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output(
-        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
-    )
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    release = output[release_idx].split(".")
-    bare_metal_major = release[0]
-    bare_metal_minor = release[1][0]
-
-    return raw_output, bare_metal_major, bare_metal_minor
-
-
-def _create_build_dir(buildpath):
-    try:
-        os.mkdir(buildpath)
-    except OSError:
-        if not os.path.isdir(buildpath):
-            print(f"Creation of the build directory {buildpath} failed")
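
Note on the deleted version check: _get_cuda_bare_metal_version tokenizes the `nvcc -V` banner and takes the token after "release", so a banner line such as "Cuda compilation tools, release 11.8, V11.8.89" yields major "11" and minor "8"; release[1][0] keeps only the first digit of the minor version. A minimal standalone sketch of the same parsing, assuming nvcc is on PATH and prints the usual "release X.Y," token (the helper name and sample banner are illustrative, not from the source):

import subprocess

def cuda_release_from_nvcc(nvcc="nvcc"):
    # Same parsing as the deleted helper: split the banner on whitespace
    # and read the token that follows "release", e.g. "11.8,".
    banner = subprocess.check_output([nvcc, "-V"], universal_newlines=True)
    tokens = banner.split()
    major, minor = tokens[tokens.index("release") + 1].split(".")[:2]
    return major, minor[0]  # minor[0]: only the first minor digit survives
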
diff --git a/megatron/legacy/fused_kernels/compat.h b/megatron/legacy/fused_kernels/compat.h
deleted file mode 100644
index 5495d780776..00000000000
--- a/megatron/legacy/fused_kernels/compat.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
-
-/*This code is copied from NVIDIA apex:
- *     https://github.com/NVIDIA/apex
- *     with minor changes. */
-
-
-
-#ifndef TORCH_CHECK
-#define TORCH_CHECK AT_CHECK
-#endif
-
-#ifdef VERSION_GE_1_3
-#define DATA_PTR data_ptr
-#else
-#define DATA_PTR data
-#endif
diff --git a/megatron/legacy/fused_kernels/tests/__init__.py b/megatron/legacy/fused_kernels/tests/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/megatron/legacy/fused_kernels/tests/test_fused_kernels.py b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py
deleted file mode 100644
index f5b2b78a3f7..00000000000
--- a/megatron/legacy/fused_kernels/tests/test_fused_kernels.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
-
-import math
-
-import torch
-from torch.nn import LayerNorm
-
-from megatron.legacy.model.enums import AttnMaskType
-from megatron.legacy.model.fused_layer_norm import MixedFusedLayerNorm
-from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax
-from megatron.legacy.model.utils import attention_mask_func
-from megatron.legacy.fused_kernels import load
-
-def test_load_fused_kernels():
-    try:
-        import fused_layer_norm_cuda
-        import scaled_masked_softmax_cuda
-        import scaled_upper_triang_masked_softmax_cuda
-        import torch
-
-        print("[Success] load_fused_kernels")
-    except ImportError as e:
-        print("[Fail] load_fused_kernels")
-        raise e
-
-def test_fused_softmax():
-    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-    test_text = (
-        "Hello. How are you? I am fine thank you and you? yes Good. "
-        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
-    )
-
-    tokens = tokenizer(
-        [test_text] * 4,
-        return_tensors="pt",
-    )
-
-    embedding_output = bert.embeddings(
-        input_ids=tokens["input_ids"].cuda(),
-        position_ids=None,
-        token_type_ids=tokens["token_type_ids"].cuda(),
-        inputs_embeds=None,
-        past_key_values_length=0,
-    )
-
-    # (bsz, 1, 1, seq_len)
-    mask = bert.get_extended_attention_mask(
-        attention_mask=tokens["attention_mask"].cuda(),
-        input_shape=tokens["input_ids"].shape,
-        device=bert.device,
-    )
-    # (bsz, 1, seq_len, seq_len)
-    mask = mask.repeat(1, 1, mask.size()[-1], 1)
-
-    attention = bert.encoder.layer[0].attention.self
-    key_layer = attention.transpose_for_scores(attention.key(embedding_output))
-    query_layer = attention.transpose_for_scores(attention.query(embedding_output))
-
-    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-    attention_scores /= math.sqrt(key_layer.size()[-1])
-
-    fused_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.padding,
-            scaled_masked_softmax_fusion=True,
-        )
-        .cuda()
-        .half()
-    )
-
-    fused_softmax_output = fused_softmax(
-        attention_scores,
-        (mask != 0),
-    )
-
-    torch_softmax = (
-        FusedScaleMaskSoftmax(
-            input_in_fp16=True,
-            input_in_bf16=False,
-            mask_func=attention_mask_func,
-            scale=None,
-            softmax_in_fp32=False,
-            attn_mask_type=AttnMaskType.padding,
-            scaled_masked_softmax_fusion=False,
-        )
-        .cuda()
-        .half()
-    )
-
-    torch_softmax_output = torch_softmax(
-        attention_scores,
-        (mask != 0),
-    )
-
-    test_result = (fused_softmax_output - torch_softmax_output).abs()
-
-    while test_result.dim() != 1:
-        test_result = test_result.mean(dim=-1)
-
-    diff = test_result.mean(dim=-1)
-
-    if diff <= 1e-3:
-        print(
-            f"\n[Success] test_fused_softmax"
-            f"\n > mean_difference={diff}"
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
-    else:
-        print(
-            f"\n[Fail] test_fused_softmax"
-            f"\n > mean_difference={diff}, "
-            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
-            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
-        )
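
The reduction idiom in the comparison test above (and in its two siblings below) collapses the element-wise absolute difference to a scalar by repeatedly averaging over the last axis; since every axis is averaged uniformly, the final value is just the global mean of the difference tensor. A quick check of that equivalence (shapes are illustrative):

import torch

x = torch.rand(4, 16, 32, 32)  # stand-in for (fused - torch).abs()
reduced = x
while reduced.dim() != 1:
    reduced = reduced.mean(dim=-1)
assert torch.allclose(reduced.mean(dim=-1), x.mean())  # same scalar
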
" - "hi hi hi hi hi hi hi" # 24 - ) - - tokens = tokenizer( - [test_text] * 4, - return_tensors="pt", - ) - - attention_mask = tokens["attention_mask"].cuda() - attention_mask = attention_mask.view(attention_mask.size(0), -1) - attention_mask = attention_mask[:, None, None, :] - attention_mask = (1.0 - attention_mask) * -10000.0 - attention_mask = attention_mask.repeat(1, 1, attention_mask.size()[-1], 1) - attn = gpt.h[0] - - hidden_states = gpt.wte(tokens["input_ids"].cuda()) - q, k, v = attn.attn.c_attn(hidden_states).split(768, dim=-1) - q = attn.attn._split_heads(q, attn.attn.num_heads, attn.attn.head_dim) - k = attn.attn._split_heads(k, attn.attn.num_heads, attn.attn.head_dim) - attn_weights = torch.matmul(q, k.transpose(-1, -2)) - - sq, sk = q.size(-2), k.size(-2) - causal_mask = attn.attn.bias[:, :, sk - sq : sk, :sk].bool() - total_mask = ~(causal_mask & (attention_mask == 0)) - """ - tensor([[[[False, True, True, ..., True, True, True], - [False, False, True, ..., True, True, True], - [False, False, False, ..., True, True, True], - ..., - [False, False, False, ..., False, True, True], - [False, False, False, ..., False, False, True], - [False, False, False, ..., False, False, False]]] - """ - - fused_softmax = ( - FusedScaleMaskSoftmax( - input_in_fp16=True, - input_in_bf16=False, - mask_func=attention_mask_func, - scale=None, - softmax_in_fp32=False, - attn_mask_type=AttnMaskType.causal, - scaled_masked_softmax_fusion=True, - ) - .cuda() - .half() - ) - - fused_softmax_output = fused_softmax( - attn_weights, - total_mask, - ) - - torch_softmax = ( - FusedScaleMaskSoftmax( - input_in_fp16=True, - input_in_bf16=False, - mask_func=attention_mask_func, - scale=None, - softmax_in_fp32=False, - attn_mask_type=AttnMaskType.causal, - scaled_masked_softmax_fusion=False, - ) - .cuda() - .half() - ) - - torch_softmax_output = torch_softmax( - attn_weights, - total_mask, - ) - - test_result = (fused_softmax_output - torch_softmax_output).abs() - - while test_result.dim() != 1: - test_result = test_result.mean(dim=-1) - - diff = test_result.mean(dim=-1) - - if diff <= 1e-3: - print( - f"\n[Success] test_fused_upper_triangle_mask_softmax" - f"\n > mean_difference={diff}" - f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}" - f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" - ) - else: - print( - f"\n[Fail] test_fused_upper_triangle_mask_softmax" - f"\n > mean_difference={diff}, " - f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, " - f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}" - ) - - -def test_layer_norm(): - bert = BertModel.from_pretrained("bert-base-cased").cuda().half() - tokenizer = BertTokenizer.from_pretrained("bert-base-cased") - test_text = ( - "Hello. How are you? I am fine thank you and you? yes Good. 
" - "hi hi hi hi hi hi hi hi hi hi hi hi hi" # 32 - ) - - tokens = tokenizer( - [test_text] * 4, - return_tensors="pt", - ) - - # [bsz, seq_len, d_model] - embedding_output = ( - bert.embeddings( - input_ids=tokens["input_ids"].cuda(), - position_ids=None, - token_type_ids=tokens["token_type_ids"].cuda(), - inputs_embeds=None, - past_key_values_length=0, - ) - .cuda() - .half() - ) - - fused_layernorm_layer = ( - MixedFusedLayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() - ) - - torch_layernorm_layer = ( - LayerNorm(normalized_shape=embedding_output.size(-1)).cuda().half() - ) - - fused_output = fused_layernorm_layer(embedding_output) - torch_output = torch_layernorm_layer(embedding_output) - test_result = (fused_output - torch_output).abs() - - while test_result.dim() != 1: - test_result = test_result.mean(dim=-1) - - diff = test_result.mean(dim=-1) - - if diff <= 1e-3: - print( - f"\n[Success] test_layer_norm" - f"\n > mean_difference={diff}" - f"\n > fused_values={fused_output[-1][-1][:5].tolist()}" - f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" - ) - else: - print( - f"\n[Fail] test_layer_norm" - f"\n > mean_difference={diff}, " - f"\n > fused_values={fused_output[-1][-1][:5].tolist()}, " - f"\n > torch_values={torch_output[-1][-1][:5].tolist()}" - ) - - -def attention_mask_func(attention_scores, attention_mask): - attention_scores.masked_fill_(attention_mask, -10000.0) - return attention_scores - - -def forward_torch_softmax(input, mask, scale): - input = input * scale - mask_output = attention_mask_func(input, mask) if mask is not None else input - probs = torch.nn.Softmax(dim=-1)(mask_output) - return probs - - -def test_masked_softmax_forward(): - import scaled_masked_softmax_cuda - - batch = 2 - attn = 16 - scale_t = torch.tensor([1.0]) - for qlen in [128, 256, 1024, 2048, 4096]: - for klen in [128, 256, 1024, 2048]: - inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') - masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') - softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) - softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) - error = (softmax_results_torch - softmax_results).abs().max() - assert error < 1e-3 - -def test_masked_softmax_backward(): - import scaled_masked_softmax_cuda - - batch = 2 - attn = 16 - scale_t = torch.tensor([1.0]) - for qlen in [128, 256, 1024, 2048, 4096]: - for klen in [128, 256, 1024, 2048]: - inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') - backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0') - masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0') - softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item()) - back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item()) - - inputs.requires_grad = True - softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item()) - softmax_results_torch.backward(backward) - error = (back_grad - inputs.grad).abs().max() - assert error < 1e-3 - - -def test_allmasked_softmax_forward(): - import scaled_masked_softmax_cuda - - batch = 2 - attn = 16 - scale_t = torch.tensor([1.0]) - for qlen in [128, 256, 1024, 2048, 4096]: - for klen in [128, 256, 1024, 2048]: - inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0') - masks = 
-
-
-def test_masked_softmax_forward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            error = (softmax_results_torch - softmax_results).abs().max()
-            assert error < 1e-3
-
-def test_masked_softmax_backward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0')
-            masks = torch.randint(0, 2, (batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item())
-
-            inputs.requires_grad = True
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            softmax_results_torch.backward(backward)
-            error = (back_grad - inputs.grad).abs().max()
-            assert error < 1e-3
-
-
-def test_allmasked_softmax_forward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            softmax_results_torch = torch.zeros_like(inputs)
-            error = (softmax_results_torch - softmax_results).abs().max()
-            assert error == 0.0
-
-
-def test_allmasked_softmax_backward():
-    import scaled_masked_softmax_cuda
-
-    batch = 2
-    attn = 16
-    scale_t = torch.tensor([1.0])
-    for qlen in [128, 256, 1024, 2048, 4096]:
-        for klen in [128, 256, 1024, 2048]:
-            inputs = torch.normal(0, 2, (batch, attn, qlen, klen), dtype=torch.float16, device='cuda:0')
-            backward = torch.rand_like(inputs, dtype=torch.float16, device='cuda:0')
-            masks = torch.ones((batch, 1, qlen, klen), dtype=torch.bool, device='cuda:0')
-            softmax_results = scaled_masked_softmax_cuda.forward(inputs, masks, scale_t[0].item())
-            back_grad = scaled_masked_softmax_cuda.backward(backward, softmax_results, scale_t[0].item())
-            inputs.requires_grad = True
-            softmax_results_torch = forward_torch_softmax(inputs, masks, scale_t[0].item())
-            softmax_results_torch.backward(backward)
-            error = (back_grad - inputs.grad).abs().max()
-            assert error < 1e-3
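
Why the all-masked forward test compares against torch.zeros_like(inputs) rather than forward_torch_softmax: with every position filled to -10000.0, an eager softmax still normalizes the row and returns a uniform distribution, whereas the fused kernel emits exact zeros for fully masked rows. The eager behaviour, for contrast:

import torch

row = torch.full((1, 4), -10000.0)  # every logit "masked"
print(torch.softmax(row, dim=-1))   # tensor([[0.2500, 0.2500, 0.2500, 0.2500]])
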
-
-
-if __name__ == "__main__":
-    try:
-        from transformers import BertTokenizer, GPT2Tokenizer
-        from transformers.models.bert.modeling_bert import BertModel
-        from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-        import transformers
-
-        transformers.logging.set_verbosity(
-            transformers.logging.FATAL,
-        )
-
-    except ImportError:
-        print("\n[Fail] Please install `transformers` package to test fused kernels\n")
-        exit(-1)
-
-    load()
-    test_masked_softmax_forward()
-    test_masked_softmax_backward()
-    test_allmasked_softmax_forward()
-    test_allmasked_softmax_backward()
-    test_load_fused_kernels()
-    test_fused_softmax()
-    test_fused_upper_triangle_mask_softmax()
-    test_layer_norm()
diff --git a/megatron/legacy/fused_kernels/type_shim.h b/megatron/legacy/fused_kernels/type_shim.h
deleted file mode 100644
index d60a6f8c6fb..00000000000
--- a/megatron/legacy/fused_kernels/type_shim.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. */
-
-
-#include <ATen/ATen.h>
-#include "compat.h"
-
-
-#define DISPATCH_HALF_AND_BFLOAT(TYPE, NAME, ...)                           \
-    switch(TYPE)                                                            \
-    {                                                                       \
-        case at::ScalarType::Half:                                          \
-        {                                                                   \
-            using scalar_t = at::Half;                                      \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        case at::ScalarType::BFloat16:                                      \
-        {                                                                   \
-            using scalar_t = at::BFloat16;                                  \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        default:                                                            \
-            AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
-
-
-#define DISPATCH_HALF_BFLOAT_AND_FLOAT(TYPE, NAME, ...)                     \
-    switch(TYPE)                                                            \
-    {                                                                       \
-        case at::ScalarType::Half:                                          \
-        {                                                                   \
-            using scalar_t = at::Half;                                      \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        case at::ScalarType::BFloat16:                                      \
-        {                                                                   \
-            using scalar_t = at::BFloat16;                                  \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        case at::ScalarType::Float:                                         \
-        {                                                                   \
-            using scalar_t = float;                                         \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        default:                                                            \
-            AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
-    }
-
-
-
-#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
-    switch(TYPEIN)                                                          \
-    {                                                                       \
-        case at::ScalarType::Float:                                         \
-        {                                                                   \
-            using scalar_t_in = float;                                      \
-            switch(TYPEOUT)                                                 \
-            {                                                               \
-                case at::ScalarType::Float:                                 \
-                {                                                           \
-                    using scalar_t_out = float;                             \
-                    __VA_ARGS__;                                            \
-                    break;                                                  \
-                }                                                           \
-                case at::ScalarType::Half:                                  \
-                {                                                           \
-                    using scalar_t_out = at::Half;                          \
-                    __VA_ARGS__;                                            \
-                    break;                                                  \
-                }                                                           \
-                case at::ScalarType::BFloat16:                              \
-                {                                                           \
-                    using scalar_t_out = at::BFloat16;                      \
-                    __VA_ARGS__;                                            \
-                    break;                                                  \
-                }                                                           \
-                default:                                                    \
-                    AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
-            }                                                               \
-            break;                                                          \
-        }                                                                   \
-        case at::ScalarType::Half:                                          \
-        {                                                                   \
-            using scalar_t_in = at::Half;                                   \
-            using scalar_t_out = at::Half;                                  \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        case at::ScalarType::BFloat16:                                      \
-        {                                                                   \
-            using scalar_t_in = at::BFloat16;                               \
-            using scalar_t_out = at::BFloat16;                              \
-            __VA_ARGS__;                                                    \
-            break;                                                          \
-        }                                                                   \
-        default:                                                            \
-            AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
-    }