vllm-project · mikaylagawarecki · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026 · Mar 29, 2026
@@ -339,7 +339,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_MakeAvailable(cutlass)
 
   list(APPEND VLLM_EXT_SRC
-    "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/cutlass_extensions/common.cpp"
     "csrc/quantization/fused_kernels/fused_silu_mul_block_quant.cu")
 
@@ -472,46 +471,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                    " in CUDA target architectures")
   endif()
 
-  # Only build AllSpark kernels if we are building for at least some compatible archs.
-  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
-  if (ALLSPARK_ARCHS)
-    set(ALLSPARK_SRCS
-       "csrc/quantization/gptq_allspark/allspark_repack.cu"
-       "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${ALLSPARK_SRCS}"
-      CUDA_ARCHS "${ALLSPARK_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
-    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
-  else()
-    message(STATUS "Not building AllSpark kernels as no compatible archs found"
-                   " in CUDA target architectures")
-  endif()
-
-  # CUTLASS MLA Archs and flags
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
-  endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
-    set(SRCS
-      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${MLA_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
-    # Add MLA-specific include directories only to MLA source files
-    set_source_files_properties(${SRCS}
-      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
-    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
-  else()
-    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
-    # clear MLA_ARCHS
-    set(MLA_ARCHS)
-  endif()
-
   # Expert-specialization MXFP8 blockscaled grouped kernels (SM100+).
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
     cuda_archs_loose_intersection(ES_MXFP8_GROUPED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
@@ -539,24 +498,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
-  # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later)
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
-  else()
-    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
-  endif()
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)
-    set(DSV3_FUSED_A_GEMM_SRC "csrc/dsv3_fused_a_gemm.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${DSV3_FUSED_A_GEMM_SRC}"
-      CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
-    list(APPEND VLLM_EXT_SRC ${DSV3_FUSED_A_GEMM_SRC})
-    message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
-  else()
-    message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
-                   "in CUDA target architectures.")
-  endif()
-
   #
   # Machete kernels
 
@@ -628,16 +569,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
 
-  # Hadacore kernels
-  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
-  if(HADACORE_ARCHS)
-    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
-    set_gencode_flags_for_srcs(
-      SRCS "${SRCS}"
-      CUDA_ARCHS "${HADACORE_ARCHS}")
-    list(APPEND VLLM_EXT_SRC "${SRCS}")
-    message(STATUS "Building hadacore")
-  endif()
 
 # if CUDA endif
 endif()
@@ -687,7 +618,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_STABLE_EXT_SRC
       "csrc/libtorch_stable/permute_cols.cu"
       "csrc/libtorch_stable/quantization/w8a8/fp8/per_token_group_quant.cu"
-      "csrc/libtorch_stable/quantization/w8a8/int8/per_token_group_quant.cu")
+      "csrc/libtorch_stable/quantization/w8a8/int8/per_token_group_quant.cu"
+      "csrc/libtorch_stable/quantization/awq/gemm_kernels.cu")
   endif()
 
   if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -696,6 +628,40 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
       CUDA_ARCHS "${CUDA_ARCHS}")
   endif()
 
+  # DeepSeek V3 fused A GEMM kernel (requires SM 9.0+, Hopper and later); its source goes into VLLM_STABLE_EXT_SRC.
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)  # CUDA compiler >= 13.0: use the `f`-suffixed 10.0/11.0 entries
+    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0f;11.0f" "${CUDA_ARCHS}")
+  else()  # older compilers: use the `a`-suffixed 10.x entries
+    cuda_archs_loose_intersection(DSV3_FUSED_A_GEMM_ARCHS "9.0a;10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND DSV3_FUSED_A_GEMM_ARCHS)  # needs CUDA compiler >= 12.0 and a non-empty arch list
+    set(SRCS "csrc/libtorch_stable/dsv3_fused_a_gemm.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${DSV3_FUSED_A_GEMM_ARCHS}")
+    list(APPEND VLLM_STABLE_EXT_SRC "${SRCS}")
+    message(STATUS "Building dsv3_fused_a_gemm for archs: ${DSV3_FUSED_A_GEMM_ARCHS}")
+  else()  # also taken when the CUDA compiler is older than 12.0, although the message only mentions archs
+    message(STATUS "Not building dsv3_fused_a_gemm as no compatible archs found "
+                   "in CUDA target architectures.")
+  endif()
+
+  # AllSpark kernels: built only when intersecting 8.0/8.6/8.7/8.9 with CUDA_ARCHS (via cuda_archs_loose_intersection) yields a non-empty list.
+  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
+  if (ALLSPARK_ARCHS)
+    set(SRCS
+       "csrc/libtorch_stable/quantization/gptq_allspark/allspark_repack.cu"
+       "csrc/libtorch_stable/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${ALLSPARK_ARCHS}")
+    list(APPEND VLLM_STABLE_EXT_SRC "${SRCS}")  # SRCS is a scratch variable that other kernel blocks overwrite
+    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
+  else()
+    message(STATUS "Not building AllSpark kernels as no compatible archs found"
+                   " in CUDA target architectures")
+  endif()
+
   #
   # CUTLASS scaled_mm kernels (moved from _C to _C_stable_libtorch)
   #
@@ -989,6 +955,41 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()
 
+  # CUTLASS MLA: pick target archs by CUDA compiler version, then add the kernel source, its define, and per-file include dirs.
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)  # needs CUDA compiler >= 12.8 and a non-empty MLA_ARCHS
+    set(SRCS
+      "csrc/libtorch_stable/attention/mla/sm100_cutlass_mla_kernel.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${MLA_ARCHS}")
+    list(APPEND VLLM_STABLE_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")  # NOTE(review): confirm every target that needs this define reads VLLM_GPU_FLAGS after this point
+    # Add MLA-specific include directories only to MLA source files
+    set_source_files_properties(${SRCS}
+      PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
+    message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
+  else()  # also taken when the CUDA compiler is older than 12.8, although the message only mentions archs
+    message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
+    # clear MLA_ARCHS so it is empty whenever the MLA kernel is not built
+    set(MLA_ARCHS)
+  endif()
+
+  # Hadacore kernels: built when intersecting 8.0+PTX/9.0+PTX with CUDA_ARCHS (via cuda_archs_loose_intersection) yields a non-empty list.
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0+PTX;9.0+PTX" "${CUDA_ARCHS}")
+  if(HADACORE_ARCHS)
+    set(SRCS "csrc/libtorch_stable/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${HADACORE_ARCHS}")
+    list(APPEND VLLM_STABLE_EXT_SRC "${SRCS}")
+    message(STATUS "Building hadacore")
+  endif()  # no else branch: nothing is logged when hadacore is skipped
+
   message(STATUS "Enabling C_stable extension.")
   define_extension_target(
     _C_stable_libtorch

diff --git a/csrc/core/scalar_type.hpp b/csrc/core/scalar_type.hpp
@@ -1,7 +1,13 @@
 #pragma once
 
-// For TORCH_CHECK
-#include <torch/library.h>
+#include <cstdint>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <variant>
+
+// For STD_TORCH_CHECK
+#include <torch/headeronly/util/Exception.h>
 
 namespace vllm {
 
@@ -45,19 +51,20 @@ class ScalarType {
   // IEEE 754 compliant floating point type
   static constexpr ScalarType float_IEEE754(uint8_t exponent,
                                             uint8_t mantissa) {
-    TORCH_CHECK(mantissa > 0 && exponent > 0);
+    STD_TORCH_CHECK(mantissa > 0 && exponent > 0);  // both must be non-zero
     return ScalarType(exponent, mantissa, true, 0, false, NAN_IEEE_754);
   }
 
   // IEEE 754 non-compliant floating point type
   static constexpr ScalarType float_(uint8_t exponent, uint8_t mantissa,
                                      bool finite_values_only,
                                      NanRepr nan_repr) {
-    TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
-    TORCH_CHECK(mantissa > 0 && exponent > 0);
-    TORCH_CHECK(nan_repr != NAN_IEEE_754,
-                "use `float_IEEE754` constructor for floating point types that "
-                "follow IEEE 754 conventions");
+    STD_TORCH_CHECK(nan_repr < NAN_REPR_ID_MAX, "Invalid NanRepr");
+    STD_TORCH_CHECK(mantissa > 0 && exponent > 0);  // both must be non-zero
+    STD_TORCH_CHECK(  // NAN_IEEE_754 is reserved for float_IEEE754()
+        nan_repr != NAN_IEEE_754,
+        "use `float_IEEE754` constructor for floating point types that "
+        "follow IEEE 754 conventions");
     return ScalarType(exponent, mantissa, true, 0, finite_values_only,
                       nan_repr);
   }
@@ -176,8 +183,8 @@ class ScalarType {
 
  private:
   double _floating_point_max() const {
-    TORCH_CHECK(mantissa <= 52 && exponent <= 11,
-                "Cannot represent max/min as a double for type ", str());
+    STD_TORCH_CHECK(mantissa <= 52 && exponent <= 11,
+                    "Cannot represent max/min as a double for type ", str());
 
     uint64_t max_mantissa = (uint64_t(1) << mantissa) - 1;
     if (nan_repr == NAN_EXTD_RANGE_MAX_MIN) {
@@ -186,8 +193,8 @@ class ScalarType {
 
     uint64_t max_exponent = (uint64_t(1) << exponent) - 2;
     if (nan_repr == NAN_EXTD_RANGE_MAX_MIN || nan_repr == NAN_NONE) {
-      TORCH_CHECK(exponent < 11,
-                  "Cannot represent max/min as a double for type ", str());
+      STD_TORCH_CHECK(exponent < 11,
+                      "Cannot represent max/min as a double for type ", str());
       max_exponent += 1;
     }
 
@@ -216,25 +223,26 @@ class ScalarType {
     if (is_floating_point()) {
       return {_floating_point_max()};
     } else {
-      TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
-                  "Cannot represent max as a int64_t");
+      STD_TORCH_CHECK(size_bits() < 64 || size_bits() == 64 && is_signed(),
+                      "Cannot represent max as a int64_t");
       return {(int64_t(1) << mantissa) - 1};
     }
   }
 
   constexpr std::variant<int64_t, double> _raw_min() const {
     if (is_floating_point()) {
-      TORCH_CHECK(is_signed(),
-                  "We currently assume all floating point types are signed");
+      STD_TORCH_CHECK(
+          is_signed(),
+          "We currently assume all floating point types are signed");
       constexpr uint64_t sign_bit_double = (uint64_t(1) << 63);
 
       double max = _floating_point_max();
       uint64_t max_raw = *reinterpret_cast<uint64_t*>(&max);
       uint64_t min_raw = max_raw | sign_bit_double;
       return {*reinterpret_cast<double*>(&min_raw)};
     } else {
-      TORCH_CHECK(!is_signed() || size_bits() <= 64,
-                  "Cannot represent min as a int64_t");
+      STD_TORCH_CHECK(!is_signed() || size_bits() <= 64,
+                      "Cannot represent min as a int64_t");
       if (is_signed()) {
         // set the top bit to 1 (i.e. INT64_MIN) and the rest to 0
         // then perform an arithmetic shift right to set all the bits above

diff --git a/...la/cutlass_sm100_mla/device/sm100_mla.hpp → ...la/cutlass_sm100_mla/device/sm100_mla.hpp b/...la/cutlass_sm100_mla/device/sm100_mla.hpp → ...la/cutlass_sm100_mla/device/sm100_mla.hpp
diff --git a/...0_mla/kernel/sm100_fmha_mla_reduction.hpp → ...0_mla/kernel/sm100_fmha_mla_reduction.hpp b/...0_mla/kernel/sm100_fmha_mla_reduction.hpp → ...0_mla/kernel/sm100_fmha_mla_reduction.hpp
diff --git a/...el/sm100_fmha_mla_tma_warpspecialized.hpp → ...el/sm100_fmha_mla_tma_warpspecialized.hpp b/...el/sm100_fmha_mla_tma_warpspecialized.hpp → ...el/sm100_fmha_mla_tma_warpspecialized.hpp
diff --git a/...0_mla/kernel/sm100_mla_tile_scheduler.hpp → ...0_mla/kernel/sm100_mla_tile_scheduler.hpp b/...0_mla/kernel/sm100_mla_tile_scheduler.hpp → ...0_mla/kernel/sm100_mla_tile_scheduler.hpp