ROCm
diff --git a/‎dnn-providers/hip-kernel-provider/kernels/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎dnn-providers/hip-kernel-provider/kernels/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormBwd.cpp‎
Lines changed: 117 additions & 0 deletions b/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormBwd.cpp‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormCommon.hpp‎
Lines changed: 63 additions & 0 deletions b/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormCommon.hpp‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormFwd.cpp‎
Lines changed: 11 additions & 61 deletions b/‎dnn-providers/hip-kernel-provider/kernels/rmsnorm/RMSNormFwd.cpp‎
Lines changed: 11 additions & 61 deletions
diff --git a/‎dnn-providers/hip-kernel-provider/src/engines/plans/RMSnorm/RMSnormApplicabilityChecks.cpp‎
Lines changed: 11 additions & 39 deletions b/‎dnn-providers/hip-kernel-provider/src/engines/plans/RMSnorm/RMSnormApplicabilityChecks.cpp‎
Lines changed: 11 additions & 39 deletions
diff --git a/‎dnn-providers/hip-kernel-provider/src/engines/plans/RMSnorm/RMSnormApplicabilityChecks.hpp‎
Lines changed: 0 additions & 2 deletions b/‎dnn-providers/hip-kernel-provider/src/engines/plans/RMSnorm/RMSnormApplicabilityChecks.hpp‎
Lines changed: 0 additions & 2 deletions
@@ -82,7 +82,9 @@ set(KERNEL_FILES
     ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm/Configuration.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm/ReductionFunctions.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/batchnorm/StaticUnroll.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/rmsnorm/RMSNormCommon.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/rmsnorm/RMSNormFwd.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/rmsnorm/RMSNormBwd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/hip/vector_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/layernorm/LayernormFwd.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/types/FloatTypes.h
 
@@ -0,0 +1,117 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <type_traits>
+
+#include "RMSNormCommon.hpp"
+
+constexpr unsigned int LOCAL_SIZE = HIP_PLUGIN_RMSNORM_LOCAL_SIZE; // block width: scales blockIdx.x and sizes the shared buffer
+constexpr unsigned int INNER_SIZE = HIP_PLUGIN_RMSNORM_INNER_SIZE; // extent of the dimension reduced per (o, s) pair
+constexpr unsigned int OUTER_SIZE = HIP_PLUGIN_RMSNORM_OUTER_SIZE; // trip count of the outermost loop in RMSnormBwdWeightBias
+constexpr unsigned int STRIDE = HIP_PLUGIN_RMSNORM_STRIDE; // element distance between consecutive inner indices
+
+using XType = HIP_PLUGIN_RMSNORM_X_TYPE; // element type of the x pointer
+using DyType = HIP_PLUGIN_RMSNORM_DY_TYPE; // element type of the dy pointer
+using DxType = HIP_PLUGIN_RMSNORM_DX_TYPE; // element type of the dx pointer
+using ScaleType = HIP_PLUGIN_RMSNORM_SCALE_TYPE; // element type of weight, dweight and dbias
+using ComputeType = HIP_PLUGIN_RMSNORM_COMPUTE_TYPE; // element type of rstd; both kernels static_assert it is float
+
+// Accumulates the RMSnorm weight gradient and, when requested, the bias gradient.
+//
+// Each thread owns one index `tidx` of the INNER_SIZE dimension and walks every
+// (o, s) pair with o in [0, OUTER_SIZE) and s in [0, STRIDE), reading the element at
+// flat offset o * INNER_SIZE * STRIDE + tidx * STRIDE + s:
+//   dweight[tidx] = sum(dy * x * rstd[o * STRIDE + s])
+//   dbias[tidx]   = sum(dy)
+//
+// dy, x   : inputs read at the flat offset above.
+// rstd    : float values indexed by o * STRIDE + s.
+// dweight : output, one element per inner index.
+// dbias   : optional output; nothing is written when it is null.
+//
+// The global thread index is built from LOCAL_SIZE rather than blockDim.x, so the launch
+// is presumably expected to use LOCAL_SIZE threads per block -- confirm on the host side.
+extern "C" __global__ void RMSnormBwdWeightBias(const DyType* __restrict__ dy,
+                                                const XType* __restrict__ x,
+                                                const ComputeType* __restrict__ rstd,
+                                                ScaleType* __restrict__ dweight,
+                                                ScaleType* __restrict__ dbias)
+{
+    static_assert(std::is_same<ComputeType, float>::value,
+                  "ComputeType must be float for the RMSnormBwdWeightBias kernel");
+
+    const unsigned int tidx = threadIdx.x + blockIdx.x * LOCAL_SIZE;
+
+    // Threads past the end of the inner dimension have no output element to produce.
+    if(tidx >= INNER_SIZE)
+    {
+        return;
+    }
+
+    // Flat offsets are computed in size_t: the product of the unsigned int extents would
+    // otherwise wrap at 2^32 before being widened.
+    constexpr size_t outerPitch = static_cast<size_t>(INNER_SIZE) * STRIDE;
+    const size_t innerOffset = static_cast<size_t>(tidx) * STRIDE;
+
+    float sum_dw = 0.0f;
+    float sum_db = 0.0f;
+
+    // backward weight calculation
+    for(unsigned int o = 0; o < OUTER_SIZE; ++o)
+    {
+        // Both bases depend only on o, so they are hoisted out of the inner loop.
+        const size_t elemBase = static_cast<size_t>(o) * outerPitch + innerOffset;
+        const size_t rstdBase = static_cast<size_t>(o) * STRIDE;
+
+        for(unsigned int s = 0; s < STRIDE; ++s)
+        {
+            const size_t idx = elemBase + s;
+
+            float prstd = rstd[rstdBase + s];
+            float pdy = hip_kernel_provider::rmsnorm::to_float32<DyType>(dy[idx]);
+            float px = hip_kernel_provider::rmsnorm::to_float32<XType>(x[idx]);
+
+            sum_dw += pdy * px * prstd;
+            sum_db += pdy;
+        }
+    }
+
+    dweight[tidx] = hip_kernel_provider::rmsnorm::from_float32<ScaleType>(sum_dw);
+    // The bias gradient is optional; a null pointer means the caller does not want it.
+    if(dbias)
+    {
+        dbias[tidx] = hip_kernel_provider::rmsnorm::from_float32<ScaleType>(sum_db);
+    }
+}
+
+// Computes the RMSnorm input gradient dx for one (o, s) slice per block.
+//
+// blockIdx.x is split into o = gid / STRIDE and s = gid % STRIDE; the block's threads
+// then cooperate over the INNER_SIZE elements at flat offset
+// o * INNER_SIZE * STRIDE + i * STRIDE + s:
+//   mean  = sum_i(dy * weight[i] * x) / INNER_SIZE
+//   dx    = dy * weight[i] * rstd[gid] - mean * x * rstd[gid]^3
+//
+// dy, x  : inputs read at the flat offset above.
+// weight : per-inner-index scale, indexed by i.
+// rstd   : float values indexed by the block index.
+// dx     : output written at the same flat offset as dy / x.
+//
+// NOTE(review): the shared buffer holds LOCAL_SIZE floats and is indexed by threadIdx.x,
+// so the launch presumably uses LOCAL_SIZE threads per block -- confirm on the host side.
+// The halving reduction below only visits every slot when LOCAL_SIZE is a power of two.
+extern "C" __global__ void RMSnormBwdData(const DyType* __restrict__ dy,
+                                          const XType* __restrict__ x,
+                                          const ScaleType* __restrict__ weight,
+                                          const ComputeType* __restrict__ rstd,
+                                          DxType* __restrict__ dx)
+{
+    static_assert(std::is_same<ComputeType, float>::value,
+                  "ComputeType must be float for the RMSnormBwdData kernel");
+
+    const unsigned int gid = blockIdx.x;
+    const unsigned int lid = threadIdx.x;
+    const unsigned int o = gid / STRIDE;
+    const unsigned int s = gid % STRIDE;
+
+    // Offset of inner index 0 for this block's slice. Computed in size_t because the
+    // product of the unsigned int extents would otherwise wrap at 2^32 before widening.
+    const size_t sliceBase = static_cast<size_t>(o) * INNER_SIZE * STRIDE + s;
+
+    __shared__ float ltmp[LOCAL_SIZE];
+    float mean = 0.0f;
+
+    // reduce sum: each thread accumulates a partial sum over its share of inner indices
+    for(unsigned int i = lid; i < INNER_SIZE; i += LOCAL_SIZE)
+    {
+        const size_t idx = sliceBase + static_cast<size_t>(i) * STRIDE;
+
+        float pdy = hip_kernel_provider::rmsnorm::to_float32<DyType>(dy[idx]);
+        float px = hip_kernel_provider::rmsnorm::to_float32<XType>(x[idx]);
+        float pw = hip_kernel_provider::rmsnorm::to_float32<ScaleType>(weight[i]);
+
+        mean += pdy * pw * px;
+    }
+
+    ltmp[lid] = mean;
+    __syncthreads();
+
+    // Tree reduction in shared memory; the barrier sits outside the branch so every
+    // thread of the block reaches it on each step.
+    for(unsigned int i = LOCAL_SIZE >> 1; i > 0; i >>= 1)
+    {
+        if(lid < i)
+        {
+            ltmp[lid] += ltmp[lid + i];
+        }
+        __syncthreads();
+    }
+
+    // ltmp[0] now holds the block-wide sum; every thread reads the same value.
+    mean = ltmp[0] / INNER_SIZE;
+    float prstd = rstd[gid];
+
+    // backward data calculation
+    for(unsigned int i = lid; i < INNER_SIZE; i += LOCAL_SIZE)
+    {
+        const size_t idx = sliceBase + static_cast<size_t>(i) * STRIDE;
+
+        float pdy = hip_kernel_provider::rmsnorm::to_float32<DyType>(dy[idx]);
+        float px = hip_kernel_provider::rmsnorm::to_float32<XType>(x[idx]);
+        float pw = hip_kernel_provider::rmsnorm::to_float32<ScaleType>(weight[i]);
+
+        float dx_val = (pdy * pw * prstd) - (mean * px * prstd * prstd * prstd);
+        dx[idx] = hip_kernel_provider::rmsnorm::from_float32<DxType>(dx_val);
+    }
+}
@@ -0,0 +1,63 @@
+// Copyright © Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "Bfloat16Dev.hpp"
+
+namespace hip_kernel_provider::rmsnorm
+{
+
+// Conversion trait between a storage type T and float. The primary template is only
+// declared, so any T without a specialization below fails to compile.
+template <typename T>
+struct Cast;
+
+// float storage: both directions return the argument unchanged.
+template <>
+struct Cast<float>
+{
+    static __device__ __forceinline__ float to(float stored)
+    {
+        return stored;
+    }
+    static __device__ __forceinline__ float from(float computed)
+    {
+        return computed;
+    }
+};
+
+// half storage: converts through __half2float / __float2half.
+template <>
+struct Cast<half>
+{
+    static __device__ __forceinline__ float to(half stored)
+    {
+        const float widened = __half2float(stored);
+        return widened;
+    }
+    static __device__ __forceinline__ half from(float computed)
+    {
+        const half narrowed = __float2half(computed);
+        return narrowed;
+    }
+};
+
+// ushort storage: the value is handed to bfloat16_to_float / float_to_bfloat16
+// (presumably declared in Bfloat16Dev.hpp -- confirm).
+template <>
+struct Cast<ushort>
+{
+    static __device__ __forceinline__ float to(ushort stored)
+    {
+        const float widened = bfloat16_to_float(stored);
+        return widened;
+    }
+    static __device__ __forceinline__ ushort from(float computed)
+    {
+        const ushort narrowed = float_to_bfloat16(computed);
+        return narrowed;
+    }
+};
+
+// Converts a stored value of type T to float via Cast<T>::to.
+template <typename T>
+__device__ __forceinline__ float to_float32(T stored)
+{
+    using Converter = Cast<T>;
+    return Converter::to(stored);
+}
+
+// Converts a float to storage type T via Cast<T>::from.
+template <typename T>
+__device__ __forceinline__ T from_float32(float computed)
+{
+    using Converter = Cast<T>;
+    return Converter::from(computed);
+}
+
+} // namespace hip_kernel_provider::rmsnorm
@@ -3,70 +3,17 @@
 
 #include <type_traits>
 
-#include "Bfloat16Dev.hpp"
+#include "RMSNormCommon.hpp"
 
 constexpr unsigned int LOCAL_SIZE = HIP_PLUGIN_RMSNORM_LOCAL_SIZE;
 constexpr unsigned int INNER_SIZE = HIP_PLUGIN_RMSNORM_INNER_SIZE;
+constexpr unsigned int STRIDE = HIP_PLUGIN_RMSNORM_STRIDE;
 
 using InputType = HIP_PLUGIN_RMSNORM_INPUT_TYPE;
 using OutputType = HIP_PLUGIN_RMSNORM_OUTPUT_TYPE;
 using ScaleType = HIP_PLUGIN_RMSNORM_SCALE_TYPE;
 using ComputeType = HIP_PLUGIN_RMSNORM_COMPUTE_TYPE;
 
-template <typename T>
-struct Cast;
-
-template <>
-struct Cast<float>
-{
-    static __device__ __forceinline__ float to(float value)
-    {
-        return value;
-    }
-    static __device__ __forceinline__ float from(float value)
-    {
-        return value;
-    }
-};
-
-template <>
-struct Cast<half>
-{
-    static __device__ __forceinline__ float to(half value)
-    {
-        return __half2float(value);
-    }
-    static __device__ __forceinline__ half from(float value)
-    {
-        return __float2half(value);
-    }
-};
-
-template <>
-struct Cast<ushort>
-{
-    static __device__ __forceinline__ float to(ushort value)
-    {
-        return bfloat16_to_float(value);
-    }
-    static __device__ __forceinline__ ushort from(float value)
-    {
-        return float_to_bfloat16(value);
-    }
-};
-
-template <typename T>
-__device__ __forceinline__ float to_float32(T value)
-{
-    return Cast<T>::to(value);
-}
-
-template <typename T>
-__device__ __forceinline__ T from_float32(float value)
-{
-    return Cast<T>::from(value);
-}
-
 extern "C" __global__ void RMSnormFwd(const InputType* __restrict__ x,
                                       const ScaleType* __restrict__ weight,
                                       const ScaleType* __restrict__ bias,
@@ -80,15 +27,17 @@ extern "C" __global__ void RMSnormFwd(const InputType* __restrict__ x,
 
     const unsigned int gid = blockIdx.x;
     const unsigned int lid = threadIdx.x;
+    const unsigned int o = gid / STRIDE;
+    const unsigned int s = gid % STRIDE;
 
     float pvar = 0.0f;
     __shared__ float ltmp[LOCAL_SIZE];
 
     // reduce sum
     for(unsigned int i = lid; i < INNER_SIZE; i += LOCAL_SIZE)
     {
-        size_t idx = gid * INNER_SIZE + i;
-        float tmp = to_float32<InputType>(x[idx]);
+        size_t idx = o * INNER_SIZE * STRIDE + i * STRIDE + s;
+        float tmp = hip_kernel_provider::rmsnorm::to_float32<InputType>(x[idx]);
         pvar += tmp * tmp;
     }
 
@@ -114,12 +63,13 @@ extern "C" __global__ void RMSnormFwd(const InputType* __restrict__ x,
     // forward calculation
     for(unsigned int i = lid; i < INNER_SIZE; i += LOCAL_SIZE)
     {
-        size_t idx = gid * INNER_SIZE + i;
-        float y_val = to_float32<InputType>(x[idx]) * prstd * to_float32<ScaleType>(weight[i]);
+        size_t idx = o * INNER_SIZE * STRIDE + i * STRIDE + s;
+        float y_val = hip_kernel_provider::rmsnorm::to_float32<InputType>(x[idx]) * prstd
+                      * hip_kernel_provider::rmsnorm::to_float32<ScaleType>(weight[i]);
         if(bias != nullptr)
         {
-            y_val += to_float32<ScaleType>(bias[i]);
+            y_val += hip_kernel_provider::rmsnorm::to_float32<ScaleType>(bias[i]);
         }
-        y[idx] = from_float32<OutputType>(y_val);
+        y[idx] = hip_kernel_provider::rmsnorm::from_float32<OutputType>(y_val);
     }
 }
@@ -14,35 +14,6 @@
 
 namespace hip_kernel_provider::rmsnorm
 {
-// --- Validation Utilities ---
-
-void RMSnormValidator::validateSupportedLayout(const std::vector<int64_t>& strideOrder,
-                                               size_t numDims)
-{
-    if(numDims == 4)
-    {
-        const auto layoutNchw = hipdnn_data_sdk::utilities::TensorLayout::NCHW;
-
-        if(strideOrder != layoutNchw.strideOrder)
-        {
-            throw hipdnn_plugin_sdk::HipdnnPluginException(
-                HIPDNN_PLUGIN_STATUS_BAD_PARAM,
-                "RMSnorm implementation supports only NCHW layouts for 4D tensors.");
-        }
-    }
-    else
-    {
-        const auto layoutNcdhw = hipdnn_data_sdk::utilities::TensorLayout::NCDHW;
-
-        if(strideOrder != layoutNcdhw.strideOrder)
-        {
-            throw hipdnn_plugin_sdk::HipdnnPluginException(
-                HIPDNN_PLUGIN_STATUS_BAD_PARAM,
-                "RMSnorm implementation supports only NCDHW layouts for 5D tensors.");
-        }
-    }
-}
-
 // --- Component Validators ---
 
 void RMSnormValidator::checkTensorLayoutsAndDimsSupported()
@@ -83,23 +54,24 @@ void RMSnormValidator::checkTensorDataTypesSupported(const std::vector<int64_t>&
                                     "BFLOAT16 data types for x and y tensors.");
     }
 
-    // Only fp32 compute type is supported for now
-    const std::unordered_set<hipdnn_flatbuffers_sdk::data_objects::DataType> allowedComputeTypes{
-        hipdnn_flatbuffers_sdk::data_objects::DataType::FLOAT
+    const std::unordered_set<hipdnn_flatbuffers_sdk::data_objects::DataType> allowedAffineTypes{
+        hipdnn_flatbuffers_sdk::data_objects::DataType::FLOAT,
+        hipdnn_flatbuffers_sdk::data_objects::DataType::BFLOAT16,
+        hipdnn_flatbuffers_sdk::data_objects::DataType::HALF};
 
-    };
     validateConsistentDataTypes(affineTensorIds,
-                                allowedComputeTypes,
+                                allowedAffineTypes,
                                 "RMSnorm affine tensors use unsupported data type.",
                                 "All affine tensors for RMSnorm must have the same data type.");
 
-    const std::unordered_set<hipdnn_flatbuffers_sdk::data_objects::DataType> allowedStatTypes{
-        hipdnn_flatbuffers_sdk::data_objects::DataType::FLOAT,
-        hipdnn_flatbuffers_sdk::data_objects::DataType::BFLOAT16,
-        hipdnn_flatbuffers_sdk::data_objects::DataType::HALF};
+    // Only fp32 compute type is supported for now
+    const std::unordered_set<hipdnn_flatbuffers_sdk::data_objects::DataType> allowedComputeTypes{
+        hipdnn_flatbuffers_sdk::data_objects::DataType::FLOAT
+
+    };
 
     validateConsistentDataTypes(statTensorIds,
-                                allowedStatTypes,
+                                allowedComputeTypes,
                                 "RMSnorm stat tensors use unsupported data type.",
                                 "All stat tensors for RMSnorm must have the same data type.");
 }
 
@@ -13,8 +13,6 @@ namespace hip_kernel_provider::rmsnorm
 class RMSnormValidator : public IValidator
 {
 private:
-    void validateSupportedLayout(const std::vector<int64_t>& strideOrder, size_t numDims) override;
-
     void checkTensorLayoutsAndDimsSupported() override;
 
     void checkTensorDataTypesSupported(const std::vector<int64_t>& ioTensorIds,
Original file line number	Diff line number	Diff line change
`@@ -13,8 +13,6 @@ namespace hip_kernel_provider::rmsnorm`
`13`	`13`	`class RMSnormValidator : public IValidator`
`14`	`14`	`{`
`15`	`15`	`private:`
`16`		`- void validateSupportedLayout(const std::vector<int64_t>& strideOrder, size_t numDims) override;`
`17`		`-`
`18`	`16`	`void checkTensorLayoutsAndDimsSupported() override;`
`19`	`17`
`20`	`18`	`void checkTensorDataTypesSupported(const std::vector<int64_t>& ioTensorIds,`