
Commit a47b6af

undo layer norm kernel
1 parent 913c6ed commit a47b6af

5 files changed: +13 -48 lines changed


onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc

Lines changed: 0 additions & 1 deletion
@@ -101,7 +101,6 @@ Status SkipLayerNorm<T, Simplified>::ComputeInternal(OpKernelContext* ctx) const
       (double)epsilon_,  // epsilon
       reinterpret_cast<const CudaT*>(gamma->Data<T>()),  // gamma
       (beta != nullptr) ? reinterpret_cast<const CudaT*>(beta->Data<T>()) : nullptr,  // beta
-      0,  // broadcast stride for gamma/beta
       reinterpret_cast<const CudaT*>(skip->Data<T>()),  // skip or residual to add
       (bias != nullptr) ? reinterpret_cast<const CudaT*>(bias->Data<T>()) : nullptr,  // bias to add
       sum_output != nullptr ? reinterpret_cast<CudaT*>(sum_output->MutableData<T>()) : nullptr);
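The call above drives a fused skip + layer-norm kernel. As a reference for what it computes, here is a minimal CPU sketch assuming the usual SkipLayerNorm semantics of summing input, residual, and optional bias before normalizing; the function and variable names below are illustrative, not ORT code:

#include <cmath>
#include <vector>

// Reference-only sketch: s = x + skip (+ bias), then y = gamma * (s - mean) / sqrt(var + eps) + beta,
// computed independently for one row of length d.
std::vector<float> SkipLayerNormRow(const std::vector<float>& x,
                                    const std::vector<float>& skip,
                                    const std::vector<float>& gamma,
                                    const std::vector<float>& beta,
                                    float eps) {
  const size_t d = x.size();
  std::vector<float> s(d), y(d);
  float mean = 0.0f;
  for (size_t i = 0; i < d; ++i) {
    s[i] = x[i] + skip[i];  // residual add; an optional bias would be added here as well
    mean += s[i];
  }
  mean /= static_cast<float>(d);
  float var = 0.0f;
  for (size_t i = 0; i < d; ++i) {
    var += (s[i] - mean) * (s[i] - mean);
  }
  var /= static_cast<float>(d);
  const float inv_std = 1.0f / std::sqrt(var + eps);
  for (size_t i = 0; i < d; ++i) {
    y[i] = gamma[i] * (s[i] - mean) * inv_std + beta[i];
  }
  return y;
}

Each row of length d is normalized independently, and gamma/beta here are plain 1-D vectors of length d, which is exactly the case the kernel returns to once the broadcast stride argument is removed.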

onnxruntime/core/providers/cuda/nn/layer_norm.cc

Lines changed: 8 additions & 25 deletions
@@ -44,36 +44,19 @@ Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) con
   auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaV*>(bias->Data<V>());
 
   const TensorShape& x_shape = X->Shape();
-  auto x_num_dims = x_shape.NumDimensions();
-  const int64_t axis = HandleNegativeAxis(axis_, x_num_dims);
+  const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
 
   int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
   int n2 = gsl::narrow<int>(x_shape.SizeFromDimension(axis));
 
   const auto scale_size = scale->Shape().Size();
   const auto bias_size = (bias_data) ? bias->Shape().Size() : 0;
-
-  int broadcast = 0;
   if (n2 == 1 || scale_size != n2 || (bias_data && bias_size != n2)) {
-    // Handle a special case for MMDit where scale and bias need broadcast.
-    // X shape is (B, S, D), scale and bias shape is (B, 1, D), and we store S as broadcast stride.
-    if (x_num_dims == 3 && axis == 2 && n2 > 1 &&
-        scale->Shape().NumDimensions() == x_num_dims &&
-        scale->Shape().GetDims()[0] == x_shape.GetDims()[0] &&
-        scale->Shape().GetDims()[1] == 1 &&
-        scale->Shape().GetDims()[2] == x_shape.GetDims()[2] &&
-        bias->Shape().NumDimensions() == x_num_dims &&
-        bias->Shape().GetDims()[0] == x_shape.GetDims()[0] &&
-        bias->Shape().GetDims()[1] == 1 &&
-        bias->Shape().GetDims()[2] == x_shape.GetDims()[2]) {
-      broadcast = static_cast<int>(x_shape.GetDims()[1]);
-    } else {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "Size of X.shape()[axis:] == ", n2,
-                             ". Size of scale and bias (if provided) must match this "
-                             "and the size must not be 1. Got scale size of ",
-                             scale_size, " and bias size of ", bias_size);
-    }
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Size of X.shape()[axis:] == ", n2,
+                           ". Size of scale and bias (if provided) must match this "
+                           "and the size must not be 1. Got scale size of ",
+                           scale_size, " and bias size of ", bias_size);
   }
 
   // Outputs
@@ -82,7 +65,7 @@ Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) con
 
   // Mean and variance
   std::vector<int64_t> mean_inv_std_var_dim;
-  for (int i = 0; i < static_cast<int>(x_num_dims); ++i) {
+  for (int i = 0; i < static_cast<int>(x_shape.NumDimensions()); ++i) {
     if (i < axis) {
       mean_inv_std_var_dim.emplace_back(x_shape.GetDims()[i]);
     } else {
@@ -111,7 +94,7 @@ Status LayerNorm<T, U, V, simplified>::ComputeInternal(OpKernelContext* ctx) con
   }
 
   HostApplyLayerNorm<CudaT, CudaU, CudaV, simplified>(GetDeviceProp(), Stream(ctx), Y_data, mean_data, inv_var_data,
-                                                      X_data, n1, n2, epsilon_, scale_data, bias_data, broadcast);
+                                                      X_data, n1, n2, epsilon_, scale_data, bias_data);
   CUDA_RETURN_IF_ERROR(cudaGetLastError());
   return Status::OK();
 }
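The block deleted above detected the MMDiT-style case where X has shape (B, S, D) while scale and bias have shape (B, 1, D), and recorded S as the broadcast stride handed down to the kernel. A standalone sketch of that detection, under the shape assumption stated in the deleted comment (the helper below is illustrative, not ORT code):

#include <cstdint>
#include <vector>

// Returns the broadcast stride S when X is (B, S, D) and scale/bias are (B, 1, D),
// i.e. how many consecutive rows of the flattened (B*S, D) input share one row of
// gamma/beta; returns 0 when the shapes do not match this special case.
int64_t DetectGammaBetaBroadcast(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& scale_dims,
                                 const std::vector<int64_t>& bias_dims) {
  if (x_dims.size() != 3 || scale_dims.size() != 3 || bias_dims.size() != 3) return 0;
  auto matches = [&](const std::vector<int64_t>& p) {
    return p[0] == x_dims[0] && p[1] == 1 && p[2] == x_dims[2];
  };
  return (matches(scale_dims) && matches(bias_dims)) ? x_dims[1] : 0;
}

With this commit the provider no longer computes such a stride, so a (B, 1, D) scale/bias once again falls into the INVALID_ARGUMENT branch.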

onnxruntime/core/providers/cuda/nn/layer_norm_impl.cu

Lines changed: 5 additions & 12 deletions
@@ -334,7 +334,6 @@ __global__ void cuApplyLayerNorm(
     const U epsilon,
     const V* __restrict__ gamma,
     const V* __restrict__ beta,
-    int broadcast,
     const T* __restrict__ skip,
     const T* __restrict__ bias,
     T* __restrict__ skip_input_bias_add_output) {
@@ -367,13 +366,8 @@ __global__ void cuApplyLayerNorm(
         curr += static_cast<U>(skip_vals[i]);
       }
 
-      // onnx operator LayerNormalization support broadcast.
-      // gamma and beta should be unidirectional broadcastable to tensor x.
-      // Here we support a special case for transformer models that x is (B, S, D) and gamma/beta is (B, 1, D)
-      int index = (broadcast > 0) ? ((i1 / broadcast) * n2 + i) : i;
-      U gamma_i = (gamma != nullptr) ? (U)gamma[index] : (U)1;
-      U beta_i = (beta != nullptr) ? (U)beta[index] : (U)0;
-
+      U gamma_i = (gamma != nullptr) ? (U)gamma[i] : (U)1;
+      U beta_i = (beta != nullptr) ? (U)beta[i] : (U)0;
       if (simplified) {
         ovals[i] = static_cast<V>(gamma_i * c_inv_std_dev * curr);
       } else {
@@ -415,7 +409,6 @@ void HostApplyLayerNorm(
     double epsilon,
     const V* gamma,
     const V* beta,
-    int broadcast,
     const T* skip,
     const T* bias,
     T* skip_input_bias_add_output) {
@@ -449,15 +442,15 @@ void HostApplyLayerNorm(
       input,
       n1, n2,
       U(epsilon),
-      gamma, beta, broadcast,
+      gamma, beta,
       skip, bias, skip_input_bias_add_output);
 }
 
 #define LAYERNORM_LINEAR_IMPL(T, U, V, simplified)                                                                  \
   template void HostApplyLayerNorm<T, U, V, simplified>(const cudaDeviceProp& prop, cudaStream_t stream, V* output, \
                                                          U* mean, U* inv_std_dev, const T* input, int n1, int n2,   \
-                                                         double epsilon, const V* gamma, const V* beta, int broadcast, \
-                                                         const T* skip, const T* bias, T* skip_input_bias_add_output);
+                                                         double epsilon, const V* gamma, const V* beta, const T* skip, \
+                                                         const T* bias, T* skip_input_bias_add_output);
 
 LAYERNORM_LINEAR_IMPL(float, float, float, true)
 LAYERNORM_LINEAR_IMPL(half, float, half, true)
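The lines deleted from cuApplyLayerNorm read gamma/beta at (i1 / broadcast) * n2 + i instead of i. A host-side sketch of that index arithmetic, assuming the input is flattened to (n1, n2) with n1 = B * S and broadcast = S (illustrative helper, not ORT code):

// For row i1 of the flattened (n1, n2) input, the batch index is b = i1 / broadcast,
// and per-batch gamma/beta of shape (B, 1, D) is read at b * n2 + i.
// broadcast == 0 meant ordinary 1-D gamma/beta indexed by the column i alone.
inline int GammaBetaIndex(int i1, int i, int n2, int broadcast) {
  return (broadcast > 0) ? (i1 / broadcast) * n2 + i : i;
}

After this commit the kernel always uses the plain index i, matching 1-D gamma/beta of length n2.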

onnxruntime/core/providers/cuda/nn/layer_norm_impl.h

Lines changed: 0 additions & 1 deletion
@@ -41,7 +41,6 @@ void HostApplyLayerNorm(
     double epsilon,
     const V* gamma,
     const V* beta,
-    int broadcast = 0,  // broadcast stride for gamma/beta
     const T* skip = nullptr,
     const T* bias = nullptr,
     T* skip_input_bias_add_output = nullptr);

onnxruntime/python/tools/transformers/onnx_model_mmdit.py

Lines changed: 0 additions & 9 deletions
@@ -87,14 +87,6 @@ def _optimize(self, options: Optional[FusionOptions] = None, progress_bar=None):
         if progress_bar:
             progress_bar.update(1)
 
-        # TODO: SkipLayerNormalization does not support broadcast yet.
-        # if (options is None) or options.enable_skip_layer_norm:
-        #     self.fuse_skip_simplified_layer_norm()
-        #     self.fuse_skip_layer_norm()
-        # if (options is None) or options.enable_bias_skip_layer_norm:
-        #     # Fuse SkipLayerNormalization and Add Bias before it.
-        #     self.fuse_add_bias_skip_layer_norm()
-
         self.postprocess()
         if progress_bar:
             progress_bar.update(1)
@@ -110,7 +102,6 @@ def get_fused_operator_statistics(self):
             "FastGelu",
             "MultiHeadAttention",
             "LayerNormalization",
-            # "SkipLayerNormalization",
             "SimplifiedLayerNormalization",
         ]
 
