@@ -355,12 +355,13 @@ __global__ void cuApplyLayerNorm(
     T* skip_input_bias_add_ovals = (skip_input_bias_add_output != nullptr) ? skip_input_bias_add_output + offset : nullptr;
     U c_inv_std_dev = rsqrt(sigma2 + epsilon);
 
-    // When X shape is (B, S, ...), and task_idx is in the range of [0, B * S).
+    // When X shape is (B, S, ...), i1 is in the range of [0, B * S).
     // We support scale and bias shape like below:
     // When scale and bias shape is (1, 1, ...) or (...), value of broadcast_param is 0.
     // When scale and bias shape is (B, 1, ...), value of broadcast_param is S.
     // When scale and bias shape is (B, S, ...), value of broadcast_param is 1.
     // When scale and bias shape is (1, S, ...), value of broadcast_param is -S.
+    // Here we compute the offset into gamma and beta (assuming they have the same shape) to support broadcasting.
     int gamma_beta_offset = (broadcast_param == 0)
                                 ? 0
                                 : n2 * (broadcast_param > 0 ? (i1 / broadcast_param) : (i1 % (-broadcast_param)));
@@ -378,9 +379,6 @@ __global__ void cuApplyLayerNorm(
         curr += static_cast<U>(skip_vals[i]);
       }
 
-      // onnx operator LayerNormalization support broadcast.
-      // gamma and beta should be unidirectional broadcastable to tensor x.
-      // Here we support a special case for transformer models that x is (B, S, D) and gamma/beta is (B, 1, D)
       int index = gamma_beta_offset + i;
       U gamma_i = (gamma != nullptr) ? (U)gamma[index] : (U)1;
       U beta_i = (beta != nullptr) ? (U)beta[index] : (U)0;
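For reference, below is a minimal host-side sketch (not part of this change) showing how the `gamma_beta_offset` expression above maps a row index `i1` in `[0, B * S)` to the start of the gamma/beta row for each supported `broadcast_param` value. The concrete values of `B`, `S`, `D` and the helper name `gamma_beta_offset_for` are illustrative only.

```cpp
#include <cassert>
#include <cstdio>

// Same formula as in the kernel: 0 means fully shared gamma/beta,
// a positive value selects the batch row, -S selects the sequence position,
// and 1 selects the (batch, sequence) row directly.
int gamma_beta_offset_for(int i1, int n2, int broadcast_param) {
  return (broadcast_param == 0)
             ? 0
             : n2 * (broadcast_param > 0 ? (i1 / broadcast_param)
                                         : (i1 % (-broadcast_param)));
}

int main() {
  const int B = 2, S = 3, D = 4;  // X has shape (B, S, D); n2 == D
  const int i1 = 4;               // row index: batch 1, sequence position 1

  // scale/bias shape (1, 1, D) or (D): broadcast_param == 0 -> offset is always 0
  assert(gamma_beta_offset_for(i1, D, 0) == 0);
  // scale/bias shape (B, 1, D): broadcast_param == S -> offset picks the batch row
  assert(gamma_beta_offset_for(i1, D, S) == (i1 / S) * D);   // batch 1 -> 4
  // scale/bias shape (B, S, D): broadcast_param == 1 -> offset picks row i1
  assert(gamma_beta_offset_for(i1, D, 1) == i1 * D);         // 16
  // scale/bias shape (1, S, D): broadcast_param == -S -> offset picks the sequence position
  assert(gamma_beta_offset_for(i1, D, -S) == (i1 % S) * D);  // position 1 -> 4

  std::printf("all broadcast cases check out\n");
  return 0;
}
```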