@@ -41,14 +41,10 @@ void LayerNormalization ::calculate_forward_stats(
4141 util::MPIRootPrintStreamInfo () << " WARNING: EMPTY INPUT FOUND \n " ;
4242 return ; // no op for empty inputs
4343 }
44-
4544 const auto & input_dims = input.get_local_shape ();
4645 const auto & statistics_dims = statistics.get_local_shape ();
47-
4846 const auto local_num_samples = input_0_dims[3 ];
49-
5047 const auto global_num_samples = statistics_dims[3 ];
51-
5248 const auto local_sample_size = std::accumulate (input_dims.begin (),
5349 input_dims.end () - 1 ,
5450 1 ,
@@ -61,7 +57,7 @@ void LayerNormalization ::calculate_forward_stats(
6157 local_sample_size);
6258
6359 LocalMat local_statistics (2 ,
64- local_num_samples ,
60+ global_num_samples ,
6561 statistics.get_local_shape (),
6662 2 );
6763
@@ -101,7 +97,79 @@ void LayerNormalization::apply_normalization(
10197 const DCTensor<Allocator>& input,
10298 const DCTensor<Allocator>& statistics,
10399 DCTensor<Allocator>& output)
104- {}
100+ {
101+ const auto & input_dims = input.get_local_shape ();
102+ const auto & statistics_dims = statistics.get_local_shape ();
103+ const auto local_num_samples = input_0_dims[3 ];
104+ const auto global_num_samples = statistics_dims[3 ];
105+ const auto local_sample_size = std::accumulate (input_dims.begin (),
106+ input_dims.end () - 1 ,
107+ 1 ,
108+ std::multiplies<int >());
109+
110+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
111+ const LocalMat local_input (local_sample_size,
112+ local_num_samples,
113+ input.get_buffer (),
114+ local_sample_size);
115+
116+ const LocalMat local_statistics (2 ,
117+ global_num_samples,
118+ statistics.get_local_shape (),
119+ 2 );
120+
121+ LocalMat local_output (local_sample_size,
122+ local_num_samples,
123+ output.get_buffer (),
124+ local_sample_size);
125+
126+ const auto local_means = El::View (local_statistics, El::IR (0 ), El::ALL);
127+ const auto local_vars = El::View (local_statistics, El::IR (1 ), El::ALL);
128+ {
129+ using namespace hydrogen ;
130+ auto sync_info = gpu::get_sync_info (local_statistics);
131+ constexpr size_t block_size = 256 ;
132+ dim3 block_dims, grid_dims;
133+ block_dims.x = block_size;
134+ grid_dims.x = (local_num_samples + block_size - 1 ) / block_size;
135+ hydrogen::gpu::LaunchKernel (layer_norm_fp_statistics_kernel<TensorDataType>,
136+ grid_dims,
137+ block_dims,
138+ 0 ,
139+ sync_info,
140+ sample_size,
141+ local_num_samples,
142+ local_means.Buffer (),
143+ local_means.LDim (),
144+ local_vars.Buffer (),
145+ local_vars.LDim ());
146+
147+ auto multisync = El::MakeMultiSync (gpu::get_sync_info (local_output),
148+ gpu::get_sync_info (local_statistics),
149+ gpu::get_sync_info (local_input));
150+ constexpr size_t block_size = 256 ;
151+ dim3 block_dims, grid_dims;
152+ block_dims.x = block_size;
153+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
154+ grid_dims.y = local_num_samples;
155+ hydrogen::gpu::LaunchKernel (layer_norm_fp_output_kernel<TensorDataType>,
156+ grid_dims,
157+ block_dims,
158+ 0 ,
159+ multisync,
160+ local_num_samples,
161+ local_sample_size,
162+ epsilon,
163+ local_input.LockedBuffer (),
164+ local_input.LDim (),
165+ local_output.Buffer (),
166+ local_output.LDim (),
167+ local_means.LockedBuffer (),
168+ local_means.LDim (),
169+ local_vars.LockedBuffer (),
170+ local_vars.LDim ());
171+ }
172+ }
105173
106174template <typename Backend, typename DataType>
107175template <typename Allocator>
@@ -110,15 +178,147 @@ void LayerNormalization::calculate_backward_stats(
110178 const DCTensor<Allocator>& output_grad,
111179 const DCTensor<Allocator>& statistics,
112180 DCTensor<Allocator>& statistics_grad)
113- {}
181+ {
182+ const auto & input_dims = input.get_local_shape ();
183+ const auto & statistics_dims = statistics.get_local_shape ();
184+ const auto local_num_samples = input_0_dims[3 ];
185+ const auto global_num_samples = statistics_dims[3 ];
186+ const auto local_sample_size = std::accumulate (input_dims.begin (),
187+ input_dims.end () - 1 ,
188+ 1 ,
189+ std::multiplies<int >());
190+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
191+ const LocalMat local_input (local_sample_size,
192+ local_num_samples,
193+ input.get_buffer (),
194+ local_sample_size);
195+ const LocalMat local_output_grad (local_sample_size,
196+ local_num_samples,
197+ output_grad.get_buffer (),
198+ local_sample_size);
199+
200+ const LocalMat local_statistics (2 ,
201+ global_num_samples,
202+ statistics.get_local_shape (),
203+ 2 );
204+
205+ LocalMat local_statistics_grad (2 ,
206+ global_num_samples,
207+ statistics_grad.get_buffer (),
208+ 2 );
209+ {
210+ using namespace hydrogen ;
211+ auto multisync =
212+ El::MakeMultiSync (gpu::get_sync_info (local_statistics_grad),
213+ gpu::get_sync_info (local_output_grad),
214+ gpu::get_sync_info (local_statistics),
215+ gpu::get_sync_info (local_input));
216+ constexpr size_t block_size = 256 ;
217+ dim3 block_dims, grid_dims;
218+ block_dims.x = block_size;
219+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
220+ grid_dims.y = local_num_samples;
221+ hydrogen::gpu::LaunchKernel (
222+ layer_norm_bp_statistics_grad_kernel<block_size, TensorDataType>,
223+ grid_dims,
224+ block_dims,
225+ 0 ,
226+ multisync,
227+ local_num_samples,
228+ local_sample_size,
229+ m_epsilon,
230+ local_input.LockedBuffer (),
231+ local_input.LDim (),
232+ local_output_grad.LockedBuffer (),
233+ local_output_grad.LDim (),
234+ local_means.LockedBuffer (),
235+ local_means.LDim (),
236+ local_vars.LockedBuffer (),
237+ local_vars.LDim (),
238+ local_means_grad.Buffer (),
239+ local_means_grad.LDim (),
240+ local_vars_grad.Buffer (),
241+ local_vars_grad.LDim ());
242+ }
243+ }
244+
114245template <typename Backend, typename DataType>
115246template <typename Allocator>
116247void LayerNormalization::apply_grad (const DCTensor<Allocator>& input,
117248 const DCTensor<Allocator>& output_grad,
118249 const DCTensor<Allocator>& statistics,
119250 const DCTensor<Allocator>& statistics_grad,
120251 DCTensor<Allocator>& input_grad)
121- {}
252+ {
253+ const auto & input_dims = input.get_local_shape ();
254+ const auto & statistics_dims = statistics.get_local_shape ();
255+ const auto local_num_samples = input_0_dims[3 ];
256+ const auto global_num_samples = statistics_dims[3 ];
257+ const auto local_sample_size = std::accumulate (input_dims.begin (),
258+ input_dims.end () - 1 ,
259+ 1 ,
260+ std::multiplies<int >());
261+ using LocalMat = El::Matrix<DataType, El::Device::GPU>;
262+ const LocalMat local_input (local_sample_size,
263+ local_num_samples,
264+ input.get_buffer (),
265+ local_sample_size);
266+ const LocalMat local_output_grad (local_sample_size,
267+ local_num_samples,
268+ output_grad.get_buffer (),
269+ local_sample_size);
270+
271+ const LocalMat local_statistics (2 ,
272+ global_num_samples,
273+ statistics.get_local_shape (),
274+ 2 );
275+
276+ const LocalMat local_statistics_grad (2 ,
277+ global_num_samples,
278+ statistics_grad.get_buffer (),
279+ 2 );
280+
281+ LocalMat local_input_grad (local_sample_size,
282+ local_num_samples,
283+ input_grad.get_buffer (),
284+ local_sample_size);
285+ {
286+ using namespace hydrogen ;
287+ auto multisync =
288+ El::MakeMultiSync (gpu::get_sync_info (local_statistics_grad),
289+ gpu::get_sync_info (local_output_grad),
290+ gpu::get_sync_info (local_statistics),
291+ gpu::get_sync_info (local_input));
292+ constexpr size_t block_size = 256 ;
293+ dim3 block_dims, grid_dims;
294+ block_dims.x = block_size;
295+ grid_dims.x = (local_sample_size + block_size - 1 ) / block_size;
296+ grid_dims.y = local_num_samples;
297+ hydrogen::gpu::LaunchKernel (layer_norm_bp_input_grad_kernel<TensorDataType>,
298+ grid_dims,
299+ block_dims,
300+ 0 ,
301+ multisync,
302+ sample_size,
303+ local_num_samples,
304+ local_sample_size,
305+ m_epsilon,
306+ local_input.LockedBuffer (),
307+ local_input.LDim (),
308+ local_output_grad.LockedBuffer (),
309+ local_output_grad.LDim (),
310+ local_input_grad.Buffer (),
311+ local_input_grad.LDim (),
312+ local_means.LockedBuffer (),
313+ local_means.LDim (),
314+ local_vars.LockedBuffer (),
315+ local_vars.LDim (),
316+ local_means_grad.LockedBuffer (),
317+ local_means_grad.LDim (),
318+ local_vars_grad.LockedBuffer (),
319+ local_vars_grad.LDim ());
320+ }
321+ }
122322
123323#define ETI (T, Backend ) \
124324 template class LayerNormalization <Backend, T>; \
0 commit comments