template <typename Backend, typename DataType>
template <typename Allocator>
void LayerNormalization<Backend, DataType>::calculate_forward_stats(
  const DCTensor<Allocator>& input,
  DCTensor<Allocator>& statistics)
{
  if (input.get_local_size() == 0) {
    util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
    return; // no-op for empty inputs
  }

  const auto& input_dims = input.get_local_shape();
  const auto& statistics_dims = statistics.get_local_shape();

  // The sample dimension is the outermost (index 3) dimension
  const auto local_num_samples = input_dims[3];

  const auto global_num_samples = statistics_dims[3];

  // Flatten all non-sample dimensions into the per-sample size
  const auto local_sample_size = std::accumulate(input_dims.begin(),
                                                 input_dims.end() - 1,
                                                 1,
                                                 std::multiplies<int>());

  // Wrap the distconv buffers in local Hydrogen GPU matrices
  using LocalMat = El::Matrix<DataType, El::Device::GPU>;
  LocalMat local_input(local_sample_size,
                       local_num_samples,
                       input.get_buffer(),
                       local_sample_size);

  LocalMat local_statistics(2,
                            local_num_samples,
                            statistics.get_buffer(),
                            2);

  // Row 0 holds the per-sample sums (means), row 1 the squared sums (vars)
  El::Zero(local_statistics);
  auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
  auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);

  {
    using namespace hydrogen;
    auto multisync = El::MakeMultiSync(gpu::get_sync_info(local_statistics),
                                       gpu::get_sync_info(local_input));
    constexpr size_t block_size = 256;
    dim3 block_dims, grid_dims;
    block_dims.x = block_size;
    grid_dims.x = (local_sample_size + block_size - 1) / block_size;
    grid_dims.y = local_num_samples;
    hydrogen::gpu::LaunchKernel(
      ::lbann::layer_norm_fp_sums_kernel<block_size, DataType>,
      grid_dims,
      block_dims,
      0,
      multisync,
      local_num_samples,
      local_sample_size,
      local_input.LockedBuffer(),
      local_input.LDim(),
      local_means.Buffer(),
      local_means.LDim(),
      local_vars.Buffer(),
      local_vars.LDim());
  }
}
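
// The launch above assumes a reduction kernel that accumulates, for each
// sample (one blockIdx.y per sample), the sum and the sum of squares of that
// sample's entries into the statistics matrix (row 0 = sums/means,
// row 1 = squared sums/vars). The actual ::lbann::layer_norm_fp_sums_kernel is
// defined elsewhere in LBANN; the sketch below only illustrates that contract,
// using a hypothetical name and a plain atomicAdd rather than whatever
// reduction primitives the real kernel uses.
//
//   template <size_t block_size, typename T>
//   __global__ void fp_sums_kernel_sketch(size_t num_samples,
//                                         size_t sample_size,
//                                         const T* __restrict__ input,
//                                         size_t input_ldim,
//                                         T* __restrict__ sums,
//                                         size_t sums_stride,
//                                         T* __restrict__ sqsums,
//                                         size_t sqsums_stride)
//   {
//     __shared__ T shared_sums[block_size];
//     __shared__ T shared_sqsums[block_size];
//     const size_t tid = threadIdx.x;
//     const size_t sample = blockIdx.y;
//     // Each thread accumulates a strided slice of one sample
//     T sum = T(0), sqsum = T(0);
//     for (size_t i = blockIdx.x * block_size + tid; i < sample_size;
//          i += block_size * gridDim.x) {
//       const T x = input[i + sample * input_ldim];
//       sum += x;
//       sqsum += x * x;
//     }
//     shared_sums[tid] = sum;
//     shared_sqsums[tid] = sqsum;
//     // Tree reduction within the block
//     for (size_t stride = block_size / 2; stride > 0; stride /= 2) {
//       __syncthreads();
//       if (tid < stride) {
//         shared_sums[tid] += shared_sums[tid + stride];
//         shared_sqsums[tid] += shared_sqsums[tid + stride];
//       }
//     }
//     // One atomic add per block, since several blocks may share a sample
//     if (tid == 0) {
//       atomicAdd(&sums[sample * sums_stride], shared_sums[0]);
//       atomicAdd(&sqsums[sample * sqsums_stride], shared_sqsums[0]);
//     }
//   }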

template <typename Backend, typename DataType>
template <typename Allocator>
void LayerNormalization<Backend, DataType>::apply_normalization(
  const DCTensor<Allocator>& input,
  const DCTensor<Allocator>& statistics,
  DCTensor<Allocator>& output)
{}
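
// NOTE: apply_normalization is still a stub in this commit. As a point of
// reference only (an assumption taken from the standard layer norm
// formulation, not from this file), the normalization it would eventually
// apply elementwise within each sample is
//   y_i = (x_i - mean) / sqrt(var + epsilon)
// using the per-sample mean and variance held in the statistics tensor.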

template <typename Backend, typename DataType>
template <typename Allocator>
void LayerNormalization<Backend, DataType>::calculate_backward_stats(
  const DCTensor<Allocator>& input,
  const DCTensor<Allocator>& output_grad,
  const DCTensor<Allocator>& statistics,
  DCTensor<Allocator>& statistics_grad)
{}
template <typename Backend, typename DataType>
template <typename Allocator>
void LayerNormalization<Backend, DataType>::apply_grad(
  const DCTensor<Allocator>& input,
  const DCTensor<Allocator>& output_grad,
  const DCTensor<Allocator>& statistics,
  const DCTensor<Allocator>& statistics_grad,
  DCTensor<Allocator>& input_grad)
{}

#define ETI(T, Backend)                                                       \
  template class LayerNormalization<Backend, T>;                              \
  template void LayerNormalization<Backend, T>::calculate_forward_stats<     \
    tensor::CUDAAllocator>(                                                   \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& statistics); \
  template void                                                              \
  LayerNormalization<Backend, T>::apply_normalization<tensor::CUDAAllocator>( \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      statistics,                                                            \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& output);    \
  template void LayerNormalization<Backend, T>::calculate_backward_stats<    \
    tensor::CUDAAllocator>(                                                  \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      output_grad,                                                           \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      statistics,                                                            \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&             \
      statistics_grad);                                                      \
  template void                                                              \
  LayerNormalization<Backend, T>::apply_grad<tensor::CUDAAllocator>(         \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      output_grad,                                                           \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      statistics,                                                            \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&       \
      statistics_grad,                                                       \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input_grad);

ETI(float, BackendDNNLib)
ETI(double, BackendDNNLib)
#undef ETI
#endif // LBANN_HAS_DISTCONV