Commit dbff063

Added ETI
1 parent 0d39f26 commit dbff063
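
"ETI" here is explicit template instantiation: the .cu file instantiates LayerNormalization and its member function templates for the concrete types it supports (float and double on the DNN backend), so other translation units can link against them without seeing the template definitions. A minimal sketch of the general idiom, with hypothetical names, for readers unfamiliar with it:

  // normalizer.hpp -- declares the template; definitions live in one .cpp/.cu file
  template <typename T>
  class Normalizer {
  public:
    void scale(T factor);
  };

  // normalizer.cpp -- defines the member and explicitly instantiates it
  template <typename T>
  void Normalizer<T>::scale(T factor) { /* ... */ }

  // Explicit template instantiation: emit code for these types in this
  // translation unit, so users of normalizer.hpp only need to link against it.
  template class Normalizer<float>;
  template class Normalizer<double>;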

File tree

2 files changed: +103 -10 lines changed


include/lbann/layers/regularizers/distconv/distconv_layer_norm.hpp

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@
 
 namespace distconv {
 template <typename Backend, typename DataType>
-class LayerNorm
+class LayerNormalization
 {
   using LocaleMPI = tensor::LocaleMPI;
 

src/layers/regularizers/distconv/distconv_layer_norm.cu

Lines changed: 102 additions & 9 deletions
@@ -34,32 +34,125 @@
 template <typename Backend, typename DataType>
 template <typename Allocator>
 void LayerNormalization ::calculate_forward_stats(
-  const DCTensor<Allocator>& input)
-{}
+  const DCTensor<Allocator>& input,
+  DCTensor<Allocator>& statistics)
+{
+  if (input.get_local_size() == 0) {
+    util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
+    return; // no-op for empty inputs
+  }
+
+  const auto& input_dims = input.get_local_shape();
+  const auto& statistics_dims = statistics.get_local_shape();
+
+  const auto local_num_samples = input_dims[3];
+
+  const auto global_num_samples = statistics_dims[3];
+
+  const auto local_sample_size = std::accumulate(input_dims.begin(),
+                                                 input_dims.end() - 1,
+                                                 1,
+                                                 std::multiplies<int>());
+
+  using LocalMat = El::Matrix<DataType, El::Device::GPU>;
+  LocalMat local_input(local_sample_size,
+                       local_num_samples,
+                       input.get_buffer(),
+                       local_sample_size);
+
+  LocalMat local_statistics(2,
+                            local_num_samples,
+                            statistics.get_buffer(),
+                            2);
+
+  El::Zero(local_statistics);
+  auto local_means = El::View(local_statistics, El::IR(0), El::ALL);
+  auto local_vars = El::View(local_statistics, El::IR(1), El::ALL);
+
+  {
+    using namespace hydrogen;
+    auto multisync = El::MakeMultiSync(gpu::get_sync_info(local_statistics),
+                                       gpu::get_sync_info(local_input));
+    constexpr size_t block_size = 256;
+    dim3 block_dims, grid_dims;
+    block_dims.x = block_size;
+    grid_dims.x = (local_sample_size + block_size - 1) / block_size;
+    grid_dims.y = local_num_samples;
+    hydrogen::gpu::LaunchKernel(
+      ::lbann::layer_norm_fp_sums_kernel<block_size, DataType>,
+      grid_dims,
+      block_dims,
+      0,
+      multisync,
+      local_num_samples,
+      local_sample_size,
+      local_input.LockedBuffer(),
+      local_input.LDim(),
+      local_means.Buffer(),
+      local_means.LDim(),
+      local_vars.Buffer(),
+      local_vars.LDim());
+  }
+}
 
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization ::apply_normalization(
+void LayerNormalization::apply_normalization(
   const DCTensor<Allocator>& input,
   const DCTensor<Allocator>& statistics,
   DCTensor<Allocator>& output)
 {}
 
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization ::calculate_backward_stats(
+void LayerNormalization::calculate_backward_stats(
   const DCTensor<Allocator>& input,
   const DCTensor<Allocator>& output_grad,
   const DCTensor<Allocator>& statistics,
   DCTensor<Allocator>& statistics_grad)
 {}
 template <typename Backend, typename DataType>
 template <typename Allocator>
-void LayerNormalization ::apply_grad(const DCTensor<Allocator>& input,
-                                     const DCTensor<Allocator>& output_grad,
-                                     const DCTensor<Allocator>& statistics,
-                                     const DCTensor<Allocator>& statistics_grad,
-                                     DCTensor<Allocator>& input_grad)
+void LayerNormalization::apply_grad(const DCTensor<Allocator>& input,
+                                    const DCTensor<Allocator>& output_grad,
+                                    const DCTensor<Allocator>& statistics,
+                                    const DCTensor<Allocator>& statistics_grad,
+                                    DCTensor<Allocator>& input_grad)
 {}
 
+#define ETI(T, Backend)                                                       \
+  template class LayerNormalization<Backend, T>;                              \
+  template void LayerNormalization<Backend, T>::calculate_forward_stats<      \
+    tensor::CUDAAllocator>(                                                   \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& statistics); \
+  template void                                                               \
+  LayerNormalization<Backend, T>::apply_normalization<tensor::CUDAAllocator>( \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      statistics,                                                             \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& output);     \
+  template void LayerNormalization<Backend, T>::calculate_backward_stats<     \
+    tensor::CUDAAllocator>(                                                   \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      output_grad,                                                            \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      statistics,                                                             \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&              \
+      statistics_grad);                                                       \
+  template void                                                               \
+  LayerNormalization<Backend, T>::apply_grad<tensor::CUDAAllocator>(          \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input, \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      output_grad,                                                            \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      statistics,                                                             \
+    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&        \
+      statistics_grad,                                                        \
+    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& input_grad);
+
+ETI(float, BackendDNNLib)
+ETI(double, BackendDNNLib)
+#undef ETI
 #endif // LBANN_HAS_DISTCONV
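
The body of layer_norm_fp_sums_kernel is not part of this diff. As a rough sketch of what a per-sample "sums" kernel with this launch signature typically does (an assumption, not the LBANN implementation): each block row handles one local sample, threads accumulate a partial sum and sum of squares over that sample's entries, reduce within the block, and atomically add the block totals into the corresponding column of the statistics matrix. Turning the sums into mean and variance (dividing by the global sample size) would happen after the statistics are allreduced, which is also outside this diff.

  // Hypothetical sketch only -- kernel name and details are assumptions.
  template <size_t block_size, typename DataType>
  __global__ void layer_norm_fp_sums_sketch(size_t num_samples,
                                            size_t sample_size,
                                            const DataType* __restrict__ input,
                                            size_t input_ldim,
                                            DataType* __restrict__ sums,
                                            size_t sums_stride,
                                            DataType* __restrict__ sqsums,
                                            size_t sqsums_stride)
  {
    __shared__ DataType shared_sum[block_size];
    __shared__ DataType shared_sqsum[block_size];
    const size_t tid = threadIdx.x;
    const size_t sample = blockIdx.y;  // one block row per local sample

    // Block-strided accumulation over one sample (one column of the input)
    DataType sum = 0, sqsum = 0;
    for (size_t i = blockIdx.x * block_size + tid; i < sample_size;
         i += gridDim.x * block_size) {
      const DataType x = input[i + sample * input_ldim];
      sum += x;
      sqsum += x * x;
    }
    shared_sum[tid] = sum;
    shared_sqsum[tid] = sqsum;

    // Tree reduction within the block
    for (size_t stride = block_size / 2; stride > 0; stride /= 2) {
      __syncthreads();
      if (tid < stride) {
        shared_sum[tid] += shared_sum[tid + stride];
        shared_sqsum[tid] += shared_sqsum[tid + stride];
      }
    }

    // One atomic update per block into the per-sample statistics columns
    if (tid == 0) {
      atomicAdd(&sums[sample * sums_stride], shared_sum[0]);
      atomicAdd(&sqsums[sample * sqsums_stride], shared_sqsum[0]);
    }
  }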
