////////////////////////////////////////////////////////////////////////////////

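// Define the *_INSTANTIATE macro before including the layer headers so that
// this translation unit emits the explicit template instantiations itself
// (assuming the usual LBANN convention of guarding extern template
// declarations behind this macro).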
#define LBANN_LAYERS_MISC_CHANNELWISE_SOFTMAX_INSTANTIATE
#include "../channelwise_softmax_kernels.cuh"
#include "lbann/base.hpp"
#include "lbann/layers/misc/distconv/distconv_channelwise_softmax.hpp"
#include "lbann/utils/distconv.hpp"
#include "lbann/utils/gpu/helpers.hpp"

#ifdef LBANN_HAS_DISTCONV
namespace distconv {
template <typename Backend, typename DataType>
template <typename Allocator>
int ChannelwiseSoftmax<Backend, DataType>::forward(
  const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& input_0,
  tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output)
{
  if (input_0.get_local_size() == 0 || output.get_local_size() == 0) {
    util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
    return 1; // no-op for empty inputs
  }

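  // The local distconv tensor shape is indexed from the fastest-moving
  // dimension to the slowest, i.e. {width, height, channels, samples} for
  // 2-D data, so dims[2] is the channel count and dims[3] the local
  // mini-batch size. Each sample is flattened into one matrix column of
  // num_channels * mat_channel_size entries.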
  const auto& input_0_dims = input_0.get_local_shape();

  const auto num_channels = input_0_dims[2];
  const auto local_mini_batch_size = input_0_dims[3];
  const auto mat_channel_size = input_0_dims[0] * input_0_dims[1];
  const auto mat_stride = num_channels * mat_channel_size;

  // Convert to Hydrogen matrices for kernel launch

  using LocalMat = El::Matrix<DataType, El::Device::GPU>;

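  // Each El::Matrix below attaches to an existing GPU buffer as a
  // (num_channels * mat_channel_size) x local_mini_batch_size view with
  // leading dimension mat_stride; no data is copied.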
  LocalMat local_input(mat_stride,
                       local_mini_batch_size,
                       input_0.get_buffer(),
                       mat_stride);

  LocalMat local_output(mat_stride,
                        local_mini_batch_size,
                        output.get_buffer(),
                        mat_stride);

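  // Launch the channel-wise softmax kernel: softmax is computed
  // independently over the entries of each channel in each sample.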
  ::lbann::channelwise_softmax_fp_impl(num_channels,
                                       mat_channel_size,
                                       local_input,
                                       local_output);
  return 1;
}

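// The backward pass reconstructs the input gradient from the forward output
// y and the output gradient dy. For softmax the chain rule gives
// dx = y * (dy - dot(y, dy)) within each channel, which is why the forward
// output (rather than the input) is passed in here.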
template <typename Backend, typename DataType>
template <typename Allocator>
int ChannelwiseSoftmax<Backend, DataType>::backward(
  const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output,
  const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& output_grad,
  tensor::Tensor<DataType, tensor::LocaleMPI, Allocator>& input_grad_0)
{
  if (output.get_local_size() == 0 || output_grad.get_local_size() == 0 ||
      input_grad_0.get_local_size() == 0) {
    util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
    return 1; // no-op for empty inputs
  }

  const auto& input_0_dims = output.get_local_shape();
  const auto num_channels = input_0_dims[2];
  const auto local_mini_batch_size = input_0_dims[3];
  const auto mat_channel_size = input_0_dims[0] * input_0_dims[1];
  const auto mat_stride = num_channels * mat_channel_size;

  // Convert to Hydrogen matrices for kernel launch

  using LocalMat = El::Matrix<DataType, El::Device::GPU>;

  LocalMat local_output(mat_stride,
                        local_mini_batch_size,
                        output.get_buffer(),
                        mat_stride);

  LocalMat local_output_grad(mat_stride,
                             local_mini_batch_size,
                             output_grad.get_buffer(),
                             mat_stride);

  LocalMat local_input_grad(mat_stride,
                            local_mini_batch_size,
                            input_grad_0.get_buffer(),
                            mat_stride);

  ::lbann::channelwise_softmax_bp_impl(num_channels,
                                       mat_channel_size,
                                       local_output,
                                       local_output_grad,
                                       local_input_grad);
  return 1;
}

// =========================================================
// Explicit template instantiation
// =========================================================

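// ETI(T, Backend) explicitly instantiates the class template and its
// forward/backward member templates for MPI-local tensors backed by the
// CUDA allocator.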
#define ETI(T, Backend)                                                        \
  template class ChannelwiseSoftmax<Backend, T>;                               \
  template int ChannelwiseSoftmax<Backend, T>::forward<tensor::CUDAAllocator>( \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&         \
      input_0,                                                                 \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& output_0);    \
  template int                                                                 \
  ChannelwiseSoftmax<Backend, T>::backward<tensor::CUDAAllocator>(             \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&         \
      input_0,                                                                 \
    const tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>&         \
      input_1,                                                                 \
    tensor::Tensor<T, tensor::LocaleMPI, tensor::CUDAAllocator>& output_grad);

/// @todo fp16
ETI(float, BackendDNNLib)
#ifdef LBANN_HAS_DOUBLE
ETI(double, BackendDNNLib)
#endif // LBANN_HAS_DOUBLE

#undef ETI
} // namespace distconv
#endif // LBANN_HAS_DISTCONV