Skip to content

Commit 64f921a

Browse files
committed
Strange behavior on CI: every few gradient checks fail...
1 parent dd55b40 commit 64f921a

File tree

4 files changed

+11
-11
lines changed

4 files changed

+11
-11
lines changed

ReleaseNotes.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Support for new network structures:
2323
- RoBERTa with pretrained weights
2424

2525
Support for new layers:
26-
- - Added distributed tensor parallelism with channelwise decomposition for channelwise softmax layer
26+
- Added distributed tensor parallelism with channelwise decomposition for channelwise softmax layer
2727
- Added support for 2D Matrices for Scatter and Gather layers
2828
- Added image rotation layer and composite image transformation layer
2929
(rotate, shear, translate)

ci_test/unit_tests/test_unit_layer_channelwise_softmax_distconv.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
# Data
2121
np.random.seed(20200115)
2222
_num_samples = 15
23-
_sample_dims = (15,36,1)
23+
_sample_dims = (15,5,1)
2424
_sample_size = functools.reduce(operator.mul, _sample_dims)
2525
_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32)
2626

@@ -103,11 +103,12 @@ def construct_model(lbann):
103103
x = x_lbann
104104

105105
y = lbann.ChannelwiseSoftmax(x,
106+
data_layout='data_parallel',
106107
parallel_strategy=create_parallel_strategy(num_channel_groups),
107108
name="Channelwise_softmax_distconv")
108109
z = lbann.L2Norm2(y)
109110
obj.append(z)
110-
metrics.append(lbann.Metric(z, name='data-parallel layout'))
111+
metrics.append(lbann.Metric(z, name='channelwise split distconv'))
111112

112113
# NumPy implementation
113114
vals = []

include/lbann/layers/misc/channelwise_softmax.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,7 @@ void channelwise_softmax_layer<TensorDataType,Layout,Device>::setup_dims(DataRea
184184
output_dims.size(),"-D output tensor");
185185
}
186186
}
187-
188-
#endif
187+
#endif // LBANN_HAS_DISTCONV
189188
}
190189

191190
#ifdef LBANN_HAS_DISTCONV

src/layers/misc/distconv/distconv_channelwise_softmax.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,17 @@ namespace distconv{
7878
template<typename Allocator>
7979
int
8080
ChannelwiseSoftmax<Backend, DataType>
81-
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_0,
81+
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output,
8282
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
8383
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad_0){
84-
if (input_0.get_local_size() == 0 ||
84+
if (output.get_local_size() == 0 ||
8585
output_grad.get_local_size() == 0 ||
8686
input_grad_0.get_local_size() == 0){
8787
util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
8888
return 1; // no op for empty inputs
8989
}
9090

91-
const auto& input_0_dims = input_0.get_local_shape();
91+
const auto& input_0_dims = output.get_local_shape();
9292
const auto num_channels = input_0_dims[2];
9393
const auto local_mini_batch_size = input_0_dims[3];
9494
const auto mat_channel_size = input_0_dims[0] * input_0_dims[1];
@@ -98,9 +98,9 @@ namespace distconv{
9898

9999
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
100100

101-
LocalMat local_input(mat_stride,
101+
LocalMat local_output(mat_stride,
102102
local_mini_batch_size,
103-
input_0.get_buffer(),
103+
output.get_buffer(),
104104
mat_stride);
105105

106106
LocalMat local_output_grad(mat_stride,
@@ -115,7 +115,7 @@ namespace distconv{
115115

116116
::lbann::channelwise_softmax_bp_impl(num_channels,
117117
mat_channel_size,
118-
local_input,
118+
local_output,
119119
local_output_grad,
120120
local_input_grad);
121121
return 1;

0 commit comments

Comments (0)