Skip to content

Commit 64f921a

Browse files
committed
Strange behavior on CI: every few gradient checks fail...
1 parent dd55b40 commit 64f921a

File tree

4 files changed

+11
-11
lines changed

4 files changed

+11
-11
lines changed

ReleaseNotes.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Support for new network structures:
2323
- RoBERTa with pretrained weights
2424

2525
Support for new layers:
26-
- - Added distributed tensor parallelism with channelwise decomposition for channelwise softmax layer
26+
- Added distributed tensor parallelism with channelwise decomposition for channelwise softmax layer
2727
- Added support for 2D Matrices for Scatter and Gather layers
2828
- Added image rotation layer and composite image transformation layer
2929
(rotate, shear, translate)

ci_test/unit_tests/test_unit_layer_channelwise_softmax_distconv.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
# Data
2121
np.random.seed(20200115)
2222
_num_samples = 15
23-
_sample_dims = (15,36,1)
23+
_sample_dims = (15,5,1)
2424
_sample_size = functools.reduce(operator.mul, _sample_dims)
2525
_samples = np.random.normal(loc=0.5, size=(_num_samples,_sample_size)).astype(np.float32)
2626

@@ -103,11 +103,12 @@ def construct_model(lbann):
103103
x = x_lbann
104104

105105
y = lbann.ChannelwiseSoftmax(x,
106+
data_layout='data_parallel',
106107
parallel_strategy=create_parallel_strategy(num_channel_groups),
107108
name="Channelwise_softmax_distconv")
108109
z = lbann.L2Norm2(y)
109110
obj.append(z)
110-
metrics.append(lbann.Metric(z, name='data-parallel layout'))
111+
metrics.append(lbann.Metric(z, name='channelwise split distconv'))
111112

112113
# NumPy implementation
113114
vals = []

include/lbann/layers/misc/channelwise_softmax.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,7 @@ void channelwise_softmax_layer<TensorDataType,Layout,Device>::setup_dims(DataRea
184184
output_dims.size(),"-D output tensor");
185185
}
186186
}
187-
188-
#endif
187+
#endif // LBANN_HAS_DISTCONV
189188
}
190189

191190
#ifdef LBANN_HAS_DISTCONV

src/layers/misc/distconv/distconv_channelwise_softmax.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,17 +78,17 @@ namespace distconv{
7878
template<typename Allocator>
7979
int
8080
ChannelwiseSoftmax<Backend, DataType>
81-
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_0,
81+
::backward(const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output,
8282
const tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &output_grad,
8383
tensor::Tensor<DataType, tensor::LocaleMPI, Allocator> &input_grad_0){
84-
if (input_0.get_local_size() == 0 ||
84+
if (output.get_local_size() == 0 ||
8585
output_grad.get_local_size() == 0 ||
8686
input_grad_0.get_local_size() == 0){
8787
util::MPIRootPrintStreamInfo() << "WARNING: EMPTY INPUT FOUND \n";
8888
return 1; // no op for empty inputs
8989
}
9090

91-
const auto& input_0_dims = input_0.get_local_shape();
91+
const auto& input_0_dims = output.get_local_shape();
9292
const auto num_channels = input_0_dims[2];
9393
const auto local_mini_batch_size = input_0_dims[3];
9494
const auto mat_channel_size = input_0_dims[0] * input_0_dims[1];
@@ -98,9 +98,9 @@ namespace distconv{
9898

9999
using LocalMat = El::Matrix<DataType, El::Device::GPU>;
100100

101-
LocalMat local_input(mat_stride,
101+
LocalMat local_output(mat_stride,
102102
local_mini_batch_size,
103-
input_0.get_buffer(),
103+
output.get_buffer(),
104104
mat_stride);
105105

106106
LocalMat local_output_grad(mat_stride,
@@ -115,7 +115,7 @@ namespace distconv{
115115

116116
::lbann::channelwise_softmax_bp_impl(num_channels,
117117
mat_channel_size,
118-
local_input,
118+
local_output,
119119
local_output_grad,
120120
local_input_grad);
121121
return 1;

0 commit comments

Comments (0)