Skip to content

Commit 2d58a58

Browse files
committed
Updated implementation to incorporate updated channelwise softmax API
- Currently building and linking
1 parent 83095e6 commit 2d58a58

File tree

3 files changed

+122
-116
lines changed

3 files changed

+122
-116
lines changed

include/lbann/layers/misc/channelwise_softmax.hpp

Lines changed: 98 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -37,29 +37,37 @@
3737
#include "lbann/layers/misc/distconv/distconv_channelwise_softmax.hpp"
3838
#endif
3939

40-
4140
namespace lbann {
4241

4342
#ifdef LBANN_HAS_DISTCONV
43+
namespace dc {
44+
template <typename TensorDataType>
45+
using ChannelwiseSoftmax =
46+
::distconv::ChannelwiseSoftmax<Backend, TensorDataType>;
47+
} // namespace dc
48+
4449
template <typename TensorDataType, data_layout Layout, El::Device Device>
4550
class channelwise_softmax_distconv_adapter
46-
: public data_type_distconv_adapter<TensorDataType>{
47-
public:
48-
using TensorDevType = typename data_type_distconv_adapter<TensorDataType>::TensorDevType;
49-
50-
channelwise_softmax_distconv_adapter(Layer& layer)
51-
: data_type_distconv_adapter<TensorDataType>(layer){}
52-
53-
virtual ~channelwise_softmax_distconv_adapter() = default;
54-
void setup_distributions(tensor_overlap_constraints &constraints) override;
55-
void setup_layer(size_t workspace_capacity) override;
56-
void fp_compute();
57-
void bp_compute();
58-
std::unique_ptr<dc::ChannelwiseSoftmax<TensorDataType>> m_channelwise_softmax_operator;
59-
}; // class definition channelwise_softmax_distconv_adapter
60-
61-
#endif // LBANN_HAS_DISTCONV
51+
: public data_type_distconv_adapter<TensorDataType>
52+
{
53+
public:
54+
using TensorDevType =
55+
typename data_type_distconv_adapter<TensorDataType>::TensorDevType;
56+
57+
channelwise_softmax_distconv_adapter(Layer& layer)
58+
: data_type_distconv_adapter<TensorDataType>(layer)
59+
{}
60+
61+
virtual ~channelwise_softmax_distconv_adapter() = default;
62+
void setup_distributions(tensor_overlap_constraints& constraints) override;
63+
void setup_layer(size_t workspace_capacity) override;
64+
void fp_compute();
65+
void bp_compute();
66+
std::unique_ptr<dc::ChannelwiseSoftmax<TensorDataType>>
67+
m_channelwise_softmax_operator;
68+
}; // class definition channelwise_softmax_distconv_adapter
6269

70+
#endif // LBANN_HAS_DISTCONV
6371

6472
/** @brief Apply softmax to tensor channels.
6573
*
@@ -121,14 +129,29 @@ class channelwise_softmax_layer : public data_type_layer<TensorDataType>
121129
void bp_compute() override;
122130

123131
#ifdef LBANN_HAS_DISTCONV
124-
friend class channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>;
125-
protected:
126-
void setup_distconv_adapter(const DataReaderMetaData& dr_metadata) override;
127-
bool is_distconv_supported() const override;
128-
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>& get_distconv_adapter() override;
129-
const channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>& get_distconv_adapter() const override;
132+
friend class channelwise_softmax_distconv_adapter<TensorDataType,
133+
Layout,
134+
Device>;
135+
136+
protected:
137+
void setup_distconv_adapter() override;
138+
bool is_distconv_supported() const override;
139+
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&
140+
get_distconv_adapter() override;
141+
const channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&
142+
get_distconv_adapter() const override;
130143
#endif // LBANN_HAS_DISTCONV
144+
private:
145+
void get_channel_size_and_stride(El::Int& channel_size,
146+
El::Int& channel_stride,
147+
El::Int& num_channels) const;
148+
149+
/** Specifies the dimension of the tensor to perform softmax on. */
150+
int64_t m_dim;
131151

152+
/** @brief If true, only performs softmax on the chosen dimension. Otherwise
153+
all dimensions but ``m_dim`` will be used. */
154+
bool m_single_dim_mode;
132155
};
133156

134157
// Builder function
@@ -184,127 +207,114 @@ El::Device channelwise_softmax_layer<TensorDataType, Layout, Device>::
184207
return Device;
185208
}
186209

187-
template <typename TensorDataType, data_layout Layout, El::Device Device>
188-
void channelwise_softmax_layer<TensorDataType,Layout,Device>::setup_dims(DataReaderMetaData& dr_metadata) {
189-
data_type_layer<TensorDataType>::setup_dims(dr_metadata);
190-
this->set_output_dims(this->get_input_dims());
191-
192-
#ifdef LBANN_HAS_DISTCONV
193-
194-
if (this->distconv_enabled()){
195-
// Additional checks when distconv mode is enabled
196-
const auto& input_dims = this->get_input_dims();
197-
const auto& output_dims = this->get_output_dims();
198-
199-
if (input_dims.size() != 3 || output_dims.size() != 3){
200-
LBANN_ERROR(this->get_type()," layer \"",this->get_name(),"\" ",
201-
"expects an input and output tensor with 3 dimensions (channel, *, *), "
202-
"but it has been configured as a ",
203-
input_dims.size(), "-D input tensor and ",
204-
output_dims.size(),"-D output tensor");
205-
}
206-
}
207-
#endif // LBANN_HAS_DISTCONV
208-
}
209-
210210
#ifdef LBANN_HAS_DISTCONV
211211

212212
// =========================================================
213213
// DistConv-Adapter member functions
214214
// =========================================================
215215
template <typename TensorDataType, data_layout Layout, El::Device Device>
216-
void
217-
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>
218-
::setup_distributions(tensor_overlap_constraints &constraints){
216+
void channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>::
217+
setup_distributions(tensor_overlap_constraints& constraints)
218+
{
219219
data_type_distconv_adapter<TensorDataType>::setup_distributions(constraints);
220220

221-
for (auto &d: this->m_prev_activations_dists) {
221+
for (auto& d : this->m_prev_activations_dists) {
222222
d.clear_overlap();
223223
constraints.mark_updated(d);
224224
constraints.mark_invariant(d);
225225
}
226-
for (auto &d: this->m_activations_dists) {
226+
for (auto& d : this->m_activations_dists) {
227227
d.clear_overlap();
228228
constraints.mark_updated(d);
229229
constraints.mark_invariant(d);
230230
}
231-
for (auto &d: this->m_prev_error_signals_dists) {
231+
for (auto& d : this->m_prev_error_signals_dists) {
232232
d.clear_overlap();
233233
constraints.mark_updated(d);
234234
constraints.mark_invariant(d);
235235
}
236-
for (auto &d: this->m_error_signals_dists) {
236+
for (auto& d : this->m_error_signals_dists) {
237237
d.clear_overlap();
238238
constraints.mark_updated(d);
239239
constraints.mark_invariant(d);
240240
}
241241
}
242242

243243
template <typename TensorDataType, data_layout Layout, El::Device Device>
244-
void
245-
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>
246-
::setup_layer(size_t workspace_capacity){
244+
void channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>::
245+
setup_layer(size_t workspace_capacity)
246+
{
247247
data_type_distconv_adapter<TensorDataType>::setup_layer(workspace_capacity);
248248

249-
m_channelwise_softmax_operator = std::make_unique<dc::ChannelwiseSoftmax<TensorDataType>>(dc::get_backend());
249+
m_channelwise_softmax_operator =
250+
std::make_unique<dc::ChannelwiseSoftmax<TensorDataType>>(dc::get_backend());
250251
}
251252

252253
template <typename TensorDataType, data_layout Layout, El::Device Device>
253-
void
254-
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>
255-
::fp_compute(){
256-
auto &layer = dynamic_cast<
257-
channelwise_softmax_layer<TensorDataType, Layout, Device>&>(this->layer());
254+
void channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>::
255+
fp_compute()
256+
{
257+
auto& layer =
258+
dynamic_cast<channelwise_softmax_layer<TensorDataType, Layout, Device>&>(
259+
this->layer());
258260
m_channelwise_softmax_operator->forward(this->get_prev_activations(0),
259261
this->get_activations(0));
260262
}
261263

262264
template <typename TensorDataType, data_layout Layout, El::Device Device>
263-
void
264-
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>
265-
::bp_compute(){
266-
auto &layer = dynamic_cast<
267-
channelwise_softmax_layer<TensorDataType, Layout, Device>&>(this->layer());
268-
m_channelwise_softmax_operator->backward(this->get_activations(0),
269-
this->get_prev_error_signals(),
270-
this->get_error_signals(0));
265+
void channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>::
266+
bp_compute()
267+
{
268+
auto& layer =
269+
dynamic_cast<channelwise_softmax_layer<TensorDataType, Layout, Device>&>(
270+
this->layer());
271+
m_channelwise_softmax_operator->backward(this->get_activations(0),
272+
this->get_prev_error_signals(),
273+
this->get_error_signals(0));
271274
}
272275
// =============================================================
273276
// DistConv-enabled Channelwise-Softmax member functions
274277
// =============================================================
275278

276279
template <typename TensorDataType, data_layout Layout, El::Device Device>
277-
bool
278-
channelwise_softmax_layer<TensorDataType, Layout, Device>
279-
::is_distconv_supported() const {
280-
return Device==El::Device::GPU && Layout == data_layout::DATA_PARALLEL;
280+
bool channelwise_softmax_layer<TensorDataType, Layout, Device>::
281+
is_distconv_supported() const
282+
{
283+
return Device == El::Device::GPU && Layout == data_layout::DATA_PARALLEL;
281284
}
282285

283286
template <typename TensorDataType, data_layout Layout, El::Device Device>
284-
void
285-
channelwise_softmax_layer<TensorDataType, Layout, Device>
286-
::setup_distconv_adapter(const DataReaderMetaData& dr_metadata){
287-
this->get_distconv_adapter_ptr() = std::make_unique<channelwise_softmax_distconv_adapter<
288-
TensorDataType, Layout, Device>>(*this);
287+
void channelwise_softmax_layer<TensorDataType, Layout, Device>::
288+
setup_distconv_adapter()
289+
{
290+
this->get_distconv_adapter_ptr() = std::make_unique<
291+
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>>(
292+
*this);
289293
}
290294

291295
template <typename TensorDataType, data_layout Layout, El::Device Device>
292296
const channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&
293-
channelwise_softmax_layer<TensorDataType, Layout, Device>
294-
::get_distconv_adapter() const{
295-
return dynamic_cast<const channelwise_softmax_distconv_adapter<
296-
TensorDataType, Layout, Device>&>(data_type_layer<TensorDataType>::get_distconv_adapter());
297+
channelwise_softmax_layer<TensorDataType, Layout, Device>::
298+
get_distconv_adapter() const
299+
{
300+
return dynamic_cast<const channelwise_softmax_distconv_adapter<TensorDataType,
301+
Layout,
302+
Device>&>(
303+
data_type_layer<TensorDataType>::get_distconv_adapter());
297304
}
298305

299306
template <typename TensorDataType, data_layout Layout, El::Device Device>
300307
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&
301-
channelwise_softmax_layer<TensorDataType, Layout, Device>
302-
::get_distconv_adapter(){
303-
return const_cast<channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&>(
304-
static_cast<const channelwise_softmax_layer<TensorDataType, Layout, Device>&>(*this).get_distconv_adapter());
308+
channelwise_softmax_layer<TensorDataType, Layout, Device>::
309+
get_distconv_adapter()
310+
{
311+
return const_cast<
312+
channelwise_softmax_distconv_adapter<TensorDataType, Layout, Device>&>(
313+
static_cast<
314+
const channelwise_softmax_layer<TensorDataType, Layout, Device>&>(*this)
315+
.get_distconv_adapter());
305316
}
306317

307-
308318
#endif // LBANN_HAS_DISTCONV
309319

310320
#ifndef LBANN_CHANNELWISE_SOFTMAX_LAYER_INSTANTIATE

include/lbann/layers/misc/channelwise_softmax_impl.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,28 @@ void channelwise_softmax_layer<TensorDataType, Layout, Device>::setup_dims()
5353
}
5454

5555
this->set_output_dims(this->get_input_dims());
56+
#ifdef LBANN_HAS_DISTCONV
57+
58+
if (this->distconv_enabled()) {
59+
// Additional checks when distconv mode is enabled
60+
const auto& input_dims = this->get_input_dims();
61+
const auto& output_dims = this->get_output_dims();
62+
63+
if (input_dims.size() != 3 || output_dims.size() != 3) {
64+
LBANN_ERROR(
65+
this->get_type(),
66+
" layer \"",
67+
this->get_name(),
68+
"\" ",
69+
"expects an input and output tensor with 3 dimensions (channel, *, *), "
70+
"but it has been configured as a ",
71+
input_dims.size(),
72+
"-D input tensor and ",
73+
output_dims.size(),
74+
"-D output tensor");
75+
}
76+
}
77+
#endif // LBANN_HAS_DISTCONV
5678
}
5779

5880
template <typename TensorDataType, data_layout Layout, El::Device Device>

include/lbann/utils/distconv.hpp

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,6 @@
5353
#include "p2p/p2p.hpp"
5454
#endif // DISTCONV_HAS_P2P
5555

56-
#include "lbann/layers/learning/distconv/distconv_layers.hpp"
57-
#include "lbann/layers/math/distconv/distconv_matmul.hpp"
58-
59-
#ifdef LBANN_HAS_NVSHMEM
60-
#include "lbann/layers/transform/distconv/distconv_scatter.hpp"
61-
#include "lbann/layers/transform/distconv/distconv_gather.hpp"
62-
#include "lbann/layers/transform/distconv/distconv_nvshmem_vector_addressing.hpp"
63-
#endif // LBANN_HAS_NVSHMEM
64-
65-
#include "lbann/layers/misc/distconv/distconv_channelwise_softmax.hpp"
66-
6756
namespace lbann {
6857

6958
inline auto default_hydrogen_stream()
@@ -137,23 +126,8 @@ using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning;
137126

138127
// Distconv layer classes
139128
using Backend = ::distconv::BackendDNNLib;
140-
using ReLU = ::distconv::ReLU<Backend>;
141-
using LeakyReLU = ::distconv::LeakyReLU<Backend>;
142-
template <typename TensorDataType>
143-
using Convolution = ::distconv::Convolution<Backend, TensorDataType>;
144-
template <typename TensorDataType>
145-
using ChannelwiseFullyConnected = ::distconv::ChannelwiseFullyConnected<Backend, TensorDataType>;
146-
template <typename TensorDataType>
147-
using Pooling = ::distconv::Pooling<Backend, TensorDataType>;
148-
template <typename TensorDataType>
149-
using BatchNormalization = ::distconv::BatchNormalization<Backend, TensorDataType>;
150-
template <typename TensorDataType>
151-
using MatMul = ::distconv::MatMul<Backend, TensorDataType>;
152-
template <typename TensorDataType>
153-
using ChannelwiseSoftmax = ::distconv::ChannelwiseSoftmax<Backend, TensorDataType>;
154-
using Softmax = ::distconv::Softmax<Backend>;
155-
using CrossEntropy = ::distconv::CrossEntropy<Backend>;
156-
using MeanSquaredError = ::distconv::MeanSquaredError<Backend>;
129+
using AlCommType = typename decltype(std::declval<Backend>()
130+
.get_al_mpi_cuda_comm())::element_type;
157131

158132
using ::distconv::get_channel_dim;
159133
using ::distconv::get_sample_dim;

0 commit comments

Comments (0)