
Commit 02b7f79

Gradient Norm Clipping (#2344)
* Implement gradient norm clipping callback
* Default to loss/metric when not using gradient checking

Co-authored-by: Pier Fiedorowicz <[email protected]>
Co-authored-by: Pier Fiedorowicz <[email protected]>
1 parent e692ec3 commit 02b7f79

File tree

12 files changed (+364, -6 lines)

applications/nlp/transformer/pretrain_gpt.py

Lines changed: 1 addition & 1 deletion
@@ -116,7 +116,7 @@ def main():
         beta1=0.9,
         beta2=0.95,
         eps=1e-8,
-        clip_gradient=0.0,
+        clip_gradient=1.0,
         lr_decay='cosine',
         lr_decay_steps=int((260 * 1e9) // tokens_per_step),
         end_learning_rate=chosen_config.lr / 10,

applications/nlp/transformer/trainer.py

Lines changed: 0 additions & 1 deletion
@@ -148,7 +148,6 @@ def make_batch_script(model: lbann.Model,
         ))

     if clip_gradient > 0:
-        raise NotImplementedError('Gradient norm clipping not yet implemented')
         model.callbacks.append(
             lbann.CallbackClipGradientNorm(global_norm=True,
                                            value=clip_gradient))
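With the NotImplementedError removed, any clip_gradient > 0 now attaches the new callback, and pretrain_gpt.py enables it by default with a maximum global norm of 1.0. As a minimal sketch of the technique (plain NumPy, not LBANN's implementation), global-norm clipping rescales every gradient by min(1, clip / total_norm):

# Minimal NumPy sketch of global gradient norm clipping, the mode the trainer
# enables above (CallbackClipGradientNorm with global_norm=True).
# Illustrative only; this is not LBANN's implementation.
import numpy as np

def clip_global_grad_norm(grads, clip=1.0):
    """Scale all gradients together so their combined L2 norm is at most `clip`."""
    total_norm = np.sqrt(sum(np.square(g).sum() for g in grads))
    if total_norm > clip:
        grads = [g * (clip / total_norm) for g in grads]
    return grads

# Example: two gradient tensors whose combined norm exceeds 1.0 get scaled down.
grads = [np.full((4, 4), 0.5), np.full((8,), -0.25)]
clipped = clip_global_grad_norm(grads, clip=1.0)
print(np.sqrt(sum(np.square(g).sum() for g in clipped)))  # ~1.0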

ci_test/common_python/test_util.py

Lines changed: 3 additions & 3 deletions
@@ -80,13 +80,13 @@ def wrapped(*args, **kwargs):
                 error_on_failure=True,
                 execution_modes='train' if train else 'test'))

-        obj_func = None
+        check_grad_obj_func = None
         if check_gradients:
             if tester.check_gradients_tensor is None:
                 raise ValueError(
                     'LBANN test did not set a tensor for checking gradients, '
                     'use ``ModelTester.set_check_gradients_tensor``.')
-            obj_func = tester.check_gradients_tensor
+            check_grad_obj_func = tester.check_gradients_tensor
             callbacks.append(
                 lbann.CallbackCheckGradients(error_on_failure=True))
         callbacks.extend(tester.extra_callbacks)
@@ -95,7 +95,7 @@ def wrapped(*args, **kwargs):
         metrics.extend(tester.extra_metrics)
         model = lbann.Model(epochs=1 if train else 0,
                             layers=full_graph,
-                            objective_function=obj_func,
+                            objective_function=check_grad_obj_func if check_gradients else tester.loss,
                             metrics=metrics,
                             callbacks=callbacks)


New file (unit test for gradient norm clipping; path not shown)

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+import lbann
+import numpy as np
+import test_util
+from glob import glob
+import functools
+import os
+
+
+def check_gradients(global_norm=True, clip=1.0):
+
+    def decorator(f):
+
+        @functools.wraps(f)
+        def wrapper(*args, **kwargs):
+            # Clear any gradient outputs from previous runs.
+            grad_files = glob(
+                os.path.join(test_util._get_work_dir(__file__),
+                             'gradients*.txt'))
+            for gf in grad_files:
+                os.remove(gf)
+
+            # Run the model.
+            f(*args, **kwargs)
+
+            eps = np.finfo(np.float32).eps
+            grad_files = glob(
+                os.path.join(test_util._get_work_dir(__file__),
+                             'gradients*.txt'))
+
+            # Compute the weight gradient norms, check they are less than
+            # "clip", and update global gradient norm.
+            norm = 0
+            for gf in grad_files:
+                weight_norm = np.square(np.loadtxt(gf)).sum()
+                assert np.sqrt(weight_norm) <= clip + 8 * eps
+                norm += weight_norm
+
+            # Check the global gradient norm is less than "clip" if requested.
+            if global_norm:
+                assert np.sqrt(norm) <= clip + 8 * eps
+
+        return wrapper
+
+    return decorator
+
+
+def setup_tester(scale, global_norm, clip):
+    np.random.seed(20231018)
+    x = np.random.normal(scale=scale, size=[8, 16]).astype(np.float32)
+    ref = np.zeros_like(x)
+
+    tester = test_util.ModelTester()
+    x = tester.inputs(x)
+    ref = tester.make_reference(ref)
+
+    y = lbann.FullyConnected(x, num_neurons=16, has_bias=True)
+
+    z = lbann.FullyConnected(y, num_neurons=16, has_bias=True)
+
+    tester.set_loss(lbann.MeanSquaredError(z, ref), tolerance=10 * scale**2)
+    tester.extra_callbacks = [
+        lbann.CallbackClipGradientNorm(global_norm=global_norm, value=clip),
+        lbann.CallbackDumpGradients(basename='gradients')
+    ]
+    return tester
+
+
+# Case where no clipping is needed.
+@check_gradients(global_norm=True)
+@test_util.lbann_test(train=True)
+def test_gradient_no_clipping():
+    return setup_tester(scale=0.1, global_norm=True, clip=1.0)
+
+
+# Case with global clipping.
+@check_gradients(global_norm=True)
+@test_util.lbann_test(train=True)
+def test_gradient_clipping():
+    return setup_tester(scale=1, global_norm=True, clip=1.0)
+
+
+# Case with global clipping and another clip value.
+@check_gradients(global_norm=True, clip=0.3)
+@test_util.lbann_test(train=True)
+def test_gradient_clipping_diffclip():
+    return setup_tester(scale=1, global_norm=True, clip=0.3)
+
+
+# Case with per-weight clipping only.
+@check_gradients(global_norm=False)
+@test_util.lbann_test(train=True)
+def test_gradient_clipping_local():
+    return setup_tester(scale=10, global_norm=False, clip=1.0)
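These tests validate clipping by dumping each weight's gradient with CallbackDumpGradients and re-checking the norms inside the check_gradients decorator. The standalone NumPy sketch below mirrors the per-weight mode (global_norm=False) exercised by test_gradient_clipping_local and applies the same bound the decorator asserts, norm <= clip plus a small 8*eps float32 tolerance; it is illustrative and not part of the test suite.

# Hedged, standalone sketch of the property the decorator above checks for the
# per-weight mode (global_norm=False): after clipping, every gradient tensor's
# L2 norm is at most `clip`, within an 8*eps float32 tolerance.
import numpy as np

def clip_per_weight(grads, clip=1.0):
    """Clip each gradient tensor's L2 norm independently."""
    out = []
    for g in grads:
        norm = np.sqrt(np.square(g).sum())
        out.append(g * (clip / norm) if norm > clip else g)
    return out

rng = np.random.default_rng(20231018)
grads = [rng.normal(scale=10, size=(16, 16)).astype(np.float32) for _ in range(4)]
eps = np.finfo(np.float32).eps
for g in clip_per_weight(grads, clip=1.0):
    assert np.sqrt(np.square(g).sum()) <= 1.0 + 8 * eps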

include/lbann/callbacks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ set_full_path(THIS_DIR_HEADERS
   dump_weights.hpp
   early_stopping.hpp
   gpu_memory_usage.hpp
+  gradient_clipping.hpp
   hang.hpp
   learning_rate.hpp
   ltfb.hpp
include/lbann/callbacks/gradient_clipping.hpp (new file)

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
+////////////////////////////////////////////////////////////////////////////////
+// Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory.
+// Written by the LBANN Research Team (B. Van Essen, et al.) listed in
+// the CONTRIBUTORS file. <[email protected]>
+//
+// LLNL-CODE-697807.
+// All rights reserved.
+//
+// This file is part of LBANN: Livermore Big Artificial Neural Network
+// Toolkit. For details, see http://software.llnl.gov/LBANN or
+// https://github.com/LLNL/LBANN.
+//
+// Licensed under the Apache License, Version 2.0 (the "Licensee"); you
+// may not use this file except in compliance with the License. You may
+// obtain a copy of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the license.
+//
+// gradient_clipping .hpp .cpp - Callbacks to clip gradient values in training
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef LBANN_CALLBACKS_CALLBACK_GRADIENT_CLIPPING_HPP_INCLUDED
+#define LBANN_CALLBACKS_CALLBACK_GRADIENT_CLIPPING_HPP_INCLUDED
+
+#include <unordered_set>
+#include <utility>
+
+#include "lbann/callbacks/callback.hpp"
+
+namespace lbann {
+namespace callback {
+
+/**
+ * @brief Clip gradients whose norm is larger than a user-defined value by
+ * dividing them.
+ */
+class clip_gradient_norm : public callback_base
+{
+public:
+  using callback_base::on_backward_prop_end;
+
+  /**
+   * @param weights Parameters whose gradient to clip, or empty for all
+   * @param global_norm Whether to clip according to the norm of all parameters
+   * or each one separately
+   * @param value Value to clip to
+   */
+  clip_gradient_norm(std::vector<std::string> weights,
+                     bool global_norm = false,
+                     float value = 1.0f)
+    : callback_base(1),
+      m_weight_names(std::move(weights)),
+      m_global_norm(global_norm),
+      m_value(value)
+  {}
+  clip_gradient_norm(const clip_gradient_norm&) = default;
+  clip_gradient_norm& operator=(const clip_gradient_norm&) = default;
+  void setup(model* m) override;
+  clip_gradient_norm* copy() const override
+  {
+    return new clip_gradient_norm(*this);
+  }
+  void on_backward_prop_end(model* m) override;
+  std::string name() const override { return "clip gradient norm"; }
+
+  /** @name Serialization */
+  ///@{
+
+  /** @brief Store state to archive for checkpoint and restart */
+  template <class Archive>
+  void serialize(Archive& ar);
+
+  ///@}
+
+private:
+  /** Add callback specific data to prototext */
+  void write_specific_proto(lbann_data::Callback& proto) const final;
+
+  friend class cereal::access;
+  clip_gradient_norm();
+
+  /** @brief Parameter names whose gradients to clip. */
+  std::vector<std::string> m_weight_names;
+
+  /** @brief Whether to clip according to the norm of all parameters. */
+  bool m_global_norm;
+
+  /** @brief Value to clip to. */
+  float m_value;
+
+  /** Weights to update. */
+  std::unordered_set<weights*> m_weights;
+};
+
+// Builder function
+std::unique_ptr<callback_base> build_clip_gradient_norm_callback_from_pbuf(
+  const google::protobuf::Message&,
+  std::shared_ptr<lbann_summary> const&);
+
+} // namespace callback
+} // namespace lbann
+
+#endif // LBANN_CALLBACKS_CALLBACK_GRADIENT_CLIPPING_HPP_INCLUDED
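The companion gradient_clipping.cpp (not shown here) implements setup() and on_backward_prop_end() for this class. As a rough, hedged illustration of the behavior the header documents, the NumPy sketch below applies the same semantics to a dict of gradient arrays: an empty weights list selects every parameter, and clipping is done either against the combined norm of the selection or per tensor. The function and argument names are illustrative, not LBANN's API.

# Hedged, NumPy-only model of the documented behavior; the real logic lives
# in gradient_clipping.cpp.
import numpy as np

def clip_gradient_norm(grads, weight_names=(), global_norm=False, value=1.0):
    """grads: dict mapping weight name -> gradient array; returns clipped copies."""
    selected = list(weight_names) or list(grads)  # empty selection means "all weights"
    if global_norm:
        total = np.sqrt(sum(np.square(grads[n]).sum() for n in selected))
        scale = value / total if total > value else 1.0
        return {n: g * scale if n in selected else g for n, g in grads.items()}
    out = {}
    for n, g in grads.items():
        norm = np.sqrt(np.square(g).sum())
        out[n] = g * (value / norm) if (n in selected and norm > value) else g
    return out

# Example: clip only 'fc1' against its own norm.
grads = {'fc1': np.ones((4, 4)), 'fc2': np.full((8,), 2.0)}
clipped = clip_gradient_norm(grads, weight_names=['fc1'], global_norm=False, value=1.0)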

include/lbann/lbann.hpp

Lines changed: 1 addition & 0 deletions
@@ -176,6 +176,7 @@
 #include "lbann/callbacks/dump_weights.hpp"
 #include "lbann/callbacks/early_stopping.hpp"
 #include "lbann/callbacks/gpu_memory_usage.hpp"
+#include "lbann/callbacks/gradient_clipping.hpp"
 #include "lbann/callbacks/hang.hpp"
 #include "lbann/callbacks/learning_rate.hpp"
 #include "lbann/callbacks/load_model.hpp"

src/base.cpp

Lines changed: 1 addition & 0 deletions
@@ -651,6 +651,7 @@ CEREAL_FORCE_DYNAMIC_INIT(callback_dump_outputs);
 CEREAL_FORCE_DYNAMIC_INIT(callback_dump_weights);
 CEREAL_FORCE_DYNAMIC_INIT(callback_early_stopping);
 CEREAL_FORCE_DYNAMIC_INIT(callback_gpu_memory_usage);
+CEREAL_FORCE_DYNAMIC_INIT(callback_clip_gradient_norm);
 CEREAL_FORCE_DYNAMIC_INIT(callback_hang);
 CEREAL_FORCE_DYNAMIC_INIT(callback_load_model);
 CEREAL_FORCE_DYNAMIC_INIT(callback_mixup);

src/callbacks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ set_full_path(THIS_DIR_SOURCES
   dump_weights.cpp
   early_stopping.cpp
   gpu_memory_usage.cpp
+  gradient_clipping.cpp
   hang.cpp
   learning_rate.cpp
   load_model.cpp
