fix/add to allow half and bfloat16 at the same time

yhmtsai · yhmtsai · commit 41735db974e8 · 2025-04-14T19:55:29.000+02:00
diff --git a/common/unified/components/fill_array_kernels.cpp b/common/unified/components/fill_array_kernels.cpp
@@ -41,7 +41,8 @@ void fill_seq_array(std::shared_ptr<const DefaultExecutor> exec,
     run_kernel(
         exec,
         [] GKO_KERNEL(auto idx, auto array) {
-            if constexpr (std::is_same_v<remove_complex<ValueType>, float16>) {
+            if constexpr (std::is_same_v<remove_complex<ValueType>, float16> ||
+                          std::is_same_v<remove_complex<ValueType>, bfloat16>) {
                 // __half can not be from int64_t
                 // __hip_bfloat16 can not be from long long
                 array[idx] = static_cast<float>(idx);
diff --git a/core/config/config_helper.hpp b/core/config/config_helper.hpp
@@ -213,7 +213,8 @@ get_value(const pnode& config)
  */
 template <typename ValueType>
 inline std::enable_if_t<std::is_floating_point<ValueType>::value ||
-                            std::is_same<ValueType, float16>::value,
+                            std::is_same<ValueType, float16>::value ||
+                            std::is_same<ValueType, bfloat16>::value,
                         ValueType>
 get_value(const pnode& config)
 {
diff --git a/core/config/type_descriptor_helper.hpp b/core/config/type_descriptor_helper.hpp
@@ -40,9 +40,11 @@ TYPE_STRING_OVERLOAD(void, "void");
 TYPE_STRING_OVERLOAD(double, "float64");
 TYPE_STRING_OVERLOAD(float, "float32");
 TYPE_STRING_OVERLOAD(float16, "float16");
+TYPE_STRING_OVERLOAD(bfloat16, "bfloat16");
 TYPE_STRING_OVERLOAD(std::complex<double>, "complex<float64>");
 TYPE_STRING_OVERLOAD(std::complex<float>, "complex<float32>");
 TYPE_STRING_OVERLOAD(std::complex<float16>, "complex<float16>");
+TYPE_STRING_OVERLOAD(std::complex<bfloat16>, "complex<bfloat16>");
 TYPE_STRING_OVERLOAD(int32, "int32");
 TYPE_STRING_OVERLOAD(int64, "int64");
 
diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh
@@ -415,7 +415,9 @@ __global__ void sptrsv_naive_caching_kernel(
     // memory operation on the half-precision shared_memory seem to give
     // wrong result. we use float in shared_memory.
     using SharedValueType = std::conditional_t<
-        std::is_same<remove_complex<ValueType>, device_type<float16>>::value,
+        std::is_same<remove_complex<ValueType>, device_type<float16>>::value ||
+            std::is_same<remove_complex<ValueType>,
+                         device_type<bfloat16>>::value,
         std::conditional_t<is_complex<ValueType>(), thrust::complex<float>,
                            float>,
         ValueType>;
diff --git a/include/ginkgo/core/base/bfloat16.hpp b/include/ginkgo/core/base/bfloat16.hpp
@@ -89,7 +89,9 @@ class alignas(std::uint16_t) bfloat16 {
     // caused by something else in jacobi or isai.
     constexpr bfloat16() noexcept : data_(0){};
 
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, half>>>
     bfloat16(const T& val) : data_(0)
     {
         this->float2bfloat16(static_cast<float>(val));
@@ -135,12 +137,16 @@ class alignas(std::uint16_t) bfloat16 {
 
     // Do operation with different type
     // If it is floating point, using floating point as type.
-    // If it is integer, using bfloat16 as type
+    // If it is half, using float as type.
+    // If it is integer, using bfloat16 as type.
 #define BFLOAT16_FRIEND_OPERATOR(_op, _opeq)                                   \
     template <typename T>                                                      \
     friend std::enable_if_t<                                                   \
-        !std::is_same<T, bfloat16>::value && std::is_scalar<T>::value,         \
-        std::conditional_t<std::is_floating_point<T>::value, T, bfloat16>>     \
+        !std::is_same<T, bfloat16>::value &&                                   \
+            (std::is_scalar<T>::value || std::is_same_v<T, half>),             \
+        std::conditional_t<                                                    \
+            std::is_floating_point<T>::value, T,                               \
+            std::conditional_t<std::is_same_v<T, half>, float, bfloat16>>>     \
     operator _op(const bfloat16& hf, const T& val)                             \
     {                                                                          \
         using type =                                                           \
@@ -151,8 +157,11 @@ class alignas(std::uint16_t) bfloat16 {
     }                                                                          \
     template <typename T>                                                      \
     friend std::enable_if_t<                                                   \
-        !std::is_same<T, bfloat16>::value && std::is_scalar<T>::value,         \
-        std::conditional_t<std::is_floating_point<T>::value, T, bfloat16>>     \
+        !std::is_same<T, bfloat16>::value &&                                   \
+            (std::is_scalar<T>::value || std::is_same_v<T, half>),             \
+        std::conditional_t<                                                    \
+            std::is_floating_point<T>::value, T,                               \
+            std::conditional_t<std::is_same_v<T, half>, float, bfloat16>>>     \
     operator _op(const T& val, const bfloat16& hf)                             \
     {                                                                          \
         using type =                                                           \
@@ -255,23 +264,29 @@ class complex<gko::bfloat16> {
         : real_(real), imag_(imag)
     {}
 
-    template <typename T, typename U,
-              typename = std::enable_if_t<std::is_scalar<T>::value &&
-                                          std::is_scalar<U>::value>>
+    template <
+        typename T, typename U,
+        typename = std::enable_if_t<
+            (std::is_scalar<T>::value || std::is_same_v<T, gko::half>)&&(
+                std::is_scalar<U>::value || std::is_same_v<U, gko::half>)>>
     explicit complex(const T& real, const U& imag)
         : real_(static_cast<value_type>(real)),
           imag_(static_cast<value_type>(imag))
     {}
 
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, gko::half>>>
     complex(const T& real)
         : real_(static_cast<value_type>(real)),
           imag_(static_cast<value_type>(0.f))
     {}
 
     // When using complex(real, imag), MSVC with CUDA try to recognize the
     // complex is a member not constructor.
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, gko::half>>>
     explicit complex(const complex<T>& other)
         : real_(static_cast<value_type>(other.real())),
           imag_(static_cast<value_type>(other.imag()))
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
@@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
@@ -25,6 +25,8 @@ class truncated;
 
 class half;
 
+class bfloat16;
+
 
 namespace detail {
 
@@ -298,7 +300,9 @@ class alignas(std::uint16_t) half {
     // caused by something else in jacobi or isai.
     constexpr half() noexcept : data_(0){};
 
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, bfloat16>>>
     half(const T& val) : data_(0)
     {
         this->float2half(static_cast<float>(val));
@@ -345,6 +349,8 @@ class alignas(std::uint16_t) half {
     // Do operation with different type
     // If it is floating point, using floating point as type.
     // If it is integer, using half as type
+    // Note: we do not define the operation with bfloat16, which is already
+    // defined in bfloat16.hpp
 #define HALF_FRIEND_OPERATOR(_op, _opeq)                                   \
     template <typename T>                                                  \
     friend std::enable_if_t<                                               \
@@ -464,23 +470,29 @@ class complex<gko::half> {
         : real_(real), imag_(imag)
     {}
 
-    template <typename T, typename U,
-              typename = std::enable_if_t<std::is_scalar<T>::value &&
-                                          std::is_scalar<U>::value>>
+    template <
+        typename T, typename U,
+        typename = std::enable_if_t<
+            (std::is_scalar<T>::value || std::is_same_v<T, gko::bfloat16>)&&(
+                std::is_scalar<U>::value || std::is_same_v<U, gko::bfloat16>)>>
     explicit complex(const T& real, const U& imag)
         : real_(static_cast<value_type>(real)),
           imag_(static_cast<value_type>(imag))
     {}
 
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, gko::bfloat16>>>
     complex(const T& real)
         : real_(static_cast<value_type>(real)),
           imag_(static_cast<value_type>(0.f))
     {}
 
     // When using complex(real, imag), MSVC with CUDA try to recognize the
     // complex is a member not constructor.
-    template <typename T, typename = std::enable_if_t<std::is_scalar<T>::value>>
+    template <typename T,
+              typename = std::enable_if_t<std::is_scalar<T>::value ||
+                                          std::is_same_v<T, gko::bfloat16>>>
     explicit complex(const complex<T>& other)
         : real_(static_cast<value_type>(other.real())),
           imag_(static_cast<value_type>(other.imag()))
diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp
@@ -89,13 +89,17 @@ GKO_REGISTER_MPI_TYPE(unsigned long long, MPI_UNSIGNED_LONG_LONG);
 GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT);
 GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE);
 GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE);
-#if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
+#if GINKGO_ENABLE_HALF
 // OpenMPI 5.0 have support from MPIX_C_FLOAT16 and MPICHv3.4a1 MPIX_C_FLOAT16
 // Only OpenMPI support complex float16
 // TODO: use native type when mpi is configured with half feature
-GKO_REGISTER_MPI_TYPE(float16, MPI_UNSIGNED_SHORT);
-GKO_REGISTER_MPI_TYPE(std::complex<float16>, MPI_FLOAT);
+GKO_REGISTER_MPI_TYPE(half, MPI_UNSIGNED_SHORT);
+GKO_REGISTER_MPI_TYPE(std::complex<half>, MPI_FLOAT);
 #endif  // GKO_ENABLE_HALF
+#if GINKGO_ENABLE_BFLOAT16
+GKO_REGISTER_MPI_TYPE(bfloat16, MPI_UNSIGNED_SHORT);
+GKO_REGISTER_MPI_TYPE(std::complex<bfloat16>, MPI_FLOAT);
+#endif  // GINKGO_ENABLE_BFLOAT16
 GKO_REGISTER_MPI_TYPE(std::complex<float>, MPI_C_FLOAT_COMPLEX);
 GKO_REGISTER_MPI_TYPE(std::complex<double>, MPI_C_DOUBLE_COMPLEX);
 
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
@@ -145,11 +145,7 @@ using uintptr = std::uintptr_t;
 /**
  * 16 bit floating point type.
  */
-#if !GINKGO_ENABLE_BFLOAT16
 using float16 = half;
-#else
-using float16 = bfloat16;
-#endif
 
 
 /**
diff --git a/include/ginkgo/core/log/logger.hpp b/include/ginkgo/core/log/logger.hpp
@@ -579,7 +579,7 @@ public:                                                              \
     {}
 
 
-#if GINKGO_ENABLE_HALF || GINKGO_ENABLE_BFLOAT16
+#if GINKGO_ENABLE_HALF
 
 
     /**
@@ -598,6 +598,25 @@ public:                                                              \
 #endif
 
 
+#if GINKGO_ENABLE_BFLOAT16
+
+
+    /**
+     * Batch solver's event that records the iteration count and the residual
+     * norm in bfloat16 precision. The default implementation does nothing.
+     *
+     * @param iters  the array of iteration counts.
+     * @param residual_norms  the array storing the bfloat16 residual norms.
+     */
+    virtual void on_batch_solver_completed(
+        const array<int>& iters,
+        const array<gko::bfloat16>& residual_norms) const
+    {}
+
+
+#endif
+
+
 public:
 #undef GKO_LOGGER_REGISTER_EVENT
 
diff --git a/omp/components/atomic.hpp b/omp/components/atomic.hpp
@@ -84,6 +84,40 @@ inline void atomic_add(float16& out, float16 val)
 }
 
 
+template <>
+inline void atomic_add(bfloat16& out, bfloat16 val)
+{
+#ifdef __NVCOMPILER
+// With NVC++, atomic capture on uint16_t fails with the following error:
+// use of undefined value '%L.B*' br label %L.B* !llvm.loop !*, !dbg !*
+#pragma omp critical
+    {
+        out += val;
+    }
+#else
+    static_assert(
+        sizeof(bfloat16) == sizeof(uint16_t) &&
+            std::alignment_of_v<uint16_t> == std::alignment_of_v<bfloat16>,
+        "bfloat16 does not fulfill the requirement of reinterpret_cast to "
+        "uint16_t or vice versa.");
+    // The reinterpret_cast is formally undefined behavior, but there is no
+    // workaround because `omp atomic` does not support custom precisions.
+    uint16_t* address_as_converter = reinterpret_cast<uint16_t*>(&out);
+    uint16_t old = *address_as_converter;
+    uint16_t assumed;
+    do {
+        assumed = old;
+        auto answer = copy_cast<uint16_t>(copy_cast<bfloat16>(assumed) + val);
+#pragma omp atomic capture
+        {
+            old = *address_as_converter;
+            *address_as_converter = (old == assumed) ? answer : old;
+        }
+    } while (assumed != old);  // retry if out changed in the meantime
+#endif
+}
+
+
 // There is an error in Clang 17 which prevents us from merging the
 // implementation of double and float. The compiler will throw an error if the
 // templated version is implemented. GCC doesn't throw an error.
@@ -119,6 +153,14 @@ inline void store(float16* addr, float16 val)
     *uint_addr = uint_val;
 }
 
+inline void store(bfloat16* addr, bfloat16 val)
+{
+    const auto raw_bits = copy_cast<uint16_t>(val);
+    const auto raw_ptr = copy_cast<uint16_t*>(addr);
+#pragma omp atomic write
+    *raw_ptr = raw_bits;  // written atomically as one 16-bit integer
+}
+
 template <typename T>
 inline void store(std::complex<T>* addr, std::complex<T> val)
 {
@@ -170,6 +212,15 @@ inline float16 load(float16* addr)
     return copy_cast<float16>(uint_val);
 }
 
+inline bfloat16 load(bfloat16* addr)
+{
+    const auto raw_ptr = copy_cast<uint16_t*>(addr);
+    uint16_t raw_bits;
+#pragma omp atomic read
+    raw_bits = *raw_ptr;  // read atomically as one 16-bit integer
+    return copy_cast<bfloat16>(raw_bits);
+}
+
 template <typename T>
 inline std::complex<T> load(std::complex<T>* addr)
 {