ginkgo-project · emre-safa · Aug 29, 2025 · Sep 4, 2025 · Sep 8, 2025 · Sep 9, 2025
diff --git a/core/test/base/half.cpp b/core/test/base/half.cpp
@@ -130,6 +130,54 @@ TEST(FloatToHalf, TruncatesLargeNumberRoundToEven)
     EXPECT_EQ(get_bits(neg_x2), get_bits("1" "11110" "1001001111"));
 }
 
+TEST(FloatToHalf, ConvertsRandomPositiveNumber)
+{
+    // Input is about 17.69 * 2^-24, so the nearest half subnormal is 18 * 2^-24.
+    const half converted = create_from_bits<float>("0" "01101011" "00011011000001011110010");
+    ASSERT_EQ(get_bits(converted), get_bits("0" "00000" "0000010010"));
+}
+
+TEST(FloatToHalf, RoundsUpToEvenNumber)
+{
+    // Input is exactly 7.5 * 2^-24: a tie that rounds up to the even 8 * 2^-24.
+    const half converted = create_from_bits<float>("0" "01101001" "11100000000000000000000");
+    ASSERT_EQ(get_bits(converted), get_bits("0" "00000" "0000001000"));
+}
+
+TEST(FloatToHalf, RoundsDownToEvenNumber)
+{
+    // Input is exactly -58.5 * 2^-24: a tie whose even neighbour is -58 * 2^-24.
+    const half converted = create_from_bits<float>("1" "01101100" "11010100000000000000000");
+    ASSERT_EQ(get_bits(converted), get_bits("1" "00000" "0000111010"));
+}
+
+TEST(FloatToHalf, LargestNumberThatConvertsToZero)
+{
+    // Input is 2^-25, midway between 0 and 2^-24; the tie goes to the even 0.
+    const half converted = create_from_bits<float>("0" "01100110" "00000000000000000000000");
+    ASSERT_EQ(get_bits(converted), get_bits("0" "00000" "0000000000"));
+}
+
+TEST(FloatToHalf, SmallestNumberThatDoesntConvertToZero)
+{
+    // One float ulp above 2^-25 is past the midpoint, so it rounds up to 2^-24.
+    const half converted = create_from_bits<float>("0" "01100110" "00000000000000000000001");
+    ASSERT_EQ(get_bits(converted), get_bits("0" "00000" "0000000001"));
+}
+
+TEST(FloatToHalf, RandomNumberThatConvertsToPositiveZero)
+{
+    // Magnitude is about 2^-50, far below 2^-25, so the expected bits are +0.
+    const half converted = create_from_bits<float>("0" "01001101" "10010100101111001101010");
+    ASSERT_EQ(get_bits(converted), get_bits("0" "00000" "0000000000"));
+}
+
+TEST(FloatToHalf, RandomNumberThatConvertsToNegativeZero)
+{
+    // Magnitude is about 2^-55, far below 2^-25, so the expected bits are -0.
+    const half converted = create_from_bits<float>("1" "01001000" "10000010110100001010001");
+    ASSERT_EQ(get_bits(converted), get_bits("1" "00000" "0000000000"));
+}
 
 TEST(HalfToFloat, ConvertsOne)
 {
@@ -195,4 +243,52 @@ TEST(HalfToFloat, ExtendsLargeNumber)
 }
 
 
+TEST(HalfToFloat, ConvertsPositiveRandomDenormal)
+{
+    // 940 * 2^-24 == 1.1101011b * 2^-15, i.e. float exponent field 112.
+    const float widened = create_from_bits<half>("0" "00000" "1110101100");
+    ASSERT_EQ(get_bits(widened), get_bits("0" "01110000" "11010110000000000000000"));
+}
+
+
+TEST(HalfToFloat, ConvertsNegativeRandomDenormal)
+{
+    // -189 * 2^-24 == -1.0111101b * 2^-17, i.e. float exponent field 110.
+    const float widened = create_from_bits<half>("1" "00000" "0010111101");
+    ASSERT_EQ(get_bits(widened), get_bits("1" "01101110" "01111010000000000000000"));
+}
+
+
+TEST(HalfToFloat, ConvertsSmallestPositiveDenormal)
+{
+    // 1 * 2^-24 is an exact power of two, i.e. float exponent field 103.
+    const float widened = create_from_bits<half>("0" "00000" "0000000001");
+    ASSERT_EQ(get_bits(widened), get_bits("0" "01100111" "00000000000000000000000"));
+}
+
+
+TEST(HalfToFloat, ConvertsSmallestNegativeDenormal)
+{
+    // -1023 * 2^-24 == -1.111111111b * 2^-15, i.e. float exponent field 112.
+    const float widened = create_from_bits<half>("1" "00000" "1111111111");
+    ASSERT_EQ(get_bits(widened), get_bits("1" "01110000" "11111111100000000000000"));
+}
+
+
+TEST(HalfToFloat, ConvertsLargestPositiveDenormal)
+{
+    // 1023 * 2^-24 == 1.111111111b * 2^-15, i.e. float exponent field 112.
+    const float widened = create_from_bits<half>("0" "00000" "1111111111");
+    ASSERT_EQ(get_bits(widened), get_bits("0" "01110000" "11111111100000000000000"));
+}
+
+
+TEST(HalfToFloat, ConvertsLargestNegativeDenormal)
+{
+    // -1 * 2^-24 is an exact power of two, i.e. float exponent field 103.
+    const float widened = create_from_bits<half>("1" "00000" "0000000001");
+    ASSERT_EQ(get_bits(widened), get_bits("1" "01100111" "00000000000000000000000"));
+}
+
+
 // clang-format on
diff --git a/include/ginkgo/core/base/half.hpp b/include/ginkgo/core/base/half.hpp
@@ -416,7 +416,41 @@ class alignas(std::uint16_t) half {
                 return conv::shift_sign(data_) | exp;
             } else if (f16_traits::is_denom(exp)) {
-                // TODO: handle denormals
+                // The result is a half denormal: shift the significand into place and round to nearest even.
-                return conv::shift_sign(data_);
+
+                // This can not be negative if f16_traits::is_denom(exp) is true
+                const auto tail_length =
+                    ((f32_traits::bias_mask -
+                      (data_ & f32_traits::exponent_mask)) >>
+                     f32_traits::significand_bits) -
+                    1;
+                if (tail_length > f32_traits::significand_bits + 1) {
-                if (tail_length > f32_traits::significand_bits + 1) {
+                // If even the implicit leading 1 would be shifted more than one bit below the representable field, the value is less than half of the smallest denormal, so it rounds to zero.
+                if (tail_length > f32_traits::significand_bits + 1) {
-                if (tail_length > f32_traits::significand_bits + 1) {
+                // If even the implicit leading 1 would be shifted more than one bit below the representable field, the value is less than half of the smallest denormal, so it rounds to zero.
+                if (tail_length > f32_traits::significand_bits + 1) {
+                    return conv::shift_sign(data_);
+                }
+
+                // tail holds the significand bits shifted out of the result; it is not const because the implicit leading 1 may be OR-ed in below.
+                auto tail =
+                    (data_ & f32_traits::significand_mask) &
+                    static_cast<f32_traits::bits_type>((1 << tail_length) - 1);
+
+                // Handle if the tail_length is 24. It means half precision will
+                // be the smallest possible number it can represent(or zero)
+                if (tail_length == f32_traits::significand_bits + 1) {
+                    tail |= 1 << f32_traits::significand_bits;
+                }
+
+                auto new_significand =
+                    ((data_ & f32_traits::significand_mask) >> tail_length) |
+                    (1 << (f32_traits::significand_bits - tail_length));
+
+                const auto result =
+                    conv::shift_sign(data_) | exp | new_significand;
+
+                // half is the weight of the most significant discarded bit, i.e. the tie point used for rounding below.
+                const auto half =
+                    static_cast<f32_traits::bits_type>(1 << (tail_length - 1));
+
+                return result +
+                       (tail > half || ((tail == half) && (result & 1)));
             } else {
                 // Rounding to even
                 const auto result = conv::shift_sign(data_) | exp |
@@ -442,8 +476,42 @@ class alignas(std::uint16_t) half {
             return conv::shift_sign(data_) | f32_traits::exponent_mask |
                    f32_traits::significand_mask;
         } else if (f16_traits::is_denom(data_)) {
-            // TODO: handle denormals
-            return conv::shift_sign(data_);
+            if (!(data_ & f16_traits::significand_mask)) {
+                return conv::shift_sign(data_);
+            }
+
+            int leading_zeros{};
+
+// Counts leading zeros in the significand to determine the
+// normalization shift
+#if defined(_MSC_VER)
+            unsigned long index{};
+            _BitScanReverse(&index, static_cast<std::uint32_t>(
+                                        f16_traits::significand_mask & data_));
+
+            leading_zeros = f16_traits::significand_bits - index - 1;
+#else
+            leading_zeros =
+                __builtin_clz(static_cast<std::uint32_t>(
+                    f16_traits::significand_mask & data_)) -
+                f16_traits::exponent_bits - f16_traits::sign_bits -
+                8 * (sizeof(conv::result_bits) - sizeof(conv::source_bits));
-                8 * (sizeof(conv::result_bits) - sizeof(conv::source_bits));
+                CHAR_BIT * (sizeof(conv::result_bits) - sizeof(conv::source_bits));
-                8 * (sizeof(conv::result_bits) - sizeof(conv::source_bits));
+                CHAR_BIT * (sizeof(conv::result_bits) - sizeof(conv::source_bits));
+#endif
+
+            // Computes the new exponent, 0xxxxxxxx000...00
+            auto new_exponent =
+                ((conv::bias_change >> f32_traits::significand_bits) -
+                 leading_zeros)
+                << f32_traits::significand_bits;
-            auto new_exponent =
-                ((conv::bias_change >> f32_traits::significand_bits) -
-                 leading_zeros)
-                << f32_traits::significand_bits;
+            auto new_exponent =
+                conv::bias_change - (leading_zeros << f32_traits::significand_bits);
-            auto new_exponent =
-                ((conv::bias_change >> f32_traits::significand_bits) -
-                 leading_zeros)
-                << f32_traits::significand_bits;
+            auto new_exponent =
+                conv::bias_change - (leading_zeros << f32_traits::significand_bits);
+
+            // Shifts the original significand to normalize it, remove the
+            // implicit '1', and align it in the new 23-bit field
+            auto new_significand =
+                (static_cast<f32_traits::bits_type>(data_)
+                 << (conv::significand_offset + leading_zeros + 1)) &
+                f32_traits::significand_mask;
+
+            return conv::shift_sign(data_) | new_exponent | new_significand;
         } else {
             return conv::shift_sign(data_) | conv::shift_exponent(data_) |
                    conv::shift_significand(data_);