diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h
index e9f743a632f..275ff39e7a5 100644
--- a/programming_examples/basic/matrix_multiplication/common.h
+++ b/programming_examples/basic/matrix_multiplication/common.h
@@ -21,7 +21,7 @@
 #include <iostream>
 #include <optional>
 #include <ostream>
-#include <stdfloat>
+#include <type_traits>
 
 #include "test_utils.h"
 
@@ -92,6 +92,52 @@ void parse_options(int argc, const char *argv[], cxxopts::Options &options,
 template <typename T>
 static inline T get_random();
 
+template <typename T>
+static inline auto scalar_to_arithmetic(T value) { // bf16 -> float, else T
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_to_float(value);
+  } else {
+    return value; // every other type is returned unchanged
+  }
+}
+
+template <typename T>
+static inline float scalar_to_float(T value) { // any scalar as a float
+  return static_cast<float>(scalar_to_arithmetic(value));
+}
+
+template <typename T, typename Tacc>
+static inline T scalar_from_accum(Tacc value) { // accumulator -> output T
+  auto arithmetic_value = scalar_to_arithmetic(value);
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_from_float(
+        static_cast<float>(arithmetic_value)); // bf16 is built from a float
+  } else {
+    return static_cast<T>(arithmetic_value);
+  }
+}
+
+template <typename Tacc>
+static inline Tacc zero_accum() { // initial value for a running sum
+  if constexpr (std::is_same_v<Tacc, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_from_float(0.0f);
+  } else {
+    return Tacc(0);
+  }
+}
+
+template <typename Tacc, typename Tin> // accumulates lhs * rhs into the sum
+static inline Tacc accum_add_product(Tacc running_sum, Tin lhs, Tin rhs) {
+  auto product = scalar_to_arithmetic(lhs) * scalar_to_arithmetic(rhs);
+  if constexpr (std::is_same_v<Tacc, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_add(
+        running_sum, // product is converted to bf16 before the add
+        test_utils::bfloat16_from_float(static_cast<float>(product)));
+  } else {
+    return running_sum + Tacc(product); // product converted to Tacc first
+  }
+}
+
 template <>
 std::int16_t get_random<std::int16_t>() {
   return (std::int16_t)rand() % 0x10000;
@@ -103,10 +149,11 @@ int8_t get_random<int8_t>() {
 }
 
 template <>
-std::bfloat16_t get_random<std::bfloat16_t>() {
+test_utils::bfloat16_t get_random<test_utils::bfloat16_t>() {
   // Random numbers should NOT be uniformly between 0 and 1, because that
   // would make the matrix product AB always close to 1.
-  return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
+  return test_utils::bfloat16_from_float(4.0f * (float)rand() /
+                                         (float)(RAND_MAX));
 }
 
 template <typename Tin, typename Tout, typename Tacc>
@@ -115,18 +162,20 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
             int c_col_maj) {
   for (int row = 0; row < M; row++) {
     for (int col = 0; col < N; col++) {
-      Tacc running_sum = 0;
+      Tacc running_sum = zero_accum<Tacc>();
       for (int k = 0; k < K; k++) {
         if (!b_col_maj) {
-          running_sum += Tacc(A[row * K + k] * B[k * N + col]);
+          running_sum = accum_add_product<Tacc>(running_sum, A[row * K + k],
+                                                B[k * N + col]);
         } else {
-          running_sum += Tacc(A[row * K + k] * B[k + col * K]);
+          running_sum = accum_add_product<Tacc>(running_sum, A[row * K + k],
+                                                B[k + col * K]);
         }
       }
       if (!c_col_maj) {
-        C[row * N + col] = Tout(running_sum);
+        C[row * N + col] = scalar_from_accum<Tout>(running_sum);
       } else {
-        C[row + col * M] = Tout(running_sum);
+        C[row + col * M] = scalar_from_accum<Tout>(running_sum);
       }
     }
   }
@@ -135,15 +184,17 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
 template <typename Tin, typename Tout, typename Tacc>
 Tout mul_acc(int M, int N, int K, int row, int col, const std::vector<Tin> A,
              const std::vector<Tin> B, int b_col_maj) {
-  Tacc running_sum = 0;
+  Tacc running_sum = zero_accum<Tacc>();
   for (int k = 0; k < K; k++) {
     if (!b_col_maj) {
-      running_sum += Tacc(A[row * K + k] * B[k * N + col]);
+      running_sum =
+          accum_add_product<Tacc>(running_sum, A[row * K + k], B[k * N + col]);
     } else {
-      running_sum += Tacc(A[row * K + k] * B[k + col * K]);
+      running_sum =
+          accum_add_product<Tacc>(running_sum, A[row * K + k], B[k + col * K]);
     }
   }
-  return (Tout)running_sum;
+  return scalar_from_accum<Tout>(running_sum);
 }
 
 // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
@@ -184,7 +235,7 @@ float get_abs_tol<std::int32_t>() {
 }
 
 template <>
-float get_abs_tol<std::bfloat16_t>() {
+float get_abs_tol<test_utils::bfloat16_t>() {
   return 0.5;
 }
 
@@ -209,7 +260,7 @@ float get_rel_tol<std::int32_t>() {
 }
 
 template <>
-float get_rel_tol<std::bfloat16_t>() {
+float get_rel_tol<test_utils::bfloat16_t>() {
   return 0.05;
 }
 
@@ -314,8 +365,9 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual,
               float abs_tol, float rel_tol) {
   bool match = expected == actual;
   if (abs_tol > 0 || rel_tol > 0) {
-    // Allow for some tolerance for float data types
-    match = nearly_equal(expected, actual, rel_tol, abs_tol);
+    // Allow for some tolerance for float and host-side bfloat16 data types.
+    match = nearly_equal(scalar_to_float(expected), scalar_to_float(actual),
+                         rel_tol, abs_tol);
   }
   if (!match) {
     return (struct error<Tout>){row, col, expected, actual};
@@ -326,12 +378,13 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual,
 template <typename Tout>
 void print_error_summary(std::ostream &os, int n_errors,
                          std::vector<struct error<Tout>> &errors,
-                         Tout max_rel_error) {
+                         float max_rel_error) {
   for (struct error<Tout> &err : errors) {
     os << "[" << std::setw(5) << err.row << ", " << std::setw(5) << err.col
        << "] " << std::setw(4) << std::setprecision(2) << std::fixed
-       << (float)err.actual << " =!= " << std::setw(4) << std::setprecision(2)
-       << std::fixed << (float)err.expected << std::endl;
+       << scalar_to_float(err.actual) << " =!= " << std::setw(4)
+       << std::setprecision(2) << std::fixed << scalar_to_float(err.expected)
+       << std::endl;
   }
   if (n_errors > max_printable_errors) {
     os << "...and " << std::setw(0) << n_errors - max_printable_errors
@@ -357,7 +410,7 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
            float rel_tol = 0.05, int b_col_maj = 0, int c_col_maj = 0) {
   int n_errors = 0;
   std::vector<struct error<Tout>> errors;
-  Tout max_rel_error = (Tout)0.0f;
+  float max_rel_error = 0.0f;
   struct error<Tout> max_error;
 
   std::vector<Tout> CRef(M * N);
@@ -372,9 +425,11 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
         if (n_errors < max_printable_errors) {
           errors.push_back(*error);
         }
-        Tout rel_error =
-            std::abs(error->actual - error->expected) /
-            std::max(std::abs(error->actual), std::abs(error->expected));
+        float actual_value = scalar_to_float(error->actual);
+        float expected_value = scalar_to_float(error->expected);
+        float rel_error =
+            std::abs(actual_value - expected_value) /
+            std::max(std::abs(actual_value), std::abs(expected_value));
         if (rel_error > max_rel_error) {
           max_rel_error = rel_error;
           max_error = *error;
@@ -414,7 +469,7 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
 
   int n_errors = 0;
   std::vector<struct error<Tout>> errors;
-  Tout max_rel_error = (Tout)0.0f;
+  float max_rel_error = 0.0f;
   double progress = 0;
   for (std::tuple<size_t, std::tuple<int &, int &>> cell :
        std::views::enumerate(std::views::zip(sampled_rows, sampled_cols))) {
@@ -440,9 +495,11 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
       if (n_errors < max_printable_errors) {
         errors.push_back(*error);
       }
-      Tout rel_error =
-          std::abs(error->actual - error->expected) /
-          std::max(std::abs(error->actual), std::abs(error->expected));
+      float actual_value = scalar_to_float(error->actual);
+      float expected_value = scalar_to_float(error->expected);
+      float rel_error =
+          std::abs(actual_value - expected_value) /
+          std::max(std::abs(actual_value), std::abs(expected_value));
       if (rel_error > max_rel_error) {
         max_rel_error = rel_error;
       }
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index afc028f200e..284a288de4f 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -42,10 +42,10 @@ devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu)
 colshift ?= $(if $(filter npu,$(devicename)),1,0)
 
 ifeq ($(dtype_in),bf16)
-	dtype_in_cpp=std::bfloat16_t
+	dtype_in_cpp=test_utils::bfloat16_t
 endif
 ifeq ($(dtype_out),bf16)
-	dtype_out_cpp=std::bfloat16_t
+	dtype_out_cpp=test_utils::bfloat16_t
 	dtype_acc_cpp=float
 endif
 ifeq ($(dtype_in),i16)
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
index eb41adafa3b..31cc6e3a3e9 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp
@@ -8,12 +8,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <stdfloat>
 #include <stdint.h>
 
 #define DATATYPES_USING_DEFINED
-using A_DATATYPE = int16_t; // std::bfloat16_t;
-using B_DATATYPE = int16_t; // std::bfloat16_t;
+using A_DATATYPE = int16_t;
+using B_DATATYPE = int16_t;
 using C_DATATYPE = int32_t; // float;
 using ACC_DATATYPE = int32_t;
 
diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp
index 1c2c33f1394..de82d8b84be 100644
--- a/programming_examples/basic/matrix_multiplication/test.cpp
+++ b/programming_examples/basic/matrix_multiplication/test.cpp
@@ -18,7 +18,6 @@
 #include <iomanip>
 #include <iostream>
 #include <sstream>
-#include <stdfloat>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
@@ -29,10 +28,10 @@
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
 #ifndef DTYPE_IN
-#define DTYPE_IN std::bfloat16_t
+#define DTYPE_IN test_utils::bfloat16_t
 #endif
 #ifndef DTYPE_OUT
-#define DTYPE_OUT std::bfloat16_t
+#define DTYPE_OUT test_utils::bfloat16_t
 #endif
 #ifndef DTYPE_ACC
 #define DTYPE_ACC float
diff --git a/programming_examples/basic/vector_exp/test.cpp b/programming_examples/basic/vector_exp/test.cpp
index 1c179c0d4df..4b3f16846ba 100644
--- a/programming_examples/basic/vector_exp/test.cpp
+++ b/programming_examples/basic/vector_exp/test.cpp
@@ -9,9 +9,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "cxxopts.hpp"
-#include <bits/stdc++.h>
+#include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -26,8 +30,8 @@
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-using INOUT0_DATATYPE = std::bfloat16_t;
-using INOUT1_DATATYPE = std::bfloat16_t;
+using INOUT0_DATATYPE = test_utils::bfloat16_t;
+using INOUT1_DATATYPE = test_utils::bfloat16_t;
 #endif
 
 // ----------------------------------------------------------------------------
@@ -37,16 +41,19 @@ template <typename T>
 int verify(int CSize, std::vector<T> A, std::vector<T> C, int verbosity) {
   int errors = 0;
   for (uint32_t i = 0; i < CSize; i++) {
-    std::bfloat16_t ref = exp(A[i]);
+    const float input = test_utils::bfloat16_to_float(A[i]);
+    const float actual = test_utils::bfloat16_to_float(C[i]);
+    const auto ref_bf16 = test_utils::bfloat16_from_float(std::exp(input));
+    const float ref = test_utils::bfloat16_to_float(ref_bf16);
     // Let's check if they are inf or nan, and if so just pass because
     // comparisions will then fail, even for matches
-    if (std::isinf(ref) || std::isinf(C[i]))
+    if (std::isinf(ref) || std::isinf(actual))
       break;
-    if (std::isnan(ref) || std::isnan(C[i]))
+    if (std::isnan(ref) || std::isnan(actual))
       break;
-    if (!test_utils::nearly_equal(ref, C[i], 0.128)) {
+    if (!test_utils::nearly_equal(ref, actual, 0.128)) {
       if (errors < 100) {
-        std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
+        std::cout << "Error in output " << actual << " != " << ref << std::endl;
       } else if (errors == 100) {
         std::cout << "..." << std::endl;
         std::cout << "[Errors truncated]" << std::endl;
@@ -54,7 +61,7 @@ int verify(int CSize, std::vector<T> A, std::vector<T> C, int verbosity) {
       errors++;
     } else {
       if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+        std::cout << "Correct output " << actual << " == " << ref << std::endl;
     }
   }
   return errors;
@@ -161,9 +168,8 @@ int main(int argc, const char *argv[]) {
   INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
   std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
   for (int i = 0; i < INOUT0_VOLUME; i++) {
-    std::uint16_t u16 = (std::uint16_t)i;
-    std::bfloat16_t bf16 = *(std::bfloat16_t *)&u16;
-    AVec[i] = bf16;
+    const std::uint16_t bits = static_cast<std::uint16_t>(i);
+    AVec[i] = test_utils::bfloat16_from_bits(bits);
   }
   memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
 
@@ -203,7 +209,7 @@ int main(int argc, const char *argv[]) {
       /* Warmup iterations do not count towards average runtime. */
       continue;
     }
-    std::bfloat16_t *bufOut = bo_inout1.map<std::bfloat16_t *>();
+    INOUT1_DATATYPE *bufOut = bo_inout1.map<INOUT1_DATATYPE *>();
 
     // Copy output results and verify they are correct
     std::vector<INOUT1_DATATYPE> CVec(INOUT1_VOLUME);
diff --git a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile
index c228738ebec..3efcbca088e 100755
--- a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile
+++ b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile
@@ -40,7 +40,7 @@ else
 endif
 
 ifeq ($(dtype),bf16)
-	dtype_cpp=std::bfloat16_t
+	dtype_cpp=test_utils::bfloat16_t
 endif
 ifeq ($(dtype),i32)
 	dtype_cpp=std::int32_t
diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile
index 6ebb0850f09..79291cd6d39 100644
--- a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile
+++ b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile
@@ -41,7 +41,7 @@ aie_py_src=${targetname}.py
 endif
 
 ifeq ($(dtype),bf16)
-	dtype_cpp=std::bfloat16_t
+	dtype_cpp=test_utils::bfloat16_t
 endif
 ifeq ($(dtype),i32)
 	dtype_cpp=std::int32_t
diff --git a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile
index 5e1ab1db180..55630048482 100755
--- a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile
+++ b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile
@@ -30,7 +30,7 @@ aie_py_src=${targetname}.py
 endif
 
 ifeq ($(dtype),bf16)
-	dtype_cpp=std::bfloat16_t
+	dtype_cpp=test_utils::bfloat16_t
 endif
 ifeq ($(dtype),i32)
 	dtype_cpp=std::int32_t
diff --git a/programming_examples/basic/vector_reduce_max/test.cpp b/programming_examples/basic/vector_reduce_max/test.cpp
index 86f4219496b..890c54df985 100644
--- a/programming_examples/basic/vector_reduce_max/test.cpp
+++ b/programming_examples/basic/vector_reduce_max/test.cpp
@@ -7,69 +7,108 @@
 // Copyright (C) 2025, Advanced Micro Devices, Inc.
 //
 //===----------------------------------------------------------------------===//
+
 #include "xrt_test_wrapper.h"
 #include <cstdint>
-#include <stdfloat>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <type_traits>
+
 #ifndef DTYPE
-#define DTYPE std::bfloat16_t
+#define DTYPE test_utils::bfloat16_t
 #endif
-// ------------------------------------------------------
-// Configure this to match your buffer data type
-// -----------------------------------------------------
+
 using DATATYPE = DTYPE;
 
+template <typename T>
+T random_input_value() { // one random test input of type T
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(-4.0f),
+                                         test_utils::bfloat16_from_float(8.0f));
+  } else if constexpr (std::is_same_v<T, std::int32_t>) {
+    return test_utils::random_int32_t(100000);
+  } else {
+    std::cerr << "Unsupported data type" << std::endl;
+    std::exit(EXIT_FAILURE); // other types compile but exit when called
+  }
+}
+
+template <typename T>
+T lowest_value() { // starting value for a running maximum
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_from_float(
+        -std::numeric_limits<float>::infinity()); // -inf, not lowest finite
+  } else {
+    return std::numeric_limits<T>::lowest();
+  }
+}
+
+template <typename T>
+bool less_than(T lhs, T rhs) { // lhs < rhs; bf16 is compared as float
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_to_float(lhs) <
+           test_utils::bfloat16_to_float(rhs);
+  } else {
+    return lhs < rhs;
+  }
+}
+
+template <typename T>
+bool values_equal(T lhs, T rhs) { // bf16 via nearly_equal_bfloat16, else ==
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::nearly_equal_bfloat16(lhs, rhs);
+  } else {
+    return lhs == rhs;
+  }
+}
+
+template <typename T>
+auto printable_value(T value) { // bf16 is printed as float
+  if constexpr (std::is_same_v<T, test_utils::bfloat16_t>) {
+    return test_utils::bfloat16_to_float(value);
+  } else {
+    return value;
+  }
+}
+
 void initialize_bufIn1(DATATYPE *bufIn1, int SIZE) {
-  DATATYPE max = std::numeric_limits<DATATYPE>::lowest();
+  DATATYPE max = lowest_value<DATATYPE>();
   for (int i = 0; i < SIZE; i++) {
-    DATATYPE next;
-    if constexpr (std::is_same_v<DATATYPE, std::bfloat16_t> &&
-                  std::is_same_v<DATATYPE, std::bfloat16_t>) {
-      next = test_utils::random_bfloat16_t((std::bfloat16_t)-4.0,
-                                           (std::bfloat16_t)8.0);
-    } else if constexpr (std::is_same_v<DATATYPE, int32_t> &&
-                         std::is_same_v<DATATYPE, int32_t>) {
-      next = test_utils::random_int32_t(100000);
-    } else {
-      std::cerr << "Unsupported data type" << std::endl;
-      std::exit(EXIT_FAILURE);
-    }
-    if (next > max)
+    DATATYPE next = random_input_value<DATATYPE>();
+    if (less_than(max, next))
       max = next;
     bufIn1[i] = next;
   }
 }
 
-// Initialize Output buffer
-void initialize_bufOut(DATATYPE *bufOut, int SIZE) { memset(bufOut, 0, SIZE); }
+void initialize_bufOut(DATATYPE *bufOut, int SIZE) {
+  std::memset(bufOut, 0, SIZE * sizeof(DATATYPE)); // SIZE is in elements
+}
 
-// Functional correctness verifyer
 int verify_vector_reduce_max(DATATYPE *bufIn1, DATATYPE *bufOut, int SIZE,
                              int verbosity) {
   int errors = 0;
 
-  // Calculate max within the function
-  DATATYPE max = std::numeric_limits<DATATYPE>::lowest();
+  DATATYPE max = lowest_value<DATATYPE>();
   for (int i = 0; i < SIZE; i++) {
-    if (bufIn1[i] > max)
+    if (less_than(max, bufIn1[i]))
       max = bufIn1[i];
   }
 
-  if (bufOut[0] != max) {
+  if (!values_equal(bufOut[0], max)) {
     errors++;
-    std::cout << "max is " << max << " calc " << bufOut[0] << std::endl;
-  } else {
-    if (verbosity >= 1)
-      std::cout << "max is " << max << " calc " << bufOut[0] << std::endl;
+    std::cout << "max is " << printable_value(max) << " calc "
+              << printable_value(bufOut[0]) << std::endl;
+  } else if (verbosity >= 1) {
+    std::cout << "max is " << printable_value(max) << " calc "
+              << printable_value(bufOut[0]) << std::endl;
   }
   return errors;
 }
 
-//*****************************************************************************
-// Should not need to modify below section
-//*****************************************************************************
-
 int main(int argc, const char *argv[]) {
-
   constexpr int IN1_VOLUME = IN1_SIZE / sizeof(DATATYPE);
   constexpr int OUT_VOLUME = OUT_SIZE / sizeof(DATATYPE);
 
@@ -79,4 +118,4 @@ int main(int argc, const char *argv[]) {
                               initialize_bufOut, verify_vector_reduce_max>(
       IN1_VOLUME, OUT_VOLUME, myargs);
   return res;
-}
\ No newline at end of file
+}
diff --git a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp
index 0f785c104e5..44fc021a9d6 100644
--- a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp
+++ b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp
@@ -8,7 +8,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
 #include <boost/program_options.hpp>
 #include <cmath>
 #include <cstdint>
@@ -28,15 +27,6 @@
 #include "../helper.h"
 #include "common.h"
 
-#include <stdfloat>
-
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = float;
-} // namespace std
-#endif
-
 int main(int argc, const char *argv[]) {
 
   // ------------------------------------------------------
@@ -113,9 +103,9 @@ int main(int argc, const char *argv[]) {
 
   auto boInstr = xrt::bo(device, instr.size() * sizeof(int),
                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto boInA = xrt::bo(device, numberFloats * sizeof(std::bfloat16_t),
+  auto boInA = xrt::bo(device, numberFloats * sizeof(test_utils::bfloat16_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto boInB = xrt::bo(device, numberFloats * sizeof(std::bfloat16_t),
+  auto boInB = xrt::bo(device, numberFloats * sizeof(test_utils::bfloat16_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
   auto boOut = xrt::bo(device, bfpBytesSize * sizeof(int8_t),
                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
@@ -139,22 +129,24 @@ int main(int argc, const char *argv[]) {
     return generateRandomFloatingPoint(rng, -5, 5);
   });
 
-  std::bfloat16_t bfloatA[numberFloats];
-  std::bfloat16_t bfloatB[numberFloats];
+  test_utils::bfloat16_t bfloatA[numberFloats];
+  test_utils::bfloat16_t bfloatB[numberFloats];
 
-  std::ranges::transform(
-      floatA, bfloatA, [](float f) { return static_cast<std::bfloat16_t>(f); });
-  std::ranges::transform(
-      floatB, bfloatB, [](float f) { return static_cast<std::bfloat16_t>(f); });
+  std::ranges::transform(floatA, bfloatA, [](float f) {
+    return test_utils::bfloat16_from_float(f);
+  });
+  std::ranges::transform(floatB, bfloatB, [](float f) {
+    return test_utils::bfloat16_from_float(f);
+  });
 
   // ------------------------------------------------------
   // Write data into buffers
   // ------------------------------------------------------
-  std::bfloat16_t *bufInA = boInA.map<std::bfloat16_t *>();
-  memcpy(bufInA, bfloatA, (numberFloats * sizeof(std::bfloat16_t)));
+  test_utils::bfloat16_t *bufInA = boInA.map<test_utils::bfloat16_t *>();
+  memcpy(bufInA, bfloatA, (numberFloats * sizeof(test_utils::bfloat16_t)));
 
-  std::bfloat16_t *bufInB = boInB.map<std::bfloat16_t *>();
-  memcpy(bufInB, bfloatB, (numberFloats * sizeof(std::bfloat16_t)));
+  test_utils::bfloat16_t *bufInB = boInB.map<test_utils::bfloat16_t *>();
+  memcpy(bufInB, bfloatB, (numberFloats * sizeof(test_utils::bfloat16_t)));
 
   void *bufInstr = boInstr.map<void *>();
   memcpy(bufInstr, instr.data(), instr.size() * sizeof(int));
diff --git a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp
index b7dce1e8087..9196eb2418e 100644
--- a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp
+++ b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp
@@ -27,20 +27,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../helper.h"
 #include "common.h"
 #include "gemm_atb_layout.h"
@@ -61,8 +53,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // are initialized to all-ones: every C[i,j] reduces to exactly K, which is
 // representable losslessly in BFP16 ebs8 (no requantization compounding),
 // and the CPU reference produces the same value bit-for-bit.
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>();
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>();
 
 int main(int argc, const char *argv[]) {
 
diff --git a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp
index 5468ddc2640..4080c103386 100644
--- a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp
+++ b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp
@@ -22,20 +22,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../helper.h"
 #include "common.h"
 #include "gemm_atb_layout.h"
@@ -49,8 +41,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // Verification tolerance
 // See "Note on Numerical Tolerances" in README.md
 // TODO: This might have to be adjusted for bfp
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>() * 2.0f;
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>() * 2.0f;
 
 int main(int argc, const char *argv[]) {
 
@@ -154,11 +146,11 @@ int main(int argc, const char *argv[]) {
 
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_a = xrt::bo(device, A_SIZE * sizeof(std::bfloat16_t),
+  auto bo_a = xrt::bo(device, A_SIZE * sizeof(test_utils::bfloat16_t),
                       XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
   auto bo_b =
       xrt::bo(device, B_VOLUME, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-  auto bo_out = xrt::bo(device, C_SIZE * sizeof(std::bfloat16_t),
+  auto bo_out = xrt::bo(device, C_SIZE * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
 
   // ------------------------------------------------------
@@ -168,20 +160,22 @@ int main(int argc, const char *argv[]) {
     std::cout << "Writing data into buffer objects.\n";
   }
 
-  std::vector<std::bfloat16_t> AVec(A_SIZE);
+  std::vector<test_utils::bfloat16_t> AVec(A_SIZE);
   for (int i = 0; i < A_SIZE; i++) {
-    AVec[i] = (std::bfloat16_t)((rand() % 8) - 4);
+    AVec[i] =
+        test_utils::bfloat16_from_float(static_cast<float>((rand() % 8) - 4));
   }
-  std::vector<std::bfloat16_t> BVec(B_SIZE);
+  std::vector<test_utils::bfloat16_t> BVec(B_SIZE);
   for (int i = 0; i < B_SIZE; i++) {
-    BVec[i] = (std::bfloat16_t)((rand() % 8) - 4);
+    BVec[i] =
+        test_utils::bfloat16_from_float(static_cast<float>((rand() % 8) - 4));
   }
 
   // This is a quick conversion to avoid having to create a custom function for
   // bf16 for now
   std::vector<float> BVecFloat(B_SIZE);
   for (int i = 0; i < B_SIZE; i++) {
-    BVecFloat[i] = (float)BVec[i];
+    BVecFloat[i] = test_utils::bfloat16_to_float(BVec[i]);
   }
 
   auto shuffleStart = std::chrono::high_resolution_clock::now();
@@ -201,9 +195,9 @@ int main(int argc, const char *argv[]) {
   // ------------------------------------------------------
   // Write data into buffers
   // ------------------------------------------------------
-  std::bfloat16_t *bufA = bo_a.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufA = bo_a.map<test_utils::bfloat16_t *>();
   uint8_t *bufB = bo_b.map<uint8_t *>();
-  memcpy(bufA, AVec.data(), AVec.size() * sizeof(std::bfloat16_t));
+  memcpy(bufA, AVec.data(), AVec.size() * sizeof(test_utils::bfloat16_t));
   memcpy(bufB, BVecBfpShuffled.data(), B_VOLUME);
 
   // Initialize outputs; bufOut is results matrix
@@ -260,8 +254,8 @@ int main(int argc, const char *argv[]) {
   // verify pass does not pollute the per-iter average runtime.
   // ------------------------------------------------------
   if (do_verify) {
-    std::vector<std::bfloat16_t> CVec(C_SIZE);
-    memcpy(CVec.data(), bufOut, CVec.size() * sizeof(std::bfloat16_t));
+    std::vector<test_utils::bfloat16_t> CVec(C_SIZE);
+    memcpy(CVec.data(), bufOut, CVec.size() * sizeof(test_utils::bfloat16_t));
 
     if (verbosity >= 1) {
       std::cout << "Verifying against reference matmul ..." << std::endl;
@@ -272,12 +266,13 @@ int main(int argc, const char *argv[]) {
       // in fp32, and the bf16 accumulator pattern from the canonical
       // mixed_test.cpp loses too much precision over the K=4096 reductions
       // that the paper-scale shapes require.
-      errors = matmul_common::verify_stochastic<std::bfloat16_t,
-                                                std::bfloat16_t, float>(
+      errors = matmul_common::verify_stochastic<test_utils::bfloat16_t,
+                                                test_utils::bfloat16_t, float>(
           M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity,
           abs_tol, rel_tol, /*b_col_maj=*/0);
     } else {
-      errors = matmul_common::verify<std::bfloat16_t, std::bfloat16_t, float>(
+      errors = matmul_common::verify<test_utils::bfloat16_t,
+                                     test_utils::bfloat16_t, float>(
           M, N, K, AVec, BVec, CVec, verbosity, abs_tol, rel_tol,
           /*b_col_maj=*/0);
     }
diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp
index 0969e719218..92522808858 100644
--- a/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp
+++ b/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp
@@ -14,20 +14,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../helper.h"
 #include "common.h"
 
@@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // Verification tolerance
 // See "Note on Numerical Tolerances" in README.md
 // TODO: This might have to be adjusted for bfp
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>();
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>();
 
 int main(int argc, const char *argv[]) {
 
diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp
index 4cc51f2c0e8..fab597f1f46 100644
--- a/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp
+++ b/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp
@@ -14,20 +14,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../../helper.h"
 #include "common.h"
 
@@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // Verification tolerance
 // See "Note on Numerical Tolerances" in README.md
 // TODO: This might have to be adjusted for bfp
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>();
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>();
 
 int main(int argc, const char *argv[]) {
 
diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp
index f536ba3cc1d..c875796c5af 100644
--- a/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp
+++ b/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp
@@ -14,20 +14,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../helper.h"
 #include "common.h"
 
@@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // Verification tolerance
 // See "Note on Numerical Tolerances" in README.md
 // TODO: This might have to be adjusted for bfp
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>() * 2.0f;
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>() * 2.0f;
 
 int main(int argc, const char *argv[]) {
 
@@ -145,11 +137,11 @@ int main(int argc, const char *argv[]) {
 
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_a = xrt::bo(device, A_SIZE * sizeof(std::bfloat16_t),
+  auto bo_a = xrt::bo(device, A_SIZE * sizeof(test_utils::bfloat16_t),
                       XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
   auto bo_b =
       xrt::bo(device, B_VOLUME, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-  auto bo_out = xrt::bo(device, C_SIZE * sizeof(std::bfloat16_t),
+  auto bo_out = xrt::bo(device, C_SIZE * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
 
   // ------------------------------------------------------
@@ -159,10 +151,11 @@ int main(int argc, const char *argv[]) {
     std::cout << "Writing data into buffer objects.\n";
   }
 
-  std::vector<std::bfloat16_t> AVec(A_SIZE);
+  std::vector<test_utils::bfloat16_t> AVec(A_SIZE);
   for (int i = 0; i < A_SIZE; i++) {
-    // Limiting to 16 to avoid precision loss issues
+    // Limit values to integers in [-4, 3] to avoid precision loss issues
-    AVec[i] = (std::bfloat16_t)((rand() % 8) - 4);
+    AVec[i] =
+        test_utils::bfloat16_from_float(static_cast<float>((rand() % 8) - 4));
     // AVec[i] = i;
     // if (i % N == i / N) {
     //   AVec[i] = 1.0;
@@ -173,10 +166,11 @@ int main(int argc, const char *argv[]) {
     // AVec[i] = (i / 8) % 1000;
   }
 
-  std::vector<std::bfloat16_t> BVec(B_SIZE);
+  std::vector<test_utils::bfloat16_t> BVec(B_SIZE);
   for (int i = 0; i < B_SIZE; i++) {
-    // Limiting to 16 to avoid precision loss issues
+    // Limit values to integers in [-4, 3] to avoid precision loss issues
-    BVec[i] = (std::bfloat16_t)((rand() % 8) - 4);
+    BVec[i] =
+        test_utils::bfloat16_from_float(static_cast<float>((rand() % 8) - 4));
     // Diagonal:
     // if (i % N == i / N) {
     //   BVec[i] = 1.0;
@@ -191,7 +185,7 @@ int main(int argc, const char *argv[]) {
   // bf16 for now
   std::vector<float> BVecFloat(B_SIZE);
   for (int i = 0; i < B_SIZE; i++) {
-    BVecFloat[i] = (float)BVec[i];
+    BVecFloat[i] = test_utils::bfloat16_to_float(BVec[i]);
   }
 
   auto BVecBfp = floatToBfp16(8, B_SIZE, BVecFloat.data(), 0);
@@ -208,9 +202,9 @@ int main(int argc, const char *argv[]) {
   // ------------------------------------------------------
   // Write data into buffers
   // ------------------------------------------------------
-  std::bfloat16_t *bufA = bo_a.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufA = bo_a.map<test_utils::bfloat16_t *>();
   uint8_t *bufB = bo_b.map<uint8_t *>();
-  memcpy(bufA, AVec.data(), AVec.size() * sizeof(std::bfloat16_t));
+  memcpy(bufA, AVec.data(), AVec.size() * sizeof(test_utils::bfloat16_t));
   memcpy(bufB, BVecBfpShuffled.data(), B_VOLUME);
 
   // Initialize outputs; bufOut is results matrix
@@ -257,22 +251,23 @@ int main(int argc, const char *argv[]) {
     // Check output
     // ------------------------------------------------------
     if (do_verify) {
-      std::vector<std::bfloat16_t> CVec(C_SIZE);
-      memcpy(CVec.data(), bufOut, CVec.size() * sizeof(std::bfloat16_t));
+      std::vector<test_utils::bfloat16_t> CVec(C_SIZE);
+      memcpy(CVec.data(), bufOut, CVec.size() * sizeof(test_utils::bfloat16_t));
 
       if (verbosity >= 1) {
         std::cout << "Verifying against reference matmul ..." << std::endl;
       }
       auto vstart = std::chrono::system_clock::now();
       if (do_verify_stochastic) {
-        errors =
-            matmul_common::verify_stochastic<std::bfloat16_t, std::bfloat16_t,
-                                             std::bfloat16_t>(
-                M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples,
-                verbosity, abs_tol, rel_tol, true);
+        errors = matmul_common::verify_stochastic<test_utils::bfloat16_t,
+                                                  test_utils::bfloat16_t,
+                                                  test_utils::bfloat16_t>(
+            M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity,
+            abs_tol, rel_tol, true);
       } else {
-        errors = matmul_common::verify<std::bfloat16_t, std::bfloat16_t,
-                                       std::bfloat16_t>(
+        errors = matmul_common::verify<test_utils::bfloat16_t,
+                                       test_utils::bfloat16_t,
+                                       test_utils::bfloat16_t>(
             M, N, K, AVec, BVec, CVec, verbosity, abs_tol, rel_tol, true);
       }
       auto vstop = std::chrono::system_clock::now();
diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp
index 4b91f58dbb8..22fff460945 100644
--- a/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp
+++ b/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp
@@ -14,20 +14,12 @@
 #include <cstdlib>
 #include <ctime>
 #include <iostream>
-#include <stdfloat>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
 #include "xrt/xrt_device.h"
 #include "xrt/xrt_kernel.h"
 
-// Clangd fix, remove
-#ifdef _CLANGD
-namespace std {
-using bfloat16_t = double;
-} // namespace std
-#endif
-
 #include "../../helper.h"
 #include "common.h"
 
@@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000;
 // Verification tolerance
 // See "Note on Numerical Tolerances" in README.md
 // TODO: This might have to be adjusted for bfp
-float abs_tol = matmul_common::get_abs_tol<std::bfloat16_t>();
-float rel_tol = matmul_common::get_rel_tol<std::bfloat16_t>();
+float abs_tol = matmul_common::get_abs_tol<test_utils::bfloat16_t>();
+float rel_tol = matmul_common::get_rel_tol<test_utils::bfloat16_t>();
 
 int main(int argc, const char *argv[]) {
 
diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp
index fbb1d911657..b3ddfd960a3 100644
--- a/programming_examples/ml/eltwise_add/test.cpp
+++ b/programming_examples/ml/eltwise_add/test.cpp
@@ -8,8 +8,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <bits/stdc++.h>
+#include <algorithm>
+#include <chrono>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -25,9 +29,9 @@
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-using INOUT0_DATATYPE = std::bfloat16_t;
-using INOUT1_DATATYPE = std::bfloat16_t;
-using INOUT2_DATATYPE = std::bfloat16_t;
+using INOUT0_DATATYPE = test_utils::bfloat16_t;
+using INOUT1_DATATYPE = test_utils::bfloat16_t;
+using INOUT2_DATATYPE = test_utils::bfloat16_t;
 #endif
 
 // ----------------------------------------------------------------------------
@@ -38,11 +42,15 @@ int verify(int size, std::vector<T> A, std::vector<T> B, std::vector<T> C,
            int verbosity) {
   int errors = 0;
   for (uint32_t i = 0; i < size; i++) {
-    T ref = A[i] + B[i];
-    if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) {
+    const float a = test_utils::bfloat16_to_float(A[i]);
+    const float b = test_utils::bfloat16_to_float(B[i]);
+    const float actual = test_utils::bfloat16_to_float(C[i]);
+    const auto ref_bf16 = test_utils::bfloat16_add(A[i], B[i]);
+    const float ref = test_utils::bfloat16_to_float(ref_bf16);
+    if (!test_utils::nearly_equal(ref, actual, 0.00390625)) {
       if (errors < 100) {
-        std::cout << "Error in output " << C[i] << " != " << ref << " from "
-                  << A[i] << " * " << B[i] << std::endl;
+        std::cout << "Error in output " << actual << " != " << ref << " from "
+                  << a << " + " << b << std::endl;
       } else if (errors == 100) {
         std::cout << "..." << std::endl;
         std::cout << "[Errors truncated]" << std::endl;
@@ -50,7 +58,7 @@ int verify(int size, std::vector<T> A, std::vector<T> B, std::vector<T> C,
       errors++;
     } else {
       if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+        std::cout << "Correct output " << actual << " == " << ref << std::endl;
     }
   }
   return errors;
@@ -166,16 +174,18 @@ int main(int argc, const char *argv[]) {
   INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
   std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
   for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
-                                            (std::bfloat16_t)-0.5);
+    AVec[i] =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f),
+                                      test_utils::bfloat16_from_float(-0.5f));
   memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
 
   // Initialize Inout buffer 1
   INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
   std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
   for (int i = 0; i < INOUT1_VOLUME; i++)
-    BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
-                                            (std::bfloat16_t)-0.5);
+    BVec[i] =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f),
+                                      test_utils::bfloat16_from_float(-0.5f));
   memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
 
   // Initialize Inout buffer 2
diff --git a/programming_examples/ml/eltwise_mul/test.cpp b/programming_examples/ml/eltwise_mul/test.cpp
index 36aea4639cc..e6f1050a676 100644
--- a/programming_examples/ml/eltwise_mul/test.cpp
+++ b/programming_examples/ml/eltwise_mul/test.cpp
@@ -8,8 +8,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <bits/stdc++.h>
+#include <algorithm>
+#include <chrono>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -25,9 +29,9 @@
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-using INOUT0_DATATYPE = std::bfloat16_t;
-using INOUT1_DATATYPE = std::bfloat16_t;
-using INOUT2_DATATYPE = std::bfloat16_t;
+using INOUT0_DATATYPE = test_utils::bfloat16_t;
+using INOUT1_DATATYPE = test_utils::bfloat16_t;
+using INOUT2_DATATYPE = test_utils::bfloat16_t;
 #endif
 
 // ----------------------------------------------------------------------------
@@ -38,11 +42,15 @@ int verify(int size, std::vector<T> A, std::vector<T> B, std::vector<T> C,
            int verbosity) {
   int errors = 0;
   for (uint32_t i = 0; i < size; i++) {
-    T ref = A[i] * B[i];
-    if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) {
+    const float a = test_utils::bfloat16_to_float(A[i]);
+    const float b = test_utils::bfloat16_to_float(B[i]);
+    const float actual = test_utils::bfloat16_to_float(C[i]);
+    const auto ref_bf16 = test_utils::bfloat16_mul(A[i], B[i]);
+    const float ref = test_utils::bfloat16_to_float(ref_bf16);
+    if (!test_utils::nearly_equal(ref, actual, 0.00390625)) {
       if (errors < 100) {
-        std::cout << "Error in output " << C[i] << " != " << ref << " from "
-                  << A[i] << " * " << B[i] << std::endl;
+        std::cout << "Error in output " << actual << " != " << ref << " from "
+                  << a << " * " << b << std::endl;
       } else if (errors == 100) {
         std::cout << "..." << std::endl;
         std::cout << "[Errors truncated]" << std::endl;
@@ -50,7 +58,7 @@ int verify(int size, std::vector<T> A, std::vector<T> B, std::vector<T> C,
       errors++;
     } else {
       if (verbosity > 1)
-        std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+        std::cout << "Correct output " << actual << " == " << ref << std::endl;
     }
   }
   return errors;
@@ -166,16 +174,18 @@ int main(int argc, const char *argv[]) {
   INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
   std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
   for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
-                                            (std::bfloat16_t)-0.5);
+    AVec[i] =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f),
+                                      test_utils::bfloat16_from_float(-0.5f));
   memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
 
   // Initialize Inout buffer 1
   INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
   std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
   for (int i = 0; i < INOUT1_VOLUME; i++)
-    BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0,
-                                            (std::bfloat16_t)-0.5);
+    BVec[i] =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f),
+                                      test_utils::bfloat16_from_float(-0.5f));
   memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
 
   // Initialize Inout buffer 2
diff --git a/programming_examples/ml/gelu/test.cpp b/programming_examples/ml/gelu/test.cpp
index f3c3692fb74..1bc14a8733f 100644
--- a/programming_examples/ml/gelu/test.cpp
+++ b/programming_examples/ml/gelu/test.cpp
@@ -9,8 +9,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "cxxopts.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -23,20 +27,28 @@
 
 #include "test_utils.h"
 
-// gelu reference implementation
-std::bfloat16_t gelu_bf16(std::bfloat16_t &input) {
+// Reference GELU (tanh approximation) built from test_utils bfloat16 helpers.
+test_utils::bfloat16_t gelu_bf16(test_utils::bfloat16_t input) {
   // Approximate GELU: 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
-  constexpr auto sqrt_2_over_pi = std::bfloat16_t(0.79788456f);
-  constexpr auto beta = std::bfloat16_t(0.044715f);
-
-  std::bfloat16_t x = input;
-  std::bfloat16_t x3 = x * x * x;
-  std::bfloat16_t inner = sqrt_2_over_pi * (x + beta * x3);
-  std::bfloat16_t tanh_val = std::tanh(inner);
-  std::bfloat16_t gelu =
-      std::bfloat16_t(0.5f) * x * (std::bfloat16_t(1.0f) + tanh_val);
-
-  return std::bfloat16_t(gelu);
+  namespace tu = test_utils;
+  using bf16 = tu::bfloat16_t;
+
+  // Constants from the formula above, converted to bfloat16 up front.
+  const bf16 half = tu::bfloat16_from_float(0.5f);
+  const bf16 one = tu::bfloat16_from_float(1.0f);
+  const bf16 tanh_scale = tu::bfloat16_from_float(0.79788456f); // √(2/π)
+  const bf16 cubic_coeff = tu::bfloat16_from_float(0.044715f);
+
+  // tanh argument: (x + 0.044715 * x³) * √(2/π), one helper call per step.
+  const bf16 squared = tu::bfloat16_mul(input, input);
+  const bf16 cubed = tu::bfloat16_mul(input, squared);
+  const bf16 cubic_term = tu::bfloat16_mul(cubed, cubic_coeff);
+  const bf16 poly = tu::bfloat16_add(input, cubic_term);
+  const bf16 tanh_arg = tu::bfloat16_mul(poly, tanh_scale);
+
+  // Result: x * (0.5 * (tanh(arg) + 1)), in that association order.
+  const bf16 gate = tu::bfloat16_add(tu::bfloat16_tanh(tanh_arg), one);
+  return tu::bfloat16_mul(input, tu::bfloat16_mul(half, gate));
 }
 
 int main(int argc, const char *argv[]) {
@@ -53,7 +65,7 @@ int main(int argc, const char *argv[]) {
       "instr,i",
       "path of file containing userspace instructions to be sent to the LX6",
       cxxopts::value<std::string>())(
-      "length,l", "the length of the transfer in std::bfloat16_t",
+      "length,l", "the length of the transfer in bfloat16 elements",
       cxxopts::value<int>()->default_value("4096"));
 
   try {
@@ -133,19 +145,21 @@ int main(int argc, const char *argv[]) {
 
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects." << std::endl;
 
-  std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> srcVecA;
+  test_utils::bfloat16_t *bufInA = bo_inA.map<test_utils::bfloat16_t *>();
+  std::vector<test_utils::bfloat16_t> srcVecA;
   for (int i = 0; i < N; i++)
-    srcVecA.push_back(std::bfloat16_t(i * 0.05f + -2.0f)); // Example data
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t)));
+    srcVecA.push_back(
+        test_utils::bfloat16_from_float(i * 0.05f + -2.0f)); // Example data
+  memcpy(bufInA, srcVecA.data(),
+         (srcVecA.size() * sizeof(test_utils::bfloat16_t)));
 
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
@@ -173,24 +187,27 @@ int main(int argc, const char *argv[]) {
   std::cout << "Latency (us): " << npu_time << std::endl;
   std::cout << std::endl;
 
-  double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
+  double total_bytes =
+      2.0 * N * sizeof(test_utils::bfloat16_t); // input and output
   double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9;
   std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s"
             << std::endl;
 
-  std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufOut = bo_out.map<test_utils::bfloat16_t *>();
 
   int errors = 0;
 
   for (int i = 0; i < N; i++) {
-    std::bfloat16_t ref = gelu_bf16(srcVecA[i]);
-    if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.1)) {
+    const test_utils::bfloat16_t ref = gelu_bf16(srcVecA[i]);
+    const float expected = test_utils::bfloat16_to_float(ref);
+    const float actual = test_utils::bfloat16_to_float(*(bufOut + i));
+    if (!test_utils::nearly_equal(actual, expected, 0.1)) {
       errors++;
       // Print the first 100 mismatches
       if (errors <= 100) {
         std::cout << "Mismatch at index " << i << ": "
-                  << "Expected: " << ref << ", "
-                  << "Got: " << *(bufOut + i) << std::endl;
+                  << "Expected: " << expected << ", "
+                  << "Got: " << actual << std::endl;
       }
     }
   }
diff --git a/programming_examples/ml/layernorm/test.cpp b/programming_examples/ml/layernorm/test.cpp
index 8c698f634c0..c1bb1a6ac50 100644
--- a/programming_examples/ml/layernorm/test.cpp
+++ b/programming_examples/ml/layernorm/test.cpp
@@ -12,6 +12,7 @@
 #include "xrt_test_wrapper.h"
 #include <cmath>
 #include <cstdint>
+#include <cstring>
 #include <iostream>
 #include <vector>
 
@@ -20,22 +21,23 @@
 // ------------------------------------------------------
 // Configure this to match your buffer data type
 // ------------------------------------------------------
-using DATATYPE_IN1 = std::bfloat16_t;
-using DATATYPE_OUT = std::bfloat16_t;
+using DATATYPE_IN1 = test_utils::bfloat16_t;
+using DATATYPE_OUT = test_utils::bfloat16_t;
 #endif
 
 // Initialize Input buffer 1
 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) {
   for (int i = 0; i < in_volume; i++) {
-    DATATYPE_IN1 val = static_cast<DATATYPE_IN1>(test_utils::random_bfloat16_t(
-        (std::bfloat16_t)8.0, (std::bfloat16_t)-4.0));
+    DATATYPE_IN1 val =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f),
+                                      test_utils::bfloat16_from_float(-4.0f));
     bufIn1[i] = val;
   }
 }
 
 // Initialize Output buffer
 void initialize_bufOut(DATATYPE_OUT *bufOut, int out_volume) {
-  memset(bufOut, 0, out_volume);
+  memset(bufOut, 0, out_volume * sizeof(DATATYPE_OUT));
 }
 
 int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
@@ -52,7 +54,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
     // Accumulate sum and sum of squares for each row
     for (int c = 0; c < COLS; c++) {
       int idx = r * COLS + c;
-      float val = static_cast<float>(bufIn1[idx]);
+      float val = test_utils::bfloat16_to_float(bufIn1[idx]);
       sum += val;
       sum_sq += val * val;
     }
@@ -63,7 +65,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
     // Compute expected output for the current row
     for (int c = 0; c < COLS; c++) {
       int idx = r * COLS + c;
-      float val = static_cast<float>(bufIn1[idx]);
+      float val = test_utils::bfloat16_to_float(bufIn1[idx]);
       float norm = (val - mean) * inv_std;
       float scaled = norm * gamma;
       float out_val = scaled + beta;
@@ -73,7 +75,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
   // Now compare the expected results with the computed results in bufOut
   for (int i = 0; i < (ROWS * COLS); i++) {
     float expected_val = expected[i];
-    float hw_val = static_cast<float>(bufOut[i]);
+    float hw_val = test_utils::bfloat16_to_float(bufOut[i]);
     float diff = std::abs(expected_val - hw_val);
     if (diff > 0.1) {
       std::cout << "Mismatch at index " << i << ": expected " << expected_val
diff --git a/programming_examples/ml/relu/test.cpp b/programming_examples/ml/relu/test.cpp
index 2bb1fd62e94..27d82509d91 100644
--- a/programming_examples/ml/relu/test.cpp
+++ b/programming_examples/ml/relu/test.cpp
@@ -9,8 +9,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "cxxopts.hpp"
+#include <algorithm>
+#include <chrono>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -27,10 +30,11 @@
 
 #include "test_utils.h"
 
-// relu reference implementation
-std::bfloat16_t relu_bf16(std::bfloat16_t &input) {
-  // Return the relu output
-  return (input > std::bfloat16_t(0.0f)) ? input : std::bfloat16_t(0.0f);
+// Reference ReLU: inputs above 0.0f pass through, the rest become bf16 0.0f.
+test_utils::bfloat16_t relu_bf16(test_utils::bfloat16_t input) {
+  if (test_utils::bfloat16_to_float(input) > 0.0f)
+    return input;
+  return test_utils::bfloat16_from_float(0.0f);
 }
 
 int main(int argc, const char *argv[]) {
@@ -47,7 +51,7 @@ int main(int argc, const char *argv[]) {
       "instr,i",
       "path of file containing userspace instructions to be sent to the LX6",
       cxxopts::value<std::string>())(
-      "length,l", "the length of the transfer in std::bfloat16_t",
+      "length,l", "the length of the transfer in bfloat16 elements",
       cxxopts::value<int>()->default_value("4096"));
 
   try {
@@ -123,17 +127,19 @@ int main(int argc, const char *argv[]) {
     std::cout << "Getting handle to kernel:" << kernelName << std::endl;
   auto kernel = xrt::ext::kernel(context, mod, kernelName);
 
-  xrt::bo bo_inA = xrt::ext::bo{device, N * sizeof(std::bfloat16_t)};
-  xrt::bo bo_out = xrt::ext::bo{device, N * sizeof(std::bfloat16_t)};
+  xrt::bo bo_inA = xrt::ext::bo{device, N * sizeof(test_utils::bfloat16_t)};
+  xrt::bo bo_out = xrt::ext::bo{device, N * sizeof(test_utils::bfloat16_t)};
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects." << std::endl;
 
-  std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> srcVecA;
+  test_utils::bfloat16_t *bufInA = bo_inA.map<test_utils::bfloat16_t *>();
+  std::vector<test_utils::bfloat16_t> srcVecA;
   for (int i = 0; i < N; i++)
-    srcVecA.push_back(std::bfloat16_t(i * 0.05f + -3.0f)); // Example data
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t)));
+    srcVecA.push_back(
+        test_utils::bfloat16_from_float(i * 0.05f + -3.0f)); // Example data
+  memcpy(bufInA, srcVecA.data(),
+         (srcVecA.size() * sizeof(test_utils::bfloat16_t)));
 
   bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
@@ -157,24 +163,27 @@ int main(int argc, const char *argv[]) {
   std::cout << "Latency (us): " << npu_time << std::endl;
   std::cout << std::endl;
 
-  double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
+  double total_bytes =
+      2.0 * N * sizeof(test_utils::bfloat16_t); // input and output
   double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9;
   std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s"
             << std::endl;
 
-  std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufOut = bo_out.map<test_utils::bfloat16_t *>();
 
   int errors = 0;
 
   for (int i = 0; i < N; i++) {
-    std::bfloat16_t ref = relu_bf16(srcVecA[i]);
-    if (!test_utils::nearly_equal(*(bufOut + i), ref)) {
+    test_utils::bfloat16_t ref = relu_bf16(srcVecA[i]);
+    const float expected = test_utils::bfloat16_to_float(ref);
+    const float actual = test_utils::bfloat16_to_float(*(bufOut + i));
+    if (!test_utils::nearly_equal(actual, expected)) {
       errors++;
       // Print the first 100 mismatches
       if (errors <= 100) {
         std::cout << "Mismatch at index " << i << ": "
-                  << "Expected: " << ref << ", "
-                  << "Got: " << *(bufOut + i) << std::endl;
+                  << "Expected: " << expected << ", "
+                  << "Got: " << actual << std::endl;
       }
     }
   }
diff --git a/programming_examples/ml/rmsnorm/test.cpp b/programming_examples/ml/rmsnorm/test.cpp
index 16124c7dbe4..1b8795dab67 100644
--- a/programming_examples/ml/rmsnorm/test.cpp
+++ b/programming_examples/ml/rmsnorm/test.cpp
@@ -11,6 +11,7 @@
 #include "xrt_test_wrapper.h"
 #include <cmath>
 #include <cstdint>
+#include <cstring>
 #include <iostream>
 #include <vector>
 
@@ -19,22 +20,23 @@
 // ------------------------------------------------------
 // Configure this to match your buffer data type
 // ------------------------------------------------------
-using DATATYPE_IN1 = std::bfloat16_t;
-using DATATYPE_OUT = std::bfloat16_t;
+using DATATYPE_IN1 = test_utils::bfloat16_t;
+using DATATYPE_OUT = test_utils::bfloat16_t;
 #endif
 
 // Initialize Input buffer 1
 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) {
   for (int i = 0; i < in_volume; i++) {
-    DATATYPE_IN1 val = test_utils::random_bfloat16_t((std::bfloat16_t)8.0,
-                                                     (std::bfloat16_t)-4.0);
+    DATATYPE_IN1 val =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f),
+                                      test_utils::bfloat16_from_float(-4.0f));
     bufIn1[i] = val;
   }
 }
 
 // Initialize Output buffer
 void initialize_bufOut(DATATYPE_OUT *bufOut, int out_volume) {
-  memset(bufOut, 0, out_volume);
+  memset(bufOut, 0, out_volume * sizeof(DATATYPE_OUT));
 }
 
 int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
@@ -49,7 +51,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
     float sum_sq = 0.0f;
     for (int c = 0; c < COLS; c++) {
       int idx = r * COLS + c;
-      float val = static_cast<float>(bufIn1[idx]);
+      float val = test_utils::bfloat16_to_float(bufIn1[idx]);
       sum_sq += val * val;
     }
 
@@ -57,7 +59,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
 
     for (int c = 0; c < COLS; c++) {
       int idx = r * COLS + c;
-      float val = static_cast<float>(bufIn1[idx]);
+      float val = test_utils::bfloat16_to_float(bufIn1[idx]);
       float norm = (val * gamma) / rms;
       expected[idx] = norm;
     }
@@ -65,7 +67,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut,
 
   for (int i = 0; i < (ROWS * COLS); i++) {
     float expected_val = expected[i];
-    float hw_val = static_cast<float>(bufOut[i]);
+    float hw_val = test_utils::bfloat16_to_float(bufOut[i]);
     if (std::abs(expected_val - hw_val) > 0.05f) {
       std::cout << "Mismatch at index " << i << ": expected " << expected_val
                 << ", got " << hw_val << std::endl;
diff --git a/programming_examples/ml/rope/test.cpp b/programming_examples/ml/rope/test.cpp
index 5ca6850b6ca..2cbe3a8301c 100644
--- a/programming_examples/ml/rope/test.cpp
+++ b/programming_examples/ml/rope/test.cpp
@@ -18,16 +18,17 @@
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
 
-using DATATYPE_IN1 = std::bfloat16_t;
-using DATATYPE_IN2 = std::bfloat16_t; // For LUT (cos,sin) pairs
-using DATATYPE_OUT = std::bfloat16_t;
+using DATATYPE_IN1 = test_utils::bfloat16_t;
+using DATATYPE_IN2 = test_utils::bfloat16_t; // For LUT (cos,sin) pairs
+using DATATYPE_OUT = test_utils::bfloat16_t;
 #endif
 
 // Initialize Input buffer 1
 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) {
   for (int i = 0; i < in_volume; i++) {
-    DATATYPE_IN1 val = test_utils::random_bfloat16_t((std::bfloat16_t)8.0,
-                                                     (std::bfloat16_t)-4.0);
+    DATATYPE_IN1 val =
+        test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f),
+                                      test_utils::bfloat16_from_float(-4.0f));
     bufIn1[i] = val;
   }
 }
@@ -47,8 +48,8 @@ void initialize_bufIn2(DATATYPE_IN2 *bufIn2, int lut_volume) {
       float cos_val = std::cos(angle);
       float sin_val = std::sin(angle);
       int base_idx = r * COLS + 2 * i;
-      bufIn2[base_idx] = static_cast<DATATYPE_IN2>(cos_val);
-      bufIn2[base_idx + 1] = static_cast<DATATYPE_IN2>(sin_val);
+      bufIn2[base_idx] = test_utils::bfloat16_from_float(cos_val);
+      bufIn2[base_idx + 1] = test_utils::bfloat16_from_float(sin_val);
     }
   }
 }
@@ -71,10 +72,10 @@ int verify_rope_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_IN2 *bufIn2,
       int input_base_idx = r * COLS + 2 * i;
       int lut_base_idx = r * COLS + 2 * i;
 
-      float x_even = static_cast<float>(bufIn1[input_base_idx]);
-      float x_odd = static_cast<float>(bufIn1[input_base_idx + 1]);
-      float cos_val = static_cast<float>(bufIn2[lut_base_idx]);
-      float sin_val = static_cast<float>(bufIn2[lut_base_idx + 1]);
+      float x_even = test_utils::bfloat16_to_float(bufIn1[input_base_idx]);
+      float x_odd = test_utils::bfloat16_to_float(bufIn1[input_base_idx + 1]);
+      float cos_val = test_utils::bfloat16_to_float(bufIn2[lut_base_idx]);
+      float sin_val = test_utils::bfloat16_to_float(bufIn2[lut_base_idx + 1]);
 
       expected[input_base_idx] = x_even * cos_val - x_odd * sin_val;
       expected[input_base_idx + 1] = x_even * sin_val + x_odd * cos_val;
@@ -83,7 +84,7 @@ int verify_rope_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_IN2 *bufIn2,
 
   for (int i = 0; i < (ROWS * COLS); i++) {
     float expected_val = expected[i];
-    float hw_val = static_cast<float>(bufOut[i]);
+    float hw_val = test_utils::bfloat16_to_float(bufOut[i]);
     if (std::abs(expected_val - hw_val) > 0.05f) {
       std::cout << "Mismatch at index " << i << ": expected " << expected_val
                 << ", got " << hw_val << std::endl;
diff --git a/programming_examples/ml/scale_shift/test.cpp b/programming_examples/ml/scale_shift/test.cpp
index 15ee4c0a1cb..4e101ceabb1 100644
--- a/programming_examples/ml/scale_shift/test.cpp
+++ b/programming_examples/ml/scale_shift/test.cpp
@@ -8,8 +8,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <bits/stdc++.h>
+#include <algorithm>
+#include <chrono>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -25,10 +29,10 @@
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-using INOUT0_DATATYPE = std::bfloat16_t;
-using INOUT1_DATATYPE = std::bfloat16_t;
-using INOUT2_DATATYPE = std::bfloat16_t;
-using INOUT3_DATATYPE = std::bfloat16_t;
+using INOUT0_DATATYPE = test_utils::bfloat16_t;
+using INOUT1_DATATYPE = test_utils::bfloat16_t;
+using INOUT2_DATATYPE = test_utils::bfloat16_t;
+using INOUT3_DATATYPE = test_utils::bfloat16_t;
 #endif
 
 // ----------------------------------------------------------------------------
@@ -39,16 +43,22 @@ int verify(int size, std::vector<T> A, std::vector<T> B, std::vector<T> C,
            std::vector<T> D, int verbosity) {
   int errors = 0;
   for (uint32_t i = 0; i < size; i++) {
-    T ref = A[i] * B[i] + C[i];
-    if (!test_utils::nearly_equal(ref, D[i], 0.002)) {
+    const float a = test_utils::bfloat16_to_float(A[i]);
+    const float b = test_utils::bfloat16_to_float(B[i]);
+    const float c = test_utils::bfloat16_to_float(C[i]);
+    const float actual = test_utils::bfloat16_to_float(D[i]);
+    const auto product_bf16 = test_utils::bfloat16_mul(A[i], B[i]);
+    const auto ref_bf16 = test_utils::bfloat16_add(product_bf16, C[i]);
+    const float ref = test_utils::bfloat16_to_float(ref_bf16);
+    if (!test_utils::nearly_equal(ref, actual, 0.002)) {
       if (verbosity >= 1) {
-        std::cout << "Error in output " << D[i] << " != " << ref << " from "
-                  << A[i] << " * " << B[i] << " + " << C[i] << std::endl;
+        std::cout << "Error in output " << actual << " != " << ref << " from "
+                  << a << " * " << b << " + " << c << std::endl;
       }
       errors++;
     } else {
       if (verbosity >= 1)
-        std::cout << "Correct output " << D[i] << " == " << ref << std::endl;
+        std::cout << "Correct output " << actual << " == " << ref << std::endl;
     }
   }
   return errors;
@@ -167,21 +177,21 @@ int main(int argc, const char *argv[]) {
   INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
   std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
   for (int i = 0; i < INOUT0_VOLUME; i++)
-    AVec[i] = INOUT0_DATATYPE(4.0f);
+    AVec[i] = test_utils::bfloat16_from_float(4.0f);
   memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
 
   // Initialize Inout buffer 1
   INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT0_DATATYPE *>();
   std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
   for (int i = 0; i < INOUT1_VOLUME; i++)
-    BVec[i] = INOUT1_DATATYPE(3.35f);
+    BVec[i] = test_utils::bfloat16_from_float(3.35f);
   memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
 
   // Initialize Inout buffer 2
   INOUT1_DATATYPE *bufInOut2 = bo_inout2.map<INOUT2_DATATYPE *>();
   std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
   for (int i = 0; i < INOUT2_VOLUME; i++)
-    CVec[i] = INOUT2_DATATYPE(0.77f);
+    CVec[i] = test_utils::bfloat16_from_float(0.77f);
   memcpy(bufInOut2, CVec.data(), (CVec.size() * sizeof(INOUT2_DATATYPE)));
 
   // Initialize Inout buffer 3
diff --git a/programming_examples/ml/silu/test.cpp b/programming_examples/ml/silu/test.cpp
index 84b9fa07680..44635c384f5 100644
--- a/programming_examples/ml/silu/test.cpp
+++ b/programming_examples/ml/silu/test.cpp
@@ -9,8 +9,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "cxxopts.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -23,16 +27,18 @@
 
 #include "test_utils.h"
 
-// Silu reference implementation
-std::bfloat16_t silu_bf16(std::bfloat16_t &input) {
-  // Compute tanh approximation
-  std::bfloat16_t half_x = input * std::bfloat16_t(0.5f);
-  std::bfloat16_t tanh_half_x = std::tanh(half_x);
-  std::bfloat16_t sigmoid_approx =
-      std::bfloat16_t(0.5f) * (tanh_half_x + std::bfloat16_t(1.0f));
-
-  // Compute output: x * tanh_approx
-  return input * sigmoid_approx;
+// SiLU reference: x * sigmoid(x), with sigmoid(x) = 0.5 * (tanh(x / 2) + 1).
+test_utils::bfloat16_t silu_bf16(test_utils::bfloat16_t input) {
+  const test_utils::bfloat16_t k0_5 = test_utils::bfloat16_from_float(0.5f);
+  const test_utils::bfloat16_t k1 = test_utils::bfloat16_from_float(1.0f);
+
+  const test_utils::bfloat16_t half_x = test_utils::bfloat16_mul(input, k0_5);
+  const test_utils::bfloat16_t tanh_half_x = test_utils::bfloat16_tanh(half_x);
+  const test_utils::bfloat16_t tanh_half_x_approx =
+      test_utils::bfloat16_add(tanh_half_x, k1);
+  const test_utils::bfloat16_t sigmoid_approx =
+      test_utils::bfloat16_mul(tanh_half_x_approx, k0_5);
+  return test_utils::bfloat16_mul(input, sigmoid_approx);
 }
 
 int main(int argc, const char *argv[]) {
@@ -49,7 +55,7 @@ int main(int argc, const char *argv[]) {
       "instr,i",
       "path of file containing userspace instructions to be sent to the LX6",
       cxxopts::value<std::string>())(
-      "length,l", "the length of the transfer in std::bfloat16_t",
+      "length,l", "the length of the transfer in bfloat16 elements",
       cxxopts::value<int>()->default_value("4096"));
 
   try {
@@ -129,19 +135,21 @@ int main(int argc, const char *argv[]) {
 
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects." << std::endl;
 
-  std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> srcVecA;
+  test_utils::bfloat16_t *bufInA = bo_inA.map<test_utils::bfloat16_t *>();
+  std::vector<test_utils::bfloat16_t> srcVecA;
   for (int i = 0; i < N; i++)
-    srcVecA.push_back(std::bfloat16_t(i * 0.05f + -3.0f)); // Example data
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t)));
+    srcVecA.push_back(
+        test_utils::bfloat16_from_float(i * 0.05f + -3.0f)); // Example data
+  memcpy(bufInA, srcVecA.data(),
+         (srcVecA.size() * sizeof(test_utils::bfloat16_t)));
 
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
@@ -169,24 +177,27 @@ int main(int argc, const char *argv[]) {
   std::cout << "Latency (us): " << npu_time << std::endl;
   std::cout << std::endl;
 
-  double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
+  double total_bytes =
+      2.0 * N * sizeof(test_utils::bfloat16_t); // input and output
   double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9;
   std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s"
             << std::endl;
 
-  std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufOut = bo_out.map<test_utils::bfloat16_t *>();
 
   int errors = 0;
 
   for (int i = 0; i < N; i++) {
-    std::bfloat16_t ref = silu_bf16(srcVecA[i]);
-    if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.04)) {
+    const test_utils::bfloat16_t ref = silu_bf16(srcVecA[i]);
+    const float expected = test_utils::bfloat16_to_float(ref);
+    const float actual = test_utils::bfloat16_to_float(*(bufOut + i));
+    if (!test_utils::nearly_equal(actual, expected, 0.04)) {
       errors++;
       // Print the first 100 mismatches
       if (errors <= 100) {
         std::cout << "Mismatch at index " << i << ": "
-                  << "Expected: " << ref << ", "
-                  << "Got: " << *(bufOut + i) << std::endl;
+                  << "Expected: " << expected << ", "
+                  << "Got: " << actual << std::endl;
       }
     }
   }
diff --git a/programming_examples/ml/softmax/test.cpp b/programming_examples/ml/softmax/test.cpp
index 634d44110f2..c05201270bf 100644
--- a/programming_examples/ml/softmax/test.cpp
+++ b/programming_examples/ml/softmax/test.cpp
@@ -8,12 +8,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <bits/stdc++.h>
+#include <chrono>
+#include <cmath>
 #include <cstdint>
+#include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <sstream>
-#include <stdfloat>
+#include <string>
 #include <vector>
 
 #include "xrt/xrt_bo.h"
@@ -25,52 +28,55 @@
 
 #ifndef DATATYPES_USING_DEFINED
 #define DATATYPES_USING_DEFINED
-using INOUT0_DATATYPE = std::bfloat16_t;
-using INOUT1_DATATYPE = std::bfloat16_t;
+using INOUT0_DATATYPE = test_utils::bfloat16_t;
+using INOUT1_DATATYPE = test_utils::bfloat16_t;
 #endif
 
 // ----------------------------------------------------------------------------
 // Verify results (specific to our design example)
 // ----------------------------------------------------------------------------
-template <typename T>
-int verify(int size, int tile_size, std::vector<T> A, std::vector<T> B,
-           int verbosity) {
+static int verify(int size, int tile_size,
+                  const std::vector<INOUT0_DATATYPE> &A,
+                  const std::vector<INOUT1_DATATYPE> &B, int verbosity) {
 
   int errors = 0;
-  T max_val = A[0];
-  std::vector<T> RefVec(size);
+  float max_val = test_utils::bfloat16_to_float(A[0]);
+  std::vector<INOUT1_DATATYPE> RefVec(size);
 
   for (uint32_t i = 1; i < A.size(); i++) {
-    A[i] = (T)(A[i]);
-    T val = A[i];
+    const float val = test_utils::bfloat16_to_float(A[i]);
     if (val > max_val) {
       max_val = val;
     }
   }
 
   for (uint32_t t = 0; t < size; t += tile_size) {
-    float running = 0.0;
+    float running = 0.0f;
     for (uint32_t i = 0; i < tile_size; i++) {
-      float ez = (float)(exp(A[t + i] - max_val));
-      running += ez;
-      RefVec[t + i] = (T)exp(A[t + i] - max_val);
+      const float input_value = test_utils::bfloat16_to_float(A[t + i]);
+      const float exp_value = std::exp(input_value - max_val);
+      running += exp_value;
+      RefVec[t + i] = test_utils::bfloat16_from_float(exp_value);
     }
 
+    const INOUT1_DATATYPE running_bf16 =
+        test_utils::bfloat16_from_float(running);
     for (uint32_t i = 0; i < tile_size; i++) {
-      RefVec[t + i] /= (T)running;
+      RefVec[t + i] = test_utils::bfloat16_div(RefVec[t + i], running_bf16);
     }
   }
 
   for (uint32_t i = 0; i < size; i++) {
-
-    if (!test_utils::nearly_equal(RefVec[i], B[i], 0.04, 0.001)) {
+    const float expected = test_utils::bfloat16_to_float(RefVec[i]);
+    const float actual = test_utils::bfloat16_to_float(B[i]);
+    if (!test_utils::nearly_equal(actual, expected, 0.04, 0.001)) {
       if (verbosity >= 1) {
-        std::cout << "Error in output " << B[i] << " != " << RefVec[i]
+        std::cout << "Error in output " << actual << " != " << expected
                   << std::endl;
       }
       errors++;
     } else if (verbosity >= 1) {
-      std::cout << "Correct output " << B[i] << " == " << RefVec[i]
+      std::cout << "Correct output " << actual << " == " << expected
                 << std::endl;
     }
   }
@@ -153,12 +159,14 @@ int main(int argc, const char *argv[]) {
   for (int i = 0; i < INOUT0_VOLUME; i++) {
     if (dev == 1) {
-      // NPU1: Use bfloat16 values in range [4.0, 4.0]
+      // NPU1: Use bfloat16 values in range [-4.0, 4.0]
-      AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)8.0,
-                                              (std::bfloat16_t)-4.0);
+      AVec[i] =
+          test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f),
+                                        test_utils::bfloat16_from_float(-4.0f));
     } else if (dev == 2) {
       // NPU2: Use bfloat16 values in range [-512.0, 512.0]
-      AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1024.0,
-                                              (std::bfloat16_t)-512.0);
+      AVec[i] = test_utils::random_bfloat16_t(
+          test_utils::bfloat16_from_float(1024.0f),
+          test_utils::bfloat16_from_float(-512.0f));
     }
   }
   memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
diff --git a/programming_examples/ml/swiglu/test.cpp b/programming_examples/ml/swiglu/test.cpp
index 3a58c66841f..c7aeaebdcda 100644
--- a/programming_examples/ml/swiglu/test.cpp
+++ b/programming_examples/ml/swiglu/test.cpp
@@ -9,8 +9,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "cxxopts.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <iostream>
 #include <sstream>
@@ -23,29 +27,27 @@
 
 #include "test_utils.h"
 
-// Silu reference implementation
-std::bfloat16_t silu_bf16(std::bfloat16_t &input) {
-  // Compute tanh approximation
-  std::bfloat16_t half_x = input * std::bfloat16_t(0.5f);
-  std::bfloat16_t tanh_half_x = std::tanh(half_x);
-  std::bfloat16_t sigmoid_approx =
-      std::bfloat16_t(0.5f) * (tanh_half_x + std::bfloat16_t(1.0f));
-
-  // Compute output: x * tanh_approx
-  return input * sigmoid_approx;
-}
-
-// SwiGLU reference implementation
-std::bfloat16_t swiglu_bf16(std::bfloat16_t &input, std::bfloat16_t &w1,
-                            std::bfloat16_t &w2) {
-  // Compute the first part: x * w1
-  std::bfloat16_t x_w1 = input * w1;
-  // Compute the second part: x * w2
-  std::bfloat16_t x_w2 = input * w2;
-  // Apply the silu activation function to the second part
-  std::bfloat16_t silu_output = silu_bf16(x_w2);
-  // Compute the final output: x * w1 * silu_output
-  return x_w1 * silu_output;
+// SwiGLU reference: (input * w1) * silu(input * w2), computed in bfloat16.
+test_utils::bfloat16_t swiglu_bf16(test_utils::bfloat16_t input,
+                                   test_utils::bfloat16_t w1,
+                                   test_utils::bfloat16_t w2) {
+  const test_utils::bfloat16_t k0_5 = test_utils::bfloat16_from_float(0.5f);
+  const test_utils::bfloat16_t k1 = test_utils::bfloat16_from_float(1.0f);
+
+  const test_utils::bfloat16_t mul_input_weight_1 =
+      test_utils::bfloat16_mul(input, w1);
+  const test_utils::bfloat16_t mul_input_weight_2 =
+      test_utils::bfloat16_mul(input, w2);
+  const test_utils::bfloat16_t half_x =
+      test_utils::bfloat16_mul(mul_input_weight_2, k0_5);
+  const test_utils::bfloat16_t tanh_half_x = test_utils::bfloat16_tanh(half_x);
+  const test_utils::bfloat16_t tanh_half_x_approx =
+      test_utils::bfloat16_add(tanh_half_x, k1);
+  const test_utils::bfloat16_t sigmoid_approx =
+      test_utils::bfloat16_mul(tanh_half_x_approx, k0_5);
+  const test_utils::bfloat16_t silu_output =
+      test_utils::bfloat16_mul(mul_input_weight_2, sigmoid_approx);
+  return test_utils::bfloat16_mul(mul_input_weight_1, silu_output);
 }
 
 int main(int argc, const char *argv[]) {
@@ -62,7 +64,7 @@ int main(int argc, const char *argv[]) {
       "instr,i",
       "path of file containing userspace instructions to be sent to the LX6",
       cxxopts::value<std::string>())(
-      "length,l", "the length of the transfer in std::bfloat16_t",
+      "length,l", "the length of the transfer in bfloat16 elements",
       cxxopts::value<int>()->default_value("4096"));
 
   try {
@@ -142,31 +144,33 @@ int main(int argc, const char *argv[]) {
 
   auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
-  auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-  auto bo_weights = xrt::bo(device, 2 * N * sizeof(std::bfloat16_t),
+  auto bo_weights = xrt::bo(device, 2 * N * sizeof(test_utils::bfloat16_t),
                             XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-  auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t),
+  auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
 
   if (verbosity >= 1)
     std::cout << "Writing data into buffer objects." << std::endl;
 
-  std::bfloat16_t *bufInA = bo_inA.map<std::bfloat16_t *>();
-  std::vector<std::bfloat16_t> srcVecA;
+  test_utils::bfloat16_t *bufInA = bo_inA.map<test_utils::bfloat16_t *>();
+  std::vector<test_utils::bfloat16_t> srcVecA;
   for (int i = 0; i < N; i++)
-    srcVecA.push_back(std::bfloat16_t(i * 0.05f + -1.0f)); // Example data
-  memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t)));
+    srcVecA.push_back(
+        test_utils::bfloat16_from_float(i * 0.05f + -1.0f)); // Example data
+  memcpy(bufInA, srcVecA.data(),
+         (srcVecA.size() * sizeof(test_utils::bfloat16_t)));
 
   // Generate the W1 and W2 weights
-  std::vector<std::bfloat16_t> srcVecW1;
-  std::vector<std::bfloat16_t> srcVecW2;
+  std::vector<test_utils::bfloat16_t> srcVecW1;
+  std::vector<test_utils::bfloat16_t> srcVecW2;
   for (int i = 0; i < N; i++) {
     // Example weights, can be replaced with actual model weights
-    srcVecW1.push_back(std::bfloat16_t(0.1f * (i % 10) + 0.1f));
-    srcVecW2.push_back(std::bfloat16_t(0.2f * (i % 20) + 0.2f));
+    srcVecW1.push_back(test_utils::bfloat16_from_float(0.1f * (i % 10) + 0.1f));
+    srcVecW2.push_back(test_utils::bfloat16_from_float(0.2f * (i % 20) + 0.2f));
   }
-  std::vector<std::bfloat16_t> srcVecWeights;
+  std::vector<test_utils::bfloat16_t> srcVecWeights;
   // Interleave the weights into one vector in 1024 elements chunks
   // of each W1 and W2
   for (int i = 0; i < N; i += 1024) {
@@ -179,9 +183,9 @@ int main(int argc, const char *argv[]) {
   }
 
   // Write the weights to the buffer object
-  auto bufWeights = bo_weights.map<std::bfloat16_t *>();
+  auto bufWeights = bo_weights.map<test_utils::bfloat16_t *>();
   memcpy(bufWeights, srcVecWeights.data(),
-         srcVecWeights.size() * sizeof(std::bfloat16_t));
+         srcVecWeights.size() * sizeof(test_utils::bfloat16_t));
 
   void *bufInstr = bo_instr.map<void *>();
   memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
@@ -212,24 +216,28 @@ int main(int argc, const char *argv[]) {
   std::cout << "Latency (us): " << npu_time << std::endl;
   std::cout << std::endl;
 
-  double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output
+  double total_bytes =
+      2.0 * N * sizeof(test_utils::bfloat16_t); // input and output
   double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9;
   std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s"
             << std::endl;
 
-  std::bfloat16_t *bufOut = bo_out.map<std::bfloat16_t *>();
+  test_utils::bfloat16_t *bufOut = bo_out.map<test_utils::bfloat16_t *>();
 
   int errors = 0;
 
   for (int i = 0; i < N; i++) {
-    std::bfloat16_t ref = swiglu_bf16(srcVecA[i], srcVecW1[i], srcVecW2[i]);
-    if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.05f)) {
+    const test_utils::bfloat16_t ref =
+        swiglu_bf16(srcVecA[i], srcVecW1[i], srcVecW2[i]);
+    const float expected = test_utils::bfloat16_to_float(ref);
+    const float actual = test_utils::bfloat16_to_float(*(bufOut + i));
+    if (!test_utils::nearly_equal(actual, expected, 0.05f)) {
       errors++;
       // Print the first 100 mismatches
       if (errors <= 100) {
         std::cout << "Mismatch at index " << i << ": "
-                  << "Expected: " << ref << ", "
-                  << "Got: " << *(bufOut + i) << std::endl;
+                  << "Expected: " << expected << ", "
+                  << "Got: " << actual << std::endl;
       }
     }
   }
diff --git a/runtime_lib/test_lib/test_utils.h b/runtime_lib/test_lib/test_utils.h
index cf4eca0fe21..ca0be709013 100644
--- a/runtime_lib/test_lib/test_utils.h
+++ b/runtime_lib/test_lib/test_utils.h
@@ -14,9 +14,11 @@
 #define _TEST_UTILS_H_
 
 #include "cxxopts.hpp"
+#include <algorithm>
 #include <cfloat>
 #include <cmath>
 #include <cstdint>
+#include <cstring>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
@@ -57,16 +59,82 @@ static inline std::int32_t random_int32_t(int32_t range = 0x10000) {
   return (std::int32_t)rand() % range;
 }
 
+// The Linux toolchain has std::bfloat16_t. MSVC does not.
+//
+// Use this host-side helper for bfloat16 XRT buffers and reference checks.
+// Device code should use the AIE bfloat16 types and APIs.
 #if defined(__STDCPP_BFLOAT16_T__)
-static inline std::bfloat16_t random_bfloat16_t(std::bfloat16_t scale,
-                                                std::bfloat16_t bias) {
-  return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) + bias);
+using bfloat16_t = std::bfloat16_t;
+
+static inline bfloat16_t bfloat16_from_float(float value) {
+  return bfloat16_t(value);
+}
+
+static inline bfloat16_t bfloat16_from_bits(std::uint16_t bits) {
+  bfloat16_t value;
+  std::memcpy(&value, &bits, sizeof(value));
+  return value;
+}
+
+static inline float bfloat16_to_float(bfloat16_t value) {
+  return static_cast<float>(value);
+}
+#else
+using bfloat16_t = std::uint16_t;
+
+static inline bfloat16_t bfloat16_from_bits(std::uint16_t bits) { return bits; }
+
+static inline float bfloat16_to_float(bfloat16_t bits) {
+  const std::uint32_t expanded_bits = static_cast<std::uint32_t>(bits) << 16;
+  float value = 0.0f;
+  std::memcpy(&value, &expanded_bits, sizeof(value));
+  return value;
+}
+
+// Float -> bfloat16 bits with round-to-nearest-even; NaN inputs stay NaN.
+static inline bfloat16_t bfloat16_from_float(float value) {
+  std::uint32_t bits = 0;
+  std::memcpy(&bits, &value, sizeof(bits));
+  // NaN: skip rounding (its carry could yield Inf or zero); set the quiet bit.
+  if ((bits & 0x7FFFFFFFU) > 0x7F800000U)
+    return static_cast<bfloat16_t>((bits >> 16) | 0x0040U);
+  return static_cast<bfloat16_t>((bits + 0x7FFFU + ((bits >> 16) & 1U)) >> 16);
 }
 #endif
 
+static inline bfloat16_t random_bfloat16_t(bfloat16_t scale, bfloat16_t bias) {
+  const float scale_value = bfloat16_to_float(scale);
+  const float bias_value = bfloat16_to_float(bias);
+  return bfloat16_from_float((scale_value * (float)rand() / (float)(RAND_MAX)) +
+                             bias_value);
+}
+
 bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON,
                   float abs_th = FLT_MIN);
 
+static inline bool nearly_equal_bfloat16(bfloat16_t a, bfloat16_t b,
+                                         float epsilon = 128 * FLT_EPSILON,
+                                         float abs_th = FLT_MIN) {
+  return nearly_equal(bfloat16_to_float(a), bfloat16_to_float(b), epsilon,
+                      abs_th);
+}
+
+static inline bfloat16_t bfloat16_add(bfloat16_t lhs, bfloat16_t rhs) {
+  return bfloat16_from_float(bfloat16_to_float(lhs) + bfloat16_to_float(rhs));
+}
+
+static inline bfloat16_t bfloat16_mul(bfloat16_t lhs, bfloat16_t rhs) {
+  return bfloat16_from_float(bfloat16_to_float(lhs) * bfloat16_to_float(rhs));
+}
+
+static inline bfloat16_t bfloat16_div(bfloat16_t lhs, bfloat16_t rhs) {
+  return bfloat16_from_float(bfloat16_to_float(lhs) / bfloat16_to_float(rhs));
+}
+
+static inline bfloat16_t bfloat16_tanh(bfloat16_t value) {
+  return bfloat16_from_float(std::tanh(bfloat16_to_float(value)));
+}
+
 template <typename T>
 void print_matrix(const std::vector<T> matrix, int n_cols,
                   int n_printable_rows = 10, int n_printable_cols = 10,