diff --git a/programming_examples/basic/matrix_multiplication/common.h b/programming_examples/basic/matrix_multiplication/common.h index e9f743a632f..275ff39e7a5 100644 --- a/programming_examples/basic/matrix_multiplication/common.h +++ b/programming_examples/basic/matrix_multiplication/common.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include "test_utils.h" @@ -92,6 +92,52 @@ void parse_options(int argc, const char *argv[], cxxopts::Options &options, template static inline T get_random(); +template +static inline auto scalar_to_arithmetic(T value) { + if constexpr (std::is_same_v) { + return test_utils::bfloat16_to_float(value); + } else { + return value; + } +} + +template +static inline float scalar_to_float(T value) { + return static_cast(scalar_to_arithmetic(value)); +} + +template +static inline T scalar_from_accum(Tacc value) { + auto arithmetic_value = scalar_to_arithmetic(value); + if constexpr (std::is_same_v) { + return test_utils::bfloat16_from_float( + static_cast(arithmetic_value)); + } else { + return static_cast(arithmetic_value); + } +} + +template +static inline Tacc zero_accum() { + if constexpr (std::is_same_v) { + return test_utils::bfloat16_from_float(0.0f); + } else { + return Tacc(0); + } +} + +template +static inline Tacc accum_add_product(Tacc running_sum, Tin lhs, Tin rhs) { + auto product = scalar_to_arithmetic(lhs) * scalar_to_arithmetic(rhs); + if constexpr (std::is_same_v) { + return test_utils::bfloat16_add( + running_sum, + test_utils::bfloat16_from_float(static_cast(product))); + } else { + return running_sum + Tacc(product); + } +} + template <> std::int16_t get_random() { return (std::int16_t)rand() % 0x10000; @@ -103,10 +149,11 @@ int8_t get_random() { } template <> -std::bfloat16_t get_random() { +test_utils::bfloat16_t get_random() { // Random numbers should NOT be uniformly between 0 and 1, because that // would make the matrix product AB always close to 1. - return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX)); + return test_utils::bfloat16_from_float(4.0f * (float)rand() / + (float)(RAND_MAX)); } template @@ -115,18 +162,20 @@ void matmul(int M, int N, int K, const std::vector A, int c_col_maj) { for (int row = 0; row < M; row++) { for (int col = 0; col < N; col++) { - Tacc running_sum = 0; + Tacc running_sum = zero_accum(); for (int k = 0; k < K; k++) { if (!b_col_maj) { - running_sum += Tacc(A[row * K + k] * B[k * N + col]); + running_sum = accum_add_product(running_sum, A[row * K + k], + B[k * N + col]); } else { - running_sum += Tacc(A[row * K + k] * B[k + col * K]); + running_sum = accum_add_product(running_sum, A[row * K + k], + B[k + col * K]); } } if (!c_col_maj) { - C[row * N + col] = Tout(running_sum); + C[row * N + col] = scalar_from_accum(running_sum); } else { - C[row + col * M] = Tout(running_sum); + C[row + col * M] = scalar_from_accum(running_sum); } } } @@ -135,15 +184,17 @@ void matmul(int M, int N, int K, const std::vector A, template Tout mul_acc(int M, int N, int K, int row, int col, const std::vector A, const std::vector B, int b_col_maj) { - Tacc running_sum = 0; + Tacc running_sum = zero_accum(); for (int k = 0; k < K; k++) { if (!b_col_maj) { - running_sum += Tacc(A[row * K + k] * B[k * N + col]); + running_sum = + accum_add_product(running_sum, A[row * K + k], B[k * N + col]); } else { - running_sum += Tacc(A[row * K + k] * B[k + col * K]); + running_sum = + accum_add_product(running_sum, A[row * K + k], B[k + col * K]); } } - return (Tout)running_sum; + return scalar_from_accum(running_sum); } // nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0 @@ -184,7 +235,7 @@ float get_abs_tol() { } template <> -float get_abs_tol() { +float get_abs_tol() { return 0.5; } @@ -209,7 +260,7 @@ float get_rel_tol() { } template <> -float get_rel_tol() { +float get_rel_tol() { return 0.05; } @@ -314,8 +365,9 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, float abs_tol, float rel_tol) { bool match = expected == actual; if (abs_tol > 0 || rel_tol > 0) { - // Allow for some tolerance for float data types - match = nearly_equal(expected, actual, rel_tol, abs_tol); + // Allow for some tolerance for float and host-side bfloat16 data types. + match = nearly_equal(scalar_to_float(expected), scalar_to_float(actual), + rel_tol, abs_tol); } if (!match) { return (struct error){row, col, expected, actual}; @@ -326,12 +378,13 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual, template void print_error_summary(std::ostream &os, int n_errors, std::vector> &errors, - Tout max_rel_error) { + float max_rel_error) { for (struct error &err : errors) { os << "[" << std::setw(5) << err.row << ", " << std::setw(5) << err.col << "] " << std::setw(4) << std::setprecision(2) << std::fixed - << (float)err.actual << " =!= " << std::setw(4) << std::setprecision(2) - << std::fixed << (float)err.expected << std::endl; + << scalar_to_float(err.actual) << " =!= " << std::setw(4) + << std::setprecision(2) << std::fixed << scalar_to_float(err.expected) + << std::endl; } if (n_errors > max_printable_errors) { os << "...and " << std::setw(0) << n_errors - max_printable_errors @@ -357,7 +410,7 @@ int verify(int M, int N, int K, std::vector A, std::vector B, float rel_tol = 0.05, int b_col_maj = 0, int c_col_maj = 0) { int n_errors = 0; std::vector> errors; - Tout max_rel_error = (Tout)0.0f; + float max_rel_error = 0.0f; struct error max_error; std::vector CRef(M * N); @@ -372,9 +425,11 @@ int verify(int M, int N, int K, std::vector A, std::vector B, if (n_errors < max_printable_errors) { errors.push_back(*error); } - Tout rel_error = - std::abs(error->actual - error->expected) / - std::max(std::abs(error->actual), std::abs(error->expected)); + float actual_value = scalar_to_float(error->actual); + float expected_value = scalar_to_float(error->expected); + float rel_error = + std::abs(actual_value - expected_value) / + std::max(std::abs(actual_value), std::abs(expected_value)); if (rel_error > max_rel_error) { max_rel_error = rel_error; max_error = *error; @@ -414,7 +469,7 @@ int verify_stochastic(int M, int N, int K, std::vector A, int n_errors = 0; std::vector> errors; - Tout max_rel_error = (Tout)0.0f; + float max_rel_error = 0.0f; double progress = 0; for (std::tuple> cell : std::views::enumerate(std::views::zip(sampled_rows, sampled_cols))) { @@ -440,9 +495,11 @@ int verify_stochastic(int M, int N, int K, std::vector A, if (n_errors < max_printable_errors) { errors.push_back(*error); } - Tout rel_error = - std::abs(error->actual - error->expected) / - std::max(std::abs(error->actual), std::abs(error->expected)); + float actual_value = scalar_to_float(error->actual); + float expected_value = scalar_to_float(error->expected); + float rel_error = + std::abs(actual_value - expected_value) / + std::max(std::abs(actual_value), std::abs(expected_value)); if (rel_error > max_rel_error) { max_rel_error = rel_error; } diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index afc028f200e..284a288de4f 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -42,10 +42,10 @@ devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu) colshift ?= $(if $(filter npu,$(devicename)),1,0) ifeq ($(dtype_in),bf16) - dtype_in_cpp=std::bfloat16_t + dtype_in_cpp=test_utils::bfloat16_t endif ifeq ($(dtype_out),bf16) - dtype_out_cpp=std::bfloat16_t + dtype_out_cpp=test_utils::bfloat16_t dtype_acc_cpp=float endif ifeq ($(dtype_in),i16) diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp index eb41adafa3b..31cc6e3a3e9 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/test.cpp @@ -8,12 +8,11 @@ // //===----------------------------------------------------------------------===// -#include #include #define DATATYPES_USING_DEFINED -using A_DATATYPE = int16_t; // std::bfloat16_t; -using B_DATATYPE = int16_t; // std::bfloat16_t; +using A_DATATYPE = int16_t; +using B_DATATYPE = int16_t; using C_DATATYPE = int32_t; // float; using ACC_DATATYPE = int32_t; diff --git a/programming_examples/basic/matrix_multiplication/test.cpp b/programming_examples/basic/matrix_multiplication/test.cpp index 1c2c33f1394..de82d8b84be 100644 --- a/programming_examples/basic/matrix_multiplication/test.cpp +++ b/programming_examples/basic/matrix_multiplication/test.cpp @@ -18,7 +18,6 @@ #include #include #include -#include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" @@ -29,10 +28,10 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED #ifndef DTYPE_IN -#define DTYPE_IN std::bfloat16_t +#define DTYPE_IN test_utils::bfloat16_t #endif #ifndef DTYPE_OUT -#define DTYPE_OUT std::bfloat16_t +#define DTYPE_OUT test_utils::bfloat16_t #endif #ifndef DTYPE_ACC #define DTYPE_ACC float diff --git a/programming_examples/basic/vector_exp/test.cpp b/programming_examples/basic/vector_exp/test.cpp index 1c179c0d4df..4b3f16846ba 100644 --- a/programming_examples/basic/vector_exp/test.cpp +++ b/programming_examples/basic/vector_exp/test.cpp @@ -9,9 +9,13 @@ //===----------------------------------------------------------------------===// #include "cxxopts.hpp" -#include +#include +#include #include #include +#include +#include +#include #include #include #include @@ -26,8 +30,8 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using INOUT0_DATATYPE = std::bfloat16_t; -using INOUT1_DATATYPE = std::bfloat16_t; +using INOUT0_DATATYPE = test_utils::bfloat16_t; +using INOUT1_DATATYPE = test_utils::bfloat16_t; #endif // ---------------------------------------------------------------------------- @@ -37,16 +41,19 @@ template int verify(int CSize, std::vector A, std::vector C, int verbosity) { int errors = 0; for (uint32_t i = 0; i < CSize; i++) { - std::bfloat16_t ref = exp(A[i]); + const float input = test_utils::bfloat16_to_float(A[i]); + const float actual = test_utils::bfloat16_to_float(C[i]); + const auto ref_bf16 = test_utils::bfloat16_from_float(std::exp(input)); + const float ref = test_utils::bfloat16_to_float(ref_bf16); // Let's check if they are inf or nan, and if so just pass because // comparisions will then fail, even for matches - if (std::isinf(ref) || std::isinf(C[i])) + if (std::isinf(ref) || std::isinf(actual)) break; - if (std::isnan(ref) || std::isnan(C[i])) + if (std::isnan(ref) || std::isnan(actual)) break; - if (!test_utils::nearly_equal(ref, C[i], 0.128)) { + if (!test_utils::nearly_equal(ref, actual, 0.128)) { if (errors < 100) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; + std::cout << "Error in output " << actual << " != " << ref << std::endl; } else if (errors == 100) { std::cout << "..." << std::endl; std::cout << "[Errors truncated]" << std::endl; @@ -54,7 +61,7 @@ int verify(int CSize, std::vector A, std::vector C, int verbosity) { errors++; } else { if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; + std::cout << "Correct output " << actual << " == " << ref << std::endl; } } return errors; @@ -161,9 +168,8 @@ int main(int argc, const char *argv[]) { INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); std::vector AVec(INOUT0_VOLUME); for (int i = 0; i < INOUT0_VOLUME; i++) { - std::uint16_t u16 = (std::uint16_t)i; - std::bfloat16_t bf16 = *(std::bfloat16_t *)&u16; - AVec[i] = bf16; + const std::uint16_t bits = static_cast(i); + AVec[i] = test_utils::bfloat16_from_bits(bits); } memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); @@ -203,7 +209,7 @@ int main(int argc, const char *argv[]) { /* Warmup iterations do not count towards average runtime. */ continue; } - std::bfloat16_t *bufOut = bo_inout1.map(); + INOUT1_DATATYPE *bufOut = bo_inout1.map(); // Copy output results and verify they are correct std::vector CVec(INOUT1_VOLUME); diff --git a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile index c228738ebec..3efcbca088e 100755 --- a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile @@ -40,7 +40,7 @@ else endif ifeq ($(dtype),bf16) - dtype_cpp=std::bfloat16_t + dtype_cpp=test_utils::bfloat16_t endif ifeq ($(dtype),i32) dtype_cpp=std::int32_t diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile index 6ebb0850f09..79291cd6d39 100644 --- a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile @@ -41,7 +41,7 @@ aie_py_src=${targetname}.py endif ifeq ($(dtype),bf16) - dtype_cpp=std::bfloat16_t + dtype_cpp=test_utils::bfloat16_t endif ifeq ($(dtype),i32) dtype_cpp=std::int32_t diff --git a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile index 5e1ab1db180..55630048482 100755 --- a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile @@ -30,7 +30,7 @@ aie_py_src=${targetname}.py endif ifeq ($(dtype),bf16) - dtype_cpp=std::bfloat16_t + dtype_cpp=test_utils::bfloat16_t endif ifeq ($(dtype),i32) dtype_cpp=std::int32_t diff --git a/programming_examples/basic/vector_reduce_max/test.cpp b/programming_examples/basic/vector_reduce_max/test.cpp index 86f4219496b..890c54df985 100644 --- a/programming_examples/basic/vector_reduce_max/test.cpp +++ b/programming_examples/basic/vector_reduce_max/test.cpp @@ -7,69 +7,108 @@ // Copyright (C) 2025, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// + #include "xrt_test_wrapper.h" #include -#include +#include +#include +#include +#include +#include + #ifndef DTYPE -#define DTYPE std::bfloat16_t +#define DTYPE test_utils::bfloat16_t #endif -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ----------------------------------------------------- + using DATATYPE = DTYPE; +template +T random_input_value() { + if constexpr (std::is_same_v) { + return test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(-4.0f), + test_utils::bfloat16_from_float(8.0f)); + } else if constexpr (std::is_same_v) { + return test_utils::random_int32_t(100000); + } else { + std::cerr << "Unsupported data type" << std::endl; + std::exit(EXIT_FAILURE); + } +} + +template +T lowest_value() { + if constexpr (std::is_same_v) { + return test_utils::bfloat16_from_float( + -std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } +} + +template +bool less_than(T lhs, T rhs) { + if constexpr (std::is_same_v) { + return test_utils::bfloat16_to_float(lhs) < + test_utils::bfloat16_to_float(rhs); + } else { + return lhs < rhs; + } +} + +template +bool values_equal(T lhs, T rhs) { + if constexpr (std::is_same_v) { + return test_utils::nearly_equal_bfloat16(lhs, rhs); + } else { + return lhs == rhs; + } +} + +template +auto printable_value(T value) { + if constexpr (std::is_same_v) { + return test_utils::bfloat16_to_float(value); + } else { + return value; + } +} + void initialize_bufIn1(DATATYPE *bufIn1, int SIZE) { - DATATYPE max = std::numeric_limits::lowest(); + DATATYPE max = lowest_value(); for (int i = 0; i < SIZE; i++) { - DATATYPE next; - if constexpr (std::is_same_v && - std::is_same_v) { - next = test_utils::random_bfloat16_t((std::bfloat16_t)-4.0, - (std::bfloat16_t)8.0); - } else if constexpr (std::is_same_v && - std::is_same_v) { - next = test_utils::random_int32_t(100000); - } else { - std::cerr << "Unsupported data type" << std::endl; - std::exit(EXIT_FAILURE); - } - if (next > max) + DATATYPE next = random_input_value(); + if (less_than(max, next)) max = next; bufIn1[i] = next; } } -// Initialize Output buffer -void initialize_bufOut(DATATYPE *bufOut, int SIZE) { memset(bufOut, 0, SIZE); } +void initialize_bufOut(DATATYPE *bufOut, int SIZE) { + std::memset(bufOut, 0, SIZE); +} -// Functional correctness verifyer int verify_vector_reduce_max(DATATYPE *bufIn1, DATATYPE *bufOut, int SIZE, int verbosity) { int errors = 0; - // Calculate max within the function - DATATYPE max = std::numeric_limits::lowest(); + DATATYPE max = lowest_value(); for (int i = 0; i < SIZE; i++) { - if (bufIn1[i] > max) + if (less_than(max, bufIn1[i])) max = bufIn1[i]; } - if (bufOut[0] != max) { + if (!values_equal(bufOut[0], max)) { errors++; - std::cout << "max is " << max << " calc " << bufOut[0] << std::endl; - } else { - if (verbosity >= 1) - std::cout << "max is " << max << " calc " << bufOut[0] << std::endl; + std::cout << "max is " << printable_value(max) << " calc " + << printable_value(bufOut[0]) << std::endl; + } else if (verbosity >= 1) { + std::cout << "max is " << printable_value(max) << " calc " + << printable_value(bufOut[0]) << std::endl; } return errors; } -//***************************************************************************** -// Should not need to modify below section -//***************************************************************************** - int main(int argc, const char *argv[]) { - constexpr int IN1_VOLUME = IN1_SIZE / sizeof(DATATYPE); constexpr int OUT_VOLUME = OUT_SIZE / sizeof(DATATYPE); @@ -79,4 +118,4 @@ int main(int argc, const char *argv[]) { initialize_bufOut, verify_vector_reduce_max>( IN1_VOLUME, OUT_VOLUME, myargs); return res; -} \ No newline at end of file +} diff --git a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp index 0f785c104e5..44fc021a9d6 100644 --- a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp +++ b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp @@ -8,7 +8,6 @@ // //===----------------------------------------------------------------------===// -#include #include #include #include @@ -28,15 +27,6 @@ #include "../helper.h" #include "common.h" -#include - -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = float; -} // namespace std -#endif - int main(int argc, const char *argv[]) { // ------------------------------------------------------ @@ -113,9 +103,9 @@ int main(int argc, const char *argv[]) { auto boInstr = xrt::bo(device, instr.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto boInA = xrt::bo(device, numberFloats * sizeof(std::bfloat16_t), + auto boInA = xrt::bo(device, numberFloats * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto boInB = xrt::bo(device, numberFloats * sizeof(std::bfloat16_t), + auto boInB = xrt::bo(device, numberFloats * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto boOut = xrt::bo(device, bfpBytesSize * sizeof(int8_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); @@ -139,22 +129,24 @@ int main(int argc, const char *argv[]) { return generateRandomFloatingPoint(rng, -5, 5); }); - std::bfloat16_t bfloatA[numberFloats]; - std::bfloat16_t bfloatB[numberFloats]; + test_utils::bfloat16_t bfloatA[numberFloats]; + test_utils::bfloat16_t bfloatB[numberFloats]; - std::ranges::transform( - floatA, bfloatA, [](float f) { return static_cast(f); }); - std::ranges::transform( - floatB, bfloatB, [](float f) { return static_cast(f); }); + std::ranges::transform(floatA, bfloatA, [](float f) { + return test_utils::bfloat16_from_float(f); + }); + std::ranges::transform(floatB, bfloatB, [](float f) { + return test_utils::bfloat16_from_float(f); + }); // ------------------------------------------------------ // Write data into buffers // ------------------------------------------------------ - std::bfloat16_t *bufInA = boInA.map(); - memcpy(bufInA, bfloatA, (numberFloats * sizeof(std::bfloat16_t))); + test_utils::bfloat16_t *bufInA = boInA.map(); + memcpy(bufInA, bfloatA, (numberFloats * sizeof(test_utils::bfloat16_t))); - std::bfloat16_t *bufInB = boInB.map(); - memcpy(bufInB, bfloatB, (numberFloats * sizeof(std::bfloat16_t))); + test_utils::bfloat16_t *bufInB = boInB.map(); + memcpy(bufInB, bfloatB, (numberFloats * sizeof(test_utils::bfloat16_t))); void *bufInstr = boInstr.map(); memcpy(bufInstr, instr.data(), instr.size() * sizeof(int)); diff --git a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp index b7dce1e8087..9196eb2418e 100644 --- a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp +++ b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_bfp_test.cpp @@ -27,20 +27,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../helper.h" #include "common.h" #include "gemm_atb_layout.h" @@ -61,8 +53,8 @@ constexpr int verify_stochastic_n_samples = 1000; // are initialized to all-ones: every C[i,j] reduces to exactly K, which is // representable losslessly in BFP16 ebs8 (no requantization compounding), // and the CPU reference produces the same value bit-for-bit. -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol(); +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); int main(int argc, const char *argv[]) { diff --git a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp index 5468ddc2640..4080c103386 100644 --- a/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp +++ b/programming_examples/ml/block_datatypes/gemm_asymmetric_tile_buffering/gemm_atb_mixed_test.cpp @@ -22,20 +22,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../helper.h" #include "common.h" #include "gemm_atb_layout.h" @@ -49,8 +41,8 @@ constexpr int verify_stochastic_n_samples = 1000; // Verification tolerance // See "Note on Numerical Tolerances" in README.md // TODO: This might have to be adjusted for bfp -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol() * 2.0f; +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol() * 2.0f; int main(int argc, const char *argv[]) { @@ -154,11 +146,11 @@ int main(int argc, const char *argv[]) { auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_a = xrt::bo(device, A_SIZE * sizeof(std::bfloat16_t), + auto bo_a = xrt::bo(device, A_SIZE * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_b = xrt::bo(device, B_VOLUME, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, C_SIZE * sizeof(std::bfloat16_t), + auto bo_out = xrt::bo(device, C_SIZE * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); // ------------------------------------------------------ @@ -168,20 +160,22 @@ int main(int argc, const char *argv[]) { std::cout << "Writing data into buffer objects.\n"; } - std::vector AVec(A_SIZE); + std::vector AVec(A_SIZE); for (int i = 0; i < A_SIZE; i++) { - AVec[i] = (std::bfloat16_t)((rand() % 8) - 4); + AVec[i] = + test_utils::bfloat16_from_float(static_cast((rand() % 8) - 4)); } - std::vector BVec(B_SIZE); + std::vector BVec(B_SIZE); for (int i = 0; i < B_SIZE; i++) { - BVec[i] = (std::bfloat16_t)((rand() % 8) - 4); + BVec[i] = + test_utils::bfloat16_from_float(static_cast((rand() % 8) - 4)); } // This is a quick conversion to avoid having to create a custom function for // bf16 for now std::vector BVecFloat(B_SIZE); for (int i = 0; i < B_SIZE; i++) { - BVecFloat[i] = (float)BVec[i]; + BVecFloat[i] = test_utils::bfloat16_to_float(BVec[i]); } auto shuffleStart = std::chrono::high_resolution_clock::now(); @@ -201,9 +195,9 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ // Write data into buffers // ------------------------------------------------------ - std::bfloat16_t *bufA = bo_a.map(); + test_utils::bfloat16_t *bufA = bo_a.map(); uint8_t *bufB = bo_b.map(); - memcpy(bufA, AVec.data(), AVec.size() * sizeof(std::bfloat16_t)); + memcpy(bufA, AVec.data(), AVec.size() * sizeof(test_utils::bfloat16_t)); memcpy(bufB, BVecBfpShuffled.data(), B_VOLUME); // Initialize outputs; bufOut is results matrix @@ -260,8 +254,8 @@ int main(int argc, const char *argv[]) { // verify pass does not pollute the per-iter average runtime. // ------------------------------------------------------ if (do_verify) { - std::vector CVec(C_SIZE); - memcpy(CVec.data(), bufOut, CVec.size() * sizeof(std::bfloat16_t)); + std::vector CVec(C_SIZE); + memcpy(CVec.data(), bufOut, CVec.size() * sizeof(test_utils::bfloat16_t)); if (verbosity >= 1) { std::cout << "Verifying against reference matmul ..." << std::endl; @@ -272,12 +266,13 @@ int main(int argc, const char *argv[]) { // in fp32, and the bf16 accumulator pattern from the canonical // mixed_test.cpp loses too much precision over the K=4096 reductions // that the paper-scale shapes require. - errors = matmul_common::verify_stochastic( + errors = matmul_common::verify_stochastic( M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, abs_tol, rel_tol, /*b_col_maj=*/0); } else { - errors = matmul_common::verify( + errors = matmul_common::verify( M, N, K, AVec, BVec, CVec, verbosity, abs_tol, rel_tol, /*b_col_maj=*/0); } diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp index 0969e719218..92522808858 100644 --- a/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp +++ b/programming_examples/ml/block_datatypes/matrix_multiplication/bfp_test.cpp @@ -14,20 +14,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../helper.h" #include "common.h" @@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000; // Verification tolerance // See "Note on Numerical Tolerances" in README.md // TODO: This might have to be adjusted for bfp -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol(); +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); int main(int argc, const char *argv[]) { diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp index 4cc51f2c0e8..fab597f1f46 100644 --- a/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp +++ b/programming_examples/ml/block_datatypes/matrix_multiplication/in_core_shuffle/test.cpp @@ -14,20 +14,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../../helper.h" #include "common.h" @@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000; // Verification tolerance // See "Note on Numerical Tolerances" in README.md // TODO: This might have to be adjusted for bfp -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol(); +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); int main(int argc, const char *argv[]) { diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp index f536ba3cc1d..c875796c5af 100644 --- a/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp +++ b/programming_examples/ml/block_datatypes/matrix_multiplication/mixed_test.cpp @@ -14,20 +14,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../helper.h" #include "common.h" @@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000; // Verification tolerance // See "Note on Numerical Tolerances" in README.md // TODO: This might have to be adjusted for bfp -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol() * 2.0f; +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol() * 2.0f; int main(int argc, const char *argv[]) { @@ -145,11 +137,11 @@ int main(int argc, const char *argv[]) { auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_a = xrt::bo(device, A_SIZE * sizeof(std::bfloat16_t), + auto bo_a = xrt::bo(device, A_SIZE * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_b = xrt::bo(device, B_VOLUME, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, C_SIZE * sizeof(std::bfloat16_t), + auto bo_out = xrt::bo(device, C_SIZE * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); // ------------------------------------------------------ @@ -159,10 +151,11 @@ int main(int argc, const char *argv[]) { std::cout << "Writing data into buffer objects.\n"; } - std::vector AVec(A_SIZE); + std::vector AVec(A_SIZE); for (int i = 0; i < A_SIZE; i++) { // Limiting to 16 to avoid precision loss issues - AVec[i] = (std::bfloat16_t)((rand() % 8) - 4); + AVec[i] = + test_utils::bfloat16_from_float(static_cast((rand() % 8) - 4)); // AVec[i] = i; // if (i % N == i / N) { // AVec[i] = 1.0; @@ -173,10 +166,11 @@ int main(int argc, const char *argv[]) { // AVec[i] = (i / 8) % 1000; } - std::vector BVec(B_SIZE); + std::vector BVec(B_SIZE); for (int i = 0; i < B_SIZE; i++) { // Limiting to 16 to avoid precision loss issues - BVec[i] = (std::bfloat16_t)((rand() % 8) - 4); + BVec[i] = + test_utils::bfloat16_from_float(static_cast((rand() % 8) - 4)); // Diagonal: // if (i % N == i / N) { // BVec[i] = 1.0; @@ -191,7 +185,7 @@ int main(int argc, const char *argv[]) { // bf16 for now std::vector BVecFloat(B_SIZE); for (int i = 0; i < B_SIZE; i++) { - BVecFloat[i] = (float)BVec[i]; + BVecFloat[i] = test_utils::bfloat16_to_float(BVec[i]); } auto BVecBfp = floatToBfp16(8, B_SIZE, BVecFloat.data(), 0); @@ -208,9 +202,9 @@ int main(int argc, const char *argv[]) { // ------------------------------------------------------ // Write data into buffers // ------------------------------------------------------ - std::bfloat16_t *bufA = bo_a.map(); + test_utils::bfloat16_t *bufA = bo_a.map(); uint8_t *bufB = bo_b.map(); - memcpy(bufA, AVec.data(), AVec.size() * sizeof(std::bfloat16_t)); + memcpy(bufA, AVec.data(), AVec.size() * sizeof(test_utils::bfloat16_t)); memcpy(bufB, BVecBfpShuffled.data(), B_VOLUME); // Initialize outputs; bufOut is results matrix @@ -257,22 +251,23 @@ int main(int argc, const char *argv[]) { // Check output // ------------------------------------------------------ if (do_verify) { - std::vector CVec(C_SIZE); - memcpy(CVec.data(), bufOut, CVec.size() * sizeof(std::bfloat16_t)); + std::vector CVec(C_SIZE); + memcpy(CVec.data(), bufOut, CVec.size() * sizeof(test_utils::bfloat16_t)); if (verbosity >= 1) { std::cout << "Verifying against reference matmul ..." << std::endl; } auto vstart = std::chrono::system_clock::now(); if (do_verify_stochastic) { - errors = - matmul_common::verify_stochastic( - M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, - verbosity, abs_tol, rel_tol, true); + errors = matmul_common::verify_stochastic( + M, N, K, AVec, BVec, CVec, verify_stochastic_n_samples, verbosity, + abs_tol, rel_tol, true); } else { - errors = matmul_common::verify( + errors = matmul_common::verify( M, N, K, AVec, BVec, CVec, verbosity, abs_tol, rel_tol, true); } auto vstop = std::chrono::system_clock::now(); diff --git a/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp b/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp index 4b91f58dbb8..22fff460945 100644 --- a/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp +++ b/programming_examples/ml/block_datatypes/matrix_multiplication/whole_array_shuffle/bfp_test.cpp @@ -14,20 +14,12 @@ #include #include #include -#include #include #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" -// Clangd fix, remove -#ifdef _CLANGD -namespace std { -using bfloat16_t = double; -} // namespace std -#endif - #include "../../helper.h" #include "common.h" @@ -40,8 +32,8 @@ constexpr int verify_stochastic_n_samples = 1000; // Verification tolerance // See "Note on Numerical Tolerances" in README.md // TODO: This might have to be adjusted for bfp -float abs_tol = matmul_common::get_abs_tol(); -float rel_tol = matmul_common::get_rel_tol(); +float abs_tol = matmul_common::get_abs_tol(); +float rel_tol = matmul_common::get_rel_tol(); int main(int argc, const char *argv[]) { diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp index fbb1d911657..b3ddfd960a3 100644 --- a/programming_examples/ml/eltwise_add/test.cpp +++ b/programming_examples/ml/eltwise_add/test.cpp @@ -8,8 +8,12 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include #include +#include +#include +#include #include #include #include @@ -25,9 +29,9 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using INOUT0_DATATYPE = std::bfloat16_t; -using INOUT1_DATATYPE = std::bfloat16_t; -using INOUT2_DATATYPE = std::bfloat16_t; +using INOUT0_DATATYPE = test_utils::bfloat16_t; +using INOUT1_DATATYPE = test_utils::bfloat16_t; +using INOUT2_DATATYPE = test_utils::bfloat16_t; #endif // ---------------------------------------------------------------------------- @@ -38,11 +42,15 @@ int verify(int size, std::vector A, std::vector B, std::vector C, int verbosity) { int errors = 0; for (uint32_t i = 0; i < size; i++) { - T ref = A[i] + B[i]; - if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { + const float a = test_utils::bfloat16_to_float(A[i]); + const float b = test_utils::bfloat16_to_float(B[i]); + const float actual = test_utils::bfloat16_to_float(C[i]); + const auto ref_bf16 = test_utils::bfloat16_add(A[i], B[i]); + const float ref = test_utils::bfloat16_to_float(ref_bf16); + if (!test_utils::nearly_equal(ref, actual, 0.00390625)) { if (errors < 100) { - std::cout << "Error in output " << C[i] << " != " << ref << " from " - << A[i] << " * " << B[i] << std::endl; + std::cout << "Error in output " << actual << " != " << ref << " from " + << a << " + " << b << std::endl; } else if (errors == 100) { std::cout << "..." << std::endl; std::cout << "[Errors truncated]" << std::endl; @@ -50,7 +58,7 @@ int verify(int size, std::vector A, std::vector B, std::vector C, errors++; } else { if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; + std::cout << "Correct output " << actual << " == " << ref << std::endl; } } return errors; @@ -166,16 +174,18 @@ int main(int argc, const char *argv[]) { INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); std::vector AVec(INOUT0_VOLUME); for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, - (std::bfloat16_t)-0.5); + AVec[i] = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f), + test_utils::bfloat16_from_float(-0.5f)); memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); // Initialize Inout buffer 1 INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); std::vector BVec(INOUT1_VOLUME); for (int i = 0; i < INOUT1_VOLUME; i++) - BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, - (std::bfloat16_t)-0.5); + BVec[i] = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f), + test_utils::bfloat16_from_float(-0.5f)); memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); // Initialize Inout buffer 2 diff --git a/programming_examples/ml/eltwise_mul/test.cpp b/programming_examples/ml/eltwise_mul/test.cpp index 36aea4639cc..e6f1050a676 100644 --- a/programming_examples/ml/eltwise_mul/test.cpp +++ b/programming_examples/ml/eltwise_mul/test.cpp @@ -8,8 +8,12 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include #include +#include +#include +#include #include #include #include @@ -25,9 +29,9 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using INOUT0_DATATYPE = std::bfloat16_t; -using INOUT1_DATATYPE = std::bfloat16_t; -using INOUT2_DATATYPE = std::bfloat16_t; +using INOUT0_DATATYPE = test_utils::bfloat16_t; +using INOUT1_DATATYPE = test_utils::bfloat16_t; +using INOUT2_DATATYPE = test_utils::bfloat16_t; #endif // ---------------------------------------------------------------------------- @@ -38,11 +42,15 @@ int verify(int size, std::vector A, std::vector B, std::vector C, int verbosity) { int errors = 0; for (uint32_t i = 0; i < size; i++) { - T ref = A[i] * B[i]; - if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { + const float a = test_utils::bfloat16_to_float(A[i]); + const float b = test_utils::bfloat16_to_float(B[i]); + const float actual = test_utils::bfloat16_to_float(C[i]); + const auto ref_bf16 = test_utils::bfloat16_mul(A[i], B[i]); + const float ref = test_utils::bfloat16_to_float(ref_bf16); + if (!test_utils::nearly_equal(ref, actual, 0.00390625)) { if (errors < 100) { - std::cout << "Error in output " << C[i] << " != " << ref << " from " - << A[i] << " * " << B[i] << std::endl; + std::cout << "Error in output " << actual << " != " << ref << " from " + << a << " * " << b << std::endl; } else if (errors == 100) { std::cout << "..." << std::endl; std::cout << "[Errors truncated]" << std::endl; @@ -50,7 +58,7 @@ int verify(int size, std::vector A, std::vector B, std::vector C, errors++; } else { if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; + std::cout << "Correct output " << actual << " == " << ref << std::endl; } } return errors; @@ -166,16 +174,18 @@ int main(int argc, const char *argv[]) { INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); std::vector AVec(INOUT0_VOLUME); for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, - (std::bfloat16_t)-0.5); + AVec[i] = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f), + test_utils::bfloat16_from_float(-0.5f)); memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); // Initialize Inout buffer 1 INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); std::vector BVec(INOUT1_VOLUME); for (int i = 0; i < INOUT1_VOLUME; i++) - BVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1.0, - (std::bfloat16_t)-0.5); + BVec[i] = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(1.0f), + test_utils::bfloat16_from_float(-0.5f)); memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); // Initialize Inout buffer 2 diff --git a/programming_examples/ml/gelu/test.cpp b/programming_examples/ml/gelu/test.cpp index f3c3692fb74..1bc14a8733f 100644 --- a/programming_examples/ml/gelu/test.cpp +++ b/programming_examples/ml/gelu/test.cpp @@ -9,8 +9,12 @@ //===----------------------------------------------------------------------===// #include "cxxopts.hpp" +#include +#include +#include #include #include +#include #include #include #include @@ -23,20 +27,28 @@ #include "test_utils.h" -// gelu reference implementation -std::bfloat16_t gelu_bf16(std::bfloat16_t &input) { +// GELU reference implementation. +test_utils::bfloat16_t gelu_bf16(test_utils::bfloat16_t input) { // Approximate GELU: 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³))) - constexpr auto sqrt_2_over_pi = std::bfloat16_t(0.79788456f); - constexpr auto beta = std::bfloat16_t(0.044715f); - - std::bfloat16_t x = input; - std::bfloat16_t x3 = x * x * x; - std::bfloat16_t inner = sqrt_2_over_pi * (x + beta * x3); - std::bfloat16_t tanh_val = std::tanh(inner); - std::bfloat16_t gelu = - std::bfloat16_t(0.5f) * x * (std::bfloat16_t(1.0f) + tanh_val); - - return std::bfloat16_t(gelu); + const test_utils::bfloat16_t k0_5 = test_utils::bfloat16_from_float(0.5f); + const test_utils::bfloat16_t k1 = test_utils::bfloat16_from_float(1.0f); + const test_utils::bfloat16_t sqrt_2_over_pi = + test_utils::bfloat16_from_float(0.79788456f); + const test_utils::bfloat16_t beta = + test_utils::bfloat16_from_float(0.044715f); + + const test_utils::bfloat16_t x2 = test_utils::bfloat16_mul(input, input); + const test_utils::bfloat16_t x3 = test_utils::bfloat16_mul(input, x2); + const test_utils::bfloat16_t x3_beta = test_utils::bfloat16_mul(x3, beta); + const test_utils::bfloat16_t inner = test_utils::bfloat16_add(input, x3_beta); + const test_utils::bfloat16_t inner1 = + test_utils::bfloat16_mul(inner, sqrt_2_over_pi); + const test_utils::bfloat16_t tanh_out = test_utils::bfloat16_tanh(inner1); + const test_utils::bfloat16_t one_plus_tanh = + test_utils::bfloat16_add(tanh_out, k1); + const test_utils::bfloat16_t mul_v05 = + test_utils::bfloat16_mul(k0_5, one_plus_tanh); + return test_utils::bfloat16_mul(input, mul_v05); } int main(int argc, const char *argv[]) { @@ -53,7 +65,7 @@ int main(int argc, const char *argv[]) { "instr,i", "path of file containing userspace instructions to be sent to the LX6", cxxopts::value())( - "length,l", "the length of the transfer in std::bfloat16_t", + "length,l", "the length of the transfer in bfloat16 elements", cxxopts::value()->default_value("4096")); try { @@ -133,19 +145,21 @@ int main(int argc, const char *argv[]) { auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; - std::bfloat16_t *bufInA = bo_inA.map(); - std::vector srcVecA; + test_utils::bfloat16_t *bufInA = bo_inA.map(); + std::vector srcVecA; for (int i = 0; i < N; i++) - srcVecA.push_back(std::bfloat16_t(i * 0.05f + -2.0f)); // Example data - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t))); + srcVecA.push_back( + test_utils::bfloat16_from_float(i * 0.05f + -2.0f)); // Example data + memcpy(bufInA, srcVecA.data(), + (srcVecA.size() * sizeof(test_utils::bfloat16_t))); void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); @@ -173,24 +187,27 @@ int main(int argc, const char *argv[]) { std::cout << "Latency (us): " << npu_time << std::endl; std::cout << std::endl; - double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output + double total_bytes = + 2.0 * N * sizeof(test_utils::bfloat16_t); // input and output double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9; std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; - std::bfloat16_t *bufOut = bo_out.map(); + test_utils::bfloat16_t *bufOut = bo_out.map(); int errors = 0; for (int i = 0; i < N; i++) { - std::bfloat16_t ref = gelu_bf16(srcVecA[i]); - if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.1)) { + const test_utils::bfloat16_t ref = gelu_bf16(srcVecA[i]); + const float expected = test_utils::bfloat16_to_float(ref); + const float actual = test_utils::bfloat16_to_float(*(bufOut + i)); + if (!test_utils::nearly_equal(actual, expected, 0.1)) { errors++; // Print the first 100 mismatches if (errors <= 100) { std::cout << "Mismatch at index " << i << ": " - << "Expected: " << ref << ", " - << "Got: " << *(bufOut + i) << std::endl; + << "Expected: " << expected << ", " + << "Got: " << actual << std::endl; } } } diff --git a/programming_examples/ml/layernorm/test.cpp b/programming_examples/ml/layernorm/test.cpp index 8c698f634c0..c1bb1a6ac50 100644 --- a/programming_examples/ml/layernorm/test.cpp +++ b/programming_examples/ml/layernorm/test.cpp @@ -12,6 +12,7 @@ #include "xrt_test_wrapper.h" #include #include +#include #include #include @@ -20,22 +21,23 @@ // ------------------------------------------------------ // Configure this to match your buffer data type // ------------------------------------------------------ -using DATATYPE_IN1 = std::bfloat16_t; -using DATATYPE_OUT = std::bfloat16_t; +using DATATYPE_IN1 = test_utils::bfloat16_t; +using DATATYPE_OUT = test_utils::bfloat16_t; #endif // Initialize Input buffer 1 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) { for (int i = 0; i < in_volume; i++) { - DATATYPE_IN1 val = static_cast(test_utils::random_bfloat16_t( - (std::bfloat16_t)8.0, (std::bfloat16_t)-4.0)); + DATATYPE_IN1 val = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f), + test_utils::bfloat16_from_float(-4.0f)); bufIn1[i] = val; } } // Initialize Output buffer void initialize_bufOut(DATATYPE_OUT *bufOut, int out_volume) { - memset(bufOut, 0, out_volume); + memset(bufOut, 0, out_volume * sizeof(DATATYPE_OUT)); } int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, @@ -52,7 +54,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, // Accumulate sum and sum of squares for each row for (int c = 0; c < COLS; c++) { int idx = r * COLS + c; - float val = static_cast(bufIn1[idx]); + float val = test_utils::bfloat16_to_float(bufIn1[idx]); sum += val; sum_sq += val * val; } @@ -63,7 +65,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, // Compute expected output for the current row for (int c = 0; c < COLS; c++) { int idx = r * COLS + c; - float val = static_cast(bufIn1[idx]); + float val = test_utils::bfloat16_to_float(bufIn1[idx]); float norm = (val - mean) * inv_std; float scaled = norm * gamma; float out_val = scaled + beta; @@ -73,7 +75,7 @@ int verify_layernorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, // Now compare the expected results with the computed results in bufOut for (int i = 0; i < (ROWS * COLS); i++) { float expected_val = expected[i]; - float hw_val = static_cast(bufOut[i]); + float hw_val = test_utils::bfloat16_to_float(bufOut[i]); float diff = std::abs(expected_val - hw_val); if (diff > 0.1) { std::cout << "Mismatch at index " << i << ": expected " << expected_val diff --git a/programming_examples/ml/relu/test.cpp b/programming_examples/ml/relu/test.cpp index 2bb1fd62e94..27d82509d91 100644 --- a/programming_examples/ml/relu/test.cpp +++ b/programming_examples/ml/relu/test.cpp @@ -9,8 +9,11 @@ //===----------------------------------------------------------------------===// #include "cxxopts.hpp" +#include +#include #include #include +#include #include #include #include @@ -27,10 +30,11 @@ #include "test_utils.h" -// relu reference implementation -std::bfloat16_t relu_bf16(std::bfloat16_t &input) { - // Return the relu output - return (input > std::bfloat16_t(0.0f)) ? input : std::bfloat16_t(0.0f); +// ReLU reference implementation. +test_utils::bfloat16_t relu_bf16(test_utils::bfloat16_t input) { + return (test_utils::bfloat16_to_float(input) > 0.0f) + ? input + : test_utils::bfloat16_from_float(0.0f); } int main(int argc, const char *argv[]) { @@ -47,7 +51,7 @@ int main(int argc, const char *argv[]) { "instr,i", "path of file containing userspace instructions to be sent to the LX6", cxxopts::value())( - "length,l", "the length of the transfer in std::bfloat16_t", + "length,l", "the length of the transfer in bfloat16 elements", cxxopts::value()->default_value("4096")); try { @@ -123,17 +127,19 @@ int main(int argc, const char *argv[]) { std::cout << "Getting handle to kernel:" << kernelName << std::endl; auto kernel = xrt::ext::kernel(context, mod, kernelName); - xrt::bo bo_inA = xrt::ext::bo{device, N * sizeof(std::bfloat16_t)}; - xrt::bo bo_out = xrt::ext::bo{device, N * sizeof(std::bfloat16_t)}; + xrt::bo bo_inA = xrt::ext::bo{device, N * sizeof(test_utils::bfloat16_t)}; + xrt::bo bo_out = xrt::ext::bo{device, N * sizeof(test_utils::bfloat16_t)}; if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; - std::bfloat16_t *bufInA = bo_inA.map(); - std::vector srcVecA; + test_utils::bfloat16_t *bufInA = bo_inA.map(); + std::vector srcVecA; for (int i = 0; i < N; i++) - srcVecA.push_back(std::bfloat16_t(i * 0.05f + -3.0f)); // Example data - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t))); + srcVecA.push_back( + test_utils::bfloat16_from_float(i * 0.05f + -3.0f)); // Example data + memcpy(bufInA, srcVecA.data(), + (srcVecA.size() * sizeof(test_utils::bfloat16_t))); bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); @@ -157,24 +163,27 @@ int main(int argc, const char *argv[]) { std::cout << "Latency (us): " << npu_time << std::endl; std::cout << std::endl; - double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output + double total_bytes = + 2.0 * N * sizeof(test_utils::bfloat16_t); // input and output double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9; std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; - std::bfloat16_t *bufOut = bo_out.map(); + test_utils::bfloat16_t *bufOut = bo_out.map(); int errors = 0; for (int i = 0; i < N; i++) { - std::bfloat16_t ref = relu_bf16(srcVecA[i]); - if (!test_utils::nearly_equal(*(bufOut + i), ref)) { + test_utils::bfloat16_t ref = relu_bf16(srcVecA[i]); + const float expected = test_utils::bfloat16_to_float(ref); + const float actual = test_utils::bfloat16_to_float(*(bufOut + i)); + if (!test_utils::nearly_equal(actual, expected)) { errors++; // Print the first 100 mismatches if (errors <= 100) { std::cout << "Mismatch at index " << i << ": " - << "Expected: " << ref << ", " - << "Got: " << *(bufOut + i) << std::endl; + << "Expected: " << expected << ", " + << "Got: " << actual << std::endl; } } } diff --git a/programming_examples/ml/rmsnorm/test.cpp b/programming_examples/ml/rmsnorm/test.cpp index 16124c7dbe4..1b8795dab67 100644 --- a/programming_examples/ml/rmsnorm/test.cpp +++ b/programming_examples/ml/rmsnorm/test.cpp @@ -11,6 +11,7 @@ #include "xrt_test_wrapper.h" #include #include +#include #include #include @@ -19,22 +20,23 @@ // ------------------------------------------------------ // Configure this to match your buffer data type // ------------------------------------------------------ -using DATATYPE_IN1 = std::bfloat16_t; -using DATATYPE_OUT = std::bfloat16_t; +using DATATYPE_IN1 = test_utils::bfloat16_t; +using DATATYPE_OUT = test_utils::bfloat16_t; #endif // Initialize Input buffer 1 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) { for (int i = 0; i < in_volume; i++) { - DATATYPE_IN1 val = test_utils::random_bfloat16_t((std::bfloat16_t)8.0, - (std::bfloat16_t)-4.0); + DATATYPE_IN1 val = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f), + test_utils::bfloat16_from_float(-4.0f)); bufIn1[i] = val; } } // Initialize Output buffer void initialize_bufOut(DATATYPE_OUT *bufOut, int out_volume) { - memset(bufOut, 0, out_volume); + memset(bufOut, 0, out_volume * sizeof(DATATYPE_OUT)); } int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, @@ -49,7 +51,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, float sum_sq = 0.0f; for (int c = 0; c < COLS; c++) { int idx = r * COLS + c; - float val = static_cast(bufIn1[idx]); + float val = test_utils::bfloat16_to_float(bufIn1[idx]); sum_sq += val * val; } @@ -57,7 +59,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, for (int c = 0; c < COLS; c++) { int idx = r * COLS + c; - float val = static_cast(bufIn1[idx]); + float val = test_utils::bfloat16_to_float(bufIn1[idx]); float norm = (val * gamma) / rms; expected[idx] = norm; } @@ -65,7 +67,7 @@ int verify_rmsnorm_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, for (int i = 0; i < (ROWS * COLS); i++) { float expected_val = expected[i]; - float hw_val = static_cast(bufOut[i]); + float hw_val = test_utils::bfloat16_to_float(bufOut[i]); if (std::abs(expected_val - hw_val) > 0.05f) { std::cout << "Mismatch at index " << i << ": expected " << expected_val << ", got " << hw_val << std::endl; diff --git a/programming_examples/ml/rope/test.cpp b/programming_examples/ml/rope/test.cpp index 5ca6850b6ca..2cbe3a8301c 100644 --- a/programming_examples/ml/rope/test.cpp +++ b/programming_examples/ml/rope/test.cpp @@ -18,16 +18,17 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using DATATYPE_IN1 = std::bfloat16_t; -using DATATYPE_IN2 = std::bfloat16_t; // For LUT (cos,sin) pairs -using DATATYPE_OUT = std::bfloat16_t; +using DATATYPE_IN1 = test_utils::bfloat16_t; +using DATATYPE_IN2 = test_utils::bfloat16_t; // For LUT (cos,sin) pairs +using DATATYPE_OUT = test_utils::bfloat16_t; #endif // Initialize Input buffer 1 void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int in_volume) { for (int i = 0; i < in_volume; i++) { - DATATYPE_IN1 val = test_utils::random_bfloat16_t((std::bfloat16_t)8.0, - (std::bfloat16_t)-4.0); + DATATYPE_IN1 val = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f), + test_utils::bfloat16_from_float(-4.0f)); bufIn1[i] = val; } } @@ -47,8 +48,8 @@ void initialize_bufIn2(DATATYPE_IN2 *bufIn2, int lut_volume) { float cos_val = std::cos(angle); float sin_val = std::sin(angle); int base_idx = r * COLS + 2 * i; - bufIn2[base_idx] = static_cast(cos_val); - bufIn2[base_idx + 1] = static_cast(sin_val); + bufIn2[base_idx] = test_utils::bfloat16_from_float(cos_val); + bufIn2[base_idx + 1] = test_utils::bfloat16_from_float(sin_val); } } } @@ -71,10 +72,10 @@ int verify_rope_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_IN2 *bufIn2, int input_base_idx = r * COLS + 2 * i; int lut_base_idx = r * COLS + 2 * i; - float x_even = static_cast(bufIn1[input_base_idx]); - float x_odd = static_cast(bufIn1[input_base_idx + 1]); - float cos_val = static_cast(bufIn2[lut_base_idx]); - float sin_val = static_cast(bufIn2[lut_base_idx + 1]); + float x_even = test_utils::bfloat16_to_float(bufIn1[input_base_idx]); + float x_odd = test_utils::bfloat16_to_float(bufIn1[input_base_idx + 1]); + float cos_val = test_utils::bfloat16_to_float(bufIn2[lut_base_idx]); + float sin_val = test_utils::bfloat16_to_float(bufIn2[lut_base_idx + 1]); expected[input_base_idx] = x_even * cos_val - x_odd * sin_val; expected[input_base_idx + 1] = x_even * sin_val + x_odd * cos_val; @@ -83,7 +84,7 @@ int verify_rope_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_IN2 *bufIn2, for (int i = 0; i < (ROWS * COLS); i++) { float expected_val = expected[i]; - float hw_val = static_cast(bufOut[i]); + float hw_val = test_utils::bfloat16_to_float(bufOut[i]); if (std::abs(expected_val - hw_val) > 0.05f) { std::cout << "Mismatch at index " << i << ": expected " << expected_val << ", got " << hw_val << std::endl; diff --git a/programming_examples/ml/scale_shift/test.cpp b/programming_examples/ml/scale_shift/test.cpp index 15ee4c0a1cb..4e101ceabb1 100644 --- a/programming_examples/ml/scale_shift/test.cpp +++ b/programming_examples/ml/scale_shift/test.cpp @@ -8,8 +8,12 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include #include +#include +#include +#include #include #include #include @@ -25,10 +29,10 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using INOUT0_DATATYPE = std::bfloat16_t; -using INOUT1_DATATYPE = std::bfloat16_t; -using INOUT2_DATATYPE = std::bfloat16_t; -using INOUT3_DATATYPE = std::bfloat16_t; +using INOUT0_DATATYPE = test_utils::bfloat16_t; +using INOUT1_DATATYPE = test_utils::bfloat16_t; +using INOUT2_DATATYPE = test_utils::bfloat16_t; +using INOUT3_DATATYPE = test_utils::bfloat16_t; #endif // ---------------------------------------------------------------------------- @@ -39,16 +43,22 @@ int verify(int size, std::vector A, std::vector B, std::vector C, std::vector D, int verbosity) { int errors = 0; for (uint32_t i = 0; i < size; i++) { - T ref = A[i] * B[i] + C[i]; - if (!test_utils::nearly_equal(ref, D[i], 0.002)) { + const float a = test_utils::bfloat16_to_float(A[i]); + const float b = test_utils::bfloat16_to_float(B[i]); + const float c = test_utils::bfloat16_to_float(C[i]); + const float actual = test_utils::bfloat16_to_float(D[i]); + const auto product_bf16 = test_utils::bfloat16_mul(A[i], B[i]); + const auto ref_bf16 = test_utils::bfloat16_add(product_bf16, C[i]); + const float ref = test_utils::bfloat16_to_float(ref_bf16); + if (!test_utils::nearly_equal(ref, actual, 0.002)) { if (verbosity >= 1) { - std::cout << "Error in output " << D[i] << " != " << ref << " from " - << A[i] << " * " << B[i] << " + " << C[i] << std::endl; + std::cout << "Error in output " << actual << " != " << ref << " from " + << a << " * " << b << " + " << c << std::endl; } errors++; } else { if (verbosity >= 1) - std::cout << "Correct output " << D[i] << " == " << ref << std::endl; + std::cout << "Correct output " << actual << " == " << ref << std::endl; } } return errors; @@ -167,21 +177,21 @@ int main(int argc, const char *argv[]) { INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); std::vector AVec(INOUT0_VOLUME); for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = INOUT0_DATATYPE(4.0f); + AVec[i] = test_utils::bfloat16_from_float(4.0f); memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); // Initialize Inout buffer 1 INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); std::vector BVec(INOUT1_VOLUME); for (int i = 0; i < INOUT1_VOLUME; i++) - BVec[i] = INOUT1_DATATYPE(3.35f); + BVec[i] = test_utils::bfloat16_from_float(3.35f); memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); // Initialize Inout buffer 2 INOUT1_DATATYPE *bufInOut2 = bo_inout2.map(); std::vector CVec(INOUT2_VOLUME); for (int i = 0; i < INOUT2_VOLUME; i++) - CVec[i] = INOUT2_DATATYPE(0.77f); + CVec[i] = test_utils::bfloat16_from_float(0.77f); memcpy(bufInOut2, CVec.data(), (CVec.size() * sizeof(INOUT2_DATATYPE))); // Initialize Inout buffer 3 diff --git a/programming_examples/ml/silu/test.cpp b/programming_examples/ml/silu/test.cpp index 84b9fa07680..44635c384f5 100644 --- a/programming_examples/ml/silu/test.cpp +++ b/programming_examples/ml/silu/test.cpp @@ -9,8 +9,12 @@ //===----------------------------------------------------------------------===// #include "cxxopts.hpp" +#include +#include +#include #include #include +#include #include #include #include @@ -23,16 +27,18 @@ #include "test_utils.h" -// Silu reference implementation -std::bfloat16_t silu_bf16(std::bfloat16_t &input) { - // Compute tanh approximation - std::bfloat16_t half_x = input * std::bfloat16_t(0.5f); - std::bfloat16_t tanh_half_x = std::tanh(half_x); - std::bfloat16_t sigmoid_approx = - std::bfloat16_t(0.5f) * (tanh_half_x + std::bfloat16_t(1.0f)); - - // Compute output: x * tanh_approx - return input * sigmoid_approx; +// SiLU reference implementation. +test_utils::bfloat16_t silu_bf16(test_utils::bfloat16_t input) { + const test_utils::bfloat16_t k0_5 = test_utils::bfloat16_from_float(0.5f); + const test_utils::bfloat16_t k1 = test_utils::bfloat16_from_float(1.0f); + + const test_utils::bfloat16_t half_x = test_utils::bfloat16_mul(input, k0_5); + const test_utils::bfloat16_t tanh_half_x = test_utils::bfloat16_tanh(half_x); + const test_utils::bfloat16_t tanh_half_x_approx = + test_utils::bfloat16_add(tanh_half_x, k1); + const test_utils::bfloat16_t sigmoid_approx = + test_utils::bfloat16_mul(tanh_half_x_approx, k0_5); + return test_utils::bfloat16_mul(input, sigmoid_approx); } int main(int argc, const char *argv[]) { @@ -49,7 +55,7 @@ int main(int argc, const char *argv[]) { "instr,i", "path of file containing userspace instructions to be sent to the LX6", cxxopts::value())( - "length,l", "the length of the transfer in std::bfloat16_t", + "length,l", "the length of the transfer in bfloat16 elements", cxxopts::value()->default_value("4096")); try { @@ -129,19 +135,21 @@ int main(int argc, const char *argv[]) { auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; - std::bfloat16_t *bufInA = bo_inA.map(); - std::vector srcVecA; + test_utils::bfloat16_t *bufInA = bo_inA.map(); + std::vector srcVecA; for (int i = 0; i < N; i++) - srcVecA.push_back(std::bfloat16_t(i * 0.05f + -3.0f)); // Example data - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t))); + srcVecA.push_back( + test_utils::bfloat16_from_float(i * 0.05f + -3.0f)); // Example data + memcpy(bufInA, srcVecA.data(), + (srcVecA.size() * sizeof(test_utils::bfloat16_t))); void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); @@ -169,24 +177,27 @@ int main(int argc, const char *argv[]) { std::cout << "Latency (us): " << npu_time << std::endl; std::cout << std::endl; - double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output + double total_bytes = + 2.0 * N * sizeof(test_utils::bfloat16_t); // input and output double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9; std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; - std::bfloat16_t *bufOut = bo_out.map(); + test_utils::bfloat16_t *bufOut = bo_out.map(); int errors = 0; for (int i = 0; i < N; i++) { - std::bfloat16_t ref = silu_bf16(srcVecA[i]); - if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.04)) { + const test_utils::bfloat16_t ref = silu_bf16(srcVecA[i]); + const float expected = test_utils::bfloat16_to_float(ref); + const float actual = test_utils::bfloat16_to_float(*(bufOut + i)); + if (!test_utils::nearly_equal(actual, expected, 0.04)) { errors++; // Print the first 100 mismatches if (errors <= 100) { std::cout << "Mismatch at index " << i << ": " - << "Expected: " << ref << ", " - << "Got: " << *(bufOut + i) << std::endl; + << "Expected: " << expected << ", " + << "Got: " << actual << std::endl; } } } diff --git a/programming_examples/ml/softmax/test.cpp b/programming_examples/ml/softmax/test.cpp index 634d44110f2..c05201270bf 100644 --- a/programming_examples/ml/softmax/test.cpp +++ b/programming_examples/ml/softmax/test.cpp @@ -8,12 +8,15 @@ // //===----------------------------------------------------------------------===// -#include +#include +#include #include +#include +#include #include #include #include -#include +#include #include #include "xrt/xrt_bo.h" @@ -25,52 +28,55 @@ #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -using INOUT0_DATATYPE = std::bfloat16_t; -using INOUT1_DATATYPE = std::bfloat16_t; +using INOUT0_DATATYPE = test_utils::bfloat16_t; +using INOUT1_DATATYPE = test_utils::bfloat16_t; #endif // ---------------------------------------------------------------------------- // Verify results (specific to our design example) // ---------------------------------------------------------------------------- -template -int verify(int size, int tile_size, std::vector A, std::vector B, - int verbosity) { +static int verify(int size, int tile_size, + const std::vector &A, + const std::vector &B, int verbosity) { int errors = 0; - T max_val = A[0]; - std::vector RefVec(size); + float max_val = test_utils::bfloat16_to_float(A[0]); + std::vector RefVec(size); for (uint32_t i = 1; i < A.size(); i++) { - A[i] = (T)(A[i]); - T val = A[i]; + const float val = test_utils::bfloat16_to_float(A[i]); if (val > max_val) { max_val = val; } } for (uint32_t t = 0; t < size; t += tile_size) { - float running = 0.0; + float running = 0.0f; for (uint32_t i = 0; i < tile_size; i++) { - float ez = (float)(exp(A[t + i] - max_val)); - running += ez; - RefVec[t + i] = (T)exp(A[t + i] - max_val); + const float input_value = test_utils::bfloat16_to_float(A[t + i]); + const float exp_value = std::exp(input_value - max_val); + running += exp_value; + RefVec[t + i] = test_utils::bfloat16_from_float(exp_value); } + const INOUT1_DATATYPE running_bf16 = + test_utils::bfloat16_from_float(running); for (uint32_t i = 0; i < tile_size; i++) { - RefVec[t + i] /= (T)running; + RefVec[t + i] = test_utils::bfloat16_div(RefVec[t + i], running_bf16); } } for (uint32_t i = 0; i < size; i++) { - - if (!test_utils::nearly_equal(RefVec[i], B[i], 0.04, 0.001)) { + const float expected = test_utils::bfloat16_to_float(RefVec[i]); + const float actual = test_utils::bfloat16_to_float(B[i]); + if (!test_utils::nearly_equal(actual, expected, 0.04, 0.001)) { if (verbosity >= 1) { - std::cout << "Error in output " << B[i] << " != " << RefVec[i] + std::cout << "Error in output " << actual << " != " << expected << std::endl; } errors++; } else if (verbosity >= 1) { - std::cout << "Correct output " << B[i] << " == " << RefVec[i] + std::cout << "Correct output " << actual << " == " << expected << std::endl; } } @@ -153,12 +159,14 @@ int main(int argc, const char *argv[]) { for (int i = 0; i < INOUT0_VOLUME; i++) { if (dev == 1) { // NPU1: Use bfloat16 values in range [4.0, 4.0] - AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)8.0, - (std::bfloat16_t)-4.0); + AVec[i] = + test_utils::random_bfloat16_t(test_utils::bfloat16_from_float(8.0f), + test_utils::bfloat16_from_float(-4.0f)); } else if (dev == 2) { // NPU2: Use bfloat16 values in range [-512.0, 512.0] - AVec[i] = test_utils::random_bfloat16_t((std::bfloat16_t)1024.0, - (std::bfloat16_t)-512.0); + AVec[i] = test_utils::random_bfloat16_t( + test_utils::bfloat16_from_float(1024.0f), + test_utils::bfloat16_from_float(-512.0f)); } } memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); diff --git a/programming_examples/ml/swiglu/test.cpp b/programming_examples/ml/swiglu/test.cpp index 3a58c66841f..c7aeaebdcda 100644 --- a/programming_examples/ml/swiglu/test.cpp +++ b/programming_examples/ml/swiglu/test.cpp @@ -9,8 +9,12 @@ //===----------------------------------------------------------------------===// #include "cxxopts.hpp" +#include +#include +#include #include #include +#include #include #include #include @@ -23,29 +27,27 @@ #include "test_utils.h" -// Silu reference implementation -std::bfloat16_t silu_bf16(std::bfloat16_t &input) { - // Compute tanh approximation - std::bfloat16_t half_x = input * std::bfloat16_t(0.5f); - std::bfloat16_t tanh_half_x = std::tanh(half_x); - std::bfloat16_t sigmoid_approx = - std::bfloat16_t(0.5f) * (tanh_half_x + std::bfloat16_t(1.0f)); - - // Compute output: x * tanh_approx - return input * sigmoid_approx; -} - -// SwiGLU reference implementation -std::bfloat16_t swiglu_bf16(std::bfloat16_t &input, std::bfloat16_t &w1, - std::bfloat16_t &w2) { - // Compute the first part: x * w1 - std::bfloat16_t x_w1 = input * w1; - // Compute the second part: x * w2 - std::bfloat16_t x_w2 = input * w2; - // Apply the silu activation function to the second part - std::bfloat16_t silu_output = silu_bf16(x_w2); - // Compute the final output: x * w1 * silu_output - return x_w1 * silu_output; +// SwiGLU reference implementation. +test_utils::bfloat16_t swiglu_bf16(test_utils::bfloat16_t input, + test_utils::bfloat16_t w1, + test_utils::bfloat16_t w2) { + const test_utils::bfloat16_t k0_5 = test_utils::bfloat16_from_float(0.5f); + const test_utils::bfloat16_t k1 = test_utils::bfloat16_from_float(1.0f); + + const test_utils::bfloat16_t mul_input_weight_1 = + test_utils::bfloat16_mul(input, w1); + const test_utils::bfloat16_t mul_input_weight_2 = + test_utils::bfloat16_mul(input, w2); + const test_utils::bfloat16_t half_x = + test_utils::bfloat16_mul(mul_input_weight_2, k0_5); + const test_utils::bfloat16_t tanh_half_x = test_utils::bfloat16_tanh(half_x); + const test_utils::bfloat16_t tanh_half_x_approx = + test_utils::bfloat16_add(tanh_half_x, k1); + const test_utils::bfloat16_t sigmoid_approx = + test_utils::bfloat16_mul(tanh_half_x_approx, k0_5); + const test_utils::bfloat16_t silu_output = + test_utils::bfloat16_mul(mul_input_weight_2, sigmoid_approx); + return test_utils::bfloat16_mul(mul_input_weight_1, silu_output); } int main(int argc, const char *argv[]) { @@ -62,7 +64,7 @@ int main(int argc, const char *argv[]) { "instr,i", "path of file containing userspace instructions to be sent to the LX6", cxxopts::value())( - "length,l", "the length of the transfer in std::bfloat16_t", + "length,l", "the length of the transfer in bfloat16 elements", cxxopts::value()->default_value("4096")); try { @@ -142,31 +144,33 @@ int main(int argc, const char *argv[]) { auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); - auto bo_inA = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_inA = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_weights = xrt::bo(device, 2 * N * sizeof(std::bfloat16_t), + auto bo_weights = xrt::bo(device, 2 * N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - auto bo_out = xrt::bo(device, N * sizeof(std::bfloat16_t), + auto bo_out = xrt::bo(device, N * sizeof(test_utils::bfloat16_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); if (verbosity >= 1) std::cout << "Writing data into buffer objects." << std::endl; - std::bfloat16_t *bufInA = bo_inA.map(); - std::vector srcVecA; + test_utils::bfloat16_t *bufInA = bo_inA.map(); + std::vector srcVecA; for (int i = 0; i < N; i++) - srcVecA.push_back(std::bfloat16_t(i * 0.05f + -1.0f)); // Example data - memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(std::bfloat16_t))); + srcVecA.push_back( + test_utils::bfloat16_from_float(i * 0.05f + -1.0f)); // Example data + memcpy(bufInA, srcVecA.data(), + (srcVecA.size() * sizeof(test_utils::bfloat16_t))); // Generate the W1 and W2 weights - std::vector srcVecW1; - std::vector srcVecW2; + std::vector srcVecW1; + std::vector srcVecW2; for (int i = 0; i < N; i++) { // Example weights, can be replaced with actual model weights - srcVecW1.push_back(std::bfloat16_t(0.1f * (i % 10) + 0.1f)); - srcVecW2.push_back(std::bfloat16_t(0.2f * (i % 20) + 0.2f)); + srcVecW1.push_back(test_utils::bfloat16_from_float(0.1f * (i % 10) + 0.1f)); + srcVecW2.push_back(test_utils::bfloat16_from_float(0.2f * (i % 20) + 0.2f)); } - std::vector srcVecWeights; + std::vector srcVecWeights; // Interleave the weights into one vector in 1024 elements chunks // of each W1 and W2 for (int i = 0; i < N; i += 1024) { @@ -179,9 +183,9 @@ int main(int argc, const char *argv[]) { } // Write the weights to the buffer object - auto bufWeights = bo_weights.map(); + auto bufWeights = bo_weights.map(); memcpy(bufWeights, srcVecWeights.data(), - srcVecWeights.size() * sizeof(std::bfloat16_t)); + srcVecWeights.size() * sizeof(test_utils::bfloat16_t)); void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); @@ -212,24 +216,28 @@ int main(int argc, const char *argv[]) { std::cout << "Latency (us): " << npu_time << std::endl; std::cout << std::endl; - double total_bytes = 2.0 * N * sizeof(std::bfloat16_t); // input and output + double total_bytes = + 2.0 * N * sizeof(test_utils::bfloat16_t); // input and output double bandwidth_GBps = total_bytes / (npu_time * 1e-6) / 1e9; std::cout << "Effective Bandwidth: " << bandwidth_GBps << " GB/s" << std::endl; - std::bfloat16_t *bufOut = bo_out.map(); + test_utils::bfloat16_t *bufOut = bo_out.map(); int errors = 0; for (int i = 0; i < N; i++) { - std::bfloat16_t ref = swiglu_bf16(srcVecA[i], srcVecW1[i], srcVecW2[i]); - if (!test_utils::nearly_equal(*(bufOut + i), ref, 0.05f)) { + const test_utils::bfloat16_t ref = + swiglu_bf16(srcVecA[i], srcVecW1[i], srcVecW2[i]); + const float expected = test_utils::bfloat16_to_float(ref); + const float actual = test_utils::bfloat16_to_float(*(bufOut + i)); + if (!test_utils::nearly_equal(actual, expected, 0.05f)) { errors++; // Print the first 100 mismatches if (errors <= 100) { std::cout << "Mismatch at index " << i << ": " - << "Expected: " << ref << ", " - << "Got: " << *(bufOut + i) << std::endl; + << "Expected: " << expected << ", " + << "Got: " << actual << std::endl; } } } diff --git a/runtime_lib/test_lib/test_utils.h b/runtime_lib/test_lib/test_utils.h index cf4eca0fe21..ca0be709013 100644 --- a/runtime_lib/test_lib/test_utils.h +++ b/runtime_lib/test_lib/test_utils.h @@ -14,9 +14,11 @@ #define _TEST_UTILS_H_ #include "cxxopts.hpp" +#include #include #include #include +#include #include #include #include @@ -57,16 +59,82 @@ static inline std::int32_t random_int32_t(int32_t range = 0x10000) { return (std::int32_t)rand() % range; } +// The Linux toolchain has std::bfloat16_t. MSVC does not. +// +// Use this host-side helper for bfloat16 XRT buffers and reference checks. +// Device code should use the AIE bfloat16 types and APIs. #if defined(__STDCPP_BFLOAT16_T__) -static inline std::bfloat16_t random_bfloat16_t(std::bfloat16_t scale, - std::bfloat16_t bias) { - return std::bfloat16_t((scale * (float)rand() / (float)(RAND_MAX)) + bias); +using bfloat16_t = std::bfloat16_t; + +static inline bfloat16_t bfloat16_from_float(float value) { + return bfloat16_t(value); +} + +static inline bfloat16_t bfloat16_from_bits(std::uint16_t bits) { + bfloat16_t value; + std::memcpy(&value, &bits, sizeof(value)); + return value; +} + +static inline float bfloat16_to_float(bfloat16_t value) { + return static_cast(value); +} +#else +using bfloat16_t = std::uint16_t; + +static inline bfloat16_t bfloat16_from_bits(std::uint16_t bits) { return bits; } + +static inline float bfloat16_to_float(bfloat16_t bits) { + const std::uint32_t expanded_bits = static_cast(bits) << 16; + float value = 0.0f; + std::memcpy(&value, &expanded_bits, sizeof(value)); + return value; +} + +static inline bfloat16_t bfloat16_from_float(float value) { + std::uint32_t bits = 0; + std::memcpy(&bits, &value, sizeof(bits)); + + // Round to nearest-even instead of truncating. + const std::uint32_t lsb = (bits >> 16) & 1U; + const std::uint32_t rounding_bias = 0x7FFFU + lsb; + return static_cast((bits + rounding_bias) >> 16); } #endif +static inline bfloat16_t random_bfloat16_t(bfloat16_t scale, bfloat16_t bias) { + const float scale_value = bfloat16_to_float(scale); + const float bias_value = bfloat16_to_float(bias); + return bfloat16_from_float((scale_value * (float)rand() / (float)(RAND_MAX)) + + bias_value); +} + bool nearly_equal(float a, float b, float epsilon = 128 * FLT_EPSILON, float abs_th = FLT_MIN); +static inline bool nearly_equal_bfloat16(bfloat16_t a, bfloat16_t b, + float epsilon = 128 * FLT_EPSILON, + float abs_th = FLT_MIN) { + return nearly_equal(bfloat16_to_float(a), bfloat16_to_float(b), epsilon, + abs_th); +} + +static inline bfloat16_t bfloat16_add(bfloat16_t lhs, bfloat16_t rhs) { + return bfloat16_from_float(bfloat16_to_float(lhs) + bfloat16_to_float(rhs)); +} + +static inline bfloat16_t bfloat16_mul(bfloat16_t lhs, bfloat16_t rhs) { + return bfloat16_from_float(bfloat16_to_float(lhs) * bfloat16_to_float(rhs)); +} + +static inline bfloat16_t bfloat16_div(bfloat16_t lhs, bfloat16_t rhs) { + return bfloat16_from_float(bfloat16_to_float(lhs) / bfloat16_to_float(rhs)); +} + +static inline bfloat16_t bfloat16_tanh(bfloat16_t value) { + return bfloat16_from_float(std::tanh(bfloat16_to_float(value))); +} + template void print_matrix(const std::vector matrix, int n_cols, int n_printable_rows = 10, int n_printable_cols = 10,