Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 84 additions & 27 deletions programming_examples/basic/matrix_multiplication/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <iostream>
#include <optional>
#include <ostream>
#include <stdfloat>
#include <type_traits>

#include "test_utils.h"

Expand Down Expand Up @@ -92,6 +92,52 @@ void parse_options(int argc, const char *argv[], cxxopts::Options &options,
template <typename T>
static inline T get_random();

// Returns `value` in a form that can be used directly in arithmetic
// expressions. A test_utils::bfloat16_t is passed through
// test_utils::bfloat16_to_float; any other type is returned unchanged.
template <typename T>
static inline auto scalar_to_arithmetic(T value) {
  constexpr bool is_bf16 = std::is_same_v<T, test_utils::bfloat16_t>;
  // Both branches must stay inside the `if constexpr`/`else` pair: the
  // deduced return type differs between them, so only one may be instantiated.
  if constexpr (!is_bf16) {
    return value;
  } else {
    return test_utils::bfloat16_to_float(value);
  }
}

// Converts any supported scalar to `float`: the value is first mapped through
// scalar_to_arithmetic() and the result is then cast to float.
template <typename T>
static inline float scalar_to_float(T value) {
  const auto arithmetic = scalar_to_arithmetic(value);
  return static_cast<float>(arithmetic);
}

// Converts an accumulator value of type `Tacc` into the destination type `T`.
// The accumulator is first mapped through scalar_to_arithmetic(). When `T` is
// test_utils::bfloat16_t the result is built with
// test_utils::bfloat16_from_float from a float; otherwise it is a plain
// static_cast to `T`.
template <typename T, typename Tacc>
static inline T scalar_from_accum(Tacc value) {
  constexpr bool to_bf16 = std::is_same_v<T, test_utils::bfloat16_t>;
  const auto plain = scalar_to_arithmetic(value);
  if constexpr (!to_bf16) {
    return static_cast<T>(plain);
  } else {
    const float as_float = static_cast<float>(plain);
    return test_utils::bfloat16_from_float(as_float);
  }
}

// Produces the zero value used to initialise an accumulator of type `Tacc`.
// test_utils::bfloat16_t is built from 0.0f via
// test_utils::bfloat16_from_float; every other type is constructed from the
// integer literal 0.
template <typename Tacc>
static inline Tacc zero_accum() {
  constexpr bool acc_is_bf16 = std::is_same_v<Tacc, test_utils::bfloat16_t>;
  if constexpr (!acc_is_bf16) {
    return Tacc(0);
  } else {
    return test_utils::bfloat16_from_float(0.0f);
  }
}

// Returns `running_sum + lhs * rhs`, i.e. one multiply-accumulate step.
// The operands are mapped through scalar_to_arithmetic() before being
// multiplied. When `Tacc` is test_utils::bfloat16_t the product is converted
// to bfloat16 (via float) and added with test_utils::bfloat16_add; otherwise
// the product is converted to `Tacc` and added with the built-in `+`.
template <typename Tacc, typename Tin>
static inline Tacc accum_add_product(Tacc running_sum, Tin lhs, Tin rhs) {
  constexpr bool acc_is_bf16 = std::is_same_v<Tacc, test_utils::bfloat16_t>;
  const auto a = scalar_to_arithmetic(lhs);
  const auto b = scalar_to_arithmetic(rhs);
  const auto product = a * b;
  if constexpr (!acc_is_bf16) {
    return running_sum + Tacc(product);
  } else {
    const float product_f32 = static_cast<float>(product);
    return test_utils::bfloat16_add(
        running_sum, test_utils::bfloat16_from_float(product_f32));
  }
}

template <>
std::int16_t get_random<std::int16_t>() {
return (std::int16_t)rand() % 0x10000;
Expand All @@ -103,10 +149,11 @@ int8_t get_random<int8_t>() {
}

template <>
std::bfloat16_t get_random<std::bfloat16_t>() {
test_utils::bfloat16_t get_random<test_utils::bfloat16_t>() {
// Random numbers should NOT be uniformly between 0 and 1, because that
// would make the matrix product AB always close to 1.
return std::bfloat16_t(4.0 * (float)rand() / (float)(RAND_MAX));
return test_utils::bfloat16_from_float(4.0f * (float)rand() /
(float)(RAND_MAX));
}

template <typename Tin, typename Tout, typename Tacc>
Expand All @@ -115,18 +162,20 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
int c_col_maj) {
for (int row = 0; row < M; row++) {
for (int col = 0; col < N; col++) {
Tacc running_sum = 0;
Tacc running_sum = zero_accum<Tacc>();
for (int k = 0; k < K; k++) {
if (!b_col_maj) {
running_sum += Tacc(A[row * K + k] * B[k * N + col]);
running_sum = accum_add_product<Tacc>(running_sum, A[row * K + k],
B[k * N + col]);
} else {
running_sum += Tacc(A[row * K + k] * B[k + col * K]);
running_sum = accum_add_product<Tacc>(running_sum, A[row * K + k],
B[k + col * K]);
}
}
if (!c_col_maj) {
C[row * N + col] = Tout(running_sum);
C[row * N + col] = scalar_from_accum<Tout>(running_sum);
} else {
C[row + col * M] = Tout(running_sum);
C[row + col * M] = scalar_from_accum<Tout>(running_sum);
}
}
}
Expand All @@ -135,15 +184,17 @@ void matmul(int M, int N, int K, const std::vector<Tin> A,
template <typename Tin, typename Tout, typename Tacc>
Tout mul_acc(int M, int N, int K, int row, int col, const std::vector<Tin> A,
const std::vector<Tin> B, int b_col_maj) {
Tacc running_sum = 0;
Tacc running_sum = zero_accum<Tacc>();
for (int k = 0; k < K; k++) {
if (!b_col_maj) {
running_sum += Tacc(A[row * K + k] * B[k * N + col]);
running_sum =
accum_add_product<Tacc>(running_sum, A[row * K + k], B[k * N + col]);
} else {
running_sum += Tacc(A[row * K + k] * B[k + col * K]);
running_sum =
accum_add_product<Tacc>(running_sum, A[row * K + k], B[k + col * K]);
}
}
return (Tout)running_sum;
return scalar_from_accum<Tout>(running_sum);
}

// nearly_equal function adapted from Stack Overflow, License CC BY-SA 4.0
Expand Down Expand Up @@ -184,7 +235,7 @@ float get_abs_tol<std::int32_t>() {
}

template <>
float get_abs_tol<std::bfloat16_t>() {
float get_abs_tol<test_utils::bfloat16_t>() {
return 0.5;
}

Expand All @@ -209,7 +260,7 @@ float get_rel_tol<std::int32_t>() {
}

template <>
float get_rel_tol<std::bfloat16_t>() {
float get_rel_tol<test_utils::bfloat16_t>() {
return 0.05;
}

Expand Down Expand Up @@ -314,8 +365,9 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual,
float abs_tol, float rel_tol) {
bool match = expected == actual;
if (abs_tol > 0 || rel_tol > 0) {
// Allow for some tolerance for float data types
match = nearly_equal(expected, actual, rel_tol, abs_tol);
// Allow for some tolerance for float and host-side bfloat16 data types.
match = nearly_equal(scalar_to_float(expected), scalar_to_float(actual),
rel_tol, abs_tol);
}
if (!match) {
return (struct error<Tout>){row, col, expected, actual};
Expand All @@ -326,12 +378,13 @@ verify_single(std::ostream &os, int row, int col, Tout expected, Tout actual,
template <typename Tout>
void print_error_summary(std::ostream &os, int n_errors,
std::vector<struct error<Tout>> &errors,
Tout max_rel_error) {
float max_rel_error) {
for (struct error<Tout> &err : errors) {
os << "[" << std::setw(5) << err.row << ", " << std::setw(5) << err.col
<< "] " << std::setw(4) << std::setprecision(2) << std::fixed
<< (float)err.actual << " =!= " << std::setw(4) << std::setprecision(2)
<< std::fixed << (float)err.expected << std::endl;
<< scalar_to_float(err.actual) << " =!= " << std::setw(4)
<< std::setprecision(2) << std::fixed << scalar_to_float(err.expected)
<< std::endl;
}
if (n_errors > max_printable_errors) {
os << "...and " << std::setw(0) << n_errors - max_printable_errors
Expand All @@ -357,7 +410,7 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
float rel_tol = 0.05, int b_col_maj = 0, int c_col_maj = 0) {
int n_errors = 0;
std::vector<struct error<Tout>> errors;
Tout max_rel_error = (Tout)0.0f;
float max_rel_error = 0.0f;
struct error<Tout> max_error;

std::vector<Tout> CRef(M * N);
Expand All @@ -372,9 +425,11 @@ int verify(int M, int N, int K, std::vector<Tin> A, std::vector<Tin> B,
if (n_errors < max_printable_errors) {
errors.push_back(*error);
}
Tout rel_error =
std::abs(error->actual - error->expected) /
std::max(std::abs(error->actual), std::abs(error->expected));
float actual_value = scalar_to_float(error->actual);
float expected_value = scalar_to_float(error->expected);
float rel_error =
std::abs(actual_value - expected_value) /
std::max(std::abs(actual_value), std::abs(expected_value));
if (rel_error > max_rel_error) {
max_rel_error = rel_error;
max_error = *error;
Expand Down Expand Up @@ -414,7 +469,7 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,

int n_errors = 0;
std::vector<struct error<Tout>> errors;
Tout max_rel_error = (Tout)0.0f;
float max_rel_error = 0.0f;
double progress = 0;
for (std::tuple<size_t, std::tuple<int &, int &>> cell :
std::views::enumerate(std::views::zip(sampled_rows, sampled_cols))) {
Expand All @@ -440,9 +495,11 @@ int verify_stochastic(int M, int N, int K, std::vector<Tin> A,
if (n_errors < max_printable_errors) {
errors.push_back(*error);
}
Tout rel_error =
std::abs(error->actual - error->expected) /
std::max(std::abs(error->actual), std::abs(error->expected));
float actual_value = scalar_to_float(error->actual);
float expected_value = scalar_to_float(error->expected);
float rel_error =
std::abs(actual_value - expected_value) /
std::max(std::abs(actual_value), std::abs(expected_value));
if (rel_error > max_rel_error) {
max_rel_error = rel_error;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@ devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu)
colshift ?= $(if $(filter npu,$(devicename)),1,0)

ifeq ($(dtype_in),bf16)
dtype_in_cpp=std::bfloat16_t
dtype_in_cpp=test_utils::bfloat16_t
endif
ifeq ($(dtype_out),bf16)
dtype_out_cpp=std::bfloat16_t
dtype_out_cpp=test_utils::bfloat16_t
dtype_acc_cpp=float
endif
ifeq ($(dtype_in),i16)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,11 @@
//
//===----------------------------------------------------------------------===//

#include <stdfloat>
#include <stdint.h>

#define DATATYPES_USING_DEFINED
using A_DATATYPE = int16_t; // std::bfloat16_t;
using B_DATATYPE = int16_t; // std::bfloat16_t;
using A_DATATYPE = int16_t;
using B_DATATYPE = int16_t;
using C_DATATYPE = int32_t; // float;
using ACC_DATATYPE = int32_t;

Expand Down
5 changes: 2 additions & 3 deletions programming_examples/basic/matrix_multiplication/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <iomanip>
#include <iostream>
#include <sstream>
#include <stdfloat>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
Expand All @@ -29,10 +28,10 @@
#ifndef DATATYPES_USING_DEFINED
#define DATATYPES_USING_DEFINED
#ifndef DTYPE_IN
#define DTYPE_IN std::bfloat16_t
#define DTYPE_IN test_utils::bfloat16_t
#endif
#ifndef DTYPE_OUT
#define DTYPE_OUT std::bfloat16_t
#define DTYPE_OUT test_utils::bfloat16_t
#endif
#ifndef DTYPE_ACC
#define DTYPE_ACC float
Expand Down
32 changes: 19 additions & 13 deletions programming_examples/basic/vector_exp/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,13 @@
//===----------------------------------------------------------------------===//

#include "cxxopts.hpp"
#include <bits/stdc++.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
Expand All @@ -26,8 +30,8 @@

#ifndef DATATYPES_USING_DEFINED
#define DATATYPES_USING_DEFINED
using INOUT0_DATATYPE = std::bfloat16_t;
using INOUT1_DATATYPE = std::bfloat16_t;
using INOUT0_DATATYPE = test_utils::bfloat16_t;
using INOUT1_DATATYPE = test_utils::bfloat16_t;
#endif

// ----------------------------------------------------------------------------
Expand All @@ -37,24 +41,27 @@ template <typename T>
int verify(int CSize, std::vector<T> A, std::vector<T> C, int verbosity) {
int errors = 0;
for (uint32_t i = 0; i < CSize; i++) {
std::bfloat16_t ref = exp(A[i]);
const float input = test_utils::bfloat16_to_float(A[i]);
const float actual = test_utils::bfloat16_to_float(C[i]);
const auto ref_bf16 = test_utils::bfloat16_from_float(std::exp(input));
const float ref = test_utils::bfloat16_to_float(ref_bf16);
// Let's check if they are inf or nan, and if so just pass because
// comparisons will then fail, even for matches
if (std::isinf(ref) || std::isinf(C[i]))
if (std::isinf(ref) || std::isinf(actual))
break;
if (std::isnan(ref) || std::isnan(C[i]))
if (std::isnan(ref) || std::isnan(actual))
break;
if (!test_utils::nearly_equal(ref, C[i], 0.128)) {
if (!test_utils::nearly_equal(ref, actual, 0.128)) {
if (errors < 100) {
std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
std::cout << "Error in output " << actual << " != " << ref << std::endl;
} else if (errors == 100) {
std::cout << "..." << std::endl;
std::cout << "[Errors truncated]" << std::endl;
}
errors++;
} else {
if (verbosity > 1)
std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
std::cout << "Correct output " << actual << " == " << ref << std::endl;
}
}
return errors;
Expand Down Expand Up @@ -161,9 +168,8 @@ int main(int argc, const char *argv[]) {
INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
for (int i = 0; i < INOUT0_VOLUME; i++) {
std::uint16_t u16 = (std::uint16_t)i;
std::bfloat16_t bf16 = *(std::bfloat16_t *)&u16;
AVec[i] = bf16;
const std::uint16_t bits = static_cast<std::uint16_t>(i);
AVec[i] = test_utils::bfloat16_from_bits(bits);
}
memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));

Expand Down Expand Up @@ -203,7 +209,7 @@ int main(int argc, const char *argv[]) {
/* Warmup iterations do not count towards average runtime. */
continue;
}
std::bfloat16_t *bufOut = bo_inout1.map<std::bfloat16_t *>();
INOUT1_DATATYPE *bufOut = bo_inout1.map<INOUT1_DATATYPE *>();

// Copy output results and verify they are correct
std::vector<INOUT1_DATATYPE> CVec(INOUT1_VOLUME);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ else
endif

ifeq ($(dtype),bf16)
dtype_cpp=std::bfloat16_t
dtype_cpp=test_utils::bfloat16_t
endif
ifeq ($(dtype),i32)
dtype_cpp=std::int32_t
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ aie_py_src=${targetname}.py
endif

ifeq ($(dtype),bf16)
dtype_cpp=std::bfloat16_t
dtype_cpp=test_utils::bfloat16_t
endif
ifeq ($(dtype),i32)
dtype_cpp=std::int32_t
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ aie_py_src=${targetname}.py
endif

ifeq ($(dtype),bf16)
dtype_cpp=std::bfloat16_t
dtype_cpp=test_utils::bfloat16_t
endif
ifeq ($(dtype),i32)
dtype_cpp=std::int32_t
Expand Down
Loading