Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 65 additions & 21 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@

namespace nntrainer {

/**
 * @brief Integer division of @p a by @p b, rounded up toward positive infinity.
 * @param a dividend
 * @param b divisor; must be non-zero
 * @return smallest unsigned int m such that m * b >= a
 * @note a + b - 1 must not overflow unsigned int.
 */
static inline unsigned int ceil_div(unsigned int a, unsigned int b) {
  return (a + b - 1) / b;
}

/**
 * @brief Rounds @p a up to the nearest multiple of @p b.
 * @param a value to align
 * @param b alignment granularity; must be non-zero
 * @return @p a unchanged when already a multiple of @p b, otherwise the next
 *         multiple of @p b above @p a
 * @note a - a % b + b must not overflow unsigned int.
 */
static inline unsigned int align(unsigned int a, unsigned int b) {
  return (a % b == 0) ? a : a - a % b + b;
}

void gemv_int4_async_cl(std::vector<void *> weights,
std::vector<uint16_t *> scales, uint16_t *input,
std::vector<uint16_t *> outputs, unsigned int K,
Expand Down Expand Up @@ -104,6 +112,9 @@ void gemv_int4_async_cl(std::vector<void *> weights,
void gemv_int4_cl(char *weight, uint16_t *scale, uint16_t *input,
uint16_t *output, unsigned int K, unsigned int N,
unsigned int quantization_group_size) {
const auto N_GROUP_SIZE = 32; // due to input data format
const unsigned int alignN = align(N, N_GROUP_SIZE);

bool result = false;
auto *blas_cc =
static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
Expand Down Expand Up @@ -155,7 +166,7 @@ void gemv_int4_cl(char *weight, uint16_t *scale, uint16_t *input,
throw std::runtime_error(
"Failed to set kernel argument 5 for fully_connected_gpu_int4_gemv");

const int work_groups_count[3] = {(int)(N / 2), 1, 16};
const int work_groups_count[3] = {(int)(alignN / 2), 1, 16};
const int work_group_size[3] = {16, 1, 16};

result = blas_cc->command_queue_inst_.DispatchCommand(
Expand Down Expand Up @@ -433,14 +444,6 @@ void openvino_gemm_async_cl(float *input, std::vector<void *> weights,

bool result = false;

auto ceil_div = [](unsigned int a, unsigned int b) -> unsigned int {
return (a + b - 1) / b;
};

auto align = [](unsigned int a, unsigned int b) -> unsigned int {
return (a % b == 0) ? a : a - a % b + b;
};

// copy fp32 input to fp16
copy_fp32_u16(M * K, input, (uint16_t *)clbuffInstance.getSVMInput());

Expand Down Expand Up @@ -592,9 +595,54 @@ void openvino_sgemm_cl(float *input, char *weight, uint16_t *scale,
copy_u16_fp32(M * N, (uint16_t *)clbuffInstance.getSVMOutput(), output);
}

// TODO remove it
/**
 * @brief Allocates a shared-virtual-memory (SVM) region on the registered
 *        GPU OpenCL context.
 * @param size_bytes number of bytes to allocate
 * @return pointer to the allocated SVM region (never nullptr)
 * @throws std::runtime_error when the SVM allocation fails
 */
void *allocateSVM(size_t size_bytes) {
  auto *blas_cc = static_cast<nntrainer::ClContext *>(
    nntrainer::Engine::Global().getRegisteredContext("gpu"));

  void *ptr = blas_cc->context_inst_.createSVMRegion(size_bytes);

  if (ptr == nullptr) {
    // Grammar fix: "Failed to allocated" -> "Failed to allocate"
    throw std::runtime_error(
      "Failed to allocate SVM for the OpenCL BLAS unit test.");
  }

  return ptr;
}

// TODO remove it
/**
 * @brief Releases an SVM region previously obtained from allocateSVM().
 * @param ptr SVM region to release; may be nullptr (release is a no-op then,
 *        presumably — TODO confirm releaseSVMRegion's nullptr behavior)
 * @note The caller's pointer variable is NOT modified (ptr is passed by
 *       value); callers must not reuse the pointer after this call.
 */
void freeSVM(void *ptr) {
  auto *blas_cc = static_cast<nntrainer::ClContext *>(
    nntrainer::Engine::Global().getRegisteredContext("gpu"));

  blas_cc->context_inst_.releaseSVMRegion(ptr);
  // Removed the original `ptr = nullptr;` — a dead store to a by-value
  // parameter that had no effect on the caller and only suggested a
  // safety guarantee this function cannot provide.
}

void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
unsigned int M, unsigned int N, unsigned int K,
unsigned int quantization_group_size) {

int alignK = align(K, quantization_group_size);
const auto N_GROUP_SIZE = 32; // due to input data format
int alignN = align(N, N_GROUP_SIZE);
// Padding input data - TODO remove this and do this in kernel quantize_input
uint16_t *input_ptr;
if (alignK != K) {
uint32_t padded_input_size = M * alignK;
input_ptr = (uint16_t *)allocateSVM(padded_input_size * sizeof(uint16_t));
for (int y = 0; y < M; y++) {
for (int x = 0; x < K; x++) {
input_ptr[y * alignK + x] = ((uint16_t *)input)[y * K + x];
}
for (int x = K; x < alignK; x++) {
input_ptr[y * alignK + x] = compute_fp32_to_fp16(0.f);
}
}
} else {
input_ptr = (uint16_t *)input;
}

bool result = false;
auto *blas_cc =
static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
Expand All @@ -605,14 +653,6 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
" -D SIZE_QUANTIZATION_GROUP=" + std::to_string(quantization_group_size) +
" -D SCALE_ROW_MAJOR=" + std::to_string(scale_row_major);

auto ceil_div = [](unsigned int a, unsigned int b) -> unsigned int {
return (a + b - 1) / b;
};

auto align = [](unsigned int a, unsigned int b) -> unsigned int {
return (a % b == 0) ? a : a - a % b + b;
};

std::vector<cl_event> quantize_event(1);
{
ClContext::SharedPtrClKernel kernel_ptr = blas_cc->registerClKernel(
Expand All @@ -624,7 +664,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,

int arg = 0;

result = kernel_ptr->SetKernelSVMArguments(arg++, input);
result = kernel_ptr->SetKernelSVMArguments(arg++, input_ptr);

if (!result)
throw std::runtime_error("Failed to set kernel argument 0 for "
Expand All @@ -643,7 +683,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
"quantize_input");

const int work_groups_count[3] = {
(int)align((M * K) / quantization_group_size, 64), 1, 1};
(int)align((M * ceil_div(K, quantization_group_size)), 64), 1, 1};
const int work_group_size[3] = {64, 1, 1};

result = blas_cc->command_queue_inst_.DispatchCommand(
Expand All @@ -665,7 +705,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,

int arg = 0;

result = kernel_ptr->SetKernelSVMArguments(arg++, input);
result = kernel_ptr->SetKernelSVMArguments(arg++, input_ptr);

if (!result)
throw std::runtime_error(
Expand Down Expand Up @@ -703,7 +743,7 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
throw std::runtime_error(
"Failed to set kernel argument 6 for fc_bf_tiled_kernel_default");

const int work_groups_count[3] = {(int)(N / 2),
const int work_groups_count[3] = {(int)(alignN / 2),
(int)(align(ceil_div(M, 8), 8)), 1};
const int work_group_size[3] = {16, 8, 1};

Expand All @@ -723,6 +763,10 @@ void openvino_gemm_cl(void *input, void *weights, void *scales, void *output,
"Failed to read output data for fc_bf_tiled_kernel_default");
return;
}

if (alignK != K) {
freeSVM(input_ptr);
}
}

void sgemv_q6_k_cl(void *matAdata, float *vecXdata, float *vecYdata,
Expand Down
21 changes: 12 additions & 9 deletions nntrainer/tensor/cl_operations/cl_kernels/int4_gemv.cl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#define CAT(x, y) __CAT(x, y)

#define unroll_for __attribute__((opencl_unroll_hint)) for
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define CEIL_DIV(a, b) (((a) + (b)-1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
Expand Down Expand Up @@ -78,8 +78,8 @@
CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)

#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)( \
(const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))
BLOCK_READN_FUNC(type_size, vector_size) \
((const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))

#define BLOCK_READN(type, vector_size, ptr, offset) \
AS_TYPE( \
Expand Down Expand Up @@ -189,7 +189,9 @@ fully_connected_gpu_int4_gemv(__global half *input, const __global half *scales,
__global half *output,
const __global char *weights, const int WEIGHTS_K,
const int WEIGHTS_N) {
const int SCALE_GROUP_NUM = WEIGHTS_K / SIZE_QUANTIZATION_GROUP;
const int SCALE_GROUP_NUM = CEIL_DIV(WEIGHTS_K, SIZE_QUANTIZATION_GROUP);
int ALIGN_WEIGHTS_N = ALIGN(WEIGHTS_N, 32);
int ALIGN_WEIGHTS_K = ALIGN(WEIGHTS_K, SIZE_QUANTIZATION_GROUP);

int n = get_global_id(0) * 2; // N
int thr_id = get_local_id(2); // 0~15
Expand All @@ -212,18 +214,19 @@ fully_connected_gpu_int4_gemv(__global half *input, const __global half *scales,
float2 sum_all = 0;
for (int gk = gk0; gk < gk1; gk++) {
__global half *A = input + gk * DECOMPRESSION_GROUP_SIZE;
const __global char *B =
weights + get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n,
WEIGHTS_K, WEIGHTS_N, 32);
int w_id = get_4bit_weight_index(gk * DECOMPRESSION_GROUP_SIZE, n,
ALIGN_WEIGHTS_K, ALIGN_WEIGHTS_N, 32);

const __global char *B = weights + w_id;

GEMV_ACCUMULATOR_VEC_TYPE sum = 0;

#if SCALE_ROW_MAJOR
float scale_0 = convert_float(scales[gk]);
float scale_1 = convert_float(scales[gk + 16 * SCALE_GROUP_NUM]);
#else
float scale_0 = convert_float(scales[gk * WEIGHTS_N]);
float scale_1 = convert_float(scales[gk * WEIGHTS_N + 16]);
float scale_0 = convert_float(scales[gk * ALIGN_WEIGHTS_N]);
float scale_1 = convert_float(scales[gk * ALIGN_WEIGHTS_N + 16]);
#endif

__attribute__((opencl_unroll_hint(4))) for (int g = 0;
Expand Down
51 changes: 29 additions & 22 deletions nntrainer/tensor/cl_operations/cl_kernels/openvino_gemm.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,27 @@
#define COMPRESSED_WEIGHTS_INT4 1
#define FILTER_LAYOUT_OS_IS_YX_OSV32_ISV2 1

#define CEIL_DIV(a, b) (((a) + (b)-1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLAMP(v, l, u) MAX((l), MIN((v), (u)))

#define ALIGN_SIZE_K ALIGN(SIZE_K, SIZE_QUANTIZATION_GROUP)

#define DECOMPRESSION_SCALE_TERM 1
#define DECOMPRESSION_SCALE_GROUP_SIZE SIZE_QUANTIZATION_GROUP
#define DECOMPRESSION_SCALE_GROUPS_NUM (SIZE_K / DECOMPRESSION_SCALE_GROUP_SIZE)
#define DECOMPRESSION_SCALE_GROUPS_NUM \
CEIL_DIV(SIZE_K, DECOMPRESSION_SCALE_GROUP_SIZE)

#define DECOMPRESSION_SCALE_BATCH_NUM SIZE_N
#define TILE_IFM_ELEMENTS_SIZE 32
#define ALIGN_SIZE_N ALIGN(SIZE_N, TILE_IFM_ELEMENTS_SIZE)

#define DECOMPRESSION_SCALE_BATCH_NUM ALIGN_SIZE_N
#define DECOMPRESSION_SCALE_BATCH_PITCH DECOMPRESSION_SCALE_GROUPS_NUM
#define DECOMPRESSION_SCALE_FEATURE_PITCH 1
#define DECOMPRESSION_SCALE_LENGTH (SIZE_N * DECOMPRESSION_SCALE_GROUPS_NUM)
#define DECOMPRESSION_SCALE_LENGTH \
((ALIGN_SIZE_N) * (DECOMPRESSION_SCALE_GROUPS_NUM))

#define INPUT0_TYPE half
#define OUTPUT_TYPE half
Expand All @@ -21,7 +34,7 @@
#define INPUT0_OFFSET 0
#define OUTPUT_OFFSET 0

#define IFM_SIZE SIZE_K
#define IFM_SIZE ALIGN_SIZE_K

#define ACCUMULATOR_TYPE float
#define ACTIVATION_TYPE float
Expand Down Expand Up @@ -54,12 +67,11 @@
#define OUTER_OFM 1
#define DISPATCH_BSV 1
#define DISPATCH_FSV 1
#define TILE_IFM_ELEMENTS_SIZE 32
#define NUM_LOOP_IN_DYN_QUAN_GROUP (QUANTIZE_GROUP_SIZE / (TILE_IFM * SIMD))
#define REALIGN_FP16_OFFSET 0
#define TILE_OUT_F_NUM SIZE_N
#define TILE_OUT_F_PITCH 1
#define TILE_IN_B_PITCH SIZE_K
#define TILE_IN_B_PITCH ALIGN_SIZE_K
#define TILE_OUT_B_PITCH SIZE_N

#define ACTIVATION_FUNC_TYPED(input, params) (input)
Expand Down Expand Up @@ -153,11 +165,6 @@ inline int imad_SW(int acc, uchar4 input, uchar4 weight)
#endif

#define unroll_for __attribute__((opencl_unroll_hint)) for
#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
#define ALIGN(a, b) (CEIL_DIV(a, b) * (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define CLAMP(v, l, u) MAX((l), MIN((v), (u)))

// Creates vector type.
#define MAKE_VECTOR_TYPE_IMPL_1(elem_type) elem_type
Expand Down Expand Up @@ -252,8 +259,8 @@ inline int imad_SW(int acc, uchar4 input, uchar4 weight)
CAT(BLOCK_READN_FUNC_size, type_size)(vector_size)

#define BLOCK_READN_RAW(type_size, vector_size, addr_space, ptr, offset) \
BLOCK_READN_FUNC(type_size, vector_size)( \
(const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))
BLOCK_READN_FUNC(type_size, vector_size) \
((const addr_space BLOCK_READ_TYPE(type_size) *)(ptr) + (offset))

#define BLOCK_READN(type, vector_size, ptr, offset) \
AS_TYPE( \
Expand Down Expand Up @@ -458,9 +465,9 @@ DECLARE_BLOCK_READ_EMULATION(8, 8)
CAT(BLOCK_WRITEN_FUNC_size, type_size)(vector_size)

#define BLOCK_WRITEN_RAW(type_size, vector_size, addr_space, ptr, offset, val) \
BLOCK_WRITEN_FUNC(type_size, vector_size)( \
(addr_space BLOCK_WRITE_TYPE(type_size) *)(ptr) + (offset), \
AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_WRITE_TYPE(type_size), vector_size), val))
BLOCK_WRITEN_FUNC(type_size, vector_size) \
((addr_space BLOCK_WRITE_TYPE(type_size) *)(ptr) + (offset), \
AS_TYPE(MAKE_VECTOR_TYPE(BLOCK_WRITE_TYPE(type_size), vector_size), val))

#define BLOCK_WRITEN(type, vector_size, ptr, offset, val) \
BLOCK_WRITEN_RAW(TYPE_SIZE(type), vector_size, __global, ptr, offset, val)
Expand Down Expand Up @@ -1335,16 +1342,16 @@ inline void fc_bf_tiled_kernel_dyn_quan(

// =====================================================================================================================================
// Main computation loop
const uint iterations =
MAIN_LOOP_ELEMENTS_COUNT /
TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD)
const uint iterations = CEIL_DIV(
MAIN_LOOP_ELEMENTS_COUNT,
TILE_IFM_ELEMENTS_SIZE); // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD)
// Each sub-group loads 2 Batch
const uint idx_sglid =
(sglid * TILE_K) %
TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction
const uint batch_sglid =
(sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction
const uint scale_pitch = (TILE_IN_B_PITCH / QUANTIZE_GROUP_SIZE);
const uint scale_pitch = CEIL_DIV(TILE_IN_B_PITCH, QUANTIZE_GROUP_SIZE);

#if PER_TOKEN_SIZE_DYN_QUANTIZE
// Each token is quantized by once. So, all MAIN_LOOP_ELEMENTS_COUNT share
Expand Down Expand Up @@ -1379,7 +1386,7 @@ inline void fc_bf_tiled_kernel_dyn_quan(
++ni) {
uint in_offset =
input_offset + (idx_sglid + batch_sglid * TILE_IN_B_PITCH);
uint scale_offset = input_offset / QUANTIZE_GROUP_SIZE;
uint scale_offset = CEIL_DIV(input_offset, QUANTIZE_GROUP_SIZE);
for (uint bi = 0; bi < HALF_TILE_B; ++bi) {
// Load quantizing info from pre-quantizing kernel
tiled_input_0[bi] = vload4(0, &quantized_input[in_offset]);
Expand Down Expand Up @@ -1685,7 +1692,7 @@ inline void fc_bf_tiled_kernel_dyn_quan(
#else
const uint scale_offset =
(offset_ofm % DECOMPRESSION_SCALE_BATCH_NUM) +
ni_offset * SIZE_N;
ni_offset * ALIGN_SIZE_N;
#endif
ACCUMULATOR_TYPE ds = decompression_scale[scale_offset];
#else
Expand Down
Loading
Loading