[GPU] Fix regression by selection of reference MatMul #25633

Merged
@@ -116,25 +116,13 @@ KERNEL(quantize_input)(


#if !REALIGN_FP16_OFFSET
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
# else
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
# endif
#define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE
#else
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
# if OUTPUT_3D
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
# else
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
# endif
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
#define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1)
#endif

#if OUTPUT_3D
# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
#else
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
#endif
#define INPUT_ELEMENTS_COUNT IFM_SIZE

#if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4
#pragma disable_includes_optimization
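As a hedged side note on the REALIGN_FP16_OFFSET comment above, the following standalone C++ sketch (not the actual OpenCL kernel) illustrates the bookkeeping it describes: one feature is consumed before the main loop to fix alignment, so the main loop itself covers IFM_SIZE - 1 elements.

#include <cstddef>
#include <iostream>

// Illustration only: the first feature is processed separately, then the main
// loop handles the remaining (ifm_size - 1) elements, matching
// MAIN_LOOP_ELEMENTS_COUNT = (IFM_SIZE - 1) above.
static float dot_with_realigned_first_feature(const float* in, const float* wei, size_t ifm_size) {
    float acc = in[0] * wei[0];                      // feature handled before the main loop
    const size_t main_loop_elements = ifm_size - 1;
    for (size_t i = 0; i < main_loop_elements; ++i)
        acc += in[i + 1] * wei[i + 1];
    return acc;
}

int main() {
    const float in[4]  = {1.f, 2.f, 3.f, 4.f};
    const float wei[4] = {1.f, 1.f, 1.f, 1.f};
    std::cout << dot_with_realigned_first_feature(in, wei, 4) << "\n";  // prints 10
    return 0;
}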
@@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256;
namespace kernel_selector {

static std::pair<size_t, size_t> get_input_bf_size(const fully_connected_params& params) {
size_t input_f = params.inputs[0].Feature().v;
size_t input_batch = params.inputs[0].Batch().v;
auto& input = params.inputs[0];
size_t input_f = input.Feature().v;
size_t input_batch = input.Batch().v;

// 3D input
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
input_f = params.inputs[0].Y().v;
input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v;
input_f = input.Y().v;
input_batch = input.Batch().v * input.Feature().v;
}

// In some models, input_f of input0 could be dynamic; it corresponds to the IFM value of the weights.
if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0)
input_f = params.weights.IFM().v;
Contributor
Can't we always use this as input_f?
It seems that now, regardless of whether input0 is dynamic or not, it could just use the weights IFM.
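For context, a minimal standalone sketch of the fallback being discussed, using hypothetical stand-in structs rather than the actual kernel_selector types: input_f is taken from the input tensor first, and falls back to the weights IFM only when the input feature dimension is still dynamic (reported as 0).

#include <cstddef>
#include <iostream>
#include <utility>

// Hypothetical stand-ins for the kernel_selector tensor/weight descriptors.
struct InputDesc { size_t batch; size_t feature; size_t y; bool is_dynamic; bool is_3d_output; };
struct WeightsDesc { size_t ifm; };

// Mirrors the merged logic: prefer the input feature size and use the weights
// IFM only when the input is dynamic and its feature size is still 0.
std::pair<size_t, size_t> get_input_bf_size_sketch(const InputDesc& in, const WeightsDesc& wei) {
    size_t input_f = in.is_3d_output ? in.y : in.feature;
    size_t input_b = in.is_3d_output ? in.batch * in.feature : in.batch;
    if (in.is_dynamic && input_f == 0 && wei.ifm != 0)
        input_f = wei.ifm;
    return {input_b, input_f};
}

int main() {
    WeightsDesc wei{256};
    InputDesc static_in{1, 256, 1, false, false};   // static shape: feature known
    InputDesc dynamic_in{1, 0, 1, true, false};     // dynamic shape: feature reported as 0
    std::cout << get_input_bf_size_sketch(static_in, wei).second << "\n";   // 256, taken from the input
    std::cout << get_input_bf_size_sketch(dynamic_in, wei).second << "\n";  // 256, taken from the weights IFM
    return 0;
}

This is only an illustration of the selection logic; the real code operates on params.inputs[0] and params.weights as shown in the diff.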


return {input_batch, input_f};
}

@@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const {

// Dynamic kernel doesn't support dynamic weights yet
if (fc_params.is_shape_agnostic && input.is_dynamic()) {
if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) ||
(output.GetLayout() == DataLayout::bf && input.Feature().v == 0))
if (get_input_bf_size(fc_params).second == 0)
return false;
}

@@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
}

jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));
Contributor
Can't we just use FILTER_IFM_NUM?

jit.AddConstant(MakeJitConstant("SIMD", simd));
jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m));
jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2));
@@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para

// for 3d output we are treating spatial as features
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch;
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)"));
} else {
auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch;
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v));
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch));
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)"));
}
@@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws;
kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);

auto& input = prim_params.inputs[0];
if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx)
OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled.");
else
OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled.");

if (!kd.internalBufferSizes.empty()) {
// Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
if (execute_type == KernelType::DEFAULT) {
@@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
{
auto& quan_kernel = kd.kernels[0];
DispatchData dyn_quan_dispatch = dispatchData;
dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1};
auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second);
dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1};
dyn_quan_dispatch.lws = {16, 1, 1};
quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
@@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize());
kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2);
kd.internalBufferSizes.push_back(input_size);
kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2);
kernel_number++;
}
kd.internalBufferDataType = Datatype::F16;
@@ -266,9 +266,10 @@ const std::vector<ShapeRelatedParams> IS3D_smoke = {
},

{ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}},

{
{
{{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}},
{{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}},
{{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}}
},
{true, true}
@@ -1255,7 +1255,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
}
}

void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1) {
void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1, bool is_wei_dyn = false) {
tests::random_generator rg(GET_SUITE_NAME);
auto& engine = get_test_engine();

@@ -1285,6 +1285,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f);
set_values(scale_mem, scale_data);

if (is_wei_dyn) {
// ifm_num is dynamic
dyn_input_ps = is_3d ? ov::PartialShape{ -1, -1, -1 } : ov::PartialShape{ -1, -1};
}

auto in_layout = is_dynamic ? layout{ dyn_input_ps, data_types::f16, format::bfyx }
: layout{ input_ps, data_types::f16, format::bfyx };

@@ -1302,7 +1307,8 @@

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
config.set_property(ov::intel_gpu::optimize_data(true));
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));

network network(engine, topology, config);
network.set_input_data("input", input_mem);
@@ -1365,13 +1371,13 @@ class fully_connected_gpu_tests: public ::testing::Test {
}


void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) {
void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) {
tests::random_generator rg(GET_SUITE_NAME);
auto& engine = get_test_engine();
auto supports_immad = engine.get_device_info().supports_immad;

long int ifm_num = 256;
long int ofm_num = 256;
long int ofm_num = 512;

auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx });
auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx });
@@ -1392,6 +1398,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx }
: layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx };

if (is_dynamic && is_wei_dyn) {
// ifm_num is dynamic
in_layout = layout{ {-1, -1}, data_types::f16, format::bfyx };
}

auto dcomp_zp_name = supports_immad ? "dcomp_zp" : "";

auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2);
@@ -1409,6 +1420,8 @@ class fully_connected_gpu_tests: public ::testing::Test {

auto config = get_test_default_config(engine);
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));

network network(engine, topology, config);
network.set_input_data("input", input_mem);
@@ -3324,6 +3337,32 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_cache_dynamic) {
this->test_compressed_int4_scale_dyn_quan(true, true, 512);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input) {
this->test_compressed_int4_scale(false, true, 256, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_cached) {
this->test_compressed_int4_scale(true, true, 260, true);
}
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g64) {
this->test_compressed_int4_scale(false, true, 1, 64, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g128) {
this->test_compressed_int4_scale(false, true, 1, 128, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_single_batch) {
this->test_compressed_int4_scale_dyn_quan(false, true, 1, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input) {
this->test_compressed_int4_scale_dyn_quan(false, true, 512, true);
}

TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_unaligned) {
this->test_compressed_int4_scale_dyn_quan(false, true, 511, true);
}


TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {