Skip to content

[GPU] Set a minimum count-buffer size for the MEAN reduction mode of scatter_elements_update; fix typos and remove trailing spaces #30491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@

inline uint add_count(
#if COUNT_LENGTH > COUNT_LIMIT
__global int count_k[], __global int count_v[],
__global int count_k[], __global int count_v[],
#else
__local int count_k[], __local int count_v[],
__local int count_k[], __local int count_v[],
#endif
int idx, uint valid_count)
{
Expand All @@ -117,7 +117,7 @@
count_v[valid_count] += 1;
return valid_count + 1;
}

inline int get_count(__local int count_k[], __local int count_v[], int it, int *idx)
{
if (count_k[it] != -1) {
Expand Down Expand Up @@ -185,17 +185,21 @@ KERNEL(scatter_elements_update_ref)(OPTIONAL_SHAPE_INFO_ARG
const uint tgx = INPUT2_SIZE_X * INPUT2_SIZE_Y;
const uint tgy = INPUT2_SIZE_Z * INPUT2_SIZE_W;
#endif
int count_length = 0;
#if COUNT_LENGTH > COUNT_LIMIT
__global int count_k[COUNT_LENGTH];
__global int count_v[COUNT_LENGTH];
__global int count_k[COUNT_LIMIT];
__global int count_v[COUNT_LIMIT];
count_length = COUNT_LIMIT;
#elif COUNT_LENGTH == 0
__local int count_k[1];
__local int count_v[1];
__local int count_k[COUNT_MINIMUM];
__local int count_v[COUNT_MINIMUM];
count_length = COUNT_MINIMUM;
#else
__local int count_k[COUNT_LENGTH];
__local int count_v[COUNT_LENGTH];
count_length = COUNT_LENGTH;
#endif
for (int i = 0; i < COUNT_LENGTH; ++i) {
for (int i = 0; i < count_length; ++i) {
count_k[i] = -1;
count_v[i] = 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ KernelsData ScatterElementsUpdateKernelRef::GetKernelsData(const Params& params)
if (i == 1) {
cldnn_jit.AddConstant(MakeJitConstant("IS_SECOND_ITER", "true"));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LIMIT", params.engineInfo.maxLocalMemSize));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_MINIMUM", params.engineInfo.maxLocalMemSize/64));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • What does COUNT_LENGTH == 0 mean? If the total work-item size is 0, then no code would be executed at all, wouldn't it?

  • What about just setting COUNT_LENGTH = dispatchData.gws[0] * dispatchData.gws[1] * dispatchData.gws[2] when that product is non-zero, and COUNT_MINIMUM otherwise? Then you don't need to introduce an additional variable.

Copy link
Contributor Author

@steve-y steve-y May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At run time, COUNT_LENGTH = 0 when the shape-agnostic kernel is selected. So the original code allocated a count array of size [1] in the shape-agnostic case; this change fixes it to [1k] (COUNT_MINIMUM).

cldnn_jit.AddConstant(MakeJitConstant("COUNT_LENGTH", dispatchData.gws[0] * dispatchData.gws[1] * dispatchData.gws[2]));
if (newParams.mode != ScatterUpdateReduction::NONE) {
dispatchData.gws = {1, 1, 1};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -367,19 +367,19 @@ struct scatter_elements_update_gpu_formats_test
vec[i] = t.sizes()[i];
}
std::reverse(vec.begin() + 2, vec.end());

return ov::Shape(vec.begin(), vec.end());
}


static std::vector<T> generateReferenceOutput(const format fmt,
const ScatterElementsUpdateParams<T, T_IND>& p,
const ScatterElementsUpdateOp::Reduction mode,
const bool use_init_value) {
std::vector<T> out(p.data_tensor.count());
const auto data_shape = tensorToShape(p.data_tensor, fmt);
const auto indices_shape = tensorToShape(p.indices_tensor, fmt);

ov::reference::scatter_elem_update<T, T_IND>(p.data.data(),
p.indices.data(),
p.updates.data(),
Expand Down Expand Up @@ -576,7 +576,7 @@ const std::vector<ov::op::v12::ScatterElementsUpdate::Reduction> reduce_modes{
ov::op::v12::ScatterElementsUpdate::Reduction::SUM,
ov::op::v12::ScatterElementsUpdate::Reduction::PROD,
ov::op::v12::ScatterElementsUpdate::Reduction::MIN,
// MAX mode omitted intentionally - see dedicated MAX tests below
// MAX mode omitted intentionally - see dedicated MAX tests below
ov::op::v12::ScatterElementsUpdate::Reduction::MEAN
};

Expand Down Expand Up @@ -676,3 +676,53 @@ TEST(scatter_elements_update_gpu_fp16, d2411_axisF_cached) {
TEST_P(scatter_elements_update_gpu_reduction_test_f32, cached) {
ASSERT_NO_FATAL_FAILURE(test(true));
}

// Regression test: MEAN reduction mode of scatter_elements_update with 1-D
// dynamic-shaped indices/updates inputs. The dynamic shapes force a
// shape-agnostic kernel — presumably the path where COUNT_LENGTH is 0 at
// compile time (see PR discussion); verify against kernel selection logic.
// Index 0 is repeated four times, so its output must be the mean of the
// four corresponding updates.
TEST(scatter_elements_update_gpu_fp32, multiple_indices_mean_1d_dynamic) {
    auto& engine = get_test_engine();

    // Static data tensor of 16 elements; indices/updates memories hold 8
    // elements each but are fed through dynamic input_layouts below.
    auto input1 = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 16, 1, 1, 1 } }); // input (data)
    auto input2 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 8, 1, 1, 1 } }); // indices
    auto input3 = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 8, 1, 1, 1 } }); // updates

    // Duplicate indices: 0 appears 4 times (updates 9, 1, 6, 7); 4, 5, 8, 9 once each.
    std::vector<float> data = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    std::vector<int32_t> indices = { 0, 0, 4, 5, 8, 9, 0, 0 };
    std::vector<float> updates = { 9.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
    int32_t axis = 0;
    ScatterElementsUpdateOp::Reduction mode = ov::op::v12::ScatterElementsUpdate::Reduction::MEAN;
    // With use_init_value = false the original data value is excluded from
    // the mean: expected[0] = (9 + 1 + 6 + 7) / 4 = 5.75.
    bool use_init_value = false;

    set_values(input1, data);
    set_values(input2, indices);
    set_values(input3, updates);

    topology topology;
    topology.add(input_layout("input", input1->get_layout()));
    // Fully dynamic 1-D layouts for indices/updates; concrete shapes arrive
    // only via set_input_data at execution time.
    topology.add(input_layout("indices", { ov::PartialShape{ ov::Dimension(-1) }, data_types::i32, format::bfyx }));
    topology.add(input_layout("updates", { ov::PartialShape{ ov::Dimension(-1) }, data_types::f32, format::bfyx }));
    topology.add(
        scatter_elements_update(
            "scatter_elements_update",
            input_info("input"),
            input_info("indices"),
            input_info("updates"),
            axis,
            mode,
            use_init_value));

    network network(engine, topology, get_test_default_config(engine));

    network.set_input_data("input", input1);
    network.set_input_data("indices", input2);
    network.set_input_data("updates", input3);

    auto outputs = network.execute();

    auto output = outputs.at("scatter_elements_update").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Position 0: mean of the four updates scattered to it (5.75); positions
    // 4, 5, 8, 9 each receive a single update (mean of one value); all other
    // positions keep the initial 0.0f data.
    std::vector<float> expected_results = { 5.75f, 0.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

    for (size_t i = 0; i < expected_results.size(); ++i) {
        ASSERT_EQ(expected_results[i], output_ptr[i]);
    }
}
Loading