Skip to content

[GPU] Set minimum memory of count for reduce mean mode of scatter_elements_update, fix typos and remove space #30491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -99,34 +99,27 @@
#endif
}

// Records one more occurrence of `idx` in the parallel key/value arrays.
//   count_k     - keys; the first `valid_count` slots are the ones searched
//   count_v     - per-key occurrence counters, parallel to count_k
//   idx         - key to record
//   valid_count - number of slots currently in use
// Returns the number of slots in use after the call: unchanged when `idx`
// was already present, valid_count + 1 when a new slot was claimed.
// The address space of the arrays follows COUNT_LENGTH vs. COUNT_LIMIT.
inline uint add_count(
#if COUNT_LENGTH > COUNT_LIMIT
    __global int count_k[], __global int count_v[],
#else
    __local int count_k[], __local int count_v[],
#endif
    int idx, uint valid_count)
{
    // Linear search for an existing slot holding this key.
    int slot = 0;
    while (slot < valid_count && count_k[slot] != idx) {
        ++slot;
    }

    if (slot < valid_count) {
        // Key already tracked: bump its counter, slot usage is unchanged.
        count_v[slot] += 1;
        return valid_count;
    }

    // Key not seen yet: claim the next free slot.
    // NOTE(review): `+=` presumably relies on count_v being zero-initialized
    // by the caller - confirm against the kernel's initialization code.
    count_k[valid_count] = idx;
    count_v[valid_count] += 1;
    return valid_count + 1;
}

// Consumes the counter stored in slot `it`.
//   count_k - keys; a value of -1 marks a slot that was already consumed
//   count_v - per-key occurrence counters, parallel to count_k
//   it      - slot to read
//   idx     - out: receives the key stored in the slot (written only on success)
// Returns the counter for the slot and marks the slot consumed by writing -1
// into count_k[it]; returns -1 (leaving *idx untouched) if the slot had
// already been consumed.
// The address space of the arrays is selected exactly as in add_count, so the
// same count_k/count_v buffers can be passed to both helpers when
// COUNT_LENGTH > COUNT_LIMIT places them in __global memory.
inline int get_count(
#if COUNT_LENGTH > COUNT_LIMIT
    __global int count_k[], __global int count_v[],
#else
    __local int count_k[], __local int count_v[],
#endif
    int it, int *idx)
{
    if (count_k[it] != -1) {
        *idx = count_k[it];
        count_k[it] = -1;  // mark as consumed so the key is reported only once
        return count_v[it];
    }
    return -1;
}
// Records one more occurrence of `idx` in the parallel key/value arrays.
//   count_k     - keys; the first `valid_count` slots are the ones searched
//   count_v     - per-key occurrence counters, parallel to count_k
//   idx         - key to record
//   valid_count - lvalue holding the number of slots in use; incremented in
//                 place only when a new slot is claimed
// If `idx` is already present only its counter is incremented. `break` leaves
// just the search loop, so an explicit flag guards the append path; without
// it an existing key would also be appended as a duplicate slot.
// Macro-private locals carry an add_count_ prefix to avoid capturing names
// used in the arguments. Arguments are evaluated more than once.
#define add_count(count_k, count_v, idx, valid_count) ({ \
    int add_count_hit_ = 0; \
    for (int add_count_i_ = 0; add_count_i_ < (valid_count); ++add_count_i_) { \
        if ((count_k)[add_count_i_] == (idx)) { \
            (count_v)[add_count_i_] += 1; \
            add_count_hit_ = 1; \
            break; \
        } \
    } \
    if (!add_count_hit_) { \
        (count_k)[(valid_count)] = (idx); \
        (count_v)[(valid_count)] += 1; \
        (valid_count) = (valid_count) + 1; \
    } \
})

// Consumes the counter stored in slot `it`.
//   count_k - keys; a value of -1 marks a slot that was already consumed
//   count_v - per-key occurrence counters, parallel to count_k
//   it      - slot to read
//   idx     - lvalue; receives the key of the slot (untouched if consumed)
//   count   - lvalue; receives the slot's counter, or -1 if already consumed
// A successfully read slot is marked consumed by writing -1 into count_k.
// Arguments are evaluated more than once.
#define get_count(count_k, count_v, it, idx, count) ({ \
    if ((count_k)[(it)] == -1) { \
        (count) = -1; \
    } else { \
        (idx) = (count_k)[(it)]; \
        (count) = (count_v)[(it)]; \
        (count_k)[(it)] = -1; \
    } \
})
#endif
#endif

Expand Down Expand Up @@ -188,9 +181,6 @@ KERNEL(scatter_elements_update_ref)(OPTIONAL_SHAPE_INFO_ARG
#if COUNT_LENGTH > COUNT_LIMIT
__global int count_k[COUNT_LENGTH];
__global int count_v[COUNT_LENGTH];
#elif COUNT_LENGTH == 0
__local int count_k[1];
__local int count_v[1];
#else
__local int count_k[COUNT_LENGTH];
__local int count_v[COUNT_LENGTH];
Expand Down Expand Up @@ -258,13 +248,14 @@ KERNEL(scatter_elements_update_ref)(OPTIONAL_SHAPE_INFO_ARG
const uint output_idx = GET_OUTPUT_INDEX(ORDER);
val = FUNC_CALL(reduce)(output[output_idx], val);
output[output_idx] = val;
valid_count = add_count(count_k, count_v, output_idx, valid_count);
add_count(count_k, count_v, output_idx, valid_count);
}
}
}
for (int i = 0; i < valid_count; ++i) {
int output_idx;
const int count = get_count(count_k, count_v, i, &output_idx);
int count;
get_count(count_k, count_v, i, output_idx, count);
#if REDUCE_MODE==MEAN_MODE
output[output_idx] = output[output_idx] / (count + USE_INIT_VAL);
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,9 +206,13 @@ KernelsData ScatterElementsUpdateKernelRef::GetKernelsData(const Params& params)
auto entry_point = GetEntryPoint(kernelName, newParams.layerID, params, i);

if (i == 1) {
auto maxAllocatableMemSize = params.engineInfo.maxLocalMemSize / 8 / 2; // 8 is for allocatable local memory size.
// 2 is for k and v of count.
cldnn_jit.AddConstant(MakeJitConstant("IS_SECOND_ITER", "true"));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LIMIT", params.engineInfo.maxLocalMemSize));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LENGTH", newParams.inputs[1].LogicalSize()));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LIMIT", maxAllocatableMemSize));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LENGTH", newParams.inputs[1].LogicalSize() != 0 ?
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is count_length always set to the max SLM size in the dynamic case?
We should define the actual size from the shape, using something like INPUT_SIZE_XXX.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since count_length is used as the array size of the count buffers, it must be fixed before the kernel runs, so it is set to the max SLM size.
I don't think I understand how INPUT_SIZE_XXX would be used here; could you explain more?

newParams.inputs[1].LogicalSize() :
maxAllocatableMemSize));
}
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -727,3 +727,53 @@ TEST(scatter_elements_update_gpu_fp32, smoke_multiple_indices_mean_big_1d_dynami
ASSERT_EQ(expected_results[i], output_ptr[i]);
}
}

// MEAN reduction along axis 0 of a 16-element tensor where the indices and
// updates inputs are declared with ov::Dimension(-1) in the topology.
// Index 0 is targeted four times (updates 9, 1, 6, 7); with
// use_init_value == false the expected result there is 23 / 4 = 5.75.
// Indices 4, 5, 8 and 9 each receive a single update; every other element
// is expected to keep its initial 0.
TEST(scatter_elements_update_gpu_fp32, multiple_indices_mean_1d_dynamic) {
    auto& engine = get_test_engine();

    // Primitive configuration.
    const int32_t axis = 0;
    const bool use_init_value = false;
    const ScatterElementsUpdateOp::Reduction mode = ov::op::v12::ScatterElementsUpdate::Reduction::MEAN;

    // Host-side contents of the three inputs.
    std::vector<float> data(16, 0.0f);
    std::vector<int32_t> indices = { 0, 0, 4, 5, 8, 9, 0, 0 };
    std::vector<float> updates = { 9.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };

    // Device buffers with concrete shapes, filled from the host vectors.
    auto data_mem = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 16, 1, 1, 1 } });
    auto indices_mem = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 8, 1, 1, 1 } });
    auto updates_mem = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 8, 1, 1, 1 } });
    set_values(data_mem, data);
    set_values(indices_mem, indices);
    set_values(updates_mem, updates);

    // Only "input" takes its layout from the allocated buffer; "indices" and
    // "updates" are described by a 1D partial shape of ov::Dimension(-1).
    topology topo;
    topo.add(input_layout("input", data_mem->get_layout()));
    topo.add(input_layout("indices", { ov::PartialShape{ ov::Dimension(-1) }, data_types::i32, format::bfyx }));
    topo.add(input_layout("updates", { ov::PartialShape{ ov::Dimension(-1) }, data_types::f32, format::bfyx }));
    topo.add(scatter_elements_update("scatter_elements_update",
                                     input_info("input"),
                                     input_info("indices"),
                                     input_info("updates"),
                                     axis,
                                     mode,
                                     use_init_value));

    network net(engine, topo, get_test_default_config(engine));
    net.set_input_data("input", data_mem);
    net.set_input_data("indices", indices_mem);
    net.set_input_data("updates", updates_mem);

    auto outputs = net.execute();
    auto result_mem = outputs.at("scatter_elements_update").get_memory();
    cldnn::mem_lock<float> output_ptr(result_mem, get_test_stream());

    const std::vector<float> expected_results = {
        5.75f, 0.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f,
        4.0f,  5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
    };

    for (size_t pos = 0; pos < expected_results.size(); ++pos) {
        ASSERT_EQ(expected_results[pos], output_ptr[pos]);
    }
}
Loading