Skip to content

[GPU] Set a minimum count-buffer size for the MEAN reduction mode of scatter_elements_update; fix typos and remove trailing spaces #30491

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,9 @@

inline uint add_count(
#if COUNT_LENGTH > COUNT_LIMIT
__global int count_k[], __global int count_v[],
__global int count_k[], __global int count_v[],
#else
__local int count_k[], __local int count_v[],
__local int count_k[], __local int count_v[],
#endif
int idx, uint valid_count)
{
Expand All @@ -117,7 +117,7 @@
count_v[valid_count] += 1;
return valid_count + 1;
}

inline int get_count(__local int count_k[], __local int count_v[], int it, int *idx)
{
if (count_k[it] != -1) {
Expand Down Expand Up @@ -185,17 +185,21 @@ KERNEL(scatter_elements_update_ref)(OPTIONAL_SHAPE_INFO_ARG
const uint tgx = INPUT2_SIZE_X * INPUT2_SIZE_Y;
const uint tgy = INPUT2_SIZE_Z * INPUT2_SIZE_W;
#endif
int count_length = 0;
#if COUNT_LENGTH > COUNT_LIMIT
__global int count_k[COUNT_LENGTH];
__global int count_v[COUNT_LENGTH];
__global int count_k[COUNT_LIMIT];
__global int count_v[COUNT_LIMIT];
count_length = COUNT_LIMIT;
#elif COUNT_LENGTH == 0
__local int count_k[1];
__local int count_v[1];
__local int count_k[COUNT_MINIMUM];
__local int count_v[COUNT_MINIMUM];
count_length = COUNT_MINIMUM;
#else
__local int count_k[COUNT_LENGTH];
__local int count_v[COUNT_LENGTH];
count_length = COUNT_LENGTH;
#endif
for (int i = 0; i < COUNT_LENGTH; ++i) {
for (int i = 0; i < count_length; ++i) {
count_k[i] = -1;
count_v[i] = 0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ KernelsData ScatterElementsUpdateKernelRef::GetKernelsData(const Params& params)
if (i == 1) {
cldnn_jit.AddConstant(MakeJitConstant("IS_SECOND_ITER", "true"));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_LIMIT", params.engineInfo.maxLocalMemSize));
cldnn_jit.AddConstant(MakeJitConstant("COUNT_MINIMUM", params.engineInfo.maxLocalMemSize/64));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • What does COUNT_LENGTH == 0 mean? If the total work-item size is 0, then no code would be executed at all, wouldn't it?

  • What about just setting COUNT_LENGTH = dispatchData.gws[0] * dispatchData.gws[1] * dispatchData.gws[2] when that product is non-zero, and COUNT_MINIMUM otherwise? Then you don't need to introduce an additional variable.

Copy link
Contributor Author

@steve-y steve-y May 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At run time, COUNT_LENGTH = 0 when the shape-agnostic kernel is selected. So the original code allocated a count array of size [1] in the shape-agnostic case; this change fixes it to [1k] (COUNT_MINIMUM).

cldnn_jit.AddConstant(MakeJitConstant("COUNT_LENGTH", dispatchData.gws[0] * dispatchData.gws[1] * dispatchData.gws[2]));
if (newParams.mode != ScatterUpdateReduction::NONE) {
dispatchData.gws = {1, 1, 1};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -367,19 +367,19 @@ struct scatter_elements_update_gpu_formats_test
vec[i] = t.sizes()[i];
}
std::reverse(vec.begin() + 2, vec.end());

return ov::Shape(vec.begin(), vec.end());
}


static std::vector<T> generateReferenceOutput(const format fmt,
const ScatterElementsUpdateParams<T, T_IND>& p,
const ScatterElementsUpdateOp::Reduction mode,
const bool use_init_value) {
std::vector<T> out(p.data_tensor.count());
const auto data_shape = tensorToShape(p.data_tensor, fmt);
const auto indices_shape = tensorToShape(p.indices_tensor, fmt);

ov::reference::scatter_elem_update<T, T_IND>(p.data.data(),
p.indices.data(),
p.updates.data(),
Expand Down Expand Up @@ -576,7 +576,7 @@ const std::vector<ov::op::v12::ScatterElementsUpdate::Reduction> reduce_modes{
ov::op::v12::ScatterElementsUpdate::Reduction::SUM,
ov::op::v12::ScatterElementsUpdate::Reduction::PROD,
ov::op::v12::ScatterElementsUpdate::Reduction::MIN,
// MAX mode omitted intentionally - see dedicated MAX tests below
// MAX mode omitted intentionally - see dedicated MAX tests below
ov::op::v12::ScatterElementsUpdate::Reduction::MEAN
};

Expand Down Expand Up @@ -676,3 +676,53 @@ TEST(scatter_elements_update_gpu_fp16, d2411_axisF_cached) {
TEST_P(scatter_elements_update_gpu_reduction_test_f32, cached) {
ASSERT_NO_FATAL_FAILURE(test(true));
}

// Regression test: MEAN reduction mode of scatter_elements_update with 1-D
// dynamic-shaped indices/updates inputs. The dynamic shapes force a
// shape-agnostic kernel — presumably the path where COUNT_LENGTH is 0 at
// compile time (see PR discussion); verify against kernel selection logic.
// Index 0 is repeated four times, so its output must be the mean of the
// four corresponding updates.
TEST(scatter_elements_update_gpu_fp32, multiple_indices_mean_1d_dynamic) {
    auto& engine = get_test_engine();

    // Static data tensor of 16 elements; indices/updates memories hold 8
    // elements each but are fed through dynamic input_layouts below.
    auto input1 = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 16, 1, 1, 1 } }); // input (data)
    auto input2 = engine.allocate_memory({ data_types::i32, format::bfyx, tensor{ 8, 1, 1, 1 } }); // indices
    auto input3 = engine.allocate_memory({ data_types::f32, format::bfyx, tensor{ 8, 1, 1, 1 } }); // updates

    // Duplicate indices: 0 appears 4 times (updates 9, 1, 6, 7); 4, 5, 8, 9 once each.
    std::vector<float> data = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
    std::vector<int32_t> indices = { 0, 0, 4, 5, 8, 9, 0, 0 };
    std::vector<float> updates = { 9.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f };
    int32_t axis = 0;
    ScatterElementsUpdateOp::Reduction mode = ov::op::v12::ScatterElementsUpdate::Reduction::MEAN;
    // With use_init_value = false the original data value is excluded from
    // the mean: expected[0] = (9 + 1 + 6 + 7) / 4 = 5.75.
    bool use_init_value = false;

    set_values(input1, data);
    set_values(input2, indices);
    set_values(input3, updates);

    topology topology;
    topology.add(input_layout("input", input1->get_layout()));
    // Fully dynamic 1-D layouts for indices/updates; concrete shapes arrive
    // only via set_input_data at execution time.
    topology.add(input_layout("indices", { ov::PartialShape{ ov::Dimension(-1) }, data_types::i32, format::bfyx }));
    topology.add(input_layout("updates", { ov::PartialShape{ ov::Dimension(-1) }, data_types::f32, format::bfyx }));
    topology.add(
        scatter_elements_update(
            "scatter_elements_update",
            input_info("input"),
            input_info("indices"),
            input_info("updates"),
            axis,
            mode,
            use_init_value));

    network network(engine, topology, get_test_default_config(engine));

    network.set_input_data("input", input1);
    network.set_input_data("indices", input2);
    network.set_input_data("updates", input3);

    auto outputs = network.execute();

    auto output = outputs.at("scatter_elements_update").get_memory();
    cldnn::mem_lock<float> output_ptr(output, get_test_stream());

    // Position 0: mean of the four updates scattered to it (5.75); positions
    // 4, 5, 8, 9 each receive a single update (mean of one value); all other
    // positions keep the initial 0.0f data.
    std::vector<float> expected_results = { 5.75f, 0.0f, 0.0f, 0.0f, 2.0f, 3.0f, 0.0f, 0.0f, 4.0f, 5.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

    for (size_t i = 0; i < expected_results.size(); ++i) {
        ASSERT_EQ(expected_results[i], output_ptr[i]);
    }
}
Loading