Skip to content

Commit f37a2e7

Browse files
authored
Backport gather fix 2026.1 (#34907)
backport: #34897 ### Description of the issue (symptom, root cause, how it was resolved) #### Symptom Low similarity with granite-4.0-h-micro #### Root cause Fusing post-ops into rank-changing gather can generate incorrect index mapping, causing output mismatches. When gather has rank decrease (e.g., 5D->4D), static_canonicalize_shapes pads the output back to 5D by inserting dim=1 at the gather axis (e.g., {-1,64,64,128} -> {-1,1,64,64,128}). However, the fused eltwise peer tensor remains 4D. In the jitter, GetIdx selects index slots based on the peer tensor's rank (4D -> b,f,y,x), so the kernel's z loop variable - which iterates over actual data - is never used for peer indexing. This causes the fused eltwise to read incorrect data, as the f slot always maps to 0 (the padded dimension) instead of the actual data dimension. #### Resolution Disable gather fusion when the rank decreases from input to output, while keeping a safe exception for scalar eltwise cases. Although eltwise is the root cause in this model, quantize fusion is also disabled due to potential issues. Gather eltwise post-op fusion in rank decrease | Post-op | Fusion| |-------------------------|:-----:| | Eltwise (scalar) | O | | Eltwise (per-channel) | X | | Eltwise (full-tensor) | X | #### Problematic graph Gather_4: in[1,2,64,64,128] -> out[1,64,64,128] + Multiply_27+Add_9 <img width="1597" height="1081" alt="image" src="https://github.com/user-attachments/assets/fa3afa2f-39af-4168-b281-0f988e37d3fe" /> #### Reproduction step and snapshot (if applicable. 
Do not attach for customer model) $ python ./tools/who_what_benchmark/whowhatbench/wwb.py --target-model /mnt/models/ov-share-13.iotg.sclab.intel.com/cv_bench_cache/WW11_llm-optimum_2026.1.0-21296/granite-4.0-h-micro/pytorch/ov/FP16 --gt-data /mnt/models/ov-share-04.iotg.sclab.intel.com/cv_bench_cache/AC_llm/wwb_ref_gt_data_cache/2026.1.0-21296-4589d335731_nat_ref/CPU_ICX/default_data_wwb/cache_nat_refs_cli/granite-4.0-h-micro__NAT/reference.csv --model-type text --genai --device GPU.1 --output ./wwb --verbose #### Checklist - [ ] Is it a proper fix? The fundamental fix is to make the peer rank the same as the gather output's rank and process it accordingly. - [x] Did you include a test case for this fix, if necessary? Yes - [x] Did you review existing test that can be extended to cover this scenario? Which test did you review? gather_fusion_test ### Tickets: - *CVS-183103* --------- Signed-off-by: hyunback <hyunback.kim@intel.com>
1 parent c5940d7 commit f37a2e7

2 files changed

Lines changed: 114 additions & 1 deletion

File tree

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,18 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
753753
return lora_is_single_user && is_simple_lora;
754754
};
755755

756+
auto gather_supports_fusings = [&](gather_node& node) -> bool {
757+
auto in_rank = node.get_input_layout(0).get_rank();
758+
auto out_rank = node.get_output_layout().get_rank();
759+
760+
return (in_rank <= out_rank);
761+
};
762+
763+
auto is_static_scalar_output = [&](program_node& node) -> bool {
764+
const auto& out_layout = node.get_output_layout();
765+
return out_layout.is_static() && out_layout.count() == 1;
766+
};
767+
756768
auto broadcast_supports_fusings = [&](broadcast_node& bcast_node) -> bool {
757769
if (bcast_node.get_outputs_count() != 1)
758770
return false;
@@ -1066,7 +1078,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
10661078
(parents[i].first->is_type<pooling>()) ||
10671079
(parents[i].first->is_type<depth_to_space>() &&
10681080
dts_supports_fusings(parents[i].first->as<depth_to_space>())) ||
1069-
(parents[i].first->is_type<gather>()) ||
1081+
(parents[i].first->is_type<gather>() &&
1082+
(gather_supports_fusings(parents[i].first->as<gather>()) ||
1083+
is_static_scalar_output(*parents[1 - i].first))) ||
10701084
(parents[i].first->is_type<reduce>() &&
10711085
reduce_supports_fusings(parents[i].first->as<reduce>())) ||
10721086
(parents[i].first->is_type<lrn>()) ||

src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,102 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testi
237237
gather_test_params{ CASE_GATHER_FP16_7, 5, 8 },
238238
gather_test_params{ CASE_GATHER_INT8_1, 4, 7 },
239239
}));
240+
241+
#define CASE_GATHER_RANK_DECREASE_FP16 { 1, 2, 4, 4, 8 }, { }, { 1, 4, 4, 8 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::bfyx
242+
#define CASE_GATHER_RANK_INCREASE_FP16 { 2, 5, 2, 4 }, { 3, 2, 1 }, { 2, 3, 2, 1, 2, 4 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx
243+
244+
class gather_rank_change_fusing : public GatherPrimitiveFusingTest {
245+
public:
246+
void execute(gather_test_params& p) {
247+
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));
248+
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));
249+
250+
auto input_prim = get_mem(get_input_layout(p));
251+
auto indices_prim = get_mem(get_indices_layout(p), 0, static_cast<int>(get_axis_dim(p) - 1));
252+
253+
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
254+
network network_fused(this->engine, this->topology_fused, cfg_fused);
255+
256+
network_not_fused.set_input_data("input", input_prim);
257+
network_not_fused.set_input_data("gather_indices", indices_prim);
258+
network_fused.set_input_data("input", input_prim);
259+
network_fused.set_input_data("gather_indices", indices_prim);
260+
261+
compare(network_not_fused, network_fused, p);
262+
}
263+
264+
enum class eltwise_input_type { scalar, per_channel, full_tensor };
265+
266+
layout get_eltwise_data_layout(gather_test_params& p, eltwise_input_type type) {
267+
switch (type) {
268+
case eltwise_input_type::scalar:
269+
return get_single_element_layout(p);
270+
case eltwise_input_type::per_channel:
271+
return get_per_channel_layout(p);
272+
case eltwise_input_type::full_tensor: {
273+
std::vector<ov::Dimension> dims;
274+
for (size_t i = 0; i < p.out_shape.size(); ++i)
275+
dims.push_back(ov::Dimension(p.out_shape[i]));
276+
auto fmt = format::bfyx;
277+
if (p.out_shape.size() == 5) fmt = format::bfzyx;
278+
else if (p.out_shape.size() == 6) fmt = format::bfwzyx;
279+
return layout{ ov::PartialShape(dims), p.default_type, fmt };
280+
}
281+
default:
282+
return get_single_element_layout(p);
283+
}
284+
}
285+
286+
void create_eltwise_topology(gather_test_params& p, eltwise_input_type type) {
287+
auto dyn_input = layout{ov::PartialShape::dynamic(p.dictionary_shape.size()), p.data_type, p.input_format};
288+
auto dyn_indices = layout{ov::PartialShape::dynamic(p.indices_shape.size()), p.data_type, format::bfyx};
289+
auto elt_layout = get_eltwise_data_layout(p, type);
290+
291+
create_topologies(
292+
input_layout("input", dyn_input),
293+
input_layout("gather_indices", dyn_indices),
294+
data("elt_mul_data", get_mem(elt_layout, -10, 10)),
295+
data("elt_add_data", get_mem(elt_layout, -10, 10)),
296+
gather("gather_prim", input_info("input"), input_info("gather_indices"),
297+
p.axis, p.dictionary_shape.size(), p.out_shape),
298+
eltwise("elt_mul", {input_info("gather_prim"), input_info("elt_mul_data")}, eltwise_mode::prod),
299+
eltwise("elt_add", {input_info("elt_mul"), input_info("elt_add_data")}, eltwise_mode::sum),
300+
reorder("reorder_bfyx", input_info("elt_add"), p.default_format, data_types::f32,
301+
std::vector<float>(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true)
302+
);
303+
}
304+
};
305+
306+
TEST_P(gather_rank_change_fusing, eltwise_scalar) {
307+
auto p = GetParam();
308+
create_eltwise_topology(p, eltwise_input_type::scalar);
309+
tolerance = 1e-2f;
310+
p.expected_fused_primitives = 3;
311+
p.expected_not_fused_primitives = 5;
312+
execute(p);
313+
}
314+
315+
TEST_P(gather_rank_change_fusing, eltwise_per_channel) {
316+
auto p = GetParam();
317+
create_eltwise_topology(p, eltwise_input_type::per_channel);
318+
tolerance = 1e-2f;
319+
bool is_decrease = p.dictionary_shape.size() > p.out_shape.size();
320+
p.expected_fused_primitives = is_decrease ? 4 : 3;
321+
p.expected_not_fused_primitives = 5;
322+
execute(p);
323+
}
324+
325+
TEST_P(gather_rank_change_fusing, eltwise_full_tensor) {
326+
auto p = GetParam();
327+
create_eltwise_topology(p, eltwise_input_type::full_tensor);
328+
tolerance = 1e-2f;
329+
bool is_decrease = p.dictionary_shape.size() > p.out_shape.size();
330+
p.expected_fused_primitives = is_decrease ? 4 : 3;
331+
p.expected_not_fused_primitives = 5;
332+
execute(p);
333+
}
334+
335+
INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_rank_change_fusing, ::testing::ValuesIn(std::vector<gather_test_params>{
336+
gather_test_params{ CASE_GATHER_RANK_DECREASE_FP16, 2, 3 },
337+
// gather_test_params{ CASE_GATHER_RANK_INCREASE_FP16, 2, 3 }, TODO)
338+
}));

0 commit comments

Comments
 (0)