Skip to content

Commit f37a2e7

Browse files
authored
Backport gather fix 2026.1 (#34907)
backport: #34897 ### Description of the issue (symptom, root cause, how it was resolved) #### Symptom Low similarity with granite-4.0-h-micro #### Root cause Fusing post-ops into rank-changing gather can generate incorrect index mapping, causing output mismatches. When gather has rank decrease (e.g., 5D->4D), static_canonicalize_shapes pads the output back to 5D by inserting dim=1 at the gather axis (e.g., {-1,64,64,128} -> {-1,1,64,64,128}). However, the fused eltwise peer tensor remains 4D. In the jitter, GetIdx selects index slots based on the peer tensor's rank (4D -> b,f,y,x), so the kernel's z loop variable - which iterates over actual data - is never used for peer indexing. This causes the fused eltwise to read incorrect data, as the f slot always maps to 0 (the padded dimension) instead of the actual data dimension. #### Resolution Disable gather fusion when the rank decreases from input to output, while keeping a safe exception for scalar eltwise cases. Although eltwise is the root cause in this model, quantize fusion is also disabled due to potential issues. Gather eltwise post-op fusion in rank decrease | Post-op | Fusion| |-------------------------|:-----:| | Eltwise (scalar) | O | | Eltwise (per-channel) | X | | Eltwise (full-tensor) | X | #### Problematic graph Gather_4: in[1,2,64,64,128] -> out[1,64,64,128] + Multiply_27+Add_9 <img width="1597" height="1081" alt="image" src="https://github.com/user-attachments/assets/fa3afa2f-39af-4168-b281-0f988e37d3fe" /> #### Reproduction step and snapshot (if applicable. 
Do not attach for customer model) $ python ./tools/who_what_benchmark/whowhatbench/wwb.py --target-model /mnt/models/ov-share-13.iotg.sclab.intel.com/cv_bench_cache/WW11_llm-optimum_2026.1.0-21296/granite-4.0-h-micro/pytorch/ov/FP16 --gt-data /mnt/models/ov-share-04.iotg.sclab.intel.com/cv_bench_cache/AC_llm/wwb_ref_gt_data_cache/2026.1.0-21296-4589d335731_nat_ref/CPU_ICX/default_data_wwb/cache_nat_refs_cli/granite-4.0-h-micro__NAT/reference.csv --model-type text --genai --device GPU.1 --output ./wwb --verbose #### Checklist - [ ] Is it a proper fix? The fundamental fix is to make the peer rank the same as the gather output's rank and process it accordingly. - [x] Did you include a test case for this fix, if necessary? Yes - [x] Did you review existing test that can be extended to cover this scenario? Which test did you review? gather_fusion_test ### Tickets: - *CVS-183103* --------- Signed-off-by: hyunback <hyunback.kim@intel.com>
1 parent c5940d7 commit f37a2e7

2 files changed

Lines changed: 114 additions & 1 deletion

File tree

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,18 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
753753
return lora_is_single_user && is_simple_lora;
754754
};
755755

756+
auto gather_supports_fusings = [&](gather_node& node) -> bool {
757+
auto in_rank = node.get_input_layout(0).get_rank();
758+
auto out_rank = node.get_output_layout().get_rank();
759+
760+
return (in_rank <= out_rank);
761+
};
762+
763+
auto is_static_scalar_output = [&](program_node& node) -> bool {
764+
const auto& out_layout = node.get_output_layout();
765+
return out_layout.is_static() && out_layout.count() == 1;
766+
};
767+
756768
auto broadcast_supports_fusings = [&](broadcast_node& bcast_node) -> bool {
757769
if (bcast_node.get_outputs_count() != 1)
758770
return false;
@@ -1066,7 +1078,9 @@ void prepare_primitive_fusing::fuse_simple_primitives(program &p) {
10661078
(parents[i].first->is_type<pooling>()) ||
10671079
(parents[i].first->is_type<depth_to_space>() &&
10681080
dts_supports_fusings(parents[i].first->as<depth_to_space>())) ||
1069-
(parents[i].first->is_type<gather>()) ||
1081+
(parents[i].first->is_type<gather>() &&
1082+
(gather_supports_fusings(parents[i].first->as<gather>()) ||
1083+
is_static_scalar_output(*parents[1 - i].first))) ||
10701084
(parents[i].first->is_type<reduce>() &&
10711085
reduce_supports_fusings(parents[i].first->as<reduce>())) ||
10721086
(parents[i].first->is_type<lrn>()) ||

src/plugins/intel_gpu/tests/unit/fusions/gather_fusion_test.cpp

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,3 +237,102 @@ INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_eltwise_activation_dynamic, ::testi
237237
gather_test_params{ CASE_GATHER_FP16_7, 5, 8 },
238238
gather_test_params{ CASE_GATHER_INT8_1, 4, 7 },
239239
}));
240+
241+
#define CASE_GATHER_RANK_DECREASE_FP16 { 1, 2, 4, 4, 8 }, { }, { 1, 4, 4, 8 }, 1, data_types::f16, format::bfzyx, data_types::f16, format::bfyx
242+
#define CASE_GATHER_RANK_INCREASE_FP16 { 2, 5, 2, 4 }, { 3, 2, 1 }, { 2, 3, 2, 1, 2, 4 }, 1, data_types::f16, format::bfyx, data_types::f16, format::bfyx
243+
244+
class gather_rank_change_fusing : public GatherPrimitiveFusingTest {
245+
public:
246+
void execute(gather_test_params& p) {
247+
cfg_not_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));
248+
cfg_fused.set_property(ov::intel_gpu::allow_new_shape_infer(true));
249+
250+
auto input_prim = get_mem(get_input_layout(p));
251+
auto indices_prim = get_mem(get_indices_layout(p), 0, static_cast<int>(get_axis_dim(p) - 1));
252+
253+
network network_not_fused(this->engine, this->topology_non_fused, cfg_not_fused);
254+
network network_fused(this->engine, this->topology_fused, cfg_fused);
255+
256+
network_not_fused.set_input_data("input", input_prim);
257+
network_not_fused.set_input_data("gather_indices", indices_prim);
258+
network_fused.set_input_data("input", input_prim);
259+
network_fused.set_input_data("gather_indices", indices_prim);
260+
261+
compare(network_not_fused, network_fused, p);
262+
}
263+
264+
enum class eltwise_input_type { scalar, per_channel, full_tensor };
265+
266+
layout get_eltwise_data_layout(gather_test_params& p, eltwise_input_type type) {
267+
switch (type) {
268+
case eltwise_input_type::scalar:
269+
return get_single_element_layout(p);
270+
case eltwise_input_type::per_channel:
271+
return get_per_channel_layout(p);
272+
case eltwise_input_type::full_tensor: {
273+
std::vector<ov::Dimension> dims;
274+
for (size_t i = 0; i < p.out_shape.size(); ++i)
275+
dims.push_back(ov::Dimension(p.out_shape[i]));
276+
auto fmt = format::bfyx;
277+
if (p.out_shape.size() == 5) fmt = format::bfzyx;
278+
else if (p.out_shape.size() == 6) fmt = format::bfwzyx;
279+
return layout{ ov::PartialShape(dims), p.default_type, fmt };
280+
}
281+
default:
282+
return get_single_element_layout(p);
283+
}
284+
}
285+
286+
void create_eltwise_topology(gather_test_params& p, eltwise_input_type type) {
287+
auto dyn_input = layout{ov::PartialShape::dynamic(p.dictionary_shape.size()), p.data_type, p.input_format};
288+
auto dyn_indices = layout{ov::PartialShape::dynamic(p.indices_shape.size()), p.data_type, format::bfyx};
289+
auto elt_layout = get_eltwise_data_layout(p, type);
290+
291+
create_topologies(
292+
input_layout("input", dyn_input),
293+
input_layout("gather_indices", dyn_indices),
294+
data("elt_mul_data", get_mem(elt_layout, -10, 10)),
295+
data("elt_add_data", get_mem(elt_layout, -10, 10)),
296+
gather("gather_prim", input_info("input"), input_info("gather_indices"),
297+
p.axis, p.dictionary_shape.size(), p.out_shape),
298+
eltwise("elt_mul", {input_info("gather_prim"), input_info("elt_mul_data")}, eltwise_mode::prod),
299+
eltwise("elt_add", {input_info("elt_mul"), input_info("elt_add_data")}, eltwise_mode::sum),
300+
reorder("reorder_bfyx", input_info("elt_add"), p.default_format, data_types::f32,
301+
std::vector<float>(), cldnn::reorder_mean_mode::subtract, cldnn::padding(), true)
302+
);
303+
}
304+
};
305+
306+
TEST_P(gather_rank_change_fusing, eltwise_scalar) {
307+
auto p = GetParam();
308+
create_eltwise_topology(p, eltwise_input_type::scalar);
309+
tolerance = 1e-2f;
310+
p.expected_fused_primitives = 3;
311+
p.expected_not_fused_primitives = 5;
312+
execute(p);
313+
}
314+
315+
TEST_P(gather_rank_change_fusing, eltwise_per_channel) {
316+
auto p = GetParam();
317+
create_eltwise_topology(p, eltwise_input_type::per_channel);
318+
tolerance = 1e-2f;
319+
bool is_decrease = p.dictionary_shape.size() > p.out_shape.size();
320+
p.expected_fused_primitives = is_decrease ? 4 : 3;
321+
p.expected_not_fused_primitives = 5;
322+
execute(p);
323+
}
324+
325+
TEST_P(gather_rank_change_fusing, eltwise_full_tensor) {
326+
auto p = GetParam();
327+
create_eltwise_topology(p, eltwise_input_type::full_tensor);
328+
tolerance = 1e-2f;
329+
bool is_decrease = p.dictionary_shape.size() > p.out_shape.size();
330+
p.expected_fused_primitives = is_decrease ? 4 : 3;
331+
p.expected_not_fused_primitives = 5;
332+
execute(p);
333+
}
334+
335+
INSTANTIATE_TEST_SUITE_P(fusings_gpu, gather_rank_change_fusing, ::testing::ValuesIn(std::vector<gather_test_params>{
336+
gather_test_params{ CASE_GATHER_RANK_DECREASE_FP16, 2, 3 },
337+
// gather_test_params{ CASE_GATHER_RANK_INCREASE_FP16, 2, 3 }, TODO)
338+
}));

0 commit comments

Comments
 (0)