
Commit e4980c6

Generic Pool Large Kernel Optimization (#23162)
### Ticket
N/A

### Problem description
Generic Pool's performance is poor for large kernel sizes.

### What's changed
- YoloV4's expected perf has been increased from 87.8 to 93.5 FPS.
- Generic pool now supports 32-row reductions.
- Fixed a bug in the size of the intermediate / partials CB.
- Fixed a bug in the face dimension passed to unpack tilize.
- For max pool, the in-loop `fill_with_val` was eliminated. This is possible since the junk data left over from previous iterations does not affect the max value.
- `in_cb` initialization has been added for cases where there are not more intermediate reduction chunks than multibuffering chunks (see the sketch below). This is necessary because the compute kernel always processes `max_rows_per_reduction` rows from the `in_cb`, which may include uninitialized data when multibuffering is enabled. When there are enough intermediate reduction chunks, however, the entire `in_cb` gets filled with valid data that cannot contain values larger than the max, so initialization is unnecessary.
- `clear_out_tiles` is now used for buffer initialization as well as for avg pool's in-loop `fill_with_val`, resulting in dramatically better performance in some cases.

Notes:
- Multibuffering does not require an in-loop `fill_with_val`, since one CB only processes a single top-left index at a time, and if necessary the `in_cb` was initialized before the loop.
- Junk data from previous top-left indices is not an issue since all kernel positions have the same number of elements.
- For both average pool and max pool we would not need to initialize the CB with the init value at all, since we know `kernel_HW > max_rows_per_reduction`, except that we are using multibuffering, so there will usually be some dead space. It may be worth turning off multibuffering, but more testing is required.

### Checklist
- [x] [All post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/15646661523
- [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml) CI passes (same failure as main, unrelated to changes): https://github.com/tenstorrent/tt-metal/actions/runs/15646662524
- [x] [Model regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml) CI passes (same failure as main, unrelated to changes): https://github.com/tenstorrent/tt-metal/actions/runs/15646665080
- [x] [Device performance regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/15646663480
- [x] [Nightly L2](https://github.com/tenstorrent/tt-metal/actions/workflows/tt-metal-l2-nightly.yaml) CI passes: (wormhole) https://github.com/tenstorrent/tt-metal/actions/runs/15646668961 (blackhole) https://github.com/tenstorrent/tt-metal/actions/runs/15646670132
- [x] [Frequent model](https://github.com/tenstorrent/tt-metal/actions/workflows/fast-dispatch-full-regressions-and-models.yaml) CI passes: https://github.com/tenstorrent/tt-metal/actions/runs/15646666798
- [x] New/Existing tests provide coverage for changes
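For reference, the chunk-count arithmetic behind the `in_cb` initialization rule above can be checked in isolation. This is a standalone sketch, not kernel code; the variable names mirror the kernel's compile-time arguments, and the window size and multibuffering factor are made-up example values.

```cpp
// Standalone sketch of the intermediate-chunk arithmetic used by the large-kernel
// pool kernels. Example values are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
    constexpr uint32_t window_size_hw = 13 * 13;     // e.g. a 13x13 pooling window (large kernel)
    constexpr uint32_t max_rows_for_reduction = 32;  // 32-row reductions enabled by this change
    constexpr uint32_t multi_buffering_factor = 2;   // assumed double buffering of in_cb

    constexpr uint32_t remaining_elems = window_size_hw % max_rows_for_reduction;
    constexpr uint32_t interm_reduction_chunks =
        remaining_elems ? window_size_hw / max_rows_for_reduction + 1 : window_size_hw / max_rows_for_reduction;

    // in_cb only needs an up-front clear when the last chunk is partial AND there are
    // not more chunks than multibuffering slots; otherwise every slot is overwritten
    // with valid rows before the compute kernel consumes it.
    constexpr bool need_to_initialize_in_cb =
        remaining_elems && interm_reduction_chunks <= multi_buffering_factor;

    std::printf("chunks=%u remaining=%u init_in_cb=%d\n",
                interm_reduction_chunks, remaining_elems, (int)need_to_initialize_in_cb);
    return 0;
}
```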
1 parent 613ef42 commit e4980c6

File tree

7 files changed (+139, -56 lines)


models/demos/yolov4/tests/perf/test_perf.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def test_yolov4(
 @pytest.mark.parametrize(
     "batch_size, model_name, expected_perf",
     [
-        (1, "yolov4", 87.8),
+        (1, "yolov4", 93.5),
     ],
 )
 @pytest.mark.models_device_performance_bare_metal

models/experimental/yolov8s_world/tests/test_perf_yolov8s_world.py

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ def test_perf(device, use_pretrained_weight, use_program_cache):
 @pytest.mark.parametrize(
     "batch_size, expected_perf",
     [
-        [1, 79.2],
+        [1, 80.0],
     ],
 )
 @pytest.mark.models_device_performance_bare_metal

ttnn/cpp/ttnn/operations/pool/generic/device/kernels/compute/pool_2d_multi_core_large_kernel.cpp

Lines changed: 15 additions & 11 deletions
@@ -64,6 +64,7 @@ template <
     uint32_t num_output_tiles,
     bool is_partial_tile,
     uint32_t max_rows_for_reduction,
+    uint32_t unpA_face_r_dim,
     bool neginf_srca_maxpool,
     bool zero_srca_avgpool>
 inline void reduce_h_fused(const uint32_t interm_cb_id, const uint32_t in_scalar_cb_id, const uint32_t out_cb_id) {
@@ -80,7 +81,7 @@ inline void reduce_h_fused(const uint32_t interm_cb_id, const uint32_t in_scalar
         num_output_tiles,
         0 /*tile idx for Src b is 0 because only 1 tile of constants is loaded*/,
         num_faces_in_input_tile /* unpack 1 or 2 faces ) */,
-        max_rows_for_reduction);
+        unpA_face_r_dim);
     for (uint32_t c_i = 0; c_i < num_output_tiles; ++c_i) {
         reduce_tile_math(c_i, num_faces_in_input_tile /* reduce 1 or 2 faces */);
     }
@@ -119,6 +120,8 @@ void MAIN {
     constexpr uint32_t interm_cb_id = get_compile_time_arg_val(15);
     constexpr uint32_t in_one_cb_id = get_compile_time_arg_val(16);
     constexpr bool one_scalar_per_core = get_compile_time_arg_val(17);
+    constexpr uint32_t sync_cb_id1 = get_compile_time_arg_val(18);
+    constexpr uint32_t sync_cb_id2 = get_compile_time_arg_val(19);
 
     constexpr bool is_partial_tile = in_c < 32;
     static_assert((!is_partial_tile || (in_c == 16)), "Partial tile must have c_dim 16");
@@ -136,19 +139,18 @@
     constexpr bool neginf_srca_maxpool = (REDUCE_OP == PoolType::MAX) ? true : false;
     constexpr bool zero_srca_avgpool = (REDUCE_OP == PoolType::SUM) ? true : false;
 
+    constexpr uint32_t face_r_dim = 16;
     tilizeA_B_reduce_init<neginf_srca_maxpool, zero_srca_avgpool>(
-        in_cb_id_0,
-        in_scalar_cb_id_0,
-        max_tiles_per_iter,
-        interm_cb_id,
-        num_faces_in_input_tile,
-        max_rows_for_reduction);
+        in_cb_id_0, in_scalar_cb_id_0, max_tiles_per_iter, interm_cb_id, num_faces_in_input_tile, face_r_dim);
 
     constexpr uint32_t remaining_elems = window_size_hw % max_rows_for_reduction;
     constexpr uint32_t interm_reduction_chunks =
         remaining_elems ? window_size_hw / max_rows_for_reduction + 1 : window_size_hw / max_rows_for_reduction;
-    if constexpr (one_scalar_per_core) {
-        cb_wait_front(in_scalar_cb_id_0, 1);
+
+    // wait for initialization to complete
+    cb_wait_front(sync_cb_id1, 2);
+    if constexpr (split_reader) {
+        cb_wait_front(sync_cb_id2, 2);
     }
 
     for (uint32_t i = 0; i < nsticks_per_core_by_nblocks; ++i) {
@@ -171,7 +173,7 @@
                 is_partial_tile,
                 max_rows_for_reduction,
                 split_reader,
-                max_rows_for_reduction,
+                face_r_dim,
                 neginf_srca_maxpool,
                 zero_srca_avgpool>(in_cb_id_0, in_cb_id_1, curr_scalar_cb_id, i, h, interm_cb_id);
         }
@@ -184,6 +186,7 @@
             max_tiles_per_iter,
             is_partial_tile,
             max_rows_for_reduction,
+            face_r_dim,
             neginf_srca_maxpool,
             zero_srca_avgpool>(
             interm_cb_id, REDUCE_OP == PoolType::MAX ? in_scalar_cb_id_0 : in_one_cb_id, out_cb_id);
@@ -200,7 +203,7 @@
                 is_partial_tile,
                 max_rows_for_reduction,
                 split_reader,
-                max_rows_for_reduction,
+                face_r_dim,
                 neginf_srca_maxpool,
                 zero_srca_avgpool>(in_cb_id_0, in_cb_id_1, curr_scalar_cb_id, i, h, interm_cb_id);
         }
@@ -213,6 +216,7 @@
             partial_iter_output_tiles,
             is_partial_tile,
             max_rows_for_reduction,
+            face_r_dim,
             neginf_srca_maxpool,
             zero_srca_avgpool>(interm_cb_id, REDUCE_OP == PoolType::MAX ? in_scalar_cb_id_0 : in_one_cb_id, out_cb_id);
         if constexpr (!one_scalar_per_core) {

ttnn/cpp/ttnn/operations/pool/generic/device/kernels/dataflow/reader_pool2d_sharded_common.hpp

Lines changed: 26 additions & 0 deletions
@@ -20,3 +20,29 @@ ALWI bool fill_with_val(uint32_t begin_addr, uint32_t n, uint16_t val, bool unco
 
     return true;
 }
+
+template <uint32_t cb_id, uint32_t clear_value_cb_id>
+FORCE_INLINE void clear_out_tiles() {
+    constexpr uint32_t tile_size = get_tile_size(cb_id);
+    const uint32_t num_pages = get_local_cb_interface(cb_id).fifo_num_pages;
+    const uint32_t num_tiles = get_local_cb_interface(cb_id).fifo_page_size / tile_size;
+    const uint64_t clear_value_addr = get_noc_addr(get_read_ptr(clear_value_cb_id));
+    uint64_t write_addr = get_noc_addr(get_write_ptr(cb_id));
+
+    for (uint32_t i = 0; i < num_tiles * num_pages; ++i) {
+        noc_async_read(clear_value_addr, write_addr, tile_size);
+        write_addr += tile_size;
+    }
+    noc_async_read_barrier();
+}
+
+template <uint32_t clear_value_cb_id, uint32_t num_tiles>
+FORCE_INLINE void clear_out_tiles(uint64_t write_addr, uint64_t clear_value_addr) {
+    constexpr uint32_t tile_size = get_tile_size(clear_value_cb_id);
+
+    for (uint32_t i = 0; i < num_tiles; ++i) {
+        noc_async_read(clear_value_addr, write_addr, tile_size);
+        write_addr += tile_size;
+    }
+    noc_async_read_barrier();
+}
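These helpers replace per-element `fill_with_val` with repeated reads of a single pre-filled "clear value" tile. A rough host-side analogue of the pattern, using `std::memcpy` where the kernel issues `noc_async_read` (buffer sizes here are illustrative, not the real CB geometry):

```cpp
// Host-side analogue of clear_out_tiles: one prototype tile holds the clear value
// (-inf for max pool, 0 for avg pool, as bf16), and the destination buffer is
// cleared by copying that tile one tile_size at a time.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>

constexpr std::size_t kTileElems = 32 * 32;         // TILE_HEIGHT * TILE_WIDTH
constexpr std::size_t kTileBytes = kTileElems * 2;  // bf16 is 2 bytes per element

void clear_out_tiles_sim(std::uint8_t* dst, std::size_t num_tiles, const std::uint8_t* clear_tile) {
    for (std::size_t i = 0; i < num_tiles; ++i) {
        std::memcpy(dst + i * kTileBytes, clear_tile, kTileBytes);  // stands in for noc_async_read
    }
    // on device a noc_async_read_barrier() follows the copy loop
}

int main() {
    std::array<std::uint8_t, kTileBytes> clear_tile{};  // bf16 0.0 is all-zero bytes (avg-pool case)
    std::array<std::uint8_t, 4 * kTileBytes> in_cb{};   // pretend CB holding 4 tiles
    clear_out_tiles_sim(in_cb.data(), 4, clear_tile.data());
    return 0;
}
```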

ttnn/cpp/ttnn/operations/pool/generic/device/kernels/dataflow/reader_pool_2d_multi_core_sharded.cpp

Lines changed: 0 additions & 26 deletions
@@ -13,32 +13,6 @@
 #include "debug/dprint_pages.h"
 #endif
 
-template <uint32_t cb_id, uint32_t clear_value_cb_id>
-FORCE_INLINE void clear_out_tiles() {
-    constexpr uint32_t tile_size = get_tile_size(cb_id);
-    const uint32_t num_pages = get_local_cb_interface(cb_id).fifo_num_pages;
-    const uint32_t num_tiles = get_local_cb_interface(cb_id).fifo_page_size / tile_size;
-    const uint64_t clear_value_addr = get_noc_addr(get_read_ptr(clear_value_cb_id));
-    uint64_t write_addr = get_noc_addr(get_write_ptr(cb_id));
-
-    for (uint32_t i = 0; i < num_tiles * num_pages; ++i) {
-        noc_async_read(clear_value_addr, write_addr, tile_size);
-        write_addr += tile_size;
-    }
-    noc_async_read_barrier();
-}
-
-template <uint32_t clear_value_cb_id, uint32_t num_tiles>
-FORCE_INLINE void clear_out_tiles(uint64_t write_addr, uint64_t clear_value_addr) {
-    constexpr uint32_t tile_size = get_tile_size(clear_value_cb_id);
-
-    for (uint32_t i = 0; i < num_tiles; ++i) {
-        noc_async_read(clear_value_addr, write_addr, tile_size);
-        write_addr += tile_size;
-    }
-    noc_async_write_barrier();
-}
-
 /**
  * Pool 2D (Max pool 2D and Avg pool 2D)
  */

ttnn/cpp/ttnn/operations/pool/generic/device/kernels/dataflow/reader_pool_2d_multi_core_sharded_with_halo_large_kernel_v2.cpp

Lines changed: 71 additions & 10 deletions
@@ -44,6 +44,7 @@ void kernel_main() {
     constexpr uint32_t max_rows_for_reduction = get_compile_time_arg_val(14);
     constexpr uint32_t ceil_pad_w = get_compile_time_arg_val(15);
 
+    constexpr uint32_t TILE_HEIGHT = 32;
     constexpr uint32_t TILE_WIDTH = 32;
     constexpr uint32_t MAX_ELE_PER_REDUCTION = 512;  // TILE_WIDTH * 8 * numbytes
 
@@ -54,8 +55,14 @@
     constexpr uint32_t in_scalar_cb_id_1 = get_compile_time_arg_val(21);
     constexpr uint32_t interm_reduction_cb_id = get_compile_time_arg_val(22);
     constexpr uint32_t in_one_cb_id = get_compile_time_arg_val(23);
+    constexpr uint32_t clear_value_cb_id = get_compile_time_arg_val(24);
+    constexpr bool is_avg_pool = (bool)get_compile_time_arg_val(25);
     constexpr bool one_scalar_per_core = get_compile_time_arg_val(26);
     constexpr uint32_t config_cb_id = get_compile_time_arg_val(27);
+    constexpr uint32_t multi_buffering_factor = get_compile_time_arg_val(28);
+    constexpr uint32_t sync_cb_id1 = get_compile_time_arg_val(29);
+    constexpr uint32_t sync_cb_id2 = get_compile_time_arg_val(30);
+
     constexpr uint32_t in_scalar_cb_id =
         split_reader && reader_id == 1 && !one_scalar_per_core ? in_scalar_cb_id_1 : in_scalar_cb_id_0;
 
@@ -64,21 +71,68 @@
     uint32_t scalar_end = 1;
     uint32_t scalar_value = 0;
 
+    constexpr uint32_t window_size_hw = window_h * window_w;
+    constexpr uint32_t remaining_elems = window_size_hw % max_rows_for_reduction;
+    constexpr uint32_t interm_reduction_chunks =
+        remaining_elems ? window_size_hw / max_rows_for_reduction + 1 : window_size_hw / max_rows_for_reduction;
+    // we only need to initialize the in_cb if we will not fill each multibuffering chunk with max_rows worth of data
+    constexpr bool need_to_initialize_in_cb = remaining_elems && interm_reduction_chunks <= multi_buffering_factor;
+    constexpr uint32_t in_cb_ntiles = in_cb_sz / (TILE_WIDTH * TILE_HEIGHT);  // only use the non-multi buffering size
+
+    // fill the clear cb
+    if constexpr (split_reader) {
+        constexpr uint32_t half_tile = TILE_HEIGHT * TILE_WIDTH / 2;
+        if constexpr (reader_id == 0) {
+            fill_with_val(get_write_ptr(clear_value_cb_id), half_tile, bf16_init_value);
+        } else {
+            fill_with_val(get_write_ptr(clear_value_cb_id) + 2 * half_tile, half_tile, bf16_init_value);  // 2 for bf16
+        }
+    } else {
+        if constexpr (reader_id == 0) {
+            fill_with_val(get_write_ptr(clear_value_cb_id), TILE_HEIGHT * TILE_WIDTH, bf16_init_value);
+        }
+    }
+
+    // ensure the clear CB is full before proceeding
+    if constexpr (reader_id == 0) {
+        cb_push_back(sync_cb_id1, 1);
+        if constexpr (split_reader) {
+            cb_wait_front(sync_cb_id2, 1);
+        }
+    } else {
+        cb_push_back(sync_cb_id2, 1);
+        cb_wait_front(sync_cb_id1, 1);
+    }
+
+    if constexpr (need_to_initialize_in_cb && !is_avg_pool) {  // for avg pool fill_with_val runs in loop, no need to
+                                                               // initialize
+        clear_out_tiles<in_cb_id, clear_value_cb_id>();
+    }
+
     if constexpr (reader_id == 0) {
         constexpr uint32_t bf16_one_u16 = bf16_one_u32 >> 16;
-        // fill interm buffer with init_value
-        fill_with_val(get_write_ptr(interm_reduction_cb_id), in_cb_sz, bf16_init_value);
+        // initialize buffers
+        clear_out_tiles<interm_reduction_cb_id, clear_value_cb_id>();
         if constexpr (one_scalar_per_core) {
-            cb_reserve_back(in_scalar_cb_id_0, 1);
             fill_with_val(get_write_ptr(in_scalar_cb_id_0), TILE_WIDTH, bf16_scalar >> 16);
-            cb_push_back(in_scalar_cb_id_0, 1);
         }
-        if (bf16_scalar != bf16_one_u32 || !one_scalar_per_core) {
-            // Pool operation is not maxpool
+        if constexpr (is_avg_pool) {
+            // for avgpool, we use a one's CB to avoid double division by kernel size for large kernel case.
             fill_with_val(get_write_ptr(in_one_cb_id), TILE_WIDTH, bf16_one_u16);
         }
     }
 
+    // ensure initialization is done before proceeding
+    if constexpr (reader_id == 0) {
+        cb_push_back(sync_cb_id1, 1);
+        if constexpr (split_reader) {
+            cb_wait_front(sync_cb_id2, 2);
+        }
+    } else {
+        cb_push_back(sync_cb_id2, 1);
+        cb_wait_front(sync_cb_id1, 2);
+    }
+
     const uint32_t in_l1_read_base_addr = get_read_ptr(in_shard_cb_id);
     uint32_t reader_indices_l1_addr = get_read_ptr(in_reader_indices_cb_id);
     volatile tt_l1_ptr uint16_t* reader_indices_ptr =
@@ -90,7 +144,6 @@
 
     uint32_t counter = reader_id;
     constexpr uint32_t total_elems_to_reduce = window_h * window_w;
-    constexpr uint32_t remaining_elems = total_elems_to_reduce % max_rows_for_reduction;
     constexpr bool wide_reduction = in_nblocks_c > 1;
     constexpr uint32_t read_bytes =
         wide_reduction ? MAX_ELE_PER_REDUCTION : in_nbytes_c;  // in_cb is MAX_ELE_PER_REDUCTION for wide reductions
@@ -145,9 +198,17 @@
                 cb_push_back(in_cb_id, 1);
                 cb_reserve_back(in_cb_id, 1);
                 out_l1_write_addr = get_write_ptr(in_cb_id);
-                // If next is last chunk, fill whole buffer with the init_value.
-                if ((total_elems_to_reduce - processed_rows) < max_rows_for_reduction) {
-                    fill_with_val(out_l1_write_addr, in_cb_sz, bf16_init_value);
+                // If next is last chunk, fill whole buffer with the init_value. note for max pool we do
+                // not need to fill the CB for the partial chunk since as long as we have N>1 chunks we
+                // are guaranteed that the junk data remaining from chunk N-1 will fill the entire CB and
+                // cannot contain values greater than the max value, and if we have N=1 chunks we already
+                // initialized the entire CB with the init value, but for avg pool we need to fill the
+                // entire CB with the init value since the junk data will contribute to the average.
+                if constexpr (is_avg_pool) {
+                    if ((total_elems_to_reduce - processed_rows) < max_rows_for_reduction) {
+                        clear_out_tiles<clear_value_cb_id, in_cb_ntiles>(
+                            get_noc_addr(out_l1_write_addr), get_noc_addr(get_read_ptr(clear_value_cb_id)));
+                    }
                 }
             }
         }
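The sync CBs above implement a two-phase barrier on the split-reader path: each reader signals once after filling its half of the clear-value tile and once after buffer initialization, and the compute kernel waits for two pushes per CB before entering its reduction loop. A host-side simulation of that handshake with atomic counters standing in for CB page counts (illustration only, not TT-Metal API usage):

```cpp
// Two-phase reader/compute handshake modelled with atomic counters:
// cb_push_back(cb, 1) becomes an increment, cb_wait_front(cb, n) becomes
// "wait until the counter reaches n".
#include <atomic>
#include <cstdio>
#include <thread>

std::atomic<int> sync_cb1{0};  // pages pushed by reader 0
std::atomic<int> sync_cb2{0};  // pages pushed by reader 1

void wait_front(const std::atomic<int>& cb, int n) {
    while (cb.load(std::memory_order_acquire) < n) { std::this_thread::yield(); }
}

void reader(int reader_id) {
    auto& mine = (reader_id == 0) ? sync_cb1 : sync_cb2;
    auto& theirs = (reader_id == 0) ? sync_cb2 : sync_cb1;

    // phase 1: each reader fills its half of the clear-value tile, then signals
    mine.fetch_add(1, std::memory_order_release);
    wait_front(theirs, 1);  // both halves of the clear-value CB are now valid

    // phase 2: buffers (interm / in_cb / scalars) are initialized, then signal again
    mine.fetch_add(1, std::memory_order_release);
    wait_front(theirs, 2);  // both readers finished initialization
}

void compute() {
    // the compute kernel starts its reduction loop only after both init phases
    wait_front(sync_cb1, 2);
    wait_front(sync_cb2, 2);  // only checked when the split reader is enabled
    std::printf("compute: initialization complete, starting reduction loop\n");
}

int main() {
    std::thread r0(reader, 0), r1(reader, 1), c(compute);
    r0.join();
    r1.join();
    c.join();
    return 0;
}
```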

ttnn/cpp/ttnn/operations/pool/generic/device/pool_multi_core_program_factory.cpp

Lines changed: 25 additions & 7 deletions
@@ -296,9 +296,14 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
     const bool is_large_kernel =
         is_partial_tile ? kernel_size_hw > tt::constants::TILE_HEIGHT / 2 : kernel_size_hw > tt::constants::TILE_HEIGHT;
 
-    // ToDo: enable 32 sticks per tile for reduction for all cases.
+    // TODO: enable 32 sticks per tile for reduction for all cases, we can only support 16 row reductions for
+    // partial tiles, and there is currently a bug forcing us to use 16 row reductions for avg pool when there
+    // is 1 remainder C tile
     const uint32_t max_rows_for_reduction =
-        (!is_partial_tile && !is_large_kernel) ? tt::constants::TILE_HEIGHT : tt::constants::TILE_HEIGHT / 2;
+        !is_partial_tile && !(is_wide_reduction && pool_type == Pool2DType::AVG_POOL2D &&
+                              in_ntiles_c % MAX_TILES_PER_REDUCTION == 1)
+            ? tt::constants::TILE_HEIGHT
+            : tt::constants::TILE_HEIGHT / 2;
     TT_FATAL(nblocks == 1, "Multiple blocks not yet supported");
 
     if (input_shape[3] < tt::constants::TILE_WIDTH) {
@@ -360,14 +365,22 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
     }
 
     uint32_t clear_value_cb_id = 32;
-    if (max_rows_for_reduction == tt::constants::TILE_HEIGHT) {
+    if (max_rows_for_reduction == tt::constants::TILE_HEIGHT || is_large_kernel ||
+        (is_wide_reduction && in_ntiles_c % MAX_TILES_PER_REDUCTION != 0)) {
         // CB storing just "clear value" (-inf for maxpool, 0 for avgpool)
-        // is needed only if we use more then 16 sticks per tile for reduction.
+        // is needed only if we use more then 16 sticks per tile for reduction
+        // or if we use large kernel size.
        clear_value_cb_id = next_cb_index++;
         tt::tt_metal::create_cb(clear_value_cb_id, program, all_cores, tile_size(in_df), 1, in_df);
         log_debug(tt::LogOp, "CB {} :: PS = {}, NP = {}", clear_value_cb_id, tile_size(in_df), 1);
     }
 
+    // CBs for NC/BR synchornization
+    int32_t sync_cb_id1 = next_cb_index++;
+    auto sync_cb1 = tt::tt_metal::create_cb(sync_cb_id1, program, all_cores, 2, 2, tt::DataFormat::UInt16);
+    int32_t sync_cb_id2 = next_cb_index++;
+    auto sync_cb2 = tt::tt_metal::create_cb(sync_cb_id2, program, all_cores, 2, 2, tt::DataFormat::UInt16);
+
     // incoming data is the input cb instead of raw l1/dram addr
     // this input shard has halo and padding inserted.
     const uint32_t raw_in_cb_npages = input.shard_spec().value().shape[0];
@@ -441,7 +454,7 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
     uint32_t max_pool_partials_cb_id = 32;
     if (is_large_kernel) {
         max_pool_partials_cb_id = next_cb_index++;  // max_pool partials
-        const uint32_t max_pool_partials_cb_pagesize = out_cb_pagesize;
+        const uint32_t max_pool_partials_cb_pagesize = in_cb_pagesize;
         const uint32_t max_pool_partials_cb_npages = nblocks;
 
         tt::tt_metal::create_cb(
@@ -540,7 +553,10 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
         clear_value_cb_id,
         (uint32_t)pool_type,
         one_scalar_per_core,
-        config_cb_id};
+        config_cb_id,
+        multi_buffering_factor,
+        sync_cb_id1,
+        sync_cb_id2};
     std::vector<uint32_t> reader1_ct_args = reader0_ct_args;
     reader1_ct_args[8] = 1;  // split reader id for reader1
 
@@ -589,7 +605,9 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
         out_cb_id,
         max_pool_partials_cb_id,
         in_one_cb_id,
-        one_scalar_per_core};
+        one_scalar_per_core,
+        sync_cb_id1,
+        sync_cb_id2};
 
     auto compute_config = tt::tt_metal::ComputeConfig{
         .math_fidelity = MathFidelity::HiFi4,
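The new `max_rows_for_reduction` selection in the first hunk can be restated as a small predicate. A simplified sketch under assumed constants (`kMaxTilesPerReduction` is a stand-in for `MAX_TILES_PER_REDUCTION`):

```cpp
// Simplified restatement of the max_rows_for_reduction choice above.
#include <cstdint>

enum class Pool2DType { MAX_POOL2D, AVG_POOL2D };

constexpr uint32_t kTileHeight = 32;           // tt::constants::TILE_HEIGHT
constexpr uint32_t kMaxTilesPerReduction = 8;  // assumed value of MAX_TILES_PER_REDUCTION

constexpr uint32_t max_rows_for_reduction(
    bool is_partial_tile, bool is_wide_reduction, Pool2DType pool_type, uint32_t in_ntiles_c) {
    // Partial tiles (c < 32) still need 16-row reductions, and so does avg pool when a
    // wide reduction leaves exactly one remainder C tile (the known bug noted in the TODO).
    const bool avg_pool_one_remainder_c_tile = is_wide_reduction && pool_type == Pool2DType::AVG_POOL2D &&
                                               in_ntiles_c % kMaxTilesPerReduction == 1;
    return (!is_partial_tile && !avg_pool_one_remainder_c_tile) ? kTileHeight : kTileHeight / 2;
}

static_assert(max_rows_for_reduction(false, false, Pool2DType::MAX_POOL2D, 16) == 32, "full tile: 32 rows");
static_assert(max_rows_for_reduction(true, false, Pool2DType::MAX_POOL2D, 1) == 16, "partial tile: 16 rows");
static_assert(max_rows_for_reduction(false, true, Pool2DType::AVG_POOL2D, 9) == 16, "avg pool, 1 remainder C tile");

int main() { return 0; }
```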
