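Fix compile issues under -Werror: const-qualified by-value return type, unused local type aliases, missing braces in aggregate initializers, and dangling else-if branches in moe_fused_gate (ROCm#1657)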

yuguo68 · web-flow · commit e467f7d219f2 · 2025-12-17T11:20:27.000+08:00
diff --git a/csrc/include/aiter_hip_common.h b/csrc/include/aiter_hip_common.h
@@ -160,7 +160,7 @@ static const std::string get_gpu_arch()
     }
 }
 
-static const uint32_t get_num_cu_func()
+static uint32_t get_num_cu_func()
 {
     auto get_num_cu_local = []() {
         hipDevice_t dev;
diff --git a/csrc/kernels/custom_kernels.cu b/csrc/kernels/custom_kernels.cu
@@ -1792,7 +1792,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_sml_(const int K,
                                                                     const int CuCount)
 {
     using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
-    using intx2   = __attribute__((__vector_size__(2 * sizeof(int)))) int;
     using intx4   = __attribute__((__vector_size__(4 * sizeof(int)))) int;
     union bigType
     {
@@ -2014,7 +2013,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_(const int K,
                                                                 const int CuCount)
 {
     using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
-    using intx2   = __attribute__((__vector_size__(2 * sizeof(int)))) int;
     using intx4   = __attribute__((__vector_size__(4 * sizeof(int)))) int;
     union bigType
     {
diff --git a/csrc/kernels/mla/metadata/v1_1_device.cuh b/csrc/kernels/mla/metadata/v1_1_device.cuh
@@ -195,7 +195,7 @@ CK_TILE_DEVICE void generate_work(
                     if (p_reduce_partial_map[global_cluster_q_idx].q_start == -1)
                     {
                         p_reduce_partial_map[global_cluster_q_idx].q_start = *p_loc_partial_outputs;
-                        p_reduce_final_map[global_cluster_q_idx] = { work_info.qo_start, work_info.qo_end };
+                        p_reduce_final_map[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }};
                     }
                     ++(*p_num_partial_outputs);
                     *p_loc_partial_outputs += (work_info.qo_end - work_info.qo_start);
@@ -424,8 +424,8 @@ __global__ void kn_get_mla_metadata_v1_1(
     MlaPartialTileInfo* p_reduce_final_map = p_reduce_partial_map + tot_qo_tiles;
     for (int32_t cluster_q_idx = threadIdx.x; cluster_q_idx < tot_qo_tiles; cluster_q_idx += ck_tile::get_warp_size())
     {
-        p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};
-        p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};
+        p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};
+        p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};
     }
 
     // Step.5.3. Output work info
diff --git a/csrc/kernels/mla/metadata/v1_1_host.cuh b/csrc/kernels/mla/metadata/v1_1_host.cuh
@@ -95,7 +95,7 @@ std::vector<torch::Tensor> get_mla_metadata_v1_1_host(
     // Step.3.1. Allocates output buffers except indptrs
     std::vector<std::vector<MlaWorkInfo>> work_info_set(num_clusters, std::vector<MlaWorkInfo>());
     std::vector<std::vector<index_t>> reduce_partial_map(num_qo_clusters_indptr.back(), std::vector<index_t>());
-    std::vector<MlaPartialTileInfo> reduce_partial_info(num_qo_clusters_indptr.back(), {-1, -2});
+    std::vector<MlaPartialTileInfo> reduce_partial_info(num_qo_clusters_indptr.back(), {{-1, -2}});
 
     // Step.3.2. Declare priority queue
     using ClusterCost = std::tuple<int32_t, int32_t>; // cluster_id(cid), cost
@@ -175,7 +175,7 @@ std::vector<torch::Tensor> get_mla_metadata_v1_1_host(
                     if (reduce_partial_map[global_cluster_q_idx].empty())
                     {
                         ++num_reduce_row;
-                        reduce_partial_info[global_cluster_q_idx] = { work_info.qo_start, work_info.qo_end };
+                        reduce_partial_info[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }};
                     }
                     reduce_partial_map[global_cluster_q_idx].push_back(loc_partial_outputs);
                     ++num_partial_outputs;
diff --git a/csrc/kernels/mla/reduce.cu b/csrc/kernels/mla/reduce.cu
@@ -318,7 +318,7 @@ CK_TILE_DEVICE void mla_reduce_v1_impl_massive(
         else
         {
             const int32_t qo_len = reduce_partial_map_1 - reduce_partial_map_0;
-            return MlaPartialTileInfo{tile_idx * qo_len, (tile_idx + 1) * qo_len};
+            return MlaPartialTileInfo{{tile_idx * qo_len, (tile_idx + 1) * qo_len}};
         }
     }();
 
diff --git a/csrc/kernels/moe_fused_gate.cu b/csrc/kernels/moe_fused_gate.cu
@@ -591,6 +591,7 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
     {
     case 256:
         if(num_expert_group == 8)
+        {
             // This is deepseek v3 case. Here VPT = 256/8 = 32, ROWS_PER_WARP = 32/8 = 4,
             // ROWS_PER_CTA = 6 * 4 = 24.
             if(input.scalar_type() == at::kBFloat16)
@@ -605,23 +606,27 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
             {
                 LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 8);
             }
-            else if(num_expert_group == 16)
-                // Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
-                if(input.scalar_type() == at::kBFloat16)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
-                }
-                else if(input.scalar_type() == at::kHalf)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
-                }
-                else if(input.scalar_type() == at::kFloat)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
-                }
+        }
+        else if(num_expert_group == 16)
+        {
+            // Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
+            if(input.scalar_type() == at::kBFloat16)
+            {
+                LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
+            }
+            else if(input.scalar_type() == at::kHalf)
+            {
+                LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
+            }
+            else if(input.scalar_type() == at::kFloat)
+            {
+                LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
+            }
+        }
         break;
     case 128:
         if(num_expert_group == 4)
+        {
             // VPT = 128/4 = 32, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
             if(input.scalar_type() == at::kBFloat16)
             {
@@ -635,20 +640,23 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
             {
                 LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 4);
             }
-            else if(num_expert_group == 8)
-                // VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
-                if(input.scalar_type() == at::kBFloat16)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
-                }
-                else if(input.scalar_type() == at::kHalf)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
-                }
-                else if(input.scalar_type() == at::kFloat)
-                {
-                    LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
-                }
+        }
+        else if(num_expert_group == 8)
+        {
+            // VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
+            if(input.scalar_type() == at::kBFloat16)
+            {
+                LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
+            }
+            else if(input.scalar_type() == at::kHalf)
+            {
+                LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
+            }
+            else if(input.scalar_type() == at::kFloat)
+            {
+                LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
+            }
+        }
         break;
     default: break;
     }
diff --git a/csrc/kernels/quant_kernels.cu b/csrc/kernels/quant_kernels.cu
@@ -68,7 +68,6 @@ dynamic_per_group_scaled_quant_kernel(DTYPE_O* __restrict__ out,
     using vec_i = ck_tile::vec_t<DTYPE_I, thread_data_size>;
     static constexpr int32_t vec_size_o =
         std::is_same_v<DTYPE_O, ck_tile::fp4x2_t> ? thread_data_size / 2 : thread_data_size;
-    using vec_o = ck_tile::vec_t<DTYPE_O, vec_size_o>;
     const float inverted_DTYPE_MAX =
         std::is_same_v<DTYPE_O, ck_tile::fp4x2_t>
             ? 0.25
@@ -341,7 +340,6 @@ __device__ void scaled_quant_vgpr_impl(DTYPE_O* __restrict__ out,
         std::is_same_v<DTYPE_O, ck_tile::fp4x2_t> ? vec_size_i / 2 : vec_size_i;
 
     using vec_i       = ck_tile::vec_t<DTYPE_I, vec_size_i>;
-    using vec_o       = ck_tile::vec_t<DTYPE_O, vec_size_o>;
     using DTYPE_STORE = typename ck_tile::vector_traits<DTYPE_O>::scalar_type;
 
     const int64_t row_offset        = blockIdx.x * cols;

Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@ static const std::string get_gpu_arch()`
`160`	`160`	`}`
`161`	`161`	`}`
`162`	`162`
`163`		`-static const uint32_t get_num_cu_func()`
	`163`	`+static uint32_t get_num_cu_func()`
`164`	`164`	`{`
`165`	`165`	`auto get_num_cu_local = []() {`
`166`	`166`	`hipDevice_t dev;`
Original file line number	Diff line number	Diff line change
`@@ -1792,7 +1792,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_sml_(const int K,`
`1792`	`1792`	`const int CuCount)`
`1793`	`1793`	`{`
`1794`	`1794`	`using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;`
`1795`		`- using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;`
`1796`	`1795`	`using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;`
`1797`	`1796`	`union bigType`
`1798`	`1797`	`{`
`@@ -2014,7 +2013,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_(const int K,`
`2014`	`2013`	`const int CuCount)`
`2015`	`2014`	`{`
`2016`	`2015`	`using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;`
`2017`		`- using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;`
`2018`	`2016`	`using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;`
`2019`	`2017`	`union bigType`
`2020`	`2018`	`{`
Original file line number	Diff line number	Diff line change
`@@ -195,7 +195,7 @@ CK_TILE_DEVICE void generate_work(`
`195`	`195`	`if (p_reduce_partial_map[global_cluster_q_idx].q_start == -1)`
`196`	`196`	`{`
`197`	`197`	`p_reduce_partial_map[global_cluster_q_idx].q_start = *p_loc_partial_outputs;`
`198`		`- p_reduce_final_map[global_cluster_q_idx] = { work_info.qo_start, work_info.qo_end };`
	`198`	`+ p_reduce_final_map[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }};`
`199`	`199`	`}`
`200`	`200`	`++(*p_num_partial_outputs);`
`201`	`201`	`*p_loc_partial_outputs += (work_info.qo_end - work_info.qo_start);`
`@@ -424,8 +424,8 @@ __global__ void kn_get_mla_metadata_v1_1(`
`424`	`424`	`MlaPartialTileInfo* p_reduce_final_map = p_reduce_partial_map + tot_qo_tiles;`
`425`	`425`	`for (int32_t cluster_q_idx = threadIdx.x; cluster_q_idx < tot_qo_tiles; cluster_q_idx += ck_tile::get_warp_size())`
`426`	`426`	`{`
`427`		`- p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};`
`428`		`- p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};`
	`427`	`+ p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};`
	`428`	`+ p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};`
`429`	`429`	`}`
`430`	`430`
`431`	`431`	`// Step.5.3. Output work info`
Original file line number	Diff line number	Diff line change
`@@ -318,7 +318,7 @@ CK_TILE_DEVICE void mla_reduce_v1_impl_massive(`
`318`	`318`	`else`
`319`	`319`	`{`
`320`	`320`	`const int32_t qo_len = reduce_partial_map_1 - reduce_partial_map_0;`
`321`		`- return MlaPartialTileInfo{tile_idx * qo_len, (tile_idx + 1) * qo_len};`
	`321`	`+ return MlaPartialTileInfo{{tile_idx * qo_len, (tile_idx + 1) * qo_len}};`
`322`	`322`	`}`
`323`	`323`	`}();`
`324`	`324`