Skip to content

Commit e467f7d

Browse files
authored
fix some compile issues with -Werror (ROCm#1657)
1 parent d0c313d commit e467f7d

7 files changed

Lines changed: 43 additions & 39 deletions

File tree

csrc/include/aiter_hip_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ static const std::string get_gpu_arch()
160160
}
161161
}
162162

163-
static const uint32_t get_num_cu_func()
163+
static uint32_t get_num_cu_func()
164164
{
165165
auto get_num_cu_local = []() {
166166
hipDevice_t dev;

csrc/kernels/custom_kernels.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,7 +1792,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_sml_(const int K,
17921792
const int CuCount)
17931793
{
17941794
using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
1795-
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
17961795
using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
17971796
union bigType
17981797
{
@@ -2014,7 +2013,6 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) wvSplitKQ_hf_(const int K,
20142013
const int CuCount)
20152014
{
20162015
using scalar8 = __attribute__((__vector_size__((A_CHUNK / 4) * sizeof(float)))) float;
2017-
using intx2 = __attribute__((__vector_size__(2 * sizeof(int)))) int;
20182016
using intx4 = __attribute__((__vector_size__(4 * sizeof(int)))) int;
20192017
union bigType
20202018
{

csrc/kernels/mla/metadata/v1_1_device.cuh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ CK_TILE_DEVICE void generate_work(
195195
if (p_reduce_partial_map[global_cluster_q_idx].q_start == -1)
196196
{
197197
p_reduce_partial_map[global_cluster_q_idx].q_start = *p_loc_partial_outputs;
198-
p_reduce_final_map[global_cluster_q_idx] = { work_info.qo_start, work_info.qo_end };
198+
p_reduce_final_map[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }};
199199
}
200200
++(*p_num_partial_outputs);
201201
*p_loc_partial_outputs += (work_info.qo_end - work_info.qo_start);
@@ -424,8 +424,8 @@ __global__ void kn_get_mla_metadata_v1_1(
424424
MlaPartialTileInfo* p_reduce_final_map = p_reduce_partial_map + tot_qo_tiles;
425425
for (int32_t cluster_q_idx = threadIdx.x; cluster_q_idx < tot_qo_tiles; cluster_q_idx += ck_tile::get_warp_size())
426426
{
427-
p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};
428-
p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{-1, -2};
427+
p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};
428+
p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}};
429429
}
430430

431431
// Step.5.3. Output work info

csrc/kernels/mla/metadata/v1_1_host.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ std::vector<torch::Tensor> get_mla_metadata_v1_1_host(
9595
// Step.3.1. Allocates output buffers except indptrs
9696
std::vector<std::vector<MlaWorkInfo>> work_info_set(num_clusters, std::vector<MlaWorkInfo>());
9797
std::vector<std::vector<index_t>> reduce_partial_map(num_qo_clusters_indptr.back(), std::vector<index_t>());
98-
std::vector<MlaPartialTileInfo> reduce_partial_info(num_qo_clusters_indptr.back(), {-1, -2});
98+
std::vector<MlaPartialTileInfo> reduce_partial_info(num_qo_clusters_indptr.back(), {{-1, -2}});
9999

100100
// Step.3.2. Declare priority queue
101101
using ClusterCost = std::tuple<int32_t, int32_t>; // cluster_id(cid), cost
@@ -175,7 +175,7 @@ std::vector<torch::Tensor> get_mla_metadata_v1_1_host(
175175
if (reduce_partial_map[global_cluster_q_idx].empty())
176176
{
177177
++num_reduce_row;
178-
reduce_partial_info[global_cluster_q_idx] = { work_info.qo_start, work_info.qo_end };
178+
reduce_partial_info[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }};
179179
}
180180
reduce_partial_map[global_cluster_q_idx].push_back(loc_partial_outputs);
181181
++num_partial_outputs;

csrc/kernels/mla/reduce.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -318,7 +318,7 @@ CK_TILE_DEVICE void mla_reduce_v1_impl_massive(
318318
else
319319
{
320320
const int32_t qo_len = reduce_partial_map_1 - reduce_partial_map_0;
321-
return MlaPartialTileInfo{tile_idx * qo_len, (tile_idx + 1) * qo_len};
321+
return MlaPartialTileInfo{{tile_idx * qo_len, (tile_idx + 1) * qo_len}};
322322
}
323323
}();
324324

csrc/kernels/moe_fused_gate.cu

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,7 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
591591
{
592592
case 256:
593593
if(num_expert_group == 8)
594+
{
594595
// This is deepseek v3 case. Here VPT = 256/8 = 32, ROWS_PER_WARP = 32/8 = 4,
595596
// ROWS_PER_CTA = 6 * 4 = 24.
596597
if(input.scalar_type() == at::kBFloat16)
@@ -605,23 +606,27 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
605606
{
606607
LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 8);
607608
}
608-
else if(num_expert_group == 16)
609-
// Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
610-
if(input.scalar_type() == at::kBFloat16)
611-
{
612-
LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
613-
}
614-
else if(input.scalar_type() == at::kHalf)
615-
{
616-
LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
617-
}
618-
else if(input.scalar_type() == at::kFloat)
619-
{
620-
LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
621-
}
609+
}
610+
else if(num_expert_group == 16)
611+
{
612+
// Here VPT = 256/16 = 16, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
613+
if(input.scalar_type() == at::kBFloat16)
614+
{
615+
LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 256, 16);
616+
}
617+
else if(input.scalar_type() == at::kHalf)
618+
{
619+
LAUNCH_MOE_GATE_CONFIG(float16_t, 256, 16);
620+
}
621+
else if(input.scalar_type() == at::kFloat)
622+
{
623+
LAUNCH_MOE_GATE_CONFIG(float32_t, 256, 16);
624+
}
625+
}
622626
break;
623627
case 128:
624628
if(num_expert_group == 4)
629+
{
625630
// VPT = 128/4 = 32, ROWS_PER_WARP = 32/16 = 2, ROWS_PER_CTA = 6 * 2 = 12.
626631
if(input.scalar_type() == at::kBFloat16)
627632
{
@@ -635,20 +640,23 @@ std::vector<at::Tensor> moe_fused_gate(at::Tensor& input,
635640
{
636641
LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 4);
637642
}
638-
else if(num_expert_group == 8)
639-
// VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
640-
if(input.scalar_type() == at::kBFloat16)
641-
{
642-
LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
643-
}
644-
else if(input.scalar_type() == at::kHalf)
645-
{
646-
LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
647-
}
648-
else if(input.scalar_type() == at::kFloat)
649-
{
650-
LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
651-
}
643+
}
644+
else if(num_expert_group == 8)
645+
{
646+
// VPT = 128/8 = 16, ROWS_PER_WARP = 32/8 = 4, ROWS_PER_CTA = 6 * 4 = 24.
647+
if(input.scalar_type() == at::kBFloat16)
648+
{
649+
LAUNCH_MOE_GATE_CONFIG(bfloat16_t, 128, 8);
650+
}
651+
else if(input.scalar_type() == at::kHalf)
652+
{
653+
LAUNCH_MOE_GATE_CONFIG(float16_t, 128, 8);
654+
}
655+
else if(input.scalar_type() == at::kFloat)
656+
{
657+
LAUNCH_MOE_GATE_CONFIG(float32_t, 128, 8);
658+
}
659+
}
652660
break;
653661
default: break;
654662
}

csrc/kernels/quant_kernels.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,6 @@ dynamic_per_group_scaled_quant_kernel(DTYPE_O* __restrict__ out,
6868
using vec_i = ck_tile::vec_t<DTYPE_I, thread_data_size>;
6969
static constexpr int32_t vec_size_o =
7070
std::is_same_v<DTYPE_O, ck_tile::fp4x2_t> ? thread_data_size / 2 : thread_data_size;
71-
using vec_o = ck_tile::vec_t<DTYPE_O, vec_size_o>;
7271
const float inverted_DTYPE_MAX =
7372
std::is_same_v<DTYPE_O, ck_tile::fp4x2_t>
7473
? 0.25
@@ -341,7 +340,6 @@ __device__ void scaled_quant_vgpr_impl(DTYPE_O* __restrict__ out,
341340
std::is_same_v<DTYPE_O, ck_tile::fp4x2_t> ? vec_size_i / 2 : vec_size_i;
342341

343342
using vec_i = ck_tile::vec_t<DTYPE_I, vec_size_i>;
344-
using vec_o = ck_tile::vec_t<DTYPE_O, vec_size_o>;
345343
using DTYPE_STORE = typename ck_tile::vector_traits<DTYPE_O>::scalar_type;
346344

347345
const int64_t row_offset = blockIdx.x * cols;

0 commit comments

Comments
 (0)