Commit a36d902 (parent 36040d5)

Inform Mullapudi2016's grouping algo of the gpu_blocks() dims.

By design, a Stage cannot "compute_at" another stage's outer gpu_blocks() dimensions. Inform the auto-grouping algorithm of all outer dimensions by appending Vars to the inner_dims and outer_dims lists. Make can_parallelize() and can_vectorize() report whether the request was accepted, so a rejected request is no longer silently ignored.
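
For illustration, here is a minimal, self-contained sketch of the acceptance-reporting pattern this change adds to GPUTilingDedup. The names TilingDedup, SplitRequest, and record_split are hypothetical stand-ins rather than Halide's actual split_t / can_parallelize() machinery; only the try_emplace / std::optional idiom mirrors the diff below.

// Standalone sketch (not the Halide code itself): a dedup helper reports
// acceptance back to the caller through std::optional instead of silently
// ignoring a duplicate request. Type and function names are hypothetical.
#include <iostream>
#include <map>
#include <optional>
#include <string>

struct SplitRequest {
    std::string var;
    int factor;
};

class TilingDedup {
    std::map<std::string, SplitRequest> parallelize;

public:
    // Returns std::nullopt when the dimension was already registered,
    // mirroring how the patched helper bails out on a duplicate entry.
    std::optional<SplitRequest> record_split(const std::string &var, int factor) {
        SplitRequest entry{var, factor};
        const auto [_, inserted] = parallelize.try_emplace(var, entry);
        if (!inserted) {
            return std::nullopt;
        }
        return entry;
    }
};

int main() {
    TilingDedup dedup;
    std::cout << std::boolalpha
              << dedup.record_split("x", 16).has_value() << "\n"   // true: first request accepted
              << dedup.record_split("x", 32).has_value() << "\n";  // false: duplicate rejected
}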

1 file changed: 20 additions (+), 9 deletions (-)

src/autoschedulers/mullapudi2016/AutoSchedule.cpp

@@ -1154,7 +1154,11 @@ class GPUTilingDedup {
         VarOrRVar inner{var + "_i", v.is_rvar};

         split_t entry{v, outer, inner, factor, TailStrategy::Auto};
-        parallelize.try_emplace(var, entry);
+        const auto [_, insertion_happened] = parallelize.try_emplace(var, entry);
+        if (!insertion_happened) {
+            return std::nullopt;
+        }
+
         return entry;
     }

@@ -1163,15 +1167,16 @@ class GPUTilingDedup {
      * @param[in] vo split into outer dimension
      * @param[in] vi split into inner dimension
      * @param[in] factor the partition size.
+     * @return whether the vectorize() request is accepted or rejected.
      */
-    void can_vectorize(const VarOrRVar &v, const VarOrRVar &vo, const VarOrRVar &vi, const Expr &factor) {
+    bool can_vectorize(const VarOrRVar &v, const VarOrRVar &vo, const VarOrRVar &vi, const Expr &factor) {
         const auto &var = v.name();

         if (is_inner(var)) {
             // For CPU, it makes sense to further split the inner loop and run
             // SIMD instruction. But this operation is redundant in GPU as the
             // gpu_block is already specified.
-            return;
+            return false;
         }

         debug(2) << f.name() << ".vectorize(" << v.name() << "," << factor << ")\n";

@@ -1180,10 +1185,11 @@ class GPUTilingDedup {
             // vectorized dimension is treated as a thread in GPU. No need to
             // further split it to match the natural_vector_size() of CPUs.
             inner_vars.emplace(v.name());
-            return;
+            return false;
         }

         parallelize.try_emplace(var, split_t{v, vo, vi, factor, TailStrategy::Auto});
+        return true;
     }

     /** Mark the current dimension is already split by Mullapudi2016's

@@ -2880,11 +2886,11 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
     internal_assert(is_rvar == dims[vec_dim_index].is_rvar());

     VarOrRVar vec_var(vec_dim_name, is_rvar);
-    auto [inner, outer] = [&]() -> std::pair<VarOrRVar, VarOrRVar> {
+    auto [inner, outer, accepted] = [&]() -> std::tuple<VarOrRVar, VarOrRVar, bool> {
         if (t.has_gpu_feature()) {
             VarOrRVar inner{vec_var.name() + "_vi", vec_var.is_rvar}, outer{vec_var.name() + "_vo", vec_var.is_rvar};
-            gpu_tiling.can_vectorize(vec_var, outer, inner, vec_len);
-            return {inner, outer};
+            const bool accepted = gpu_tiling.can_vectorize(vec_var, outer, inner, vec_len);
+            return {inner, outer, accepted};
         }

         auto split_vars = split_dim(g, f_handle, stage_num, def, is_group_output, vec_var, vec_len,

@@ -2894,7 +2900,7 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
         sched.push_schedule(f_handle.name(), stage_num,
                             "vectorize(" + split_vars.first.name() + ")",
                             {split_vars.first.name()});
-        return split_vars;
+        return std::make_tuple(split_vars.first, split_vars.second, true);
     }();

     if (is_rvar) {

@@ -2912,6 +2918,10 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
                         << "\" in function \"" << f_handle.name() << "\"\n";
     }

+    if (!accepted) {
+        return std::nullopt;
+    }
+
     return make_pair(inner, outer);
 }

@@ -3284,7 +3294,8 @@ void Partitioner::generate_group_cpu_schedule(
     }

     // Find the level at which group members will be computed.
-    int tile_inner_index = dims.size() - outer_dims.size() - 1;
+    internal_assert(dims.size() > outer_dims.size());
+    const auto tile_inner_index = dims.size() - outer_dims.size() - 1;
     VarOrRVar tile_inner_var(Var::outermost());
     if (!outer_dims.empty()) {
         string var_name = get_base_name(dims[tile_inner_index].var);
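
Presumably the new internal_assert guards against unsigned wrap-around: dims.size() and outer_dims.size() are size_t, so once tile_inner_index is deduced as const auto, the subtraction would wrap to a huge value rather than go negative whenever outer_dims is not strictly smaller than dims. A standalone sketch of that hazard (not Halide code; the vectors here are placeholders):

// Minimal illustration of why the assert matters: with unsigned sizes,
// "dims.size() - outer_dims.size() - 1" wraps around instead of producing -1
// when outer_dims is not strictly smaller than dims.
#include <cassert>
#include <iostream>
#include <vector>

int main() {
    std::vector<int> dims{1, 2, 3};
    std::vector<int> outer_dims{1, 2, 3};

    // Guard equivalent to the internal_assert added in the diff:
    // assert(dims.size() > outer_dims.size());  // would fire for these inputs

    const auto tile_inner_index = dims.size() - outer_dims.size() - 1;
    std::cout << tile_inner_index << "\n";  // prints a huge size_t value, not -1
}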
