Skip to content

Commit a96d864

Browse files
committed
Mullapudi2016-GPU schedule default to false
Add an autoscheduler option `mullapudi2016.experimental_gpu_schedule` which defaults to `false`, so that the autoscheduler always generates CPU schedules by default. This preserves the original behavior of the Mullapudi2016 scheduler and allows all CI test-benchmark jobs on Buildbot to pass for the "CUDA" and "Metal" targets — especially for algorithm pipelines that fit the assumptions of the Mullapudi2016 model poorly (i.e. cascaded stencil operations).
1 parent ca98dee commit a96d864

File tree

1 file changed

+13
-9
lines changed

1 file changed

+13
-9
lines changed

src/autoschedulers/mullapudi2016/AutoSchedule.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ using std::vector;
2323
namespace {
2424

2525
struct ArchParams {
26+
/** Enable experimental-GPU schedule feature. */
27+
bool experimental_gpu_schedule{false};
28+
2629
/** Maximum level of parallelism available. */
2730
int parallelism{};
2831

@@ -2858,7 +2861,7 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
28582861
// Set the vector length as the maximum of the natural vector size of all
28592862
// values produced by the function.
28602863
const auto vec_len = [&]() -> int {
2861-
if (t.has_gpu_feature()) {
2864+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
28622865
/** Section 5.4 of the Mullapudi2016 article: We configure the
28632866
* auto-scheduler to target the GPU by setting the ...,
28642867
* VECTOR_WIDTH to 32.
@@ -2895,7 +2898,7 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
28952898

28962899
VarOrRVar vec_var(vec_dim_name, is_rvar);
28972900
auto [inner, outer, accepted] = [&]() -> std::tuple<VarOrRVar, VarOrRVar, bool> {
2898-
if (t.has_gpu_feature()) {
2901+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
28992902
VarOrRVar inner{vec_var.name() + "_vi", vec_var.is_rvar}, outer{vec_var.name() + "_vo", vec_var.is_rvar};
29002903
const bool accepted = gpu_tiling.can_vectorize(vec_var, outer, inner, vec_len);
29012904
return {inner, outer, accepted};
@@ -3052,7 +3055,7 @@ void Partitioner::reorder_dims(Stage f_handle, int stage_num, Definition def,
30523055
}
30533056

30543057
if (dims != ordering) {
3055-
if (t.has_gpu_feature()) {
3058+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
30563059
gpu_tiling.canReorder(ordering);
30573060
} else {
30583061
f_handle.reorder(ordering);
@@ -3205,7 +3208,7 @@ void Partitioner::generate_group_cpu_schedule(
32053208
}
32063209

32073210
if (dims != ordering) {
3208-
if (t.has_gpu_feature()) {
3211+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
32093212
gpu_tiling.canReorder(ordering);
32103213
} else {
32113214
f_handle.reorder(ordering);
@@ -3219,7 +3222,7 @@ void Partitioner::generate_group_cpu_schedule(
32193222
auto vectorized_split = vectorize_stage(g, f_handle, g.output.stage_num, def, g_out, true, t,
32203223
rvars, stg_estimates, sched, gpu_tiling);
32213224

3222-
if (t.has_gpu_feature() && vectorized_split) {
3225+
if (t.has_gpu_feature() && vectorized_split && arch_params.experimental_gpu_schedule) {
32233226
auto [v_i, v_o] = *vectorized_split;
32243227
inner_dims.emplace_back(v_i);
32253228
outer_dims.emplace_back(v_o);
@@ -3265,7 +3268,7 @@ void Partitioner::generate_group_cpu_schedule(
32653268
if ((iter != stg_estimates.end()) && iter->second.defined()) {
32663269
if (!seq_var.empty()) {
32673270
VarOrRVar seq(seq_var, (rvars.find(seq_var) != rvars.end()));
3268-
if (t.has_gpu_feature()) {
3271+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
32693272
gpu_tiling.canReorder({seq, v});
32703273
} else {
32713274
f_handle.reorder(seq, v);
@@ -3274,7 +3277,7 @@ void Partitioner::generate_group_cpu_schedule(
32743277
{seq_var, var});
32753278
}
32763279
}
3277-
if (t.has_gpu_feature()) {
3280+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
32783281
auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
32793282
if (parallelized_split) {
32803283
auto split_vars = *parallelized_split;
@@ -3297,7 +3300,7 @@ void Partitioner::generate_group_cpu_schedule(
32973300
debug(1) << "Insufficient parallelism for " << f_handle.name() << "\n";
32983301
}
32993302

3300-
if (t.has_gpu_feature()) {
3303+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
33013304
gpu_tiling.apply(sched);
33023305
}
33033306

@@ -3378,7 +3381,7 @@ void Partitioner::generate_group_cpu_schedule(
33783381
vectorize_stage(g, mem_handle, mem.stage_num, mem_def, mem.func, false,
33793382
t, mem_rvars, mem_estimates, sched, gpu_tiling2);
33803383

3381-
if (t.has_gpu_feature()) {
3384+
if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
33823385
gpu_tiling2.apply(sched);
33833386
}
33843387
}
@@ -3917,6 +3920,7 @@ struct Mullapudi2016 {
39173920
ArchParams arch_params{target.has_gpu_feature()};
39183921
{
39193922
ParamParser parser(params_in.extra);
3923+
parser.parse("experimental_gpu_schedule", &arch_params.experimental_gpu_schedule);
39203924
parser.parse("parallelism", &arch_params.parallelism);
39213925
parser.parse("last_level_cache_size", &arch_params.last_level_cache_size);
39223926
parser.parse("balance", &arch_params.balance);

0 commit comments

Comments
 (0)