@@ -23,6 +23,9 @@ using std::vector;
 namespace {
 
 struct ArchParams {
+    /** Enable the experimental GPU schedule feature. */
+    bool experimental_gpu_schedule{false};
+
     /** Maximum level of parallelism available. */
     int parallelism{};
 
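For context, here is a hedged usage sketch of how a client could opt in to the new flag through the autoscheduler's string-keyed extra parameters. The Func, the estimates, the CUDA target, and the plugin library name below are illustrative assumptions, not part of this patch:

#include "Halide.h"
using namespace Halide;

int main() {
    // Illustrative pipeline; any pipeline with estimates would do.
    Func f("f");
    Var x("x"), y("y");
    f(x, y) = x + y;
    f.set_estimates({{0, 1024}, {0, 1024}});

    // Assumes a GPU-capable host and that the Mullapudi2016 plugin has been
    // built; the shared-library name may vary by platform.
    load_plugin("autoschedule_mullapudi2016");
    Target target = get_host_target().with_feature(Target::CUDA);

    // The new extra parameter is consumed by ParamParser (see the last hunk).
    AutoschedulerParams params{"Mullapudi2016",
                               {{"experimental_gpu_schedule", "1"},
                                {"parallelism", "128"}}};
    Pipeline(f).apply_autoscheduler(target, params);
    return 0;
}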
@@ -2858,7 +2861,7 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
     // Set the vector length as the maximum of the natural vector size of all
     // values produced by the function.
     const auto vec_len = [&]() -> int {
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
             /** Section 5.4 of the Mullapudi2016 article: We configure the
              * auto-scheduler to target the GPU by setting the ...,
              * VECTOR_WIDTH to 32.
@@ -2895,7 +2898,7 @@ std::optional<pair<VarOrRVar, VarOrRVar>> Partitioner::vectorize_stage(const Gro
 
     VarOrRVar vec_var(vec_dim_name, is_rvar);
     auto [inner, outer, accepted] = [&]() -> std::tuple<VarOrRVar, VarOrRVar, bool> {
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
             VarOrRVar inner{vec_var.name() + "_vi", vec_var.is_rvar}, outer{vec_var.name() + "_vo", vec_var.is_rvar};
             const bool accepted = gpu_tiling.can_vectorize(vec_var, outer, inner, vec_len);
             return {inner, outer, accepted};
@@ -3052,7 +3055,7 @@ void Partitioner::reorder_dims(Stage f_handle, int stage_num, Definition def,
     }
 
     if (dims != ordering) {
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
            gpu_tiling.canReorder(ordering);
        } else {
            f_handle.reorder(ordering);
@@ -3205,7 +3208,7 @@ void Partitioner::generate_group_cpu_schedule(
     }
 
     if (dims != ordering) {
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
            gpu_tiling.canReorder(ordering);
        } else {
            f_handle.reorder(ordering);
@@ -3219,7 +3222,7 @@ void Partitioner::generate_group_cpu_schedule(
     auto vectorized_split = vectorize_stage(g, f_handle, g.output.stage_num, def, g_out, true, t,
                                             rvars, stg_estimates, sched, gpu_tiling);
 
-    if (t.has_gpu_feature() && vectorized_split) {
+    if (t.has_gpu_feature() && vectorized_split && arch_params.experimental_gpu_schedule) {
         auto [v_i, v_o] = *vectorized_split;
         inner_dims.emplace_back(v_i);
         outer_dims.emplace_back(v_o);
@@ -3265,7 +3268,7 @@ void Partitioner::generate_group_cpu_schedule(
     if ((iter != stg_estimates.end()) && iter->second.defined()) {
         if (!seq_var.empty()) {
             VarOrRVar seq(seq_var, (rvars.find(seq_var) != rvars.end()));
-            if (t.has_gpu_feature()) {
+            if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
                gpu_tiling.canReorder({seq, v});
            } else {
                f_handle.reorder(seq, v);
@@ -3274,7 +3277,7 @@ void Partitioner::generate_group_cpu_schedule(
                                     {seq_var, var});
             }
         }
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
             auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
             if (parallelized_split) {
                 auto split_vars = *parallelized_split;
@@ -3297,7 +3300,7 @@ void Partitioner::generate_group_cpu_schedule(
         debug(1) << "Insufficient parallelism for " << f_handle.name() << "\n";
     }
 
-    if (t.has_gpu_feature()) {
+    if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
         gpu_tiling.apply(sched);
     }
 
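Note that the GPU-gated calls above (canReorder, can_vectorize, can_parallelize) go through the gpu_tiling helper rather than mutating f_handle directly; the schedule is rewritten once, in the apply(sched) call this hunk guards. A minimal, self-contained sketch of that record-then-apply pattern, with assumed names that do not reflect the helper's actual interface:

#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

class DeferredTiling {
    // Each pending transformation is stored as a description plus a closure.
    std::vector<std::pair<std::string, std::function<void()>>> pending;

public:
    void record(const std::string &what, std::function<void()> op) {
        pending.emplace_back(what, std::move(op));
    }

    // Apply every recorded transformation in order, logging what ran;
    // deferring like this lets duplicate or conflicting requests be
    // reconciled before anything touches the schedule.
    void apply() {
        for (auto &[what, op] : pending) {
            std::cout << "applying: " << what << "\n";
            op();
        }
        pending.clear();
    }
};

int main() {
    DeferredTiling gpu_tiling;
    gpu_tiling.record("reorder(x, y)", [] { /* would call f_handle.reorder(...) */ });
    gpu_tiling.record("split(x, xo, xi, 32)", [] { /* would call f_handle.split(...) */ });
    gpu_tiling.apply();  // mirrors the gated gpu_tiling.apply(sched) call above
    return 0;
}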
@@ -3378,7 +3381,7 @@ void Partitioner::generate_group_cpu_schedule(
         vectorize_stage(g, mem_handle, mem.stage_num, mem_def, mem.func, false,
                         t, mem_rvars, mem_estimates, sched, gpu_tiling2);
 
-        if (t.has_gpu_feature()) {
+        if (t.has_gpu_feature() && arch_params.experimental_gpu_schedule) {
             gpu_tiling2.apply(sched);
         }
     }
@@ -3917,6 +3920,7 @@ struct Mullapudi2016 {
         ArchParams arch_params{target.has_gpu_feature()};
         {
             ParamParser parser(params_in.extra);
+            parser.parse("experimental_gpu_schedule", &arch_params.experimental_gpu_schedule);
             parser.parse("parallelism", &arch_params.parallelism);
             parser.parse("last_level_cache_size", &arch_params.last_level_cache_size);
             parser.parse("balance", &arch_params.balance);