Commit 36040d5

Mark outer dimensions that can be parallelized

For GPU targets, mark all outer dimensions generated by GPUTilingDedup::can_parallelize(). These dimensions are marked as gpu_blocks(), so any Stage scheduled compute_at this stage needs to use the innermost gpu_block. Known existing bug: a bad_alloc exception in autograd.generator, most likely from running out of memory on consumer-grade computers (e.g. 16 GB of RAM).
1 parent 75689cf commit 36040d5
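
A minimal hand-written Halide sketch of the schedule shape described above (illustrative only, not code emitted by the autoscheduler; the Func and Var names are invented): the outer tile dimensions become gpu_blocks(), and a producer computed at this stage anchors at the innermost gpu_block.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        // Illustrative pipeline: one producer feeding one consumer.
        Func producer("producer"), consumer("consumer");
        Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");

        producer(x, y) = x + y;
        consumer(x, y) = producer(x, y) * 2;

        // Tile the consumer; outer tile dimensions map to GPU blocks,
        // inner tile dimensions map to GPU threads.
        consumer.tile(x, y, xo, yo, xi, yi, 16, 16)
                .gpu_blocks(xo, yo)
                .gpu_threads(xi, yi);

        // A stage computed at the consumer must anchor at the innermost
        // gpu_block (xo here, since the nest is yo, xo, yi, xi from
        // outermost to innermost).
        producer.compute_at(consumer, xo)
                .gpu_threads(x, y);

        // Inspect the resulting loop nest; no GPU is needed for this.
        consumer.print_loop_nest();
        return 0;
    }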

File tree

1 file changed (+13, −6 lines)


src/autoschedulers/mullapudi2016/AutoSchedule.cpp

Lines changed: 13 additions & 6 deletions
@@ -1139,21 +1139,23 @@ class GPUTilingDedup {
      * @param[in] v dimension to parallelize.
      * @param[in] factor expected extent of the dimension.
      */
-    void can_parallelize(const VarOrRVar &v, const Expr &factor) {
+    std::optional<split_t> can_parallelize(const VarOrRVar &v, const Expr &factor) {
         const auto &var = v.name();
 
         if (is_outer(var) || is_inner(var)) {
             // For CPU, it makes sense to mark the outer loop to execute in
             // parallel. But this operation is redundant in GPU as the gpu_block
             // is already specified.
-            return;
+            return std::nullopt;
         }
 
         debug(2) << f.name() << ".parallel(" << v.name() << "," << factor << ")\n";
         VarOrRVar outer{var + "_o", v.is_rvar};
         VarOrRVar inner{var + "_i", v.is_rvar};
 
-        parallelize.try_emplace(var, split_t{v, std::move(outer), std::move(inner), factor, TailStrategy::Auto});
+        split_t entry{v, outer, inner, factor, TailStrategy::Auto};
+        parallelize.try_emplace(var, entry);
+        return entry;
     }
 
     /** Indicate the desire to Func::vectorize(v_i).
@@ -3201,8 +3203,8 @@ void Partitioner::generate_group_cpu_schedule(
 
         if (t.has_gpu_feature() && vectorized_split) {
             auto [v_i, v_o] = *vectorized_split;
-            inner_dims.emplace_back(std::move(v_i));
-            outer_dims.emplace_back(std::move(v_o));
+            inner_dims.emplace_back(v_i);
+            outer_dims.emplace_back(v_o);
         }
     }
 
@@ -3255,7 +3257,12 @@ void Partitioner::generate_group_cpu_schedule(
             }
         }
         if (t.has_gpu_feature()) {
-            gpu_tiling.can_parallelize(v, iter->second);
+            auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
+            if (parallelized_split) {
+                auto split_vars = *parallelized_split;
+                inner_dims.emplace_back(split_vars.inner);
+                outer_dims.emplace_back(split_vars.outer);
+            }
         } else {
             f_handle.parallel(v);
             sched.push_schedule(f_handle.name(), g.output.stage_num,
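
One detail in the first hunk is worth a note: can_parallelize() records the split with try_emplace, so a dimension that already has an entry in the parallelize container is left untouched, which is how the dedup class avoids applying the same split twice. A small stand-alone illustration of that behavior, assuming a std::map-like container (the keys and values here are hypothetical, not the Halide types):

    #include <iostream>
    #include <map>
    #include <string>

    int main() {
        // Hypothetical stand-in for a dedup map from dimension name to split.
        std::map<std::string, std::string> parallelize;

        // The first request for "x" inserts an entry.
        auto [it1, inserted1] = parallelize.try_emplace("x", "split x by 16");
        // A second request for "x" is ignored; the original entry survives.
        auto [it2, inserted2] = parallelize.try_emplace("x", "split x by 32");

        std::cout << std::boolalpha
                  << "first insert:  " << inserted1 << "\n"    // true
                  << "second insert: " << inserted2 << "\n"    // false
                  << "stored split:  " << it2->second << "\n"; // split x by 16
        return 0;
    }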
