Skip to content

Commit 3dcb5d4

Browse files
committed
Mark outer dimensions that can be parallelized with parallel()
For GPU targets, mark all outer dimensions generated by GPUTilingDedup::can_parallelize(). These dimensions are marked as gpu_blocks(), so any Stage scheduled with compute_at() at this Stage needs to use the innermost gpu_block. Existing bug: bad_alloc exception in autograd.generator. Likely out of memory on consumer-grade computers (e.g. 16GB RAM).
1 parent 852ffed commit 3dcb5d4

File tree

1 file changed

+13
-6
lines changed

1 file changed

+13
-6
lines changed

src/autoschedulers/mullapudi2016/AutoSchedule.cpp

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,21 +1145,23 @@ class GPUTilingDedup {
11451145
* @param[in] v dimension to parallelize.
11461146
* @param[in] factor expected extent of the dimension.
11471147
*/
1148-
void can_parallelize(const VarOrRVar &v, const Expr &factor) {
1148+
std::optional<split_t> can_parallelize(const VarOrRVar &v, const Expr &factor) {
11491149
const auto &var = v.name();
11501150

11511151
if (is_outer(var) || is_inner(var)) {
11521152
// For CPU, it makes sense to mark the outer loop to execute in
11531153
// parallel. But this operation is redundant in GPU as the gpu_block
11541154
// is already specified.
1155-
return;
1155+
return std::nullopt;
11561156
}
11571157

11581158
debug(2) << f.name() << ".parallel(" << v.name() << "," << factor << ")\n";
11591159
VarOrRVar outer{var + "_o", v.is_rvar};
11601160
VarOrRVar inner{var + "_i", v.is_rvar};
11611161

1162-
parallelize.try_emplace(var, split_t{v, std::move(outer), std::move(inner), factor, TailStrategy::Auto});
1162+
split_t entry{v, outer, inner, factor, TailStrategy::Auto};
1163+
parallelize.try_emplace(var, entry);
1164+
return entry;
11631165
}
11641166

11651167
/** Indicate the desire to Func::vectorize(v_i).
@@ -3207,8 +3209,8 @@ void Partitioner::generate_group_cpu_schedule(
32073209

32083210
if (t.has_gpu_feature() && vectorized_split) {
32093211
auto [v_i, v_o] = *vectorized_split;
3210-
inner_dims.emplace_back(std::move(v_i));
3211-
outer_dims.emplace_back(std::move(v_o));
3212+
inner_dims.emplace_back(v_i);
3213+
outer_dims.emplace_back(v_o);
32123214
}
32133215
}
32143216

@@ -3261,7 +3263,12 @@ void Partitioner::generate_group_cpu_schedule(
32613263
}
32623264
}
32633265
if (t.has_gpu_feature()) {
3264-
gpu_tiling.can_parallelize(v, iter->second);
3266+
auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
3267+
if (parallelized_split) {
3268+
auto split_vars = *parallelized_split;
3269+
inner_dims.emplace_back(split_vars.inner);
3270+
outer_dims.emplace_back(split_vars.outer);
3271+
}
32653272
} else {
32663273
f_handle.parallel(v);
32673274
sched.push_schedule(f_handle.name(), g.output.stage_num,

0 commit comments

Comments
 (0)