Description
This code is a cleaned-up lowering of part of torch.argmax(torch.adaptive_avg_pool1d(...)).
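For context, the operation being lowered looks roughly like the following PyTorch snippet (the shapes are assumptions inferred from the set_estimates calls in the repro: a 48-element input pooled down to 32 outputs, then a scalar argmax):

import torch

# Hypothetical shapes, inferred from the repro's estimates below.
x = torch.randn(1, 1, 48, dtype=torch.float64)
result = torch.argmax(torch.nn.functional.adaptive_avg_pool1d(x, 32))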
repro.py
import halide as hl


@hl.generator(name="kernel")
class Kernel:
    in_ptr0 = hl.InputBuffer(hl.Float(64), 1)
    out_ptr0 = hl.OutputBuffer(hl.Int(64), 1)

    def generate(g):
        in_ptr0 = g.in_ptr0
        out_ptr0 = g.out_ptr0
        rindex = hl.Var("rindex")
        r0 = rindex % 2
        r1 = rindex // 2
        rdom = hl.RDom([hl.Range(0, 32)])
        # Start/end of the pooling window for output element rindex
        tmp3 = hl.Func("tmp3")
        tmp3[rindex] = (3 * r0) // 2
        tmp4 = hl.Func("tmp4")
        tmp4[rindex] = 2 + ((3 * r0) // 2)
        # First tap: load is masked to zero when outside the window
        tmp5 = hl.Func("tmp5")
        tmp5[rindex] = tmp3[rindex] < tmp4[rindex]
        tmp7 = hl.Func("tmp7")
        tmp7[rindex] = hl.BoundaryConditions.constant_exterior(in_ptr0, 0)[
            (3 * r1) + ((3 * r0) // 2)
        ]
        tmp9 = hl.Func("tmp9")
        tmp9[rindex] = hl.select(tmp5[rindex], tmp7[rindex], hl.f64(0.0))
        # Second tap, masked the same way
        tmp10 = hl.Func("tmp10")
        tmp10[rindex] = 1 + ((3 * r0) // 2)
        tmp11 = hl.Func("tmp11")
        tmp11[rindex] = tmp10[rindex] < tmp4[rindex]
        tmp13 = hl.Func("tmp13")
        tmp13[rindex] = hl.BoundaryConditions.constant_exterior(in_ptr0, 0)[
            1 + (3 * r1) + ((3 * r0) // 2)
        ]
        tmp15 = hl.Func("tmp15")
        tmp15[rindex] = hl.select(tmp11[rindex], tmp13[rindex], hl.f64(0.0))
        # Average = sum of the valid taps / number of valid taps
        tmp16 = hl.Func("tmp16")
        tmp16[rindex] = tmp15[rindex] + tmp9[rindex]
        tmp19 = hl.Func("tmp19")
        tmp19[rindex] = hl.select(tmp5[rindex], hl.f64(1.0), hl.f64(0.0))
        tmp20 = hl.Func("tmp20")
        tmp20[rindex] = hl.select(tmp11[rindex], hl.f64(1.0), hl.f64(0.0))
        tmp21 = hl.Func("tmp21")
        tmp21[rindex] = tmp20[rindex] + tmp19[rindex]
        tmp22 = hl.Func("tmp22")
        tmp22[rindex] = tmp16[rindex] / tmp21[rindex]
        # Index of the largest pooled value
        tmp23 = hl.argmax(rdom, tmp22[rdom])[0]
        out_ptr0[hl.Var()] = hl.cast(out_ptr0.type(), tmp23)
        assert g.using_autoscheduler()
        in_ptr0.set_estimates([hl.Range(0, 48)])
        # the range here is actually 1, but setting it to 2 to work around:
        # https://github.com/halide/Halide/issues/8246
        out_ptr0.set_estimates([hl.Range(0, 2)])


if __name__ == "__main__":
    import sys, tempfile

    with tempfile.TemporaryDirectory() as out:
        sys.argv = [
            "repro.py",
            "-g", "kernel",
            "-o", out,
            "-f", "halide_kernel",
            "-e", "static_library,h,schedule",
            "-p", "/home/jansel/conda/envs/pytorch/lib/libautoschedule_anderson2021.so",
            "target=host-cuda-cuda_capability_86-user_context-strict_float-no_asserts",
            "autoscheduler=Anderson2021",
            "autoscheduler.parallelism=82",
        ]
        hl.main()
(You will need to update the path to libautoschedule_anderson2021.so to match your local build.)
Output:
Unhandled exception: Internal Error at /home/jansel/Halide/src/autoschedulers/anderson2021/GPULoopInfo.cpp:92 triggered by user code at : Condition failed: at_or_inside_block():
Traceback (most recent call last):
File "/home/jansel/pytorch/repro.py", line 72, in <module>
hl.main()
RuntimeError: Generator failed: -1
The code includes a workaround for #8246: it declares the output size as 2 when it is actually 1. If I remove that workaround, I get the same error as in #8246. I think the workaround is uncovering a different issue, but the two issues are possibly related.
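For reference, removing the workaround means estimating the true output size; only this one line in generate() changes, and it reproduces #8246 instead:

# True output size: the argmax reduction yields a single element.
out_ptr0.set_estimates([hl.Range(0, 1)])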