Open
Description
While looking at #583 I noticed that the aug_fwd
kernel looks like:
# Kernel body that builds a deferred Enzyme thunk for `f` and, for every valid
# index, runs the thunk's forward part and records the first element of its
# result in `subtape` (presumably the augmented forward pass of split
# reverse-mode AD, judging by the name and the `ReverseSplit*` mode -- confirm).
#
# Arguments:
# - `ctx`: kernel context; queried via `__validindex` / `__index_Global_Linear`
#   and handed to `forward` wrapped in `Const`.
# - `f`: the function the thunk is built for; passed to `forward` as `Const(f)`.
# - `::Val{ModifiedBetween}`: forwarded into `ReverseSplitModified`.
# - `subtape`: indexable container written at the global linear index.
# - `::Val{TapeType}`: tape type passed to `autodiff_deferred_thunk`.
# - `args...`: remaining arguments; their types parameterize the thunk and the
#   values are forwarded unchanged to `forward`.
#
# Always returns `nothing`.
function aug_fwd(
ctx,
f::FT,
::Val{ModifiedBetween},
subtape,
::Val{TapeType},
args...,
) where {ModifiedBetween, FT, TapeType}
# A2 = Const{Nothing} -- since f->Nothing
# Only the first element of the returned pair is used; the second is discarded.
forward, _ = EnzymeCore.autodiff_deferred_thunk(
ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)),
TapeType,
Const{Core.Typeof(f)},
Const{Nothing},
Const{Core.Typeof(ctx)},
map(Core.Typeof, args)...,
)
# On the GPU: F is a per thread function
# On the GPU: subtape::Vector
# NOTE(review): `forward` is only invoked when `__validindex(ctx)` holds, so
# work items that fail the check skip everything `f` does -- including any
# barrier it may execute. TODO confirm whether this divergence is acceptable.
if __validindex(ctx)
I = __index_Global_Linear(ctx)
# Keep only the first element of what `forward` returns (presumably the
# tape -- confirm against the Enzyme thunk API).
subtape[I] = forward(Const(f), Const(ctx), args...)[1]
end
return nothing
end
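This will create divergent execution of barrier operations (see #558 (comment)): `forward` is only called inside the `if __validindex(ctx)` branch, so any barrier executed within `f` is not reached by the threads that fail that check.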
This is likely also broken with `@kernel unsafe_indices=true`.
cc: @michel2323