Open
Description
Useful Info:
NNlib v0.8.16
I have a use case for this when implementing efficient multi-head attention. I am sharing a minimal example below to reproduce the issue:
using Flux # to get batched_mul
using CUDA
# Disallow scalar getindex on GPU arrays so any CPU-fallback path errors loudly.
CUDA.allowscalar(false)
A3 = rand(3,3,8) |> gpu
v3 = rand(4, 2, 3, 8) |> gpu
# eachslice returns views; v4 is a SubArray (3D slice) of the CuArray v3.
v4, v5, = eachslice(v3, dims=2)
v4
# Same data, but reshape of a SubArray wraps it in Base.ReshapedArray
# (see the ReshapedArray{…, SubArray{…}} type in the stack trace below).
v6 = reshape(v4, size(v4)...)
v4 ⊠ A3 # works
v6 ⊠ A3 # breaks throwing a scalar indexing error
Error for reference:
Scalar indexing is disallowed.
Invocation of getindex resulted in scalar indexing of a GPU array.
This is typically caused by calling an iterating implementation of a method.
Such implementations *do not* execute on the GPU, but very slowly on the CPU,
and therefore are only permitted from the REPL for prototyping purposes.
If you did intend to index this array, annotate the caller with @allowscalar.
error(s::String) at error.jl:33
assertscalar(op::String) at GPUArraysCore.jl:100
getindex(::CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, ::Int64, ::Int64, ::Int64, ::Int64) at indexing.jl:9
getindex at subarray.jl:276 [inlined]
_unsafe_getindex_rs at reshapedarray.jl:249 [inlined]
_unsafe_getindex at reshapedarray.jl:246 [inlined]
getindex at reshapedarray.jl:234 [inlined]
getindex at subarray.jl:276 [inlined]
_generic_matmatmul!(C::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, tA::Char, tB::Char, A::SubArray{Float32, 2, Base.ReshapedArray{Float32, 3, SubArray{Float32, 3, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}}, false}, Tuple{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}, Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}}}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, false}, B::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, _add::LinearAlgebra.MulAddMul{true, true, Float32, Float32}) at matmul.jl:830
generic_matmatmul!(C::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, tA::Char, tB::Char, A::SubArray{Float32, 2, Base.ReshapedArray{Float32, 3, SubArray{Float32, 3, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}}, false}, Tuple{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}, Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}}}, Tuple{Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}, Int64}, false}, B::CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, _add::LinearAlgebra.MulAddMul{true, true, Float32, Float32}) at matmul.jl:798
mul! at matmul.jl:302 [inlined]
batched_mul_generic!(C::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, A::Base.ReshapedArray{Float32, 3, SubArray{Float32, 3, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}}, false}, Tuple{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}, Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}}}, B::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, α::Float32, β::Float32) at batchedmul.jl:274
_batched_try_gemm! at batchedmul.jl:219 [inlined]
_batched_mul! at batchedmul.jl:211 [inlined]
batched_mul! at batchedmul.jl:205 [inlined]
batched_mul! at batchedmul.jl:205 [inlined]
_batched_mul at batchedmul.jl:61 [inlined]
batched_mul(A::Base.ReshapedArray{Float32, 3, SubArray{Float32, 3, CUDA.CuArray{Float32, 4, CUDA.Mem.DeviceBuffer}, Tuple{Base.Slice{Base.OneTo{Int64}}, Int64, Base.Slice{Base.OneTo{Int64}}, Base.Slice{Base.OneTo{Int64}}}, false}, Tuple{Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}, Base.MultiplicativeInverses.SignedMultiplicativeInverse{Int64}}}, B::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}) at batchedmul.jl:48
top-level scope at model.jl:135