Compilation failure due to high register usage

As seen on DiffEqGPU.jl:

```
  caused by: NSError: Compute function exceeds available temporary registers (AGXMetal13_3, code 3)
  Stacktrace:
    [1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
      @ Metal.MTL /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/lib/mtl/compute_pipeline.jl:60
    [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:71
    [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:66
    [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
      @ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:132
    [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
      @ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:103
    [6] macro expansion
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:162 [inlined]
    [7] macro expansion
      @ ./lock.jl:267 [inlined]
    [8] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:157
    [9] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:155
   [10] macro expansion
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:77 [inlined]
   [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(DiffEqGPU.gpu_ode_asolve_kernel)})(::MtlVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}}, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
      @ Metal.MetalKernels /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:105
   [12] Kernel
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:101 [inlined]
   [13] #vectorized_asolve#166
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/build/default-macmini-aarch64-3-0/julialang/diffeqgpu-dot-jl/src/solve.jl:182 [inlined]
```

It's interesting because IIUC the dynamic workgroup size setting there should have used `maxTotalThreadsPerThreadgroup`, which in the case of CUDA takes register usage into account. Maybe there's additional limits we need to respect with Metal?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Compilation failure due to high register usage #214

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Compilation failure due to high register usage #214

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions