Skip to content

Compilation failure due to high register usage #214

@maleadt

Description

@maleadt

As seen on DiffEqGPU.jl:

  caused by: NSError: Compute function exceeds available temporary registers (AGXMetal13_3, code 3)
  Stacktrace:
    [1] MTLComputePipelineState(dev::Metal.MTL.MTLDeviceInstance, fun::Metal.MTL.MTLFunctionInstance)
      @ Metal.MTL /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/lib/mtl/compute_pipeline.jl:60
    [2] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}}; return_function::Bool)
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:71
    [3] link(job::GPUCompiler.CompilerJob, compiled::NamedTuple{(:image, :entry), Tuple{Vector{UInt8}, String}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/compilation.jl:66
    [4] actual_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, world::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::typeof(Metal.compile), linker::typeof(Metal.link))
      @ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:132
    [5] cached_compilation(cache::Dict{Any, Any}, src::Core.MethodInstance, cfg::GPUCompiler.CompilerConfig{GPUCompiler.MetalCompilerTarget, Metal.MetalCompilerParams}, compiler::Function, linker::Function)
      @ GPUCompiler /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/GPUCompiler/NVLGB/src/execution.jl:103
    [6] macro expansion
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:162 [inlined]
    [7] macro expansion
      @ ./lock.jl:267 [inlined]
    [8] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:157
    [9] mtlfunction(f::typeof(DiffEqGPU.gpu_ode_asolve_kernel), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, MtlDeviceVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}, 1}, GPURosenbrock23{true}, MtlDeviceMatrix{SVector{20, Float32}, 1}, MtlDeviceMatrix{Float32, 1}, Float32, CallbackSet{Tuple{}, Tuple{}}, Nothing, Float32, Float32, Nothing, Val{false}}})
      @ Metal /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:155
   [10] macro expansion
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/compiler/execution.jl:77 [inlined]
   [11] (::KernelAbstractions.Kernel{MetalBackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(DiffEqGPU.gpu_ode_asolve_kernel)})(::MtlVector{ODEProblem{SVector{20, Float32}, Tuple{Float32, Float32}, false, SciMLBase.NullParameters, ODEFunction{false, SciMLBase.AutoSpecialize, typeof(f_large), UniformScaling{Bool}, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, Nothing, typeof(SciMLBase.DEFAULT_OBSERVED), Nothing, Nothing}, Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}}, SciMLBase.StandardODEProblem}}, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
      @ Metal.MetalKernels /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:105
   [12] Kernel
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/depots/26e4f8df-bbdd-40a2-82e4-24a159795e4b/packages/Metal/9shJi/src/MetalKernels.jl:101 [inlined]
   [13] #vectorized_asolve#166
      @ /Users/julia/.julia/scratchspaces/a66863c6-20e8-4ff4-8a62-49f30b1f605e/agent-cache/default-macmini-aarch64-3.0/build/default-macmini-aarch64-3-0/julialang/diffeqgpu-dot-jl/src/solve.jl:182 [inlined]

It's interesting because, if I understand correctly, the dynamic workgroup size setting there should have used `maxTotalThreadsPerThreadgroup`, which, in the case of CUDA, takes register usage into account. Maybe there are additional limits we need to respect with Metal?

Metadata

Metadata

Assignees

No one assigned

    Labels

    kernels: Things about kernels and how they are compiled.
    upstream: Out of our hands

    Type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions