
Commit 2d756c4

Kernel Abstractions Extension Improvements (#78)
* Add runtime generated KA support
* Add KA tests
* Remove kwargs concrete_input_type and closures_size and related functions
* Remove return_type from FunctionCalls
1 parent 9d5d094 · commit 2d756c4

23 files changed: +459 −460 lines


.github/workflows/unit_tests.yml

Lines changed: 18 additions & 1 deletion
@@ -18,6 +18,16 @@ jobs:
         julia-version: ['1.10', '1.11', '1.12']
         julia-arch: [x64]
         os: [ubuntu-latest]
+        backend:
+          - { container: "ubuntu:24.04", cpu: "1", kacpu: "0", cuda: "0", amdgpu: "0", oneapi: "0", metal: "0" }
+          - { container: "ubuntu:24.04", cpu: "0", kacpu: "1", cuda: "0", amdgpu: "0", oneapi: "0", metal: "0" }
+          # large runners with gpu hardware cost money
+          #- { container: "nvidia/cuda:12.9.1-devel-ubuntu24.04", cpu: "0", kacpu: "0", cuda: "1", amdgpu: "0", oneapi: "0", metal: "0" }
+          #- { container: "rocm/dev-ubuntu-24.04:6.4.2", cpu: "0", kacpu: "0", cuda: "0", amdgpu: "1", oneapi: "0", metal: "0" }
+          #- { container: "intel/oneapi-hpckit:2025.2.0-0-devel-ubuntu24.04", cpu: "0", kacpu: "0", cuda: "0", amdgpu: "0", oneapi: "1", metal: "0" }
+
+    #container:
+    #  image: ${{matrix.backend.container}}
 
     steps:
       - name: Checkout repository
@@ -35,7 +45,14 @@ jobs:
         uses: julia-actions/cache@v2
 
       - name: Instantiate
-        run: julia --project=./ -e 'using Pkg; Pkg.instantiate()'
+        run: |
+          julia --project=./ -e 'using Pkg; Pkg.instantiate()'
+          echo "TEST_CPU=${{ matrix.backend.cpu }}" >> $GITHUB_ENV
+          echo "TEST_KACPU=${{ matrix.backend.kacpu }}" >> $GITHUB_ENV
+          echo "TEST_CUDA=${{ matrix.backend.cuda }}" >> $GITHUB_ENV
+          echo "TEST_AMDGPU=${{ matrix.backend.amdgpu }}" >> $GITHUB_ENV
+          echo "TEST_ONEAPI=${{ matrix.backend.oneapi }}" >> $GITHUB_ENV
+          echo "TEST_METAL=${{ matrix.backend.metal }}" >> $GITHUB_ENV
 
       - name: Run tests
         uses: julia-actions/julia-runtest@v1
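
The exported TEST_* variables are presumably read by the test suite to decide which backend tests to run; this commit does not show that side. A minimal sketch of the pattern, with a hypothetical test file name:

    # test/runtests.jl pattern (assumed, not part of this diff):
    # run a backend's tests only when the workflow set its flag to "1"
    if get(ENV, "TEST_KACPU", "0") == "1"
        using KernelAbstractions
        include("ka_cpu_tests.jl")  # hypothetical file name
    end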

Project.toml

Lines changed: 3 additions & 1 deletion
@@ -35,9 +35,11 @@ julia = "1.10"
 oneAPI = "1, 2"
 
 [extras]
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["SafeTestsets", "Test", "StatsBase"]
+test = ["SafeTestsets", "Test", "Pkg", "StatsBase", "KernelAbstractions"]

benchmark/QEDFD.jl

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@ using QEDprocesses
 using QEDcore
 using QEDbase
 
-RNG = Xoshiro(1)
+RNG = Xoshiro(143)
 MODEL = PerturbativeQED()
 PROC = ScatteringProcess(
     (Electron(), Photon()),
@@ -20,7 +20,7 @@ PSP = PhaseSpacePoint(PROC, MODEL, INPSL, tuple(rand(SFourMomentum, number_incom
 @show g
 
 @info "Building the function"
-@time f = compute_function(g, PROC, cpu_st(), @__MODULE__; closures_size = 100, concrete_input_type = typeof(PSP));
+@time f = compute_function(g, PROC, cpu_st(), @__MODULE__)
 
 #=@info "Writing llvm code"
 @time open("llvm.out", write = true) do file

docs/make.jl

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ pages = [
         "Code Generation" => "lib/internals/code_gen.md",
         "Devices" => "lib/internals/devices.md",
         "Utility" => "lib/internals/utility.md",
+        "KernelAbstractions Extension" => "lib/internals/ka_extension.md",
     ],
     "Contribution" => "contribution.md",
 ]

docs/src/lib/internals/code_gen.md

Lines changed: 0 additions & 1 deletion
@@ -8,7 +8,6 @@ Order = [:type, :constant, :function]
 ```
 
 ## Function Generation
-Implementations for generation of a callable function. A function generated this way cannot immediately be called. One Julia World Age has to pass before this is possible, which happens when the global Julia scope advances. If the DAG and therefore the generated function becomes too large, use the tape machine instead, since compiling large functions becomes infeasible.
 ```@autodocs
 Modules = [ComputableDAGs]
 Pages = ["code_gen/function.jl"]

docs/src/lib/internals/ka_extension.md

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Kernel Abstractions Extension
+
+```@autodocs
+Modules = [ComputableDAGs]
+Pages = ["ext/KernelAbstractionsExt.jl"]
+Order = [:function]
+```
+
+## Kernel Wrapping
+```@autodocs
+Modules = [ComputableDAGs]
+Pages = ["ext/kernel_wrapper.jl"]
+Order = [:type, :function]
+```

docs/src/manual.md

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ When the CDAG is ready, it can be compiled for the machine you're running on.
 
 Now, [`compute_function`](@ref) can be used to create a function that can be called on inputs. [`compute_function`](@ref) supports and in some cases requires keyword arguments, please refer to its documentation for more information.
 
-Alternatively, GPU kernels can be generated by using [`kernel`](@ref) instead of [`compute_function`](@ref). This is implemented for several GPU backends and produces a regular function for the given backend. Since RuntimeGeneratedFunctions.jl does not support GPU kernels at this time, this function will only be callable if the world age has been increased since its generation. Furthermore, the compute functions in the graph need to comply with all the normal requirements for GPU kernels, such as not calling dynamic functions.
+Alternatively, [KernelAbstractions](https://juliagpu.github.io/KernelAbstractions.jl/stable/) kernels can be generated by using [`kernel`](@ref) instead of [`compute_function`](@ref). The returned value is a KernelAbstractions kernel object that can be called like any such kernel by giving it a backend and block size. The compute functions in the graph need to comply with all the normal requirements for GPU kernels, such as not calling dynamic functions. For more details, refer to the function's docs.
 
 ## Application repositories
 
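To make the new calling convention concrete, here is a hedged sketch of driving the generated kernel on the CPU backend (dag, instance, the input vector, and result_type are placeholders; the need to call init_kernel first is inferred from the extension code below):

    using KernelAbstractions
    using ComputableDAGs

    ComputableDAGs.init_kernel(@__MODULE__)     # defines the _ka_broadcast! kernel in this module
    k = kernel(dag, instance, @__MODULE__)      # dag/instance: placeholders for a real CDAG setup

    in_vec  = inputs                            # one input element per kernel thread
    out_vec = similar(in_vec, result_type)      # result_type: placeholder for the output element type

    # called like any KernelAbstractions kernel: backend and block size first, then data
    k(CPU(), 64)(in_vec, out_vec; ndrange = length(in_vec))
    KernelAbstractions.synchronize(CPU())
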
ext/KernelAbstractionsExt.jl

Lines changed: 23 additions & 18 deletions
@@ -1,29 +1,34 @@
 module KernelAbstractionsExt
 
 using ComputableDAGs
+using KernelAbstractions
 using UUIDs
 using Random
 
-function ComputableDAGs.kernel(graph::DAG, instance, context_module::Module)
-    machine = cpu_st()
-    tape = ComputableDAGs.gen_tape(graph, instance, machine, context_module)
+include("kernel_wrapper.jl")
 
+function ComputableDAGs.init_kernel(mod::Module)
+    mod.eval(Meta.parse("@kernel inbounds = true function _ka_broadcast!(@Const(in::AbstractVector), out::AbstractVector, val::Val)
+        id = @index(Global)
+        @inline out[id] = _compute_expr(in[id], val)
+    end"))
+    return nothing
+end
+
+function ComputableDAGs.kernel(dag::DAG, instance, context_module::Module)
+    tape = ComputableDAGs.gen_tape(dag, instance, cpu_st(), ComputableDAGs.GreedyScheduler())
+
+    code = ComputableDAGs.gen_function_body(tape)
     assign_inputs = Expr(:block, ComputableDAGs.expr_from_fc.(tape.input_assign_code)...)
-    # TODO: use gen_function_body here
-    code = Expr(:block, ComputableDAGs.expr_from_fc.(tape.schedule)...)
-
-    function_id = ComputableDAGs.to_var_name(UUIDs.uuid1(TaskLocalRNG()))
-    expr = Meta.parse(
-        "@kernel function compute_$(function_id)(input_vector, output_vector)
-            id = @index(Global)
-            @inline input = input_vector[id]
-            $(assign_inputs)
-            $code
-            @inline output_vector[id] = $(tape.output_symbol)
-        end"
-    )
-
-    return expr
+
+    expr = Expr(:block, assign_inputs, code, :(return $(tape.output_symbol)))
+
+    # generate random UUID for type independent lookup in the expression cache
+    val = Val(UUIDs.uuid1(TaskLocalRNG()))
+    getfield(context_module, ComputableDAGs.EXPR_SYM)[val] = expr
+
+    # wrap the kernel together with the generated Val{UUID} to opaquely insert it for the caller later
+    return KAWrapper(context_module._ka_broadcast!, val)
 end
 
 end
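
A note on the mechanism: the kernel body calls _compute_expr(in[id], val), presumably a @generated function that looks up the expression stored under the Val{UUID} key and splices it in as its method body. A standalone toy of that technique (EXPR_CACHE and _compute_expr here are illustrative stand-ins, not the package's actual definitions):

    # expressions registered under Val keys before the generated function is first called
    const EXPR_CACHE = Dict{Any, Expr}()

    @generated function _compute_expr(input, ::Val{ID}) where {ID}
        # the generator runs at compile time and returns this ID's cached
        # expression, which becomes the compiled method body
        return EXPR_CACHE[Val{ID}()]
    end

    EXPR_CACHE[Val{:double}()] = :(return 2 * input)
    _compute_expr(21, Val(:double))  # evaluates the cached expression: 42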

ext/kernel_wrapper.jl

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+"""
+    KAWrapper{T, ID}
+
+A wrapper around a KernelAbstractions kernel. Takes the `kernel::T` and an `ID::Val`.
+
+This is necessary to insert the id to the KernelAbstractions kernel without needing the user to do it manually.
+The Val itself is necessary to be able to define multiple different kernels working on the same input type. It is used in the expression cache as the key, and dispatched on in the `@generated` function.
+"""
+struct KAWrapper{T, ID}
+    kernel::T
+    id::ID
+end
+
+"""
+    KAWrapperKernel{T, ID, Args, KWArgs}
+
+The second level of wrapping, to imitate the way that KernelAbstractions kernels are called: `kernel(<kernel config/backend>)(<runtime arguments>)`.
+"""
+struct KAWrapperKernel{T, ID, Args, KWArgs}
+    kernel::T
+    id::ID
+    args::Args
+    kwargs::KWArgs
+end
+
+# initial level, args and kwargs are the kernel config, stored in the KAWrapperKernel
+@inline function (k::KAWrapper{T, ID})(args...; kwargs...) where {T, ID}
+    return KAWrapperKernel(k.kernel, k.id, args, kwargs)
+end
+
+# second level, wraps the actual call, inserting the kernel config args/kwargs, and calling with the runtime args + the stored id
+@inline function (k::KAWrapperKernel{T, ID, Args})(args...; kwargs...) where {T, ID, Args}
+    k.kernel(k.args...; k.kwargs...)(args..., k.id; kwargs...)
+    return nothing
+end
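
As a self-contained illustration of the two-level callable pattern (a toy, not package code): the first call stores configuration, the second forwards runtime arguments plus the stored extra value, mirroring how KAWrapper appends the Val{UUID} id.

    # factory(config...) returns a callable, like kernel(backend, groupsize) in KernelAbstractions
    struct Curried{F, X}
        factory::F
        extra::X
    end

    struct Configured{F, X, A}
        factory::F
        extra::X
        config::A
    end

    (c::Curried)(config...) = Configured(c.factory, c.extra, config)
    (c::Configured)(args...) = c.factory(c.config...)(args..., c.extra)

    scale_then_add(scale) = (x, extra) -> scale * x + extra
    wrapped = Curried(scale_then_add, 10)   # 10 plays the role of the opaque id
    wrapped(3)(5)                           # scale_then_add(3)(5, 10) == 25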

src/code_gen/function.jl

Lines changed: 36 additions & 31 deletions
@@ -1,46 +1,22 @@
 """
-    compute_function(
+    compute_function_expr(
         dag::DAG,
         instance,
         machine::Machine,
-        context_module::Module
+        scheduler::AbstractScheduler
     )
 
-Return a function of signature `compute_<id>(input::input_type(instance))`, which will return the result of the DAG computation on the given input.
-The final argument `context_module` should always be `@__MODULE__` to be able to use functions defined in the caller's environment.
-
-## Keyword Arguments
-
-`closures_size` (default=0 (off)): The size of closures to use in the main generated code. This specifies the size of code blocks across which the
-compiler cannot optimize. For sufficiently large functions, a larger value means longer compile times but potentially faster execution time.
-**Note** that the actually used closure size might be different than the one passed here, since the function automatically chooses a size that
-is close to a n-th root of the total number of loc, based off the given size.
-`concrete_input_type` (default=`input_type(instance)`): A type that will be used as the expected input type of the generated function. If
-omitted, the `input_type` of the problem instance is used. Note that the `input_type` of the instance will still be used as the annotated
-type in the generated function header.
+Helper function, returning the complete function expression.
 """
-function compute_function(
+function compute_function_expr(
     dag::DAG,
     instance,
     machine::Machine,
-    context_module::Module;
-    closures_size::Int = 0,
-    concrete_input_type::Type = Nothing,
+    scheduler::AbstractScheduler
 )
-    global INITIALIZED_MODULES
-    if !(context_module in INITIALIZED_MODULES)
-        RuntimeGeneratedFunctions.init(context_module)
-        push!(INITIALIZED_MODULES, context_module)
-    end
-
-    tape = gen_tape(dag, instance, machine, context_module)
+    tape = gen_tape(dag, instance, machine, scheduler)
 
-    code = gen_function_body(
-        tape,
-        context_module;
-        closures_size = closures_size,
-        concrete_input_type = concrete_input_type,
-    )
+    code = gen_function_body(tape)
     assign_inputs = Expr(:block, expr_from_fc.(tape.input_assign_code)...)
 
     function_id = to_var_name(UUIDs.uuid1(TaskLocalRNG()))
@@ -60,5 +36,34 @@ function compute_function
         ), # function body
     )
 
+    return expr
+end
+
+"""
+    compute_function(
+        dag::DAG,
+        instance,
+        machine::Machine,
+        context_module::Module,
+        scheduler::AbstractScheduler = GreedyScheduler(),
+    )
+
+Return a function of signature `compute_<id>(input::input_type(instance))`, which will return the result of the DAG computation on the given input.
+The final argument `context_module` should always be `@__MODULE__` to be able to use functions defined in the caller's environment.
+"""
+function compute_function(
+    dag::DAG,
+    instance,
+    machine::Machine,
+    context_module::Module,
+    scheduler::AbstractScheduler = GreedyScheduler()
+)
+    global INITIALIZED_MODULES
+    if !(context_module in INITIALIZED_MODULES)
+        RuntimeGeneratedFunctions.init(context_module)
+        push!(INITIALIZED_MODULES, context_module)
+    end
+
+    expr = compute_function_expr(dag, instance, machine, scheduler)
     return invokelatest(RuntimeGeneratedFunction, @__MODULE__, context_module, expr)
 end
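
For orientation, a hedged sketch of calling the new signature (dag, instance, and input are placeholders; cpu_st() and GreedyScheduler() are taken from the diff above):

    using ComputableDAGs

    f = compute_function(dag, instance, cpu_st(), @__MODULE__)
    # or, with an explicit scheduler instead of the default:
    f = compute_function(dag, instance, cpu_st(), @__MODULE__, ComputableDAGs.GreedyScheduler())

    result = f(input)   # input must match input_type(instance)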
