Commit 15b6ae7

perf: more benchmarking results (#1614)
* perf: update resnet benchmarks
* perf: add KAN perf numbers
1 parent 420dce3 commit 15b6ae7

32 files changed · +6062 −3179 lines changed

README.md

Lines changed: 13 additions & 1 deletion
@@ -13,7 +13,7 @@
 [![CI (pre-release)](<https://img.shields.io/github/actions/workflow/status/LuxDL/Lux.jl/CIPreRelease.yml?branch=main&label=CI%20(pre-release)&logo=github>)](https://github.com/LuxDL/Lux.jl/actions/workflows/CIPreRelease.yml)
 [![Build status](https://img.shields.io/buildkite/ba1f9622add5978c2d7b194563fd9327113c9c21e5734be20e/main.svg?label=gpu&branch=main&logo=buildkite)](https://buildkite.com/julialang/lux-dot-jl)
 [![codecov](https://codecov.io/gh/LuxDL/Lux.jl/branch/main/graph/badge.svg?token=IMqBM1e3hz)](https://codecov.io/gh/LuxDL/Lux.jl)
-[![Benchmarks](https://github.com/LuxDL/Lux.jl/actions/workflows/Benchmark.yml/badge.svg?branch=main)](https://lux.csail.mit.edu/benchmarks/)
+<!-- [![Benchmarks](https://github.com/LuxDL/Lux.jl/actions/workflows/Benchmark.yml/badge.svg?branch=main)](https://lux.csail.mit.edu/benchmarks/) -->
 
 [![Downloads](https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Fmonthly_downloads%2FLux&query=total_requests&suffix=%2Fmonth&label=Downloads)](https://juliapkgstats.com/pkg/Lux)
 [![Downloads](https://img.shields.io/badge/dynamic/json?url=http%3A%2F%2Fjuliapkgstats.com%2Fapi%2Fv1%2Ftotal_downloads%2FLux&query=total_requests&&label=Total%20Downloads)](https://juliapkgstats.com/pkg/Lux)
@@ -129,6 +129,18 @@ Pkg.add("Lux")
 [downloads-luxtestutils-url]: http://juliapkgstats.com/pkg/LuxTestUtils
 [downloads-luxcuda-url]: http://juliapkgstats.com/pkg/LuxCUDA
 
+## 🚀 Benchmarks
+
+Currently, benchmarks are scattered across a few places:
+
+1. For comparison with other Julia packages like CUDA.jl, take a look
+   at [Lux.jl/perf](./perf/README.md).
+2. <https://enzymead.github.io/Enzyme-JAX/benchmarks/> highlights the
+   performance of EnzymeJAX (the backend for Reactant.jl) against JAX.
+3. <https://enzymead.github.io/Reactant.jl/benchmarks/> highlights the
+   performance of Reactant.jl against default XLA and base Julia
+   compilation.
+
 ## 🤸 Quickstart
 
 ### Reactant & Enzyme

perf/README.md

Lines changed: 15 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 ## ResNet
 
-Benchmarks were run on a single A100 GPU with 40GB of memory.
+Benchmarks were run on a single GeForce RTX 5090 GPU with 32GB of VRAM.
 
 <p align="center">
 <img src="results/resnet/resnet_runtimes.svg#gh-light-mode-only"/>
@@ -13,3 +13,17 @@ Benchmarks were run on a single A100 GPU with 40GB of memory.
 <img src="results/resnet/resnet_speedups.svg#gh-light-mode-only"/>
 <img src="results/resnet/resnet_speedups_dark.svg#gh-dark-mode-only"/>
 </p>
+
+## Kolmogorov-Arnold Networks
+
+Benchmarks were run on a single GeForce RTX 5090 GPU with 32GB of VRAM.
+
+<p align="center">
+<img src="results/kan/kan_runtimes.svg#gh-light-mode-only"/>
+<img src="results/kan/kan_runtimes_dark.svg#gh-dark-mode-only"/>
+</p>
+
+<p align="center">
+<img src="results/kan/kan_speedups.svg#gh-light-mode-only"/>
+<img src="results/kan/kan_speedups_dark.svg#gh-dark-mode-only"/>
+</p>
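
The runtime and speedup plots above are built from the timing JSON files that the benchmark drivers write (see perf/kan/main.jl and perf/resnet/main.jl later in this commit). The plotting script itself is not part of this diff, so the following is only a plausible post-processing sketch; the file paths and key names are inferred from what the KAN driver writes to perf/results/kan/.

```julia
# Illustrative sketch (not from this commit): compare CUDA.jl and Reactant
# timings written by perf/kan/main.jl. Run from the Lux.jl repository root.
using JSON3

cuda     = JSON3.read(read("perf/results/kan/cudajl.json", String))
reactant = JSON3.read(read("perf/results/kan/reactant.json", String))

for model in keys(cuda)               # e.g. :kan_base_act, :kan_no_base_act
    for pass in (:forward, :backward)
        speedup = cuda[model][pass] / reactant[model][pass]
        println("$model $pass speedup (CUDA.jl time / Reactant time): ",
            round(speedup; digits=2), "x")
    end
end
```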

perf/kan/Project.toml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
KolmogorovArnold = "eec8b66d-f71a-4a43-b228-0fe5d6721cd3"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[sources]
Lux = {path = "../../"}
LuxCUDA = {path = "../../lib/LuxCUDA"}

[compat]
BenchmarkTools = "1.6.0"
Comonicon = "1.0.8"
Enzyme = "0.13.81"
Lux = "1.13.3"
LuxCUDA = "0.3.3"
Random = "1.11"
Reactant = "0.2.190"
julia = "1.11"

perf/kan/main.jl

Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
# Taken from https://github.com/vpuri3/KolmogorovArnold.jl/blob/0fc349813be15982365173bce0e9bf3a814a342a/examples/eg3.jl
using KolmogorovArnold
using Comonicon, BenchmarkTools, JSON3
using Random, LinearAlgebra
using Enzyme, Zygote, Lux
using OrderedCollections

# configure BLAS
ncores = min(Sys.CPU_THREADS, length(Sys.cpu_info()))
BLAS.set_num_threads(ncores)

# configure CUDA
using LuxCUDA
CUDA.allowscalar(false)

# configure Reactant
using Reactant
Reactant.set_default_backend("gpu")

rng = Random.default_rng()
Random.seed!(rng, 0)

function toy_loss_function(model, ps, st, x, y)
    pred, _ = model(x, ps, st)
    return MSELoss()(pred, y)
end

function setup_models(; kan_width::Int=128, grid_size::Int=32)
    wK, G = kan_width, grid_size

    basis_func = rbf # rbf, rswaf
    normalizer = softsign # sigmoid(_fast), tanh(_fast), softsign

    kan1 = Chain(
        KDense(1, wK, G; use_base_act=true, basis_func, normalizer),
        KDense(wK, wK, G; use_base_act=true, basis_func, normalizer),
        KDense(wK, 1, G; use_base_act=true, basis_func, normalizer),
    )

    kan2 = Chain(
        KDense(1, wK, G; use_base_act=false, basis_func, normalizer),
        KDense(wK, wK, G; use_base_act=false, basis_func, normalizer),
        KDense(wK, 1, G; use_base_act=false, basis_func, normalizer),
    )

    return [("kan_base_act", kan1), ("kan_no_base_act", kan2)]
end

function run_cuda_benchmarks(; batch_size::Int=128, kwargs...)
    dev = gpu_device(; force=true)

    x = rand32(rng, 1, batch_size)
    y = x .^ 2

    models = setup_models(; kwargs...)
    timings = OrderedDict{String,OrderedDict{String,Float64}}()

    for (name, model) in models
        println("\nCUDA Benchmarking: $name")

        ps, st = Lux.setup(rng, model) |> dev
        x_cu = x |> dev
        y_cu = y |> dev

        println("Param count: $(Lux.parameterlength(ps))")
        println("State count: $(Lux.statelength(st))")

        # Forward pass timing
        fwd_time = @belapsed begin
            pred, _ = $(model)($(x_cu), $(ps), $(Lux.testmode(st)))
            CUDA.synchronize()
        end setup = begin
            GC.gc(true)
            CUDA.reclaim()
        end

        # Backward pass timing (using Zygote)
        fn = (ps, x) -> toy_loss_function(model, ps, st, x, y_cu)

        bwd_time = @belapsed begin
            Zygote.gradient($(fn), $(ps), $(x_cu))
            CUDA.synchronize()
        end setup = begin
            GC.gc(true)
            CUDA.reclaim()
        end

        timings[name] = OrderedDict{String,Float64}(
            "forward" => fwd_time, "backward" => bwd_time
        )

        display(timings[name])
    end

    return timings
end

function run_xla_benchmarks(; kwargs...)
    return run_reactant_benchmarks(;
        kwargs..., compile_options=Reactant.DefaultXLACompileOptions()
    )
end

function run_reactant_benchmarks(;
    batch_size::Int=128,
    compile_options=Reactant.CompileOptions(; optimization_passes=:all),
    kwargs...,
)
    dev = reactant_device(; force=true)

    x = rand32(rng, 1, batch_size)
    y = x .^ 2

    models = setup_models(; kwargs...)
    timings = OrderedDict{String,OrderedDict{String,Float64}}()

    for (name, model) in models
        println("\nReactant Benchmarking: $name")

        ps, st = Lux.setup(rng, model) |> dev
        x_ra = x |> dev
        y_ra = y |> dev

        println("Param count: $(Lux.parameterlength(ps))")
        println("State count: $(Lux.statelength(st))")

        # Forward pass timing
        fwd_time_result = Reactant.Profiler.profile_with_xprof(
            Lux.apply,
            model,
            x_ra,
            ps,
            Lux.testmode(st);
            nrepeat=10,
            warmup=1,
            compile_options,
        )
        fwd_time = fwd_time_result.profiling_result.runtime_ns / 1e9

        # Backward pass timing
        bwd_time_result = Reactant.Profiler.profile_with_xprof(
            Enzyme.gradient,
            Reverse,
            toy_loss_function,
            Const(model),
            ps,
            Const(st),
            Const(x_ra),
            Const(y_ra);
            nrepeat=10,
            warmup=1,
            compile_options,
        )
        bwd_time = bwd_time_result.profiling_result.runtime_ns / 1e9

        timings[name] = OrderedDict{String,Float64}(
            "forward" => fwd_time, "backward" => bwd_time
        )

        display(timings[name])
    end

    return timings
end

Comonicon.@main function main(;
    backend::String="all", batch_size::Int=1024, kan_width::Int=128, grid_size::Int=32
)
    results_path = joinpath(@__DIR__, "../results/kan/")
    mkpath(results_path)

    if backend in ("cuda", "all")
        println("\n" * "="^50)
        println("Running CUDA benchmarks...")
        println("="^50)

        cuda_timings = run_cuda_benchmarks(; batch_size, kan_width, grid_size)

        open(joinpath(results_path, "cudajl.json"), "w") do io
            JSON3.write(io, cuda_timings)
        end

        println("\nCUDA Results:")
        display(cuda_timings)
    end

    if backend in ("reactant", "all")
        println("\n" * "="^50)
        println("Running Reactant benchmarks...")
        println("="^50)

        reactant_timings = run_reactant_benchmarks(; batch_size, kan_width, grid_size)

        open(joinpath(results_path, "reactant.json"), "w") do io
            JSON3.write(io, reactant_timings)
        end

        println("\nReactant Results:")
        display(reactant_timings)
    end

    if backend in ("xla", "all")
        println("\n" * "="^50)
        println("Running XLA benchmarks...")
        println("="^50)

        xla_timings = run_xla_benchmarks(; batch_size, kan_width, grid_size)

        open(joinpath(results_path, "xla.json"), "w") do io
            JSON3.write(io, xla_timings)
        end

        println("\nXLA Results:")
        display(xla_timings)
    end

    return nothing
end
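
For reference, here is a hypothetical way to set up and launch this driver; it is not part of the commit. It assumes the Lux.jl repository root as the working directory, and the `--backend`/`--batch-size` flag names are an assumption based on Comonicon's usual mapping of keyword arguments to CLI flags.

```julia
# Hypothetical invocation of perf/kan/main.jl (not from this commit).
using Pkg
Pkg.activate("perf/kan")   # environment defined by the Project.toml above
Pkg.instantiate()          # resolves the path-based [sources] entries

# Launch the Comonicon CLI in a fresh process, CUDA backend only:
run(`$(Base.julia_cmd()) --project=perf/kan perf/kan/main.jl --backend cuda --batch-size 1024`)
```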

perf/resnet/Project.toml

Lines changed: 2 additions & 1 deletion
@@ -5,6 +5,7 @@ Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
 LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
@@ -20,5 +21,5 @@ Enzyme = "0.13.81"
 Lux = "1.13.3"
 LuxCUDA = "0.3.3"
 Random = "1.11"
-Reactant = "0.2.170"
+Reactant = "0.2.190"
 julia = "1.11"

perf/resnet/main.jl

Lines changed: 5 additions & 4 deletions

@@ -1,5 +1,6 @@
 using Comonicon, BenchmarkTools, JSON3
 using Lux, LuxCUDA, Random, Zygote
+using OrderedCollections
 
 include("resnet.jl")
 
@@ -13,7 +14,7 @@ Comonicon.@main function main(;
 )
     dev = gpu_device(; force=true)
 
-    timings = Dict{Int,Dict{Int,Dict{String,Float64}}}()
+    timings = OrderedDict{Int,OrderedDict{Int,OrderedDict{String,Float64}}}()
 
     for m in model_size
         println("model_size=$m")
@@ -23,7 +24,7 @@ Comonicon.@main function main(;
         println("Param count: $(Lux.parameterlength(ps))")
         println("State count: $(Lux.statelength(st))")
 
-        timings[m] = Dict{Int,Dict{String,Float64}}()
+        timings[m] = OrderedDict{Int,OrderedDict{String,Float64}}()
 
         for b in batch_size
             x = rand(Float32, 224, 224, 3, b) |> dev
@@ -52,12 +53,12 @@ Comonicon.@main function main(;
                 end
             end
 
-            timings[m][b] = Dict{String,Float64}(
+            timings[m][b] = OrderedDict{String,Float64}(
                 "forward" => fwd_time, "backward" => bwd_time
             )
         end
 
-        println(timings[m])
+        display(timings[m])
     end
 
     results_path = joinpath(@__DIR__, "../results/resnet/")
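
The Dict → OrderedDict switch above (together with the matching `sort_keys=True` change in perf/resnet/main.py below) gives the emitted JSON a stable key order. A minimal illustration, not taken from the repository:

```julia
# Minimal sketch (not from this commit): OrderedDict preserves insertion
# order, so JSON3 writes keys in the order the benchmark loop filled them,
# whereas Dict iterates in hash order.
using OrderedCollections, JSON3

timings = OrderedDict("forward" => 1.2e-3, "backward" => 3.4e-3)
println(JSON3.write(timings))   # keys appear in insertion order: forward, backward
```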

perf/resnet/main.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 import json
 
 from functools import partial
-from typing import Any, Tuple
+from typing import Any
 from collections.abc import Callable, Sequence
 
 import flax.linen as nn
@@ -153,7 +153,7 @@ def loss_fn(p, x, y):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--batch-size", type=list, default=[1, 4, 32, 128])
-    parser.add_argument("--model-size", type=list, default=[18, 34, 50, 101, 152])
+    parser.add_argument("--model-size", type=list, default=[18, 34, 50, 101])
     args = parser.parse_args()
 
     timings = dict()
@@ -223,4 +223,4 @@ def loss_fn(p, x, y):
     os.makedirs(results_path, exist_ok=True)
 
     with open(os.path.join(results_path, "jax.json"), "w") as f:
-        json.dump(timings, f, indent=4)
+        json.dump(timings, f, indent=4, sort_keys=True)
