SciML · divital-coder · Jan 1, 2026 · Jan 3, 2026 · Jan 3, 2026 · Jan 10, 2026
diff --git a/.buildkite/build_benchmark.sh b/.buildkite/build_benchmark.sh
@@ -14,10 +14,24 @@ if [[ "${JULIAHUBREGISTRY_BENCHMARK_TARGETS[*]}" =~ "${1}" ]]; then
 	julia -e 'using Pkg; Pkg.Registry.add(); Pkg.Registry.status()'
 fi
 
+# GPU benchmark setup; ${1} may be a file under GPU/ or the GPU project dir itself, so match on "${1%/}/"
+if [[ "${1%/}/" == *GPU/* ]]; then
+	echo "--- :gpu: GPU benchmark setup"
+	# Disable CUDA memory pool for accurate benchmarking
+	export JULIA_CUDA_MEMORY_POOL='none'
+	echo "JULIA_CUDA_MEMORY_POOL=${JULIA_CUDA_MEMORY_POOL}"
+fi
+
 # Instantiate, to install the overall project dependencies, and `build()` for conda
 echo "--- :julia: Instantiate"
 julia --project=. -e 'using Pkg; Pkg.instantiate(); Pkg.build()'
 
+# Verify CUDA availability for GPU benchmarks
+if [[ "${1}" == *GPU/* ]]; then
+	echo "--- :gpu: Verify CUDA availability"
+	julia --project=. -e 'using CUDA; CUDA.functional() || error("CUDA not functional!"); println("GPU: ", CUDA.name(CUDA.device())); CUDA.versioninfo()'
+fi
+
 if [[ "${1}" == *BayesianInference* ]]; then
 	export CMDSTAN_HOME="$(pwd)/cmdstan-2.29.2/"
 	curl -LO https://github.com/stan-dev/cmdstan/releases/download/v2.29.2/cmdstan-2.29.2.tar.gz

diff --git a/.buildkite/launch_benchmarks.yml b/.buildkite/launch_benchmarks.yml
@@ -3,17 +3,35 @@ agents:
   sandbox_capable: true
 
 steps:
-  - label: ":runner: Dynamically launch run_benchmark.yml"
+  - label: ":runner: Dynamically launch run_benchmark.yml (CPU)"
     branches: "!gh-pages"
     env:
       BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
     depends_on:
     plugins:
       - staticfloat/forerunner#v1:
-          # This will create one job per project
+          # One job per project; benchmarks/GPU is listed under `ignore` because the GPU step launches it (NOTE(review): confirm forerunner#v1 honors `ignore`)
           watch:
             - benchmarks/**/*.jmd
             - benchmarks/**/*.toml
+          ignore:
+            - benchmarks/GPU
+            - benchmarks/GPU/**
           path_processor: .buildkite/path_processors/project-coalescing
           target: .buildkite/run_benchmark.yml
           target_type: template
+
+  - label: ":runner: :gpu: Dynamically launch run_gpu_benchmark.yml (GPU)"
+    branches: "!gh-pages"
+    env:
+      BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
+    # Explicit null: this launcher does not wait on any other step
+    depends_on: null
+    plugins:
+      - staticfloat/forerunner#v1:
+          # Changed GPU benchmark sources are expanded into jobs from the
+          # run_gpu_benchmark.yml template
+          watch:
+            - benchmarks/GPU/**/*.jmd
+            - benchmarks/GPU/**/*.toml
+          path_processor: .buildkite/path_processors/project-coalescing
+          target: .buildkite/run_gpu_benchmark.yml
+          target_type: template
diff --git a/.buildkite/run_gpu_benchmark.yml b/.buildkite/run_gpu_benchmark.yml
@@ -0,0 +1,94 @@
+# This is a pipeline that runs GPU benchmarks, then uploads the resultant
+# .PDF and other reports as (buildkite, not Julia) artifacts. The `coppermind`
+# configuration memoizes the result, so that identical inputs don't get
+# benchmarked multiple times.
+#
+# GPU benchmarks use the juliagpu queue with CUDA-capable agents.
+# Note: sandbox plugin is NOT used for GPU jobs due to GPU passthrough complexity.
+
+steps:
+  # Benchmark job: runs build_benchmark.sh for {PATH} on a CUDA-tagged agent.
+  - label: ":hammer: :gpu: {PATH}"
+    # This key is what the publish step names in `depends_on` and in
+    # coppermind's `input_from`.
+    key: "benchmark-{SANITIZED_PATH}"
+    env:
+      BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
+    plugins:
+      - staticfloat/cryptic#v2:
+          variables:
+            - BUILDKITE_S3_ACCESS_KEY_ID="U2FsdGVkX1+x3xs1ZRRZRt3FmwFQmYYKnpV7o8xKkX5Ib6y0o5fv0+rskVAj+JKu"
+            - BUILDKITE_S3_SECRET_ACCESS_KEY="U2FsdGVkX1+LWh1yX7LsMBlecEJLc08eJrgOhurhd47CY1/jS3wCGVCQmS1t6f2j70spBoFdfc9kn2naj8HH5A=="
+            - BUILDKITE_S3_DEFAULT_REGION="U2FsdGVkX18ccoE9FmtkwsCm1x0MLMBlN/FLcAyKkY4="
+          files:
+            - .buildkite/secrets/token.toml
+      - JuliaCI/julia#v1:
+          version: "1.10"
+      # Note: No sandbox plugin for GPU jobs - GPU passthrough is complex
+      - staticfloat/coppermind#v1:
+          inputs:
+            # We are sensitive to the actual benchmark changing
+            - {PATH}
+            # We are sensitive to the source code of this package changing
+            - src/**/*.jl
+            # We are sensitive to our overall dependencies changing
+            - ./*.toml
+          outputs:
+            - markdown/**/figures/*.png
+            - markdown/**/*.md
+            - markdown/**/*.pdf
+            - markdown/**/*.svg
+            - notebook/**/*.ipynb
+            - pdf/**/*.pdf
+            - script/**/*.jl
+          s3_prefix: s3://julialang-buildkite-artifacts/scimlbenchmarks
+    # 12000 minutes = 200 hours
+    timeout_in_minutes: 12000
+    commands: |
+      # Clear out these secrets as they're not needed during the actual build
+      BUILDKITE_S3_ACCESS_KEY_ID="" BUILDKITE_S3_SECRET_ACCESS_KEY="" ./.buildkite/build_benchmark.sh "{PATH}"
+    # Target agents on the juliagpu queue that carry a `cuda` tag; "*" accepts
+    # any value for that tag.
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+
+  # Publish job: pulls the outputs memoized by the benchmark job for {PATH}
+  # and runs publish_benchmark_output.sh on a sandbox-capable x86_64 agent.
+  - label: ":rocket: Publish {PATH}"
+    env:
+      BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
+    plugins:
+      - staticfloat/cryptic#v2:
+          variables:
+            - BUILDKITE_S3_ACCESS_KEY_ID="U2FsdGVkX1+x3xs1ZRRZRt3FmwFQmYYKnpV7o8xKkX5Ib6y0o5fv0+rskVAj+JKu"
+            - BUILDKITE_S3_SECRET_ACCESS_KEY="U2FsdGVkX1+LWh1yX7LsMBlecEJLc08eJrgOhurhd47CY1/jS3wCGVCQmS1t6f2j70spBoFdfc9kn2naj8HH5A=="
+            - BUILDKITE_S3_DEFAULT_REGION="U2FsdGVkX18ccoE9FmtkwsCm1x0MLMBlN/FLcAyKkY4="
+          files:
+            - .buildkite/ssh_deploy.key
+      - JuliaCI/julia#v1:
+          version: "1.10"
+      - staticfloat/sandbox:
+          rootfs_url: "https://jc-rootfs-images.s3.amazonaws.com/aws_uploader-2021-11-12.x86_64.tar.gz"
+          rootfs_treehash: "986217e5b36efd3b3b91ed90df8e36d628cf543f"
+          workspaces:
+            # Include the julia we just downloaded
+            - "/cache/julia-buildkite-plugin:/cache/julia-buildkite-plugin"
+      # Use coppermind to download the benchmark results that were calculated in the
+      # benchmarking job above. Note we still list `outputs` here, since we have the
+      # option to extract only a subset of them here.
+      # (This list leaves out the markdown/**/*.pdf and markdown/**/*.svg patterns
+      # that the benchmarking job declares.)
+      - staticfloat/coppermind#v1:
+          input_from: "benchmark-{SANITIZED_PATH}"
+          outputs:
+            - markdown/**/figures/*.png
+            - markdown/**/*.md
+            - notebook/**/*.ipynb
+            - pdf/**/*.pdf
+            - script/**/*.jl
+          s3_prefix: s3://julialang-buildkite-artifacts/scimlbenchmarks
+      # The deploy key decrypted by cryptic above is handed to ssh-agent
+      - staticfloat/ssh-agent:
+          keyfiles:
+            - .buildkite/ssh_deploy.key
+    agents:
+      queue: "juliaecosystem"
+      sandbox_capable: true
+      arch: "x86_64"
+    # At most one job in the "scimlbenchmarks/deploy" group runs at a time
+    concurrency: 1
+    concurrency_group: "scimlbenchmarks/deploy"
+    commands: .buildkite/publish_benchmark_output.sh
+    # Don't run this unless we're on the master branch, and not until the actual benchmark
+    # command has had a chance to run.
+    depends_on: "benchmark-{SANITIZED_PATH}"
+    branches: "master"
diff --git a/benchmarks/GPU/EnsembleGPU_Lorenz.jmd b/benchmarks/GPU/EnsembleGPU_Lorenz.jmd
@@ -0,0 +1,209 @@
+---
+title: GPU Ensemble ODE Benchmark - Lorenz System
+author: SciMLBenchmarks
+---
+
+## Introduction
+
+This benchmark compares CPU vs GPU performance for solving many independent ODE trajectories using DiffEqGPU.jl's EnsembleGPUKernel approach. The Lorenz system is used as a canonical chaotic system benchmark.
+
+GPU acceleration is most effective for ensemble problems where thousands of independent trajectories need to be computed. The GPU kernel approach compiles the entire ODE solver to GPU, avoiding CPU-GPU synchronization overhead.
+
+```julia
+using OrdinaryDiffEq, DiffEqGPU, CUDA, StaticArrays, BenchmarkTools, Plots
+
+# Check GPU availability once; later chunks branch on CUDA_AVAILABLE to skip GPU work
+const CUDA_AVAILABLE = CUDA.functional()
+println("CUDA functional: ", CUDA_AVAILABLE)
+if CUDA_AVAILABLE
+    println("GPU: ", CUDA.name(CUDA.device()))
+    CUDA.versioninfo()
+else
+    @warn "CUDA not functional - GPU benchmarks will be skipped, showing CPU-only results"
+end
+```
+
+## Problem Definition
+
+The Lorenz system is defined using StaticArrays for stack-allocated, non-allocating operations required for GPU execution:
+
+$$\frac{dx}{dt} = \sigma(y - x)$$
+$$\frac{dy}{dt} = x(\rho - z) - y$$
+$$\frac{dz}{dt} = xy - \beta z$$
+
+```julia
+# Out-of-place Lorenz right-hand side: takes the state `u = (x, y, z)` and the
+# parameters `p = (sigma, rho, beta)` and returns the derivative as a static
+# array. `t` is unused.
+function lorenz(u, p, t)
+    x, y, z = u
+    sigma, rho, beta = p
+    return SA[sigma * (y - x), x * (rho - z) - y, x * y - beta * z]
+end
+
+u0 = SA[1.0f0, 0.0f0, 0.0f0]
+tspan = (0.0f0, 10.0f0)
+p = SA[10.0f0, 28.0f0, 8.0f0/3.0f0]
+
+prob = ODEProblem{false}(lorenz, u0, tspan, p)
+```
+
+## Ensemble Problem Setup
+
+We create an ensemble with parameter variations to simulate many independent trajectories:
+
+```julia
+# Build the problem for trajectory `i`: sigma is shifted by 0.01*i while rho and
+# beta keep their base values. `repeat` is unused.
+# NOTE(review): sigma grows with i (about 1010 at i = 100_000), so per-trajectory
+# cost is not uniform across ensemble sizes - confirm this is intended.
+function prob_func(prob, i, repeat)
+    sigma_i = 10.0f0 + 0.01f0*i
+    return remake(prob, p = SA[sigma_i, 28.0f0, 8.0f0/3.0f0])
+end
+
+ensemble_prob = EnsembleProblem(prob, prob_func = prob_func)
+```
+
+## Single Trajectory Verification
+
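+First, run a single trajectory on the CPU and, when CUDA is available, on the GPU as a sanity check. Because the Lorenz system is chaotic, small floating-point differences grow over time, so expect approximate rather than exact agreement: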
+
+```julia
+# CPU solution of the base problem (also used for the attractor plot below)
+sol_cpu = solve(prob, Tsit5(), saveat = 0.1f0)
+println("CPU final state: ", sol_cpu.u[end])
+
+if CUDA_AVAILABLE
+    # The ensemble applies prob_func, so its first trajectory uses
+    # sigma = 10.01 rather than the base problem's sigma = 10. Solve that same
+    # remade problem on the CPU so the two printed states are comparable.
+    sol_ref = solve(prob_func(prob, 1, false), Tsit5(), saveat = 0.1f0)
+    println("CPU final state (ensemble trajectory 1): ", sol_ref.u[end])
+
+    # GPU solution (single trajectory)
+    sol_gpu = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
+                    trajectories = 1, saveat = 0.1f0)
+    println("GPU final state: ", sol_gpu[1].u[end])
+end
+```
+
+```julia
+plot(sol_cpu, idxs = (1, 2, 3), title = "Lorenz Attractor (CPU)", label = "")
+```
+
+## Benchmarking: CPU vs GPU
+
+We benchmark across different trajectory counts to find the crossover point where GPU becomes faster:
+
+```julia
+trajectory_counts = [100, 1000, 10000, 100000]
+cpu_times = Float64[]
+gpu_times = Float64[]
+
+# For every ensemble size, time the threaded CPU solve and, when CUDA is
+# usable, the GPU kernel solve. gpu_times receives NaN when the GPU is skipped
+# so both vectors stay aligned with trajectory_counts.
+for N in trajectory_counts
+    println("\n--- Benchmarking N = $N trajectories ---")
+
+    # Multithreaded CPU baseline
+    cpu_elapsed = @belapsed solve($ensemble_prob, Tsit5(), EnsembleThreads(),
+                                 trajectories = $N, saveat = 0.1f0) samples=3 evals=1
+    push!(cpu_times, cpu_elapsed)
+    println("CPU (threaded): $(round(cpu_elapsed, digits=4))s")
+
+    # No usable GPU: record the placeholder and move on to the next size
+    if !CUDA_AVAILABLE
+        push!(gpu_times, NaN)
+        println("GPU (CUDA):     N/A (CUDA not available)")
+        continue
+    end
+
+    # GPU kernel ensemble
+    gpu_elapsed = @belapsed solve($ensemble_prob, GPUTsit5(),
+                                 EnsembleGPUKernel(CUDA.CUDABackend()),
+                                 trajectories = $N, saveat = 0.1f0) samples=3 evals=1
+    push!(gpu_times, gpu_elapsed)
+    println("GPU (CUDA):     $(round(gpu_elapsed, digits=4))s")
+    println("Speedup:        $(round(cpu_elapsed / gpu_elapsed, digits=1))x")
+end
+```
+
+## Results Visualization
+
+```julia
+# Axis and marker styling shared by both variants of the timing plot
+timing_style = (xscale = :log10, yscale = :log10,
+                xlabel = "Number of Trajectories",
+                ylabel = "Time (s)",
+                marker = :circle,
+                linewidth = 2)
+
+# Plot both series when GPU timings were collected, otherwise the CPU series only
+p1 = if CUDA_AVAILABLE && !any(isnan, gpu_times)
+    plot(trajectory_counts, [cpu_times gpu_times];
+         label = ["CPU (Threads)" "GPU (CUDA)"],
+         title = "DiffEqGPU.jl Ensemble Performance",
+         legend = :topleft,
+         timing_style...)
+else
+    plot(trajectory_counts, cpu_times;
+         label = "CPU (Threads)",
+         title = "CPU Ensemble Performance (GPU unavailable)",
+         timing_style...)
+end
+p1
+```
+
+```julia
+# Speedup (CPU time / GPU time) per ensemble size, with a dashed reference
+# line at 1.0 marking the break-even point. Skipped when no GPU timings exist.
+if CUDA_AVAILABLE && !any(isnan, gpu_times)
+    speedups = cpu_times ./ gpu_times
+    # Keep the legend enabled: with `legend = false` the "Break-even" label
+    # on the reference line below would never be drawn.
+    p2 = plot(trajectory_counts, speedups,
+         xscale = :log10,
+         xlabel = "Number of Trajectories",
+         ylabel = "Speedup (CPU/GPU)",
+         title = "GPU Speedup vs CPU",
+         marker = :circle,
+         label = "Speedup",
+         legend = :topleft,
+         linewidth = 2,
+         color = :green)
+    # Target p2 explicitly instead of relying on it being the current plot
+    hline!(p2, [1.0], linestyle = :dash, color = :red, label = "Break-even")
+    p2
+else
+    println("GPU speedup plot skipped - CUDA not available")
+    nothing
+end
+```
+
+```julia
+if CUDA_AVAILABLE && !any(isnan, gpu_times)
+    plot(p1, p2, layout = (1, 2), size = (1000, 400))
+else
+    p1
+end
+```
+
+## Summary Table
+
+```julia
+using Printf
+
+# Fixed-width summary of the timings gathered above; the GPU and speedup
+# columns are printed only when GPU timings exist.
+rule_heavy = "=" ^ 60
+rule_light = "-" ^ 60
+
+println("\nSummary Results:")
+println(rule_heavy)
+if CUDA_AVAILABLE && !any(isnan, gpu_times)
+    @printf("%-15s %12s %12s %12s\n", "Trajectories", "CPU (s)", "GPU (s)", "Speedup")
+    println(rule_light)
+    for (n_traj, cpu_s, gpu_s) in zip(trajectory_counts, cpu_times, gpu_times)
+        @printf("%-15d %12.4f %12.4f %12.1fx\n", n_traj, cpu_s, gpu_s, cpu_s/gpu_s)
+    end
+else
+    @printf("%-15s %12s\n", "Trajectories", "CPU (s)")
+    println(rule_light)
+    for (n_traj, cpu_s) in zip(trajectory_counts, cpu_times)
+        @printf("%-15d %12.4f\n", n_traj, cpu_s)
+    end
+    println("\nNote: GPU benchmarks skipped - CUDA not available on this system")
+end
+println(rule_heavy)
+```
+
+## Conclusion
+
+GPU acceleration via DiffEqGPU.jl provides significant speedups for ensemble ODE problems when the number of trajectories is large (typically > 1000). The EnsembleGPUKernel approach compiles the entire solver to GPU code, maximizing throughput for embarrassingly parallel trajectory computations.
+
+Key observations:
+- GPU overhead is amortized over many trajectories
+- Float32 precision is recommended on GPUs (most GPUs have far higher Float32 than Float64 throughput, and Float32 halves memory traffic)
+- StaticArrays are required for GPU kernel compilation
+- The crossover point depends on problem complexity and GPU hardware
+
+```julia, echo = false
+using SciMLBenchmarks
+SciMLBenchmarks.bench_footer(WEAVE_ARGS[:folder], WEAVE_ARGS[:file])
+```
diff --git a/benchmarks/GPU/Project.toml b/benchmarks/GPU/Project.toml
@@ -0,0 +1,17 @@
+[deps]
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
+OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+SciMLBenchmarks = "31c91b34-3c75-11e9-0341-95557aab0344"
+StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+
+[compat]
+BenchmarkTools = "1"
+CUDA = "5"
+DiffEqGPU = "3"
+OrdinaryDiffEq = "6"
+Plots = "1"
+SciMLBenchmarks = "0.1"
+StaticArrays = "1"