Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .buildkite/build_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,24 @@ if [[ "${JULIAHUBREGISTRY_BENCHMARK_TARGETS[*]}" =~ "${1}" ]]; then
julia -e 'using Pkg; Pkg.Registry.add(); Pkg.Registry.status()'
fi

# Decide once whether the target in "${1}" belongs to the GPU benchmark project.
# A target can be a file inside the project (".../GPU/foo.jmd") or the project
# directory itself, which may arrive without a trailing slash (".../GPU").
# Matching only `*GPU/*` would miss the latter and silently skip every GPU step.
IS_GPU_BENCHMARK=false
if [[ "${1}" == *GPU/* || "${1}" == *GPU ]]; then
    IS_GPU_BENCHMARK=true
fi

# GPU benchmark setup
if [[ "${IS_GPU_BENCHMARK}" == true ]]; then
    echo "--- :gpu: GPU benchmark setup"
    # Disable CUDA memory pool for accurate benchmarking
    export JULIA_CUDA_MEMORY_POOL='none'
    echo "JULIA_CUDA_MEMORY_POOL=${JULIA_CUDA_MEMORY_POOL}"
fi

# Instantiate, to install the overall project dependencies, and `build()` for conda
echo "--- :julia: Instantiate"
julia --project=. -e 'using Pkg; Pkg.instantiate(); Pkg.build()'

# Verify CUDA availability for GPU benchmarks; abort early instead of producing
# a CPU-only report from a GPU job.
# NOTE(review): this runs in the root project (`--project=.`); confirm that CUDA
# is loadable from that environment and not only from the GPU benchmark project.
if [[ "${IS_GPU_BENCHMARK}" == true ]]; then
    echo "--- :gpu: Verify CUDA availability"
    julia --project=. -e 'using CUDA; CUDA.functional() || error("CUDA not functional!"); println("GPU: ", CUDA.name(CUDA.device())); CUDA.versioninfo()'
fi

if [[ "${1}" == *BayesianInference* ]]; then
export CMDSTAN_HOME="$(pwd)/cmdstan-2.29.2/"
curl -LO https://github.com/stan-dev/cmdstan/releases/download/v2.29.2/cmdstan-2.29.2.tar.gz
Expand Down
22 changes: 20 additions & 2 deletions .buildkite/launch_benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,35 @@ agents:
sandbox_capable: true

steps:
# Launcher step for the non-GPU benchmarks: the forerunner plugin watches the
# benchmark sources and instantiates run_benchmark.yml as a template.
# NOTE(review): the next two lines both declare a label; they look like the
# removed and the added line of a diff shown together - keep only the intended one.
- label: ":runner: Dynamically launch run_benchmark.yml"
- label: ":runner: Dynamically launch run_benchmark.yml (CPU)"
branches: "!gh-pages"
env:
BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
# Bare value parses as null.
# NOTE(review): presumably intended as "no dependencies" - confirm and consider writing `null` explicitly.
depends_on:
plugins:
- staticfloat/forerunner#v1:
# NOTE(review): the next two comment lines are likewise the old and new wording of the same comment.
# This will create one job per project
# This will create one job per project (excluding GPU benchmarks)
watch:
- benchmarks/**/*.jmd
- benchmarks/**/*.toml
# The watch globs above also match benchmarks/GPU, so the exclusion relies on `ignore`.
# NOTE(review): confirm that forerunner#v1 honours an `ignore` option; otherwise GPU
# benchmarks would also be launched through run_benchmark.yml.
ignore:
- benchmarks/GPU
- benchmarks/GPU/**
path_processor: .buildkite/path_processors/project-coalescing
target: .buildkite/run_benchmark.yml
target_type: template

# Launcher step for the GPU benchmarks: the forerunner plugin watches the files
# under benchmarks/GPU and instantiates run_gpu_benchmark.yml as a template.
- label: ":runner: :gpu: Dynamically launch run_gpu_benchmark.yml (GPU)"
  branches: "!gh-pages"
  env:
    BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
  # Explicit null (same value the bare `depends_on:` parsed to): no dependencies.
  depends_on: null
  plugins:
  - staticfloat/forerunner#v1:
      # This will create one job per GPU benchmark
      watch:
      - benchmarks/GPU/**/*.jmd
      - benchmarks/GPU/**/*.toml
      path_processor: .buildkite/path_processors/project-coalescing
      target: .buildkite/run_gpu_benchmark.yml
      target_type: template
94 changes: 94 additions & 0 deletions .buildkite/run_gpu_benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# This is a pipeline that runs GPU benchmarks, then uploads the resultant
# .PDF and other reports as (buildkite, not Julia) artifacts. The `coppermind`
# configuration memoizes the result, so that identical inputs don't get
# benchmarked multiple times.
#
# GPU benchmarks use the juliagpu queue with CUDA-capable agents.
# Note: sandbox plugin is NOT used for GPU jobs due to GPU passthrough complexity.

steps:
# GPU benchmark job: decrypts the S3 credentials, installs Julia, and runs
# build_benchmark.sh for {PATH} on an agent from the `juliagpu` queue.
# `{PATH}` / `{SANITIZED_PATH}` are template placeholders.
# NOTE(review): presumably substituted by the launcher before this file is parsed as YAML - confirm.
- label: ":hammer: :gpu: {PATH}"
  key: "benchmark-{SANITIZED_PATH}"
  env:
    BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
  plugins:
  - staticfloat/cryptic#v2:
      variables:
      - BUILDKITE_S3_ACCESS_KEY_ID="U2FsdGVkX1+x3xs1ZRRZRt3FmwFQmYYKnpV7o8xKkX5Ib6y0o5fv0+rskVAj+JKu"
      - BUILDKITE_S3_SECRET_ACCESS_KEY="U2FsdGVkX1+LWh1yX7LsMBlecEJLc08eJrgOhurhd47CY1/jS3wCGVCQmS1t6f2j70spBoFdfc9kn2naj8HH5A=="
      - BUILDKITE_S3_DEFAULT_REGION="U2FsdGVkX18ccoE9FmtkwsCm1x0MLMBlN/FLcAyKkY4="
      files:
      - .buildkite/secrets/token.toml
  - JuliaCI/julia#v1:
      version: "1.10"
  # Note: No sandbox plugin for GPU jobs - GPU passthrough is complex
  - staticfloat/coppermind#v1:
      inputs:
      # We are sensitive to the actual benchmark changing
      - {PATH}
      # We are sensitive to the source code of this package changing
      - src/**/*.jl
      # We are sensitive to our overall dependencies changing
      - ./*.toml
      outputs:
      - markdown/**/figures/*.png
      - markdown/**/*.md
      - markdown/**/*.pdf
      - markdown/**/*.svg
      - notebook/**/*.ipynb
      - pdf/**/*.pdf
      - script/**/*.jl
      s3_prefix: s3://julialang-buildkite-artifacts/scimlbenchmarks
  timeout_in_minutes: 12000
  commands: |
    # Clear out these secrets as they're not needed during the actual build
    BUILDKITE_S3_ACCESS_KEY_ID="" BUILDKITE_S3_SECRET_ACCESS_KEY="" ./.buildkite/build_benchmark.sh "{PATH}"
  agents:
    queue: "juliagpu"
    cuda: "*"

# Publish job: fetches the outputs recorded by the benchmark step with the key
# "benchmark-{SANITIZED_PATH}" and runs publish_benchmark_output.sh inside the
# sandbox on a `juliaecosystem` agent. Deploys are serialized through the
# "scimlbenchmarks/deploy" concurrency group.
- label: ":rocket: Publish {PATH}"
  env:
    BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET: ${BUILDKITE_PLUGIN_CRYPTIC_BASE64_SIGNED_JOB_ID_SECRET?}
  plugins:
  - staticfloat/cryptic#v2:
      variables:
      - BUILDKITE_S3_ACCESS_KEY_ID="U2FsdGVkX1+x3xs1ZRRZRt3FmwFQmYYKnpV7o8xKkX5Ib6y0o5fv0+rskVAj+JKu"
      - BUILDKITE_S3_SECRET_ACCESS_KEY="U2FsdGVkX1+LWh1yX7LsMBlecEJLc08eJrgOhurhd47CY1/jS3wCGVCQmS1t6f2j70spBoFdfc9kn2naj8HH5A=="
      - BUILDKITE_S3_DEFAULT_REGION="U2FsdGVkX18ccoE9FmtkwsCm1x0MLMBlN/FLcAyKkY4="
      files:
      - .buildkite/ssh_deploy.key
  - JuliaCI/julia#v1:
      version: "1.10"
  - staticfloat/sandbox:
      rootfs_url: "https://jc-rootfs-images.s3.amazonaws.com/aws_uploader-2021-11-12.x86_64.tar.gz"
      rootfs_treehash: "986217e5b36efd3b3b91ed90df8e36d628cf543f"
      workspaces:
      # Include the julia we just downloaded
      - "/cache/julia-buildkite-plugin:/cache/julia-buildkite-plugin"
  # Use coppermind to download the benchmark results that were calculated in the
  # benchmarking job above. Note we still list `outputs` here, since we have the
  # option to extract only a subset of them here.
  - staticfloat/coppermind#v1:
      input_from: "benchmark-{SANITIZED_PATH}"
      outputs:
      - markdown/**/figures/*.png
      - markdown/**/*.md
      - notebook/**/*.ipynb
      - pdf/**/*.pdf
      - script/**/*.jl
      s3_prefix: s3://julialang-buildkite-artifacts/scimlbenchmarks
  - staticfloat/ssh-agent:
      keyfiles:
      - .buildkite/ssh_deploy.key
  agents:
    queue: "juliaecosystem"
    sandbox_capable: true
    arch: "x86_64"
  concurrency: 1
  concurrency_group: "scimlbenchmarks/deploy"
  commands: .buildkite/publish_benchmark_output.sh
  # Don't run this unless we're on the master branch, and not until the actual benchmark
  # command has had a chance to run.
  depends_on: "benchmark-{SANITIZED_PATH}"
  branches: "master"
209 changes: 209 additions & 0 deletions benchmarks/GPU/EnsembleGPU_Lorenz.jmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
---
title: GPU Ensemble ODE Benchmark - Lorenz System
author: SciMLBenchmarks
---

## Introduction

This benchmark compares CPU vs GPU performance for solving many independent ODE trajectories using DiffEqGPU.jl's EnsembleGPUKernel approach. The Lorenz system is used as a canonical chaotic system benchmark.

GPU acceleration is most effective for ensemble problems where thousands of independent trajectories need to be computed. The GPU kernel approach compiles the entire ODE solver to GPU, avoiding CPU-GPU synchronization overhead.

```julia
using OrdinaryDiffEq, DiffEqGPU, CUDA, StaticArrays, BenchmarkTools, Plots

# Probe the GPU once; the later chunks branch on this flag.
const CUDA_AVAILABLE = CUDA.functional()
println("CUDA functional: ", CUDA_AVAILABLE)
if !CUDA_AVAILABLE
    # Keep going so the document still renders with CPU timings only.
    @warn "CUDA not functional - GPU benchmarks will be skipped, showing CPU-only results"
else
    println("GPU: ", CUDA.name(CUDA.device()))
    CUDA.versioninfo()
end
```

## Problem Definition

The Lorenz system is defined using StaticArrays for stack-allocated, non-allocating operations required for GPU execution:

$$\frac{dx}{dt} = \sigma(y - x)$$
$$\frac{dy}{dt} = x(\rho - z) - y$$
$$\frac{dz}{dt} = xy - \beta z$$

```julia
# Out-of-place right-hand side of the Lorenz system.
#   u = (x, y, z) state, p = (sigma, rho, beta) parameters, t = time (unused).
# Returns the derivative (dx, dy, dz) as a static vector built with `SA[...]`.
function lorenz(u, p, t)
    return SA[
        p[1] * (u[2] - u[1]),
        u[1] * (p[2] - u[3]) - u[2],
        u[1] * u[2] - p[3] * u[3]
    ]
end

# Base problem in single precision: start at (1, 0, 0), integrate over t in [0, 10]
# with (sigma, rho, beta) = (10, 28, 8/3).
u0 = SVector{3, Float32}(1.0f0, 0.0f0, 0.0f0)
tspan = (0.0f0, 10.0f0)
p = SVector{3, Float32}(10.0f0, 28.0f0, 8.0f0 / 3.0f0)

# `{false}` selects the out-of-place form matching `lorenz(u, p, t)`.
prob = ODEProblem{false}(lorenz, u0, tspan, p)
```

## Ensemble Problem Setup

We create an ensemble with parameter variations to simulate many independent trajectories:

```julia
# Per-trajectory problem generator: trajectory `i` gets sigma = 10 + 0.01 * i,
# with rho and beta kept at 28 and 8/3. `repeat` is unused.
function prob_func(prob, i, repeat)
    sigma_i = 10.0f0 + 0.01f0 * i
    return remake(prob; p = SA[sigma_i, 28.0f0, 8.0f0 / 3.0f0])
end

ensemble_prob = EnsembleProblem(prob; prob_func = prob_func)
```

## Single Trajectory Verification

First, verify that GPU and CPU produce matching results:

```julia
# CPU reference solution. The ensemble's `prob_func` perturbs sigma for every
# trajectory (10.01 for i = 1), so solve that same perturbed problem here;
# solving the unperturbed `prob` would compare two different systems.
sol_cpu = solve(prob_func(prob, 1, 1), Tsit5(), saveat = 0.1f0)
println("CPU final state: ", sol_cpu.u[end])

if CUDA_AVAILABLE
    # GPU solution (single trajectory, i.e. ensemble member i = 1)
    sol_gpu = solve(ensemble_prob, GPUTsit5(), EnsembleGPUKernel(CUDA.CUDABackend()),
        trajectories = 1, saveat = 0.1f0)
    println("GPU final state: ", sol_gpu[1].u[end])
end
```

```julia
# Phase portrait of the CPU solution using state components 1, 2 and 3.
plot(sol_cpu;
    idxs = (1, 2, 3),
    label = "",
    title = "Lorenz Attractor (CPU)")
```

## Benchmarking: CPU vs GPU

We benchmark across different trajectory counts to find the crossover point where GPU becomes faster:

```julia
# Ensemble sizes to time: 10^2 through 10^5 trajectories.
trajectory_counts = [10^k for k in 2:5]
cpu_times = Float64[]
gpu_times = Float64[]

# For each size, time the threaded CPU ensemble and, when CUDA works, the GPU
# kernel ensemble (3 samples, 1 evaluation each). One entry is appended to both
# `cpu_times` and `gpu_times` per size; the GPU entry is NaN without CUDA.
for N in trajectory_counts
    println("\n--- Benchmarking N = $N trajectories ---")

    t_cpu = @belapsed solve($ensemble_prob, Tsit5(), EnsembleThreads(),
        trajectories = $N, saveat = 0.1f0) samples=3 evals=1
    push!(cpu_times, t_cpu)
    println("CPU (threaded): $(round(t_cpu, digits=4))s")

    if !CUDA_AVAILABLE
        # Placeholder keeps `gpu_times` aligned with `trajectory_counts`.
        push!(gpu_times, NaN)
        println("GPU (CUDA): N/A (CUDA not available)")
        continue
    end

    t_gpu = @belapsed solve($ensemble_prob, GPUTsit5(),
        EnsembleGPUKernel(CUDA.CUDABackend()),
        trajectories = $N, saveat = 0.1f0) samples=3 evals=1
    push!(gpu_times, t_gpu)
    println("GPU (CUDA): $(round(t_gpu, digits=4))s")
    speedup = t_cpu / t_gpu
    println("Speedup: $(round(speedup, digits=1))x")
end
```

## Results Visualization

```julia
# Log-log plot of wall time against ensemble size. Both curves are drawn when
# GPU timings exist, otherwise only the CPU curve.
p1 = let shared = (xscale = :log10, yscale = :log10,
                   xlabel = "Number of Trajectories",
                   ylabel = "Time (s)",
                   marker = :circle,
                   linewidth = 2)
    if CUDA_AVAILABLE && !any(isnan, gpu_times)
        plot(trajectory_counts, hcat(cpu_times, gpu_times);
            label = ["CPU (Threads)" "GPU (CUDA)"],
            title = "DiffEqGPU.jl Ensemble Performance",
            legend = :topleft,
            shared...)
    else
        plot(trajectory_counts, cpu_times;
            label = "CPU (Threads)",
            title = "CPU Ensemble Performance (GPU unavailable)",
            shared...)
    end
end
p1

```julia
# Speedup (CPU time / GPU time) against ensemble size, with a dashed line at 1.0
# marking break-even. The legend is enabled so the "Break-even" label is actually
# rendered; with `legend = false` it would never be shown.
if CUDA_AVAILABLE && !any(isnan, gpu_times)
    speedups = cpu_times ./ gpu_times
    p2 = plot(trajectory_counts, speedups,
        xscale = :log10,
        xlabel = "Number of Trajectories",
        ylabel = "Speedup (CPU/GPU)",
        title = "GPU Speedup vs CPU",
        marker = :circle,
        label = "GPU speedup",
        legend = :topleft,
        linewidth = 2,
        color = :green)
    # Draw onto `p2` explicitly rather than relying on the implicit current plot.
    hline!(p2, [1.0], linestyle = :dash, color = :red, label = "Break-even")
    p2
else
    println("GPU speedup plot skipped - CUDA not available")
    nothing
end

```julia
# Side-by-side timing and speedup panels when GPU timings exist; otherwise fall
# back to the CPU-only timing plot.
(CUDA_AVAILABLE && !any(isnan, gpu_times)) ?
    plot(p1, p2, layout = (1, 2), size = (1000, 400)) :
    p1

## Summary Table

```julia
using Printf

# Fixed-width text table of the collected timings. Includes GPU time and speedup
# columns when GPU timings exist, otherwise a CPU-only table plus a note.
let gpu_measured = CUDA_AVAILABLE && !any(isnan, gpu_times)
    println("\nSummary Results:")
    println("=" ^ 60)
    if !gpu_measured
        @printf("%-15s %12s\n", "Trajectories", "CPU (s)")
        println("-" ^ 60)
        for idx in eachindex(trajectory_counts)
            @printf("%-15d %12.4f\n", trajectory_counts[idx], cpu_times[idx])
        end
        println("\nNote: GPU benchmarks skipped - CUDA not available on this system")
    else
        @printf("%-15s %12s %12s %12s\n", "Trajectories", "CPU (s)", "GPU (s)", "Speedup")
        println("-" ^ 60)
        for idx in eachindex(trajectory_counts)
            secs_cpu, secs_gpu = cpu_times[idx], gpu_times[idx]
            @printf("%-15d %12.4f %12.4f %12.1fx\n", trajectory_counts[idx], secs_cpu, secs_gpu, secs_cpu / secs_gpu)
        end
    end
    println("=" ^ 60)
end

## Conclusion

GPU acceleration via DiffEqGPU.jl provides significant speedups for ensemble ODE problems when the number of trajectories is large (typically > 1000). The EnsembleGPUKernel approach compiles the entire solver to GPU code, maximizing throughput for embarrassingly parallel trajectory computations.

Key observations:
- GPU overhead is amortized over many trajectories
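- Float32 precision is recommended on GPUs: most GPUs have far higher single-precision than double-precision throughput, and Float32 halves the memory traffic per value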
- StaticArrays are required for GPU kernel compilation
- The crossover point depends on problem complexity and GPU hardware

```julia, echo = false
# Append the SciMLBenchmarks footer for this document. The folder and file are
# taken from the `WEAVE_ARGS` entries `:folder` and `:file`.
# NOTE(review): presumably WEAVE_ARGS is populated by the weave driver - confirm.
using SciMLBenchmarks
SciMLBenchmarks.bench_footer(WEAVE_ARGS[:folder], WEAVE_ARGS[:file])
```
17 changes: 17 additions & 0 deletions benchmarks/GPU/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Packages loaded by the `using` lines of the GPU benchmark document.
# NOTE(review): the benchmark also runs `using Printf`, which is not declared here.
# Printf ships with Julia as a standard library - confirm it resolves in this environment.
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
SciMLBenchmarks = "31c91b34-3c75-11e9-0341-95557aab0344"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

# Version bounds, one entry per dependency above.
[compat]
BenchmarkTools = "1"
CUDA = "5"
DiffEqGPU = "3"
OrdinaryDiffEq = "6"
Plots = "1"
SciMLBenchmarks = "0.1"
StaticArrays = "1"