Skip to content

Commit 1124a83

Browse files
authored
Merge pull request #147 from leios/KA_support
KernelAbstractions support
2 parents 59f8967 + 1ccb627 commit 1124a83

30 files changed

+997
-934
lines changed

.github/workflows/CI.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ jobs:
2929
- NotGradients
3030
- Gradients
3131
steps:
32+
- run: export UCX_ERROR_SIGNALS="SIGILL,SIGBUS,SIGFPE"
3233
- uses: actions/checkout@v4
3334
- uses: julia-actions/setup-julia@v2
3435
with:

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
*.jl.*.cov
33
*.jl.mem
44
docs/build
5-
/Manifest.toml
5+
*Manifest.toml
66
benchmark/tune.json
77
benchmark/results
88
.vscode/settings.json

Project.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
88
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
99
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
1010
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
11-
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
1211
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
1312
Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015"
1413
Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa"
@@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
1716
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
1817
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
1918
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
19+
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
2020
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
21+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
2122
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
2223
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
2324
PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884"
@@ -32,13 +33,15 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a"
3233
UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249"
3334

3435
[weakdeps]
36+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
3537
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
3638
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
3739
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
3840
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
3941
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
4042

4143
[extensions]
44+
MollyCUDAExt = "CUDA"
4245
MollyEnzymeExt = "Enzyme"
4346
MollyGLMakieExt = ["GLMakie", "Colors"]
4447
MollyKernelDensityExt = "KernelDensity"
@@ -61,7 +64,9 @@ Enzyme = "0.13.20"
6164
EzXML = "1"
6265
FLoops = "0.2"
6366
GLMakie = "0.8, 0.9, 0.10, 0.11"
67+
GPUArrays = "11"
6468
Graphs = "1.8"
69+
KernelAbstractions = "0.9"
6570
KernelDensity = "0.5, 0.6"
6671
LinearAlgebra = "1.9"
6772
NearestNeighbors = "0.4"

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Implemented features include:
3434
- [Unitful.jl](https://github.com/PainterQubits/Unitful.jl) compatibility so numbers have physical meaning.
3535
- Set up crystal systems using [SimpleCrystals.jl](https://github.com/ejmeitz/SimpleCrystals.jl).
3636
- Automatic multithreading.
37-
- GPU acceleration on CUDA-enabled devices.
37+
- GPU acceleration on all backends supported by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), with better performance on CUDA-enabled devices.
3838
- Run with Float64, Float32 or other float types.
3939
- Some analysis functions, e.g. RDF.
4040
- Visualise simulations as animations with [Makie.jl](https://makie.juliaplots.org/stable).

benchmark/benchmarks.jl

Lines changed: 29 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ else
1717
@warn "The parallel benchmarks will not be run as Julia is running on 1 thread"
1818
end
1919

20-
# Allow CUDA device to be specified
21-
const DEVICE = get(ENV, "DEVICE", "0")
20+
# Allow GPU device to be specified
21+
const DEVICE = parse(Int, get(ENV, "DEVICE", "0"))
2222

23-
const run_gpu_tests = CUDA.functional()
24-
if run_gpu_tests
25-
device!(parse(Int, DEVICE))
26-
@info "The GPU benchmarks will be run on device $DEVICE"
23+
const run_cuda_tests = CUDA.functional()
24+
if run_cuda_tests
25+
device!(DEVICE)
26+
@info "The CUDA benchmarks will be run on device $DEVICE"
2727
else
28-
@warn "The GPU benchmarks will not be run as a CUDA-enabled device is not available"
28+
@warn "The CUDA benchmarks will not be run as a CUDA-enabled device is not available"
2929
end
3030

3131
const SUITE = BenchmarkGroup(
@@ -62,7 +62,7 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
6262
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
6363
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
6464

65-
function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
65+
function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT
6666
n_atoms = 400
6767
n_steps = 200
6868
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +72,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
7272
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
7373
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
7474
specific_inter_lists = (InteractionList2Atoms(
75-
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
76-
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
77-
gpu ? CuArray(bonds) : bonds,
75+
AT(Int32.(collect(1:2:n_atoms))),
76+
AT(Int32.(collect(2:2:n_atoms))),
77+
AT(bonds),
7878
),)
7979

8080
neighbor_finder = NoNeighborFinder()
8181
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
8282
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
8383
if nl
8484
neighbor_finder = DistanceNeighborFinder(
85-
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
85+
eligible=AT(trues(n_atoms, n_atoms)),
8686
n_steps=10,
8787
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
8888
)
8989
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
9090
end
9191

92-
if gpu
93-
coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords))
94-
velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities))
95-
atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
96-
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
97-
else
98-
coords = copy(f32 ? starting_coords_f32 : starting_coords)
99-
velocities = copy(f32 ? starting_velocities_f32 : starting_velocities)
100-
atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
101-
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
102-
end
92+
coords = AT(copy(f32 ? starting_coords_f32 : starting_coords))
93+
velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities))
94+
atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
95+
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
10396

10497
sys = System(
10598
atoms=atoms,
@@ -117,22 +110,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
117110
end
118111

119112
runs = [
120-
("CPU" , [false, false, false, false]),
121-
("CPU f32" , [false, false, true , false]),
122-
("CPU NL" , [true , false, false, false]),
123-
("CPU f32 NL", [true , false, true , false]),
113+
("CPU" , [false, false, false, Array]),
114+
("CPU f32" , [false, false, true , Array]),
115+
("CPU NL" , [true , false, false, Array]),
116+
("CPU f32 NL", [true , false, true , Array]),
124117
]
125118
if run_parallel_tests
126-
push!(runs, ("CPU parallel" , [false, true , false, false]))
127-
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
128-
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
129-
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
119+
push!(runs, ("CPU parallel" , [false, true , false, Array]))
120+
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
121+
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
122+
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
130123
end
131-
if run_gpu_tests
132-
push!(runs, ("GPU" , [false, false, false, true]))
133-
push!(runs, ("GPU f32" , [false, false, true , true]))
134-
push!(runs, ("GPU NL" , [true , false, false, true]))
135-
push!(runs, ("GPU f32 NL", [true , false, true , true]))
124+
if run_cuda_tests
125+
push!(runs, ("GPU" , [false, false, false, CuArray]))
126+
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
127+
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
128+
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
136129
end
137130

138131
for (name, args) in runs

benchmark/protein.jl

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
1111
const ff_dir = joinpath(data_dir, "force_fields")
1212
const openmm_dir = joinpath(data_dir, "openmm_6mrr")
1313

14-
function setup_system(gpu::Bool, f32::Bool, units::Bool)
14+
function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT
1515
T = f32 ? Float32 : Float64
1616
ff = MolecularForceField(
1717
T,
@@ -27,9 +27,9 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
2727
sys = System(
2828
joinpath(data_dir, "6mrr_equil.pdb"),
2929
ff;
30-
velocities=gpu ? CuArray(velocities) : velocities,
30+
velocities=AT(velocities),
3131
units=units,
32-
gpu=gpu,
32+
array_type=AT,
3333
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
3434
dist_neighbors=(units ? dist_neighbors * u"nm" : dist_neighbors),
3535
)
@@ -41,21 +41,21 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
4141
end
4242

4343
runs = [
44-
# run_name gpu parr f32 units
45-
("CPU 1 thread" , false, false, false, true ),
46-
("CPU 1 thread f32" , false, false, true , true ),
47-
("CPU 1 thread f32 nounits" , false, false, true , false),
48-
("CPU $n_threads threads" , false, true , false, true ),
49-
("CPU $n_threads threads f32" , false, true , true , true ),
50-
("CPU $n_threads threads f32 nounits", false, true , true , false),
51-
("GPU" , true , false, false, true ),
52-
("GPU f32" , true , false, true , true ),
53-
("GPU f32 nounits" , true , false, true , false),
44+
# run_name gpu parr f32 units
45+
("CPU 1 thread" , Array , false, false, true ),
46+
("CPU 1 thread f32" , Array , false, true , true ),
47+
("CPU 1 thread f32 nounits" , Array , false, true , false),
48+
("CPU $n_threads threads" , Array , true , false, true ),
49+
("CPU $n_threads threads f32" , Array , true , true , true ),
50+
("CPU $n_threads threads f32 nounits", Array , true , true , false),
51+
("GPU" , CuArray, false, false, true ),
52+
("GPU f32" , CuArray, false, true , true ),
53+
("GPU f32 nounits" , CuArray, false, true , false),
5454
]
5555

56-
for (run_name, gpu, parallel, f32, units) in runs
56+
for (run_name, AT, parallel, f32, units) in runs
5757
n_threads_used = parallel ? n_threads : 1
58-
sys, sim = setup_system(gpu, f32, units)
58+
sys, sim = setup_system(AT, f32, units)
5959
simulate!(deepcopy(sys), sim, 20; n_threads=n_threads_used)
6060
println(run_name)
6161
@time simulate!(sys, sim, n_steps; n_threads=n_threads_used)

docs/src/documentation.md

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4")
135135

136136
## GPU acceleration
137137

138-
To run simulations on the GPU you will need to have a CUDA-compatible device.
139-
[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device.
138+
To run simulations on the GPU you will need to have a GPU available and then load the appropriate package:
139+
140+
| Hardware Available | Necessary Package | Array Type |
141+
| ------------------ | ----------------- | ---------- |
142+
| Parallel CPU | none | `Array` |
143+
| NVIDIA GPU | CUDA | `CuArray` |
144+
| AMD GPU | AMDGPU | `ROCArray` |
145+
| Intel GPU | oneAPI | `oneArray` |
146+
| Apple Silicon | Metal | `MtlArray` |
147+
148+
As an important note, Metal/Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary.
140149
Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU.
141150
This example also shows setting up a simulation to run with `Float32`, which gives much better performance on GPUs.
142151
Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case.
152+
Here is an example script for an NVIDIA GPU using CUDA:
143153
```julia
144154
using Molly
145155
using CUDA
@@ -168,6 +178,7 @@ sys = System(
168178
simulate!(deepcopy(sys), simulator, 20) # Compile function
169179
simulate!(sys, simulator, 1_000)
170180
```
181+
To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type.
171182
The device to run on can be changed with `device!`, e.g. `device!(1)`.
172183
The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development.
173184
Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications.
@@ -316,7 +327,7 @@ sys = System(
316327
energy=TotalEnergyLogger(10),
317328
writer=StructureWriter(10, "traj_6mrr_1ps.pdb", ["HOH"]),
318329
),
319-
gpu=false,
330+
array_type=Array,
320331
)
321332

322333
minimizer = SteepestDescentMinimizer()
@@ -352,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha
352363

353364
Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1).
354365

355-
To run on the GPU, set `gpu=true`.
366+
To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example `CuArray` for NVIDIA or `ROCArray` for AMD).
356367
You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref).
357368
The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model.
358369
Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`).
@@ -1017,10 +1028,10 @@ function Molly.simulate!(sys::ReplicaSystem,
10171028
end
10181029
```
10191030

1020-
Under the hood there are two implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, and a version geared towards GPUs.
1021-
You can define different versions of a simulator for CPU and GPU systems by dispatching on `System{D, false}` or `System{D, true}` respectively.
1031+
Under the hood there are multiple implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, a CUDA version, and a version for other GPU backends.
1032+
You can define different versions of a simulator for CPU, CUDA and generic GPU systems by dispatching on `System{D, Array}` or `System{D, CuArray}` and `System{D, AT} where AT <: AbstractGPUArray` respectively.
10221033
This also applies to coupling methods, neighbor finders and analysis functions.
1023-
You do not have to define two versions though: you may only intend to use the simulator one way, or one version may be performant in all cases.
1034+
You do not have to define different versions though: you may only intend to use the simulator one way, or one version may be performant in all cases.
10241035

10251036
## Coupling
10261037

@@ -1321,7 +1332,7 @@ The available neighbor finders are:
13211332
- [`DistanceNeighborFinder`](@ref)
13221333
- [`TreeNeighborFinder`](@ref)
13231334

1324-
The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU and [`GPUNeighborFinder`](@ref) on GPU.
1335+
The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU, [`GPUNeighborFinder`](@ref) on NVIDIA GPUs and [`DistanceNeighborFinder`](@ref) on other GPUs.
13251336
When using a neighbor finder you should in general also use an interaction cutoff (see [Cutoffs](@ref)) with a cutoff distance less than the neighbor finder distance.
13261337
The difference between the two should be larger than an atom can move in the time of the `n_steps` defined by the neighbor finder.
13271338
The exception is [`GPUNeighborFinder`](@ref), which uses the algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list and should have `dist_cutoff` set to the interaction cutoff distance.

0 commit comments

Comments
 (0)