Skip to content

Commit ff3dc86

Browse files
committed
Adding KernelAbstractions tooling for Molly and tests
1 parent 49d92d9 commit ff3dc86

29 files changed

+975
-563
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
*.jl.*.cov
33
*.jl.mem
44
docs/build
5-
/Manifest.toml
5+
*Manifest.toml
66
benchmark/tune.json
77
benchmark/results
88
.vscode/settings.json

Project.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
88
AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a"
99
AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1"
1010
BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc"
11-
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
1211
CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864"
1312
ChainRules = "082447d4-558c-5d27-93f4-14fc19e9eca2"
1413
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -22,7 +21,9 @@ Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
2221
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
2322
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
2423
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
24+
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
2525
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
26+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
2627
KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b"
2728
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
2829
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
@@ -42,10 +43,12 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
4243
[weakdeps]
4344
GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a"
4445
PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
46+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4547

4648
[extensions]
4749
MollyGLMakieExt = "GLMakie"
4850
MollyPythonCallExt = "PythonCall"
51+
MollyCUDAExt = "CUDA"
4952

5053
[compat]
5154
Atomix = "0.1"
@@ -68,6 +71,7 @@ FLoops = "0.2"
6871
ForwardDiff = "0.10.35"
6972
GLMakie = "0.8, 0.9, 0.10"
7073
Graphs = "1.8"
74+
KernelAbstractions = "0.9"
7175
KernelDensity = "0.5, 0.6"
7276
LinearAlgebra = "1.9"
7377
NearestNeighbors = "0.4"

benchmark/benchmarks.jl

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,8 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at
6262
const starting_coords_f32 = [Float32.(c) for c in starting_coords]
6363
const starting_velocities_f32 = [Float32.(c) for c in starting_velocities]
6464

65-
function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
65+
function test_sim(nl::Bool, parallel::Bool, f32::Bool,
66+
ArrayType::Type{AT}) where AT <: AbstractArray
6667
n_atoms = 400
6768
n_steps = 200
6869
atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol"
@@ -72,34 +73,27 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
7273
r0 = f32 ? 0.2f0u"nm" : 0.2u"nm"
7374
bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)]
7475
specific_inter_lists = (InteractionList2Atoms(
75-
gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)),
76-
gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)),
77-
gpu ? CuArray(bonds) : bonds,
76+
ArrayType(Int32.(collect(1:2:n_atoms))),
77+
ArrayType(Int32.(collect(2:2:n_atoms))),
78+
ArrayType(bonds),
7879
),)
7980

8081
neighbor_finder = NoNeighborFinder()
8182
cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm")
8283
pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),)
8384
if nl
8485
neighbor_finder = DistanceNeighborFinder(
85-
eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms),
86+
eligible=ArrayType(trues(n_atoms, n_atoms)),
8687
n_steps=10,
8788
dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm",
8889
)
8990
pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),)
9091
end
9192

92-
if gpu
93-
coords = CuArray(deepcopy(f32 ? starting_coords_f32 : starting_coords))
94-
velocities = CuArray(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
95-
atoms = CuArray([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
96-
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
97-
else
98-
coords = deepcopy(f32 ? starting_coords_f32 : starting_coords)
99-
velocities = deepcopy(f32 ? starting_velocities_f32 : starting_velocities)
100-
atoms = [Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
101-
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]
102-
end
93+
coords = ArrayType(deepcopy(f32 ? starting_coords_f32 : starting_coords))
94+
velocities = ArrayType(deepcopy(f32 ? starting_velocities_f32 : starting_velocities))
95+
atoms = ArrayType([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm",
96+
ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms])
10397

10498
sys = System(
10599
atoms=atoms,
@@ -117,22 +111,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool)
117111
end
118112

119113
runs = [
120-
("CPU" , [false, false, false, false]),
121-
("CPU f32" , [false, false, true , false]),
122-
("CPU NL" , [true , false, false, false]),
123-
("CPU f32 NL", [true , false, true , false]),
114+
("CPU" , [false, false, false, Array]),
115+
("CPU f32" , [false, false, true , Array]),
116+
("CPU NL" , [true , false, false, Array]),
117+
("CPU f32 NL", [true , false, true , Array]),
124118
]
125119
if run_parallel_tests
126-
push!(runs, ("CPU parallel" , [false, true , false, false]))
127-
push!(runs, ("CPU parallel f32" , [false, true , true , false]))
128-
push!(runs, ("CPU parallel NL" , [true , true , false, false]))
129-
push!(runs, ("CPU parallel f32 NL", [true , true , true , false]))
120+
push!(runs, ("CPU parallel" , [false, true , false, Array]))
121+
push!(runs, ("CPU parallel f32" , [false, true , true , Array]))
122+
push!(runs, ("CPU parallel NL" , [true , true , false, Array]))
123+
push!(runs, ("CPU parallel f32 NL", [true , true , true , Array]))
130124
end
131-
if run_gpu_tests
132-
push!(runs, ("GPU" , [false, false, false, true]))
133-
push!(runs, ("GPU f32" , [false, false, true , true]))
134-
push!(runs, ("GPU NL" , [true , false, false, true]))
135-
push!(runs, ("GPU f32 NL", [true , false, true , true]))
125+
if run_cuda_tests
126+
push!(runs, ("GPU" , [false, false, false, CuArray]))
127+
push!(runs, ("GPU f32" , [false, false, true , CuArray]))
128+
push!(runs, ("GPU NL" , [true , false, false, CuArray]))
129+
push!(runs, ("GPU f32 NL", [true , false, true , CuArray]))
136130
end
137131

138132
for (name, args) in runs

benchmark/protein.jl

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data")
1111
const ff_dir = joinpath(data_dir, "force_fields")
1212
const openmm_dir = joinpath(data_dir, "openmm_6mrr")
1313

14-
function setup_system(gpu::Bool, f32::Bool, units::Bool)
14+
function setup_system(ArrayType::AbstractArray, f32::Bool, units::Bool)
1515
T = f32 ? Float32 : Float64
1616
ff = MolecularForceField(
1717
T,
@@ -27,7 +27,7 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool)
2727
sys = System(
2828
joinpath(data_dir, "6mrr_equil.pdb"),
2929
ff;
30-
velocities=gpu ? CuArray(velocities) : velocities,
30+
velocities=ArrayType(velocities),
3131
units=units,
3232
gpu=gpu,
3333
dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff),
@@ -42,15 +42,15 @@ end
4242

4343
runs = [
4444
# run_name gpu parr f32 units
45-
("CPU 1 thread" , false, false, false, true ),
46-
("CPU 1 thread f32" , false, false, true , true ),
47-
("CPU 1 thread f32 nounits" , false, false, true , false),
48-
("CPU $n_threads threads" , false, true , false, true ),
49-
("CPU $n_threads threads f32" , false, true , true , true ),
50-
("CPU $n_threads threads f32 nounits", false, true , true , false),
51-
("GPU" , true , false, false, true ),
52-
("GPU f32" , true , false, true , true ),
53-
("GPU f32 nounits" , true , false, true , false),
45+
("CPU 1 thread" , Array, false, false, true ),
46+
("CPU 1 thread f32" , Array, false, true , true ),
47+
("CPU 1 thread f32 nounits" , Array, false, true , false),
48+
("CPU $n_threads threads" , Array, true , false, true ),
49+
("CPU $n_threads threads f32" , Array, true , true , true ),
50+
("CPU $n_threads threads f32 nounits", Array, true , true , false),
51+
("GPU" , CuArray, false, false, true ),
52+
("GPU f32" , CuArray, false, true , true ),
53+
("GPU f32 nounits" , CuArray, false, true , false),
5454
]
5555

5656
for (run_name, gpu, parallel, f32, units) in runs

src/cuda.jl renamed to ext/MollyCUDAExt.jl

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
module MollyCUDAExt
2+
3+
using Molly
4+
using CUDA
5+
6+
# CUDA specific calls for Molly
7+
@non_differentiable CUDA.zeros(args...)
8+
CUDA.Const(nl::NoNeighborList) = nl
9+
110
# CUDA.jl kernels
211
const WARPSIZE = UInt32(32)
312

@@ -29,7 +38,7 @@ function cuda_threads_blocks_specific(n_inters)
2938
return n_threads_gpu, n_blocks
3039
end
3140

32-
function pairwise_force_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary,
41+
function pairwise_force_gpu(coords::CuArray{SVector{D, C}}, atoms, boundary,
3342
pairwise_inters, nbs, force_units, ::Val{T}) where {D, C, T}
3443
fs_mat = CUDA.zeros(T, D, length(atoms))
3544

@@ -112,7 +121,7 @@ That's why the calculations are done in the following order:
112121
h | 1 2 3 4 5 6
113122
```
114123
=#
115-
function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, atoms_var, boundary, inters,
124+
function pairwise_force_kernel_nonl!(forces::CuArray{T}, coords_var, atoms_var, boundary, inters,
116125
::Val{D}, ::Val{F}) where {T, D, F}
117126
coords = CUDA.Const(coords_var)
118127
atoms = CUDA.Const(atoms_var)
@@ -193,7 +202,7 @@ end
193202
return f
194203
end
195204

196-
function specific_force_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}},
205+
function specific_force_gpu(inter_list::InteractionList1Atoms, coords::CuArray{SVector{D, C}},
197206
boundary, force_units, ::Val{T}) where {D, C, T}
198207
fs_mat = CUDA.zeros(T, D, length(coords))
199208
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -202,7 +211,7 @@ function specific_force_gpu(inter_list::InteractionList1Atoms, coords::AbstractA
202211
return fs_mat
203212
end
204213

205-
function specific_force_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}},
214+
function specific_force_gpu(inter_list::InteractionList2Atoms, coords::CuArray{SVector{D, C}},
206215
boundary, force_units, ::Val{T}) where {D, C, T}
207216
fs_mat = CUDA.zeros(T, D, length(coords))
208217
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -212,7 +221,7 @@ function specific_force_gpu(inter_list::InteractionList2Atoms, coords::AbstractA
212221
return fs_mat
213222
end
214223

215-
function specific_force_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}},
224+
function specific_force_gpu(inter_list::InteractionList3Atoms, coords::CuArray{SVector{D, C}},
216225
boundary, force_units, ::Val{T}) where {D, C, T}
217226
fs_mat = CUDA.zeros(T, D, length(coords))
218227
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -222,7 +231,7 @@ function specific_force_gpu(inter_list::InteractionList3Atoms, coords::AbstractA
222231
return fs_mat
223232
end
224233

225-
function specific_force_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}},
234+
function specific_force_gpu(inter_list::InteractionList4Atoms, coords::CuArray{SVector{D, C}},
226235
boundary, force_units, ::Val{T}) where {D, C, T}
227236
fs_mat = CUDA.zeros(T, D, length(coords))
228237
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -328,7 +337,7 @@ function specific_force_4_atoms_kernel!(forces, coords_var, boundary, is_var, js
328337
return nothing
329338
end
330339

331-
function pairwise_pe_gpu(coords::AbstractArray{SVector{D, C}}, atoms, boundary,
340+
function pairwise_pe_gpu(coords::CuArray{SVector{D, C}}, atoms, boundary,
332341
pairwise_inters, nbs, energy_units, ::Val{T}) where {D, C, T}
333342
pe_vec = CUDA.zeros(T, 1)
334343
n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs))
@@ -363,7 +372,7 @@ function pairwise_pe_kernel!(energy, coords_var, atoms_var, boundary, inters, ne
363372
return nothing
364373
end
365374

366-
function specific_pe_gpu(inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}},
375+
function specific_pe_gpu(inter_list::InteractionList1Atoms, coords::CuArray{SVector{D, C}},
367376
boundary, energy_units, ::Val{T}) where {D, C, T}
368377
pe_vec = CUDA.zeros(T, 1)
369378
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -372,7 +381,7 @@ function specific_pe_gpu(inter_list::InteractionList1Atoms, coords::AbstractArra
372381
return pe_vec
373382
end
374383

375-
function specific_pe_gpu(inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}},
384+
function specific_pe_gpu(inter_list::InteractionList2Atoms, coords::CuArray{SVector{D, C}},
376385
boundary, energy_units, ::Val{T}) where {D, C, T}
377386
pe_vec = CUDA.zeros(T, 1)
378387
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -381,7 +390,7 @@ function specific_pe_gpu(inter_list::InteractionList2Atoms, coords::AbstractArra
381390
return pe_vec
382391
end
383392

384-
function specific_pe_gpu(inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}},
393+
function specific_pe_gpu(inter_list::InteractionList3Atoms, coords::CuArray{SVector{D, C}},
385394
boundary, energy_units, ::Val{T}) where {D, C, T}
386395
pe_vec = CUDA.zeros(T, 1)
387396
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -391,7 +400,7 @@ function specific_pe_gpu(inter_list::InteractionList3Atoms, coords::AbstractArra
391400
return pe_vec
392401
end
393402

394-
function specific_pe_gpu(inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}},
403+
function specific_pe_gpu(inter_list::InteractionList4Atoms, coords::CuArray{SVector{D, C}},
395404
boundary, energy_units, ::Val{T}) where {D, C, T}
396405
pe_vec = CUDA.zeros(T, 1)
397406
n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list))
@@ -482,3 +491,5 @@ function specific_pe_4_atoms_kernel!(energy, coords_var, boundary, is_var, js_va
482491
end
483492
return nothing
484493
end
494+
495+
end

ext/MollyPythonCallExt.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ module MollyPythonCallExt
66
using Molly
77
using PythonCall
88
import AtomsCalculators
9-
using CUDA
9+
using GPUArrays
1010
using StaticArrays
1111
using Unitful
1212

@@ -91,7 +91,7 @@ end
9191

9292
uconvert_vec(x...) = uconvert.(x...)
9393

94-
function AtomsCalculators.forces(sys::System{D, G, T},
94+
function AtomsCalculators.forces(sys::System{D, AT, T},
9595
ase_calc::ASECalculator;
9696
kwargs...) where {D, G, T}
9797
update_ase_calc!(ase_calc, sys)
@@ -105,7 +105,7 @@ function AtomsCalculators.forces(sys::System{D, G, T},
105105
else
106106
fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å")
107107
end
108-
return G ? CuArray(fs_unit) : fs_unit
108+
return AT <: AbstractGPUArray ? AT(fs_unit) : fs_unit
109109
end
110110

111111
function AtomsCalculators.potential_energy(sys::System{D, G, T},

src/Molly.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ using ChainRules
1313
using ChainRulesCore
1414
import Chemfiles
1515
using Combinatorics
16-
using CUDA
16+
using KernelAbstractions
17+
using GPUArrays
1718
using DataStructures
1819
using Distances
1920
using Distributions
@@ -41,7 +42,7 @@ include("types.jl")
4142
include("units.jl")
4243
include("spatial.jl")
4344
include("cutoffs.jl")
44-
include("cuda.jl")
45+
include("kernels.jl")
4546
include("force.jl")
4647
include("interactions/lennard_jones.jl")
4748
include("interactions/soft_sphere.jl")

src/analysis.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ Calculate the hydrodynamic radius of a set of coordinates.
162162
function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) where {D, T}
163163
n_atoms = length(coords)
164164
diag_cpu = Diagonal(ones(T, n_atoms))
165-
diag = isa(coords, CuArray) ? CuArray(diag_cpu) : diag_cpu
165+
diag = isa(coords, AbstractGPUArray) ? get_array_type(coords)((diag_cpu)) : diag_cpu
166166
# Other approaches to removing the diagonal Inf didn't work with Zygote
167167
dists = distances(coords, boundary) .+ diag
168168
sum_inv_dists = sum(inv.(dists)) - sum(inv(diag))

0 commit comments

Comments
 (0)