Skip to content

Commit 937b290

Browse files
authored
Fix cuda (#668)
* outdated unittest template testSuite -> suite
* new nimcuda layout, switch to cuda 12.5, and ref type destructor fix
* better cudaMalloc internal proc typing
* remove deprecated use of .data= proc
* fix cuda -> cpu copy proc
* add side effect
* mark-off buggy proc and tests for it
1 parent 873ac94 commit 937b290

12 files changed

+103
-76
lines changed

Diff for: src/arraymancer/tensor/backend/cublas.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import nimcuda/[cublas_v2, cublas_api],
15+
import nimcuda/cuda12_5/[cublas_v2, cublas_api],
1616
./cuda_global_state,
1717
./cuda
1818

Diff for: src/arraymancer/tensor/backend/cuda.nim

+18-14
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,18 @@
1414

1515
import ../data_structure,
1616
./global_config,
17-
nimcuda/[nimcuda, cuda_runtime_api, driver_types]
17+
nimcuda/cuda12_5/[check, cuda_runtime_api, driver_types]
1818

19-
export nimcuda, cuda_runtime_api, driver_types
19+
export check, cuda_runtime_api, driver_types
2020

2121
# Data structures to ease interfacing with Cuda and kernels
2222

23-
proc cudaMalloc*[T](size: Natural): ptr T {.noSideEffect, inline.}=
23+
proc cudaMalloc*[T](size: Natural): ptr UncheckedArray[T] {.noSideEffect, inline.}=
2424
## Internal proc.
2525
## Wrap CudaMAlloc(var pointer, size) -> Error_code
26-
let s = size * sizeof(T)
26+
let s = csize_t(size * sizeof(T))
2727
check cudaMalloc(cast[ptr pointer](addr result), s)
2828

29-
proc deallocCuda*[T](p: ref[ptr T]) {.noSideEffect.}=
30-
if not p[].isNil:
31-
check cudaFree(p[])
3229

3330

3431
# ##############################################################
@@ -38,7 +35,7 @@ proc newCudaStorage*[T: SomeFloat](length: int): CudaStorage[T] {.noSideEffect.}
3835
result.Flen = length
3936
new(result.Fref_tracking, deallocCuda)
4037
result.Fdata = cast[ptr UncheckedArray[T]](cudaMalloc[T](result.Flen))
41-
result.Fref_tracking[] = result.Fdata
38+
result.Fref_tracking.value = result.Fdata
4239

4340
# #########################################################
4441
# # Sending tensor layout to Cuda Kernel
@@ -70,7 +67,9 @@ type
7067
## Using arrays instead of seq avoids having to indicate __restrict__ everywhere to indicate no-aliasing
7168
## We also prefer stack allocated array since the data will be used at every single loop iteration to compute elements position.
7269
## Ultimately it avoids worrying about deallocation too
73-
CudaLayoutArray = ref[ptr cint]
70+
CudaLayoutArrayObj* = object
71+
value*: ptr UncheckedArray[cint]
72+
CudaLayoutArray* = ref CudaLayoutArrayObj
7473

7574

7675
CudaTensorLayout [T: SomeFloat] = object
@@ -88,6 +87,11 @@ type
8887
data*: ptr T # Data on Cuda device
8988
len*: cint # Number of elements allocated in memory
9089

90+
91+
proc deallocCuda*(p: CudaLayoutArray) {.noSideEffect.}=
92+
if not p.value.isNil:
93+
check cudaFree(p.value)
94+
9195
proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSideEffect.}=
9296
## Store a CudaTensor shape, strides, etc information on the GPU
9397
#
@@ -103,8 +107,8 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid
103107
new result.shape, deallocCuda
104108
new result.strides, deallocCuda
105109

106-
result.shape[] = cudaMalloc[cint](MAXRANK)
107-
result.strides[] = cudaMalloc[cint](MAXRANK)
110+
result.shape.value = cudaMalloc[cint](MAXRANK)
111+
result.strides.value = cudaMalloc[cint](MAXRANK)
108112

109113
var
110114
tmp_shape: array[MAXRANK, cint] # CudaLayoutArray
@@ -116,6 +120,6 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid
116120

117121

118122
# TODO: use streams and async
119-
let size = t.rank * sizeof(cint)
120-
check cudaMemCpy(result.shape[], addr tmp_shape[0], size, cudaMemcpyHostToDevice)
121-
check cudaMemCpy(result.strides[], addr tmp_strides[0], size, cudaMemcpyHostToDevice)
123+
let size = csize_t(t.rank * sizeof(cint))
124+
check cudaMemCpy(result.shape.value, addr tmp_shape[0], size, cudaMemcpyHostToDevice)
125+
check cudaMemCpy(result.strides.value, addr tmp_strides[0], size, cudaMemcpyHostToDevice)

Diff for: src/arraymancer/tensor/backend/cuda_global_state.nim

+2-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import nimcuda/[nimcuda, cuda_runtime_api, cublas_v2, cublas_api]
15+
import nimcuda/cuda12_5/[check, cuda_runtime_api, cublas_v2, cublas_api,
16+
driver_types]
1617

1718
# ###################################################
1819
# Global Cuda and CuBLAS state

Diff for: src/arraymancer/tensor/data_structure.nim

+13-1
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,19 @@ import
1616
../laser/dynamic_stack_arrays,
1717
../laser/tensor/datatypes,
1818
nimblas,
19+
nimcuda/cuda12_5/[cuda_runtime_api, check],
1920
# Standard library
2021
std/[complex]
2122

2223
export nimblas.OrderType, complex
2324
export datatypes, dynamic_stack_arrays
2425

2526
type
27+
CudaTensorRefTrackerObj*[T: SomeFloat] = object
28+
value*: ptr UncheckedArray[T]
29+
30+
CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T]
31+
2632
CudaStorage*[T: SomeFloat] = object
2733
## Opaque seq-like structure for storage on the Cuda backend.
2834
##
@@ -31,7 +37,7 @@ type
3137
# TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working
3238
Flen*: int
3339
Fdata*: ptr UncheckedArray[T]
34-
Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.
40+
Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection.
3541

3642
CudaTensor*[T: SomeFloat] = object
3743
## Tensor data structure stored on Nvidia GPU (Cuda)
@@ -73,6 +79,12 @@ type
7379

7480
AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]
7581

82+
83+
proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}=
84+
if not p.value.isNil:
85+
check cudaFree(p.value)
86+
87+
7688
# ###############
7789
# Field accessors
7890
# ###############

Diff for: src/arraymancer/tensor/init_cuda.nim

+3-2
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,16 @@ proc cuda*[T:SomeFloat](t: Tensor[T]): CudaTensor[T] {.noinit.}=
4040
cudaMemcpyHostToDevice,
4141
cudaStream0) # cudaStream0 is a cudaStream_t global var
4242

43-
proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noSideEffect, noinit.}=
43+
proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noinit.}=
4444
## Convert a tensor on a Cuda device to a tensor on Cpu.
4545
# We use blocking copy in this case to make sure
4646
# all data is available for future computation
4747

4848
result.shape = t.shape
4949
result.strides = t.strides
5050
result.offset = t.offset
51-
result.data = newSeqUninit[T](t.storage.Flen) # We copy over all the memory allocated
51+
52+
allocCpuStorage result.storage, t.storage.Flen
5253

5354
let size = csize_t(t.storage.Flen * sizeof(T))
5455

Diff for: src/arraymancer/tensor/private/p_kernels_interface_cuda.nim

+20-20
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ template cuda_assign_binding(kernel_name: string, binding_name: untyped)=
3131
proc `binding_name`[T: SomeFloat](
3232
blocksPerGrid, threadsPerBlock: cint,
3333
rank, len: cint,
34-
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
35-
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T
34+
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
35+
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T
3636
) {.importcpp: import_string, noSideEffect.}
3737

3838

@@ -86,9 +86,9 @@ template cuda_assign_call*[T: SomeFloat](
8686
kernel_name[T](
8787
CUDA_HOF_TPB, CUDA_HOF_BPG,
8888
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
89-
dst.shape[], dst.strides[],
89+
dst.shape.value, dst.strides.value,
9090
dst.offset, dst.data,
91-
src.shape[], src.strides[],
91+
src.shape.value, src.strides.value,
9292
src.offset, src.data
9393
)
9494

@@ -106,9 +106,9 @@ template cuda_binary_binding(kernel_name: string, binding_name: untyped)=
106106
proc `binding_name`[T: SomeFloat](
107107
blocksPerGrid, threadsPerBlock: cint,
108108
rank, len: cint,
109-
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
110-
a_shape, a_strides: ptr cint, a_offset: cint, a_data: ptr T,
111-
b_shape, b_strides: ptr cint, b_offset: cint, b_data: ptr T
109+
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
110+
a_shape, a_strides: ptr UncheckedArray[cint], a_offset: cint, a_data: ptr T,
111+
b_shape, b_strides: ptr UncheckedArray[cint], b_offset: cint, b_data: ptr T
112112
) {.importcpp: import_string, noSideEffect.}
113113

114114

@@ -170,11 +170,11 @@ template cuda_binary_call*[T: SomeFloat](
170170
kernel_name(
171171
CUDA_HOF_TPB, CUDA_HOF_BPG,
172172
src_a.rank, dst.len, # Note: small shortcut, in this case len and size are the same
173-
dst.shape[], dst.strides[],
173+
dst.shape.value, dst.strides.value,
174174
dst.offset, dst.data,
175-
src_a.shape[], src_a.strides[],
175+
src_a.shape.value, src_a.strides.value,
176176
src_a.offset, src_a.data,
177-
src_b.shape[], src_b.strides[],
177+
src_b.shape.value, src_b.strides.value,
178178
src_b.offset, src_b.data
179179
)
180180

@@ -193,8 +193,8 @@ template cuda_rscal_binding(kernel_name: string, binding_name: untyped)=
193193
proc `binding_name`[T: SomeFloat](
194194
blocksPerGrid, threadsPerBlock: cint,
195195
rank, len: cint,
196-
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
197-
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
196+
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
197+
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
198198
beta: T
199199
) {.importcpp: import_string, noSideEffect.}
200200

@@ -252,9 +252,9 @@ template cuda_rscal_call*[T: SomeFloat](
252252
kernel_name[T](
253253
CUDA_HOF_TPB, CUDA_HOF_BPG,
254254
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
255-
dst.shape[], dst.strides[],
255+
dst.shape.value, dst.strides.value,
256256
dst.offset, dst.data,
257-
src.shape[], src.strides[],
257+
src.shape.value, src.strides.value,
258258
src.offset, src.data,
259259
beta
260260
)
@@ -274,9 +274,9 @@ template cuda_lscal_binding(kernel_name: string, binding_name: untyped)=
274274
proc `binding_name`[T: SomeFloat](
275275
blocksPerGrid, threadsPerBlock: cint,
276276
rank, len: cint,
277-
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
277+
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
278278
alpha: T,
279-
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
279+
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
280280
) {.importcpp: import_string, noSideEffect.}
281281

282282

@@ -332,10 +332,10 @@ template cuda_lscal_call*[T: SomeFloat](
332332
kernel_name[T](
333333
CUDA_HOF_TPB, CUDA_HOF_BPG,
334334
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
335-
dst.shape[], dst.strides[],
335+
dst.shape.value, dst.strides.value,
336336
dst.offset, dst.data,
337337
alpha,
338-
src.shape[], src.strides[],
338+
src.shape.value, src.strides.value,
339339
src.offset, src.data
340340
)
341341

@@ -352,7 +352,7 @@ template cuda_assignscal_binding(kernel_name: string, binding_name: untyped)=
352352
proc `binding_name`[T: SomeFloat](
353353
blocksPerGrid, threadsPerBlock: cint,
354354
rank, len: cint,
355-
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
355+
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
356356
scalar: T
357357
) {.importcpp: import_string, noSideEffect.}
358358

@@ -402,7 +402,7 @@ template cuda_assignscal_call*[T: SomeFloat](
402402
kernel_name[T](
403403
CUDA_HOF_TPB, CUDA_HOF_BPG,
404404
dst.rank, dst.len, # Note: small shortcut, in this case len and size are the same
405-
dst.shape[], dst.strides[],
405+
dst.shape.value, dst.strides.value,
406406
dst.offset, dst.data,
407407
val
408408
)

Diff for: src/arraymancer/tensor/shapeshifting_cuda.nim

+5-2
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,17 @@ proc transpose*(t: CudaTensor): CudaTensor {.noSideEffect.}=
3333

3434
cuda_assign_glue("cuda_asContiguous", "CopyOp", cuda_asContiguous)
3535

36-
proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = colMajor, force: bool = false):
37-
CudaTensor[T] {.noSideEffect.}=
36+
proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = rowMajor, force: bool = false):
37+
CudaTensor[T] {.noSideEffect, error: "NOT WORKING RIGHT NOW TODO: FIX".}=
3838
## Transform a tensor with general striding to a Tensor with contiguous layout.
3939
##
4040
## By default CudaTensor will be colMajor (contrary to a cpu tensor).
4141
##
4242
## By default nothing is done if the tensor is already contiguous (C Major or F major)
4343
## The "force" parameter can force re-ordering to a specific layout
44+
# TODO: fix. this proc always outputs rowmajor, no matter the input.
45+
# probably has to do with all the cuda tensors being colmajor by default,
46+
# plus probably some double-negative of two bugs making the other procs work.
4447

4548
if t.isContiguous and not force:
4649
return t

Diff for: tests/tensor/test_accessors_slicer_cuda.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import ../../src/arraymancer
1717
import std / unittest
1818

1919

20-
testSuite "CUDA: Testing indexing and slice syntax":
20+
suite "CUDA: Testing indexing and slice syntax":
2121
const
2222
a = @[1, 2, 3, 4, 5]
2323
b = @[1, 2, 3, 4, 5]

Diff for: tests/tensor/test_broadcasting_cuda.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import ../../src/arraymancer
1616
import std / [unittest, sugar, sequtils]
1717

18-
testSuite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
18+
suite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
1919
test "Tensor element-wise multiplication (Hadamard product) and division":
2020
block:
2121
let u = @[-4, 0, 9].toTensor().asType(float32).cuda

Diff for: tests/tensor/test_init_cuda.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ import ../../src/arraymancer
1616
import std / unittest
1717

1818

19-
testSuite "Cuda init":
19+
suite "Cuda init":
2020
test "Clone function":
2121
let a = [ 7, 4, 3, 1, 8, 6,
2222
8, 1, 6, 2, 6, 6,

Diff for: tests/tensor/test_operators_blas_cuda.nim

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import ../../src/arraymancer
1717
import std / [unittest, sugar]
1818

19-
testSuite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
19+
suite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
2020
test "GEMM - General Matrix to Matrix Multiplication":
2121
## TODO: test with slices
2222
let a = [[1.0,2,3],

0 commit comments

Comments (0)