Skip to content

Fix cuda #668

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/arraymancer/tensor/backend/cublas.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcuda/[cublas_v2, cublas_api],
import nimcuda/cuda12_5/[cublas_v2, cublas_api],
./cuda_global_state,
./cuda

Expand Down
32 changes: 18 additions & 14 deletions src/arraymancer/tensor/backend/cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -14,21 +14,18 @@

import ../data_structure,
./global_config,
nimcuda/[nimcuda, cuda_runtime_api, driver_types]
nimcuda/cuda12_5/[check, cuda_runtime_api, driver_types]

export nimcuda, cuda_runtime_api, driver_types
export check, cuda_runtime_api, driver_types

# Data structures to ease interfacing with Cuda and kernels

proc cudaMalloc*[T](size: Natural): ptr T {.noSideEffect, inline.}=
proc cudaMalloc*[T](size: Natural): ptr UncheckedArray[T] {.noSideEffect, inline.}=
## Internal proc.
## Wrap CudaMAlloc(var pointer, size) -> Error_code
let s = size * sizeof(T)
let s = csize_t(size * sizeof(T))
check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda*[T](p: ref[ptr T]) {.noSideEffect.}=
if not p[].isNil:
check cudaFree(p[])


# ##############################################################
Expand All @@ -38,7 +35,7 @@ proc newCudaStorage*[T: SomeFloat](length: int): CudaStorage[T] {.noSideEffect.}
result.Flen = length
new(result.Fref_tracking, deallocCuda)
result.Fdata = cast[ptr UncheckedArray[T]](cudaMalloc[T](result.Flen))
result.Fref_tracking[] = result.Fdata
result.Fref_tracking.value = result.Fdata

# #########################################################
# # Sending tensor layout to Cuda Kernel
Expand Down Expand Up @@ -70,7 +67,9 @@ type
## Using arrays instead of seq avoids having to indicate __restrict__ everywhere to indicate no-aliasing
## We also prefer stack allocated array sice the data will be used at every single loop iteration to compute elements position.
## Ultimately it avoids worrying about deallocation too
CudaLayoutArray = ref[ptr cint]
CudaLayoutArrayObj* = object
value*: ptr UncheckedArray[cint]
CudaLayoutArray* = ref CudaLayoutArrayObj


CudaTensorLayout [T: SomeFloat] = object
Expand All @@ -88,6 +87,11 @@ type
data*: ptr T # Data on Cuda device
len*: cint # Number of elements allocated in memory


proc deallocCuda*(p: CudaLayoutArray) {.noSideEffect.}=
if not p.value.isNil:
check cudaFree(p.value)

proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSideEffect.}=
## Store a CudaTensor shape, strides, etc information on the GPU
#
Expand All @@ -103,8 +107,8 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid
new result.shape, deallocCuda
new result.strides, deallocCuda

result.shape[] = cudaMalloc[cint](MAXRANK)
result.strides[] = cudaMalloc[cint](MAXRANK)
result.shape.value = cudaMalloc[cint](MAXRANK)
result.strides.value = cudaMalloc[cint](MAXRANK)

var
tmp_shape: array[MAXRANK, cint] # CudaLayoutArray
Expand All @@ -116,6 +120,6 @@ proc layoutOnDevice*[T:SomeFloat](t: CudaTensor[T]): CudaTensorLayout[T] {.noSid


# TODO: use streams and async
let size = t.rank * sizeof(cint)
check cudaMemCpy(result.shape[], addr tmp_shape[0], size, cudaMemcpyHostToDevice)
check cudaMemCpy(result.strides[], addr tmp_strides[0], size, cudaMemcpyHostToDevice)
let size = csize_t(t.rank * sizeof(cint))
check cudaMemCpy(result.shape.value, addr tmp_shape[0], size, cudaMemcpyHostToDevice)
check cudaMemCpy(result.strides.value, addr tmp_strides[0], size, cudaMemcpyHostToDevice)
3 changes: 2 additions & 1 deletion src/arraymancer/tensor/backend/cuda_global_state.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import nimcuda/[nimcuda, cuda_runtime_api, cublas_v2, cublas_api]
import nimcuda/cuda12_5/[check, cuda_runtime_api, cublas_v2, cublas_api,
driver_types]

# ###################################################
# Global Cuda and CuBLAS state
Expand Down
14 changes: 13 additions & 1 deletion src/arraymancer/tensor/data_structure.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@ import
../laser/dynamic_stack_arrays,
../laser/tensor/datatypes,
nimblas,
nimcuda/cuda12_5/[cuda_runtime_api, check],
# Standard library
std/[complex]

export nimblas.OrderType, complex
export datatypes, dynamic_stack_arrays

type
CudaTensorRefTrackerObj*[T: SomeFloat] = object
value*: ptr UncheckedArray[T]

CudaTensorRefTracker*[T] = ref CudaTensorRefTrackerObj[T]

CudaStorage*[T: SomeFloat] = object
## Opaque seq-like structure for storage on the Cuda backend.
##
Expand All @@ -31,7 +37,7 @@ type
# TODO: Forward declaring this and making this completely private prevent assignment in newCudaStorage from working
Flen*: int
Fdata*: ptr UncheckedArray[T]
Fref_tracking*: ref[ptr UncheckedArray[T]] # We keep ref tracking for the GC in a separate field to avoid double indirection.
Fref_tracking*: CudaTensorRefTracker[T] # We keep ref tracking for the GC in a separate field to avoid double indirection.

CudaTensor*[T: SomeFloat] = object
## Tensor data structure stored on Nvidia GPU (Cuda)
Expand Down Expand Up @@ -73,6 +79,12 @@ type

AnyTensor*[T] = Tensor[T] or CudaTensor[T] or ClTensor[T]


proc deallocCuda*[T](p: CudaTensorRefTracker[T]) {.noSideEffect.}=
if not p.value.isNil:
check cudaFree(p.value)


# ###############
# Field accessors
# ###############
Expand Down
5 changes: 3 additions & 2 deletions src/arraymancer/tensor/init_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@ proc cuda*[T:SomeFloat](t: Tensor[T]): CudaTensor[T] {.noinit.}=
cudaMemcpyHostToDevice,
cudaStream0) # cudaStream0 is a cudaStream_t global var

proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noSideEffect, noinit.}=
proc cpu*[T:SomeFloat](t: CudaTensor[T]): Tensor[T] {.noinit.}=
## Convert a tensor on a Cuda device to a tensor on Cpu.
# We use blocking copy in this case to make sure
# all data is available for future computation

result.shape = t.shape
result.strides = t.strides
result.offset = t.offset
result.data = newSeqUninit[T](t.storage.Flen) # We copy over all the memory allocated

allocCpuStorage result.storage, t.storage.Flen

let size = csize_t(t.storage.Flen * sizeof(T))

Expand Down
40 changes: 20 additions & 20 deletions src/arraymancer/tensor/private/p_kernels_interface_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ template cuda_assign_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -86,9 +86,9 @@ template cuda_assign_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data
)

Expand All @@ -106,9 +106,9 @@ template cuda_binary_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
a_shape, a_strides: ptr cint, a_offset: cint, a_data: ptr T,
b_shape, b_strides: ptr cint, b_offset: cint, b_data: ptr T
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
a_shape, a_strides: ptr UncheckedArray[cint], a_offset: cint, a_data: ptr T,
b_shape, b_strides: ptr UncheckedArray[cint], b_offset: cint, b_data: ptr T
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -170,11 +170,11 @@ template cuda_binary_call*[T: SomeFloat](
kernel_name(
CUDA_HOF_TPB, CUDA_HOF_BPG,
src_a.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src_a.shape[], src_a.strides[],
src_a.shape.value, src_a.strides.value,
src_a.offset, src_a.data,
src_b.shape[], src_b.strides[],
src_b.shape.value, src_b.strides.value,
src_b.offset, src_b.data
)

Expand All @@ -193,8 +193,8 @@ template cuda_rscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
beta: T
) {.importcpp: import_string, noSideEffect.}

Expand Down Expand Up @@ -252,9 +252,9 @@ template cuda_rscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data,
beta
)
Expand All @@ -274,9 +274,9 @@ template cuda_lscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
alpha: T,
src_shape, src_strides: ptr cint, src_offset: cint, src_data: ptr T,
src_shape, src_strides: ptr UncheckedArray[cint], src_offset: cint, src_data: ptr T,
) {.importcpp: import_string, noSideEffect.}


Expand Down Expand Up @@ -332,10 +332,10 @@ template cuda_lscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
src.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
alpha,
src.shape[], src.strides[],
src.shape.value, src.strides.value,
src.offset, src.data
)

Expand All @@ -352,7 +352,7 @@ template cuda_assignscal_binding(kernel_name: string, binding_name: untyped)=
proc `binding_name`[T: SomeFloat](
blocksPerGrid, threadsPerBlock: cint,
rank, len: cint,
dst_shape, dst_strides: ptr cint, dst_offset: cint, dst_data: ptr T,
dst_shape, dst_strides: ptr UncheckedArray[cint], dst_offset: cint, dst_data: ptr T,
scalar: T
) {.importcpp: import_string, noSideEffect.}

Expand Down Expand Up @@ -402,7 +402,7 @@ template cuda_assignscal_call*[T: SomeFloat](
kernel_name[T](
CUDA_HOF_TPB, CUDA_HOF_BPG,
dst.rank, dst.len, # Note: small shortcut, in this case len and size are the same
dst.shape[], dst.strides[],
dst.shape.value, dst.strides.value,
dst.offset, dst.data,
val
)
7 changes: 5 additions & 2 deletions src/arraymancer/tensor/shapeshifting_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@ proc transpose*(t: CudaTensor): CudaTensor {.noSideEffect.}=

cuda_assign_glue("cuda_asContiguous", "CopyOp", cuda_asContiguous)

proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = colMajor, force: bool = false):
CudaTensor[T] {.noSideEffect.}=
proc asContiguous*[T: SomeFloat](t: CudaTensor[T], layout: OrderType = rowMajor, force: bool = false):
CudaTensor[T] {.noSideEffect, error: "NOT WORKING RIGHT NOW TODO: FIX".}=
## Transform a tensor with general striding to a Tensor with contiguous layout.
##
## By default CudaTensor will be colMajor (contrary to a cpu tensor).
##
## By default nothing is done if the tensor is already contiguous (C Major or F major)
## The "force" parameter can force re-ordering to a specific layout
# TODO: fix. this proc always outputs rowmajor, no matter the input.
# probably has to do with all the cuda tensors being colmajor by default,
# plus probably some double-negative of two bugs making the other procs work.

if t.isContiguous and not force:
return t
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_accessors_slicer_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import ../../src/arraymancer
import std / unittest


testSuite "CUDA: Testing indexing and slice syntax":
suite "CUDA: Testing indexing and slice syntax":
const
a = @[1, 2, 3, 4, 5]
b = @[1, 2, 3, 4, 5]
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_broadcasting_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import ../../src/arraymancer
import std / [unittest, sugar, sequtils]

testSuite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
suite "CUDA: Shapeshifting - broadcasting and non linear algebra elementwise operations":
test "Tensor element-wise multiplication (Hadamard product) and division":
block:
let u = @[-4, 0, 9].toTensor().asType(float32).cuda
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_init_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import ../../src/arraymancer
import std / unittest


testSuite "Cuda init":
suite "Cuda init":
test "Clone function":
let a = [ 7, 4, 3, 1, 8, 6,
8, 1, 6, 2, 6, 6,
Expand Down
2 changes: 1 addition & 1 deletion tests/tensor/test_operators_blas_cuda.nim
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import ../../src/arraymancer
import std / [unittest, sugar]

testSuite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
suite "CUDA CuBLAS backend (Basic Linear Algebra Subprograms)":
test "GEMM - General Matrix to Matrix Multiplication":
## TODO: test with slices
let a = [[1.0,2,3],
Expand Down
Loading
Loading