Add support for CuStream control (#27)

ArrogantGao · web-flow · commit ee26ad856998 · 2023-12-20T23:51:14.000+08:00
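* add stream control: `matmul!` accepts an optional trailing `stream::CuStream` argument (default `stream()`) that is forwarded to the TropicalGemmC kernel call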

* reexport TropicalNumbers via Reexport.jl, so `using CuTropicalGEMM` also brings the TropicalNumbers exports into scope

* update deps: add Reexport (compat 1.2.2) and raise the TropicalGemmC_jll compat bound from 0.1.1 to 0.1.3
diff --git a/Project.toml b/Project.toml
@@ -1,18 +1,20 @@
 name = "CuTropicalGEMM"
 uuid = "c2b282c3-c9c2-431d-80f7-a1a0561ebe55"
-authors = ["Xuanzhao Gao <gaoxuanzhao@gmail.com> and contributors"]
-version = "0.1.1"
+authors = ["Xuanzhao Gao <xz.gao@connect.ust.hk> and Jin-Guo Liu"]
+version = "0.1.2"
 
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 TropicalGemmC_jll = "4f4992fb-2984-5eba-87b8-475305d0f5fc"
 TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334"
 
 [compat]
 CUDA = "5"
-TropicalGemmC_jll = "0.1.1"
+TropicalGemmC_jll = "0.1.3"
 TropicalNumbers = "0.6.2"
+Reexport = "1.2.2"
 julia = "1"
 
 [extras]
diff --git a/src/CuTropicalGEMM.jl b/src/CuTropicalGEMM.jl
@@ -1,14 +1,18 @@
 module CuTropicalGEMM
 
-using CUDA, TropicalNumbers, LinearAlgebra, TropicalGemmC_jll 
+using CUDA, LinearAlgebra
+using TropicalGemmC_jll
+using Reexport
+@reexport using TropicalNumbers
+
 export matmul!
 
 function __init__()
     if CUDA.functional() == true
         if CUDA.driver_version() < v"11.4"
             @warn "CUDA.driver_version < v11.4! CuTropicalGEMM may not be available."
-        elseif CUDA.driver_version() > v"12.2"
-            @warn "CUDA.driver_version > v12.2! CuTropicalGEMM may not be available."
+        elseif CUDA.driver_version() > v"12.3"
+            @warn "CUDA.driver_version > v12.3! CuTropicalGEMM may not be available."
         end
     elseif CUDA.functional() == false
         @warn "CUDA Driver not found! CuTropicalGEMM will not be available."
diff --git a/src/tropical_gemms.jl b/src/tropical_gemms.jl
@@ -20,22 +20,22 @@ for (TA, tA) in [(:CuVecOrMat, 'N'), (:CTranspose, 'T')]
             (:TropicalMinPlusF32, :Cfloat, :FLOAT_minplus, :lib_TropicalMinPlus_FP32), (:TropicalMinPlusF64, :Cdouble, :DOUBLE_minplus, :lib_TropicalMinPlus_FP64), 
             (:TropicalMaxMulF32, :Cfloat, :FLOAT_maxmul, :lib_TropicalMaxMul_FP32), (:TropicalMaxMulF64, :Cdouble, :DOUBLE_maxmul, :lib_TropicalMaxMul_FP64), (:TropicalMaxMulI32, :Cint, :INT_maxmul, :lib_TropicalMaxMul_INT32), (:TropicalMaxMulI64, :Clong, :LONG_maxmul, :lib_TropicalMaxMul_INT64)
             ]
-            @eval function matmul!(C::CuVecOrMat{T}, A::$TA{T}, B::$TB{T}, α::T, β::T) where {T<:$TT}
+            @eval function matmul!(C::CuVecOrMat{T}, A::$TA{T}, B::$TB{T}, α::T, β::T, stream::CuStream = stream()) where {T<:$TT}
                 M, N, K = dims_match(A, B, C)
                 if K == 0 && M * N != 0
                     return rmul!(C, β)
                 elseif M * N == 0
                     return C
                 else
-                    @ccall $lib.$funcname(M::Cint, N::Cint, K::Cint, pointer(parent(A))::CuPtr{$CT}, pointer(parent(B))::CuPtr{$CT}, pointer(C)::CuPtr{$CT}, content(α)::$CT, content(β)::$CT, $tA::Cchar, $tB::Cchar)::Cvoid
+                    @ccall $lib.$funcname(M::Cint, N::Cint, K::Cint, pointer(parent(A))::CuPtr{$CT}, pointer(parent(B))::CuPtr{$CT}, pointer(C)::CuPtr{$CT}, content(α)::$CT, content(β)::$CT, $tA::Cchar, $tB::Cchar, stream::CUDA.CUstream)::Cvoid
                 end
                 return C
             end
         end
     end
 end
 
-const CuTropicalBlasTypes = Union{TropicalAndOr, TropicalMaxPlusF32, TropicalMaxPlusF64, TropicalMaxMulF32, TropicalMaxMulF64, TropicalMaxMulI32, TropicalMaxMulI64}
+const CuTropicalBlasTypes = Union{TropicalAndOr, TropicalMaxPlusF32, TropicalMaxPlusF64, TropicalMinPlusF32, TropicalMinPlusF64, TropicalMaxMulF32, TropicalMaxMulF64, TropicalMaxMulI32, TropicalMaxMulI64}
 
 # overload the LinearAlgebra.mul!
 for TA in [:CuVecOrMat, :CTranspose]
diff --git a/test/Project.toml b/test/Project.toml
@@ -1,6 +1,4 @@
 [deps]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-TropicalGemmC_jll = "4f4992fb-2984-5eba-87b8-475305d0f5fc"
-TropicalNumbers = "b3a74e9c-7526-4576-a4eb-79c0d4c32334"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -1,7 +1,6 @@
 using CuTropicalGEMM
 using Test
 using CUDA
-using TropicalNumbers
 using LinearAlgebra
 
 @testset "CuTropicalGEMM.jl" begin
diff --git a/test/tropical_gemms.jl b/test/tropical_gemms.jl
@@ -30,7 +30,7 @@
                                 hB = Array(B)
                                 hC = Array(C)
 
-                                C = CuTropicalGEMM.matmul!(C, A, B, α, β)
+                                CUDA.@sync C = CuTropicalGEMM.matmul!(C, A, B, α, β)
                 
                                 hC .= α .* hA * hB .+ β .* hC
 
@@ -55,8 +55,8 @@ end
                             @testset "$testname" begin
                                 if !(size(A) == (1,4) && size(B) == (4,))
                                     res0 = Array(A) * Array(B)
-                                    res1 = A * B
-                                    res2 = LinearAlgebra.mul!(MT.(CUDA.zeros(T, size(res0)...)), A, B)
+                                    CUDA.@sync res1 = A * B
+                                    CUDA.@sync res2 = LinearAlgebra.mul!(MT.(CUDA.zeros(T, size(res0)...)), A, B)
                                     @test Array(res1) ≈ res0
                                     @test Array(res2) ≈ res0
                                 end
@@ -79,8 +79,8 @@ end
                 for B in [transpose(a), a, b]
                     if !(size(A) == (1,4) && size(B) == (4,))
                         res0 = Array(A) * Array(B)
-                        res1 = A * B
-                        res2 = LinearAlgebra.mul!(MT.(CUDA.zeros(T, size(res0)...)), A, B, true, false)
+                        CUDA.@sync res1 = A * B
+                        CUDA.@sync res2 = LinearAlgebra.mul!(MT.(CUDA.zeros(T, size(res0)...)), A, B, true, false)
                         @test Array(res1) ≈ res0
                         @test Array(res2) ≈ res0
                     end