JuliaGPU · kshyatt · Jan 12, 2026 · Jan 9, 2026
diff --git a/src/array.jl b/src/array.jl
@@ -946,3 +946,11 @@ function Base.resize!(A::CuVector{T}, n::Integer) where T
   A.dims = (n,)
   return A
 end
+
+
+# Route transpose! for BLAS element types through CUBLAS.geam!, which is much faster than the generic GPUArrays implementation:
+function LinearAlgebra.transpose!(dest::CuMatrix{T}, src::CuMatrix{T}) where {T <: Union{Float32, Float64, ComplexF32, ComplexF64}}
+    axes(dest) != reverse(axes(src)) && throw(DimensionMismatch("axes of the destination are incompatible with that of the source"))
+    CUDA.CUBLAS.geam!('T', 'T', one(T), src, zero(T), src, dest)  # alpha = 1, beta = 0: only the first (transposed) term contributes
+    return dest
+end
diff --git a/test/base/array.jl b/test/base/array.jl
@@ -979,6 +979,16 @@ end
   @test c === a
 end
 
+@testset "transpose!" begin
+    for T in (Float32, Float64, ComplexF32, ComplexF64)  # every eltype the CUBLAS-backed method accepts
+        src = CUDA.rand(T, 10, 20)
+        dest = similar(src, reverse(size(src)))  # destination with swapped dimensions
+        wrong = similar(src)  # same (non-square) shape as src, so not a valid destination
+        @test Array(transpose!(dest, src)) == transpose(Array(src))
+        @test_throws DimensionMismatch transpose!(wrong, src)
+    end
+end
+
 @testset "issue 2595" begin
   # mixed-type reductions resulted in a deadlock because of union splitting over shfl
   a = CUDA.zeros(Float32, 1)