LuxDL · avik-pal · Feb 2, 2026 · Jan 31, 2026 · Feb 2, 2026
diff --git a/.buildkite/testing_luxlib.yml b/.buildkite/testing_luxlib.yml
@@ -7,7 +7,7 @@ steps:
               version: "{{matrix.julia}}"
           - JuliaCI/julia-test#v1:
               project: "lib/LuxLib"
-              test_args: "BACKEND_GROUP=CUDA LUXLIB_TEST_GROUP={{matrix.group}}"
+              test_args: "--BACKEND_GROUP=CUDA {{matrix.group}}"
           - JuliaCI/julia-coverage#v1:
               codecov: true
               dirs:
@@ -28,9 +28,9 @@ steps:
             julia:
               - "1.12"
             group:
-              - "common"
+              - "common_ops"
               - "normalization"
-              - "misc"
+              - "others"
 
   # - group: ":julia: (LuxLib) AMD GPU"
   #   steps:
@@ -40,7 +40,7 @@ steps:
   #             version: "{{matrix.julia}}"
   #         - JuliaCI/julia-test#v1:
   #             project: "lib/LuxLib"
-  #             test_args: "BACKEND_GROUP=AMDGPU"
+  #             test_args: "--BACKEND_GROUP=AMDGPU"
   #         - JuliaCI/julia-coverage#v1:
   #             codecov: true
   #             dirs:

diff --git a/.github/workflows/CI_LuxLib.yml b/.github/workflows/CI_LuxLib.yml
@@ -27,39 +27,39 @@ jobs:
       fail-fast: false
       matrix:
         test_group:
-          - "common"
+          - "common_ops"
           - "normalization"
-          - "misc"
+          - "others"
         blas_backend:
           - "default"
         loopvec:
           - "true"
         include:
-          - test_group: "common"
+          - test_group: "common_ops"
             blas_backend: "default"
             loopvec: "false"
-          - test_group: "misc"
+          - test_group: "others"
             blas_backend: "default"
             loopvec: "false"
     uses: ./.github/workflows/CommonCI.yml
     with:
       julia_version: "1.12"
       project: "lib/LuxLib"
-      test_args: "BACKEND_GROUP=cpu LUXLIB_TEST_GROUP=${{ matrix.test_group }} LUXLIB_BLAS_BACKEND=${{ matrix.blas_backend }} LUXLIB_LOAD_LOOPVEC=${{ matrix.loopvec }}"
+      test_args: "--BACKEND_GROUP=cpu --BLAS_BACKEND=${{ matrix.blas_backend }} --LOOP_VECTORIZATION=${{ matrix.loopvec }} ${{ matrix.test_group }}"
 
   downgrade:
     strategy:
       fail-fast: false
       matrix:
         test_group:
-          - "common"
+          - "common_ops"
           - "normalization"
-          - "misc"
+          - "others"
     uses: ./.github/workflows/CommonCI.yml
     with:
       julia_version: "1.11"
       project: "lib/LuxLib"
       downgrade_testing: true
       local_dependencies: "lib/LuxCore,lib/MLDataDevices"
       local_test_dependencies: "lib/LuxTestUtils,lib/MLDataDevices"
-      test_args: "BACKEND_GROUP=cpu LUXLIB_TEST_GROUP=${{ matrix.test_group }} LUXLIB_BLAS_BACKEND=default LUXLIB_LOAD_LOOPVEC=true"
+      test_args: "--BACKEND_GROUP=cpu --BLAS_BACKEND=default --LOOP_VECTORIZATION=true ${{ matrix.test_group }}"
diff --git a/lib/LuxLib/Project.toml b/lib/LuxLib/Project.toml
@@ -1,6 +1,6 @@
 name = "LuxLib"
 uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
-version = "1.15.3"
+version = "1.15.4"
 authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
 
 [deps]
@@ -73,13 +73,13 @@ BLISBLAS = "0.1, 0.2"
 CPUSummary = "0.2.6"
 CUDA = "5.8"
 ChainRulesCore = "1.25.1"
-DispatchDoctor = "0.4.12"
+DispatchDoctor = "0.4.28"
 Enzyme = "0.13.120"
 EnzymeCore = "0.8.16"
 FastClosures = "0.3.2"
 ForwardDiff = "0.10.36, 1"
 Functors = "0.5"
-KernelAbstractions = "0.9.30"
+KernelAbstractions = "0.9.39"
 LinearAlgebra = "1.10"
 LoopVectorization = "0.12.171"
 LuxCore = "1.5"
@@ -97,7 +97,7 @@ Reexport = "1"
 ReverseDiff = "1.15"
 SLEEFPirates = "0.6.43"
 SciMLPublic = "1.0.0"
-Static = "0.8.4, 1"
+Static = "1.1.1"
 StaticArraysCore = "1.4.3"
 Statistics = "1.10"
 Tracker = "0.2.36"

diff --git a/lib/LuxLib/test/Project.toml b/lib/LuxLib/test/Project.toml
@@ -1,30 +1,31 @@
 [deps]
+AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 BLISBLAS = "6f275bd8-fec0-4d39-945b-7e95a765fa1e"
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
 ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
-InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb"
 LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 LuxLib = "82251201-b29d-42c6-8e01-566dec8acb11"
 LuxTestUtils = "ac9de150-d08f-4546-94fb-7472b5760531"
 MKL = "33e6dc65-8f57-5167-99aa-e5a354878fb2"
 MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40"
+Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
+ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
-Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3"
@@ -33,41 +34,40 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
+oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [sources]
-LuxLib = { path = ".." }
-LuxTestUtils = { path = "../../LuxTestUtils" }
-MLDataDevices = { path = "../../MLDataDevices" }
+LuxLib = {path = ".."}  # parent of this test/ directory, i.e. lib/LuxLib
+LuxTestUtils = {path = "../../LuxTestUtils"}  # sibling directory lib/LuxTestUtils
+MLDataDevices = {path = "../../MLDataDevices"}  # sibling directory lib/MLDataDevices
 
 [compat]
 AppleAccelerate = "0.4, 0.5"
 Aqua = "0.8.7"
 BLISBLAS = "0.1, 0.2"
 BenchmarkTools = "1.5"
-CPUSummary = "0.2.6"
 ChainRulesCore = "1.25.1"
 ComponentArrays = "0.15.22"
 Enzyme = "0.13.120"
 EnzymeCore = "0.8.16"
 ExplicitImports = "1.9.0"
 ForwardDiff = "0.10.36, =1"
-InteractiveUtils = "<0.0.1, 1"
 JLArrays = "0.1.5, 0.2, 0.3"
 LoopVectorization = "0.12.171"
 LuxTestUtils = "2"
 MKL = "0.7, 0.8, 0.9"
 MLDataDevices = "1.17"
-Mooncake = "0.4"
+Mooncake = "0.5"
 NNlib = "0.9.27"
 Octavian = "0.3.28"
+ParallelTestRunner = "2.1"
 Pkg = "1.10"
 Random = "1.10"
-ReTestItems = "1.24.0"
 Reactant = "0.2.179"
-Reexport = "1"
 ReverseDiff = "1.15"
 StableRNGs = "1.0.2"
-Static = "0.8.4, 1"
+Static = "1.1.1"
 StaticArrays = "1.9.7"
 Statistics = "1.10"
 Test = "1.10"

diff --git a/lib/LuxLib/test/common_ops/activation_tests.jl b/lib/LuxLib/test/common_ops/activation_tests.jl
@@ -1,11 +1,13 @@
-@testitem "Activation Functions" tags = [:misc] setup = [SharedTestSetup] begin
-    using Enzyme
+using Enzyme, LuxLib, Test, NNlib, Zygote
 
-    rng = StableRNG(1234)
+include("../shared_testsetup.jl")
+
+apply_act(f::F, x) where {F} = sum(abs2, f.(x))  # sum of squares of f broadcast over x
+apply_act_fast(f::F, x) where {F} = sum(abs2, fast_activation!!(f, copy(x)))  # same reduction through fast_activation!!, given a copy of x
+apply_act_fast2(f::F, x) where {F} = sum(abs2, fast_activation(f, x))  # same reduction through fast_activation
 
-    apply_act(f::F, x) where {F} = sum(abs2, f.(x))
-    apply_act_fast(f::F, x) where {F} = sum(abs2, fast_activation!!(f, copy(x)))
-    apply_act_fast2(f::F, x) where {F} = sum(abs2, fast_activation(f, x))
+@testset "Activation Functions" begin
+    rng = StableRNG(1234)
 
     @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$f: $T" for f in [

diff --git a/lib/LuxLib/test/common_ops/attention_tests.jl b/lib/LuxLib/test/common_ops/attention_tests.jl
@@ -1,6 +1,8 @@
-@testitem "Scaled Dot Product Attention" tags = [:misc] setup = [SharedTestSetup] begin
-    using LuxLib, Reactant, NNlib, Random, MLDataDevices, Enzyme, Statistics
+include("../shared_testsetup.jl")
 
+using LuxLib, Reactant, NNlib, Random, MLDataDevices, Enzyme, Statistics, Test, Zygote
+
+@testset "Scaled Dot Product Attention" begin
     @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "Different Batch Sizes" begin
             n, lenq, lenkv = 15, 3, 4

diff --git a/lib/LuxLib/test/common_ops/bias_act_tests.jl b/lib/LuxLib/test/common_ops/bias_act_tests.jl
@@ -1,15 +1,20 @@
-@testitem "Bias Activation" tags = [:misc] setup = [SharedTestSetup] begin
-    rng = StableRNG(1234)
+include("../shared_testsetup.jl")
 
-    bias_act_loss1(act, x, b) = sum(abs2, act.(x .+ LuxLib.Impl.reshape_bias(x, b)))
-    bias_act_loss2(act, x, b) = sum(abs2, bias_activation(act, x, b))
-    bias_act_loss3(act, x, b) = sum(abs2, bias_activation!!(act, copy(x), b))
+using LuxLib, Test, StableRNGs, NNlib, LuxTestUtils, Zygote
+using ReverseDiff, Tracker
 
-    struct __Fix1{F,A}
-        f::F
-        act::A
-    end
-    (f::__Fix1)(x, b) = f.f(f.act, x, b)
+bias_act_loss1(act, x, b) = sum(abs2, act.(x .+ LuxLib.Impl.reshape_bias(x, b)))  # broadcast form: act applied to x plus the reshaped bias
+bias_act_loss2(act, x, b) = sum(abs2, bias_activation(act, x, b))  # same reduction through bias_activation
+bias_act_loss3(act, x, b) = sum(abs2, bias_activation!!(act, copy(x), b))  # same reduction through bias_activation!!, given a copy of x
+
+struct __Fix1{F,A}  # callable wrapper that stores `act` together with a function `f`
+    f::F
+    act::A
+end
+(f::__Fix1)(x, b) = f.f(f.act, x, b)  # a call with (x, b) forwards to the stored f, passing act first
+
+@testset "Bias Activation" begin
+    rng = StableRNG(1234)
 
     @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$act, $T, $sz" for act in [
@@ -93,9 +98,7 @@
     end
 end
 
-@testitem "Bias Activation (ReverseDiff)" tags = [:misc] setup = [SharedTestSetup] begin
-    using ReverseDiff, Tracker
-
+@testset "Bias Activation (ReverseDiff)" begin
     x = rand(Float32, 3, 4)
     b = rand(Float32, 3)
     act = tanh
@@ -113,7 +116,7 @@ end
     @test z isa Tracker.TrackedArray
 end
 
-@testitem "Bias Activation: Zero-sized Arrays" tags = [:misc] setup = [SharedTestSetup] begin
+@testset "Bias Activation: Zero-sized Arrays" begin
     @testset "$mode" for (mode, aType, ongpu) in MODES
         x = aType(rand(Float32, 4, 3, 2, 0))
         b = aType(rand(Float32, 2))

diff --git a/lib/LuxLib/test/common_ops/conv_tests.jl b/lib/LuxLib/test/common_ops/conv_tests.jl
@@ -1,4 +1,5 @@
-@testsetup module ConvSetup
+include("../shared_testsetup.jl")
+
 using LuxLib, LuxTestUtils, Random, Test, NNlib
 
 expand(_, i::Tuple) = i
@@ -76,11 +77,7 @@ end
 
 anonact = x -> gelu(x)
 
-# const ELTYPES = [(Float32, Float32), (Float32, Float64), (Float64, Float64)]
 const ELTYPES = [(Float32, Float32), (Float64, Float64)]
-# const ACTIVATIONS = [
-#     identity, tanh, tanh_fast, sigmoid, sigmoid_fast, relu, gelu, swish, anonact
-# ]
 const ACTIVATIONS = [identity, sigmoid, gelu]
 
 const ALL_TEST_CONFIGS = Iterators.product(
@@ -95,43 +92,11 @@ const ALL_TEST_CONFIGS = Iterators.product(
     ),
 )
 
-const TEST_BLOCKS = collect(
-    Iterators.partition(ALL_TEST_CONFIGS, ceil(Int, length(ALL_TEST_CONFIGS) / 2))
-)
-
-export expand, convfilter, calc_padding, anonact, TEST_BLOCKS, run_conv_testing
-
-end
-
-@testitem "Fused Conv: Group 1" tags = [:common] setup = [SharedTestSetup, ConvSetup] begin
-    @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
-        @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for (
-            (Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)
-        ) in TEST_BLOCKS[1]
-            !fp64 && (Tx == Float64 || Tw == Float64) && continue
-            run_conv_testing(
-                generate_fixed_array,
-                activation,
-                kernel,
-                stride,
-                padding,
-                hasbias,
-                groups,
-                Tw,
-                Tx,
-                aType,
-                mode,
-                ongpu,
-            )
-        end
-    end
-end
-
-@testitem "Fused Conv: Group 2" tags = [:common] setup = [SharedTestSetup, ConvSetup] begin
+@testset "Fused Conv" begin
     @testset "$mode" for (mode, aType, ongpu, fp64) in MODES
         @testset "$(Tw) x $(Tx) hasbias: $(hasbias) activation: $(activation) kernel: $(kernel) padding: $(padding) stride: $(stride) groups: $(groups)" for (
             (Tx, Tw), hasbias, activation, (kernel, padding, stride, groups)
-        ) in TEST_BLOCKS[2]
+        ) in ALL_TEST_CONFIGS
             !fp64 && (Tx == Float64 || Tw == Float64) && continue
             run_conv_testing(
                 generate_fixed_array,