
Commit fec90e9

feat: support AutoReactant (#1647)
* feat: support AutoReactant
* Apply suggestion from @avik-pal
* Update src/helpers/training.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent d1ce7ad commit fec90e9

File tree

9 files changed: +117 additions, −70 deletions


Project.toml

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 name = "Lux"
 uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
 authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
-version = "1.30.0"
+version = "1.31.0"

 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -100,7 +100,7 @@ TrackerExt = "Tracker"
 ZygoteExt = "Zygote"

 [compat]
-ADTypes = "1.15"
+ADTypes = "1.19"
 Adapt = "4.4"
 ArrayInterface = "7.17.1"
 CUDA = "5.8"

docs/src/manual/compiling_lux_models.md

Lines changed: 4 additions & 3 deletions
@@ -128,8 +128,8 @@ boilerplate. Simply follow the following steps:
    device. Note that you might want to use [`DeviceIterator`](@ref) to move the data
    loader to the device with an iterator.
 3. Construct a `TrainState` using [`Training.TrainState`](@ref).
-4. And most importantly use `AutoEnzyme` while calling [`Training.single_train_step!`](@ref)
-   or [`Training.single_train_step`](@ref).
+4. And most importantly use `AutoEnzyme`/`AutoReactant` while calling
+   [`Training.single_train_step!`](@ref) or [`Training.single_train_step`](@ref).

 ```@example compile_lux_model
 model = Chain(
@@ -152,7 +152,8 @@ function train_model(model, ps, st, dataloader)
     for iteration in 1:1000
         for (i, (xᵢ, yᵢ)) in enumerate(dataloader)
             _, loss, _, train_state = Training.single_train_step!(
-                AutoEnzyme(), MSELoss(), (xᵢ, yᵢ), train_state)
+                AutoEnzyme(), MSELoss(), (xᵢ, yᵢ), train_state
+            )
             if (iteration % 100 == 0 || iteration == 1) && i == 1
                 @printf("Iter: [%4d/%4d]\tLoss: %.8f\n", iteration, 1000, loss)
             end
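
For context on what the documented workflow now looks like end to end, here is a minimal sketch of a training loop using the new `AutoReactant` backend. The model, data shapes, optimiser, and iteration count are illustrative only; the API calls (`reactant_device`, `Training.TrainState`, `Training.single_train_step!`, `MSELoss`) come from the Lux docs and this diff.

```julia
using Lux, Reactant, Optimisers, Random

# Illustrative model and data; any Lux model trains the same way.
model = Chain(Dense(2 => 32, tanh), Dense(32 => 1))
dev = reactant_device()

ps, st = Lux.setup(Random.default_rng(), model) |> dev
x = rand(Float32, 2, 64) |> dev
y = rand(Float32, 1, 64) |> dev

function train!(train_state, x, y; iters=100)
    local loss
    for _ in 1:iters
        # AutoReactant() is the entry point added by this commit; AutoEnzyme()
        # continues to work on a ReactantDevice via the same compiled path.
        _, loss, _, train_state = Training.single_train_step!(
            AutoReactant(), MSELoss(), (x, y), train_state
        )
    end
    return train_state, loss
end

train_state = Training.TrainState(model, ps, st, Adam(0.01f0))
train_state, final_loss = train!(train_state, x, y)
```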

examples/SimpleRNN/main.jl

Lines changed: 1 addition & 1 deletion
@@ -165,7 +165,7 @@ function main(model_type)
     else
         model
     end
-    ad = dev isa ReactantDevice ? AutoEnzyme() : AutoZygote()
+    ad = dev isa ReactantDevice ? AutoReactant() : AutoZygote()

     for epoch in 1:25
         ## Train the model

ext/ReactantExt/ReactantExt.jl

Lines changed: 13 additions & 0 deletions
@@ -52,6 +52,19 @@ function with_default_precision_config(f::F, ps) where {F}
     )
 end

+function get_compile_options(backend::ReactantBackend)
+    (; compile_options, sync) = backend
+    @assert compile_options isa Union{Nothing,Reactant.CompileOptions}
+    if compile_options === nothing
+        sync === missing && return Reactant.CompileOptions()
+        return Reactant.CompileOptions(; sync)
+    end
+    if sync !== missing
+        @set! compile_options.sync = sync
+    end
+    return compile_options
+end
+
 include("patches.jl")
 include("training.jl")
 include("layers.jl")

ext/ReactantExt/training.jl

Lines changed: 6 additions & 9 deletions
@@ -108,7 +108,7 @@ function Lux.Training.compute_gradients_impl(
     else
         compiled_gradient_function = annotate_compile("Compute Gradients") do
             with_default_precision_config(ts.parameters) do
-                @compile sync = backend.sync compute_gradients_internal(
+                @compile compile_options = get_compile_options(backend) compute_gradients_internal(
                     objective_function, ts.model, data, ts.parameters, ts.states
                 )
             end
@@ -150,18 +150,15 @@ for inplace in ("!", "")
         else
             update_function = annotate_compile("Apply Gradients") do
                 with_default_precision_config(ts.parameters) do
-                    @compile sync = ts.cache.backend.sync Optimisers.$(update_fn)(
+                    @compile compile_options = get_compile_options(ts.cache.backend) Optimisers.$(
+                        update_fn
+                    )(
                         ts.optimizer_state, ts.parameters, grads
                     )
                 end
             end

-            if ts.cache isa TrainingBackendCache
-                @set! ts.cache.extras = merge(ts.cache.extras, (; update_function))
-            else
-                cache = TrainingBackendCache(backend, False(), nothing, (; update_function))
-                @set! ts.cache = cache
-            end
+            @set! ts.cache.extras = merge(ts.cache.extras, (; update_function))
         end

         opt_state, ps = annotate_execution("Apply Gradients", ts.step) do
@@ -206,7 +203,7 @@ for inplace in ("!", "")

         compiled_grad_and_step_function = annotate_compile("Train Step") do
             with_default_precision_config(ts.parameters) do
-                @compile sync = backend.sync compute_gradients_internal_and_step!(
+                @compile compile_options = get_compile_options(backend) compute_gradients_internal_and_step!(
                     objective_function,
                     ts.model,
                     data,

src/Lux.jl

Lines changed: 8 additions & 1 deletion
@@ -5,6 +5,7 @@ using ADTypes:
     AutoEnzyme,
     AutoForwardDiff,
     AutoMooncake,
+    AutoReactant,
     AutoReverseDiff,
     AutoTracker,
     AutoZygote
@@ -158,7 +159,13 @@ export Training
 export jacobian_vector_product, vector_jacobian_product
 export batched_jacobian
 export AutoEnzyme,
-    AutoForwardDiff, AutoMooncake, AutoReverseDiff, AutoTracker, AutoZygote, AutoForwardDiff
+    AutoForwardDiff,
+    AutoMooncake,
+    AutoReactant,
+    AutoReverseDiff,
+    AutoTracker,
+    AutoZygote,
+    AutoForwardDiff

 export BinaryCrossEntropyLoss,
     BinaryFocalLoss,

src/helpers/training.jl

Lines changed: 54 additions & 30 deletions
@@ -2,7 +2,13 @@ module Training

 using Adapt: Adapt
 using ADTypes:
-    AbstractADType, AutoEnzyme, AutoReverseDiff, AutoTracker, AutoZygote, AutoMooncake
+    AbstractADType,
+    AutoEnzyme,
+    AutoReverseDiff,
+    AutoTracker,
+    AutoZygote,
+    AutoMooncake,
+    AutoReactant
 using SciMLPublic: @public
 using ConcreteStructs: @concrete
 using FastClosures: @closure
@@ -161,7 +167,8 @@ end

 @concrete struct ReactantBackend
     return_gradients <: StaticBool
-    sync::Bool
+    sync <: Union{Bool,Missing}
+    compile_options
     ad <: AutoEnzyme
 end

@@ -247,10 +254,15 @@ const SYNC_DOCSTRING = """
    Reactant Backend.
 """

+const COMPILE_OPTIONS_DOCSTRING = """
+  - `compile_options`: Compile options for the reactant function. See
+    `Reactant.CompileOptions` for more details. This is only used for Reactant Backend.
+"""
+
 """
     compute_gradients(
         ad::AbstractADType, objective_function::Function, data, ts::TrainState;
-        sync::Bool=false
+        sync::Bool=false, compile_options::Union{Missing,Reactant.CompileOptions}=missing
     )

 Compute the gradients of the objective function wrt parameters stored in `ts`.
@@ -279,6 +291,7 @@ Compute the gradients of the objective function wrt parameters stored in `ts`.
 ## Keyword Arguments

 $(SYNC_DOCSTRING)
+$(COMPILE_OPTIONS_DOCSTRING)

 ## Return

@@ -304,10 +317,10 @@ A 4-Tuple containing:
    returned in step `i + 1` might be aliased by the old gradients. If you want to prevent
    this, simply use `copy(grads)` or `deepcopy(grads)` to make a copy of the gradients.
 """
-function compute_gradients(ad, obj_fn::F, data, ts::TrainState; sync::Bool=false) where {F}
+function compute_gradients(ad, obj_fn::F, data, ts::TrainState; kwargs...) where {F}
     dev_type = get_device_type((ts.parameters, ts.states))
     return compute_gradients_impl_with_allocator_cache(
-        maybe_wrap_adtype(ad, dev_type; sync), ts.allocator_cache, obj_fn, data, ts
+        maybe_wrap_adtype(ad, dev_type; kwargs...), ts.allocator_cache, obj_fn, data, ts
     )
 end

@@ -346,14 +359,33 @@ end
 maybe_wrap_adtype(backend::ReactantBackend, ::Any; kwargs...) = backend
 maybe_wrap_adtype(ad::AbstractADType, ::Any; kwargs...) = ad
 function maybe_wrap_adtype(
-    ad::AbstractADType,
+    ad::AutoEnzyme,
+    ::Type{ReactantDevice};
+    return_gradients::Utils.BoolType=True(),
+    sync::Union{Missing,Bool}=missing,
+    compile_options=nothing,
+)
+    return ReactantBackend(static(return_gradients), sync, compile_options, ad)
+end
+function maybe_wrap_adtype(
+    ad::AutoReactant,
     ::Type{ReactantDevice};
     return_gradients::Utils.BoolType=True(),
-    sync::Bool=false,
+    sync::Union{Missing,Bool}=missing,
+    compile_options=nothing,
 )
-    ad isa AutoEnzyme && return ReactantBackend(static(return_gradients), sync, ad)
-    throw(ArgumentError("Computing gradients for models on XLA is supported only with \
-                         Enzyme.jl (`AutoEnzyme`)."))
+    return ReactantBackend(static(return_gradients), sync, compile_options, ad.mode)
+end
+function maybe_wrap_adtype(ad::AutoReactant, ::Type{T}; kwargs...) where {T}
+    throw(ArgumentError("`AutoReactant` only supports ReactantDevice but got `$(T)`"))
+end
+function maybe_wrap_adtype(ad::AbstractADType, ::Type{ReactantDevice}; kwargs...)
+    throw(
+        ArgumentError(
+            "Computing gradients for models with Reactant is supported only with \
+             Enzyme.jl (`AutoEnzyme` or `AutoReactant`)."
+        ),
+    )
 end

 function generate_wrappers(::F, m, ps, st, data, ::False, ::StaticBool) where {F}
@@ -408,7 +440,9 @@ const RETURN_GRADIENTS_DOCSTRING = """

 """
     single_train_step!(
-        backend, obj_fn::F, data, ts::TrainState; return_gradients=True(), sync::Bool=false
+        backend, obj_fn::F, data, ts::TrainState;
+        return_gradients=True(), sync::Bool=false,
+        compile_options::Union{Nothing,Reactant.CompileOptions}=missing,
     )

 Perform a single training step. Computes the gradients using [`compute_gradients`](@ref) and
@@ -419,6 +453,7 @@ updates the parameters using [`apply_gradients!`](@ref). All backends supported

 $(RETURN_GRADIENTS_DOCSTRING)
 $(SYNC_DOCSTRING)
+$(COMPILE_OPTIONS_DOCSTRING)

 ## Return

@@ -427,16 +462,9 @@ only the parameters in `ts` are updated inplace. Users should be using the returned
 object for further training steps, else there is no caching and performance will be
 suboptimal (and absolutely terrible for backends like `AutoReactant`).
 """
-function single_train_step!(
-    backend,
-    obj_fn::F,
-    data,
-    ts::TrainState;
-    return_gradients::Utils.BoolType=True(),
-    sync::Bool=false,
-) where {F}
+function single_train_step!(backend, obj_fn::F, data, ts::TrainState; kwargs...) where {F}
     backend = maybe_wrap_adtype(
-        backend, get_device_type((ts.parameters, ts.states)); return_gradients, sync
+        backend, get_device_type((ts.parameters, ts.states)); kwargs...
     )
     return single_train_step_impl_with_allocator_cache!(
         backend, ts.allocator_cache, obj_fn, data, ts
@@ -445,7 +473,9 @@ end

 """
     single_train_step(
-        backend, obj_fn::F, data, ts::TrainState; return_gradients=True(), sync::Bool=false
+        backend, obj_fn::F, data, ts::TrainState;
+        return_gradients=True(), sync::Bool=false,
+        compile_options::Union{Nothing,Reactant.CompileOptions}=missing,
     )

 Perform a single training step. Computes the gradients using [`compute_gradients`](@ref) and
@@ -458,21 +488,15 @@ In most cases you should use [`single_train_step!`](@ref) instead of this function.

 $(RETURN_GRADIENTS_DOCSTRING)
 $(SYNC_DOCSTRING)
+$(COMPILE_OPTIONS_DOCSTRING)

 ## Return

 Returned values are the same as [`single_train_step!`](@ref).
 """
-function single_train_step(
-    backend,
-    obj_fn::F,
-    data,
-    ts::TrainState;
-    return_gradients::Utils.BoolType=True(),
-    sync::Bool=false,
-) where {F}
+function single_train_step(backend, obj_fn::F, data, ts::TrainState; kwargs...) where {F}
     backend = maybe_wrap_adtype(
-        backend, get_device_type((ts.parameters, ts.states)); return_gradients, sync
+        backend, get_device_type((ts.parameters, ts.states)); kwargs...
     )
     return single_train_step_impl(backend, obj_fn, data, ts)
 end
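
The new `maybe_wrap_adtype` methods also tighten the error behaviour: `AutoReactant` is rejected on non-Reactant devices, and non-Enzyme/non-Reactant AD types are rejected on a `ReactantDevice`. A small sketch of the first case (CPU model and data are illustrative; the quoted message is the one added in this diff):

```julia
using Lux, Optimisers, Random

model = Chain(Dense(2 => 8, tanh), Dense(8 => 1))
ps, st = Lux.setup(Random.default_rng(), model)          # plain CPU arrays
ts_cpu = Training.TrainState(model, ps, st, Adam(0.01f0))
x, y = rand(Float32, 2, 16), rand(Float32, 1, 16)

threw = try
    Training.single_train_step!(AutoReactant(), MSELoss(), (x, y), ts_cpu)
    false
catch err
    # Expected: ArgumentError("`AutoReactant` only supports ReactantDevice but got `CPUDevice`")
    err isa ArgumentError
end
@assert threw
```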

test/Project.toml

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ LuxTestUtils = {path = "../lib/LuxTestUtils"}
 MLDataDevices = {path = "../lib/MLDataDevices"}

 [compat]
-ADTypes = "1.10"
+ADTypes = "1.19"
 Adapt = "4"
 Aqua = "0.8.4"
 CPUSummary = "0.2.6"

test/reactant/training_tests.jl

Lines changed: 28 additions & 23 deletions
@@ -47,24 +47,22 @@
         end

         @testset for opt in (
-            Descent(0.01f0),
-            Momentum(0.01f0),
-            Adam(0.01f0),
-            AdamW(0.01f0),
-            OptimiserChain(AccumGrad(5), Adam(0.01f0)),
-        )
+                Descent(0.01f0),
+                Momentum(0.01f0),
+                Adam(0.01f0),
+                AdamW(0.01f0),
+                OptimiserChain(AccumGrad(5), Adam(0.01f0)),
+            ),
+            ad in (AutoEnzyme(), AutoReactant())
+
             ps, st = xdev(Lux.setup(StableRNG(1234), model))
             train_state = Training.TrainState(model, ps, st, opt)

             for epoch in 1:100, (xᵢ, yᵢ) in dataloader
                 grads, loss, stats, train_state = if version === :iip
-                    Training.single_train_step!(
-                        AutoEnzyme(), MSELoss(), (xᵢ, yᵢ), train_state
-                    )
+                    Training.single_train_step!(ad, MSELoss(), (xᵢ, yᵢ), train_state)
                 elseif version === :oop
-                    Training.single_train_step(
-                        AutoEnzyme(), MSELoss(), (xᵢ, yᵢ), train_state
-                    )
+                    Training.single_train_step(ad, MSELoss(), (xᵢ, yᵢ), train_state)
                 else
                     error("Invalid version: $(version)")
                 end
@@ -125,6 +123,11 @@ end
         AutoEnzyme(), MSELoss(), (x, x), train_state; return_gradients=Val(false)
     )
     @test loss isa Number
+
+    _, loss, stats, ts = Training.single_train_step(
+        AutoReactant(), MSELoss(), (x, x), train_state; return_gradients=Val(false)
+    )
+    @test loss isa Number
 end

 @testitem "Reactant Distributed: Training API" tags = [:reactant] setup = [SharedTestSetup] begin
@@ -152,19 +155,21 @@ end
     x = rand(Float32, 4, 128) |> batch_device
     y = rand(Float32, 4, 128) |> batch_device

-    train_state = Training.TrainState(model, ps, st, Adam(0.001f0))
+    @testset for ad in (AutoEnzyme(), AutoReactant())
+        train_state = Training.TrainState(model, ps, st, Adam(0.001f0))

-    _, loss, _, train_state = Training.single_train_step(
-        AutoEnzyme(), MSELoss(), (x, y), train_state
-    )
-    @test loss isa Reactant.ConcreteRNumber
-    @test length(Reactant.XLA.devices(Reactant.XLA.sharding(loss.data))) == 8
+        _, loss, _, train_state = Training.single_train_step(
+            ad, MSELoss(), (x, y), train_state
+        )
+        @test loss isa Reactant.ConcreteRNumber
+        @test length(Reactant.XLA.devices(Reactant.XLA.sharding(loss.data))) == 8

-    _, loss, _, train_state = Training.single_train_step(
-        AutoEnzyme(), MSELoss(), (x, y), train_state
-    )
-    @test loss isa Reactant.ConcreteRNumber
-    @test length(Reactant.XLA.devices(Reactant.XLA.sharding(loss.data))) == 8
+        _, loss, _, train_state = Training.single_train_step(
+            ad, MSELoss(), (x, y), train_state
+        )
+        @test loss isa Reactant.ConcreteRNumber
+        @test length(Reactant.XLA.devices(Reactant.XLA.sharding(loss.data))) == 8
+    end
 end
 end