Commit 4de5284

feat: support distributed training via TrainState API (#1529)
* feat: allow tracking numbers in ReactantDevice
* fix: iddict with no sharding
* feat: distributed training now works
* chore: run fmt
* chore: bump lux version
* chore: bump version
* fix: accumgrad implementation
* feat: some more progress towards good sharding
* test: use IFRT for testing
* fix: zero grads
* chore: bump reactant version

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent bf9a4e0 commit 4de5284
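For context, the commit extends the existing Training.TrainState workflow so the same training loop also drives sharded (multi-device) Reactant/XLA setups. The sketch below is not taken from this commit; it is a rough outline of how that API is typically driven, assuming the public Lux + Reactant surface (reactant_device, Training.TrainState, Training.single_train_step!) and using a made-up toy model and data:

using ADTypes, Enzyme, Lux, Optimisers, Random, Reactant

rng = Random.default_rng()
model = Chain(Dense(4 => 16, tanh), Dense(16 => 2))   # toy model for illustration

dev = reactant_device()                 # ReactantDevice from MLDataDevices
ps, st = Lux.setup(rng, model) |> dev   # parameters/states live on the XLA client

train_state = Training.TrainState(model, ps, st, Adam(0.001f0))

x = rand(rng, Float32, 4, 32) |> dev
y = rand(rng, Float32, 2, 32) |> dev

# One training step; with this commit the same call also exercises the
# sharded code path added in ext/LuxReactantExt/training.jl.
_, loss, _, train_state = Training.single_train_step!(
    AutoEnzyme(), MSELoss(), (x, y), train_state
)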

File tree

11 files changed: +176 -106 lines changed


.github/workflows/CommonCI.yml

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ jobs:
     runs-on: ${{ inputs.os }}
     env:
       TMPDIR: ${{ github.workspace }}/tmp
+      XLA_FLAGS: --xla_force_host_platform_device_count=8
     steps:
       - uses: actions/checkout@v5
       - name: Create TMPDIR
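The new XLA_FLAGS entry makes the XLA host (CPU) platform report eight virtual devices, so the sharded training path added in this commit can be exercised on CI machines without accelerators. Reproducing that locally might look like the sketch below (an assumption, not part of this commit; it presumes Reactant.devices() lists the default client's devices and that the flag is set before Reactant initializes XLA):

# Pretend a single host has 8 devices; must be set before loading Reactant.
ENV["XLA_FLAGS"] = "--xla_force_host_platform_device_count=8"

using Reactant

@show length(Reactant.devices())   # expected to report 8 virtual CPU devices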

Project.toml

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 name = "Lux"
 uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
 authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
-version = "1.24.0"
+version = "1.25.0"

 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -98,7 +98,7 @@ LinearAlgebra = "1.10"
 LossFunctions = "0.11.1, 1"
 LuxCore = "1.4.2"
 LuxLib = "1.12.1"
-MLDataDevices = "1.12.1"
+MLDataDevices = "1.15"
 MLUtils = "0.4.4"
 MPI = "0.20.19"
 MacroTools = "0.5.13"
@@ -110,7 +110,7 @@ Optimisers = "0.4.6"
 PrecompileTools = "1.2.1"
 Preferences = "1.4.3"
 Random = "1.10"
-Reactant = "0.2.170"
+Reactant = "0.2.174"
 ReactantCore = "0.1.16"
 Reexport = "1.2.2"
 ReverseDiff = "1.15"

docs/Project.toml

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ OpenSSL_jll = "=3.0.16"
 Optimisers = "0.4.6"
 Printf = "1.10"
 Random = "1.10"
-Reactant = "0.2.170"
+Reactant = "0.2.173"
 StableRNGs = "1"
 StaticArrays = "1"
 WeightInitializers = "1"

ext/LuxReactantExt/LuxReactantExt.jl

Lines changed: 4 additions & 19 deletions
@@ -1,6 +1,7 @@
 module LuxReactantExt

-using Enzyme: Enzyme, Const
+using Enzyme: Enzyme, Active, Const, Duplicated
+using Functors: Functors
 using Preferences: load_preference
 using Optimisers: Optimisers
 using Reactant:
@@ -21,38 +22,22 @@ using Static: True, False
 using Lux: Lux, LuxOps, Training, Utils, StatefulLuxLayer
 using Lux.Training: TrainingBackendCache, ReactantBackend
 using LuxCore: LuxCore, AbstractLuxLayer
-using MLDataDevices: ReactantDevice, get_device
+using MLDataDevices: MLDataDevices, ReactantDevice, get_device

 Lux.is_extension_loaded(::Val{:Reactant}) = true

-Utils.to_rarray(x; kwargs...) = Reactant.to_rarray(x; kwargs...)
-
 Utils.contiguous(x::AnyTracedRArray) = ReactantCore.materialize_traced_array(x)

 Utils.eltype(::Type{<:TracedRArray{T,N}}) where {T,N} = T
 Utils.eltype(::Type{<:TracedRNumber{T}}) where {T} = T
 Utils.eltype(x::Reactant.AnyTracedRArray) = Reactant.unwrapped_eltype(x)

-function Utils.promote_to(::Type{T}, x::Number) where {T<:Number}
-    x isa Reactant.TracedType && return x
-    return Reactant.ConcreteRNumber{T}(x)
-end
-
-# For CUDA use `PrecisionConfig.HIGH`. For other backends use `PrecisionConfig.DEFAULT`.
 function default_precision_config(ps)
     precision_config_preference = lowercase(
         load_preference(Lux, "precision_config", "auto")
     )

-    if precision_config_preference == "auto"
-        rdev = get_device(ps)
-        rdev isa ReactantDevice || return PrecisionConfig.DEFAULT
-        device = rdev.device === missing ? Reactant.XLA.default_device() : rdev.device
-        device_kind = string(device)
-        contains(device_kind, "CUDA") && return PrecisionConfig.HIGH
-        return PrecisionConfig.DEFAULT
-    end
-
+    precision_config_preference == "auto" && return PrecisionConfig.DEFAULT
     precision_config_preference == "default" && return PrecisionConfig.DEFAULT
     precision_config_preference == "high" && return PrecisionConfig.HIGH
     precision_config_preference == "highest" && return PrecisionConfig.HIGHEST
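With the device sniffing removed, the "auto" setting now simply maps to PrecisionConfig.DEFAULT; the CUDA-oriented HIGH behaviour has to be requested explicitly through the preference that default_precision_config reads via load_preference. A sketch of opting in, assuming the preference is stored with Preferences.jl as that call suggests:

using Preferences, Lux

# Writes to LocalPreferences.toml; valid values, per the code above, are
# "auto", "default", "high", and "highest".
set_preferences!(Lux, "precision_config" => "high"; force=true)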

ext/LuxReactantExt/patches.jl

Lines changed: 12 additions & 7 deletions
@@ -3,12 +3,17 @@ Utils.vec(x::AnyTracedRArray) = ReactantCore.materialize_traced_array(vec(x))
 # XXX: Use PoolDims once EnzymeJAX supports stablehlo.reduce_window adjoint
 Lux.calculate_pool_dims(g::Lux.GlobalPoolMode, ::TracedRArray) = g

-# Optimisers setup
-Profiler.@annotate "Optimisers Setup" function Lux.ReactantCompatibleOptimisers.optimisers_setup_with_jit(
-    opt, ps
-)
-    return @jit Optimisers.setup(opt, ps)
-end
-
 # rsqrt
 LuxOps.rsqrt(x::TracedRNumber) = @opcall rsqrt(x)
+
+# convert eltype
+function Utils.convert_eltype(
+    ::Type{T}, x::Reactant.ConcretePJRTNumber{S}
+) where {T<:Number,S}
+    return Reactant.ConcretePJRTNumber{T}(x)
+end
+function Utils.convert_eltype(
+    ::Type{T}, x::Reactant.ConcreteIFRTNumber{S}
+) where {T<:Number,S}
+    return Reactant.ConcreteIFRTNumber{T}(x)
+end
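These methods let Lux change the element type of scalars that already live on the Reactant runtime (numbers are now tracked by ReactantDevice, per the commit message) without moving them back to the host. A rough illustration, assuming the Lux + Reactant extension is loaded and reusing the ConcreteRNumber constructor form shown in the code removed from LuxReactantExt.jl above; Lux.Utils is an internal module, so this is illustration only:

using Reactant
import Lux

x = Reactant.ConcreteRNumber{Float32}(1.0f0)   # concrete scalar on the XLA runtime
y = Lux.Utils.convert_eltype(Float64, x)       # re-wrap with a new element type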

ext/LuxReactantExt/training.jl

Lines changed: 64 additions & 19 deletions
@@ -3,19 +3,29 @@ function objective_function_wrapper(objective_function::F, model, ps, st, data)
     return loss, Reactant.ignore_derivatives(stₙ), Reactant.ignore_derivatives(stats)
 end

-function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
-    (_, _, dps, _, _), (loss, stₙ, stats) = Enzyme.gradient(
+function compute_gradients_internal!(
+    dps, objective_function::F, model, data, ps, st, zeroed_grads::Bool=false
+) where {F}
+    zeroed_grads || Enzyme.make_zero!(dps)
+
+    _, (loss, stₙ, stats) = Enzyme.autodiff(
         Enzyme.set_abi(Enzyme.ReverseWithPrimal, Reactant.ReactantABI),
         Const(objective_function_wrapper),
         Const(objective_function),
         Const(model),
-        ps,
+        Duplicated(ps, dps),
         Const(st),
         Const(data),
     )
     return dps, loss, stats, stₙ
 end

+function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
+    return compute_gradients_internal!(
+        Enzyme.make_zero(ps), objective_function, model, data, ps, st, true
+    )
+end
+
 Profiler.@annotate "Compile Compute Gradients" function Lux.Training.compute_gradients_impl(
     backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
 ) where {F}
@@ -80,34 +90,54 @@ for inplace in ("!", "")
         return ts
     end

+    ps_expr = if inplace == "!"
+        :(ps = ts.parameters)
+    else
+        :(ps = Functors.fmap(copy, ts.parameters; exclude=MLDataDevices.isleaf))
+    end
+
     # XXX: recompile with a warning if new input types are used
     @eval Profiler.@annotate "Compile Train Step" function Lux.Training.$(fname)(
         backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
     ) where {F}
+        device = get_device((ts.parameters, ts.states, ts.optimizer_state, data))
+        @assert device isa ReactantDevice
+        is_sharded = device.device === nothing
+
+        dps = if backend.return_gradients isa True
+            Functors.fmap(Utils.zero, ts.parameters; exclude=MLDataDevices.isleaf)
+        else
+            nothing
+        end
+
+        $(ps_expr)
+
         compiled_grad_and_step_function = with_default_precision_config(ts.parameters) do
             @compile sync = backend.sync $(internal_fn)(
                 objective_function,
                 ts.model,
                 data,
-                ts.parameters,
+                ps,
                 ts.states,
                 ts.optimizer_state,
-                backend.return_gradients,
+                dps,
+                is_sharded,
             )
         end

         grads, ps, loss, stats, st, opt_state = compiled_grad_and_step_function(
             objective_function,
             ts.model,
             data,
-            ts.parameters,
+            ps,
             ts.states,
             ts.optimizer_state,
-            backend.return_gradients,
+            dps,
+            is_sharded,
         )

         cache = TrainingBackendCache(
-            backend, False(), nothing, (; compiled_grad_and_step_function)
+            backend, False(), dps, (; compiled_grad_and_step_function, is_sharded)
         )
         @set! ts.cache = cache
         @set! ts.objective_function = objective_function
@@ -120,7 +150,7 @@ for inplace in ("!", "")
     end

     @eval Profiler.@annotate "Train Step" function Lux.Training.$(fname)(
-        backend::ReactantBackend,
+        ::ReactantBackend,
         obj_fn::F,
         data,
         ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend},F},
@@ -132,7 +162,8 @@ for inplace in ("!", "")
             ts.parameters,
             ts.states,
             ts.optimizer_state,
-            backend.return_gradients,
+            ts.cache.dparameters,
+            ts.cache.extras.is_sharded,
         )

         @set! ts.states = st
@@ -143,24 +174,38 @@ for inplace in ("!", "")
         return grads, loss, stats, ts
     end

-    # XXX: Inplace version not actually inplace
     @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, ::False
+        objective_function::F, model, data, ps, st, opt_state, ::Nothing, is_sharded::Bool
     ) where {F}
         dps, loss, stats, stₙ = compute_gradients_internal(
             objective_function, model, data, ps, st
         )
-        opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
-        return nothing, ps, loss, stats, stₙ, opt_state
+
+        opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
+        # Ensure sharding of input and output states are consistent
+        is_sharded && mark_same_sharding_group(st, stₙ)
+
+        return nothing, psₙ, loss, stats, stₙ, opt_state
     end

     @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, ::True
+        objective_function::F, model, data, ps, st, opt_state, dps, is_sharded::Bool
     ) where {F}
-        dps, loss, stats, stₙ = compute_gradients_internal(
-            objective_function, model, data, ps, st
+        dps, loss, stats, stₙ = compute_gradients_internal!(
+            dps, objective_function, model, data, ps, st
         )
-        opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
-        return dps, ps, loss, stats, stₙ, opt_state
+
+        opt_state, psₙ = Optimisers.update!(opt_state, ps, dps)
+        # Ensure sharding of input and output states are consistent
+        is_sharded && mark_same_sharding_group(st, stₙ)
+
+        return dps, psₙ, loss, stats, stₙ, opt_state
     end
 end
+
+mark_same_sharding_group(args...) = Functors.fmap(mark_same_sharding_group_inner, args...)
+
+function mark_same_sharding_group_inner(arg1::Union{TracedRArray,TracedRNumber}, args...)
    return @opcall sharding_group(arg1, args...)
+end
+mark_same_sharding_group_inner(arg1, args...) = nothing
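Away from Reactant tracing, the pattern introduced by compute_gradients_internal! is Enzyme's Duplicated annotation with a preallocated gradient shadow that is re-zeroed before each call; Enzyme accumulates into the shadow, which is what the "zero grads" fix in the commit message addresses. A standalone sketch with a made-up objective function (not from this commit):

using Enzyme

loss(ps, x) = sum(abs2, ps .* x)   # toy stand-in for objective_function_wrapper

ps  = Float32[1.0, 2.0, 3.0]
x   = Float32[0.5, 0.5, 0.5]
dps = Enzyme.make_zero(ps)         # gradient shadow, allocated once and reused

for _ in 1:3
    Enzyme.make_zero!(dps)         # reset, since Duplicated accumulates into dps
    _, l = Enzyme.autodiff(
        Enzyme.ReverseWithPrimal, loss, Active, Duplicated(ps, dps), Const(x)
    )
    @show l dps
end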
