@@ -3,19 +3,27 @@ function objective_function_wrapper(objective_function::F, model, ps, st, data)
    return loss, Reactant.ignore_derivatives(stₙ), Reactant.ignore_derivatives(stats)
end

-function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
-    (_, _, dps, _, _), (loss, stₙ, stats) = Enzyme.gradient(
+function compute_gradients_internal!(
+    dps, objective_function::F, model, data, ps, st
+) where {F}
+    _, (loss, stₙ, stats) = Enzyme.autodiff(
        Enzyme.set_abi(Enzyme.ReverseWithPrimal, Reactant.ReactantABI),
        Const(objective_function_wrapper),
        Const(objective_function),
        Const(model),
-        ps,
+        Duplicated(ps, dps),
        Const(st),
        Const(data),
    )
    return dps, loss, stats, stₙ
end

+function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
+    return compute_gradients_internal!(
+        Enzyme.make_zero(ps), objective_function, model, data, ps, st
+    )
+end
+
Profiler.@annotate "Compile Compute Gradients" function Lux.Training.compute_gradients_impl(
    backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
) where {F}
@@ -84,6 +92,19 @@ for inplace in ("!", "")
    @eval Profiler.@annotate "Compile Train Step" function Lux.Training.$(fname)(
        backend::ReactantBackend, objective_function::F, data, ts::Training.TrainState
    ) where {F}
+        device = get_device((ts.parameters, ts.states, ts.optimizer_state, data))
+        @assert device isa ReactantDevice
+        is_sharded = device.device === nothing
+
+        dps = if backend.return_gradients isa True
+            Functors.fmap(Utils.zero, ts.parameters; exclude=MLDataDevices.isleaf)
+        else
+            nothing
+        end
+
+        # TODO: make it conditional
+        ps_cache = Functors.fmap(copy, ts.parameters; exclude=MLDataDevices.isleaf)
+
        compiled_grad_and_step_function = with_default_precision_config(ts.parameters) do
            @compile sync = backend.sync $(internal_fn)(
                objective_function,
@@ -92,7 +113,9 @@ for inplace in ("!", "")
                ts.parameters,
                ts.states,
                ts.optimizer_state,
-                backend.return_gradients,
+                dps,
+                is_sharded,
+                ps_cache,
            )
        end

@@ -103,11 +126,13 @@ for inplace in ("!", "")
            ts.parameters,
            ts.states,
            ts.optimizer_state,
-            backend.return_gradients,
+            dps,
+            is_sharded,
+            ps_cache,
        )

        cache = TrainingBackendCache(
-            backend, False(), nothing, (; compiled_grad_and_step_function)
+            backend, False(), dps, (; compiled_grad_and_step_function, is_sharded, ps_cache)
        )
        @set! ts.cache = cache
        @set! ts.objective_function = objective_function
@@ -132,7 +157,9 @@ for inplace in ("!", "")
            ts.parameters,
            ts.states,
            ts.optimizer_state,
-            backend.return_gradients,
+            ts.cache.dparameters,
+            ts.cache.extras.is_sharded,
+            ts.cache.extras.ps_cache,
        )

        @set! ts.states = st
@@ -143,24 +170,67 @@ for inplace in ("!", "")
        return grads, loss, stats, ts
    end

-    # XXX: Inplace version not actually inplace
    @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, ::False
+        objective_function::F,
+        model,
+        data,
+        ps,
+        st,
+        opt_state,
+        ::Nothing,
+        is_sharded::Bool,
+        ps_cache,
    ) where {F}
        dps, loss, stats, stₙ = compute_gradients_internal(
            objective_function, model, data, ps, st
        )
-        opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
-        return nothing, ps, loss, stats, stₙ, opt_state
+
+        opt_state, psₙ = Optimisers.$(update_fn)(opt_state, ps, dps)
+        Functors.fmap(copyto!, ps_cache, psₙ; exclude=MLDataDevices.isleaf)
+        if is_sharded
+            # Ensure sharding of input and output states is consistent
+            mark_same_sharding_group(st, stₙ)
+        end
+
+        return nothing, ps_cache, loss, stats, stₙ, opt_state
    end

    @eval function $(internal_fn)(
-        objective_function::F, model, data, ps, st, opt_state, ::True
+        objective_function::F,
+        model,
+        data,
+        ps,
+        st,
+        opt_state,
+        dps,
+        is_sharded::Bool,
+        ps_cache,
    ) where {F}
-        dps, loss, stats, stₙ = compute_gradients_internal(
-            objective_function, model, data, ps, st
+        dps, loss, stats, stₙ = compute_gradients_internal!(
+            dps, objective_function, model, data, ps, st
        )
-        opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
-        return dps, ps, loss, stats, stₙ, opt_state
+
+        opt_state, psₙ = Optimisers.$(update_fn)(opt_state, ps, dps)
+        Functors.fmap(copyto!, ps_cache, psₙ; exclude=MLDataDevices.isleaf)
+        if is_sharded
+            # Ensure sharding of input and output states is consistent
+            # mark_same_sharding_group(ps, psₙ)
+            mark_same_sharding_group(st, stₙ)
+        end
+
+        return dps, ps_cache, loss, stats, stₙ, opt_state
    end
end
+
+# TODO: think of a better way than a sharding group, since it inserts an optimization
+# barrier into the graph and prevents layout optimizations. Can we instead use result
+# sharding annotations here?
+function mark_same_sharding_group(args...)
+    return Functors.fmap(mark_same_sharding_group_inner, args...)
+end
+
+function mark_same_sharding_group_inner(arg1::Union{TracedRArray,TracedRNumber}, args...)
+    @opcall sharding_group(arg1, args...)
+    return nothing
+end
+mark_same_sharding_group_inner(arg1, args...) = nothing
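
For context, here is a minimal usage sketch of how the compiled train step above is reached from user code, assuming the standard Lux + Reactant entry point; the model, optimizer settings, and data are arbitrary placeholders, not part of this change:

using Lux, Optimisers, Random, Reactant

const dev = reactant_device()

model = Chain(Dense(2 => 32, relu), Dense(32 => 2))
ps, st = Lux.setup(Random.default_rng(), model) |> dev
x = rand(Float32, 2, 64) |> dev
y = rand(Float32, 2, 64) |> dev

ts = Training.TrainState(model, ps, st, Adam(0.001f0))

# The first call compiles the grad-and-step function defined above and caches it in
# ts.cache; later calls reuse the cached compiled function.
_, loss, _, ts = Training.single_train_step!(AutoEnzyme(), MSELoss(), (x, y), ts)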