Commit 10ea255

docs: migrate most examples to Reactant (#1180)

* docs: Basics run on CPU
* docs: Run Polynomial Fitting using Reactant
* feat: allow users to dump the HLO
* docs: update Optimization tutorial
* docs: use Reactant for CPU in SimpleChains
* docs: update PINN2DPDE
* docs: partially move HyperNet to reactant
* chore: run formatter [skip tests]
* docs: highlight Reactant more prominently
* docs: update SimpleRNN
* fix: incorrect check in Embedding
* fix: bump Enzyme in project
* feat: handle weight initializers for reactant RNGs
* fix: workaround for #1186
* fix: SimpleRNN works with reactant
* fix: failing tests and use overlay
* revert: HyperNet keep in CUDA for now

1 parent 476f3f4 · commit 10ea255

File tree

40 files changed (+362, -169 lines)

Project.toml

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 name = "Lux"
 uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
 authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
-version = "1.4.4"
+version = "1.5.0"

 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -88,7 +88,7 @@ Compat = "4.16"
 ComponentArrays = "0.15.18"
 ConcreteStructs = "0.2.3"
 DispatchDoctor = "0.4.12"
-Enzyme = "0.13.16"
+Enzyme = "0.13.28"
 EnzymeCore = "0.8.8"
 FastClosures = "0.3.2"
 Flux = "0.15, 0.16"
```

README.md

Lines changed: 32 additions & 0 deletions
````diff
@@ -170,6 +170,38 @@ gs, loss, stats, train_state = Training.single_train_step!(AutoZygote(), MSELoss
     (x, dev(rand(rng, Float32, 10, 2))), train_state)
 ```

+## 🤸 Quickstart with Reactant
+
+```julia
+using Lux, Random, Optimisers, Reactant, Enzyme
+
+rng = Random.default_rng()
+Random.seed!(rng, 0)
+
+model = Chain(Dense(128, 256, tanh), Chain(Dense(256, 1, tanh), Dense(1, 10)))
+
+dev = reactant_device()
+
+ps, st = Lux.setup(rng, model) |> dev
+
+x = rand(rng, Float32, 128, 2) |> dev
+
+# We need to compile the model before we can use it.
+model_forward = @compile model(x, ps, Lux.testmode(st))
+model_forward(x, ps, Lux.testmode(st))
+
+# Gradients can be computed using Enzyme
+@jit Enzyme.gradient(Reverse, sum ∘ first ∘ Lux.apply, Const(model), x, ps, Const(st))
+
+# All of this can be automated using the TrainState API
+train_state = Training.TrainState(model, ps, st, Adam(0.001f0))
+
+gs, loss, stats, train_state = Training.single_train_step!(
+    AutoEnzyme(), MSELoss(),
+    (x, dev(rand(rng, Float32, 10, 2))), train_state
+)
+```
+
 ## 📚 Examples

 Look in the [examples](/examples/) directory for self-contained usage examples. The [documentation](https://lux.csail.mit.edu) has examples sorted into proper categories.
````
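
As a follow-up to the added quickstart, the sketch below shows how the same `TrainState` API extends to a small training loop. This is not part of the commit: the target array `y`, the step count, and the `train_loop!` helper are illustrative assumptions layered on top of the names defined in the snippet above.

```julia
# Hypothetical continuation of the Reactant quickstart above; assumes `model`,
# `ps`, `st`, `x`, `dev`, and `rng` are defined exactly as in that snippet.
y = dev(rand(rng, Float32, 10, 2))  # illustrative targets for the 10-output model

function train_loop!(model, ps, st, x, y; nsteps=100)
    train_state = Training.TrainState(model, ps, st, Adam(0.001f0))
    for _ in 1:nsteps
        # Each call computes gradients with Enzyme (through Reactant) and updates
        # the parameters stored in `train_state`.
        _, loss, _, train_state = Training.single_train_step!(
            AutoEnzyme(), MSELoss(), (x, y), train_state
        )
    end
    return train_state
end

train_loop!(model, ps, st, x, y)
```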

docs/Project.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -66,6 +66,7 @@ julia = "1.10"
 [sources]
 Lux = { path = "../" }
 LuxLib = { path = "../lib/LuxLib" }
+LuxCUDA = { path = "../lib/LuxCUDA" }
 LuxCore = { path = "../lib/LuxCore" }
 MLDataDevices = { path = "../lib/MLDataDevices" }
 LuxTestUtils = { path = "../lib/LuxTestUtils" }
```

docs/make.jl

Lines changed: 6 additions & 4 deletions
```diff
@@ -1,7 +1,6 @@
 using Documenter, DocumenterVitepress, Pkg
 using Lux, LuxCore, LuxLib, WeightInitializers, NNlib
 using LuxTestUtils, MLDataDevices
-using LuxCUDA

 using Optimisers # for some docstrings

@@ -78,8 +77,10 @@ pages = [
 #! format: on

 deploy_config = Documenter.auto_detect_deploy_system()
-deploy_decision = Documenter.deploy_folder(deploy_config; repo="github.com/LuxDL/Lux.jl",
-    devbranch="main", devurl="dev", push_preview=true)
+deploy_decision = Documenter.deploy_folder(
+    deploy_config; repo="github.com/LuxDL/Lux.jl",
+    devbranch="main", devurl="dev", push_preview=true
+)

 makedocs(;
     sitename="Lux.jl Docs",
@@ -96,7 +97,8 @@ makedocs(;
     repo="https://github.com/LuxDL/Lux.jl/blob/{commit}{path}#{line}",
     format=DocumenterVitepress.MarkdownVitepress(;
         repo="github.com/LuxDL/Lux.jl", devbranch="main", devurl="dev",
-        deploy_url="https://lux.csail.mit.edu", deploy_decision),
+        deploy_url="https://lux.csail.mit.edu", deploy_decision
+    ),
     draft=false,
     pages
 )
```

docs/src/index.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -23,8 +23,8 @@ hero:

 features:
   - icon: 🚀
-    title: Fast & Extendible
-    details: Lux.jl is written in Julia itself, making it extremely extendible. CUDA and AMDGPU are supported first-class, with experimental support for Metal and Intel GPUs.
+    title: Fast & Extendable
+    details: Lux.jl is written in Julia itself, making it extremely extendable. CUDA and AMDGPU are supported first-class, with experimental support for Metal and Intel GPUs.
     link: /introduction

   - icon: 🐎
```

docs/src/introduction/index.md

Lines changed: 33 additions & 17 deletions
````diff
@@ -25,8 +25,7 @@ Pkg.add("Lux")

 ```@example quickstart
 using Lux, Random, Optimisers, Zygote
-using LuxCUDA # For CUDA support
-# using AMDGPU, Metal, oneAPI # Other pptional packages for GPU support
+# using LuxCUDA, AMDGPU, Metal, oneAPI # Optional packages for GPU support
 ```

 We take randomness very seriously
@@ -66,26 +65,33 @@ y, st = Lux.apply(model, x, ps, st)
 train_state = Lux.Training.TrainState(model, ps, st, Adam(0.0001f0))

 ## We can compute the gradients using Training.compute_gradients
-gs, loss, stats, train_state = Lux.Training.compute_gradients(AutoZygote(), MSELoss(),
-    (x, dev(rand(rng, Float32, 10, 2))), train_state)
+gs, loss, stats, train_state = Lux.Training.compute_gradients(
+    AutoZygote(), MSELoss(),
+    (x, dev(rand(rng, Float32, 10, 2))), train_state
+)

 ## Optimization
 train_state = Training.apply_gradients!(train_state, gs) # or Training.apply_gradients (no `!` at the end)

 # Both these steps can be combined into a single call
-gs, loss, stats, train_state = Training.single_train_step!(AutoZygote(), MSELoss(),
-    (x, dev(rand(rng, Float32, 10, 2))), train_state)
+gs, loss, stats, train_state = Training.single_train_step!(
+    AutoZygote(), MSELoss(),
+    (x, dev(rand(rng, Float32, 10, 2))), train_state
+)
 ```

 ## Defining Custom Layers

+We can train our model using the above code, but let's go ahead and see how to use Reactant.
+Reactant is a julia frontend that generates MLIR and then compiles it using XLA (after
+running fancy optimizations). It is the current recommended way to train large models in
+Lux. For more details on using Reactant, see the [manual](@ref reactant-compilation).
+
 ```@example custom_compact
-using Lux, Random, Optimisers, Zygote
-using LuxCUDA # For CUDA support
-# using AMDGPU, Metal, oneAPI # Other pptional packages for GPU support
+using Lux, Random, Optimisers, Reactant, Enzyme
 using Printf # For pretty printing

-dev = gpu_device()
+dev = reactant_device()
 ```

 We will define a custom MLP using the `@compact` macro. The macro takes in a list of
@@ -97,10 +103,12 @@ n_in = 1
 n_out = 1
 nlayers = 3

-model = @compact(w1=Dense(n_in => 32),
+model = @compact(
+    w1=Dense(n_in => 32),
     w2=[Dense(32 => 32) for i in 1:nlayers],
     w3=Dense(32 => n_out),
-    act=relu) do x
+    act=relu
+) do x
     embed = act(w1(x))
     for w in w2
         embed = act(w(embed))
@@ -116,21 +124,24 @@ We can initialize the model and train it with the same code as before!
 rng = Random.default_rng()
 Random.seed!(rng, 0)

-ps, st = Lux.setup(Xoshiro(0), model) |> dev
+ps, st = Lux.setup(rng, model) |> dev

 x = rand(rng, Float32, n_in, 32) |> dev

-model(x, ps, st) # 1×32 Matrix and updated state as output.
+@jit model(x, ps, st) # 1×32 Matrix and updated state as output.

-x_data = reshape(collect(-2.0f0:0.1f0:2.0f0), 1, :) |> dev
+x_data = reshape(collect(-2.0f0:0.1f0:2.0f0), 1, :)
 y_data = 2 .* x_data .- x_data .^ 3
+x_data, y_data = dev(x_data), dev(y_data)

 function train_model!(model, ps, st, x_data, y_data)
     train_state = Lux.Training.TrainState(model, ps, st, Adam(0.001f0))

     for iter in 1:1000
-        _, loss, _, train_state = Lux.Training.single_train_step!(AutoZygote(), MSELoss(),
-            (x_data, y_data), train_state)
+        _, loss, _, train_state = Lux.Training.single_train_step!(
+            AutoEnzyme(), MSELoss(),
+            (x_data, y_data), train_state
+        )
         if iter % 100 == 1 || iter == 1000
             @printf "Iteration: %04d \t Loss: %10.9g\n" iter loss
         end
@@ -155,6 +166,11 @@ packages mentioned in this documentation are available via the Julia General Reg

 You can install all those packages via `import Pkg; Pkg.add(<package name>)`.

+## XLA (CPU/GPU/TPU) Support
+
+Lux.jl supports XLA compilation for CPU, GPU, and TPU using
+[Reactant.jl](https://github.com/EnzymeAD/Reactant.jl).
+
 ## GPU Support

 GPU Support for Lux.jl requires loading additional packages:
````
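
For orientation, a minimal hedged sketch of how the pieces from the updated tutorial above fit together at a call site (not part of the commit; it simply reuses the `model`, `ps`, `st`, `x_data`, and `y_data` bindings defined in those hunks):

```julia
# Hypothetical driver for the tutorial code in the diff above; every name here
# comes from the preceding snippets.
@jit model(x_data, ps, st)                   # compiled forward pass via Reactant
train_model!(model, ps, st, x_data, y_data)  # 1000 Enzyme-backed training steps
```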

docs/src/manual/compiling_lux_models.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -124,6 +124,16 @@ fmap(Broadcast.BroadcastFunction(-), ∂ps_zyg, ∂ps_enzyme |> cpu_device())

 ## [Using the `TrainState` API](@id compile_lux_model_trainstate)

+!!! tip "Debugging TrainState API Failures"
+
+    If the code fails to compile with Reactant, it is useful to dump the HLO. Starting the
+    Julia session with the `LUX_DUMP_REACTANT_HLO_OPTIMIZE` environment variable set to
+    `no_enzyme`, `false`, or `true` will dump the HLO to a file (the filename will be
+    displayed). This is useful information to provide when opening an issue.
+
+    Alternatively, you can set the global reference `Lux.DUMP_REACTANT_HLO_OPT_MODE` to a
+    symbol corresponding to the `optimize` keyword argument to `@code_hlo`.
+
 Now that we saw the low-level API let's see how to train the model without any of this
 boilerplate. Simply follow the following steps:

```
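
A hedged sketch of the two debugging paths described in the tip above. The shell invocation mirrors the environment-variable option; treating `Lux.DUMP_REACTANT_HLO_OPT_MODE` as a `Ref` assigned via `[]` is an assumption of this sketch, not something the tip spells out.

```julia
# Option 1 (per the tip above): start Julia with the environment variable set,
# e.g. from the shell:
#   LUX_DUMP_REACTANT_HLO_OPTIMIZE=no_enzyme julia --project train.jl

# Option 2: set the global reference from inside the session. The `[]=` syntax
# below assumes the reference is a `Ref`; the `:no_enzyme` value mirrors the
# env-var options listed above.
using Lux

Lux.DUMP_REACTANT_HLO_OPT_MODE[] = :no_enzyme
```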

docs/src/manual/gpu_management.md

Lines changed: 6 additions & 7 deletions
````diff
@@ -1,12 +1,5 @@
 # GPU Management

-!!! info
-
-    Starting from `v0.5`, Lux has transitioned to a new GPU management system. The old
-    system using `cpu` and `gpu` functions is still in place but will be removed in `v1`.
-    Using the old functions might lead to performance regressions if used inside
-    performance critical code.
-
 `Lux.jl` can handle multiple GPU backends. Currently, the following backends are supported:

 ```@example gpu_management
@@ -16,6 +9,12 @@ using Lux, LuxCUDA #, AMDGPU, Metal, oneAPI
 supported_gpu_backends()
 ```

+!!! tip "GPU Support via Reactant"
+
+    If you are using Reactant, you can use the [`reactant_device`](@ref) function to
+    automatically select the Reactant backend if available. Additionally, to force Reactant
+    to use `gpu`, you can run `Reactant.set_default_backend("gpu")` (this is automatic).
+
 !!! danger "Metal Support"

     Support for Metal GPUs should be considered extremely experimental at this point.
````
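
To make the new tip concrete, here is a minimal hedged sketch of selecting the Reactant backend and moving a model onto it. The `Dense` layer, RNG, and input sizes are illustrative and not part of the commit; only `reactant_device` and `Reactant.set_default_backend("gpu")` come from the tip itself.

```julia
# Minimal sketch of the Reactant device-selection flow described in the tip above.
using Lux, Random, Reactant

Reactant.set_default_backend("gpu")  # force the GPU backend; normally automatic
dev = reactant_device()              # selects the Reactant device when available

# Illustrative model and data, just to show movement onto the device:
model = Dense(2 => 4, relu)
ps, st = Lux.setup(Random.default_rng(), model) |> dev
x = dev(rand(Float32, 2, 8))

y, st = @jit model(x, ps, st)  # compile-and-run the forward pass with Reactant
```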

docs/tutorials.jl

Lines changed: 3 additions & 2 deletions
```diff
@@ -1,10 +1,11 @@
 #! format: off
 const BEGINNER_TUTORIALS = [
-    "Basics/main.jl" => "CUDA",
+    "Basics/main.jl" => "CPU",
     "PolynomialFitting/main.jl" => "CUDA",
     "SimpleRNN/main.jl" => "CUDA",
+    # Technically this is run on CPU but we need a better machine to run it
     "SimpleChains/main.jl" => "CUDA",
-    "OptimizationIntegration/main.jl" => "CUDA",
+    "OptimizationIntegration/main.jl" => "CPU",
 ]
 const INTERMEDIATE_TUTORIALS = [
     "NeuralODE/main.jl" => "CUDA",
```

examples/Basics/Project.toml

Lines changed: 0 additions & 2 deletions
```diff
@@ -2,7 +2,6 @@
 ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
-LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -12,6 +11,5 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 ComponentArrays = "0.15.18"
 ForwardDiff = "0.10"
 Lux = "1"
-LuxCUDA = "0.3"
 Optimisers = "0.4.1"
 Zygote = "0.6"
```
