fix: update to new reactant changes (#1140)
* fix: update to new reactant changes

* fix: use enzyme correctly

* fix: update training code

* feat: handle optimisers correctly

* fix: upstreamed Reactant patches

* fix: don't force ::Real

* fix: package versions and some test fixes

* test: try fixing load order

* revert: load order change
avik-pal authored Dec 30, 2024
1 parent ac2879b commit 3c3a432
Showing 35 changed files with 398 additions and 153 deletions.
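For orientation, here is a rough sketch of the Reactant-backed training path this commit touches. It is not part of the diff: the model, data, and hyperparameters are made up, and the exported names (`Dense`, `MSELoss`, `Adam`, `AutoEnzyme`) are assumed from current Lux, Optimisers, and ADTypes releases.

```julia
using Lux, Reactant, Enzyme, Optimisers, Random
using ADTypes: AutoEnzyme

# Toy model; Reactant.to_rarray recursively converts arrays to ConcreteRArrays.
model = Dense(4 => 2)
ps, st = Reactant.to_rarray(Lux.setup(Random.default_rng(), model))
x = Reactant.to_rarray(rand(Float32, 4, 8))
y = Reactant.to_rarray(rand(Float32, 2, 8))

ts = Lux.Training.TrainState(model, ps, st, Adam(0.001f0))

# With Reactant arrays in the TrainState, the gradient + optimiser step is traced
# and compiled on the first call; later calls reuse the cached compiled function.
_, loss, _, ts = Lux.Training.single_train_step!(AutoEnzyme(), MSELoss(), (x, y), ts)
```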
8 changes: 4 additions & 4 deletions Project.toml
@@ -1,7 +1,7 @@
name = "Lux"
uuid = "b2108857-7c20-44ae-9111-449ecde12c47"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.4.2"
version = "1.4.3"

[deps]
ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -83,7 +83,7 @@ Adapt = "4.1"
ArgCheck = "2.3"
ArrayInterface = "7.17.1"
CUDA = "5.3.2"
ChainRulesCore = "1.24"
ChainRulesCore = "1.25"
Compat = "4.16"
ComponentArrays = "0.15.18"
ConcreteStructs = "0.2.3"
@@ -106,11 +106,11 @@ MPI = "0.20.19"
MacroTools = "0.5.13"
Markdown = "1.10"
NCCL = "0.1.1"
NNlib = "0.9.24"
NNlib = "0.9.26"
Optimisers = "0.4.1"
Preferences = "1.4.3"
Random = "1.10"
Reactant = "0.2.8"
Reactant = "0.2.12"
Reexport = "1.2.2"
ReverseDiff = "1.15"
SIMDTypes = "0.1"
6 changes: 3 additions & 3 deletions docs/Project.toml
@@ -33,7 +33,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat]
ADTypes = "1.10"
Adapt = "4"
ChainRulesCore = "1.24"
ChainRulesCore = "1.25"
ComponentArrays = "0.15.18"
Documenter = "1.4"
DocumenterVitepress = "0.1.3"
@@ -51,12 +51,12 @@ LuxCore = "1.2"
LuxLib = "1.3.4"
LuxTestUtils = "1.5"
MLDataDevices = "1.6"
NNlib = "0.9.24"
NNlib = "0.9.26"
Optimisers = "0.4.1"
Pkg = "1.10"
Printf = "1.10"
Random = "1.10"
Reactant = "0.2.8"
Reactant = "0.2.12"
StableRNGs = "1"
StaticArrays = "1"
WeightInitializers = "1"
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -29,7 +29,7 @@ pages = [
"tutorials/intermediate/1_NeuralODE.md",
"tutorials/intermediate/2_BayesianNN.md",
"tutorials/intermediate/3_HyperNet.md",
"tutorials/intermediate/4_PINN2DPDE.md"
"tutorials/intermediate/4_PINN2DPDE.md",
],
"Advanced" => [
"tutorials/advanced/1_GravitationalWaveForm.md"
13 changes: 11 additions & 2 deletions ext/LuxReactantExt/LuxReactantExt.jl
@@ -2,13 +2,22 @@ module LuxReactantExt

using Enzyme: Enzyme, Const, Duplicated, Active
using Optimisers: Optimisers
using Reactant: Reactant, @compile, TracedRArray, TracedRNumber
using Reactant: Reactant, @compile, AnyTracedRArray, TracedRArray, TracedRNumber
using Setfield: @set!
using Static: False

using Lux: Lux, LuxOps, Training
using Lux: Lux, LuxOps, Training, Utils
using Lux.Training: TrainingBackendCache, ReactantBackend

Lux.is_extension_loaded(::Val{:Reactant}) = true

Utils.to_rarray(x; kwargs...) = Reactant.to_rarray(x; kwargs...)

function Utils.promote_to(::Type{T}, x::Number) where {T <: Number}
    x isa Reactant.TracedType && return x
    return Reactant.ConcreteRNumber{T}(x)
end

include("patches.jl")
include("training.jl")

3 changes: 3 additions & 0 deletions ext/LuxReactantExt/patches.jl
@@ -1 +1,4 @@
Utils.vec(x::AnyTracedRArray) = Reactant.TracedUtils.materialize_traced_array(vec(x))

# XXX: Use PoolDims once EnzymeJAX supports stablehlo.reduce_window adjoint
Lux.calculate_pool_dims(g::Lux.GlobalPoolMode, ::TracedRArray) = g
86 changes: 56 additions & 30 deletions ext/LuxReactantExt/training.jl
@@ -1,3 +1,28 @@
mutable struct StatsAndNewStateWrapper
    stats::Any
    st::Any
end

function wrapped_objective_function(
        fn::F, model, ps, st, data, cache::StatsAndNewStateWrapper
) where {F}
    loss, stₙ, stats = fn(model, ps, st, data)
    cache.stats = stats
    cache.st = stₙ
    return loss
end

function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
    stats_wrapper = StatsAndNewStateWrapper(nothing, nothing)
    res = Enzyme.gradient(
        Enzyme.set_abi(Enzyme.ReverseWithPrimal, Reactant.ReactantABI),
        Const(wrapped_objective_function), Const(objective_function),
        Const(model), ps, Const(st), Const(data), Const(stats_wrapper)
    )
    loss, dps = res.val, res.derivs[3]
    return dps, loss, stats_wrapper.stats, stats_wrapper.st
end

function Lux.Training.compute_gradients_impl(
        backend::ReactantBackend, objective_function::F,
        data, ts::Training.TrainState) where {F}
@@ -22,18 +47,33 @@ function Lux.Training.compute_gradients_impl(::ReactantBackend, obj_fn::F, data,
    return grads, loss, stats, ts
end

function compute_gradients_internal(objective_function::F, model, data, ps, st) where {F}
    dps = Enzyme.make_zero(ps)
    _, (loss, stₙ, stats) = Enzyme.autodiff(
        Enzyme.ReverseWithPrimal, Const(objective_function), Active, Const(model),
        Duplicated(ps, dps), Const(st), Const(data))
    return dps, loss, stats, stₙ
end

for inplace in ("!", "")
fname = Symbol(:single_train_step_impl, inplace)
internal_fn = Symbol(:compute_gradients_internal_and_step, inplace)
apply_gradients_fn = Symbol(:apply_gradients, inplace)
update_fn = Symbol(:update, inplace)

# Ideally users never hit this dispatch but it is still good to have as a fallback
@eval function Lux.Training.$(apply_gradients_fn)(
ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}}, grads
)
if hasfield(typeof(ts.cache.extras), :update_function)
update_function = ts.cache.extras.update_function
else
update_function = @compile Optimisers.$(update_fn)(
ts.optimizer_state, ts.parameters, grads)
@set! ts.cache.extras = merge(ts.cache.extras, (; update_function))
end

opt_state, ps = update_function(ts.optimizer_state, ts.parameters, grads)
@set! ts.parameters = ps
@set! ts.optimizer_state = opt_state
@set! ts.step = ts.step + 1
return ts
end

# XXX: Should we add a check to ensure the inputs to this function is same as the one
# used in the compiled function? We can re-trigger the compilation with a warning
@eval function Lux.Training.$(fname)(backend::ReactantBackend, objective_function::F,
data, ts::Training.TrainState) where {F}
compiled_grad_and_step_function = @compile $(internal_fn)(
Expand Down Expand Up @@ -68,27 +108,13 @@ for inplace in ("!", "")

return grads, loss, stats, ts
end
end

function compute_gradients_internal_and_step(objective_function::F, model, data, ps,
st, opt_state) where {F}
dps = Enzyme.make_zero(ps)
_, (loss, stₙ, stats) = Enzyme.autodiff(
Enzyme.set_abi(Enzyme.ReverseWithPrimal, Reactant.ReactantABI),
Const(objective_function), Active, Const(model),
Duplicated(ps, dps), Const(st), Const(data))
opt_state, ps = Optimisers.update(opt_state, ps, dps)
return dps, ps, loss, stats, stₙ, opt_state
end

function compute_gradients_internal_and_step!(objective_function::F, model, data, ps,
st, opt_state) where {F}
dps = Enzyme.make_zero(ps)
_, (loss, stₙ, stats) = Enzyme.autodiff(
Enzyme.set_abi(Enzyme.ReverseWithPrimal, Reactant.ReactantABI),
Const(objective_function), Active, Const(model),
Duplicated(ps, dps), Const(st), Const(data))
# XXX: Inplace updates not actually inplace
opt_state, ps = Optimisers.update!(opt_state, ps, dps)
return dps, ps, loss, stats, stₙ, opt_state
# XXX: Inplace version not actually inplace
@eval function $(internal_fn)(
objective_function::F, model, data, ps, st, opt_state) where {F}
dps, loss, stats, stₙ = compute_gradients_internal(
objective_function, model, data, ps, st)
opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
return dps, ps, loss, stats, stₙ, opt_state
end
end
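The `apply_gradients` fallback above compiles `Optimisers.update`/`update!` once with `@compile` and caches the compiled function in `ts.cache.extras`, so the optimiser step is only traced the first time it is hit. The pattern in isolation looks roughly like this (a sketch, not from the diff; it assumes the parameters, gradients, and optimiser state have already been converted with `Reactant.to_rarray`):

```julia
using Reactant, Optimisers

ps = (; weight = rand(Float32, 2, 4))
gs = (; weight = rand(Float32, 2, 4))      # pretend gradients
st_opt = Optimisers.setup(Adam(0.001f0), ps)

# Move everything into Reactant's concrete types before tracing.
ps_ra, gs_ra, st_opt_ra = Reactant.to_rarray.((ps, gs, st_opt))

# Compile the update once...
update_compiled = Reactant.@compile Optimisers.update!(st_opt_ra, ps_ra, gs_ra)

# ...and reuse the compiled function for every subsequent optimiser step.
st_opt_ra, ps_ra = update_compiled(st_opt_ra, ps_ra, gs_ra)
```

Caching the compiled closure this way avoids re-tracing the optimiser update on every `apply_gradients` call.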
4 changes: 2 additions & 2 deletions lib/LuxLib/Project.toml
@@ -1,7 +1,7 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <[email protected]> and contributors"]
version = "1.3.10"
version = "1.3.11"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -77,7 +77,7 @@ LuxCore = "1.2"
MKL = "0.7"
MLDataDevices = "1.6"
Markdown = "1.10"
NNlib = "0.9.24"
NNlib = "0.9.26"
Octavian = "0.3.28"
Preferences = "1.4.3"
Polyester = "0.7.15"
2 changes: 1 addition & 1 deletion lib/LuxLib/ext/LuxLibTrackerExt.jl
@@ -97,7 +97,7 @@ for RM in (:TrackedVector, :Nothing, :AbstractVector),
    Utils.is_tracked(RM, RV, S, B, XT) || continue

    @eval Tracker.@grad_from_chainrules LuxLib.Impl.batchnorm_cudnn(
        γ::$S, β::$B, x::$XT, rμ::$RM, rσ²::$RV, m::Real, ϵ::Real, training::StaticBool)
        γ::$S, β::$B, x::$XT, rμ::$RM, rσ²::$RV, m, ϵ, training::StaticBool)
end

# Utils extensions
2 changes: 1 addition & 1 deletion lib/LuxLib/ext/LuxLibcuDNNExt/LuxLibcuDNNExt.jl
@@ -21,7 +21,7 @@ include("batchnorm.jl")
function Impl.batchnorm(x::Union{<:CuArray{T, 2}, <:CuArray{T, 4}, <:CuArray{T, 5}},
        γ::Optional{<:CuVector{T}}, β::Optional{<:CuVector{T}},
        rμ::Optional{<:CuVector{T}}, rσ²::Optional{<:CuVector{T}},
        training::StaticBool, σ::F, m::Real, ϵ::Real) where {T <: cuDNNFloat, F}
        training::StaticBool, σ::F, m, ϵ) where {T <: cuDNNFloat, F}
    rμₙ, rσ²ₙ = Impl.get_batchnorm_statistics(x, rμ, rσ², training)
    y = Impl.batchnorm_cudnn(γ, β, x, rμₙ, rσ²ₙ, m, ϵ, training)[1]
    return Impl.activation!!(σ, y), safe_vec(rμₙ), safe_vec(rσ²ₙ)
2 changes: 1 addition & 1 deletion lib/LuxLib/src/api/batchnorm.jl
@@ -37,7 +37,7 @@ mean and variance.
function batchnorm(x::AbstractArray{T, N}, γ::Optional{<:AbstractVector},
        β::Optional{<:AbstractVector}, rμ::Optional{<:AbstractVector},
        rσ²::Optional{<:AbstractVector}, training::TrainingType, act::F=identity,
        momentum::Real=0.1f0, epsilon::Real=default_epsilon(x)) where {F, T, N}
        momentum=0.1f0, epsilon=default_epsilon(x)) where {F, T, N}
    σ = select_fastest_activation(act, x, γ, β, rμ, rσ²)
    y, rμ, rσ² = batchnorm_impl(
        x, γ, β, rμ, rσ², static_training_mode(training, x, γ, β, rμ, rσ²),
4 changes: 2 additions & 2 deletions lib/LuxLib/src/api/groupnorm.jl
@@ -1,6 +1,6 @@
@doc doc"""
    groupnorm(x, scale, bias, groups::Int, σ::F=identity,
        epsilon::Real=eps(eltype(x)) ^ (5 // 7))
        epsilon=eps(eltype(x)) ^ (5 // 7))
Group Normalization. For details see [1].
@@ -30,7 +30,7 @@ The normalized array is returned.
"""
function groupnorm(x::AbstractArray{<:Real, N}, scale::Optional{<:AbstractVector},
        bias::Optional{<:AbstractVector}, groups::Int, σ::F=identity,
        epsilon::Real=default_epsilon(x)) where {F, N}
        epsilon=default_epsilon(x)) where {F, N}
    assert_valid_groupnorm_arguments(x, scale, bias, groups)
    return groupnorm_impl(
        x, scale, bias, groups, select_fastest_activation(σ, x, scale, bias), epsilon)
4 changes: 2 additions & 2 deletions lib/LuxLib/src/api/instancenorm.jl
@@ -36,15 +36,15 @@ mean and variance.
"""
function instancenorm(x::AbstractArray, γ::Optional{<:AbstractVector},
        β::Optional{<:AbstractVector}, training::TrainingType,
        σ::F=identity, epsilon::Real=default_epsilon(x)) where {F}
        σ::F=identity, epsilon=default_epsilon(x)) where {F}
    # This API is kept for legacy purposes when we didn't support passing running stats
    return instancenorm(x, γ, β, nothing, nothing, training, σ, nothing, epsilon)
end

function instancenorm(x::AbstractArray, γ::Optional{<:AbstractVector},
        β::Optional{<:AbstractVector}, rμ::Optional{<:AbstractVector},
        rσ²::Optional{<:AbstractVector}, training::TrainingType, σ::F=identity,
        momentum::Optional{<:Real}=0.1f0, epsilon::Real=default_epsilon(x)) where {F}
        momentum::Optional{<:Real}=0.1f0, epsilon=default_epsilon(x)) where {F}
    assert_valid_instancenorm_arguments(x)

    y, rμₙ, rσ²ₙ = instancenorm_impl(
2 changes: 1 addition & 1 deletion lib/LuxLib/src/api/layernorm.jl
@@ -36,7 +36,7 @@ Normalized Array of same size as `x`.
"""
function layernorm(x::AbstractArray{xT, N}, scale::Optional{<:AbstractArray},
        bias::Optional{<:AbstractArray}, σ::F=identity, dims=1:(N - 1),
        epsilon::Real=default_epsilon(x)) where {F, xT, N}
        epsilon=default_epsilon(x)) where {F, xT, N}
    return layernorm_impl(
        x, scale, bias, select_fastest_activation(σ, x, scale, bias), dims, epsilon)
end
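These signature relaxations (dropping `::Real` from `momentum`/`epsilon`, per the "don't force ::Real" commit bullet) are what let non-`Real` scalars, in particular Reactant traced numbers, flow through the normalization entry points. A hedged sketch of the kind of call this enables (not from the diff; it assumes LuxLib and Reactant are loaded and that `batchnorm` traces cleanly under `@compile` — the helper `bn` is made up for illustration):

```julia
using LuxLib, Reactant

x = Reactant.to_rarray(rand(Float32, 4, 4, 3, 2))
γ = Reactant.to_rarray(ones(Float32, 3))
β = Reactant.to_rarray(zeros(Float32, 3))

# epsilon as a ConcreteRNumber rather than a plain `Real`; under tracing it becomes
# a TracedRNumber, which the previous `epsilon::Real` constraint would have rejected.
ϵ = Reactant.ConcreteRNumber{Float32}(1.0f-5)

# Batch statistics are used since no running statistics are passed.
bn(x, γ, β, ϵ) = first(batchnorm(x, γ, β, nothing, nothing, Val(true), identity, 0.1f0, ϵ))

bn_compiled = Reactant.@compile bn(x, γ, β, ϵ)
y = bn_compiled(x, γ, β, ϵ)
```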

4 comments on commit 3c3a432

@avik-pal (Member Author)

@avik-pal (Member Author)

@JuliaRegistrator register subdir=lib/LuxLib

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/122185

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v1.4.3 -m "<description of version>" 3c3a43266f688f5d7eed6fe1ca61e2ad2073dbe4
git push origin v1.4.3

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/122186


Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a LuxLib-v1.3.11 -m "<description of version>" 3c3a43266f688f5d7eed6fe1ca61e2ad2073dbe4
git push origin LuxLib-v1.3.11
