diff --git a/test/basic.jl b/test/basic.jl
index 199ed4bd1..20ef425d5 100644
--- a/test/basic.jl
+++ b/test/basic.jl
@@ -1,6 +1,7 @@
 using Reactant
 using Test
 using Enzyme
+using Statistics

 # Reactant.set_default_backend("gpu")

diff --git a/test/nn_lux.jl b/test/nn_lux.jl
index e9bf1b204..3521efc38 100644
--- a/test/nn_lux.jl
+++ b/test/nn_lux.jl
@@ -9,6 +9,7 @@ truth = [xor(col[1] > 0.5, col[2] > 0.5) for col in eachcol(noisy)] # 1000-ele
 # Define our model, a multi-layer perceptron with one hidden layer of size 3:
 model = Lux.Chain(
     Lux.Dense(2 => 3, tanh), # activation function inside layer
+    Lux.BatchNorm(3, gelu),
     Lux.Dense(3 => 2),
     softmax,
 )
@@ -17,8 +18,7 @@ ps, st = Lux.setup(Xoshiro(123), model)
 using BenchmarkTools

 origout, _ = model(noisy, ps, st)
-@show origout[3]
-@btime model($noisy, $ps, $st) # 52.731 μs (10 allocations: 32.03 KiB)
+@btime model($noisy, $ps, $st) # 68.444 μs (46 allocations: 45.88 KiB)

 cmodel = Reactant.make_tracer(IdDict(), model, (), Reactant.ArrayToConcrete)
 cps = Reactant.make_tracer(IdDict(), ps, (), Reactant.ArrayToConcrete)
@@ -31,8 +31,9 @@ f = Reactant.compile((a, b, c, d) -> first(a(b, c, d)), (cmodel, cnoisy, cps, cs
 # # @show @code_typed f(cmodel,cnoisy)
 # # @show @code_llvm f(cmodel,cnoisy)
 comp = f(cmodel, cnoisy, cps, cst)
-@show comp[3]
-@btime f($cmodel, $cnoisy, $cps, $cst) # 4.430 μs (5 allocations: 160 bytes)
+@btime f($cmodel, $cnoisy, $cps, $cst) # 21.790 μs (6 allocations: 224 bytes)
+
+@test comp ≈ origout atol = 1e-5 rtol = 1e-2

 # To train the model, we use batches of 64 samples, and one-hot encoding:

@@ -81,6 +82,8 @@ compiled_gradient = Reactant.compile(
     gradient_loss_function, (cmodel, cnoisy, ctarget, cps, cst)
 )

+@test length(compiled_gradient(cmodel, cnoisy, ctarget, cps, cst)) == 2
+
 # # Training loop, using the whole data set 1000 times:
 # losses = []
 # for epoch in 1:1_000
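
For context, here is a minimal standalone sketch of the round-trip check that the new `@test comp ≈ origout` line performs: it reuses the `make_tracer`/`compile` calls visible in the diff, while the toy input (`noisy = rand(Float32, 2, 1000)`) and the `cnoisy`/`cst` setup lines are assumptions filled in for illustration and are not part of this diff.

```julia
using Reactant, Lux, Random, Test
# gelu and softmax are assumed available via Lux's NNlib re-exports, as in test/nn_lux.jl.

# Toy MLP from test/nn_lux.jl, including the BatchNorm layer added by this diff.
model = Lux.Chain(
    Lux.Dense(2 => 3, tanh),
    Lux.BatchNorm(3, gelu),
    Lux.Dense(3 => 2),
    softmax,
)
noisy = rand(Float32, 2, 1000)            # illustrative stand-in for the test's input data
ps, st = Lux.setup(Xoshiro(123), model)

# Eager (reference) forward pass.
origout, _ = model(noisy, ps, st)

# Convert model, input, parameters, and state to Reactant's concrete arrays.
# The cnoisy/cst lines are assumed to mirror the cmodel/cps lines shown in the diff context.
cmodel = Reactant.make_tracer(IdDict(), model, (), Reactant.ArrayToConcrete)
cnoisy = Reactant.make_tracer(IdDict(), noisy, (), Reactant.ArrayToConcrete)
cps = Reactant.make_tracer(IdDict(), ps, (), Reactant.ArrayToConcrete)
cst = Reactant.make_tracer(IdDict(), st, (), Reactant.ArrayToConcrete)

# Compile the forward pass and check it agrees with the eager output,
# which is what the new @test in this diff asserts.
f = Reactant.compile((a, b, c, d) -> first(a(b, c, d)), (cmodel, cnoisy, cps, cst))
comp = f(cmodel, cnoisy, cps, cst)
@test comp ≈ origout atol = 1e-5 rtol = 1e-2
```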