diff --git a/docs/Project.toml b/docs/Project.toml
index 6fcbb71..7ebfdf9 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,3 +1,4 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+DocumenterInterLinks = "d12716ef-a0f6-4df4-a9f1-a5a34e75c656"
 RecurrentLayers = "78449bcf-6750-4b78-9e82-63d4a1ccdf8c"
diff --git a/docs/make.jl b/docs/make.jl
index 04335f7..e1e25ca 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,25 +1,30 @@
 using RecurrentLayers
-using Documenter
+using Documenter, DocumenterInterLinks
 include("pages.jl")
 
 DocMeta.setdocmeta!(RecurrentLayers, :DocTestSetup, :(using RecurrentLayers); recursive=true)
 mathengine = Documenter.MathJax()
 
+links = InterLinks(  # external projects that `@extref` links in the docstrings point to
+    "Flux" => "https://fluxml.ai/Flux.jl/stable/",  # project name => docs root URL
+)
+
 makedocs(;
-    modules=[RecurrentLayers],
-    authors="Francesco Martinuzzi",
-    sitename="RecurrentLayers.jl",
-    format=Documenter.HTML(;
+    modules = [RecurrentLayers],
+    authors = "Francesco Martinuzzi",
+    sitename = "RecurrentLayers.jl",
+    format = Documenter.HTML(;
         mathengine,
         assets = ["assets/favicon.ico"],
-        canonical="https://MartinuzziFrancesco.github.io/RecurrentLayers.jl",
-        edit_link="main",
+        canonical = "https://MartinuzziFrancesco.github.io/RecurrentLayers.jl",
+        edit_link = "main",
     ),
-    pages=pages,
+    pages = pages,
+    plugins = [links],  # pass the InterLinks object so `@extref` targets can be resolved
 )
 
 deploydocs(;
-    repo="github.com/MartinuzziFrancesco/RecurrentLayers.jl",
-    devbranch="main",
-    push_preview=true,
+    repo = "github.com/MartinuzziFrancesco/RecurrentLayers.jl",
+    devbranch = "main",
+    push_preview = true,
 )
diff --git a/src/fastrnn_cell.jl b/src/fastrnn_cell.jl
index 595b92f..623a025 100644
--- a/src/fastrnn_cell.jl
+++ b/src/fastrnn_cell.jl
@@ -45,7 +45,8 @@ h_t &= \alpha \tilde{h}_t + \beta h_{t-1}
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the FastRNN. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -122,7 +123,8 @@ h_t &= \alpha \tilde{h}_t + \beta h_{t-1}
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the FastRNN. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
@@ -182,7 +184,8 @@ h_t &= \big((\zeta (1 - z_t) + \nu) \odot \tilde{h}_t\big) + z_t \odot h_{t-1}
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the FastGRNN. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -265,7 +268,8 @@ h_t &= \big((\zeta (1 - z_t) + \nu) \odot \tilde{h}_t\big) + z_t \odot h_{t-1}
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the FastGRNN. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/indrnn_cell.jl b/src/indrnn_cell.jl
index d5daa4f..3906860 100644
--- a/src/indrnn_cell.jl
+++ b/src/indrnn_cell.jl
@@ -41,7 +41,8 @@ See [`IndRNN`](@ref) for a layer that processes entire sequences.
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the IndRNNCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -105,7 +106,8 @@ See [`IndRNNCell`](@ref) for a layer that processes a single sequence.
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the IndRNN. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/lightru_cell.jl b/src/lightru_cell.jl
index 5eb4a87..b68c4e8 100644
--- a/src/lightru_cell.jl
+++ b/src/lightru_cell.jl
@@ -42,7 +42,8 @@ h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t.
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the LightRUCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -116,7 +117,8 @@ h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t.
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the LightRU. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/ligru_cell.jl b/src/ligru_cell.jl
index e8e30c1..8120e1d 100644
--- a/src/ligru_cell.jl
+++ b/src/ligru_cell.jl
@@ -44,7 +44,8 @@ h_t &= z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the LiGRUCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -118,7 +119,8 @@ h_t &= z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the LiGRU. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/mgu_cell.jl b/src/mgu_cell.jl
index ce78015..b199e7f 100644
--- a/src/mgu_cell.jl
+++ b/src/mgu_cell.jl
@@ -42,7 +42,8 @@ h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the MGUCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -115,7 +116,8 @@ h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the MGU. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/mut_cell.jl b/src/mut_cell.jl
index 0c9c60b..afac63c 100644
--- a/src/mut_cell.jl
+++ b/src/mut_cell.jl
@@ -43,7 +43,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + \tanh(W_h x_t) + b_h) \odot z \\
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the MUTCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -119,7 +120,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + \tanh(W_h x_t) + b_h) \odot z \\
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the MUT. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
@@ -174,7 +176,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the MUTCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -250,7 +253,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the MUT. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
@@ -305,7 +309,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
   or a matrix of size `input_size x batch_size`.
 - `state`: The hidden state of the MUTCell. It should be a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where both elements are given by the updated state `new_state`, 
@@ -379,7 +384,8 @@ h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
   or a matrix of size `input_size x len x batch_size`.
 - `state`: The hidden state of the MUT. If given, it is a vector of size
   `hidden_size` or a matrix of size `hidden_size x batch_size`.
-  If not provided, it is assumed to be a vector of zeros.
+  If not provided, it is assumed to be a vector of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/nas_cell.jl b/src/nas_cell.jl
index 868fda0..e747700 100644
--- a/src/nas_cell.jl
+++ b/src/nas_cell.jl
@@ -87,7 +87,8 @@ h_{\text{new}} &= \tanh(c_{\text{new}} \cdot l_5)
   or a matrix of size `input_size x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the NASCell.
   They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-  If not provided, they are assumed to be vectors of zeros.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where `output = new_state` is the new hidden state and
@@ -202,8 +203,9 @@ h_{\text{new}} &= \tanh(c_{\text{new}} \cdot l_5)
 - `inp`: The input to the nas. It should be a vector of size `input_size x len`
   or a matrix of size `input_size x len x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the NAS. 
-    They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-    If not provided, they are assumed to be vectors of zeros
+  They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/peepholelstm_cell.jl b/src/peepholelstm_cell.jl
index a4a74c2..8ec7889 100644
--- a/src/peepholelstm_cell.jl
+++ b/src/peepholelstm_cell.jl
@@ -46,7 +46,8 @@ h_t &= o_t \odot \sigma_h(c_t).
   or a matrix of size `input_size x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the PeepholeLSTMCell.
   They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-  If not provided, they are assumed to be vectors of zeros.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where `output = new_state` is the new hidden state and
@@ -120,8 +121,9 @@ h_t &= o_t \odot \sigma_h(c_t).
 - `inp`: The input to the peepholelstm. It should be a vector of size `input_size x len`
   or a matrix of size `input_size x len x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the PeepholeLSTM. 
-    They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-    If not provided, they are assumed to be vectors of zeros
+  They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/ran_cell.jl b/src/ran_cell.jl
index c2f393f..6be718f 100644
--- a/src/ran_cell.jl
+++ b/src/ran_cell.jl
@@ -47,7 +47,8 @@ h_t         &= g(c_t)
   or a matrix of size `input_size x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the RANCell.
   They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-  If not provided, they are assumed to be vectors of zeros.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where `output = new_state` is the new hidden state and
@@ -128,8 +129,9 @@ h_t         &= g(c_t)
 - `inp`: The input to the ran. It should be a vector of size `input_size x len`
   or a matrix of size `input_size x len x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the RAN. 
-    They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-    If not provided, they are assumed to be vectors of zeros
+  They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.
diff --git a/src/scrn_cell.jl b/src/scrn_cell.jl
index 2d1ab61..48ebf70 100644
--- a/src/scrn_cell.jl
+++ b/src/scrn_cell.jl
@@ -48,7 +48,8 @@ y_t &= f(U_y h_t + W_y s_t)
   or a matrix of size `input_size x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the SCRNCell.
   They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-  If not provided, they are assumed to be vectors of zeros.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - A tuple `(output, state)`, where `output = new_state` is the new hidden state and
@@ -130,8 +131,9 @@ y_t &= f(U_y h_t + W_y s_t)
 - `inp`: The input to the scrn. It should be a vector of size `input_size x len`
   or a matrix of size `input_size x len x batch_size`.
 - `(state, cstate)`: A tuple containing the hidden and cell states of the SCRN. 
-    They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
-    If not provided, they are assumed to be vectors of zeros
+  They should be vectors of size `hidden_size` or matrices of size `hidden_size x batch_size`.
+  If not provided, they are assumed to be vectors of zeros,
+  initialized by [`Flux.initialstates`](@extref).
 
 ## Returns
 - New hidden states `new_states` as an array of size `hidden_size x len x batch_size`.