From b71c6d8cde55b4a2101dd5307be03476d2cf0c01 Mon Sep 17 00:00:00 2001
From: MartinuzziFrancesco
Date: Wed, 27 Nov 2024 19:56:17 +0100
Subject: [PATCH] finalizing docstrings

---
 src/indrnn_cell.jl  |  7 ++++++-
 src/lightru_cell.jl | 11 ++++++++++-
 src/ligru_cell.jl   | 11 ++++++++++-
 src/mgu_cell.jl     | 11 ++++++++++-
 src/mut_cell.jl     | 36 +++++++++++++++++++++++++++++++++---
 src/nas_cell.jl     | 31 ++++++++++++++++++++++++++++++-
 src/ran_cell.jl     | 12 +++++++++++-
 src/rhn_cell.jl     | 13 ++++++++++++-
 src/scrn_cell.jl    | 11 ++++++++++-
 9 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/src/indrnn_cell.jl b/src/indrnn_cell.jl
index b523ab3..1a15fd8 100644
--- a/src/indrnn_cell.jl
+++ b/src/indrnn_cell.jl
@@ -70,7 +70,7 @@ end

 Flux.@layer :expand IndRNN

-"""
+@doc raw"""
     IndRNN((input_size, hidden_size)::Pair, σ = tanh;
         kwargs...)

@@ -84,6 +84,11 @@ See [`IndRNNCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\mathbf{h}_{t} = \sigma(\mathbf{W} \mathbf{x}_t + \mathbf{u} \odot \mathbf{h}_{t-1} + \mathbf{b})
+```
 """
 function IndRNN((input_size, hidden_size)::Pair, σ = tanh; kwargs...)
     cell = IndRNNCell(input_size, hidden_size, σ; kwargs...)
diff --git a/src/lightru_cell.jl b/src/lightru_cell.jl
index aac2946..1f2f081 100644
--- a/src/lightru_cell.jl
+++ b/src/lightru_cell.jl
@@ -77,7 +77,7 @@ end

 Flux.@layer :expand LightRU

-"""
+@doc raw"""
     LightRU((input_size => hidden_size)::Pair; kwargs...)

 [Light recurrent unit network](https://www.mdpi.com/2079-9292/13/16/3204).
@@ -89,6 +89,15 @@ See [`LightRUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+\tilde{h}_t &= \tanh(W_h x_t), \\
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+h_t &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t.
+\end{aligned}
+```
 """
 function LightRU((input_size, hidden_size)::Pair; kwargs...)
     cell = LightRUCell(input_size => hidden_size; kwargs...)
diff --git a/src/ligru_cell.jl b/src/ligru_cell.jl
index b217c29..b09e842 100644
--- a/src/ligru_cell.jl
+++ b/src/ligru_cell.jl
@@ -75,7 +75,7 @@ end

 Flux.@layer :expand LiGRU

-"""
+@doc raw"""
     LiGRU((input_size => hidden_size)::Pair; kwargs...)

 [Light gated recurrent network](https://arxiv.org/pdf/1803.10225).
@@ -89,6 +89,15 @@ See [`LiGRUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z_t &= \sigma(W_z x_t + U_z h_{t-1}), \\
+\tilde{h}_t &= \text{ReLU}(W_h x_t + U_h h_{t-1}), \\
+h_t &= z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t
+\end{aligned}
+```
 """
 function LiGRU((input_size, hidden_size)::Pair; kwargs...)
     cell = LiGRUCell(input_size => hidden_size; kwargs...)
diff --git a/src/mgu_cell.jl b/src/mgu_cell.jl
index 70f23f3..3d81106 100644
--- a/src/mgu_cell.jl
+++ b/src/mgu_cell.jl
@@ -76,7 +76,7 @@ end

 Flux.@layer :expand MGU

-"""
+@doc raw"""
     MGU((input_size => hidden_size)::Pair; kwargs...)

 [Minimal gated unit network](https://arxiv.org/pdf/1603.09420).
@@ -88,6 +88,15 @@ See [`MGUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+\tilde{h}_t &= \tanh(W_h x_t + U_h (f_t \odot h_{t-1}) + b_h), \\
+h_t &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t
+\end{aligned}
+```
 """
 function MGU((input_size, hidden_size)::Pair; kwargs...)
     cell = MGUCell(input_size => hidden_size; kwargs...)
diff --git a/src/mut_cell.jl b/src/mut_cell.jl
index 896aab6..835ed3a 100644
--- a/src/mut_cell.jl
+++ b/src/mut_cell.jl
@@ -79,7 +79,7 @@ end

 Flux.@layer :expand MUT1

-"""
+@doc raw"""
     MUT1((input_size => hidden_size); kwargs...)

 [Mutated unit 1 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -91,6 +91,16 @@ See [`MUT1Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + b_z), \\
+r &= \sigma(W_r x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + \tanh(W_h x_t) + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT1((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT1Cell(input_size => hidden_size; kwargs...)
@@ -194,7 +204,7 @@ end

 Flux.@layer :expand MUT2

-"""
+@doc raw"""
     MUT2((input_size => hidden_size); kwargs...)

 [Mutated unit 2 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -206,6 +216,16 @@ See [`MUT2Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + U_z h_t + b_z), \\
+r &= \sigma(x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT2((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT2Cell(input_size => hidden_size; kwargs...)
@@ -306,7 +326,7 @@ end

 Flux.@layer :expand MUT3

-"""
+@doc raw"""
     MUT3((input_size => hidden_size); kwargs...)

 [Mutated unit 3 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -318,6 +338,16 @@ See [`MUT3Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + U_z \tanh(h_t) + b_z), \\
+r &= \sigma(W_r x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT3((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT3Cell(input_size => hidden_size; kwargs...)
diff --git a/src/nas_cell.jl b/src/nas_cell.jl
index 714636c..b5705fb 100644
--- a/src/nas_cell.jl
+++ b/src/nas_cell.jl
@@ -142,7 +142,7 @@ end

 Flux.@layer :expand NAS

-"""
+@doc raw"""
     NAS((input_size => hidden_size)::Pair; kwargs...)

@@ -155,6 +155,35 @@ See [`NASCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+\text{First Layer Outputs:} & \\
+o_1 &= \sigma(W_i^{(1)} x_t + W_h^{(1)} h_{t-1} + b^{(1)}), \\
+o_2 &= \text{ReLU}(W_i^{(2)} x_t + W_h^{(2)} h_{t-1} + b^{(2)}), \\
+o_3 &= \sigma(W_i^{(3)} x_t + W_h^{(3)} h_{t-1} + b^{(3)}), \\
+o_4 &= \text{ReLU}(W_i^{(4)} x_t \odot W_h^{(4)} h_{t-1}), \\
+o_5 &= \tanh(W_i^{(5)} x_t + W_h^{(5)} h_{t-1} + b^{(5)}), \\
+o_6 &= \sigma(W_i^{(6)} x_t + W_h^{(6)} h_{t-1} + b^{(6)}), \\
+o_7 &= \tanh(W_i^{(7)} x_t + W_h^{(7)} h_{t-1} + b^{(7)}), \\
+o_8 &= \sigma(W_i^{(8)} x_t + W_h^{(8)} h_{t-1} + b^{(8)}). \\
+
+\text{Second Layer Computations:} & \\
+l_1 &= \tanh(o_1 \odot o_2) \\
+l_2 &= \tanh(o_3 + o_4) \\
+l_3 &= \tanh(o_5 \odot o_6) \\
+l_4 &= \sigma(o_7 + o_8) \\
+
+\text{Inject Cell State:} & \\
+l_1 &= \tanh(l_1 + c_{\text{state}}) \\
+
+\text{Final Layer Computations:} & \\
+c_{\text{new}} &= l_1 \odot l_2 \\
+l_5 &= \tanh(l_3 + l_4) \\
+h_{\text{new}} &= \tanh(c_{\text{new}} \odot l_5)
+\end{aligned}
+```
 """
 function NAS((input_size, hidden_size)::Pair; kwargs...)
     cell = NASCell(input_size => hidden_size; kwargs...)
diff --git a/src/ran_cell.jl b/src/ran_cell.jl
index ce0969e..6482cb5 100644
--- a/src/ran_cell.jl
+++ b/src/ran_cell.jl
@@ -108,7 +108,7 @@ end

 Flux.@layer :expand RAN

-"""
+@doc raw"""
     RAN(input_size => hidden_size; kwargs...)

 The `RANCell`, introduced in [this paper](https://arxiv.org/pdf/1705.07393),
@@ -126,6 +126,16 @@ See [`RANCell`](@ref) for a layer that processes a single sequence.
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`

+# Equations
+```math
+\begin{aligned}
+\tilde{c}_t &= W_c x_t, \\
+i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i), \\
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+c_t &= i_t \odot \tilde{c}_t + f_t \odot c_{t-1}, \\
+h_t &= g(c_t)
+\end{aligned}
+```
 """
 function RAN((input_size, hidden_size)::Pair; kwargs...)
     cell = RANCell(input_size => hidden_size; kwargs...)
diff --git a/src/rhn_cell.jl b/src/rhn_cell.jl
index 59b8c68..554e217 100644
--- a/src/rhn_cell.jl
+++ b/src/rhn_cell.jl
@@ -140,7 +140,7 @@ end

 Flux.@layer :expand RHN

-"""
+@doc raw"""
     RHN((input_size => hidden_size)::Pair, depth=3; kwargs...)

 [Recurrent highway network](https://arxiv.org/pdf/1607.03474).
@@ -154,6 +154,17 @@ See [`RHNCell`](@ref) for a layer that processes a single sequence.
 - `couple_carry`: couples the carry gate and the transform gate. Default `true`
 - `init_kernel`: initializer for the input to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+s_{\ell}^{[t]} &= h_{\ell}^{[t]} \odot t_{\ell}^{[t]} + s_{\ell-1}^{[t]} \odot c_{\ell}^{[t]}, \\
+\text{where} \\
+h_{\ell}^{[t]} &= \tanh(W_h x^{[t]}\mathbb{I}_{\ell = 1} + U_{h_{\ell}} s_{\ell-1}^{[t]} + b_{h_{\ell}}), \\
+t_{\ell}^{[t]} &= \sigma(W_t x^{[t]}\mathbb{I}_{\ell = 1} + U_{t_{\ell}} s_{\ell-1}^{[t]} + b_{t_{\ell}}), \\
+c_{\ell}^{[t]} &= \sigma(W_c x^{[t]}\mathbb{I}_{\ell = 1} + U_{c_{\ell}} s_{\ell-1}^{[t]} + b_{c_{\ell}})
+\end{aligned}
+```
 """
 function RHN((input_size, hidden_size)::Pair, depth=3; kwargs...)
     cell = RHNCell(input_size => hidden_size, depth; kwargs...)
diff --git a/src/scrn_cell.jl b/src/scrn_cell.jl
index a5f8350..3fa6be7 100644
--- a/src/scrn_cell.jl
+++ b/src/scrn_cell.jl
@@ -86,7 +86,7 @@ end

 Flux.@layer :expand SCRN

-"""
+@doc raw"""
     SCRN((input_size => hidden_size)::Pair;
         init_kernel = glorot_uniform,
         init_recurrent_kernel = glorot_uniform,
@@ -103,6 +103,15 @@ See [`SCRNCell`](@ref) for a layer that processes a single sequence.
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
 - `alpha`: structural constraint. Default is `0.0`
+
+# Equations
+```math
+\begin{aligned}
+s_t &= (1 - \alpha) W_s x_t + \alpha s_{t-1}, \\
+h_t &= \sigma(W_h s_t + U_h h_{t-1} + b_h), \\
+y_t &= f(U_y h_t + W_y s_t)
+\end{aligned}
+```
 """
 function SCRN((input_size, hidden_size)::Pair; kwargs...)
     cell = SCRNCell(input_size => hidden_size; kwargs...)
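
---

A quick usage sketch of the constructors documented above, for reference during review. The `Pair`-based signatures, the `depth` argument of `RHN`, and the activation argument of `IndRNN` are taken verbatim from the docstrings in this patch; the package name in the `using` line and the `features × seq_len × batch` input layout are assumptions following Flux's recurrent-layer conventions, not something this diff specifies.

```julia
using Flux
using RecurrentLayers  # assumed package name; the layers below are the ones documented above

input_size, hidden_size = 4, 8
seq_len, batch_size = 10, 3

# Every documented constructor takes an input_size => hidden_size Pair:
mgu    = MGU(input_size => hidden_size)
rhn    = RHN(input_size => hidden_size, 2)           # depth = 2, per the corrected signature
indrnn = IndRNN(input_size => hidden_size, tanh)     # activation as the second argument

# Assumed input layout: features × seq_len × batch.
x = rand(Float32, input_size, seq_len, batch_size)

h = mgu(x)  # hidden states over the sequence
@assert size(h, 1) == hidden_size

# The wrappers are registered with Flux.@layer :expand, so they should
# compose with Chain and standard Flux training loops:
model = Chain(mgu, Dense(hidden_size => 1))
y = model(x)
```

If the wrappers scan over a different dimension than assumed here, only the layout of `x` changes; the constructor calls match the docstrings as written.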