From b71c6d8cde55b4a2101dd5307be03476d2cf0c01 Mon Sep 17 00:00:00 2001
From: MartinuzziFrancesco
Date: Wed, 27 Nov 2024 19:56:17 +0100
Subject: [PATCH] finalizing docstrings

---
 src/indrnn_cell.jl  |  7 ++++++-
 src/lightru_cell.jl | 11 ++++++++++-
 src/ligru_cell.jl   | 11 ++++++++++-
 src/mgu_cell.jl     | 11 ++++++++++-
 src/mut_cell.jl     | 36 +++++++++++++++++++++++++++++++++---
 src/nas_cell.jl     | 31 ++++++++++++++++++++++++++++++-
 src/ran_cell.jl     | 12 +++++++++++-
 src/rhn_cell.jl     | 13 ++++++++++++-
 src/scrn_cell.jl    | 11 ++++++++++-
 9 files changed, 132 insertions(+), 11 deletions(-)

diff --git a/src/indrnn_cell.jl b/src/indrnn_cell.jl
index b523ab3..1a15fd8 100644
--- a/src/indrnn_cell.jl
+++ b/src/indrnn_cell.jl
@@ -70,7 +70,7 @@ end

 Flux.@layer :expand IndRNN

-"""
+@doc raw"""
     IndRNN((input_size, hidden_size)::Pair, σ = tanh;
         kwargs...)

@@ -84,6 +84,11 @@ See [`IndRNNCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\mathbf{h}_{t} = \sigma(\mathbf{W} \mathbf{x}_t + \mathbf{u} \odot \mathbf{h}_{t-1} + \mathbf{b})
+```
 """
 function IndRNN((input_size, hidden_size)::Pair, σ = tanh; kwargs...)
     cell = IndRNNCell(input_size, hidden_size, σ; kwargs...)
diff --git a/src/lightru_cell.jl b/src/lightru_cell.jl
index aac2946..1f2f081 100644
--- a/src/lightru_cell.jl
+++ b/src/lightru_cell.jl
@@ -77,7 +77,7 @@ end

 Flux.@layer :expand LightRU

-"""
+@doc raw"""
     LightRU((input_size => hidden_size)::Pair; kwargs...)

 [Light recurrent unit network](https://www.mdpi.com/2079-9292/13/16/3204).
@@ -89,6 +89,15 @@ See [`LightRUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+\tilde{h}_t &= \tanh(W_h x_t), \\
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+h_t &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t.
+\end{aligned}
+```
 """
 function LightRU((input_size, hidden_size)::Pair; kwargs...)
     cell = LightRUCell(input_size => hidden_size; kwargs...)
diff --git a/src/ligru_cell.jl b/src/ligru_cell.jl
index b217c29..b09e842 100644
--- a/src/ligru_cell.jl
+++ b/src/ligru_cell.jl
@@ -75,7 +75,7 @@ end

 Flux.@layer :expand LiGRU

-"""
+@doc raw"""
     LiGRU((input_size => hidden_size)::Pair; kwargs...)

 [Light gated recurrent network](https://arxiv.org/pdf/1803.10225).
@@ -89,6 +89,15 @@ See [`LiGRUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z_t &= \sigma(W_z x_t + U_z h_{t-1}), \\
+\tilde{h}_t &= \text{ReLU}(W_h x_t + U_h h_{t-1}), \\
+h_t &= z_t \odot h_{t-1} + (1 - z_t) \odot \tilde{h}_t
+\end{aligned}
+```
 """
 function LiGRU((input_size, hidden_size)::Pair; kwargs...)
     cell = LiGRUCell(input_size => hidden_size; kwargs...)
diff --git a/src/mgu_cell.jl b/src/mgu_cell.jl
index 70f23f3..3d81106 100644
--- a/src/mgu_cell.jl
+++ b/src/mgu_cell.jl
@@ -76,7 +76,7 @@ end

 Flux.@layer :expand MGU

-"""
+@doc raw"""
     MGU((input_size => hidden_size)::Pair; kwargs...)

 [Minimal gated unit network](https://arxiv.org/pdf/1603.09420).
@@ -88,6 +88,15 @@ See [`MGUCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+\tilde{h}_t &= \tanh(W_h x_t + U_h (f_t \odot h_{t-1}) + b_h), \\
+h_t &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t
+\end{aligned}
+```
 """
 function MGU((input_size, hidden_size)::Pair; kwargs...)
     cell = MGUCell(input_size => hidden_size; kwargs...)
diff --git a/src/mut_cell.jl b/src/mut_cell.jl
index 896aab6..835ed3a 100644
--- a/src/mut_cell.jl
+++ b/src/mut_cell.jl
@@ -79,7 +79,7 @@ end

 Flux.@layer :expand MUT1

-"""
+@doc raw"""
     MUT1((input_size => hidden_size); kwargs...)

 [Mutated unit 1 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -91,6 +91,16 @@ See [`MUT1Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + b_z), \\
+r &= \sigma(W_r x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + \tanh(W_h x_t) + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT1((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT1Cell(input_size => hidden_size; kwargs...)
@@ -194,7 +204,7 @@ end

 Flux.@layer :expand MUT2

-"""
+@doc raw"""
     MUT2((input_size => hidden_size); kwargs...)

 [Mutated unit 2 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -206,6 +216,16 @@ See [`MUT2Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + U_z h_t + b_z), \\
+r &= \sigma(x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT2((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT2Cell(input_size => hidden_size; kwargs...)
@@ -306,7 +326,7 @@ end

 Flux.@layer :expand MUT3

-"""
+@doc raw"""
     MUT3((input_size => hidden_size); kwargs...)

 [Mutated unit 3 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
@@ -318,6 +338,16 @@ See [`MUT3Cell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+z &= \sigma(W_z x_t + U_z \tanh(h_t) + b_z), \\
+r &= \sigma(W_r x_t + U_r h_t + b_r), \\
+h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
+&\quad + h_t \odot (1 - z).
+\end{aligned}
+```
 """
 function MUT3((input_size, hidden_size)::Pair; kwargs...)
     cell = MUT3Cell(input_size => hidden_size; kwargs...)
diff --git a/src/nas_cell.jl b/src/nas_cell.jl
index 714636c..b5705fb 100644
--- a/src/nas_cell.jl
+++ b/src/nas_cell.jl
@@ -142,7 +142,7 @@ end

 Flux.@layer :expand NAS

-"""
+@doc raw"""
     NAS((input_size => hidden_size)::Pair; kwargs...)

@@ -155,6 +155,35 @@ See [`NASCell`](@ref) for a layer that processes a single sequence.
 - `init_kernel`: initializer for the input to hidden weights
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+\text{First Layer Outputs:} & \\
+o_1 &= \sigma(W_i^{(1)} x_t + W_h^{(1)} h_{t-1} + b^{(1)}), \\
+o_2 &= \text{ReLU}(W_i^{(2)} x_t + W_h^{(2)} h_{t-1} + b^{(2)}), \\
+o_3 &= \sigma(W_i^{(3)} x_t + W_h^{(3)} h_{t-1} + b^{(3)}), \\
+o_4 &= \text{ReLU}(W_i^{(4)} x_t \odot W_h^{(4)} h_{t-1}), \\
+o_5 &= \tanh(W_i^{(5)} x_t + W_h^{(5)} h_{t-1} + b^{(5)}), \\
+o_6 &= \sigma(W_i^{(6)} x_t + W_h^{(6)} h_{t-1} + b^{(6)}), \\
+o_7 &= \tanh(W_i^{(7)} x_t + W_h^{(7)} h_{t-1} + b^{(7)}), \\
+o_8 &= \sigma(W_i^{(8)} x_t + W_h^{(8)} h_{t-1} + b^{(8)}). \\
+
+\text{Second Layer Computations:} & \\
+l_1 &= \tanh(o_1 \odot o_2) \\
+l_2 &= \tanh(o_3 + o_4) \\
+l_3 &= \tanh(o_5 \odot o_6) \\
+l_4 &= \sigma(o_7 + o_8) \\
+
+\text{Inject Cell State:} & \\
+l_1 &= \tanh(l_1 + c_{\text{state}}) \\
+
+\text{Final Layer Computations:} & \\
+c_{\text{new}} &= l_1 \odot l_2 \\
+l_5 &= \tanh(l_3 + l_4) \\
+h_{\text{new}} &= \tanh(c_{\text{new}} \odot l_5)
+\end{aligned}
+```
 """
 function NAS((input_size, hidden_size)::Pair; kwargs...)
     cell = NASCell(input_size => hidden_size; kwargs...)
diff --git a/src/ran_cell.jl b/src/ran_cell.jl
index ce0969e..6482cb5 100644
--- a/src/ran_cell.jl
+++ b/src/ran_cell.jl
@@ -108,7 +108,7 @@ end

 Flux.@layer :expand RAN

-"""
+@doc raw"""
     RAN(input_size => hidden_size; kwargs...)

 The `RANCell`, introduced in [this paper](https://arxiv.org/pdf/1705.07393),
@@ -126,6 +126,16 @@ See [`RANCell`](@ref) for a layer that processes a single sequence.
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`

+# Equations
+```math
+\begin{aligned}
+\tilde{c}_t &= W_c x_t, \\
+i_t &= \sigma(W_i x_t + U_i h_{t-1} + b_i), \\
+f_t &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+c_t &= i_t \odot \tilde{c}_t + f_t \odot c_{t-1}, \\
+h_t &= g(c_t)
+\end{aligned}
+```
 """
 function RAN((input_size, hidden_size)::Pair; kwargs...)
     cell = RANCell(input_size => hidden_size; kwargs...)
diff --git a/src/rhn_cell.jl b/src/rhn_cell.jl
index 59b8c68..554e217 100644
--- a/src/rhn_cell.jl
+++ b/src/rhn_cell.jl
@@ -140,7 +140,7 @@ end

 Flux.@layer :expand RHN

-"""
+@doc raw"""
     RHN((input_size => hidden_size)::Pair, depth=3; kwargs...)

 [Recurrent highway network](https://arxiv.org/pdf/1607.03474).
@@ -154,6 +154,17 @@ See [`RHNCell`](@ref) for a layer that processes a single sequence.
 - `couple_carry`: couples the carry gate and the transform gate. Default `true`
 - `init_kernel`: initializer for the input to hidden weights
 - `bias`: include a bias or not. Default is `true`
+
+# Equations
+```math
+\begin{aligned}
+s_{\ell}^{[t]} &= h_{\ell}^{[t]} \odot t_{\ell}^{[t]} + s_{\ell-1}^{[t]} \odot c_{\ell}^{[t]}, \\
+\text{where} \\
+h_{\ell}^{[t]} &= \tanh(W_h x^{[t]}\mathbb{I}_{\ell = 1} + U_{h_{\ell}} s_{\ell-1}^{[t]} + b_{h_{\ell}}), \\
+t_{\ell}^{[t]} &= \sigma(W_t x^{[t]}\mathbb{I}_{\ell = 1} + U_{t_{\ell}} s_{\ell-1}^{[t]} + b_{t_{\ell}}), \\
+c_{\ell}^{[t]} &= \sigma(W_c x^{[t]}\mathbb{I}_{\ell = 1} + U_{c_{\ell}} s_{\ell-1}^{[t]} + b_{c_{\ell}})
+\end{aligned}
+```
 """
 function RHN((input_size, hidden_size)::Pair, depth=3; kwargs...)
     cell = RHNCell(input_size => hidden_size, depth; kwargs...)
diff --git a/src/scrn_cell.jl b/src/scrn_cell.jl
index a5f8350..3fa6be7 100644
--- a/src/scrn_cell.jl
+++ b/src/scrn_cell.jl
@@ -86,7 +86,7 @@ end

 Flux.@layer :expand SCRN

-"""
+@doc raw"""
     SCRN((input_size => hidden_size)::Pair;
         init_kernel = glorot_uniform,
         init_recurrent_kernel = glorot_uniform,
@@ -103,6 +103,15 @@ See [`SCRNCell`](@ref) for a layer that processes a single sequence.
 - `init_recurrent_kernel`: initializer for the hidden to hidden weights
 - `bias`: include a bias or not. Default is `true`
 - `alpha`: structural constraint. Default is `0.0`
+
+# Equations
+```math
+\begin{aligned}
+s_t &= (1 - \alpha) W_s x_t + \alpha s_{t-1}, \\
+h_t &= \sigma(W_h s_t + U_h h_{t-1} + b_h), \\
+y_t &= f(U_y h_t + W_y s_t)
+\end{aligned}
+```
 """
 function SCRN((input_size, hidden_size)::Pair; kwargs...)
     cell = SCRNCell(input_size => hidden_size; kwargs...)
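
---

A quick usage sketch of the constructors documented above, for reference during review. The `Pair`-based signatures, the `depth` argument of `RHN`, and the activation argument of `IndRNN` are taken verbatim from the docstrings in this patch; the package name in the `using` line and the `features × seq_len × batch` input layout are assumptions following Flux's recurrent-layer conventions, not something this diff specifies.

```julia
using Flux
using RecurrentLayers  # assumed package name; the layers below are the ones documented above

input_size, hidden_size = 4, 8
seq_len, batch_size = 10, 3

# Every documented constructor takes an input_size => hidden_size Pair:
mgu    = MGU(input_size => hidden_size)
rhn    = RHN(input_size => hidden_size, 2)           # depth = 2, per the corrected signature
indrnn = IndRNN(input_size => hidden_size, tanh)     # activation as the second argument

# Assumed input layout: features × seq_len × batch.
x = rand(Float32, input_size, seq_len, batch_size)

h = mgu(x)  # hidden states over the sequence
@assert size(h, 1) == hidden_size

# The wrappers are registered with Flux.@layer :expand, so they should
# compose with Chain and standard Flux training loops:
model = Chain(mgu, Dense(hidden_size => 1))
y = model(x)
```

If the wrappers scan over a different dimension than assumed here, only the layout of `x` changes; the constructor calls match the docstrings as written.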