Merge pull request #11 from MartinuzziFrancesco/fm/docs
More docstrings
MartinuzziFrancesco authored Nov 24, 2024
2 parents 77d18f7 + 53c62ed commit 5164499
Showing 4 changed files with 163 additions and 6 deletions.
1 change: 1 addition & 0 deletions docs/make.jl
@@ -21,4 +21,5 @@ makedocs(;
deploydocs(;
    repo="github.com/MartinuzziFrancesco/RecurrentLayers.jl",
    devbranch="main",
    push_preview=true,
)
108 changes: 105 additions & 3 deletions src/mut_cell.jl
@@ -7,11 +7,35 @@ end

Flux.@layer MUT1Cell

"""
@doc raw"""
MUT1Cell((input_size => hidden_size);
init_kernel = glorot_uniform,
init_recurrent_kernel = glorot_uniform,
bias = true)
[Mutated unit 1 cell](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT1`](@ref) for a layer that processes entire sequences.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
# Equations
```math
\begin{aligned}
z &= \sigma(W_z x_t + b_z), \\
r &= \sigma(W_r x_t + U_r h_t + b_r), \\
h_{t+1} &= \tanh(U_h (r \odot h_t) + \tanh(W_h x_t) + b_h) \odot z \\
&\quad + h_t \odot (1 - z).
\end{aligned}
```
# Forward
rnncell(inp, [state])
"""
function MUT1Cell((input_size, hidden_size)::Pair;
    init_kernel = glorot_uniform,
@@ -57,6 +81,16 @@ Flux.@layer :expand MUT1

"""
MUT1((input_size => hidden_size); kwargs...)
[Mutated unit 1 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT1Cell`](@ref) for a layer that processes a single sequence.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
"""
function MUT1((input_size, hidden_size)::Pair; kwargs...)
    cell = MUT1Cell(input_size => hidden_size; kwargs...)
@@ -88,11 +122,35 @@ end

Flux.@layer MUT2Cell

"""
@doc raw"""
MUT2Cell((input_size => hidden_size);
init_kernel = glorot_uniform,
init_recurrent_kernel = glorot_uniform,
bias = true)
[Mutated unit 2 cell](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT2`](@ref) for a layer that processes entire sequences.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
# Equations
```math
\begin{aligned}
z &= \sigma(W_z x_t + U_z h_t + b_z), \\
r &= \sigma(x_t + U_r h_t + b_r), \\
h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
&\quad + h_t \odot (1 - z).
\end{aligned}
```
# Forward
rnncell(inp, [state])
"""
function MUT2Cell((input_size, hidden_size)::Pair;
    init_kernel = glorot_uniform,
@@ -138,6 +196,16 @@ Flux.@layer :expand MUT2

"""
MUT2Cell((input_size => hidden_size); kwargs...)
[Mutated unit 2 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT2Cell`](@ref) for a layer that processes a single sequence.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
"""
function MUT2((input_size, hidden_size)::Pair; kwargs...)
    cell = MUT2Cell(input_size => hidden_size; kwargs...)
@@ -168,11 +236,35 @@ end

Flux.@layer MUT3Cell

"""
@doc raw"""
MUT3Cell((input_size => hidden_size);
init_kernel = glorot_uniform,
init_recurrent_kernel = glorot_uniform,
bias = true)
[Mutated unit 3 cell](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT3`](@ref) for a layer that processes entire sequences.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
# Equations
```math
\begin{aligned}
z &= \sigma(W_z x_t + U_z \tanh(h_t) + b_z), \\
r &= \sigma(W_r x_t + U_r h_t + b_r), \\
h_{t+1} &= \tanh(U_h (r \odot h_t) + W_h x_t + b_h) \odot z \\
&\quad + h_t \odot (1 - z).
\end{aligned}
```
# Forward
rnncell(inp, [state])
"""
function MUT3Cell((input_size, hidden_size)::Pair;
    init_kernel = glorot_uniform,
@@ -216,6 +308,16 @@ Flux.@layer :expand MUT3

"""
MUT3((input_size => hidden_size); kwargs...)
[Mutated unit 3 network](https://proceedings.mlr.press/v37/jozefowicz15.pdf).
See [`MUT3Cell`](@ref) for a layer that processes a single sequence.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
"""
function MUT3((input_size, hidden_size)::Pair; kwargs...)
    cell = MUT3Cell(input_size => hidden_size; kwargs...)
56 changes: 55 additions & 1 deletion src/nas_cell.jl
@@ -31,11 +31,54 @@ end

Flux.@layer NASCell

"""
@doc raw"""
NASCell((input_size => hidden_size);
init_kernel = glorot_uniform,
init_recurrent_kernel = glorot_uniform,
bias = true)
[Neural Architecture Search unit](https://arxiv.org/pdf/1611.01578).
See [`NAS`](@ref) for a layer that processes entire sequences.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
# Equations
```math
\begin{aligned}
\text{First Layer Outputs:} & \\
o_1 &= \sigma(W_i^{(1)} x_t + W_h^{(1)} h_{t-1} + b^{(1)}), \\
o_2 &= \text{ReLU}(W_i^{(2)} x_t + W_h^{(2)} h_{t-1} + b^{(2)}), \\
o_3 &= \sigma(W_i^{(3)} x_t + W_h^{(3)} h_{t-1} + b^{(3)}), \\
o_4 &= \text{ReLU}(W_i^{(4)} x_t \cdot W_h^{(4)} h_{t-1}), \\
o_5 &= \tanh(W_i^{(5)} x_t + W_h^{(5)} h_{t-1} + b^{(5)}), \\
o_6 &= \sigma(W_i^{(6)} x_t + W_h^{(6)} h_{t-1} + b^{(6)}), \\
o_7 &= \tanh(W_i^{(7)} x_t + W_h^{(7)} h_{t-1} + b^{(7)}), \\
o_8 &= \sigma(W_i^{(8)} x_t + W_h^{(8)} h_{t-1} + b^{(8)}). \\
\text{Second Layer Computations:} & \\
l_1 &= \tanh(o_1 \cdot o_2) \\
l_2 &= \tanh(o_3 + o_4) \\
l_3 &= \tanh(o_5 \cdot o_6) \\
l_4 &= \sigma(o_7 + o_8) \\
\text{Inject Cell State:} & \\
l_1 &= \tanh(l_1 + c_{\text{state}}) \\
\text{Final Layer Computations:} & \\
c_{\text{new}} &= l_1 \cdot l_2 \\
l_5 &= \tanh(l_3 + l_4) \\
h_{\text{new}} &= \tanh(c_{\text{new}} \cdot l_5)
\end{aligned}
```
# Forward
rnncell(inp, [state])
"""
function NASCell((input_size, hidden_size)::Pair;
    init_kernel = glorot_uniform,
@@ -101,6 +144,17 @@ Flux.@layer :expand NAS

"""
NAS((input_size => hidden_size)::Pair; kwargs...)
[Neural Architecture Search unit](https://arxiv.org/pdf/1611.01578).
See [`NASCell`](@ref) for a layer that processes a single sequence.
# Arguments
- `input_size => hidden_size`: input and inner dimension of the layer
- `init_kernel`: initializer for the input to hidden weights
- `init_recurrent_kernel`: initializer for the hidden to hidden weights
- `bias`: include a bias or not. Default is `true`
"""
function NAS((input_size, hidden_size)::Pair; kwargs...)
    cell = NASCell(input_size => hidden_size; kwargs...)
4 changes: 2 additions & 2 deletions src/peepholelstm_cell.jl
@@ -26,13 +26,13 @@ See [`PeepholeLSTM`](@ref) for a layer that processes entire sequences.
# Equations
```math
\begin{aligned}
f_t &= \sigma_g(W_f x_t + U_f c_{t-1} + b_f), \\
i_t &= \sigma_g(W_i x_t + U_i c_{t-1} + b_i), \\
o_t &= \sigma_g(W_o x_t + U_o c_{t-1} + b_o), \\
c_t &= f_t \odot c_{t-1} + i_t \odot \sigma_c(W_c x_t + b_c), \\
h_t &= o_t \odot \sigma_h(c_t).
\end{aligned}
```
# Forward
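
What distinguishes the peephole variant is that all three gates read the previous cell state `c_{t-1}` rather than the hidden state `h_{t-1}`. Below is a minimal sketch transcribing the equations above into plain Julia; every name in it is hypothetical and independent of the package's actual implementation:

```julia
# A scalar logistic function, broadcast below.
σ(z) = 1 / (1 + exp(-z))

# One step of the peephole LSTM equations above. The parameter containers
# W, U, b and their gate keys are hypothetical names chosen for this sketch.
function peephole_step(x, c_prev, W, U, b)
    f = σ.(W[:f] * x .+ U[:f] * c_prev .+ b[:f])  # forget gate peeks at c_{t-1}
    i = σ.(W[:i] * x .+ U[:i] * c_prev .+ b[:i])  # input gate peeks at c_{t-1}
    o = σ.(W[:o] * x .+ U[:o] * c_prev .+ b[:o])  # output gate peeks at c_{t-1}
    c = f .* c_prev .+ i .* tanh.(W[:c] * x .+ b[:c])
    h = o .* tanh.(c)
    return h, c
end

in_dim, hid = 3, 4
W = Dict(g => randn(hid, in_dim) for g in (:f, :i, :o, :c))
U = Dict(g => randn(hid, hid) for g in (:f, :i, :o))
b = Dict(g => zeros(hid) for g in (:f, :i, :o, :c))

h, c = peephole_step(randn(in_dim), zeros(hid), W, U, b)
```
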
