Merge pull request #10 from MartinuzziFrancesco/fm/plstm

PeepholeLSTM
MartinuzziFrancesco · Nov 22, 2024 · d914a32 · d914a32 · MartinuzziFrancesco · Nov 22, 2024
2 parents 88c093d + 55595c8
commit d914a32
Show file tree

Hide file tree

Showing 11 changed files with 154 additions and 10 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "RecurrentLayers"
 uuid = "78449bcf-6750-4b78-9e82-63d4a1ccdf8c"
 authors = ["Francesco Martinuzzi"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"

diff --git a/README.md b/README.md
@@ -29,6 +29,7 @@ Currently available layers and work in progress in the short term:
  - [x] Neural architecture search unit (NAS) [arxiv](https://arxiv.org/abs/1611.01578)
  - [x] Evolving recurrent neural networks (MUT1/2/3) [pub](https://proceedings.mlr.press/v37/jozefowicz15.pdf)
  - [x] Structurally constrained recurrent neural network (SCRN) [arxiv](https://arxiv.org/pdf/1412.7753)
+ - [x] Peephole long short term memory (PeepholeLSTM) [pub](https://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf)
  - [ ] Minimal gated recurrent unit (minGRU) and minimal long short term memory (minLSTM) [arxiv](https://arxiv.org/abs/2410.01201)
 
 ## Installation 💻

diff --git a/docs/src/api/cells.md b/docs/src/api/cells.md
@@ -13,4 +13,5 @@ MUT1Cell
 MUT2Cell
 MUT3Cell
 SCRNCell
+PeepholeLSTMCell
 ```
diff --git a/docs/src/api/wrappers.md b/docs/src/api/wrappers.md
@@ -12,4 +12,5 @@ MUT1
 MUT2
 MUT3
 SCRN
+PeepholeLSTM
 ```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -17,6 +17,7 @@ RecurrentLayers.jl extends [Flux.jl](https://github.com/FluxML/Flux.jl) recurren
  - Neural architecture search unit `NASCell` [arxiv](https://arxiv.org/abs/1611.01578)
  - Evolving recurrent neural networks as `MUT1Cell`, `MUT2Cell`, `MUT3Cell` [pub](https://proceedings.mlr.press/v37/jozefowicz15.pdf)
  - Structurally constrained recurrent neural network as `SCRNCell` [arxiv](https://arxiv.org/pdf/1412.7753)
+ - Peephole long short term memory as `PeepholeLSTMCell` [pub](https://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf)
 
 ## Contributing
 

diff --git a/src/RecurrentLayers.jl b/src/RecurrentLayers.jl
@@ -5,9 +5,9 @@ import Flux: _size_check, _match_eltype, chunk, create_bias, zeros_like
 import Flux: glorot_uniform
 
 export MGUCell, LiGRUCell, IndRNNCell, RANCell, LightRUCell, RHNCell,
-RHNCellUnit, NASCell, MUT1Cell, MUT2Cell, MUT3Cell, SCRNCell
+RHNCellUnit, NASCell, MUT1Cell, MUT2Cell, MUT3Cell, SCRNCell, PeepholeLSTMCell
 export MGU, LiGRU, IndRNN, RAN, LightRU, NAS, RHN, MUT1, MUT2, MUT3,
-SCRN
+SCRN, PeepholeLSTM
 
 
 #TODO add double bias
@@ -20,5 +20,6 @@ include("rhn_cell.jl")
 include("nas_cell.jl")
 include("mut_cell.jl")
 include("scrn_cell.jl")
+include("peepholelstm_cell.jl")
 
 end #module
diff --git a/src/lightru_cell.jl b/src/lightru_cell.jl
@@ -27,7 +27,7 @@ See [`LightRU`](@ref) for a layer that processes entire sequences.
 ```math
 \begin{aligned}
 \tilde{h}_t &= \tanh(W_h x_t), \\
-f_t         &= \delta(U_f h_{t-1} + W_f x_t + b_f), \\
+f_t         &= \delta(W_f x_t + U_f h_{t-1} + b_f), \\
 h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t.
 \end{aligned}
 ```

diff --git a/src/mgu_cell.jl b/src/mgu_cell.jl
@@ -26,8 +26,8 @@ See [`MGU`](@ref) for a layer that processes entire sequences.
 # Equations
 ```math
 \begin{aligned}
-f_t         &= \sigma(U_f h_{t-1} + W_f x_t + b_f), \\
-\tilde{h}_t &= \tanh(U_h (f_t \odot h_{t-1}) + W_h x_t + b_h), \\
+f_t         &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
+\tilde{h}_t &= \tanh(W_h x_t + U_h (f_t \odot h_{t-1}) + b_h), \\
 h_t         &= (1 - f_t) \odot h_{t-1} + f_t \odot \tilde{h}_t
 \end{aligned}
 ```

diff --git a/src/peepholelstm_cell.jl b/src/peepholelstm_cell.jl
@@ -0,0 +1,139 @@
+#https://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf
+# Parameter container for a single peephole LSTM step.
+struct PeepholeLSTMCell{TI, TH, TB}
+    # Input-to-gate weights; the keyword constructor sizes this
+    # `(4 * hidden_size, input_size)`.
+    Wi::TI
+    # Weights multiplied with the previous cell state in the forward pass;
+    # the keyword constructor sizes this `(4 * hidden_size, hidden_size)`.
+    Wh::TH
+    # Bias as returned by `create_bias` in the keyword constructor.
+    bias::TB
+end
+
+# Register the cell as a Flux layer.
+Flux.@layer PeepholeLSTMCell
+
+@doc raw"""
+    PeepholeLSTMCell((input_size => hidden_size)::Pair;
+        init_kernel = glorot_uniform,
+        init_recurrent_kernel = glorot_uniform,
+        bias = true)
+
+[Peephole long short term memory cell](https://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf).
+See [`PeepholeLSTM`](@ref) for a layer that processes entire sequences.
+
+# Arguments
+
+- `input_size => hidden_size`: input and inner dimension of the layer
+- `init_kernel`: initializer for the input to hidden weights
+- `init_recurrent_kernel`: initializer for the weights applied to the cell state
+- `bias`: include a bias or not. Default is `true`
+
+# Equations
+
+The equations below describe what this implementation computes: every gate,
+including the candidate, receives a contribution from the previous cell state.
+
+```math
+\begin{aligned}
+i_t         &= \sigma_g(W_i x_t + U_i c_{t-1} + b_i), \\
+f_t         &= \sigma_g(W_f x_t + U_f c_{t-1} + b_f), \\
+\tilde{c}_t &= \sigma_c(W_c x_t + U_c c_{t-1} + b_c), \\
+o_t         &= \sigma_g(W_o x_t + U_o c_{t-1} + b_o), \\
+c_t         &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t, \\
+h_t         &= o_t \odot \sigma_h(c_t).
+\end{aligned}
+```
+
+# Forward
+
+    peepholelstmcell(x, (h, c))
+    peepholelstmcell(x)
+
+The forward pass takes the following arguments:
+
+- `x`: Input to the cell, which can be a vector of size `in` or a matrix of size `in x batch_size`.
+- `h`: The hidden state vector of the cell, sized `out`, or a matrix of size `out x batch_size`.
+  It is accepted for interface uniformity; the update itself only reads `c`.
+- `c`: The cell state, sized `out`, or a matrix of size `out x batch_size`.
+If not provided, both `h` and `c` default to vectors of zeros.
+
+Returns the tuple `(h_t, c_t)` of the new hidden state and the new cell state.
+
+# Examples
+
+```julia
+cell = PeepholeLSTMCell(3 => 5)
+x = rand(Float32, 3)
+h, c = cell(x)            # states default to zeros
+h, c = cell(x, (h, c))    # explicit states
+```
+"""
+function PeepholeLSTMCell(
+    (input_size, hidden_size)::Pair;
+    init_kernel = glorot_uniform,
+    init_recurrent_kernel = glorot_uniform,
+    bias = true,
+)
+    # The four gate blocks (input, forget, candidate, output) are stacked
+    # along the first dimension, hence the factor of 4.
+    Wi = init_kernel(hidden_size * 4, input_size)
+    Wh = init_recurrent_kernel(hidden_size * 4, hidden_size)
+    b = create_bias(Wi, bias, hidden_size * 4)
+    return PeepholeLSTMCell(Wi, Wh, b)
+end
+
+# Forward pass without explicit states: build zero hidden and cell states
+# sized by the second dimension of `Wh`, then defer to the two-argument method.
+function (lstm::PeepholeLSTMCell)(inp::AbstractVecOrMat)
+    hidden_size = size(lstm.Wh, 2)
+    h0 = zeros_like(inp, hidden_size)
+    c0 = zeros_like(h0)
+    return lstm(inp, (h0, c0))
+end
+
+# Single peephole LSTM step. Takes the input and the `(state, c_state)` tuple
+# and returns the new `(state, c_state)` tuple. Only `c_state` feeds the gate
+# pre-activations; `state` is destructured but not read.
+function (lstm::PeepholeLSTMCell)(inp::AbstractVecOrMat, (state, c_state))
+    _size_check(lstm, inp, 1 => size(lstm.Wi, 2))
+    # Pre-activations of all four gates, stacked along the first dimension.
+    gates = lstm.Wi * inp .+ lstm.Wh * c_state .+ lstm.bias
+    i_gate, f_gate, candidate, o_gate = chunk(gates, 4; dims = 1)
+    new_cstate = @. sigmoid_fast(f_gate) * c_state +
+                    sigmoid_fast(i_gate) * tanh_fast(candidate)
+    new_state = @. sigmoid_fast(o_gate) * tanh_fast(new_cstate)
+    return new_state, new_cstate
+end
+
+# Compact one-line display, e.g. `PeepholeLSTMCell(3 => 5)`.
+function Base.show(io::IO, lstm::PeepholeLSTMCell)
+    input_size = size(lstm.Wi, 2)
+    # `Wi` stacks four gate blocks along its first dimension.
+    hidden_size = size(lstm.Wi, 1) ÷ 4
+    return print(io, "PeepholeLSTMCell(", input_size, " => ", hidden_size, ")")
+end
+
+
+
+# Sequence-level wrapper holding the cell that is applied at every time step.
+struct PeepholeLSTM{C}
+    # Wrapped single-step cell; the forward pass calls it once per slice.
+    cell::C
+end
+
+# Register the wrapper as a Flux layer, keeping the `:expand` option.
+Flux.@layer :expand PeepholeLSTM
+
+@doc raw"""
+    PeepholeLSTM((input_size => hidden_size)::Pair; kwargs...)
+
+[Peephole long short term memory network](https://www.jmlr.org/papers/volume3/gers02a/gers02a.pdf).
+See [`PeepholeLSTMCell`](@ref) for a layer that processes a single time step.
+
+# Arguments
+
+- `input_size => hidden_size`: input and inner dimension of the layer
+- `init_kernel`: initializer for the input to hidden weights
+- `init_recurrent_kernel`: initializer for the weights applied to the cell state
+- `bias`: include a bias or not. Default is `true`
+
+All keyword arguments are forwarded to [`PeepholeLSTMCell`](@ref).
+
+# Equations
+
+The equations below describe what this implementation computes: every gate,
+including the candidate, receives a contribution from the previous cell state.
+
+```math
+\begin{aligned}
+i_t         &= \sigma_g(W_i x_t + U_i c_{t-1} + b_i), \\
+f_t         &= \sigma_g(W_f x_t + U_f c_{t-1} + b_f), \\
+\tilde{c}_t &= \sigma_c(W_c x_t + U_c c_{t-1} + b_c), \\
+o_t         &= \sigma_g(W_o x_t + U_o c_{t-1} + b_o), \\
+c_t         &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t, \\
+h_t         &= o_t \odot \sigma_h(c_t).
+\end{aligned}
+```
+"""
+function PeepholeLSTM((input_size, hidden_size)::Pair; kwargs...)
+    # Build the cell, not the wrapper: calling `PeepholeLSTM` here would
+    # re-enter this same method and recurse without end.
+    cell = PeepholeLSTMCell(input_size => hidden_size; kwargs...)
+    return PeepholeLSTM(cell)
+end
+
+# Sequence forward pass without explicit states: start from zero hidden and
+# cell states sized by the second dimension of the cell's `Wh`.
+function (lstm::PeepholeLSTM)(inp)
+    hidden_size = size(lstm.cell.Wh, 2)
+    h0 = zeros_like(inp, hidden_size)
+    c0 = zeros_like(h0)
+    return lstm(inp, (h0, c0))
+end
+
+# Sequence forward pass. `inp` must have 2 or 3 dimensions; the cell is applied
+# to each slice along dimension 2, threading `(state, c_state)` from one slice
+# to the next. Returns the per-slice hidden states and cell states, each
+# stacked along dimension 2.
+function (lstm::PeepholeLSTM)(inp, (state, c_state))
+    @assert ndims(inp) == 2 || ndims(inp) == 3
+    # Non-mutating accumulation kept as in the original
+    # (presumably for automatic-differentiation friendliness; confirm).
+    new_state = []
+    new_cstate = []
+    for inp_t in eachslice(inp, dims=2)
+        # Use this layer's own cell; the previous `nas.cell` referenced an
+        # undefined variable and failed on every call.
+        state, c_state = lstm.cell(inp_t, (state, c_state))
+        new_state = vcat(new_state, [state])
+        new_cstate = vcat(new_cstate, [c_state])
+    end
+    return stack(new_state, dims=2), stack(new_cstate, dims=2)
+end
diff --git a/src/ran_cell.jl b/src/ran_cell.jl
@@ -33,8 +33,8 @@ See [`RAN`](@ref) for a layer that processes entire sequences.
 ```math
 \begin{aligned}
 \tilde{c}_t &= W_c x_t, \\
-i_t         &= \sigma(U_i h_{t-1} + W_i x_t + b_i), \\
-f_t         &= \sigma(U_f h_{t-1} + W_f x_t + b_f), \\
+i_t         &= \sigma(W_i x_t + U_i h_{t-1} + b_i), \\
+f_t         &= \sigma(W_f x_t + U_f h_{t-1} + b_f), \\
 c_t         &= i_t \odot \tilde{c}_t + f_t \odot c_{t-1}, \\
 h_t         &= g(c_t)
 \end{aligned}

diff --git a/test/test_cells.jl b/test/test_cells.jl
@@ -7,7 +7,7 @@ single_cells = [MGUCell, LiGRUCell, IndRNNCell,
     LightRUCell, MUT1Cell, MUT2Cell,
     MUT3Cell]
 #cells returning hidden state as a tuple
-double_cells = [RANCell, NASCell]
+double_cells = [RANCell, NASCell, PeepholeLSTMCell]
 #cells with a little more complexity to them
 different_cells = [SCRNCell, RHNCell]
 
@@ -25,7 +25,7 @@ different_cells = [SCRNCell, RHNCell]
     @test rnncell(inp) == rnncell(inp, zeros(Float32, 5))
 end
 
-@testset "cell = $cell" for cell in double_cells
+@testset "Double return cell: $cell = " for cell in double_cells
     rnncell = cell(3 => 5)
     @test length(Flux.trainables(rnncell)) == 3
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,4 +12,5 @@ MUT1 @@
     MUT2
     MUT3
     SCRN
+    PeepholeLSTM
     ```