From 437fdebb0432a6727c8e1661400e032feefec67a Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Wed, 3 Jun 2020 15:39:25 +0530 Subject: [PATCH 1/8] complete encoder decoder and transformer model --- .gitignore | 1 + models/CMakeLists.txt | 10 +- models/transformer/CMakeLists.txt | 20 ++ models/transformer/decoder.hpp | 236 ++++++++++++++++++++++++ models/transformer/decoder_impl.hpp | 90 +++++++++ models/transformer/encoder.hpp | 191 +++++++++++++++++++ models/transformer/encoder_impl.hpp | 76 ++++++++ models/transformer/transformer.hpp | 156 ++++++++++++++++ models/transformer/transformer_impl.hpp | 128 +++++++++++++ tests/ffn_model_tests.cpp | 116 ++++++++++++ 10 files changed, 1023 insertions(+), 1 deletion(-) create mode 100644 models/transformer/CMakeLists.txt create mode 100644 models/transformer/decoder.hpp create mode 100644 models/transformer/decoder_impl.hpp create mode 100644 models/transformer/encoder.hpp create mode 100644 models/transformer/encoder_impl.hpp create mode 100644 models/transformer/transformer.hpp create mode 100644 models/transformer/transformer_impl.hpp diff --git a/.gitignore b/.gitignore index 4708e5bb..92a9136d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ build* xcode* +.vscode/ .DS_Store .idea cmake-build-* diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt index c4bd5a8c..77aa6f85 100644 --- a/models/CMakeLists.txt +++ b/models/CMakeLists.txt @@ -1,7 +1,15 @@ cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) project(models) -add_subdirectory(darknet) +# Recurse into each model mlpack provides. +set(DIRS + darknet + transformer +) + +foreach(dir ${DIRS}) + add_subdirectory(${dir}) +endforeach() # Add directory name to sources. set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/) diff --git a/models/transformer/CMakeLists.txt b/models/transformer/CMakeLists.txt new file mode 100644 index 00000000..288262a4 --- /dev/null +++ b/models/transformer/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) +project(transformer) + +set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/) +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../") + +set(SOURCES + decoder.hpp + decoder_impl.hpp + encoder.hpp + encoder_impl.hpp + transformer.hpp + transformer_impl.hpp +) + +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() + +set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp new file mode 100644 index 00000000..28e055a5 --- /dev/null +++ b/models/transformer/decoder.hpp @@ -0,0 +1,236 @@ +/** + * @file models/transformer/decoder.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer Decoder layer. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_DECODER_HPP +#define MODELS_TRANSFORMER_DECODER_HPP + +#include +#include +#include +#include + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * In addition to the two sub-layers in each encoder layer, the decoder inserts + * a third sub-layer, which performs multi-head attention over the output of the + * encoder stack. 
Similar to the encoder, we employ residual connections around + * each of the sub-layers, followed by layer normalization. We also modify the + * self-attention sub-layer in the decoder stack to prevent positions from + * attending to subsequent positions. This masking, combined with fact that the + * output embeddings are offset by one position, ensures that the predictions + * for position i can depend only on the known outputs at positions less than i. + * + * @tparam ActivationFunction The type of the activation function to be used in + * the position-wise feed forward neural network. + * @tparam RegularizerType The type of regularizer to be applied to layer + * parameters. + * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class TransformerDecoder +{ + public: + TransformerDecoder(); + + /** + * Create the TransformerDecoder object using the specified parameters. + * + * @param numLayers The number of decoder blocks. + * @param tgtSeqLen Target Sequence Length. + * @param srcSeqLen Source Sequence Length. + * @param memoryModule The last Encoder module. + * @param dModel The number of features in the input. Also, same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask used to black-out future sequences. + * @param keyPaddingMask The padding mask used to black-out particular token. + */ + TransformerDecoder(const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t dModel = 512, + const size_t numHeads = 8, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Decoder model. + */ + Sequential<>* Model() { return decoder; } + /** + * Load the network from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the network locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + /** + * This method adds the attention block to the decoder. + */ + void AttentionBlock() + { + Sequential<>* decoderBlockBottom = new Sequential<>(); + decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); + + // Broadcast the incoming input to decoder + // i.e. query into (query, key, value). + Concat<>* decoderInput = new Concat<>(); + decoderInput->Add>(); + decoderInput->Add>(); + decoderInput->Add>(); + + // Masked Self attention layer. 
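Before this layer is assembled below, a note on the `attentionMask` it receives: the look-ahead masking described in the class comment is typically a square matrix added to the raw attention scores ahead of the softmax. A minimal sketch, assuming that additive convention (the helper name and the -1e9 constant are illustrative and not part of this patch):

#include <armadillo>
#include <cstddef>

// Zero on and below the diagonal, a large negative value above it, so that
// after softmax(scores + mask) position i attends only to positions j <= i.
arma::mat LookAheadMask(const size_t tgtSeqLen)
{
  arma::mat mask(tgtSeqLen, tgtSeqLen, arma::fill::zeros);
  for (size_t i = 0; i < tgtSeqLen; ++i)
    for (size_t j = i + 1; j < tgtSeqLen; ++j)
      mask(i, j) = -1e9; // Effectively minus infinity for the softmax.
  return mask;
}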
+ Sequential<>* maskedSelfAttention = new Sequential<>(); + maskedSelfAttention->Add(decoderInput); + maskedSelfAttention->Add>( + tgtSeqLen, + tgtSeqLen, + dModel, + numHeads, + attentionMask + ); + + // Residual connection. + AddMerge<>* residualAdd1 = new AddMerge<>(); + residualAdd1->Add(maskedSelfAttention); + residualAdd1->Add>(); + + decoderBlockBottom->Add(residualAdd1); + + // Add the LayerNorm layer with required parameters. + decoderBlockBottom->Add>(dModel * tgtSeqLen); + + // This layer broadcasts the output of encoder i.e. key into (key, value). + Concat<>* broadcastEncoderOutput = new Concat<>(); + broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + + // This layer concatenates the output of the bottom decoder block (query) + // and the output of the encoder (key, value). + Concat<>* encoderDecoderAttentionInput = new Concat<>(); + encoderDecoderAttentionInput->Add(decoderBlockBottom); + encoderDecoderAttentionInput->Add(broadcastEncoderOutput); + + // Encoder-decoder attention. + Sequential<>* encoderDecoderAttention = new Sequential<>(); + encoderDecoderAttention->Add(encoderDecoderAttentionInput); + encoderDecoderAttention->Add>( + tgtSeqLen, + srcSeqLen, + dModel, + numHeads, + InputDataType(), // No attention mask to encoder-decoder attention. + keyPaddingMask); + + // Residual connection. + AddMerge<>* residualAdd2 = new AddMerge<>(); + residualAdd2->Add(encoderDecoderAttention); + residualAdd2->Add>(); + + decoder->Add(residualAdd2); + decoder->Add(new LayerNorm<>(dModel * tgtSeqLen)); + } + + /** + * This method adds the position-wise feed forward network to the decoder. + */ + void PositionWiseFFNBlock() + { + Sequential<>* positionWiseFFN = new Sequential<>(); + positionWiseFFN->Add>(dModel, dimFFN); + positionWiseFFN->Add(); + positionWiseFFN->Add>(dimFFN, dModel); + positionWiseFFN->Add>(dropout); + + /* Residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(positionWiseFFN); + residualAdd->Add>(); + decoder->Add(residualAdd); + } + + //! Locally-stored number of decoder layers. + size_t numLayers; + + //! Locally-stored target sequence length. + size_t tgtSeqLen; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored complete decoder network. + Sequential* decoder; + +}; // class TransformerDecoder + +} // namespace ann +} // namespace mlpack + +// Include implementation. +#include "decoder_impl.hpp" + +#endif diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp new file mode 100644 index 00000000..8cbf4f6d --- /dev/null +++ b/models/transformer/decoder_impl.hpp @@ -0,0 +1,90 @@ +/** + * @file models/transformer/decoder_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer Decoder class. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. 
If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_DECODER_IMPL_HPP +#define MODELS_TRANSFORMER_DECODER_IMPL_HPP + +#include "decoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +TransformerDecoder::TransformerDecoder() : + tgtSeqLen(0), + srcSeqLen(0), + dModel(0), + numHeads(0), + dimFFN(0), + dropout(0) +{ + // Nothing to do here. +} + +template +TransformerDecoder::TransformerDecoder( + const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + tgtSeqLen(tgtSeqLen), + srcSeqLen(srcSeqLen), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + decoder = new Sequential(); + + for (size_t N = 0; N < numLayers; ++N) + { + AttentionBlock(); + PositionWiseFFNBlock(); + } +} + +template +void TransformerDecoder::LoadModel(const std::string& filepath) +{ + data::Load(filepath, "TransformerDecoder", decoder); + std::cout << "Loaded model" << std::endl; +} + +template +void TransformerDecoder::SaveModel(const std::string& filepath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filepath, "TransformerDecoder", decoder); + std::cout << "Model saved in " << filepath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp new file mode 100644 index 00000000..794e3fb3 --- /dev/null +++ b/models/transformer/encoder.hpp @@ -0,0 +1,191 @@ +/** + * @file models/transformer/encoder.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer Encoder layer. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_ENCODER_HPP +#define MODELS_TRANSFORMER_ENCODER_HPP + +#include +#include +#include +#include + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * The Transformer Encoder layer has two sub-layers. The first is a multi-head + * self-attention mechanism, and the second is a simple, position-wise + * fully connected feed-forward network. We employ a residual connection around + * each of the two sub-layers, followed by layer normalization. Hence the output + * of each sub-layer is 'LayerNorm(x + Sublayer(x))', where 'Sublayer(x)' is the + * function implemented by the sub-layer itself. To facilitate these residual + * connections, all sub-layers in the model, as well as the embedding layers, + * produce outputs of dimension 'dModel'. + * + * @tparam ActivationType The type of activation function to be used in the + * position-wise feed forward neural network. + * @tparam RegularizerType The regularizer type to be applied on layer + * parameters. + * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). 
+ */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class TransformerEncoder +{ + public: + /** + * Create the TransformerEncoder object using the specified parameters. + * + * @param numLayers The number of encoder blocks. + * @param srcSeqLen Source Sequence Length. + * @param dModel The number of features in the input. It is same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask to be applied to the sequences. + * @param keyPaddingMask The key padding mask applied to the sequences. + */ + TransformerEncoder(const size_t numLayers, + const size_t srcSeqLen, + const size_t dModel = 512, + const size_t numHeads = 2, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Encoder Model. + */ + Sequential* Model() + { + return encoder; + } + + /** + * Load the encoder block from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the encoder block locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + /** + * The method adds attention block to the encoder block. + */ + void AttentionBlock() + { + Concat<>* input = new Concat<>(); + input->Add>(); + input->Add>(); + input->Add>(); + + /* Self attention layer. */ + Sequential<>* selfAttention = new Sequential<>(); + selfAttention->Add(input); + selfAttention->Add + >(srcSeqLen, srcSeqLen, dModel, numHeads); + + /* This layer adds a residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(selfAttention); + residualAdd->Add>(); + + encoder->Add(residualAdd); + encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + } + + /** + * This method adds position-wise feed forward block to the encoder. + */ + void PositionWiseFFNBlock() + { + Sequential<>* positionWiseFFN = new Sequential<>(); + positionWiseFFN->Add>(dModel, dimFFN); + positionWiseFFN->Add(); + positionWiseFFN->Add>(dimFFN, dModel); + positionWiseFFN->Add>(dropout); + + /* This layer adds a residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(positionWiseFFN); + residualAdd->Add>(); + + encoder->Add(residualAdd); + encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + } + + //! Locally-stored number of encoder blocks. + size_t numLayers; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! 
Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored encoder block. + Sequential* encoder; + +}; // class TransformerEncoder + +} // namespace ann +} // namespace mlpack + +// Include implementation. +#include "encoder_impl.hpp" + +#endif diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp new file mode 100644 index 00000000..64d4b1cc --- /dev/null +++ b/models/transformer/encoder_impl.hpp @@ -0,0 +1,76 @@ +/** + * @file models/transformer/encoder_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer Encoder class. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_ENCODER_IMPL_HPP +#define MODELS_TRANSFORMER_ENCODER_IMPL_HPP + +#include "encoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +TransformerEncoder::TransformerEncoder( + const size_t numLayers, + const size_t srcSeqLen, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + srcSeqLen(srcSeqLen), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + encoder = new Sequential(); + + encoder->Add(new IdentityLayer<>()); + + for (size_t N = 0; N < numLayers; ++N) + { + AttentionBlock(); + PositionWiseFFNBlock(); + } +} + +template +void TransformerEncoder::LoadModel(const std::string& filePath) +{ + data::Load(filePath, "TransformerEncoder", encoder); + std::cout << "Loaded model" << std::endl; +} + +template +void TransformerEncoder::SaveModel(const std::string& filePath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filePath, "TransformerEncoder", encoder); + std::cout << "Model saved in " << filePath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp new file mode 100644 index 00000000..8cc1a94f --- /dev/null +++ b/models/transformer/transformer.hpp @@ -0,0 +1,156 @@ +/** + * @file models/transformer/transformer.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer model. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_TRANSFORMER_HPP +#define MODELS_TRANSFORMER_TRANSFORMER_HPP + +#include +#include +#include +#include +#include + +#include "encoder.hpp" +#include "decoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * @tparam ActivationType The type of activation function to be used in the + * position-wise feed forward neural network. + * @tparam RegularizerType The regularizer type to be applied on layer + * parameters. 
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class Transformer +{ + public: + /** + * Create the Transformer object using the specified parameters. + * + * @param numLayers The number of encoder and decoder layers. + * @param tgtSeqLen Target Sequence Length. + * @param srcSeqLen Source Sequence Length. + * @param tgtVocabSize Target vocabulary size. + * @param srcVocabSize Source vocabulary size. + * @param dModel The number of features in the input. Also, same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask to be applied to the sequences. + * @param keyPaddingMask The key padding mask applied to the sequences. + */ + Transformer(const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t tgtVocabSize, + const size_t srcVocabSize, + const size_t dModel = 512, + const size_t numHeads = 12, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Encoder Model. + */ + Sequential* Model() + { + return transformer; + } + + /** + * Load the Transformer model from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the Transformer model locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + + //! Locally-stored number of encoder and decoder layers. + size_t numLayers; + + //! Locally-stored target sequence length. + size_t tgtSeqLen; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored vocabulary size of the target. + size_t tgtVocabSize; + + //! Locally-stored vocabulary size of the source. + size_t srcVocabSize; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored transformer model. + Sequential* transformer; + +}; // class Transformer + +} // namespace ann +} // namespace mlpack + +// Include implementation. 
+#include "transformer_impl.hpp" + +#endif diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp new file mode 100644 index 00000000..fc545e8f --- /dev/null +++ b/models/transformer/transformer_impl.hpp @@ -0,0 +1,128 @@ +/** + * @file models/transformer/transformer_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer model. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP +#define MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP + +#include "transformer.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +Transformer::Transformer( + const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t tgtVocabSize, + const size_t srcVocabSize, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + tgtSeqLen(tgtSeqLen), + srcSeqLen(srcSeqLen), + tgtVocabSize(tgtVocabSize), + srcVocabSize(srcVocabSize), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + transformer = new Sequential<>(); + + Sequential<>* encoder = new Sequential<>(); + + // Pull out the sequences of source language which is stacked above in the + // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. + encoder->Add>(1, 0, srcSeqLen - 1, 0, -1); + encoder->Add>(srcVocabSize, dModel); + encoder->Add>(dModel, srcSeqLen); + + Sequential<>* encoderStack = mlpack::ann::TransformerEncoder< + ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + numLayers, + srcSeqLen, + dModel, + numHeads, + dimFFN, + dropout, + attentionMask, + keyPaddingMask, + ).Model(); + + encoder->Add(encoderStack); + + Sequential<>* decoderPE = new Sequential<>(); + + // Pull out the sequences of target language which is stacked below in the + // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last + // row and last batch of the input matrix respectively. 
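As a concrete illustration of the input layout that both the encoder's Subview above and the decoder-side Subview on the next line rely on, a small sketch (dummy token ids; the helper name is illustrative only). Each column of the input matrix is one sample, with the srcSeqLen source token indices stacked above the tgtSeqLen target token indices, mirroring the join_cols call in the TransformerTest case further below:

#include <armadillo>
#include <cstddef>

// One column per sample: source token ids on top, target token ids below.
arma::mat MakeTransformerInput(const size_t srcSeqLen, const size_t tgtSeqLen)
{
  arma::mat srcTokens(srcSeqLen, 1, arma::fill::ones); // Dummy source ids.
  arma::mat tgtTokens(tgtSeqLen, 1, arma::fill::ones); // Dummy target ids.
  // Rows 0 .. srcSeqLen - 1 feed the encoder's Subview above; rows
  // srcSeqLen .. end feed the decoder-side Subview right after this aside.
  return arma::join_cols(srcTokens, tgtTokens);
}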
+ decoderPE->Add>(1, srcSeqLen, -1, 0, -1); + decoderPE->Add>(tgtVocabSize, dModel); + decoderPE->Add>(dModel, tgtSeqLen); + + Concat<>* encoderDecoderConcat = new Concat<>(); + encoderDecoderConcat->Add(encoder); + encoderDecoderConcat->Add(decoderPE); + + Sequential<>* decoder = new Sequential<>(); + decoder->Add(encoderDecoderConcat); + + Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< + ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + numLayers, + tgtSeqLen, + srcSeqLen, + dModel, + numHeads, + dimFFN, + dropout, + attentionMask, + keyPaddingMask, + ).Model(); + + decoder->Add(decoderStack); + transformer->Add(decoder); +} + +template +void Transformer::LoadModel(const std::string& filePath) +{ + data::Load(filePath, "Transformer", transformer.Model()); + std::cout << "Loaded model" << std::endl; +} + +template +void Transformer::SaveModel(const std::string& filePath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filePath, "Transformer", transformer.Model()); + std::cout << "Model saved in " << filePath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index 84e42eb7..eabda79c 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -14,6 +14,11 @@ #include #include #include +#include +#include +#include +#include +#include #include // Use namespaces for convenience. @@ -42,4 +47,115 @@ BOOST_AUTO_TEST_CASE(DarknetModelTest) BOOST_REQUIRE_EQUAL(output.n_rows, 1000); } +/** + * Simple Transformer Encoder test. + */ +BOOST_AUTO_TEST_CASE(TransformerEncoderTest) +{ + const size_t vocabSize = 20; + const size_t numLayers = 2; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat input = arma::randu(dModel * srcSeqLen, 1); + arma::mat output; + + mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen, + dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(encoder.Model()); + model.Add>(dModel * srcSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + +/** + * Simple Transformer Decoder test. + */ +BOOST_AUTO_TEST_CASE(TransformerDecoderTest) +{ + const size_t vocabSize = 20; + const size_t numLayers = 2; + const size_t tgtSeqLen = 10; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat query = arma::randu(dModel * tgtSeqLen, 1); + arma::mat memory = 0.73 * arma::randu(dModel * srcSeqLen, 1); + + arma::mat input = arma::join_cols(query, memory); + arma::mat output; + + mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, + dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(decoder.Model()); + model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + +/** + * Transformer Model test. 
+ */ +BOOST_AUTO_TEST_CASE(TransformerTest) +{ + const size_t tgtVocabSize = 20; + const size_t srcVocabSize = 20; + const size_t numLayers = 2; + const size_t tgtSeqLen = 10; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat srcLanguage(srcSeqLen, 1), tgtLanguage(tgtSeqLen, 1); + + for (size_t t = 0; t < srcSeqLen; ++t) + { + srcLanguage(t) = mlpack::math::RandInt(1, srcVocabSize); + } + + for (size_t t = 0; t < tgtSeqLen; ++t) + { + tgtLanguage(t) = mlpack::math::RandInt(1, tgtVocabSize); + } + + arma::mat input = arma::join_cols(srcLanguage, tgtLanguage); + arma::mat output; + + mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen, + tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(transformer.Model()); + model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + BOOST_AUTO_TEST_SUITE_END(); From dc8bc78d98318c913eabf1954969fe5bebb4c44e Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sat, 22 Aug 2020 21:21:33 +0530 Subject: [PATCH 2/8] add namespace in ffn tests --- models/transformer/transformer_impl.hpp | 8 ++++---- tests/ffn_model_tests.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index fc545e8f..b129da2b 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -65,7 +65,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask, + keyPaddingMask ).Model(); encoder->Add(encoderStack); @@ -96,7 +96,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask, + keyPaddingMask ).Model(); decoder->Add(decoderStack); @@ -108,7 +108,7 @@ template ::LoadModel(const std::string& filePath) { - data::Load(filePath, "Transformer", transformer.Model()); + data::Load(filePath, "Transformer", transformer); std::cout << "Loaded model" << std::endl; } @@ -118,7 +118,7 @@ void Transformer::SaveModel(const std::string& filePath) { std::cout << "Saving model" << std::endl; - data::Save(filePath, "Transformer", transformer.Model()); + data::Save(filePath, "Transformer", transformer); std::cout << "Model saved in " << filePath << std::endl; } diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index eabda79c..3f4601fd 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -22,6 +22,8 @@ #include // Use namespaces for convenience. 
+using namespace mlpack; +using namespace mlpack::ann; using namespace boost::unit_test; BOOST_AUTO_TEST_SUITE(FFNModelsTests); @@ -149,12 +151,12 @@ BOOST_AUTO_TEST_CASE(TransformerTest) FFN<> model; model.Add(transformer.Model()); - model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(dModel * tgtSeqLen, tgtVocabSize); model.Add>(); model.Predict(input, output); - BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_rows, tgtVocabSize); BOOST_REQUIRE_EQUAL(output.n_cols, 1); } From ec965bb86f815868d90ac358485348746dccf45d Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi <35535378+mrityunjay-tripathi@users.noreply.github.com> Date: Sun, 23 Aug 2020 13:32:48 +0530 Subject: [PATCH 3/8] Apply suggestions from code review Co-authored-by: Mikhail Lozhnikov --- models/transformer/decoder.hpp | 17 +++++++++++------ models/transformer/decoder_impl.hpp | 20 +++++++++++++++++--- models/transformer/encoder.hpp | 6 +++--- models/transformer/encoder_impl.hpp | 4 +--- models/transformer/transformer.hpp | 2 +- models/transformer/transformer_impl.hpp | 6 ++---- 6 files changed, 35 insertions(+), 20 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 28e055a5..64fc2974 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -60,7 +60,7 @@ class TransformerDecoder * @param srcSeqLen Source Sequence Length. * @param memoryModule The last Encoder module. * @param dModel The number of features in the input. Also, same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. @@ -111,7 +111,7 @@ class TransformerDecoder /** * This method adds the attention block to the decoder. */ - void AttentionBlock() + Sequential<>* AttentionBlock() { Sequential<>* decoderBlockBottom = new Sequential<>(); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); @@ -173,14 +173,16 @@ class TransformerDecoder residualAdd2->Add(encoderDecoderAttention); residualAdd2->Add>(); - decoder->Add(residualAdd2); - decoder->Add(new LayerNorm<>(dModel * tgtSeqLen)); + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(residualAdd2); + decoderBlock->Add>(dModel * tgtSeqLen); + return decoderBlock; } /** * This method adds the position-wise feed forward network to the decoder. */ - void PositionWiseFFNBlock() + Sequential<>* PositionWiseFFNBlock() { Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); @@ -192,7 +194,10 @@ class TransformerDecoder AddMerge<>* residualAdd = new AddMerge<>(); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - decoder->Add(residualAdd); + + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(residualAdd); + return decoderBlock; } //! Locally-stored number of decoder layers. 
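The decoder_impl.hpp hunk that follows chains the blocks so that every decoder layer consumes and produces the same stacked column layout. A rough sketch of that per-layer contract, assuming the row split used by the Subview layers in decoder.hpp (the function is purely illustrative; the attention itself is elided):

#include <armadillo>
#include <cstddef>

// Rows 0 .. dModel * tgtSeqLen - 1 hold the query (updated by the block);
// the remaining rows hold the encoder memory, which is passed through
// unchanged so the next block can reuse it.
arma::mat DecoderLayerLayout(const arma::mat& input, const size_t dModel,
                             const size_t tgtSeqLen)
{
  arma::mat query  = input.rows(0, dModel * tgtSeqLen - 1);
  arma::mat memory = input.rows(dModel * tgtSeqLen, input.n_rows - 1);
  // ... masked self-attention and encoder-decoder attention update query ...
  return arma::join_cols(query, memory); // Same layout for the next block.
}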
diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index 8cbf4f6d..ac9b33ae 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -58,10 +58,24 @@ OutputDataType>::TransformerDecoder( { decoder = new Sequential(); - for (size_t N = 0; N < numLayers; ++N) + for (size_t n = 0; n < numLayers; ++n) { - AttentionBlock(); - PositionWiseFFNBlock(); + if (n + 1 == numLayers) + { + decoder->Add(AttentionBlock()); + decoder->Add(PositionWiseFFNBlock()); + break; + } + + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(AttentionBlock()); + decoderBlock->Add(PositionWiseFFNBlock()); + + Concat<>* concatQueryKey = new Concat<>(); + concatQueryKey->Add(decoderBlock); + concatQueryKey->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + + decoder->Add(concatQueryKey); } } diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 794e3fb3..9ffe5a2e 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -56,7 +56,7 @@ class TransformerEncoder * @param numLayers The number of encoder blocks. * @param srcSeqLen Source Sequence Length. * @param dModel The number of features in the input. It is same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. @@ -130,7 +130,7 @@ class TransformerEncoder residualAdd->Add>(); encoder->Add(residualAdd); - encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + encoder->Add>(dModel * srcSeqLen); } /** @@ -150,7 +150,7 @@ class TransformerEncoder residualAdd->Add>(); encoder->Add(residualAdd); - encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + encoder->Add>(dModel * srcSeqLen); } //! Locally-stored number of encoder blocks. diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 64d4b1cc..a364e24d 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -42,9 +42,7 @@ OutputDataType>::TransformerEncoder( { encoder = new Sequential(); - encoder->Add(new IdentityLayer<>()); - - for (size_t N = 0; N < numLayers; ++N) + for (size_t n = 0; n < numLayers; ++n) { AttentionBlock(); PositionWiseFFNBlock(); diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index 8cc1a94f..d766fac9 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -54,7 +54,7 @@ class Transformer * @param tgtVocabSize Target vocabulary size. * @param srcVocabSize Source vocabulary size. * @param dModel The number of features in the input. Also, same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. 
diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index b129da2b..178dc040 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -83,8 +83,7 @@ OutputDataType>::Transformer( encoderDecoderConcat->Add(encoder); encoderDecoderConcat->Add(decoderPE); - Sequential<>* decoder = new Sequential<>(); - decoder->Add(encoderDecoderConcat); + transformer->Add(encoderDecoderConcat); Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< ActivationFunction, RegularizerType, InputDataType, OutputDataType>( @@ -99,8 +98,7 @@ OutputDataType>::Transformer( keyPaddingMask ).Model(); - decoder->Add(decoderStack); - transformer->Add(decoder); + transformer->Add(decoderStack); } template Date: Sun, 23 Aug 2020 17:39:19 +0530 Subject: [PATCH 4/8] add proper parameter description --- models/transformer/decoder.hpp | 12 +++++------- models/transformer/encoder.hpp | 9 ++++----- models/transformer/transformer.hpp | 10 ++++------ models/transformer/transformer_impl.hpp | 6 ++---- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 64fc2974..6f9dd771 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -132,8 +132,7 @@ class TransformerDecoder tgtSeqLen, dModel, numHeads, - attentionMask - ); + attentionMask); // Residual connection. AddMerge<>* residualAdd1 = new AddMerge<>(); @@ -209,16 +208,16 @@ class TransformerDecoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of the model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number of attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -229,7 +228,6 @@ class TransformerDecoder //! Locally-stored complete decoder network. Sequential* decoder; - }; // class TransformerDecoder } // namespace ann diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 9ffe5a2e..20e70050 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -159,16 +159,16 @@ class TransformerEncoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number of attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -179,7 +179,6 @@ class TransformerEncoder //! Locally-stored encoder block. Sequential* encoder; - }; // class TransformerEncoder } // namespace ann diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index d766fac9..45bef01d 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -108,7 +108,6 @@ class Transformer InputDataType& KeyPaddingMask() { return keyPaddingMask; } private: - //! Locally-stored number of encoder and decoder layers. 
size_t numLayers; @@ -124,16 +123,16 @@ class Transformer //! Locally-stored vocabulary size of the source. size_t srcVocabSize; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of the model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of the position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -144,7 +143,6 @@ class Transformer //! Locally-stored transformer model. Sequential* transformer; - }; // class Transformer } // namespace ann diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 178dc040..5fdb4940 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -65,8 +65,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask - ).Model(); + keyPaddingMask).Model(); encoder->Add(encoderStack); @@ -95,8 +94,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask - ).Model(); + keyPaddingMask).Model(); transformer->Add(decoderStack); } From 55de04561f7dfd731ea4bc6fb5c89d082d64214e Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sun, 23 Aug 2020 23:20:32 +0530 Subject: [PATCH 5/8] adding constructors and destructors, removing some templates --- models/transformer/decoder.hpp | 76 +++++++++++++++--------- models/transformer/decoder_impl.hpp | 41 ++++++------- models/transformer/encoder.hpp | 77 +++++++++++++++++-------- models/transformer/encoder_impl.hpp | 30 +++++----- models/transformer/transformer.hpp | 65 ++++++++++++--------- models/transformer/transformer_impl.hpp | 43 ++++---------- 6 files changed, 184 insertions(+), 148 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 6f9dd771..012d7f3d 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -36,16 +36,10 @@ namespace ann /** Artificial Neural Network. */ { * the position-wise feed forward neural network. * @tparam RegularizerType The type of regularizer to be applied to layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). */ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class TransformerDecoder { @@ -66,6 +60,7 @@ class TransformerDecoder * @param dropout The dropout rate. * @param attentionMask The attention mask used to black-out future sequences. * @param keyPaddingMask The padding mask used to black-out particular token. + * @param ownMemory Whether to delete the pointer-type decoder object. */ TransformerDecoder(const size_t numLayers, const size_t tgtSeqLen, @@ -74,13 +69,39 @@ class TransformerDecoder const size_t numHeads = 8, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); + + /** + * Destructor. 
+ */ + ~TransformerDecoder() + { + if (ownMemory) + delete decoder; + } + + /** + * Copy constructor. + */ + TransformerDecoder(const TransformerDecoder& ) = delete; + + /** + * Move constructor. + */ + TransformerDecoder(const TransformerDecoder&& ) = delete; + + /** + * Copy assignment operator. + */ + TransformerDecoder& operator = (const TransformerDecoder& ) = delete; /** * Get the Transformer Decoder model. */ Sequential<>* Model() { return decoder; } + /** * Load the network from a local directory. * @@ -96,16 +117,16 @@ class TransformerDecoder void SaveModel(const std::string& filepath); //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: /** @@ -113,7 +134,7 @@ class TransformerDecoder */ Sequential<>* AttentionBlock() { - Sequential<>* decoderBlockBottom = new Sequential<>(); + Sequential<>* decoderBlockBottom = new Sequential<>(false); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); // Broadcast the incoming input to decoder @@ -124,10 +145,10 @@ class TransformerDecoder decoderInput->Add>(); // Masked Self attention layer. - Sequential<>* maskedSelfAttention = new Sequential<>(); + Sequential<>* maskedSelfAttention = new Sequential<>(false); maskedSelfAttention->Add(decoderInput); maskedSelfAttention->Add>( + arma::mat, arma::mat, RegularizerType>>( tgtSeqLen, tgtSeqLen, dModel, @@ -156,15 +177,15 @@ class TransformerDecoder encoderDecoderAttentionInput->Add(broadcastEncoderOutput); // Encoder-decoder attention. - Sequential<>* encoderDecoderAttention = new Sequential<>(); + Sequential<>* encoderDecoderAttention = new Sequential<>(false); encoderDecoderAttention->Add(encoderDecoderAttentionInput); encoderDecoderAttention->Add>( + arma::mat, arma::mat, RegularizerType>>( tgtSeqLen, srcSeqLen, dModel, numHeads, - InputDataType(), // No attention mask to encoder-decoder attention. + arma::mat(), // No attention mask to encoder-decoder attention. keyPaddingMask); // Residual connection. @@ -172,7 +193,7 @@ class TransformerDecoder residualAdd2->Add(encoderDecoderAttention); residualAdd2->Add>(); - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(residualAdd2); decoderBlock->Add>(dModel * tgtSeqLen); return decoderBlock; @@ -183,7 +204,7 @@ class TransformerDecoder */ Sequential<>* PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(); + Sequential<>* positionWiseFFN = new Sequential<>(false); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); @@ -194,7 +215,7 @@ class TransformerDecoder residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(residualAdd); return decoderBlock; } @@ -208,7 +229,7 @@ class TransformerDecoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! 
Locally-stored dimensionality of the model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number of attention heads. @@ -221,13 +242,16 @@ class TransformerDecoder double dropout; //! Locally-stored attention mask. - InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete pointer-type decoder object. + bool ownMemory; //! Locally-stored complete decoder network. - Sequential* decoder; + Sequential<>* decoder; }; // class TransformerDecoder } // namespace ann diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index ac9b33ae..a52e0257 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -19,24 +19,21 @@ namespace mlpack { namespace ann /** Artificial Neural Network. */ { -template -TransformerDecoder::TransformerDecoder() : +template +TransformerDecoder::TransformerDecoder() : tgtSeqLen(0), srcSeqLen(0), dModel(0), numHeads(0), dimFFN(0), - dropout(0) + dropout(0), + ownMemory(true) { // Nothing to do here. } -template -TransformerDecoder::TransformerDecoder( +template +TransformerDecoder::TransformerDecoder( const size_t numLayers, const size_t tgtSeqLen, const size_t srcSeqLen, @@ -44,8 +41,9 @@ OutputDataType>::TransformerDecoder( const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), tgtSeqLen(tgtSeqLen), srcSeqLen(srcSeqLen), @@ -54,9 +52,10 @@ OutputDataType>::TransformerDecoder( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - decoder = new Sequential(); + decoder = new Sequential<>(false); for (size_t n = 0; n < numLayers; ++n) { @@ -67,7 +66,7 @@ OutputDataType>::TransformerDecoder( break; } - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(AttentionBlock()); decoderBlock->Add(PositionWiseFFNBlock()); @@ -79,19 +78,17 @@ OutputDataType>::TransformerDecoder( } } -template -void TransformerDecoder::LoadModel(const std::string& filepath) +template +void TransformerDecoder:: +LoadModel(const std::string& filepath) { data::Load(filepath, "TransformerDecoder", decoder); std::cout << "Loaded model" << std::endl; } -template -void TransformerDecoder::SaveModel(const std::string& filepath) +template +void TransformerDecoder:: +SaveModel(const std::string& filepath) { std::cout << "Saving model" << std::endl; data::Save(filepath, "TransformerDecoder", decoder); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 20e70050..50db3b70 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -36,16 +36,10 @@ namespace ann /** Artificial Neural Network. */ { * position-wise feed forward neural network. * @tparam RegularizerType The regularizer type to be applied on layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). 
*/ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class TransformerEncoder { @@ -62,6 +56,7 @@ class TransformerEncoder * @param dropout The dropout rate. * @param attentionMask The attention mask to be applied to the sequences. * @param keyPaddingMask The key padding mask applied to the sequences. + * @param ownMemory Whether to delete the pointer-type encoder object. */ TransformerEncoder(const size_t numLayers, const size_t srcSeqLen, @@ -69,13 +64,38 @@ class TransformerEncoder const size_t numHeads = 2, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); + + /** + * Destructor. + */ + ~TransformerEncoder() + { + if (ownMemory) + delete encoder; + } + + /** + * Copy constructor. + */ + TransformerEncoder(const TransformerEncoder& ) = delete; + + /** + * Move constructor. + */ + TransformerEncoder(const TransformerEncoder&& ) = delete; + + /** + * Copy assignment operator. + */ + TransformerEncoder& operator = (const TransformerEncoder& ) = delete; /** * Get the Transformer Encoder Model. */ - Sequential* Model() + Sequential* Model() { return encoder; } @@ -95,16 +115,16 @@ class TransformerEncoder void SaveModel(const std::string& filepath); //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: /** @@ -118,15 +138,19 @@ class TransformerEncoder input->Add>(); /* Self attention layer. */ - Sequential<>* selfAttention = new Sequential<>(); - selfAttention->Add(input); - selfAttention->Add - >(srcSeqLen, srcSeqLen, dModel, numHeads); + Sequential<>* selfAttn = new Sequential<>(false); + selfAttn->Add(input); + selfAttn->Add>( + srcSeqLen, + srcSeqLen, + dModel, + numHeads, + attentionMask, + keyPaddingMask); /* This layer adds a residual connection. */ AddMerge<>* residualAdd = new AddMerge<>(); - residualAdd->Add(selfAttention); + residualAdd->Add(selfAttn); residualAdd->Add>(); encoder->Add(residualAdd); @@ -138,7 +162,7 @@ class TransformerEncoder */ void PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(); + Sequential<>* positionWiseFFN = new Sequential<>(false); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); @@ -159,7 +183,7 @@ class TransformerEncoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored dimensionality of model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number of attention heads. @@ -172,13 +196,16 @@ class TransformerEncoder double dropout; //! Locally-stored attention mask. 
- InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete the pointer-type encoder object. + bool ownMemory; //! Locally-stored encoder block. - Sequential* encoder; + Sequential<>* encoder; }; // class TransformerEncoder } // namespace ann diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index a364e24d..9da43258 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -19,18 +19,17 @@ namespace mlpack { namespace ann /** Artificial Neural Network. */ { -template -TransformerEncoder::TransformerEncoder( +template +TransformerEncoder::TransformerEncoder( const size_t numLayers, const size_t srcSeqLen, const size_t dModel, const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), srcSeqLen(srcSeqLen), dModel(dModel), @@ -38,9 +37,10 @@ OutputDataType>::TransformerEncoder( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - encoder = new Sequential(); + encoder = new Sequential(false); for (size_t n = 0; n < numLayers; ++n) { @@ -49,19 +49,17 @@ OutputDataType>::TransformerEncoder( } } -template -void TransformerEncoder::LoadModel(const std::string& filePath) +template +void TransformerEncoder:: +LoadModel(const std::string& filePath) { data::Load(filePath, "TransformerEncoder", encoder); std::cout << "Loaded model" << std::endl; } -template -void TransformerEncoder::SaveModel(const std::string& filePath) +template +void TransformerEncoder:: +SaveModel(const std::string& filePath) { std::cout << "Saving model" << std::endl; data::Save(filePath, "TransformerEncoder", encoder); diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index 45bef01d..1f0de944 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -31,16 +31,10 @@ namespace ann /** Artificial Neural Network. */ { * position-wise feed forward neural network. * @tparam RegularizerType The regularizer type to be applied on layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). */ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class Transformer { @@ -60,6 +54,7 @@ class Transformer * @param dropout The dropout rate. * @param attentionMask The attention mask to be applied to the sequences. * @param keyPaddingMask The key padding mask applied to the sequences. + * @param ownMemory Whether to delete pointer-type transformer object. 
*/ Transformer(const size_t numLayers, const size_t tgtSeqLen, @@ -70,42 +65,53 @@ class Transformer const size_t numHeads = 12, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); /** - * Get the Transformer Encoder Model. + * Destructor. */ - Sequential* Model() + ~Transformer() { - return transformer; + if (ownMemory) + delete transformer; } /** - * Load the Transformer model from a local directory. - * - * @param filepath The location of the stored model. + * Copy constructor. */ - void LoadModel(const std::string& filepath); + Transformer(const Transformer& /* transformer */) = delete; /** - * Save the Transformer model locally. - * - * @param filepath The location where the model is to be saved. + * Move constructor. */ - void SaveModel(const std::string& filepath); + Transformer(const Transformer&& /* transformer */) = delete; + + /** + * Copy assignment operator. + */ + Transformer& operator = (const Transformer& /* transformer */) = delete; + + /** + * Get the Transformer Encoder Model. + */ + Sequential<>* Model() + { + return transformer; + } //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: //! Locally-stored number of encoder and decoder layers. @@ -123,7 +129,7 @@ class Transformer //! Locally-stored vocabulary size of the source. size_t srcVocabSize; - //! Locally-stored dimensionality of the model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number attention heads. @@ -136,13 +142,16 @@ class Transformer double dropout; //! Locally-stored attention mask. - InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete the pointer-type transformer object. + bool ownMemory; //! Locally-stored transformer model. - Sequential* transformer; + Sequential<>* transformer; }; // class Transformer } // namespace ann diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 5fdb4940..57d813d5 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -19,10 +19,8 @@ namespace mlpack { namespace ann /** Artificial Neural Network. 
*/ { -template -Transformer::Transformer( +template +Transformer::Transformer( const size_t numLayers, const size_t tgtSeqLen, const size_t srcSeqLen, @@ -32,8 +30,9 @@ OutputDataType>::Transformer( const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), tgtSeqLen(tgtSeqLen), srcSeqLen(srcSeqLen), @@ -44,11 +43,12 @@ OutputDataType>::Transformer( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - transformer = new Sequential<>(); + transformer = new Sequential<>(false); - Sequential<>* encoder = new Sequential<>(); + Sequential<>* encoder = new Sequential<>(false); // Pull out the sequences of source language which is stacked above in the // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. @@ -57,7 +57,7 @@ OutputDataType>::Transformer( encoder->Add>(dModel, srcSeqLen); Sequential<>* encoderStack = mlpack::ann::TransformerEncoder< - ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + ActivationFunction, RegularizerType>( numLayers, srcSeqLen, dModel, @@ -69,7 +69,7 @@ OutputDataType>::Transformer( encoder->Add(encoderStack); - Sequential<>* decoderPE = new Sequential<>(); + Sequential<>* decoderPE = new Sequential<>(false); // Pull out the sequences of target language which is stacked below in the // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last @@ -85,7 +85,7 @@ OutputDataType>::Transformer( transformer->Add(encoderDecoderConcat); Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< - ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + ActivationFunction, RegularizerType>( numLayers, tgtSeqLen, srcSeqLen, @@ -99,25 +99,6 @@ OutputDataType>::Transformer( transformer->Add(decoderStack); } -template -void Transformer::LoadModel(const std::string& filePath) -{ - data::Load(filePath, "Transformer", transformer); - std::cout << "Loaded model" << std::endl; -} - -template -void Transformer::SaveModel(const std::string& filePath) -{ - std::cout << "Saving model" << std::endl; - data::Save(filePath, "Transformer", transformer); - std::cout << "Model saved in " << filePath << std::endl; -} - } // namespace ann } // namespace mlpack From a253a6a9e0c4a11ef4e882e24a9bcb68e6e7faf7 Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Tue, 25 Aug 2020 23:20:03 +0530 Subject: [PATCH 6/8] use mutator method to set mask in mha --- models/transformer/decoder.hpp | 30 ++++++++++++++--------------- models/transformer/encoder.hpp | 17 ++++++++-------- models/transformer/encoder_impl.hpp | 2 +- tests/ffn_model_tests.cpp | 6 +++--- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 012d7f3d..d486c6f7 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -147,13 +147,14 @@ class TransformerDecoder // Masked Self attention layer. 
Sequential<>* maskedSelfAttention = new Sequential<>(false); maskedSelfAttention->Add(decoderInput); - maskedSelfAttention->Add>( - tgtSeqLen, - tgtSeqLen, - dModel, - numHeads, - attentionMask); + + MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, + tgtSeqLen, + dModel, + numHeads); + mha1->AttentionMask() = attentionMask; + + maskedSelfAttention->Add(mha1); // Residual connection. AddMerge<>* residualAdd1 = new AddMerge<>(); @@ -179,14 +180,13 @@ class TransformerDecoder // Encoder-decoder attention. Sequential<>* encoderDecoderAttention = new Sequential<>(false); encoderDecoderAttention->Add(encoderDecoderAttentionInput); - encoderDecoderAttention->Add>( - tgtSeqLen, - srcSeqLen, - dModel, - numHeads, - arma::mat(), // No attention mask to encoder-decoder attention. - keyPaddingMask); + + MultiheadAttention<>* mha2 = new MultiheadAttention<>(tgtSeqLen, + srcSeqLen, + dModel, + numHeads); + mha2->KeyPaddingMask() = keyPaddingMask; + encoderDecoderAttention->Add(mha2); // Residual connection. AddMerge<>* residualAdd2 = new AddMerge<>(); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 50db3b70..a54d98b9 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -95,7 +95,7 @@ class TransformerEncoder /** * Get the Transformer Encoder Model. */ - Sequential* Model() + Sequential<>* Model() { return encoder; } @@ -140,13 +140,14 @@ class TransformerEncoder /* Self attention layer. */ Sequential<>* selfAttn = new Sequential<>(false); selfAttn->Add(input); - selfAttn->Add>( - srcSeqLen, - srcSeqLen, - dModel, - numHeads, - attentionMask, - keyPaddingMask); + + MultiheadAttention<>* mha = new MultiheadAttention<>(srcSeqLen, + srcSeqLen, + dModel, + numHeads); + mha->AttentionMask() = attentionMask; + mha->KeyPaddingMask() = keyPaddingMask; + selfAttn->Add(mha); /* This layer adds a residual connection. 
*/ AddMerge<>* residualAdd = new AddMerge<>(); diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 9da43258..05bfb058 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -40,7 +40,7 @@ TransformerEncoder::TransformerEncoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - encoder = new Sequential(false); + encoder = new Sequential<>(false); for (size_t n = 0; n < numLayers; ++n) { diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index 3f4601fd..f2c529d9 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -68,7 +68,7 @@ BOOST_AUTO_TEST_CASE(TransformerEncoderTest) mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(encoder.Model()); model.Add>(dModel * srcSeqLen, vocabSize); @@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE(TransformerDecoderTest) mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(decoder.Model()); model.Add>(dModel * tgtSeqLen, vocabSize); @@ -148,7 +148,7 @@ BOOST_AUTO_TEST_CASE(TransformerTest) mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen, tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(transformer.Model()); model.Add>(dModel * tgtSeqLen, tgtVocabSize); From fbdd4ff2a028cad8c75673ff81f84546f8f2d9ea Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Wed, 26 Aug 2020 22:01:58 +0530 Subject: [PATCH 7/8] set model = true --- models/transformer/decoder.hpp | 32 ++++++++++++------------- models/transformer/decoder_impl.hpp | 6 ++--- models/transformer/encoder.hpp | 10 ++++---- models/transformer/encoder_impl.hpp | 2 +- models/transformer/transformer_impl.hpp | 8 +++---- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index d486c6f7..f8686d4c 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -134,18 +134,18 @@ class TransformerDecoder */ Sequential<>* AttentionBlock() { - Sequential<>* decoderBlockBottom = new Sequential<>(false); + Sequential<>* decoderBlockBottom = new Sequential<>(); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); // Broadcast the incoming input to decoder // i.e. query into (query, key, value). - Concat<>* decoderInput = new Concat<>(); + Concat<>* decoderInput = new Concat<>(true); decoderInput->Add>(); decoderInput->Add>(); decoderInput->Add>(); // Masked Self attention layer. - Sequential<>* maskedSelfAttention = new Sequential<>(false); + Sequential<>* maskedSelfAttention = new Sequential<>(); maskedSelfAttention->Add(decoderInput); MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, @@ -157,7 +157,7 @@ class TransformerDecoder maskedSelfAttention->Add(mha1); // Residual connection. - AddMerge<>* residualAdd1 = new AddMerge<>(); + AddMerge<>* residualAdd1 = new AddMerge<>(true); residualAdd1->Add(maskedSelfAttention); residualAdd1->Add>(); @@ -167,19 +167,19 @@ class TransformerDecoder decoderBlockBottom->Add>(dModel * tgtSeqLen); // This layer broadcasts the output of encoder i.e. key into (key, value). 
- Concat<>* broadcastEncoderOutput = new Concat<>(); + Concat<>* broadcastEncoderOutput = new Concat<>(true); broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); // This layer concatenates the output of the bottom decoder block (query) // and the output of the encoder (key, value). - Concat<>* encoderDecoderAttentionInput = new Concat<>(); - encoderDecoderAttentionInput->Add(decoderBlockBottom); - encoderDecoderAttentionInput->Add(broadcastEncoderOutput); + Concat<>* encDecAttnInput = new Concat<>(true); + encDecAttnInput->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); + encDecAttnInput->Add(broadcastEncoderOutput); // Encoder-decoder attention. - Sequential<>* encoderDecoderAttention = new Sequential<>(false); - encoderDecoderAttention->Add(encoderDecoderAttentionInput); + Sequential<>* encoderDecoderAttention = new Sequential<>(); + encoderDecoderAttention->Add(encDecAttnInput); MultiheadAttention<>* mha2 = new MultiheadAttention<>(tgtSeqLen, srcSeqLen, @@ -189,11 +189,11 @@ class TransformerDecoder encoderDecoderAttention->Add(mha2); // Residual connection. - AddMerge<>* residualAdd2 = new AddMerge<>(); + AddMerge<>* residualAdd2 = new AddMerge<>(true); residualAdd2->Add(encoderDecoderAttention); - residualAdd2->Add>(); + residualAdd2->Add(decoderBlockBottom); - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(residualAdd2); decoderBlock->Add>(dModel * tgtSeqLen); return decoderBlock; @@ -204,18 +204,18 @@ class TransformerDecoder */ Sequential<>* PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(false); + Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); positionWiseFFN->Add>(dropout); /* Residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(residualAdd); return decoderBlock; } diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index a52e0257..d77878ba 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -55,7 +55,7 @@ TransformerDecoder::TransformerDecoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - decoder = new Sequential<>(false); + decoder = new Sequential<>(); for (size_t n = 0; n < numLayers; ++n) { @@ -66,11 +66,11 @@ TransformerDecoder::TransformerDecoder( break; } - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(AttentionBlock()); decoderBlock->Add(PositionWiseFFNBlock()); - Concat<>* concatQueryKey = new Concat<>(); + Concat<>* concatQueryKey = new Concat<>(true); concatQueryKey->Add(decoderBlock); concatQueryKey->Add>(1, dModel * tgtSeqLen, -1, 0, -1); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index a54d98b9..aace4131 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -132,13 +132,13 @@ class TransformerEncoder */ void AttentionBlock() { - Concat<>* input = new Concat<>(); + Concat<>* input = new Concat<>(true); input->Add>(); input->Add>(); input->Add>(); /* Self attention layer. 
*/ - Sequential<>* selfAttn = new Sequential<>(false); + Sequential<>* selfAttn = new Sequential<>(); selfAttn->Add(input); MultiheadAttention<>* mha = new MultiheadAttention<>(srcSeqLen, @@ -150,7 +150,7 @@ class TransformerEncoder selfAttn->Add(mha); /* This layer adds a residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(selfAttn); residualAdd->Add>(); @@ -163,14 +163,14 @@ class TransformerEncoder */ void PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(false); + Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); positionWiseFFN->Add>(dropout); /* This layer adds a residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 05bfb058..16075c60 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -40,7 +40,7 @@ TransformerEncoder::TransformerEncoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - encoder = new Sequential<>(false); + encoder = new Sequential<>(); for (size_t n = 0; n < numLayers; ++n) { diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 57d813d5..7ed8c707 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -46,9 +46,9 @@ Transformer::Transformer( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - transformer = new Sequential<>(false); + transformer = new Sequential<>(); - Sequential<>* encoder = new Sequential<>(false); + Sequential<>* encoder = new Sequential<>(); // Pull out the sequences of source language which is stacked above in the // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. @@ -69,7 +69,7 @@ Transformer::Transformer( encoder->Add(encoderStack); - Sequential<>* decoderPE = new Sequential<>(false); + Sequential<>* decoderPE = new Sequential<>(); // Pull out the sequences of target language which is stacked below in the // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last @@ -78,7 +78,7 @@ Transformer::Transformer( decoderPE->Add>(tgtVocabSize, dModel); decoderPE->Add>(dModel, tgtSeqLen); - Concat<>* encoderDecoderConcat = new Concat<>(); + Concat<>* encoderDecoderConcat = new Concat<>(true); encoderDecoderConcat->Add(encoder); encoderDecoderConcat->Add(decoderPE); From 368f82e94027b5049da3baa71957f0a9f323aded Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sat, 10 Oct 2020 08:50:57 +0530 Subject: [PATCH 8/8] added suggestions --- models/transformer/decoder.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index f8686d4c..5c17d288 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -52,7 +52,6 @@ class TransformerDecoder * @param numLayers The number of decoder blocks. * @param tgtSeqLen Target Sequence Length. * @param srcSeqLen Source Sequence Length. - * @param memoryModule The last Encoder module. * @param dModel The number of features in the input. Also, same as the * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. 
@@ -90,13 +89,18 @@ class TransformerDecoder /** * Move constructor. */ - TransformerDecoder(const TransformerDecoder&& ) = delete; + TransformerDecoder(TransformerDecoder&& ) = delete; /** * Copy assignment operator. */ TransformerDecoder& operator = (const TransformerDecoder& ) = delete; + /** + * Move assignment operator. + */ + TransformerDecoder& operator = (TransformerDecoder&& ) = delete; + /** * Get the Transformer Decoder model. */ @@ -149,9 +153,9 @@ class TransformerDecoder maskedSelfAttention->Add(decoderInput); MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, - tgtSeqLen, - dModel, - numHeads); + tgtSeqLen, + dModel, + numHeads); mha1->AttentionMask() = attentionMask; maskedSelfAttention->Add(mha1);
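For reference, a minimal usage sketch of the encoder stack, modelled on the tests added in this patch series (ffn_model_tests.cpp). The include paths, the NegativeLogLikelihood<> loss, and the sizes below are illustrative assumptions, since the template arguments in the quoted test hunks are elided; this is a sketch, not part of the patch.

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/init_rules/glorot_init.hpp>
#include <mlpack/methods/ann/loss_functions/negative_log_likelihood.hpp>
#include "models/transformer/encoder.hpp"

using namespace mlpack::ann;

int main()
{
  // Illustrative sizes only.
  const size_t numLayers = 2, srcSeqLen = 10, dModel = 16, numHeads = 2;
  const size_t dimFFN = 64, vocabSize = 20;
  const double dropout = 0.1;

  // Build the encoder stack.  ownMemory stays false (the default) because the
  // Sequential<>* returned by Model() is handed to the FFN below, which is
  // expected to delete its layers itself.
  TransformerEncoder<> encoder(numLayers, srcSeqLen, dModel, numHeads,
                               dimFFN, dropout);

  // Wrap the encoder in an FFN and add a linear head, as the tests do.
  FFN<NegativeLogLikelihood<>, XavierInitialization> model;
  model.Add(encoder.Model());
  model.Add<Linear<>>(dModel * srcSeqLen, vocabSize);

  // model.Train(...) / model.Predict(...) can then be used as with any FFN;
  // each input column is one sequence flattened to dModel * srcSeqLen values,
  // which is the layout the LayerNorm and Linear sizes in the blocks assume.
  return 0;
}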
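The attention and key padding masks are plain arma::mat objects that the encoder and decoder copy into their MultiheadAttention blocks at construction time (mha1->AttentionMask() = attentionMask above), so a mask intended to take effect must be supplied through the constructor rather than assigned after construction. Below is a sketch of a causal mask for the decoder; it assumes MultiheadAttention applies the mask additively to the raw attention scores, so blocked positions carry a large negative value. Verify the exact convention against the MultiheadAttention documentation before relying on it.

#include <mlpack/core.hpp>
#include "models/transformer/decoder.hpp"

using namespace mlpack::ann;

void BuildMaskedDecoder()
{
  // Illustrative sizes only.
  const size_t numLayers = 2, tgtSeqLen = 8, srcSeqLen = 10, dModel = 16,
               numHeads = 2, dimFFN = 64;
  const double dropout = 0.1;

  // Causal mask: 0 where attention is allowed, a large negative value where
  // position i would otherwise attend to a future position j.
  arma::mat attnMask(tgtSeqLen, tgtSeqLen, arma::fill::zeros);
  for (size_t i = 0; i < tgtSeqLen; ++i)
    for (size_t j = i + 1; j < tgtSeqLen; ++j)
      attnMask(i, j) = -1e9;

  // The constructor copies the mask into each masked self-attention block.
  TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, dModel,
                               numHeads, dimFFN, dropout, attnMask);

  // decoder.Model() can now be added to an FFN<>, exactly as in the tests.
}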