From 437fdebb0432a6727c8e1661400e032feefec67a Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Wed, 3 Jun 2020 15:39:25 +0530 Subject: [PATCH 1/8] complete encoder decoder and transformer model --- .gitignore | 1 + models/CMakeLists.txt | 10 +- models/transformer/CMakeLists.txt | 20 ++ models/transformer/decoder.hpp | 236 ++++++++++++++++++++++++ models/transformer/decoder_impl.hpp | 90 +++++++++ models/transformer/encoder.hpp | 191 +++++++++++++++++++ models/transformer/encoder_impl.hpp | 76 ++++++++ models/transformer/transformer.hpp | 156 ++++++++++++++++ models/transformer/transformer_impl.hpp | 128 +++++++++++++ tests/ffn_model_tests.cpp | 116 ++++++++++++ 10 files changed, 1023 insertions(+), 1 deletion(-) create mode 100644 models/transformer/CMakeLists.txt create mode 100644 models/transformer/decoder.hpp create mode 100644 models/transformer/decoder_impl.hpp create mode 100644 models/transformer/encoder.hpp create mode 100644 models/transformer/encoder_impl.hpp create mode 100644 models/transformer/transformer.hpp create mode 100644 models/transformer/transformer_impl.hpp diff --git a/.gitignore b/.gitignore index 4708e5bb..92a9136d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ build* xcode* +.vscode/ .DS_Store .idea cmake-build-* diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt index c4bd5a8c..77aa6f85 100644 --- a/models/CMakeLists.txt +++ b/models/CMakeLists.txt @@ -1,7 +1,15 @@ cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) project(models) -add_subdirectory(darknet) +# Recurse into each model mlpack provides. +set(DIRS + darknet + transformer +) + +foreach(dir ${DIRS}) + add_subdirectory(${dir}) +endforeach() # Add directory name to sources. set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/) diff --git a/models/transformer/CMakeLists.txt b/models/transformer/CMakeLists.txt new file mode 100644 index 00000000..288262a4 --- /dev/null +++ b/models/transformer/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR) +project(transformer) + +set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/) +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../") + +set(SOURCES + decoder.hpp + decoder_impl.hpp + encoder.hpp + encoder_impl.hpp + transformer.hpp + transformer_impl.hpp +) + +foreach(file ${SOURCES}) + set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file}) +endforeach() + +set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp new file mode 100644 index 00000000..28e055a5 --- /dev/null +++ b/models/transformer/decoder.hpp @@ -0,0 +1,236 @@ +/** + * @file models/transformer/decoder.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer Decoder layer. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_DECODER_HPP +#define MODELS_TRANSFORMER_DECODER_HPP + +#include +#include +#include +#include + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * In addition to the two sub-layers in each encoder layer, the decoder inserts + * a third sub-layer, which performs multi-head attention over the output of the + * encoder stack. 
Similar to the encoder, we employ residual connections around + * each of the sub-layers, followed by layer normalization. We also modify the + * self-attention sub-layer in the decoder stack to prevent positions from + * attending to subsequent positions. This masking, combined with fact that the + * output embeddings are offset by one position, ensures that the predictions + * for position i can depend only on the known outputs at positions less than i. + * + * @tparam ActivationFunction The type of the activation function to be used in + * the position-wise feed forward neural network. + * @tparam RegularizerType The type of regularizer to be applied to layer + * parameters. + * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class TransformerDecoder +{ + public: + TransformerDecoder(); + + /** + * Create the TransformerDecoder object using the specified parameters. + * + * @param numLayers The number of decoder blocks. + * @param tgtSeqLen Target Sequence Length. + * @param srcSeqLen Source Sequence Length. + * @param memoryModule The last Encoder module. + * @param dModel The number of features in the input. Also, same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask used to black-out future sequences. + * @param keyPaddingMask The padding mask used to black-out particular token. + */ + TransformerDecoder(const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t dModel = 512, + const size_t numHeads = 8, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Decoder model. + */ + Sequential<>* Model() { return decoder; } + /** + * Load the network from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the network locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + /** + * This method adds the attention block to the decoder. + */ + void AttentionBlock() + { + Sequential<>* decoderBlockBottom = new Sequential<>(); + decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); + + // Broadcast the incoming input to decoder + // i.e. query into (query, key, value). + Concat<>* decoderInput = new Concat<>(); + decoderInput->Add>(); + decoderInput->Add>(); + decoderInput->Add>(); + + // Masked Self attention layer. 
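Before this layer is assembled below, a note on the `attentionMask` it receives: the look-ahead masking described in the class comment is typically a square matrix added to the raw attention scores ahead of the softmax. A minimal sketch, assuming that additive convention (the helper name and the -1e9 constant are illustrative and not part of this patch):

#include <armadillo>
#include <cstddef>

// Zero on and below the diagonal, a large negative value above it, so that
// after softmax(scores + mask) position i attends only to positions j <= i.
arma::mat LookAheadMask(const size_t tgtSeqLen)
{
  arma::mat mask(tgtSeqLen, tgtSeqLen, arma::fill::zeros);
  for (size_t i = 0; i < tgtSeqLen; ++i)
    for (size_t j = i + 1; j < tgtSeqLen; ++j)
      mask(i, j) = -1e9; // Effectively minus infinity for the softmax.
  return mask;
}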
+ Sequential<>* maskedSelfAttention = new Sequential<>(); + maskedSelfAttention->Add(decoderInput); + maskedSelfAttention->Add>( + tgtSeqLen, + tgtSeqLen, + dModel, + numHeads, + attentionMask + ); + + // Residual connection. + AddMerge<>* residualAdd1 = new AddMerge<>(); + residualAdd1->Add(maskedSelfAttention); + residualAdd1->Add>(); + + decoderBlockBottom->Add(residualAdd1); + + // Add the LayerNorm layer with required parameters. + decoderBlockBottom->Add>(dModel * tgtSeqLen); + + // This layer broadcasts the output of encoder i.e. key into (key, value). + Concat<>* broadcastEncoderOutput = new Concat<>(); + broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + + // This layer concatenates the output of the bottom decoder block (query) + // and the output of the encoder (key, value). + Concat<>* encoderDecoderAttentionInput = new Concat<>(); + encoderDecoderAttentionInput->Add(decoderBlockBottom); + encoderDecoderAttentionInput->Add(broadcastEncoderOutput); + + // Encoder-decoder attention. + Sequential<>* encoderDecoderAttention = new Sequential<>(); + encoderDecoderAttention->Add(encoderDecoderAttentionInput); + encoderDecoderAttention->Add>( + tgtSeqLen, + srcSeqLen, + dModel, + numHeads, + InputDataType(), // No attention mask to encoder-decoder attention. + keyPaddingMask); + + // Residual connection. + AddMerge<>* residualAdd2 = new AddMerge<>(); + residualAdd2->Add(encoderDecoderAttention); + residualAdd2->Add>(); + + decoder->Add(residualAdd2); + decoder->Add(new LayerNorm<>(dModel * tgtSeqLen)); + } + + /** + * This method adds the position-wise feed forward network to the decoder. + */ + void PositionWiseFFNBlock() + { + Sequential<>* positionWiseFFN = new Sequential<>(); + positionWiseFFN->Add>(dModel, dimFFN); + positionWiseFFN->Add(); + positionWiseFFN->Add>(dimFFN, dModel); + positionWiseFFN->Add>(dropout); + + /* Residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(positionWiseFFN); + residualAdd->Add>(); + decoder->Add(residualAdd); + } + + //! Locally-stored number of decoder layers. + size_t numLayers; + + //! Locally-stored target sequence length. + size_t tgtSeqLen; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored complete decoder network. + Sequential* decoder; + +}; // class TransformerDecoder + +} // namespace ann +} // namespace mlpack + +// Include implementation. +#include "decoder_impl.hpp" + +#endif diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp new file mode 100644 index 00000000..8cbf4f6d --- /dev/null +++ b/models/transformer/decoder_impl.hpp @@ -0,0 +1,90 @@ +/** + * @file models/transformer/decoder_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer Decoder class. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. 
If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_DECODER_IMPL_HPP +#define MODELS_TRANSFORMER_DECODER_IMPL_HPP + +#include "decoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +TransformerDecoder::TransformerDecoder() : + tgtSeqLen(0), + srcSeqLen(0), + dModel(0), + numHeads(0), + dimFFN(0), + dropout(0) +{ + // Nothing to do here. +} + +template +TransformerDecoder::TransformerDecoder( + const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + tgtSeqLen(tgtSeqLen), + srcSeqLen(srcSeqLen), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + decoder = new Sequential(); + + for (size_t N = 0; N < numLayers; ++N) + { + AttentionBlock(); + PositionWiseFFNBlock(); + } +} + +template +void TransformerDecoder::LoadModel(const std::string& filepath) +{ + data::Load(filepath, "TransformerDecoder", decoder); + std::cout << "Loaded model" << std::endl; +} + +template +void TransformerDecoder::SaveModel(const std::string& filepath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filepath, "TransformerDecoder", decoder); + std::cout << "Model saved in " << filepath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp new file mode 100644 index 00000000..794e3fb3 --- /dev/null +++ b/models/transformer/encoder.hpp @@ -0,0 +1,191 @@ +/** + * @file models/transformer/encoder.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer Encoder layer. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_ENCODER_HPP +#define MODELS_TRANSFORMER_ENCODER_HPP + +#include +#include +#include +#include + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * The Transformer Encoder layer has two sub-layers. The first is a multi-head + * self-attention mechanism, and the second is a simple, position-wise + * fully connected feed-forward network. We employ a residual connection around + * each of the two sub-layers, followed by layer normalization. Hence the output + * of each sub-layer is 'LayerNorm(x + Sublayer(x))', where 'Sublayer(x)' is the + * function implemented by the sub-layer itself. To facilitate these residual + * connections, all sub-layers in the model, as well as the embedding layers, + * produce outputs of dimension 'dModel'. + * + * @tparam ActivationType The type of activation function to be used in the + * position-wise feed forward neural network. + * @tparam RegularizerType The regularizer type to be applied on layer + * parameters. + * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). 
+ */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class TransformerEncoder +{ + public: + /** + * Create the TransformerEncoder object using the specified parameters. + * + * @param numLayers The number of encoder blocks. + * @param srcSeqLen Source Sequence Length. + * @param dModel The number of features in the input. It is same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask to be applied to the sequences. + * @param keyPaddingMask The key padding mask applied to the sequences. + */ + TransformerEncoder(const size_t numLayers, + const size_t srcSeqLen, + const size_t dModel = 512, + const size_t numHeads = 2, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Encoder Model. + */ + Sequential* Model() + { + return encoder; + } + + /** + * Load the encoder block from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the encoder block locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + /** + * The method adds attention block to the encoder block. + */ + void AttentionBlock() + { + Concat<>* input = new Concat<>(); + input->Add>(); + input->Add>(); + input->Add>(); + + /* Self attention layer. */ + Sequential<>* selfAttention = new Sequential<>(); + selfAttention->Add(input); + selfAttention->Add + >(srcSeqLen, srcSeqLen, dModel, numHeads); + + /* This layer adds a residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(selfAttention); + residualAdd->Add>(); + + encoder->Add(residualAdd); + encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + } + + /** + * This method adds position-wise feed forward block to the encoder. + */ + void PositionWiseFFNBlock() + { + Sequential<>* positionWiseFFN = new Sequential<>(); + positionWiseFFN->Add>(dModel, dimFFN); + positionWiseFFN->Add(); + positionWiseFFN->Add>(dimFFN, dModel); + positionWiseFFN->Add>(dropout); + + /* This layer adds a residual connection. */ + AddMerge<>* residualAdd = new AddMerge<>(); + residualAdd->Add(positionWiseFFN); + residualAdd->Add>(); + + encoder->Add(residualAdd); + encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + } + + //! Locally-stored number of encoder blocks. + size_t numLayers; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! 
Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored encoder block. + Sequential* encoder; + +}; // class TransformerEncoder + +} // namespace ann +} // namespace mlpack + +// Include implementation. +#include "encoder_impl.hpp" + +#endif diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp new file mode 100644 index 00000000..64d4b1cc --- /dev/null +++ b/models/transformer/encoder_impl.hpp @@ -0,0 +1,76 @@ +/** + * @file models/transformer/encoder_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer Encoder class. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_ENCODER_IMPL_HPP +#define MODELS_TRANSFORMER_ENCODER_IMPL_HPP + +#include "encoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +TransformerEncoder::TransformerEncoder( + const size_t numLayers, + const size_t srcSeqLen, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + srcSeqLen(srcSeqLen), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + encoder = new Sequential(); + + encoder->Add(new IdentityLayer<>()); + + for (size_t N = 0; N < numLayers; ++N) + { + AttentionBlock(); + PositionWiseFFNBlock(); + } +} + +template +void TransformerEncoder::LoadModel(const std::string& filePath) +{ + data::Load(filePath, "TransformerEncoder", encoder); + std::cout << "Loaded model" << std::endl; +} + +template +void TransformerEncoder::SaveModel(const std::string& filePath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filePath, "TransformerEncoder", encoder); + std::cout << "Model saved in " << filePath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp new file mode 100644 index 00000000..8cc1a94f --- /dev/null +++ b/models/transformer/transformer.hpp @@ -0,0 +1,156 @@ +/** + * @file models/transformer/transformer.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Definition of the Transformer model. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_TRANSFORMER_HPP +#define MODELS_TRANSFORMER_TRANSFORMER_HPP + +#include +#include +#include +#include +#include + +#include "encoder.hpp" +#include "decoder.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +/** + * @tparam ActivationType The type of activation function to be used in the + * position-wise feed forward neural network. + * @tparam RegularizerType The regularizer type to be applied on layer + * parameters. 
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, + * arma::sp_mat or arma::cube). + */ +template < + typename ActivationFunction = ReLULayer<>, + typename RegularizerType = NoRegularizer, + typename InputDataType = arma::mat, + typename OutputDataType = arma::mat +> +class Transformer +{ + public: + /** + * Create the Transformer object using the specified parameters. + * + * @param numLayers The number of encoder and decoder layers. + * @param tgtSeqLen Target Sequence Length. + * @param srcSeqLen Source Sequence Length. + * @param tgtVocabSize Target vocabulary size. + * @param srcVocabSize Source vocabulary size. + * @param dModel The number of features in the input. Also, same as the + * 'embedDim' in 'MultiheadAttention' layer. + * @param numHeads The number of attention heads. + * @param dimFFN The dimentionality of feedforward network. + * @param dropout The dropout rate. + * @param attentionMask The attention mask to be applied to the sequences. + * @param keyPaddingMask The key padding mask applied to the sequences. + */ + Transformer(const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t tgtVocabSize, + const size_t srcVocabSize, + const size_t dModel = 512, + const size_t numHeads = 12, + const size_t dimFFN = 1024, + const double dropout = 0.1, + const InputDataType& attentionMask = InputDataType(), + const InputDataType& keyPaddingMask = InputDataType()); + + /** + * Get the Transformer Encoder Model. + */ + Sequential* Model() + { + return transformer; + } + + /** + * Load the Transformer model from a local directory. + * + * @param filepath The location of the stored model. + */ + void LoadModel(const std::string& filepath); + + /** + * Save the Transformer model locally. + * + * @param filepath The location where the model is to be saved. + */ + void SaveModel(const std::string& filepath); + + //! Get the attention mask. + InputDataType const& AttentionMask() const { return attentionMask; } + + //! Modify the attention mask. + InputDataType& AttentionMask() { return attentionMask; } + + //! Get the key padding mask. + InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + + //! Modify the key padding mask. + InputDataType& KeyPaddingMask() { return keyPaddingMask; } + + private: + + //! Locally-stored number of encoder and decoder layers. + size_t numLayers; + + //! Locally-stored target sequence length. + size_t tgtSeqLen; + + //! Locally-stored source sequence length. + size_t srcSeqLen; + + //! Locally-stored vocabulary size of the target. + size_t tgtVocabSize; + + //! Locally-stored vocabulary size of the source. + size_t srcVocabSize; + + //! Locally-stored number of input units. + size_t dModel; + + //! Locally-stored number of output units. + size_t numHeads; + + //! Locally-stored weight object. + size_t dimFFN; + + //! Locally-stored weight parameters. + double dropout; + + //! Locally-stored attention mask. + InputDataType attentionMask; + + //! Locally-stored key padding mask. + InputDataType keyPaddingMask; + + //! Locally-stored transformer model. + Sequential* transformer; + +}; // class Transformer + +} // namespace ann +} // namespace mlpack + +// Include implementation. 
+#include "transformer_impl.hpp" + +#endif diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp new file mode 100644 index 00000000..fc545e8f --- /dev/null +++ b/models/transformer/transformer_impl.hpp @@ -0,0 +1,128 @@ +/** + * @file models/transformer/transformer_impl.hpp + * @author Mikhail Lozhnikov + * @author Mrityunjay Tripathi + * + * Implementation of the Transformer model. + * + * mlpack is free software; you may redistribute it and/or modify it under the + * terms of the 3-clause BSD license. You should have received a copy of the + * 3-clause BSD license along with mlpack. If not, see + * http://www.opensource.org/licenses/BSD-3-Clause for more information. + */ + +#ifndef MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP +#define MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP + +#include "transformer.hpp" + +namespace mlpack { +namespace ann /** Artificial Neural Network. */ { + +template +Transformer::Transformer( + const size_t numLayers, + const size_t tgtSeqLen, + const size_t srcSeqLen, + const size_t tgtVocabSize, + const size_t srcVocabSize, + const size_t dModel, + const size_t numHeads, + const size_t dimFFN, + const double dropout, + const InputDataType& attentionMask, + const InputDataType& keyPaddingMask) : + numLayers(numLayers), + tgtSeqLen(tgtSeqLen), + srcSeqLen(srcSeqLen), + tgtVocabSize(tgtVocabSize), + srcVocabSize(srcVocabSize), + dModel(dModel), + numHeads(numHeads), + dimFFN(dimFFN), + dropout(dropout), + attentionMask(attentionMask), + keyPaddingMask(keyPaddingMask) +{ + transformer = new Sequential<>(); + + Sequential<>* encoder = new Sequential<>(); + + // Pull out the sequences of source language which is stacked above in the + // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. + encoder->Add>(1, 0, srcSeqLen - 1, 0, -1); + encoder->Add>(srcVocabSize, dModel); + encoder->Add>(dModel, srcSeqLen); + + Sequential<>* encoderStack = mlpack::ann::TransformerEncoder< + ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + numLayers, + srcSeqLen, + dModel, + numHeads, + dimFFN, + dropout, + attentionMask, + keyPaddingMask, + ).Model(); + + encoder->Add(encoderStack); + + Sequential<>* decoderPE = new Sequential<>(); + + // Pull out the sequences of target language which is stacked below in the + // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last + // row and last batch of the input matrix respectively. 
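As a concrete illustration of the input layout that both the encoder's Subview above and the decoder-side Subview on the next line rely on, a small sketch (dummy token ids; the helper name is illustrative only). Each column of the input matrix is one sample, with the srcSeqLen source token indices stacked above the tgtSeqLen target token indices, mirroring the join_cols call in the TransformerTest case further below:

#include <armadillo>
#include <cstddef>

// One column per sample: source token ids on top, target token ids below.
arma::mat MakeTransformerInput(const size_t srcSeqLen, const size_t tgtSeqLen)
{
  arma::mat srcTokens(srcSeqLen, 1, arma::fill::ones); // Dummy source ids.
  arma::mat tgtTokens(tgtSeqLen, 1, arma::fill::ones); // Dummy target ids.
  // Rows 0 .. srcSeqLen - 1 feed the encoder's Subview above; rows
  // srcSeqLen .. end feed the decoder-side Subview right after this aside.
  return arma::join_cols(srcTokens, tgtTokens);
}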
+ decoderPE->Add>(1, srcSeqLen, -1, 0, -1); + decoderPE->Add>(tgtVocabSize, dModel); + decoderPE->Add>(dModel, tgtSeqLen); + + Concat<>* encoderDecoderConcat = new Concat<>(); + encoderDecoderConcat->Add(encoder); + encoderDecoderConcat->Add(decoderPE); + + Sequential<>* decoder = new Sequential<>(); + decoder->Add(encoderDecoderConcat); + + Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< + ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + numLayers, + tgtSeqLen, + srcSeqLen, + dModel, + numHeads, + dimFFN, + dropout, + attentionMask, + keyPaddingMask, + ).Model(); + + decoder->Add(decoderStack); + transformer->Add(decoder); +} + +template +void Transformer::LoadModel(const std::string& filePath) +{ + data::Load(filePath, "Transformer", transformer.Model()); + std::cout << "Loaded model" << std::endl; +} + +template +void Transformer::SaveModel(const std::string& filePath) +{ + std::cout << "Saving model" << std::endl; + data::Save(filePath, "Transformer", transformer.Model()); + std::cout << "Model saved in " << filePath << std::endl; +} + +} // namespace ann +} // namespace mlpack + +#endif diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index 84e42eb7..eabda79c 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -14,6 +14,11 @@ #include #include #include +#include +#include +#include +#include +#include #include // Use namespaces for convenience. @@ -42,4 +47,115 @@ BOOST_AUTO_TEST_CASE(DarknetModelTest) BOOST_REQUIRE_EQUAL(output.n_rows, 1000); } +/** + * Simple Transformer Encoder test. + */ +BOOST_AUTO_TEST_CASE(TransformerEncoderTest) +{ + const size_t vocabSize = 20; + const size_t numLayers = 2; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat input = arma::randu(dModel * srcSeqLen, 1); + arma::mat output; + + mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen, + dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(encoder.Model()); + model.Add>(dModel * srcSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + +/** + * Simple Transformer Decoder test. + */ +BOOST_AUTO_TEST_CASE(TransformerDecoderTest) +{ + const size_t vocabSize = 20; + const size_t numLayers = 2; + const size_t tgtSeqLen = 10; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat query = arma::randu(dModel * tgtSeqLen, 1); + arma::mat memory = 0.73 * arma::randu(dModel * srcSeqLen, 1); + + arma::mat input = arma::join_cols(query, memory); + arma::mat output; + + mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, + dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(decoder.Model()); + model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + +/** + * Transformer Model test. 
+ */ +BOOST_AUTO_TEST_CASE(TransformerTest) +{ + const size_t tgtVocabSize = 20; + const size_t srcVocabSize = 20; + const size_t numLayers = 2; + const size_t tgtSeqLen = 10; + const size_t srcSeqLen = 10; + const size_t dModel = 16; + const size_t numHeads = 2; + const size_t dimFFN = 16; + const double dropout = 0.3; + + arma::mat srcLanguage(srcSeqLen, 1), tgtLanguage(tgtSeqLen, 1); + + for (size_t t = 0; t < srcSeqLen; ++t) + { + srcLanguage(t) = mlpack::math::RandInt(1, srcVocabSize); + } + + for (size_t t = 0; t < tgtSeqLen; ++t) + { + tgtLanguage(t) = mlpack::math::RandInt(1, tgtVocabSize); + } + + arma::mat input = arma::join_cols(srcLanguage, tgtLanguage); + arma::mat output; + + mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen, + tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout); + + FFN<> model; + + model.Add(transformer.Model()); + model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(); + + model.Predict(input, output); + + BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_cols, 1); +} + BOOST_AUTO_TEST_SUITE_END(); From dc8bc78d98318c913eabf1954969fe5bebb4c44e Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sat, 22 Aug 2020 21:21:33 +0530 Subject: [PATCH 2/8] add namespace in ffn tests --- models/transformer/transformer_impl.hpp | 8 ++++---- tests/ffn_model_tests.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index fc545e8f..b129da2b 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -65,7 +65,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask, + keyPaddingMask ).Model(); encoder->Add(encoderStack); @@ -96,7 +96,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask, + keyPaddingMask ).Model(); decoder->Add(decoderStack); @@ -108,7 +108,7 @@ template ::LoadModel(const std::string& filePath) { - data::Load(filePath, "Transformer", transformer.Model()); + data::Load(filePath, "Transformer", transformer); std::cout << "Loaded model" << std::endl; } @@ -118,7 +118,7 @@ void Transformer::SaveModel(const std::string& filePath) { std::cout << "Saving model" << std::endl; - data::Save(filePath, "Transformer", transformer.Model()); + data::Save(filePath, "Transformer", transformer); std::cout << "Model saved in " << filePath << std::endl; } diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index eabda79c..3f4601fd 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -22,6 +22,8 @@ #include // Use namespaces for convenience. 
+using namespace mlpack; +using namespace mlpack::ann; using namespace boost::unit_test; BOOST_AUTO_TEST_SUITE(FFNModelsTests); @@ -149,12 +151,12 @@ BOOST_AUTO_TEST_CASE(TransformerTest) FFN<> model; model.Add(transformer.Model()); - model.Add>(dModel * tgtSeqLen, vocabSize); + model.Add>(dModel * tgtSeqLen, tgtVocabSize); model.Add>(); model.Predict(input, output); - BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize); + BOOST_REQUIRE_EQUAL(output.n_rows, tgtVocabSize); BOOST_REQUIRE_EQUAL(output.n_cols, 1); } From ec965bb86f815868d90ac358485348746dccf45d Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi <35535378+mrityunjay-tripathi@users.noreply.github.com> Date: Sun, 23 Aug 2020 13:32:48 +0530 Subject: [PATCH 3/8] Apply suggestions from code review Co-authored-by: Mikhail Lozhnikov --- models/transformer/decoder.hpp | 17 +++++++++++------ models/transformer/decoder_impl.hpp | 20 +++++++++++++++++--- models/transformer/encoder.hpp | 6 +++--- models/transformer/encoder_impl.hpp | 4 +--- models/transformer/transformer.hpp | 2 +- models/transformer/transformer_impl.hpp | 6 ++---- 6 files changed, 35 insertions(+), 20 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 28e055a5..64fc2974 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -60,7 +60,7 @@ class TransformerDecoder * @param srcSeqLen Source Sequence Length. * @param memoryModule The last Encoder module. * @param dModel The number of features in the input. Also, same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. @@ -111,7 +111,7 @@ class TransformerDecoder /** * This method adds the attention block to the decoder. */ - void AttentionBlock() + Sequential<>* AttentionBlock() { Sequential<>* decoderBlockBottom = new Sequential<>(); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); @@ -173,14 +173,16 @@ class TransformerDecoder residualAdd2->Add(encoderDecoderAttention); residualAdd2->Add>(); - decoder->Add(residualAdd2); - decoder->Add(new LayerNorm<>(dModel * tgtSeqLen)); + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(residualAdd2); + decoderBlock->Add>(dModel * tgtSeqLen); + return decoderBlock; } /** * This method adds the position-wise feed forward network to the decoder. */ - void PositionWiseFFNBlock() + Sequential<>* PositionWiseFFNBlock() { Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); @@ -192,7 +194,10 @@ class TransformerDecoder AddMerge<>* residualAdd = new AddMerge<>(); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - decoder->Add(residualAdd); + + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(residualAdd); + return decoderBlock; } //! Locally-stored number of decoder layers. 
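The decoder_impl.hpp hunk that follows chains the blocks so that every decoder layer consumes and produces the same stacked column layout. A rough sketch of that per-layer contract, assuming the row split used by the Subview layers in decoder.hpp (the function is purely illustrative; the attention itself is elided):

#include <armadillo>
#include <cstddef>

// Rows 0 .. dModel * tgtSeqLen - 1 hold the query (updated by the block);
// the remaining rows hold the encoder memory, which is passed through
// unchanged so the next block can reuse it.
arma::mat DecoderLayerLayout(const arma::mat& input, const size_t dModel,
                             const size_t tgtSeqLen)
{
  arma::mat query  = input.rows(0, dModel * tgtSeqLen - 1);
  arma::mat memory = input.rows(dModel * tgtSeqLen, input.n_rows - 1);
  // ... masked self-attention and encoder-decoder attention update query ...
  return arma::join_cols(query, memory); // Same layout for the next block.
}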
diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index 8cbf4f6d..ac9b33ae 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -58,10 +58,24 @@ OutputDataType>::TransformerDecoder( { decoder = new Sequential(); - for (size_t N = 0; N < numLayers; ++N) + for (size_t n = 0; n < numLayers; ++n) { - AttentionBlock(); - PositionWiseFFNBlock(); + if (n + 1 == numLayers) + { + decoder->Add(AttentionBlock()); + decoder->Add(PositionWiseFFNBlock()); + break; + } + + Sequential<>* decoderBlock = new Sequential<>(); + decoderBlock->Add(AttentionBlock()); + decoderBlock->Add(PositionWiseFFNBlock()); + + Concat<>* concatQueryKey = new Concat<>(); + concatQueryKey->Add(decoderBlock); + concatQueryKey->Add>(1, dModel * tgtSeqLen, -1, 0, -1); + + decoder->Add(concatQueryKey); } } diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 794e3fb3..9ffe5a2e 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -56,7 +56,7 @@ class TransformerEncoder * @param numLayers The number of encoder blocks. * @param srcSeqLen Source Sequence Length. * @param dModel The number of features in the input. It is same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. @@ -130,7 +130,7 @@ class TransformerEncoder residualAdd->Add>(); encoder->Add(residualAdd); - encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + encoder->Add>(dModel * srcSeqLen); } /** @@ -150,7 +150,7 @@ class TransformerEncoder residualAdd->Add>(); encoder->Add(residualAdd); - encoder->Add(new LayerNorm<>(dModel * srcSeqLen)); + encoder->Add>(dModel * srcSeqLen); } //! Locally-stored number of encoder blocks. diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 64d4b1cc..a364e24d 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -42,9 +42,7 @@ OutputDataType>::TransformerEncoder( { encoder = new Sequential(); - encoder->Add(new IdentityLayer<>()); - - for (size_t N = 0; N < numLayers; ++N) + for (size_t n = 0; n < numLayers; ++n) { AttentionBlock(); PositionWiseFFNBlock(); diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index 8cc1a94f..d766fac9 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -54,7 +54,7 @@ class Transformer * @param tgtVocabSize Target vocabulary size. * @param srcVocabSize Source vocabulary size. * @param dModel The number of features in the input. Also, same as the - * 'embedDim' in 'MultiheadAttention' layer. + * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. * @param dimFFN The dimentionality of feedforward network. * @param dropout The dropout rate. 
diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index b129da2b..178dc040 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -83,8 +83,7 @@ OutputDataType>::Transformer( encoderDecoderConcat->Add(encoder); encoderDecoderConcat->Add(decoderPE); - Sequential<>* decoder = new Sequential<>(); - decoder->Add(encoderDecoderConcat); + transformer->Add(encoderDecoderConcat); Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< ActivationFunction, RegularizerType, InputDataType, OutputDataType>( @@ -99,8 +98,7 @@ OutputDataType>::Transformer( keyPaddingMask ).Model(); - decoder->Add(decoderStack); - transformer->Add(decoder); + transformer->Add(decoderStack); } template Date: Sun, 23 Aug 2020 17:39:19 +0530 Subject: [PATCH 4/8] add proper parameter description --- models/transformer/decoder.hpp | 12 +++++------- models/transformer/encoder.hpp | 9 ++++----- models/transformer/transformer.hpp | 10 ++++------ models/transformer/transformer_impl.hpp | 6 ++---- 4 files changed, 15 insertions(+), 22 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 64fc2974..6f9dd771 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -132,8 +132,7 @@ class TransformerDecoder tgtSeqLen, dModel, numHeads, - attentionMask - ); + attentionMask); // Residual connection. AddMerge<>* residualAdd1 = new AddMerge<>(); @@ -209,16 +208,16 @@ class TransformerDecoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of the model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number of attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -229,7 +228,6 @@ class TransformerDecoder //! Locally-stored complete decoder network. Sequential* decoder; - }; // class TransformerDecoder } // namespace ann diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 9ffe5a2e..20e70050 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -159,16 +159,16 @@ class TransformerEncoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number of attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -179,7 +179,6 @@ class TransformerEncoder //! Locally-stored encoder block. Sequential* encoder; - }; // class TransformerEncoder } // namespace ann diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index d766fac9..45bef01d 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -108,7 +108,6 @@ class Transformer InputDataType& KeyPaddingMask() { return keyPaddingMask; } private: - //! Locally-stored number of encoder and decoder layers. 
size_t numLayers; @@ -124,16 +123,16 @@ class Transformer //! Locally-stored vocabulary size of the source. size_t srcVocabSize; - //! Locally-stored number of input units. + //! Locally-stored dimensionality of the model. size_t dModel; - //! Locally-stored number of output units. + //! Locally-stored number attention heads. size_t numHeads; - //! Locally-stored weight object. + //! Locally-stored dimensionality of the position-wise feed forward network. size_t dimFFN; - //! Locally-stored weight parameters. + //! Locally-stored dropout rate. double dropout; //! Locally-stored attention mask. @@ -144,7 +143,6 @@ class Transformer //! Locally-stored transformer model. Sequential* transformer; - }; // class Transformer } // namespace ann diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 178dc040..5fdb4940 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -65,8 +65,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask - ).Model(); + keyPaddingMask).Model(); encoder->Add(encoderStack); @@ -95,8 +94,7 @@ OutputDataType>::Transformer( dimFFN, dropout, attentionMask, - keyPaddingMask - ).Model(); + keyPaddingMask).Model(); transformer->Add(decoderStack); } From 55de04561f7dfd731ea4bc6fb5c89d082d64214e Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sun, 23 Aug 2020 23:20:32 +0530 Subject: [PATCH 5/8] adding constructors and destructors, removing some templates --- models/transformer/decoder.hpp | 76 +++++++++++++++--------- models/transformer/decoder_impl.hpp | 41 ++++++------- models/transformer/encoder.hpp | 77 +++++++++++++++++-------- models/transformer/encoder_impl.hpp | 30 +++++----- models/transformer/transformer.hpp | 65 ++++++++++++--------- models/transformer/transformer_impl.hpp | 43 ++++---------- 6 files changed, 184 insertions(+), 148 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 6f9dd771..012d7f3d 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -36,16 +36,10 @@ namespace ann /** Artificial Neural Network. */ { * the position-wise feed forward neural network. * @tparam RegularizerType The type of regularizer to be applied to layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). */ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class TransformerDecoder { @@ -66,6 +60,7 @@ class TransformerDecoder * @param dropout The dropout rate. * @param attentionMask The attention mask used to black-out future sequences. * @param keyPaddingMask The padding mask used to black-out particular token. + * @param ownMemory Whether to delete the pointer-type decoder object. */ TransformerDecoder(const size_t numLayers, const size_t tgtSeqLen, @@ -74,13 +69,39 @@ class TransformerDecoder const size_t numHeads = 8, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); + + /** + * Destructor. 
+ */ + ~TransformerDecoder() + { + if (ownMemory) + delete decoder; + } + + /** + * Copy constructor. + */ + TransformerDecoder(const TransformerDecoder& ) = delete; + + /** + * Move constructor. + */ + TransformerDecoder(const TransformerDecoder&& ) = delete; + + /** + * Copy assignment operator. + */ + TransformerDecoder& operator = (const TransformerDecoder& ) = delete; /** * Get the Transformer Decoder model. */ Sequential<>* Model() { return decoder; } + /** * Load the network from a local directory. * @@ -96,16 +117,16 @@ class TransformerDecoder void SaveModel(const std::string& filepath); //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: /** @@ -113,7 +134,7 @@ class TransformerDecoder */ Sequential<>* AttentionBlock() { - Sequential<>* decoderBlockBottom = new Sequential<>(); + Sequential<>* decoderBlockBottom = new Sequential<>(false); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); // Broadcast the incoming input to decoder @@ -124,10 +145,10 @@ class TransformerDecoder decoderInput->Add>(); // Masked Self attention layer. - Sequential<>* maskedSelfAttention = new Sequential<>(); + Sequential<>* maskedSelfAttention = new Sequential<>(false); maskedSelfAttention->Add(decoderInput); maskedSelfAttention->Add>( + arma::mat, arma::mat, RegularizerType>>( tgtSeqLen, tgtSeqLen, dModel, @@ -156,15 +177,15 @@ class TransformerDecoder encoderDecoderAttentionInput->Add(broadcastEncoderOutput); // Encoder-decoder attention. - Sequential<>* encoderDecoderAttention = new Sequential<>(); + Sequential<>* encoderDecoderAttention = new Sequential<>(false); encoderDecoderAttention->Add(encoderDecoderAttentionInput); encoderDecoderAttention->Add>( + arma::mat, arma::mat, RegularizerType>>( tgtSeqLen, srcSeqLen, dModel, numHeads, - InputDataType(), // No attention mask to encoder-decoder attention. + arma::mat(), // No attention mask to encoder-decoder attention. keyPaddingMask); // Residual connection. @@ -172,7 +193,7 @@ class TransformerDecoder residualAdd2->Add(encoderDecoderAttention); residualAdd2->Add>(); - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(residualAdd2); decoderBlock->Add>(dModel * tgtSeqLen); return decoderBlock; @@ -183,7 +204,7 @@ class TransformerDecoder */ Sequential<>* PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(); + Sequential<>* positionWiseFFN = new Sequential<>(false); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); @@ -194,7 +215,7 @@ class TransformerDecoder residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(residualAdd); return decoderBlock; } @@ -208,7 +229,7 @@ class TransformerDecoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! 
Locally-stored dimensionality of the model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number of attention heads. @@ -221,13 +242,16 @@ class TransformerDecoder double dropout; //! Locally-stored attention mask. - InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete pointer-type decoder object. + bool ownMemory; //! Locally-stored complete decoder network. - Sequential* decoder; + Sequential<>* decoder; }; // class TransformerDecoder } // namespace ann diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index ac9b33ae..a52e0257 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -19,24 +19,21 @@ namespace mlpack { namespace ann /** Artificial Neural Network. */ { -template -TransformerDecoder::TransformerDecoder() : +template +TransformerDecoder::TransformerDecoder() : tgtSeqLen(0), srcSeqLen(0), dModel(0), numHeads(0), dimFFN(0), - dropout(0) + dropout(0), + ownMemory(true) { // Nothing to do here. } -template -TransformerDecoder::TransformerDecoder( +template +TransformerDecoder::TransformerDecoder( const size_t numLayers, const size_t tgtSeqLen, const size_t srcSeqLen, @@ -44,8 +41,9 @@ OutputDataType>::TransformerDecoder( const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), tgtSeqLen(tgtSeqLen), srcSeqLen(srcSeqLen), @@ -54,9 +52,10 @@ OutputDataType>::TransformerDecoder( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - decoder = new Sequential(); + decoder = new Sequential<>(false); for (size_t n = 0; n < numLayers; ++n) { @@ -67,7 +66,7 @@ OutputDataType>::TransformerDecoder( break; } - Sequential<>* decoderBlock = new Sequential<>(); + Sequential<>* decoderBlock = new Sequential<>(false); decoderBlock->Add(AttentionBlock()); decoderBlock->Add(PositionWiseFFNBlock()); @@ -79,19 +78,17 @@ OutputDataType>::TransformerDecoder( } } -template -void TransformerDecoder::LoadModel(const std::string& filepath) +template +void TransformerDecoder:: +LoadModel(const std::string& filepath) { data::Load(filepath, "TransformerDecoder", decoder); std::cout << "Loaded model" << std::endl; } -template -void TransformerDecoder::SaveModel(const std::string& filepath) +template +void TransformerDecoder:: +SaveModel(const std::string& filepath) { std::cout << "Saving model" << std::endl; data::Save(filepath, "TransformerDecoder", decoder); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 20e70050..50db3b70 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -36,16 +36,10 @@ namespace ann /** Artificial Neural Network. */ { * position-wise feed forward neural network. * @tparam RegularizerType The regularizer type to be applied on layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). 
*/ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class TransformerEncoder { @@ -62,6 +56,7 @@ class TransformerEncoder * @param dropout The dropout rate. * @param attentionMask The attention mask to be applied to the sequences. * @param keyPaddingMask The key padding mask applied to the sequences. + * @param ownMemory Whether to delete the pointer-type encoder object. */ TransformerEncoder(const size_t numLayers, const size_t srcSeqLen, @@ -69,13 +64,38 @@ class TransformerEncoder const size_t numHeads = 2, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); + + /** + * Destructor. + */ + ~TransformerEncoder() + { + if (ownMemory) + delete encoder; + } + + /** + * Copy constructor. + */ + TransformerEncoder(const TransformerEncoder& ) = delete; + + /** + * Move constructor. + */ + TransformerEncoder(const TransformerEncoder&& ) = delete; + + /** + * Copy assignment operator. + */ + TransformerEncoder& operator = (const TransformerEncoder& ) = delete; /** * Get the Transformer Encoder Model. */ - Sequential* Model() + Sequential* Model() { return encoder; } @@ -95,16 +115,16 @@ class TransformerEncoder void SaveModel(const std::string& filepath); //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: /** @@ -118,15 +138,19 @@ class TransformerEncoder input->Add>(); /* Self attention layer. */ - Sequential<>* selfAttention = new Sequential<>(); - selfAttention->Add(input); - selfAttention->Add - >(srcSeqLen, srcSeqLen, dModel, numHeads); + Sequential<>* selfAttn = new Sequential<>(false); + selfAttn->Add(input); + selfAttn->Add>( + srcSeqLen, + srcSeqLen, + dModel, + numHeads, + attentionMask, + keyPaddingMask); /* This layer adds a residual connection. */ AddMerge<>* residualAdd = new AddMerge<>(); - residualAdd->Add(selfAttention); + residualAdd->Add(selfAttn); residualAdd->Add>(); encoder->Add(residualAdd); @@ -138,7 +162,7 @@ class TransformerEncoder */ void PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(); + Sequential<>* positionWiseFFN = new Sequential<>(false); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); @@ -159,7 +183,7 @@ class TransformerEncoder //! Locally-stored source sequence length. size_t srcSeqLen; - //! Locally-stored dimensionality of model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number of attention heads. @@ -172,13 +196,16 @@ class TransformerEncoder double dropout; //! Locally-stored attention mask. 
- InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete the pointer-type encoder object. + bool ownMemory; //! Locally-stored encoder block. - Sequential* encoder; + Sequential<>* encoder; }; // class TransformerEncoder } // namespace ann diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index a364e24d..9da43258 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -19,18 +19,17 @@ namespace mlpack { namespace ann /** Artificial Neural Network. */ { -template -TransformerEncoder::TransformerEncoder( +template +TransformerEncoder::TransformerEncoder( const size_t numLayers, const size_t srcSeqLen, const size_t dModel, const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), srcSeqLen(srcSeqLen), dModel(dModel), @@ -38,9 +37,10 @@ OutputDataType>::TransformerEncoder( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - encoder = new Sequential(); + encoder = new Sequential(false); for (size_t n = 0; n < numLayers; ++n) { @@ -49,19 +49,17 @@ OutputDataType>::TransformerEncoder( } } -template -void TransformerEncoder::LoadModel(const std::string& filePath) +template +void TransformerEncoder:: +LoadModel(const std::string& filePath) { data::Load(filePath, "TransformerEncoder", encoder); std::cout << "Loaded model" << std::endl; } -template -void TransformerEncoder::SaveModel(const std::string& filePath) +template +void TransformerEncoder:: +SaveModel(const std::string& filePath) { std::cout << "Saving model" << std::endl; data::Save(filePath, "TransformerEncoder", encoder); diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp index 45bef01d..1f0de944 100644 --- a/models/transformer/transformer.hpp +++ b/models/transformer/transformer.hpp @@ -31,16 +31,10 @@ namespace ann /** Artificial Neural Network. */ { * position-wise feed forward neural network. * @tparam RegularizerType The regularizer type to be applied on layer * parameters. - * @tparam InputDataType Type of the input data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). - * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat, - * arma::sp_mat or arma::cube). */ template < typename ActivationFunction = ReLULayer<>, - typename RegularizerType = NoRegularizer, - typename InputDataType = arma::mat, - typename OutputDataType = arma::mat + typename RegularizerType = NoRegularizer > class Transformer { @@ -60,6 +54,7 @@ class Transformer * @param dropout The dropout rate. * @param attentionMask The attention mask to be applied to the sequences. * @param keyPaddingMask The key padding mask applied to the sequences. + * @param ownMemory Whether to delete pointer-type transformer object. 
*/ Transformer(const size_t numLayers, const size_t tgtSeqLen, @@ -70,42 +65,53 @@ class Transformer const size_t numHeads = 12, const size_t dimFFN = 1024, const double dropout = 0.1, - const InputDataType& attentionMask = InputDataType(), - const InputDataType& keyPaddingMask = InputDataType()); + const arma::mat& attentionMask = arma::mat(), + const arma::mat& keyPaddingMask = arma::mat(), + const bool ownMemory = false); /** - * Get the Transformer Encoder Model. + * Destructor. */ - Sequential* Model() + ~Transformer() { - return transformer; + if (ownMemory) + delete transformer; } /** - * Load the Transformer model from a local directory. - * - * @param filepath The location of the stored model. + * Copy constructor. */ - void LoadModel(const std::string& filepath); + Transformer(const Transformer& /* transformer */) = delete; /** - * Save the Transformer model locally. - * - * @param filepath The location where the model is to be saved. + * Move constructor. */ - void SaveModel(const std::string& filepath); + Transformer(const Transformer&& /* transformer */) = delete; + + /** + * Copy assignment operator. + */ + Transformer& operator = (const Transformer& /* transformer */) = delete; + + /** + * Get the Transformer Encoder Model. + */ + Sequential<>* Model() + { + return transformer; + } //! Get the attention mask. - InputDataType const& AttentionMask() const { return attentionMask; } + arma::mat const& AttentionMask() const { return attentionMask; } //! Modify the attention mask. - InputDataType& AttentionMask() { return attentionMask; } + arma::mat& AttentionMask() { return attentionMask; } //! Get the key padding mask. - InputDataType const& KeyPaddingMask() const { return keyPaddingMask; } + arma::mat const& KeyPaddingMask() const { return keyPaddingMask; } //! Modify the key padding mask. - InputDataType& KeyPaddingMask() { return keyPaddingMask; } + arma::mat& KeyPaddingMask() { return keyPaddingMask; } private: //! Locally-stored number of encoder and decoder layers. @@ -123,7 +129,7 @@ class Transformer //! Locally-stored vocabulary size of the source. size_t srcVocabSize; - //! Locally-stored dimensionality of the model. + //! Locally-stored number of features in the input. size_t dModel; //! Locally-stored number attention heads. @@ -136,13 +142,16 @@ class Transformer double dropout; //! Locally-stored attention mask. - InputDataType attentionMask; + arma::mat attentionMask; //! Locally-stored key padding mask. - InputDataType keyPaddingMask; + arma::mat keyPaddingMask; + + //! Whether to delete the pointer-type transformer object. + bool ownMemory; //! Locally-stored transformer model. - Sequential* transformer; + Sequential<>* transformer; }; // class Transformer } // namespace ann diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 5fdb4940..57d813d5 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -19,10 +19,8 @@ namespace mlpack { namespace ann /** Artificial Neural Network. 
*/ { -template -Transformer::Transformer( +template +Transformer::Transformer( const size_t numLayers, const size_t tgtSeqLen, const size_t srcSeqLen, @@ -32,8 +30,9 @@ OutputDataType>::Transformer( const size_t numHeads, const size_t dimFFN, const double dropout, - const InputDataType& attentionMask, - const InputDataType& keyPaddingMask) : + const arma::mat& attentionMask, + const arma::mat& keyPaddingMask, + const bool ownMemory) : numLayers(numLayers), tgtSeqLen(tgtSeqLen), srcSeqLen(srcSeqLen), @@ -44,11 +43,12 @@ OutputDataType>::Transformer( dimFFN(dimFFN), dropout(dropout), attentionMask(attentionMask), - keyPaddingMask(keyPaddingMask) + keyPaddingMask(keyPaddingMask), + ownMemory(ownMemory) { - transformer = new Sequential<>(); + transformer = new Sequential<>(false); - Sequential<>* encoder = new Sequential<>(); + Sequential<>* encoder = new Sequential<>(false); // Pull out the sequences of source language which is stacked above in the // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. @@ -57,7 +57,7 @@ OutputDataType>::Transformer( encoder->Add>(dModel, srcSeqLen); Sequential<>* encoderStack = mlpack::ann::TransformerEncoder< - ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + ActivationFunction, RegularizerType>( numLayers, srcSeqLen, dModel, @@ -69,7 +69,7 @@ OutputDataType>::Transformer( encoder->Add(encoderStack); - Sequential<>* decoderPE = new Sequential<>(); + Sequential<>* decoderPE = new Sequential<>(false); // Pull out the sequences of target language which is stacked below in the // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last @@ -85,7 +85,7 @@ OutputDataType>::Transformer( transformer->Add(encoderDecoderConcat); Sequential<>* decoderStack = mlpack::ann::TransformerDecoder< - ActivationFunction, RegularizerType, InputDataType, OutputDataType>( + ActivationFunction, RegularizerType>( numLayers, tgtSeqLen, srcSeqLen, @@ -99,25 +99,6 @@ OutputDataType>::Transformer( transformer->Add(decoderStack); } -template -void Transformer::LoadModel(const std::string& filePath) -{ - data::Load(filePath, "Transformer", transformer); - std::cout << "Loaded model" << std::endl; -} - -template -void Transformer::SaveModel(const std::string& filePath) -{ - std::cout << "Saving model" << std::endl; - data::Save(filePath, "Transformer", transformer); - std::cout << "Model saved in " << filePath << std::endl; -} - } // namespace ann } // namespace mlpack From a253a6a9e0c4a11ef4e882e24a9bcb68e6e7faf7 Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Tue, 25 Aug 2020 23:20:03 +0530 Subject: [PATCH 6/8] use mutator method to set mask in mha --- models/transformer/decoder.hpp | 30 ++++++++++++++--------------- models/transformer/encoder.hpp | 17 ++++++++-------- models/transformer/encoder_impl.hpp | 2 +- tests/ffn_model_tests.cpp | 6 +++--- 4 files changed, 28 insertions(+), 27 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index 012d7f3d..d486c6f7 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -147,13 +147,14 @@ class TransformerDecoder // Masked Self attention layer. 
Sequential<>* maskedSelfAttention = new Sequential<>(false); maskedSelfAttention->Add(decoderInput); - maskedSelfAttention->Add>( - tgtSeqLen, - tgtSeqLen, - dModel, - numHeads, - attentionMask); + + MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, + tgtSeqLen, + dModel, + numHeads); + mha1->AttentionMask() = attentionMask; + + maskedSelfAttention->Add(mha1); // Residual connection. AddMerge<>* residualAdd1 = new AddMerge<>(); @@ -179,14 +180,13 @@ class TransformerDecoder // Encoder-decoder attention. Sequential<>* encoderDecoderAttention = new Sequential<>(false); encoderDecoderAttention->Add(encoderDecoderAttentionInput); - encoderDecoderAttention->Add>( - tgtSeqLen, - srcSeqLen, - dModel, - numHeads, - arma::mat(), // No attention mask to encoder-decoder attention. - keyPaddingMask); + + MultiheadAttention<>* mha2 = new MultiheadAttention<>(tgtSeqLen, + srcSeqLen, + dModel, + numHeads); + mha2->KeyPaddingMask() = keyPaddingMask; + encoderDecoderAttention->Add(mha2); // Residual connection. AddMerge<>* residualAdd2 = new AddMerge<>(); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index 50db3b70..a54d98b9 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -95,7 +95,7 @@ class TransformerEncoder /** * Get the Transformer Encoder Model. */ - Sequential* Model() + Sequential<>* Model() { return encoder; } @@ -140,13 +140,14 @@ class TransformerEncoder /* Self attention layer. */ Sequential<>* selfAttn = new Sequential<>(false); selfAttn->Add(input); - selfAttn->Add>( - srcSeqLen, - srcSeqLen, - dModel, - numHeads, - attentionMask, - keyPaddingMask); + + MultiheadAttention<>* mha = new MultiheadAttention<>(srcSeqLen, + srcSeqLen, + dModel, + numHeads); + mha->AttentionMask() = attentionMask; + mha->KeyPaddingMask() = keyPaddingMask; + selfAttn->Add(mha); /* This layer adds a residual connection. 
*/ AddMerge<>* residualAdd = new AddMerge<>(); diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 9da43258..05bfb058 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -40,7 +40,7 @@ TransformerEncoder::TransformerEncoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - encoder = new Sequential(false); + encoder = new Sequential<>(false); for (size_t n = 0; n < numLayers; ++n) { diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp index 3f4601fd..f2c529d9 100644 --- a/tests/ffn_model_tests.cpp +++ b/tests/ffn_model_tests.cpp @@ -68,7 +68,7 @@ BOOST_AUTO_TEST_CASE(TransformerEncoderTest) mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(encoder.Model()); model.Add>(dModel * srcSeqLen, vocabSize); @@ -103,7 +103,7 @@ BOOST_AUTO_TEST_CASE(TransformerDecoderTest) mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(decoder.Model()); model.Add>(dModel * tgtSeqLen, vocabSize); @@ -148,7 +148,7 @@ BOOST_AUTO_TEST_CASE(TransformerTest) mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen, tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout); - FFN<> model; + FFN, XavierInitialization> model; model.Add(transformer.Model()); model.Add>(dModel * tgtSeqLen, tgtVocabSize); From fbdd4ff2a028cad8c75673ff81f84546f8f2d9ea Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Wed, 26 Aug 2020 22:01:58 +0530 Subject: [PATCH 7/8] set model = true --- models/transformer/decoder.hpp | 32 ++++++++++++------------- models/transformer/decoder_impl.hpp | 6 ++--- models/transformer/encoder.hpp | 10 ++++---- models/transformer/encoder_impl.hpp | 2 +- models/transformer/transformer_impl.hpp | 8 +++---- 5 files changed, 29 insertions(+), 29 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index d486c6f7..f8686d4c 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -134,18 +134,18 @@ class TransformerDecoder */ Sequential<>* AttentionBlock() { - Sequential<>* decoderBlockBottom = new Sequential<>(false); + Sequential<>* decoderBlockBottom = new Sequential<>(); decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); // Broadcast the incoming input to decoder // i.e. query into (query, key, value). - Concat<>* decoderInput = new Concat<>(); + Concat<>* decoderInput = new Concat<>(true); decoderInput->Add>(); decoderInput->Add>(); decoderInput->Add>(); // Masked Self attention layer. - Sequential<>* maskedSelfAttention = new Sequential<>(false); + Sequential<>* maskedSelfAttention = new Sequential<>(); maskedSelfAttention->Add(decoderInput); MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, @@ -157,7 +157,7 @@ class TransformerDecoder maskedSelfAttention->Add(mha1); // Residual connection. - AddMerge<>* residualAdd1 = new AddMerge<>(); + AddMerge<>* residualAdd1 = new AddMerge<>(true); residualAdd1->Add(maskedSelfAttention); residualAdd1->Add>(); @@ -167,19 +167,19 @@ class TransformerDecoder decoderBlockBottom->Add>(dModel * tgtSeqLen); // This layer broadcasts the output of encoder i.e. key into (key, value). 
- Concat<>* broadcastEncoderOutput = new Concat<>(); + Concat<>* broadcastEncoderOutput = new Concat<>(true); broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1); // This layer concatenates the output of the bottom decoder block (query) // and the output of the encoder (key, value). - Concat<>* encoderDecoderAttentionInput = new Concat<>(); - encoderDecoderAttentionInput->Add(decoderBlockBottom); - encoderDecoderAttentionInput->Add(broadcastEncoderOutput); + Concat<>* encDecAttnInput = new Concat<>(true); + encDecAttnInput->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1); + encDecAttnInput->Add(broadcastEncoderOutput); // Encoder-decoder attention. - Sequential<>* encoderDecoderAttention = new Sequential<>(false); - encoderDecoderAttention->Add(encoderDecoderAttentionInput); + Sequential<>* encoderDecoderAttention = new Sequential<>(); + encoderDecoderAttention->Add(encDecAttnInput); MultiheadAttention<>* mha2 = new MultiheadAttention<>(tgtSeqLen, srcSeqLen, @@ -189,11 +189,11 @@ class TransformerDecoder encoderDecoderAttention->Add(mha2); // Residual connection. - AddMerge<>* residualAdd2 = new AddMerge<>(); + AddMerge<>* residualAdd2 = new AddMerge<>(true); residualAdd2->Add(encoderDecoderAttention); - residualAdd2->Add>(); + residualAdd2->Add(decoderBlockBottom); - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(residualAdd2); decoderBlock->Add>(dModel * tgtSeqLen); return decoderBlock; @@ -204,18 +204,18 @@ class TransformerDecoder */ Sequential<>* PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(false); + Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); positionWiseFFN->Add>(dropout); /* Residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(residualAdd); return decoderBlock; } diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp index a52e0257..d77878ba 100644 --- a/models/transformer/decoder_impl.hpp +++ b/models/transformer/decoder_impl.hpp @@ -55,7 +55,7 @@ TransformerDecoder::TransformerDecoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - decoder = new Sequential<>(false); + decoder = new Sequential<>(); for (size_t n = 0; n < numLayers; ++n) { @@ -66,11 +66,11 @@ TransformerDecoder::TransformerDecoder( break; } - Sequential<>* decoderBlock = new Sequential<>(false); + Sequential<>* decoderBlock = new Sequential<>(); decoderBlock->Add(AttentionBlock()); decoderBlock->Add(PositionWiseFFNBlock()); - Concat<>* concatQueryKey = new Concat<>(); + Concat<>* concatQueryKey = new Concat<>(true); concatQueryKey->Add(decoderBlock); concatQueryKey->Add>(1, dModel * tgtSeqLen, -1, 0, -1); diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp index a54d98b9..aace4131 100644 --- a/models/transformer/encoder.hpp +++ b/models/transformer/encoder.hpp @@ -132,13 +132,13 @@ class TransformerEncoder */ void AttentionBlock() { - Concat<>* input = new Concat<>(); + Concat<>* input = new Concat<>(true); input->Add>(); input->Add>(); input->Add>(); /* Self attention layer. 
*/ - Sequential<>* selfAttn = new Sequential<>(false); + Sequential<>* selfAttn = new Sequential<>(); selfAttn->Add(input); MultiheadAttention<>* mha = new MultiheadAttention<>(srcSeqLen, @@ -150,7 +150,7 @@ class TransformerEncoder selfAttn->Add(mha); /* This layer adds a residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(selfAttn); residualAdd->Add>(); @@ -163,14 +163,14 @@ class TransformerEncoder */ void PositionWiseFFNBlock() { - Sequential<>* positionWiseFFN = new Sequential<>(false); + Sequential<>* positionWiseFFN = new Sequential<>(); positionWiseFFN->Add>(dModel, dimFFN); positionWiseFFN->Add(); positionWiseFFN->Add>(dimFFN, dModel); positionWiseFFN->Add>(dropout); /* This layer adds a residual connection. */ - AddMerge<>* residualAdd = new AddMerge<>(); + AddMerge<>* residualAdd = new AddMerge<>(true); residualAdd->Add(positionWiseFFN); residualAdd->Add>(); diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp index 05bfb058..16075c60 100644 --- a/models/transformer/encoder_impl.hpp +++ b/models/transformer/encoder_impl.hpp @@ -40,7 +40,7 @@ TransformerEncoder::TransformerEncoder( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - encoder = new Sequential<>(false); + encoder = new Sequential<>(); for (size_t n = 0; n < numLayers; ++n) { diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp index 57d813d5..7ed8c707 100644 --- a/models/transformer/transformer_impl.hpp +++ b/models/transformer/transformer_impl.hpp @@ -46,9 +46,9 @@ Transformer::Transformer( keyPaddingMask(keyPaddingMask), ownMemory(ownMemory) { - transformer = new Sequential<>(false); + transformer = new Sequential<>(); - Sequential<>* encoder = new Sequential<>(false); + Sequential<>* encoder = new Sequential<>(); // Pull out the sequences of source language which is stacked above in the // input matrix. Here 'lastCol = -1' denotes upto last batch of input matrix. @@ -69,7 +69,7 @@ Transformer::Transformer( encoder->Add(encoderStack); - Sequential<>* decoderPE = new Sequential<>(false); + Sequential<>* decoderPE = new Sequential<>(); // Pull out the sequences of target language which is stacked below in the // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes upto last @@ -78,7 +78,7 @@ Transformer::Transformer( decoderPE->Add>(tgtVocabSize, dModel); decoderPE->Add>(dModel, tgtSeqLen); - Concat<>* encoderDecoderConcat = new Concat<>(); + Concat<>* encoderDecoderConcat = new Concat<>(true); encoderDecoderConcat->Add(encoder); encoderDecoderConcat->Add(decoderPE); From 368f82e94027b5049da3baa71957f0a9f323aded Mon Sep 17 00:00:00 2001 From: Mrityunjay Tripathi Date: Sat, 10 Oct 2020 08:50:57 +0530 Subject: [PATCH 8/8] added suggestions --- models/transformer/decoder.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp index f8686d4c..5c17d288 100644 --- a/models/transformer/decoder.hpp +++ b/models/transformer/decoder.hpp @@ -52,7 +52,6 @@ class TransformerDecoder * @param numLayers The number of decoder blocks. * @param tgtSeqLen Target Sequence Length. * @param srcSeqLen Source Sequence Length. - * @param memoryModule The last Encoder module. * @param dModel The number of features in the input. Also, same as the * `embedDim` in `MultiheadAttention` layer. * @param numHeads The number of attention heads. 
@@ -90,13 +89,18 @@ class TransformerDecoder /** * Move constructor. */ - TransformerDecoder(const TransformerDecoder&& ) = delete; + TransformerDecoder(TransformerDecoder&& ) = delete; /** * Copy assignment operator. */ TransformerDecoder& operator = (const TransformerDecoder& ) = delete; + /** + * Move assignment operator. + */ + TransformerDecoder& operator = (TransformerDecoder&& ) = delete; + /** * Get the Transformer Decoder model. */ @@ -149,9 +153,9 @@ class TransformerDecoder maskedSelfAttention->Add(decoderInput); MultiheadAttention<>* mha1 = new MultiheadAttention<>(tgtSeqLen, - tgtSeqLen, - dModel, - numHeads); + tgtSeqLen, + dModel, + numHeads); mha1->AttentionMask() = attentionMask; maskedSelfAttention->Add(mha1);
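For reference, a minimal usage sketch of the encoder stack, modelled on the tests added in this patch series (ffn_model_tests.cpp). The include paths, the NegativeLogLikelihood<> loss, and the sizes below are illustrative assumptions, since the template arguments in the quoted test hunks are elided; this is a sketch, not part of the patch.

#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <mlpack/methods/ann/init_rules/glorot_init.hpp>
#include <mlpack/methods/ann/loss_functions/negative_log_likelihood.hpp>
#include "models/transformer/encoder.hpp"

using namespace mlpack::ann;

int main()
{
  // Illustrative sizes only.
  const size_t numLayers = 2, srcSeqLen = 10, dModel = 16, numHeads = 2;
  const size_t dimFFN = 64, vocabSize = 20;
  const double dropout = 0.1;

  // Build the encoder stack.  ownMemory stays false (the default) because the
  // Sequential<>* returned by Model() is handed to the FFN below, which is
  // expected to delete its layers itself.
  TransformerEncoder<> encoder(numLayers, srcSeqLen, dModel, numHeads,
                               dimFFN, dropout);

  // Wrap the encoder in an FFN and add a linear head, as the tests do.
  FFN<NegativeLogLikelihood<>, XavierInitialization> model;
  model.Add(encoder.Model());
  model.Add<Linear<>>(dModel * srcSeqLen, vocabSize);

  // model.Train(...) / model.Predict(...) can then be used as with any FFN;
  // each input column is one sequence flattened to dModel * srcSeqLen values,
  // which is the layout the LayerNorm and Linear sizes in the blocks assume.
  return 0;
}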
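The attention and key padding masks are plain arma::mat objects that the encoder and decoder copy into their MultiheadAttention blocks at construction time (mha1->AttentionMask() = attentionMask above), so a mask intended to take effect must be supplied through the constructor rather than assigned after construction. Below is a sketch of a causal mask for the decoder; it assumes MultiheadAttention applies the mask additively to the raw attention scores, so blocked positions carry a large negative value. Verify the exact convention against the MultiheadAttention documentation before relying on it.

#include <mlpack/core.hpp>
#include "models/transformer/decoder.hpp"

using namespace mlpack::ann;

void BuildMaskedDecoder()
{
  // Illustrative sizes only.
  const size_t numLayers = 2, tgtSeqLen = 8, srcSeqLen = 10, dModel = 16,
               numHeads = 2, dimFFN = 64;
  const double dropout = 0.1;

  // Causal mask: 0 where attention is allowed, a large negative value where
  // position i would otherwise attend to a future position j.
  arma::mat attnMask(tgtSeqLen, tgtSeqLen, arma::fill::zeros);
  for (size_t i = 0; i < tgtSeqLen; ++i)
    for (size_t j = i + 1; j < tgtSeqLen; ++j)
      attnMask(i, j) = -1e9;

  // The constructor copies the mask into each masked self-attention block.
  TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen, dModel,
                               numHeads, dimFFN, dropout, attnMask);

  // decoder.Model() can now be added to an FFN<>, exactly as in the tests.
}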