diff --git a/.gitignore b/.gitignore
index 4708e5bb..92a9136d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 build*
 xcode*
+.vscode/
 .DS_Store
 .idea
 cmake-build-*
diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt
index c4bd5a8c..77aa6f85 100644
--- a/models/CMakeLists.txt
+++ b/models/CMakeLists.txt
@@ -1,7 +1,15 @@
 cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
 project(models)
 
-add_subdirectory(darknet)
+# Recurse into each model mlpack provides.
+set(DIRS
+  darknet
+  transformer
+)
+
+foreach(dir ${DIRS})
+  add_subdirectory(${dir})
+endforeach()
 
 # Add directory name to sources.
 set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
diff --git a/models/models.hpp b/models/models.hpp
new file mode 100644
index 00000000..6e620e45
--- /dev/null
+++ b/models/models.hpp
@@ -0,0 +1,9 @@
+/**
+ * @file models.hpp
+ * @author Mrityunjay Tripathi
+ *
+ * Convenience header that includes the Transformer encoder and decoder models.
+ */
+
+#include "transformer/encoder.hpp"
+#include "transformer/decoder.hpp"
diff --git a/models/transformer/CMakeLists.txt b/models/transformer/CMakeLists.txt
new file mode 100644
index 00000000..288262a4
--- /dev/null
+++ b/models/transformer/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
+project(transformer)
+
+set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../")
+
+set(SOURCES
+  decoder.hpp
+  decoder_impl.hpp
+  encoder.hpp
+  encoder_impl.hpp
+  transformer.hpp
+  transformer_impl.hpp
+)
+
+foreach(file ${SOURCES})
+  set(DIR_SRCS ${DIR_SRCS} ${CMAKE_CURRENT_SOURCE_DIR}/${file})
+endforeach()
+
+set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE)
diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp
new file mode 100644
index 00000000..c0dabb5a
--- /dev/null
+++ b/models/transformer/decoder.hpp
@@ -0,0 +1,230 @@
+/**
+ * @file models/transformer/decoder.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer Decoder layer.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_DECODER_HPP
+#define MODELS_TRANSFORMER_DECODER_HPP
+
+#include
+#include
+#include
+#include
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * In addition to the two sub-layers in each encoder layer, the decoder inserts
+ * a third sub-layer, which performs multi-head attention over the output of the
+ * encoder stack. Similar to the encoder, we employ residual connections around
+ * each of the sub-layers, followed by layer normalization. We also modify the
+ * self-attention sub-layer in the decoder stack to prevent positions from
+ * attending to subsequent positions. This masking, combined with fact that the
+ * output embeddings are offset by one position, ensures that the predictions
+ * for position i can depend only on the known outputs at positions less than i.
+ *
+ * The network built here expects the query (target) rows stacked above the
+ * encoder output rows in a single input matrix: the first 'dModel * tgtSeqLen'
+ * rows are read as the query and the remaining rows as the key/value source.
+ *
+ * @tparam ActivationFunction The type of the activation function to be used in
+ *     the position-wise feed forward neural network.
+ * @tparam RegularizerType The type of regularizer to be applied to layer
+ *     parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class TransformerDecoder
+{
+ public:
+  /**
+   * Create an empty TransformerDecoder object. All sizes are set to zero, no
+   * layers are built and the decoder network pointer is null.
+   */
+  TransformerDecoder();
+
+  /**
+   * Create the TransformerDecoder object using the specified parameters.
+   *
+   * @param numLayers The number of decoder blocks.
+   * @param tgtSeqLen Target Sequence Length.
+   * @param srcSeqLen Source Sequence Length.
+   * @param dModel The number of features in the input. Also, same as the
+   *     'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask used to black-out future sequences.
+   * @param keyPaddingMask The padding mask used to black-out particular token.
+   */
+  TransformerDecoder(const size_t numLayers,
+                     const size_t tgtSeqLen,
+                     const size_t srcSeqLen,
+                     const size_t dModel = 512,
+                     const size_t numHeads = 8,
+                     const size_t dimFFN = 1024,
+                     const double dropout = 0.1,
+                     const InputDataType& attentionMask = InputDataType(),
+                     const InputDataType& keyPaddingMask = InputDataType());
+
+  /**
+   * Get the Transformer Decoder model.
+   */
+  Sequential<>* Model() { return decoder; }
+
+  /**
+   * Load the network from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the network locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the key matrix, the output of the Transformer Encoder.
+  //! NOTE(review): nothing in this class reads 'key' when building the
+  //! network -- confirm the accessor is still needed.
+  InputDataType const& Key() const { return key; }
+
+  //! Modify the key matrix.
+  InputDataType& Key() { return key; }
+
+ private:
+  /**
+   * This method adds the attention block to the decoder: masked
+   * self-attention over the query, followed by encoder-decoder attention
+   * over the encoder output.
+   */
+  void AttentionBlock()
+  {
+    // The query occupies the first 'dModel * tgtSeqLen' rows of the input.
+    Sequential<>* decoderBlockBottom = new Sequential<>();
+    decoderBlockBottom->Add>(1, 0, dModel * tgtSeqLen - 1, 0, -1);
+
+    // Broadcast the incoming input to decoder
+    // i.e. query into (query, key, value).
+    Concat<>* decoderInput = new Concat<>();
+    decoderInput->Add>();
+    decoderInput->Add>();
+    decoderInput->Add>();
+
+    // Masked Self attention layer.
+    Sequential<>* maskedSelfAttention = new Sequential<>();
+    maskedSelfAttention->Add(decoderInput);
+    maskedSelfAttention->Add>(
+        tgtSeqLen,
+        tgtSeqLen,
+        dModel,
+        numHeads,
+        attentionMask
+    );
+
+    // Residual connection around the masked self-attention.
+    AddMerge<>* selfAttentionResidual = new AddMerge<>();
+    selfAttentionResidual->Add(maskedSelfAttention);
+    selfAttentionResidual->Add>();
+
+    decoderBlockBottom->Add(selfAttentionResidual);
+
+    // Add the LayerNorm layer with required parameters.
+    decoderBlockBottom->Add>(dModel * tgtSeqLen);
+
+    // This layer broadcasts the output of encoder i.e. key into (key, value).
+    Concat<>* broadcastEncoderOutput = new Concat<>();
+    broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1);
+    broadcastEncoderOutput->Add>(1, dModel * tgtSeqLen, -1, 0, -1);
+
+    // This layer concatenates the output of the bottom decoder block (query)
+    // and the output of the encoder (key, value).
+    Concat<>* encoderDecoderAttentionInput = new Concat<>();
+    encoderDecoderAttentionInput->Add(decoderBlockBottom);
+    encoderDecoderAttentionInput->Add(broadcastEncoderOutput);
+
+    // Encoder-decoder attention.
+    Sequential<>* encoderDecoderAttention = new Sequential<>();
+    encoderDecoderAttention->Add(encoderDecoderAttentionInput);
+    encoderDecoderAttention->Add>(
+        tgtSeqLen,
+        srcSeqLen,
+        dModel,
+        numHeads,
+        InputDataType(), // No attention mask to encoder-decoder attention.
+        keyPaddingMask);
+
+    // Residual connection around the encoder-decoder attention.
+    // NOTE(review): this block is added to 'decoder' directly, so its second
+    // branch receives the whole stacked (query, encoder output) input --
+    // confirm both branches produce outputs of equal size, and that the
+    // encoder rows are still available to the next stacked layer.
+    AddMerge<>* crossAttentionResidual = new AddMerge<>();
+    crossAttentionResidual->Add(encoderDecoderAttention);
+    crossAttentionResidual->Add>();
+
+    decoder->Add(crossAttentionResidual);
+    decoder->Add>(dModel * tgtSeqLen);
+  }
+
+  /**
+   * This method adds the position-wise feed forward network to the decoder.
+   */
+  void PositionWiseFFNBlock()
+  {
+    Sequential<>* positionWiseFFN = new Sequential<>();
+    positionWiseFFN->Add>(dModel, dimFFN);
+    positionWiseFFN->Add();
+    positionWiseFFN->Add>(dimFFN, dModel);
+    positionWiseFFN->Add>(dropout);
+
+    /* Residual connection. */
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(positionWiseFFN);
+    residualAdd->Add>();
+    decoder->Add(residualAdd);
+  }
+
+  //! Locally-stored number of decoder layers.
+  size_t numLayers;
+
+  //! Locally-stored target sequence length.
+  size_t tgtSeqLen;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored key matrix exposed through Key().
+  InputDataType key;
+
+  //! Locally-stored complete decoder network.
+  Sequential<>* decoder;
+
+}; // class TransformerDecoder
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "decoder_impl.hpp"
+
+#endif
diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp
new file mode 100644
index 00000000..0fb8c89c
--- /dev/null
+++ b/models/transformer/decoder_impl.hpp
@@ -0,0 +1,91 @@
+/**
+ * @file models/transformer/decoder_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer Decoder class.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_DECODER_IMPL_HPP
+#define MODELS_TRANSFORMER_DECODER_IMPL_HPP
+
+#include "decoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+// Default constructor: every declared size member is zeroed and the network
+// pointer is null, so no member is left uninitialized.
+template
+TransformerDecoder::TransformerDecoder() :
+    numLayers(0),
+    tgtSeqLen(0),
+    srcSeqLen(0),
+    dModel(0),
+    numHeads(0),
+    dimFFN(0),
+    dropout(0),
+    decoder(NULL)
+{
+  // Nothing to do here.
+}
+
+template
+TransformerDecoder::TransformerDecoder(
+    const size_t numLayers,
+    const size_t tgtSeqLen,
+    const size_t srcSeqLen,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    tgtSeqLen(tgtSeqLen),
+    srcSeqLen(srcSeqLen),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask)
+{
+  decoder = new Sequential<>();
+
+  // Each decoder layer is an attention block followed by a feed forward block.
+  for (size_t N = 0; N < numLayers; ++N)
+  {
+    AttentionBlock();
+    PositionWiseFFNBlock();
+  }
+}
+
+template
+void TransformerDecoder::LoadModel(const std::string& filepath)
+{
+  data::Load(filepath, "TransformerDecoder", decoder);
+  std::cout << "Loaded model" << std::endl;
+}
+
+template
+void TransformerDecoder::SaveModel(const std::string& filepath)
+{
+  std::cout << "Saving model" << std::endl;
+  data::Save(filepath, "TransformerDecoder", decoder);
+  std::cout << "Model saved in " << filepath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp
new file mode 100644
index 00000000..ca38abff
--- /dev/null
+++ b/models/transformer/encoder.hpp
@@ -0,0 +1,191 @@
+/**
+ * @file models/transformer/encoder.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer Encoder layer.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_ENCODER_HPP
+#define MODELS_TRANSFORMER_ENCODER_HPP
+
+#include
+#include
+#include
+#include
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * The Transformer Encoder layer has two sub-layers. The first is a multi-head
+ * self-attention mechanism, and the second is a simple, position-wise
+ * fully connected feed-forward network. We employ a residual connection around
+ * each of the two sub-layers, followed by layer normalization. Hence the output
+ * of each sub-layer is 'LayerNorm(x + Sublayer(x))', where 'Sublayer(x)' is the
+ * function implemented by the sub-layer itself. To facilitate these residual
+ * connections, all sub-layers in the model, as well as the embedding layers,
+ * produce outputs of dimension 'dModel'.
+ *
+ * @tparam ActivationFunction The type of activation function to be used in the
+ *     position-wise feed forward neural network.
+ * @tparam RegularizerType The regularizer type to be applied on layer
+ *     parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class TransformerEncoder
+{
+ public:
+  /**
+   * Create the TransformerEncoder object using the specified parameters.
+   *
+   * @param numLayers The number of encoder blocks.
+   * @param srcSeqLen Source Sequence Length.
+   * @param dModel The number of features in the input. Also, same as the
+   *     'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask to be applied to the sequences.
+   * @param keyPaddingMask The key padding mask applied to the sequences.
+   */
+  TransformerEncoder(const size_t numLayers,
+                     const size_t srcSeqLen,
+                     const size_t dModel = 512,
+                     const size_t numHeads = 2,
+                     const size_t dimFFN = 1024,
+                     const double dropout = 0.1,
+                     const InputDataType& attentionMask = InputDataType(),
+                     const InputDataType& keyPaddingMask = InputDataType());
+
+  /**
+   * Get the Transformer Encoder Model.
+   */
+  Sequential<>* Model()
+  {
+    return encoder;
+  }
+
+  /**
+   * Load the encoder block from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the encoder block locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the attention mask.
+  InputDataType const& AttentionMask() const { return attentionMask; }
+
+  //! Modify the attention mask.
+  InputDataType& AttentionMask() { return attentionMask; }
+
+  //! Get the key padding mask.
+  InputDataType const& KeyPaddingMask() const { return keyPaddingMask; }
+
+  //! Modify the key padding mask.
+  InputDataType& KeyPaddingMask() { return keyPaddingMask; }
+
+ private:
+  /**
+   * The method adds attention block to the encoder block.
+   */
+  void AttentionBlock()
+  {
+    // Feed the same input three times into the attention layer.
+    Concat<>* input = new Concat<>();
+    input->Add>();
+    input->Add>();
+    input->Add>();
+
+    /* Self attention layer. */
+    // NOTE(review): 'attentionMask' and 'keyPaddingMask' are stored by the
+    // constructor but are not passed to the attention layer here -- confirm
+    // whether that is intended.
+    Sequential<>* selfAttention = new Sequential<>();
+    selfAttention->Add(input);
+    selfAttention->Add
+    >(srcSeqLen, srcSeqLen, dModel, numHeads);
+
+    /* This layer adds a residual connection. */
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(selfAttention);
+    residualAdd->Add>();
+
+    encoder->Add(residualAdd);
+    encoder->Add(dModel * srcSeqLen);
+  }
+
+  /**
+   * This method adds position-wise feed forward block to the encoder.
+   */
+  void PositionWiseFFNBlock()
+  {
+    Sequential<>* positionWiseFFN = new Sequential<>();
+    positionWiseFFN->Add>(dModel, dimFFN);
+    positionWiseFFN->Add();
+    positionWiseFFN->Add>(dimFFN, dModel);
+    positionWiseFFN->Add>(dropout);
+
+    /* This layer adds a residual connection. */
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(positionWiseFFN);
+    residualAdd->Add>();
+
+    encoder->Add(residualAdd);
+    encoder->Add>(dModel * srcSeqLen);
+  }
+
+  //! Locally-stored number of encoder blocks.
+  size_t numLayers;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored encoder block.
+  Sequential<>* encoder;
+
+}; // class TransformerEncoder
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "encoder_impl.hpp"
+
+#endif
diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp
new file mode 100644
index 00000000..592bae33
--- /dev/null
+++ b/models/transformer/encoder_impl.hpp
@@ -0,0 +1,76 @@
+/**
+ * @file models/transformer/encoder_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer Encoder class.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_ENCODER_IMPL_HPP
+#define MODELS_TRANSFORMER_ENCODER_IMPL_HPP
+
+#include "encoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+template
+TransformerEncoder::TransformerEncoder(
+    const size_t numLayers,
+    const size_t srcSeqLen,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    srcSeqLen(srcSeqLen),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask)
+{
+  encoder = new Sequential<>();
+
+  encoder->Add>();
+
+  // Each encoder layer is an attention block followed by a feed forward block.
+  for (size_t N = 0; N < numLayers; ++N)
+  {
+    AttentionBlock();
+    PositionWiseFFNBlock();
+  }
+}
+
+template
+void TransformerEncoder::LoadModel(const std::string& filePath)
+{
+  data::Load(filePath, "TransformerEncoder", encoder);
+  std::cout << "Loaded model" << std::endl;
+}
+
+template
+void TransformerEncoder::SaveModel(const std::string& filePath)
+{
+  std::cout << "Saving model" << std::endl;
+  data::Save(filePath, "TransformerEncoder", encoder);
+  std::cout << "Model saved in " << filePath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp
new file mode 100644
index 00000000..8cc1a94f
--- /dev/null
+++ b/models/transformer/transformer.hpp
@@ -0,0 +1,156 @@
+/**
+ * @file models/transformer/transformer.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer model.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_TRANSFORMER_HPP
+#define MODELS_TRANSFORMER_TRANSFORMER_HPP
+
+#include
+#include
+#include
+#include
+#include
+
+#include "encoder.hpp"
+#include "decoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * The complete Transformer model. The input matrix stacks the source sequence
+ * above the target sequence: the source rows are fed through a
+ * TransformerEncoder stack, the result is concatenated with the processed
+ * target rows, and the concatenation is passed to a TransformerDecoder stack.
+ *
+ * @tparam ActivationFunction The type of activation function to be used in the
+ *     position-wise feed forward neural network.
+ * @tparam RegularizerType The regularizer type to be applied on layer
+ *     parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *     arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class Transformer
+{
+ public:
+  /**
+   * Create the Transformer object using the specified parameters.
+   *
+   * @param numLayers The number of encoder and decoder layers.
+   * @param tgtSeqLen Target Sequence Length.
+   * @param srcSeqLen Source Sequence Length.
+   * @param tgtVocabSize Target vocabulary size.
+   * @param srcVocabSize Source vocabulary size.
+   * @param dModel The number of features in the input. Also, same as the
+   *     'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads. NOTE(review): the default
+   *     of 12 does not divide the default 'dModel' of 512 -- confirm that
+   *     'MultiheadAttention' accepts this combination.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask to be applied to the sequences.
+   * @param keyPaddingMask The key padding mask applied to the sequences.
+   */
+  Transformer(const size_t numLayers,
+              const size_t tgtSeqLen,
+              const size_t srcSeqLen,
+              const size_t tgtVocabSize,
+              const size_t srcVocabSize,
+              const size_t dModel = 512,
+              const size_t numHeads = 12,
+              const size_t dimFFN = 1024,
+              const double dropout = 0.1,
+              const InputDataType& attentionMask = InputDataType(),
+              const InputDataType& keyPaddingMask = InputDataType());
+
+  /**
+   * Get the Transformer model.
+   */
+  Sequential<>* Model()
+  {
+    return transformer;
+  }
+
+  /**
+   * Load the Transformer model from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the Transformer model locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the attention mask.
+  InputDataType const& AttentionMask() const { return attentionMask; }
+
+  //! Modify the attention mask.
+  InputDataType& AttentionMask() { return attentionMask; }
+
+  //! Get the key padding mask.
+  InputDataType const& KeyPaddingMask() const { return keyPaddingMask; }
+
+  //! Modify the key padding mask.
+  InputDataType& KeyPaddingMask() { return keyPaddingMask; }
+
+ private:
+
+  //! Locally-stored number of encoder and decoder layers.
+  size_t numLayers;
+
+  //! Locally-stored target sequence length.
+  size_t tgtSeqLen;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored vocabulary size of the target.
+  size_t tgtVocabSize;
+
+  //! Locally-stored vocabulary size of the source.
+  size_t srcVocabSize;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored transformer model.
+  Sequential<>* transformer;
+
+}; // class Transformer
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "transformer_impl.hpp"
+
+#endif
diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp
new file mode 100644
index 00000000..ff48e859
--- /dev/null
+++ b/models/transformer/transformer_impl.hpp
@@ -0,0 +1,126 @@
+/**
+ * @file models/transformer/transformer_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer model.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license. You should have received a copy of the
+ * 3-clause BSD license along with mlpack. If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP
+#define MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP
+
+#include "transformer.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+template
+Transformer::Transformer(
+    const size_t numLayers,
+    const size_t tgtSeqLen,
+    const size_t srcSeqLen,
+    const size_t tgtVocabSize,
+    const size_t srcVocabSize,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    tgtSeqLen(tgtSeqLen),
+    srcSeqLen(srcSeqLen),
+    tgtVocabSize(tgtVocabSize),
+    srcVocabSize(srcVocabSize),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask)
+{
+  transformer = new Sequential<>();
+
+  Sequential<>* encoder = new Sequential<>();
+
+  // Pull out the sequences of source language which is stacked above in the
+  // input matrix. Here 'lastCol = -1' denotes up to last batch of input matrix.
+  encoder->Add>(1, 0, srcSeqLen - 1, 0, -1);
+  encoder->Add>(srcVocabSize, dModel);
+  encoder->Add>(dModel, srcSeqLen);
+
+  // Build the encoder stack with the same template arguments as this class so
+  // that the mask types agree. Only the heap-allocated network returned by
+  // Model() is kept; the temporary wrapper object is discarded.
+  Sequential<>* encoderStack = mlpack::ann::TransformerEncoder<
+      ActivationFunction, RegularizerType, InputDataType, OutputDataType>(
+      numLayers,
+      srcSeqLen,
+      dModel,
+      numHeads,
+      dimFFN,
+      dropout,
+      attentionMask,
+      keyPaddingMask
+  ).Model();
+
+  encoder->Add(encoderStack);
+
+  Sequential<>* decoderPE = new Sequential<>();
+
+  // Pull out the sequences of target language which is stacked below in the
+  // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes up to last
+  // row and last batch of the input matrix respectively.
+  decoderPE->Add>(1, srcSeqLen, -1, 0, -1);
+  decoderPE->Add>(tgtVocabSize, dModel);
+  decoderPE->Add>(dModel, tgtSeqLen);
+
+  // NOTE(review): TransformerDecoder::AttentionBlock() reads the query from
+  // the first 'dModel * tgtSeqLen' rows of its input and the encoder output
+  // from the rows after that -- confirm the order in which the two branches
+  // are concatenated here matches that layout.
+  Concat<>* encoderDecoderConcat = new Concat<>();
+  encoderDecoderConcat->Add(encoder);
+  encoderDecoderConcat->Add(decoderPE);
+
+  Sequential<>* decoder = new Sequential<>();
+  decoder->Add(encoderDecoderConcat);
+
+  Sequential<>* decoderStack = mlpack::ann::TransformerDecoder<
+      ActivationFunction, RegularizerType, InputDataType, OutputDataType>(
+      numLayers,
+      tgtSeqLen,
+      srcSeqLen,
+      dModel,
+      numHeads,
+      dimFFN,
+      dropout,
+      attentionMask,
+      keyPaddingMask
+  ).Model();
+
+  decoder->Add(decoderStack);
+  transformer->Add(decoder);
+}
+
+template
+void Transformer::LoadModel(const std::string& filePath)
+{
+  data::Load(filePath, "Transformer", transformer);
+  std::cout << "Loaded model" << std::endl;
+}
+
+template
+void Transformer::SaveModel(const std::string& filePath)
+{
+  std::cout << "Saving model" << std::endl;
+  data::Save(filePath, "Transformer", transformer);
+  std::cout << "Model saved in " << filePath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp
index 84e42eb7..5efa9df8 100644
--- a/tests/ffn_model_tests.cpp
+++ b/tests/ffn_model_tests.cpp
@@ -14,6 +14,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 
 // Use namespaces for convenience.
@@ -42,4 +45,115 @@ BOOST_AUTO_TEST_CASE(DarknetModelTest)
   BOOST_REQUIRE_EQUAL(output.n_rows, 1000);
 }
 
+/**
+ * Simple Transformer Encoder test: build a small encoder, append an output
+ * layer, run a single prediction and check the shape of the result.
+ */
+BOOST_AUTO_TEST_CASE(TransformerEncoderTest)
+{
+  const size_t vocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  // One input column holding dModel values for each of the srcSeqLen positions.
+  arma::mat input = arma::randu(dModel * srcSeqLen, 1);
+  arma::mat output;
+
+  mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen,
+      dModel, numHeads, dimFFN, dropout);
+
+  FFN model;
+
+  model.Add(encoder.Model());
+  model.Add>(dModel * srcSeqLen, vocabSize);
+  model.Add>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
+/**
+ * Simple Transformer Decoder test: build a small decoder, feed it a query
+ * stacked above a memory matrix and check the shape of the prediction.
+ */
+BOOST_AUTO_TEST_CASE(TransformerDecoderTest)
+{
+  const size_t vocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t tgtSeqLen = 10;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  arma::mat query = arma::randu(dModel * tgtSeqLen, 1);
+  arma::mat memory = 0.73 * arma::randu(dModel * srcSeqLen, 1);
+
+  // The decoder input is the query rows followed by the memory rows.
+  arma::mat input = arma::join_cols(query, memory);
+  arma::mat output;
+
+  mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen,
+      dModel, numHeads, dimFFN, dropout);
+
+  FFN model;
+
+  model.Add(decoder.Model());
+  model.Add>(dModel * tgtSeqLen, vocabSize);
+  model.Add>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
+/**
+ * Transformer Model test.
+ */
+BOOST_AUTO_TEST_CASE(TransformerTest)
+{
+  const size_t tgtVocabSize = 20;
+  const size_t srcVocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t tgtSeqLen = 10;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  arma::mat srcLanguage(srcSeqLen, 1), tgtLanguage(tgtSeqLen, 1);
+
+  // Fill the source sequence with random token indices.
+  for (size_t t = 0; t < srcSeqLen; ++t)
+  {
+    srcLanguage(t) = mlpack::math::RandInt(1, srcVocabSize);
+  }
+
+  // Fill the target sequence; the element being written is the target matrix,
+  // not the (constant) sequence length.
+  for (size_t t = 0; t < tgtSeqLen; ++t)
+  {
+    tgtLanguage(t) = mlpack::math::RandInt(1, tgtVocabSize);
+  }
+
+  // The model input stacks the source sequence above the target sequence.
+  arma::mat input = arma::join_cols(srcLanguage, tgtLanguage);
+  arma::mat output;
+
+  mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen,
+      tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout);
+
+  FFN model;
+
+  // The output layer maps onto the target vocabulary.
+  model.Add(transformer.Model());
+  model.Add>(dModel * tgtSeqLen, tgtVocabSize);
+  model.Add>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, tgtVocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
 BOOST_AUTO_TEST_SUITE_END();