diff --git a/.gitignore b/.gitignore
index 4708e5bb..92a9136d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 build*
 xcode*
+.vscode/
 .DS_Store
 .idea
 cmake-build-*
diff --git a/models/CMakeLists.txt b/models/CMakeLists.txt
index c4bd5a8c..77aa6f85 100644
--- a/models/CMakeLists.txt
+++ b/models/CMakeLists.txt
@@ -1,7 +1,15 @@
 cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
 project(models)
 
-add_subdirectory(darknet)
+# Every model directory shipped here; each one is added as a subdirectory.
+set(DIRS
+  darknet
+  transformer
+)
+
+foreach(model_dir IN LISTS DIRS)
+  add_subdirectory(${model_dir})
+endforeach()
 
 # Add directory name to sources.
 set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
diff --git a/models/models.hpp b/models/models.hpp
new file mode 100644
index 00000000..6e620e45
--- /dev/null
+++ b/models/models.hpp
@@ -0,0 +1,9 @@
+/**
+ * @file models.hpp
+ * @author Mrityunjay Tripathi
+ *
+ * Convenience header that includes the Transformer encoder and decoder models.
+ */
+
+#include "transformer/encoder.hpp"
+#include "transformer/decoder.hpp"
diff --git a/models/transformer/CMakeLists.txt b/models/transformer/CMakeLists.txt
new file mode 100644
index 00000000..288262a4
--- /dev/null
+++ b/models/transformer/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)
+project(transformer)
+
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../")
+
+set(SOURCES
+  decoder.hpp
+  decoder_impl.hpp
+  encoder.hpp
+  encoder_impl.hpp
+  transformer.hpp
+  transformer_impl.hpp
+)
+
+# Collect this directory plus every header path, then export to the parent.
+set(DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/)
+foreach(header IN LISTS SOURCES)
+  list(APPEND DIR_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/${header})
+endforeach()
+set(DIRS ${DIRS} ${DIR_SRCS} PARENT_SCOPE)
diff --git a/models/transformer/decoder.hpp b/models/transformer/decoder.hpp
new file mode 100644
index 00000000..c0dabb5a
--- /dev/null
+++ b/models/transformer/decoder.hpp
@@ -0,0 +1,230 @@
+/**
+ * @file models/transformer/decoder.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer Decoder layer.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_DECODER_HPP
+#define MODELS_TRANSFORMER_DECODER_HPP
+
+#include <mlpack/prereqs.hpp>
+#include <mlpack/methods/ann/layer/layer_types.hpp>
+#include <mlpack/methods/ann/layer/base_layer.hpp>
+#include <mlpack/methods/ann/regularizer/no_regularizer.hpp>
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * In addition to the two sub-layers in each encoder layer, the decoder inserts
+ * a third sub-layer, which performs multi-head attention over the output of the
+ * encoder stack. Similar to the encoder, we employ residual connections around
+ * each of the sub-layers, followed by layer normalization. We also modify the
+ * self-attention sub-layer in the decoder stack to prevent positions from
+ * attending to subsequent positions. This masking, combined with the fact that
+ * output embeddings are offset by one position, ensures that the predictions
+ * for position i can depend only on the known outputs at positions less than i.
+ *
+ * @tparam ActivationFunction The type of the activation function to be used in
+ *         the position-wise feed forward neural network.
+ * @tparam RegularizerType The type of regularizer to be applied to layer
+ *         parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class TransformerDecoder
+{
+ public:
+  //! Create an empty TransformerDecoder object; no network is built.
+  TransformerDecoder();
+
+  /**
+   * Create the TransformerDecoder object using the specified parameters.
+   *
+   * @param numLayers The number of decoder blocks.
+   * @param tgtSeqLen Target Sequence Length.
+   * @param srcSeqLen Source Sequence Length.
+   * @param dModel The number of features in the input. Also, same as the
+   *        'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask used to black-out future sequences.
+   * @param keyPaddingMask The padding mask used to black-out particular token.
+   */
+  TransformerDecoder(const size_t numLayers,
+                     const size_t tgtSeqLen,
+                     const size_t srcSeqLen,
+                     const size_t dModel = 512,
+                     const size_t numHeads = 8,
+                     const size_t dimFFN = 1024,
+                     const double dropout = 0.1,
+                     const InputDataType& attentionMask = InputDataType(),
+                     const InputDataType& keyPaddingMask = InputDataType());
+
+  /**
+   * Get the Transformer Decoder model.
+   */
+  Sequential<InputDataType, OutputDataType, false>* Model() { return decoder; }
+
+  /**
+   * Load the network from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the network locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the key matrix, the output of the Transformer Encoder.
+  InputDataType const& Key() const { return key; }
+
+  //! Modify the key matrix.
+  InputDataType& Key() { return key; }
+
+ private:
+  /**
+   * This method adds the attention block (masked self-attention followed by
+   * encoder-decoder attention) to the decoder.
+   *
+   * NOTE(review): the outer residual adds the unsliced block input to the
+   * attention output; confirm the shapes agree, also when numLayers > 1.
+   */
+  void AttentionBlock()
+  {
+    Sequential<>* decoderBlockBottom = new Sequential<>();
+    decoderBlockBottom->Add<Subview<>>(1, 0, dModel * tgtSeqLen - 1, 0, -1);
+
+    // Broadcast the decoder input, i.e. query, into (query, key, value).
+    Concat<>* decoderInput = new Concat<>();
+    decoderInput->Add<IdentityLayer<>>();
+    decoderInput->Add<IdentityLayer<>>();
+    decoderInput->Add<IdentityLayer<>>();
+
+    // Masked Self attention layer.
+    Sequential<>* maskedSelfAttention = new Sequential<>();
+    maskedSelfAttention->Add(decoderInput);
+    maskedSelfAttention->Add<MultiheadAttention<
+        InputDataType, OutputDataType, RegularizerType>>(
+        tgtSeqLen, tgtSeqLen, dModel, numHeads, attentionMask);
+
+    // Residual connection around the masked self-attention.
+    AddMerge<>* selfAttentionResidual = new AddMerge<>();
+    selfAttentionResidual->Add(maskedSelfAttention);
+    selfAttentionResidual->Add<IdentityLayer<>>();
+
+    decoderBlockBottom->Add(selfAttentionResidual);
+
+    // Add the LayerNorm layer with required parameters.
+    decoderBlockBottom->Add<LayerNorm<>>(dModel * tgtSeqLen);
+
+    // This layer broadcasts the output of encoder i.e. key into (key, value).
+    Concat<>* broadcastEncoderOutput = new Concat<>();
+    broadcastEncoderOutput->Add<Subview<>>(1, dModel * tgtSeqLen, -1, 0, -1);
+    broadcastEncoderOutput->Add<Subview<>>(1, dModel * tgtSeqLen, -1, 0, -1);
+
+    // Concatenate the decoder query with the encoder output (key, value).
+    Concat<>* encoderDecoderAttentionInput = new Concat<>();
+    encoderDecoderAttentionInput->Add(decoderBlockBottom);
+    encoderDecoderAttentionInput->Add(broadcastEncoderOutput);
+
+    // Encoder-decoder attention.
+    Sequential<>* encoderDecoderAttention = new Sequential<>();
+    encoderDecoderAttention->Add(encoderDecoderAttentionInput);
+    encoderDecoderAttention->Add<MultiheadAttention<
+        InputDataType, OutputDataType, RegularizerType>>(
+        tgtSeqLen, srcSeqLen, dModel, numHeads,
+        InputDataType(), // No attention mask to encoder-decoder attention.
+        keyPaddingMask);
+
+    // Residual connection around the encoder-decoder attention.
+    AddMerge<>* crossAttentionResidual = new AddMerge<>();
+    crossAttentionResidual->Add(encoderDecoderAttention);
+    crossAttentionResidual->Add<IdentityLayer<>>();
+
+    // 'decoder' has a dependent type, so Add<> needs the 'template' keyword.
+    decoder->Add(crossAttentionResidual);
+    decoder->template Add<LayerNorm<>>(dModel * tgtSeqLen);
+  }
+
+  /**
+   * This method adds the position-wise feed forward network to the decoder.
+   */
+  void PositionWiseFFNBlock()
+  {
+    Sequential<>* positionWiseFFN = new Sequential<>();
+    positionWiseFFN->Add<Linear3D<>>(dModel, dimFFN);
+    positionWiseFFN->Add<ActivationFunction>();
+    positionWiseFFN->Add<Linear3D<>>(dimFFN, dModel);
+    positionWiseFFN->Add<Dropout<>>(dropout);
+
+    // Residual connection, followed by layer normalization as in the encoder.
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(positionWiseFFN);
+    residualAdd->Add<IdentityLayer<>>();
+    decoder->Add(residualAdd);
+    decoder->template Add<LayerNorm<>>(dModel * tgtSeqLen);
+  }
+
+  //! Locally-stored number of decoder layers.
+  size_t numLayers;
+
+  //! Locally-stored target sequence length.
+  size_t tgtSeqLen;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored key matrix exposed through Key().
+  InputDataType key;
+
+  //! Locally-stored complete decoder network.
+  Sequential<InputDataType, OutputDataType, false>* decoder;
+
+}; // class TransformerDecoder
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "decoder_impl.hpp"
+
+#endif
diff --git a/models/transformer/decoder_impl.hpp b/models/transformer/decoder_impl.hpp
new file mode 100644
index 00000000..0fb8c89c
--- /dev/null
+++ b/models/transformer/decoder_impl.hpp
@@ -0,0 +1,91 @@
+/**
+ * @file models/transformer/decoder_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer Decoder class.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_DECODER_IMPL_HPP
+#define MODELS_TRANSFORMER_DECODER_IMPL_HPP
+
+#include "decoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+// Default constructor: every size is zero and no network is allocated.
+template<typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+TransformerDecoder<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::TransformerDecoder() :
+    numLayers(0),
+    tgtSeqLen(0),
+    srcSeqLen(0),
+    dModel(0),
+    numHeads(0),
+    dimFFN(0),
+    dropout(0),
+    decoder(NULL)
+{ /* Nothing to do here. */ }
+
+template<typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+TransformerDecoder<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::TransformerDecoder(
+    const size_t numLayers,
+    const size_t tgtSeqLen,
+    const size_t srcSeqLen,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    tgtSeqLen(tgtSeqLen),
+    srcSeqLen(srcSeqLen),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask),
+    decoder(new Sequential<InputDataType, OutputDataType, false>())
+{
+  // Stack one attention block and one feed forward block per decoder layer.
+  for (size_t layer = 0; layer != numLayers; ++layer)
+  {
+    AttentionBlock();
+    PositionWiseFFNBlock();
+  }
+}
+
+template<typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void TransformerDecoder<ActivationFunction, RegularizerType,
+InputDataType, OutputDataType>::LoadModel(const std::string& filepath)
+{
+  if (data::Load(filepath, "TransformerDecoder", decoder))
+    std::cout << "Loaded model" << std::endl;
+}
+
+template<typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void TransformerDecoder<ActivationFunction, RegularizerType,
+InputDataType, OutputDataType>::SaveModel(const std::string& filepath)
+{
+  std::cout << "Saving model" << std::endl;
+  if (data::Save(filepath, "TransformerDecoder", decoder))
+    std::cout << "Model saved in " << filepath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/models/transformer/encoder.hpp b/models/transformer/encoder.hpp
new file mode 100644
index 00000000..ca38abff
--- /dev/null
+++ b/models/transformer/encoder.hpp
@@ -0,0 +1,191 @@
+/**
+ * @file models/transformer/encoder.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer Encoder layer.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_ENCODER_HPP
+#define MODELS_TRANSFORMER_ENCODER_HPP
+
+#include <mlpack/prereqs.hpp>
+#include <mlpack/methods/ann/layer/layer_types.hpp>
+#include <mlpack/methods/ann/layer/base_layer.hpp>
+#include <mlpack/methods/ann/regularizer/no_regularizer.hpp>
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * The Transformer Encoder layer has two sub-layers. The first is a multi-head
+ * self-attention mechanism, and the second is a simple, position-wise
+ * fully connected feed-forward network. We employ a residual connection around
+ * each of the two sub-layers, followed by layer normalization. Hence the output
+ * of each sub-layer is 'LayerNorm(x + Sublayer(x))', where 'Sublayer(x)' is the
+ * function implemented by the sub-layer itself. To facilitate these residual
+ * connections, all sub-layers in the model, as well as the embedding layers,
+ * produce outputs of dimension 'dModel'.
+ *
+ * @tparam ActivationFunction The type of activation function to be used in the
+ *         position-wise feed forward neural network.
+ * @tparam RegularizerType The regularizer type to be applied on layer
+ *         parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class TransformerEncoder
+{
+ public:
+  /**
+   * Create the TransformerEncoder object using the specified parameters.
+   *
+   * @param numLayers The number of encoder blocks.
+   * @param srcSeqLen Source Sequence Length.
+   * @param dModel The number of features in the input. Also, same as the
+   *               'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask to be applied to the sequences.
+   * @param keyPaddingMask The key padding mask applied to the sequences.
+   */
+  TransformerEncoder(const size_t numLayers,
+                     const size_t srcSeqLen,
+                     const size_t dModel = 512,
+                     const size_t numHeads = 2,
+                     const size_t dimFFN = 1024,
+                     const double dropout = 0.1,
+                     const InputDataType& attentionMask = InputDataType(),
+                     const InputDataType& keyPaddingMask = InputDataType());
+
+  //! Get the Transformer Encoder model.
+  Sequential<InputDataType, OutputDataType, false>* Model() { return encoder; }
+
+  /**
+   * Load the encoder block from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the encoder block locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the attention mask.
+  InputDataType const& AttentionMask() const { return attentionMask; }
+
+  //! Modify the attention mask.
+  InputDataType& AttentionMask() { return attentionMask; }
+
+  //! Get the key padding mask.
+  InputDataType const& KeyPaddingMask() const { return keyPaddingMask; }
+
+  //! Modify the key padding mask.
+  InputDataType& KeyPaddingMask() { return keyPaddingMask; }
+
+ private:
+  /**
+   * The method adds attention block to the encoder block.
+   *
+   * NOTE(review): attentionMask and keyPaddingMask are not handed to the
+   * MultiheadAttention layer here (TransformerDecoder does); confirm intent.
+   */
+  void AttentionBlock()
+  {
+    Concat<>* input = new Concat<>();
+    input->Add<IdentityLayer<>>();
+    input->Add<IdentityLayer<>>();
+    input->Add<IdentityLayer<>>();
+
+    /* Self attention layer. */
+    Sequential<>* selfAttention = new Sequential<>();
+    selfAttention->Add(input);
+    selfAttention->Add<MultiheadAttention<
+        InputDataType, OutputDataType, RegularizerType>
+        >(srcSeqLen, srcSeqLen, dModel, numHeads);
+
+    /* This layer adds a residual connection. */
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(selfAttention);
+    residualAdd->Add<IdentityLayer<>>();
+
+    // 'encoder' has a dependent type, so Add<> needs the 'template' keyword.
+    encoder->Add(residualAdd);
+    encoder->template Add<LayerNorm<>>(dModel * srcSeqLen);
+  }
+
+  /**
+   * This method adds position-wise feed forward block to the encoder.
+   */
+  void PositionWiseFFNBlock()
+  {
+    Sequential<>* positionWiseFFN = new Sequential<>();
+    positionWiseFFN->Add<Linear3D<>>(dModel, dimFFN);
+    positionWiseFFN->Add<ActivationFunction>();
+    positionWiseFFN->Add<Linear3D<>>(dimFFN, dModel);
+    positionWiseFFN->Add<Dropout<>>(dropout);
+
+    /* This layer adds a residual connection. */
+    AddMerge<>* residualAdd = new AddMerge<>();
+    residualAdd->Add(positionWiseFFN);
+    residualAdd->Add<IdentityLayer<>>();
+
+    // 'encoder' has a dependent type, so Add<> needs the 'template' keyword.
+    encoder->Add(residualAdd);
+    encoder->template Add<LayerNorm<>>(dModel * srcSeqLen);
+  }
+
+  //! Locally-stored number of encoder blocks.
+  size_t numLayers;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored encoder block.
+  Sequential<InputDataType, OutputDataType, false>* encoder;
+
+}; // class TransformerEncoder
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "encoder_impl.hpp"
+
+#endif
diff --git a/models/transformer/encoder_impl.hpp b/models/transformer/encoder_impl.hpp
new file mode 100644
index 00000000..592bae33
--- /dev/null
+++ b/models/transformer/encoder_impl.hpp
@@ -0,0 +1,76 @@
+/**
+ * @file models/transformer/encoder_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer Encoder class.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_ENCODER_IMPL_HPP
+#define MODELS_TRANSFORMER_ENCODER_IMPL_HPP
+
+#include "encoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+TransformerEncoder<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::TransformerEncoder(
+    const size_t numLayers,
+    const size_t srcSeqLen,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    srcSeqLen(srcSeqLen),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask)
+{
+  encoder = new Sequential<InputDataType, OutputDataType, false>();
+  // 'encoder' has a dependent type, so Add<> needs the 'template' keyword.
+  encoder->template Add<IdentityLayer<>>();
+
+  // Stack one attention block and one feed forward block per encoder layer.
+  for (size_t N = 0; N < numLayers; ++N)
+  {
+    AttentionBlock();
+    PositionWiseFFNBlock();
+  }
+}
+
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void TransformerEncoder<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::LoadModel(const std::string& filepath)
+{
+  if (data::Load(filepath, "TransformerEncoder", encoder))
+    std::cout << "Loaded model" << std::endl;
+}
+
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void TransformerEncoder<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::SaveModel(const std::string& filepath)
+{
+  std::cout << "Saving model" << std::endl;
+  if (data::Save(filepath, "TransformerEncoder", encoder))
+    std::cout << "Model saved in " << filepath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/models/transformer/transformer.hpp b/models/transformer/transformer.hpp
new file mode 100644
index 00000000..8cc1a94f
--- /dev/null
+++ b/models/transformer/transformer.hpp
@@ -0,0 +1,156 @@
+/**
+ * @file models/transformer/transformer.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Definition of the Transformer model.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_TRANSFORMER_HPP
+#define MODELS_TRANSFORMER_TRANSFORMER_HPP
+
+#include <mlpack/prereqs.hpp>
+#include <mlpack/methods/ann/ffn.hpp>
+#include <mlpack/methods/ann/layer/layer_types.hpp>
+#include <mlpack/methods/ann/layer/base_layer.hpp>
+#include <mlpack/methods/ann/regularizer/no_regularizer.hpp>
+
+#include "encoder.hpp"
+#include "decoder.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * The Transformer model: embedded source tokens pass through the encoder
+ * stack, whose output is attended to by the decoder stack.
+ *
+ * @tparam ActivationFunction The type of activation function to be used in the
+ *         position-wise feed forward neural network.
+ * @tparam RegularizerType The regularizer type to be applied on layer
+ *         parameters.
+ * @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ * @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
+ *         arma::sp_mat or arma::cube).
+ */
+template <
+  typename ActivationFunction = ReLULayer<>,
+  typename RegularizerType = NoRegularizer,
+  typename InputDataType = arma::mat,
+  typename OutputDataType = arma::mat
+>
+class Transformer
+{
+ public:
+  /**
+   * Create the Transformer object using the specified parameters.
+   *
+   * @param numLayers The number of encoder and decoder layers.
+   * @param tgtSeqLen Target Sequence Length.
+   * @param srcSeqLen Source Sequence Length.
+   * @param tgtVocabSize Target vocabulary size.
+   * @param srcVocabSize Source vocabulary size.
+   * @param dModel The number of features in the input. Also, same as the
+   *               'embedDim' in 'MultiheadAttention' layer.
+   * @param numHeads The number of attention heads. NOTE(review): the default
+   *        of 12 does not divide the default dModel of 512; confirm it is valid.
+   * @param dimFFN The dimensionality of feedforward network.
+   * @param dropout The dropout rate.
+   * @param attentionMask The attention mask to be applied to the sequences.
+   * @param keyPaddingMask The key padding mask applied to the sequences.
+   */
+  Transformer(const size_t numLayers,
+              const size_t tgtSeqLen,
+              const size_t srcSeqLen,
+              const size_t tgtVocabSize,
+              const size_t srcVocabSize,
+              const size_t dModel = 512,
+              const size_t numHeads = 12,
+              const size_t dimFFN = 1024,
+              const double dropout = 0.1,
+              const InputDataType& attentionMask = InputDataType(),
+              const InputDataType& keyPaddingMask = InputDataType());
+
+  //! Get the complete Transformer model.
+  Sequential<InputDataType, OutputDataType, false>* Model()
+  {
+    return transformer;
+  }
+
+  /**
+   * Load the Transformer model from a local directory.
+   *
+   * @param filepath The location of the stored model.
+   */
+  void LoadModel(const std::string& filepath);
+
+  /**
+   * Save the Transformer model locally.
+   *
+   * @param filepath The location where the model is to be saved.
+   */
+  void SaveModel(const std::string& filepath);
+
+  //! Get the attention mask.
+  InputDataType const& AttentionMask() const { return attentionMask; }
+
+  //! Modify the attention mask.
+  InputDataType& AttentionMask() { return attentionMask; }
+
+  //! Get the key padding mask.
+  InputDataType const& KeyPaddingMask() const { return keyPaddingMask; }
+
+  //! Modify the key padding mask.
+  InputDataType& KeyPaddingMask() { return keyPaddingMask; }
+
+ private:
+  //! Locally-stored number of encoder and decoder layers.
+  size_t numLayers;
+
+  //! Locally-stored target sequence length.
+  size_t tgtSeqLen;
+
+  //! Locally-stored source sequence length.
+  size_t srcSeqLen;
+
+  //! Locally-stored vocabulary size of the target.
+  size_t tgtVocabSize;
+
+  //! Locally-stored vocabulary size of the source.
+  size_t srcVocabSize;
+
+  //! Locally-stored number of features in the input (embedding dimension).
+  size_t dModel;
+
+  //! Locally-stored number of attention heads.
+  size_t numHeads;
+
+  //! Locally-stored dimensionality of the feed forward network.
+  size_t dimFFN;
+
+  //! Locally-stored dropout rate.
+  double dropout;
+
+  //! Locally-stored attention mask.
+  InputDataType attentionMask;
+
+  //! Locally-stored key padding mask.
+  InputDataType keyPaddingMask;
+
+  //! Locally-stored transformer model.
+  Sequential<InputDataType, OutputDataType, false>* transformer;
+}; // class Transformer
+
+} // namespace ann
+} // namespace mlpack
+
+// Include implementation.
+#include "transformer_impl.hpp"
+
+#endif
diff --git a/models/transformer/transformer_impl.hpp b/models/transformer/transformer_impl.hpp
new file mode 100644
index 00000000..ff48e859
--- /dev/null
+++ b/models/transformer/transformer_impl.hpp
@@ -0,0 +1,126 @@
+/**
+ * @file models/transformer/transformer_impl.hpp
+ * @author Mikhail Lozhnikov
+ * @author Mrityunjay Tripathi
+ *
+ * Implementation of the Transformer model.
+ *
+ * mlpack is free software; you may redistribute it and/or modify it under the
+ * terms of the 3-clause BSD license.  You should have received a copy of the
+ * 3-clause BSD license along with mlpack.  If not, see
+ * http://www.opensource.org/licenses/BSD-3-Clause for more information.
+ */
+
+#ifndef MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP
+#define MODELS_TRANSFORMER_TRANSFORMER_IMPL_HPP
+
+#include "transformer.hpp"
+
+namespace mlpack {
+namespace ann /** Artificial Neural Network. */ {
+
+/**
+ * Assemble the complete Transformer network.
+ *
+ * The input matrix is expected to stack the source sequence on top of the
+ * target sequence. The source rows are embedded, positionally encoded and run
+ * through the encoder stack; the target rows are embedded and positionally
+ * encoded. Both results are concatenated and handed to the decoder stack,
+ * which becomes the only child of 'transformer'.
+ */
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+Transformer<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::Transformer(
+    const size_t numLayers,
+    const size_t tgtSeqLen,
+    const size_t srcSeqLen,
+    const size_t tgtVocabSize,
+    const size_t srcVocabSize,
+    const size_t dModel,
+    const size_t numHeads,
+    const size_t dimFFN,
+    const double dropout,
+    const InputDataType& attentionMask,
+    const InputDataType& keyPaddingMask) :
+    numLayers(numLayers),
+    tgtSeqLen(tgtSeqLen),
+    srcSeqLen(srcSeqLen),
+    tgtVocabSize(tgtVocabSize),
+    srcVocabSize(srcVocabSize),
+    dModel(dModel),
+    numHeads(numHeads),
+    dimFFN(dimFFN),
+    dropout(dropout),
+    attentionMask(attentionMask),
+    keyPaddingMask(keyPaddingMask)
+{
+  transformer = new Sequential<InputDataType, OutputDataType>();
+
+  Sequential<>* encoder = new Sequential<>();
+
+  // Pull out the sequences of source language which is stacked above in the
+  // input matrix. Here 'lastCol = -1' denotes up to last batch of input matrix.
+  encoder->Add<Subview<>>(1, 0, srcSeqLen - 1, 0, -1);
+  encoder->Add<Lookup<>>(srcVocabSize, dModel);
+  encoder->Add<PositionalEncoding<>>(dModel, srcSeqLen);
+
+  // Build the encoder stack with this model's own template arguments so that
+  // the chosen activation function and regularizer are actually used.
+  TransformerEncoder<ActivationFunction, RegularizerType, InputDataType,
+      OutputDataType> encoderStack(
+      numLayers, srcSeqLen, dModel, numHeads, dimFFN, dropout,
+      attentionMask, keyPaddingMask);
+
+  encoder->Add(encoderStack.Model());
+
+  Sequential<>* decoderPE = new Sequential<>();
+
+  // Pull out the sequences of target language which is stacked below in the
+  // input matrix. Here 'lastRow = -1' and 'lastCol = -1' denotes up to last
+  // row and last batch of the input matrix respectively.
+  decoderPE->Add<Subview<>>(1, srcSeqLen, -1, 0, -1);
+  decoderPE->Add<Lookup<>>(tgtVocabSize, dModel);
+  decoderPE->Add<PositionalEncoding<>>(dModel, tgtSeqLen);
+
+  Concat<>* encoderDecoderConcat = new Concat<>();
+  encoderDecoderConcat->Add(encoder);
+  encoderDecoderConcat->Add(decoderPE);
+
+  Sequential<>* decoder = new Sequential<>();
+  decoder->Add(encoderDecoderConcat);
+
+  // The decoder stack likewise reuses this model's template arguments instead
+  // of the defaults.
+  TransformerDecoder<ActivationFunction, RegularizerType, InputDataType,
+      OutputDataType> decoderStack(
+      numLayers, tgtSeqLen, srcSeqLen, dModel, numHeads, dimFFN, dropout,
+      attentionMask, keyPaddingMask);
+
+  decoder->Add(decoderStack.Model());
+  transformer->Add(decoder);
+}
+
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void Transformer<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::LoadModel(const std::string& filepath)
+{
+  if (data::Load(filepath, "Transformer", transformer))
+    std::cout << "Loaded model" << std::endl;
+}
+
+template <typename ActivationFunction, typename RegularizerType,
+    typename InputDataType, typename OutputDataType>
+void Transformer<ActivationFunction, RegularizerType, InputDataType,
+OutputDataType>::SaveModel(const std::string& filepath)
+{
+  std::cout << "Saving model" << std::endl;
+  if (data::Save(filepath, "Transformer", transformer))
+    std::cout << "Model saved in " << filepath << std::endl;
+}
+
+} // namespace ann
+} // namespace mlpack
+
+#endif
diff --git a/tests/ffn_model_tests.cpp b/tests/ffn_model_tests.cpp
index 84e42eb7..5efa9df8 100644
--- a/tests/ffn_model_tests.cpp
+++ b/tests/ffn_model_tests.cpp
@@ -14,6 +14,9 @@
 #include <ensmallen.hpp>
 #include <dataloader/dataloader.hpp>
 #include <models/darknet/darknet.hpp>
+#include <models/transformer/encoder.hpp>
+#include <models/transformer/decoder.hpp>
+#include <models/transformer/transformer.hpp>
 #include <boost/test/unit_test.hpp>
 
 // Use namespaces for convenience.
@@ -42,4 +45,115 @@ BOOST_AUTO_TEST_CASE(DarknetModelTest)
   BOOST_REQUIRE_EQUAL(output.n_rows, 1000);
 }
 
+/**
+ * Simple Transformer Encoder test: checks the shape of a single prediction.
+ */
+BOOST_AUTO_TEST_CASE(TransformerEncoderTest)
+{
+  const size_t vocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  arma::mat input = arma::randu(dModel * srcSeqLen, 1);
+  arma::mat output;
+
+  mlpack::ann::TransformerEncoder<> encoder(numLayers, srcSeqLen,
+      dModel, numHeads, dimFFN, dropout);
+
+  FFN<NegativeLogLikelihood<>, XavierInitialization> model;
+
+  model.Add(encoder.Model());
+  model.Add<Linear<>>(dModel * srcSeqLen, vocabSize);
+  model.Add<LogSoftMax<>>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
+/**
+ * Simple Transformer Decoder test: checks the shape of a single prediction.
+ */
+BOOST_AUTO_TEST_CASE(TransformerDecoderTest)
+{
+  const size_t vocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t tgtSeqLen = 10;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  arma::mat query = arma::randu(dModel * tgtSeqLen, 1);
+  arma::mat memory = 0.73 * arma::randu(dModel * srcSeqLen, 1);
+
+  arma::mat input = arma::join_cols(query, memory);
+  arma::mat output;
+
+  mlpack::ann::TransformerDecoder<> decoder(numLayers, tgtSeqLen, srcSeqLen,
+      dModel, numHeads, dimFFN, dropout);
+
+  FFN<NegativeLogLikelihood<>, XavierInitialization> model;
+
+  model.Add(decoder.Model());
+  model.Add<Linear<>>(dModel * tgtSeqLen, vocabSize);
+  model.Add<LogSoftMax<>>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, vocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
+/**
+ * Transformer Model test: checks the shape of a single prediction.
+ */
+BOOST_AUTO_TEST_CASE(TransformerTest)
+{
+  const size_t tgtVocabSize = 20;
+  const size_t srcVocabSize = 20;
+  const size_t numLayers = 2;
+  const size_t tgtSeqLen = 10;
+  const size_t srcSeqLen = 10;
+  const size_t dModel = 16;
+  const size_t numHeads = 2;
+  const size_t dimFFN = 16;
+  const double dropout = 0.3;
+
+  arma::mat srcLanguage(srcSeqLen, 1), tgtLanguage(tgtSeqLen, 1);
+
+  for (size_t t = 0; t < srcSeqLen; ++t)
+  {
+    srcLanguage(t) = mlpack::math::RandInt(1, srcVocabSize);
+  }
+
+  for (size_t t = 0; t < tgtSeqLen; ++t)
+  {
+    tgtLanguage(t) = mlpack::math::RandInt(1, tgtVocabSize);
+  }
+
+  arma::mat input = arma::join_cols(srcLanguage, tgtLanguage);
+  arma::mat output;
+
+  mlpack::ann::Transformer<> transformer(numLayers, tgtSeqLen, srcSeqLen,
+      tgtVocabSize, srcVocabSize, dModel, numHeads, dimFFN, dropout);
+
+  FFN<NegativeLogLikelihood<>, XavierInitialization> model;
+
+  model.Add(transformer.Model());
+  model.Add<Linear<>>(dModel * tgtSeqLen, tgtVocabSize);
+  model.Add<LogSoftMax<>>();
+
+  model.Predict(input, output);
+
+  BOOST_REQUIRE_EQUAL(output.n_rows, tgtVocabSize);
+  BOOST_REQUIRE_EQUAL(output.n_cols, 1);
+}
+
 BOOST_AUTO_TEST_SUITE_END();