From e3f5b83ba306e543b7a3dc2aae382a03b09918a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Troels=20F=2E=20R=C3=B8nnow?= Date: Tue, 14 May 2019 14:55:12 +0100 Subject: [PATCH] Feature/blas refactor (#983) * Creating BLAS routines for tensor * bugfix for embeddings * tidying and style * style * fixed bug - failure to zero out accumulated gradients * removed unecessary copies & minor refactor * style * updated embeddings backward test to check gradients zeroed appropriately + gradients copy export for weights * Changes * Adding iterator tests * Fixing iterator test * Fixing issues * Fixing naming * Minor changes * Adding licenses * Minor fixes * Adding default values in Tensor * Updating tensor impl. * Updating style * Fixing Relu and adding test * Some minor changes to convolution 1d * Adding enhanced tensor iterator * trivial style changes * update conv1d test * bugfixes to convolution tests * Updating style * Fixing cast error * Restarting jenkins * Addressing comments * Merging develop and updating * Fixing slice * Making embeddings test pass * Fixing some style issues * Fixing some style issues * fix * Updating * bugfix relu backward --- .gitignore | 1 + README.md | 2 +- libs/math/CMakeLists.txt | 2 +- libs/math/benchmark/basic_math/exp_bench.cpp | 16 +- libs/math/include/math/linalg/blas/base.hpp | 41 + .../math/linalg/blas/gemm_nn_novector.hpp | 56 ++ .../math/linalg/blas/gemm_nn_vector.hpp | 56 ++ .../math/linalg/blas/gemm_nt_novector.hpp | 57 ++ .../math/linalg/blas/gemm_nt_vector.hpp | 56 ++ .../math/linalg/blas/gemm_tn_novector.hpp | 57 ++ .../math/linalg/blas/gemm_tn_vector.hpp | 56 ++ .../math/linalg/blas/gemm_tt_novector.hpp | 57 ++ .../math/linalg/blas/gemm_tt_vector.hpp | 57 ++ libs/math/include/math/linalg/blas/gemv_n.hpp | 70 ++ libs/math/include/math/linalg/blas/gemv_t.hpp | 70 ++ .../include/math/linalg/blas/scal_all.hpp | 63 ++ .../include/math/linalg/blas/swap_all.hpp | 72 ++ libs/math/include/math/linalg/prototype.hpp | 176 ++++ libs/math/include/math/matrix_operations.hpp | 33 +- libs/math/include/math/tensor.hpp | 840 +++++++++--------- libs/math/include/math/tensor_broadcast.hpp | 13 +- libs/math/include/math/tensor_iterator.hpp | 288 +----- .../include/math/tensor_slice_iterator.hpp | 379 ++++++++ libs/math/include/math/tensor_squeeze.hpp | 20 +- .../src/math/linalg/blas/gemm_nn_novector.cpp | 108 +++ .../src/math/linalg/blas/gemm_nn_vector.cpp | 137 +++ .../src/math/linalg/blas/gemm_nt_novector.cpp | 111 +++ .../src/math/linalg/blas/gemm_nt_vector.cpp | 137 +++ .../src/math/linalg/blas/gemm_tn_novector.cpp | 105 +++ .../src/math/linalg/blas/gemm_tn_vector.cpp | 117 +++ .../src/math/linalg/blas/gemm_tt_novector.cpp | 105 +++ .../src/math/linalg/blas/gemm_tt_vector.cpp | 115 +++ .../src/math/linalg/blas/gemv_n_novector.cpp | 165 ++++ .../src/math/linalg/blas/gemv_n_vector.cpp | 183 ++++ .../src/math/linalg/blas/gemv_t_novector.cpp | 168 ++++ .../src/math/linalg/blas/gemv_t_vector.cpp | 184 ++++ libs/math/src/math/linalg/blas/scal_all.cpp | 97 ++ libs/math/src/math/linalg/blas/swap_all.cpp | 115 +++ libs/math/tests/CMakeLists.txt | 2 + .../math/linalg/blas/gemm_nn_novector.cpp | 312 +++++++ .../tests/math/linalg/blas/gemm_nn_vector.cpp | 310 +++++++ .../math/linalg/blas/gemm_nt_novector.cpp | 274 ++++++ .../tests/math/linalg/blas/gemm_nt_vector.cpp | 333 +++++++ .../math/linalg/blas/gemm_tn_novector.cpp | 330 +++++++ .../tests/math/linalg/blas/gemm_tn_vector.cpp | 298 +++++++ .../math/linalg/blas/gemm_tt_novector.cpp | 184 ++++ 
.../tests/math/linalg/blas/gemm_tt_vector.cpp | 145 +++ .../math/linalg/blas/gemv_n_novector.cpp | 244 +++++ .../tests/math/linalg/blas/gemv_n_vector.cpp | 244 +++++ .../math/linalg/blas/gemv_t_novector.cpp | 246 +++++ .../tests/math/linalg/blas/gemv_t_vector.cpp | 246 +++++ libs/math/tests/math/linalg/blas/scal_all.cpp | 132 +++ libs/math/tests/math/linalg/blas/swap_all.cpp | 280 ++++++ .../matrix_operations/matrix_operations.cpp | 7 +- libs/math/tests/math/tensor/tensor_concat.cpp | 2 +- .../tests/math/tensor/tensor_indexing.cpp | 37 +- .../tests/math/tensor_iterator/broadcast.cpp | 108 +++ .../tests/math/tensor_iterator/iterator.cpp | 238 +++++ .../tests/math/tensor_iterator/squeeze.cpp | 81 ++ libs/ml/include/ml/clustering/tsne.hpp | 5 +- libs/ml/include/ml/layers/skip_gram.hpp | 4 +- libs/ml/include/ml/ops/activations/relu.hpp | 35 +- libs/ml/include/ml/ops/convolution_1d.hpp | 30 +- libs/ml/include/ml/ops/convolution_2d.hpp | 3 + libs/ml/include/ml/ops/embeddings.hpp | 20 +- libs/ml/include/ml/ops/ops.hpp | 5 +- libs/ml/include/ml/ops/weights.hpp | 9 + libs/ml/include/ml/subgraph.hpp | 15 +- libs/ml/tests/ml/layers/convolution_1d.cpp | 27 +- libs/ml/tests/ml/layers/convolution_2d.cpp | 93 +- libs/ml/tests/ml/ops/embeddings.cpp | 25 +- .../include/vectorise/memory/vector_slice.hpp | 2 - 72 files changed, 7921 insertions(+), 790 deletions(-) create mode 100644 libs/math/include/math/linalg/blas/base.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_nn_novector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_nn_vector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_nt_novector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_nt_vector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_tn_novector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_tn_vector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_tt_novector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemm_tt_vector.hpp create mode 100644 libs/math/include/math/linalg/blas/gemv_n.hpp create mode 100644 libs/math/include/math/linalg/blas/gemv_t.hpp create mode 100644 libs/math/include/math/linalg/blas/scal_all.hpp create mode 100644 libs/math/include/math/linalg/blas/swap_all.hpp create mode 100644 libs/math/include/math/linalg/prototype.hpp create mode 100644 libs/math/include/math/tensor_slice_iterator.hpp create mode 100644 libs/math/src/math/linalg/blas/gemm_nn_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_nn_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_nt_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_nt_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_tn_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_tn_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_tt_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemm_tt_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemv_n_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemv_n_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemv_t_novector.cpp create mode 100644 libs/math/src/math/linalg/blas/gemv_t_vector.cpp create mode 100644 libs/math/src/math/linalg/blas/scal_all.cpp create mode 100644 libs/math/src/math/linalg/blas/swap_all.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_nn_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_nn_vector.cpp create mode 100644 
libs/math/tests/math/linalg/blas/gemm_nt_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_nt_vector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_tn_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_tn_vector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_tt_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemm_tt_vector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemv_n_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemv_n_vector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemv_t_novector.cpp create mode 100644 libs/math/tests/math/linalg/blas/gemv_t_vector.cpp create mode 100644 libs/math/tests/math/linalg/blas/scal_all.cpp create mode 100644 libs/math/tests/math/linalg/blas/swap_all.cpp create mode 100644 libs/math/tests/math/tensor_iterator/broadcast.cpp create mode 100644 libs/math/tests/math/tensor_iterator/iterator.cpp create mode 100644 libs/math/tests/math/tensor_iterator/squeeze.cpp diff --git a/.gitignore b/.gitignore index 7cc9109550..dafbec54d0 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ CMakeCache.txt nodes/ .ipynb_checkpoints/ +docker-images/ # Legacy Editors \#*# diff --git a/README.md b/README.md index d789b9690b..1963920ad1 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ features. Fetch will be delivering regular updates. 3. [Community Website](https://community.fetch.ai/) 4. [Community Telegram Group](https://t.me/fetchai) 5. [Whitepapers](https://fetch.ai/publications.html) -6. [Roadmap](https://fetch.ai/#/roadmap) +6. [Roadmap](https://fetch.ai/#/roadmap) ## Supported platforms diff --git a/libs/math/CMakeLists.txt b/libs/math/CMakeLists.txt index 70dc5bf5d1..de4efef366 100644 --- a/libs/math/CMakeLists.txt +++ b/libs/math/CMakeLists.txt @@ -15,7 +15,7 @@ setup_compiler() #------------------------------------------------------------------------------- setup_library(fetch-math) -target_link_libraries(fetch-math INTERFACE fetch-core) +target_link_libraries(fetch-math PUBLIC fetch-core) add_test_target() diff --git a/libs/math/benchmark/basic_math/exp_bench.cpp b/libs/math/benchmark/basic_math/exp_bench.cpp index 5c25704f39..753a59ef1f 100644 --- a/libs/math/benchmark/basic_math/exp_bench.cpp +++ b/libs/math/benchmark/basic_math/exp_bench.cpp @@ -29,14 +29,16 @@ static void BM_ApproxExpImplementation(benchmark::State &state) { fetch::math::ApproxExpImplementation fexp; - double x = (double)state.range(0); - double result; + double x = 0.1; //(double)state.range(0); + double result = 0; for (auto _ : state) { // Single iteration is too small to get accurate benchmarks. + x += 0.1; for (int i = 0; i < 1000; i++) { - benchmark::DoNotOptimize(result = fexp(x)); + x += 0.0001; + result += fexp(x); } } } @@ -46,14 +48,16 @@ BENCHMARK_TEMPLATE(BM_ApproxExpImplementation, 12, 60801)->RangeMultiplier(10)-> static void BM_exp(benchmark::State &state) { - double x = (double)state.range(0); - double result; + double x = 0.1; // (double)state.range(0); + double result = 0.0; for (auto _ : state) { // Single iteration is too small to get accurate benchmarks. 
+ x += 0.1; for (int i = 0; i < 1000; i++) { - benchmark::DoNotOptimize(result = exp(x)); + x += 0.0001; + result += exp(x); } } } diff --git a/libs/math/include/math/linalg/blas/base.hpp b/libs/math/include/math/linalg/blas/base.hpp new file mode 100644 index 0000000000..52f5163ced --- /dev/null +++ b/libs/math/include/math/linalg/blas/base.hpp @@ -0,0 +1,41 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "vectorise/memory/shared_array.hpp" +#include "vectorise/platform.hpp" + +namespace fetch { +namespace math { + +template +class Tensor; + +namespace linalg { +template +class Blas +{ +public: + template + void operator()(Args... args) = delete; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch diff --git a/libs/math/include/math/linalg/blas/gemm_nn_novector.hpp b/libs/math/include/math/linalg/blas/gemm_nn_novector.hpp new file mode 100644 index 0000000000..d905eeeceb --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_nn_novector.hpp @@ -0,0 +1,56 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_nn_novector(alpha, A, B, beta, C): + * C = alpha * np.dot(A, B) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_nn_vector.hpp b/libs/math/include/math/linalg/blas/gemm_nn_vector.hpp new file mode 100644 index 0000000000..29eed0692d --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_nn_vector.hpp @@ -0,0 +1,56 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_nn_vector(alpha, A, B, beta, C): + * C = alpha * np.dot(A, B) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_nt_novector.hpp b/libs/math/include/math/linalg/blas/gemm_nt_novector.hpp new file mode 100644 index 0000000000..e04bcdde1f --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_nt_novector.hpp @@ -0,0 +1,57 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_nt_novector(alpha, A, B, beta, C): + * C = alpha * np.dot(A, B.T) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_nt_vector.hpp b/libs/math/include/math/linalg/blas/gemm_nt_vector.hpp new file mode 100644 index 0000000000..25231a8a1c --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_nt_vector.hpp @@ -0,0 +1,56 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_nt_vector(alpha, A, B, beta, C): + * C = alpha * np.dot(A, B.T) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_tn_novector.hpp b/libs/math/include/math/linalg/blas/gemm_tn_novector.hpp new file mode 100644 index 0000000000..85dc5d468d --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_tn_novector.hpp @@ -0,0 +1,57 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_tn_novector(alpha, A, B, beta, C): + * C = alpha * np.dot(A.T, B) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_tn_vector.hpp b/libs/math/include/math/linalg/blas/gemm_tn_vector.hpp new file mode 100644 index 0000000000..b5216c7850 --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_tn_vector.hpp @@ -0,0 +1,56 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_tn_vector(alpha, A, B, beta, C): + * C = alpha * np.dot(A.T, B) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_tt_novector.hpp b/libs/math/include/math/linalg/blas/gemm_tt_novector.hpp new file mode 100644 index 0000000000..0d75edd5c3 --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_tt_novector.hpp @@ -0,0 +1,57 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_tt_novector(alpha, A, B, beta, C): + * C = alpha * np.dot(A.T, B.T) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemm_tt_vector.hpp b/libs/math/include/math/linalg/blas/gemm_tt_vector.hpp new file mode 100644 index 0000000000..6c2811dd11 --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemm_tt_vector.hpp @@ -0,0 +1,57 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemm_tt_vector(alpha, A, B, beta, C): + * C = alpha * np.dot(A.T, B.T) + beta * C + * + * return C + * + * Authors: + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemv_n.hpp b/libs/math/include/math/linalg/blas/gemv_n.hpp new file mode 100644 index 0000000000..43a2066cfb --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemv_n.hpp @@ -0,0 +1,70 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemv_n(alpha, A, x, n, beta, y, m): + * leny = A.shape[0] + * lenx = A.shape[1] + * if m >= 0 and n >= 0: + * y[::m] = alpha * np.dot(A, x[::n]) + beta * y[::m] + * elif m < 0 and n >= 0: + * y[-(leny -1)*m::m] = alpha * np.dot(A, x[::n]) + beta * y[-(leny -1)*m::m] + * elif m >= 0 and n < 0: + * y[::m] = alpha * np.dot(A, x[-(lenx -1)*n::n]) + beta * y[::m] + * else: + * y[-(leny -1)*m::m] = alpha * np.dot(A, x[-(lenx -1)*n::n]) + beta * y[-(leny -1)*m::m] + * + * return y + * + * Authors: + * - Fetch.AI Limited (C++ version) + * - Univ. of Tennessee (Fortran version) + * - Univ. of California Berkeley (Fortran version) + * - Univ. of Colorado Denver (Fortran version) + * - NAG Ltd. 
(Fortran version) + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &x, int const &incx, + Type const &beta, Tensor &y, int const &incy) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/gemv_t.hpp b/libs/math/include/math/linalg/blas/gemv_t.hpp new file mode 100644 index 0000000000..45e636d145 --- /dev/null +++ b/libs/math/include/math/linalg/blas/gemv_t.hpp @@ -0,0 +1,70 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def gemv_t(alpha, A, x, n, beta, y, m): + * leny = A.T.shape[0] + * lenx = A.T.shape[1] + * if m >= 0 and n >= 0: + * y[::m] = alpha * np.dot(A.T, x[::n]) + beta * y[::m] + * elif m < 0 and n >= 0: + * y[-(leny -1)*m::m] = alpha * np.dot(A.T, x[::n]) + beta * y[-(leny -1)*m::m] + * elif m >= 0 and n < 0: + * y[::m] = alpha * np.dot(A.T, x[-(lenx -1)*n::n]) + beta * y[::m] + * else: + * y[-(leny -1)*m::m] = alpha * np.dot(A.T, x[-(lenx -1)*n::n]) + beta * y[-(leny -1)*m::m] + * + * return y + * + * Authors: + * - Fetch.AI Limited (C++ version) + * - Univ. of Tennessee (Fortran version) + * - Univ. of California Berkeley (Fortran version) + * - Univ. of Colorado Denver (Fortran version) + * - NAG Ltd. (Fortran version) + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(Type const &alpha, Tensor const &a, Tensor const &x, int const &incx, + Type const &beta, Tensor &y, int const &incy) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/scal_all.hpp b/libs/math/include/math/linalg/blas/scal_all.hpp new file mode 100644 index 0000000000..6158a6f95a --- /dev/null +++ b/libs/math/include/math/linalg/blas/scal_all.hpp @@ -0,0 +1,63 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def scal_all(n, alpha, x, m): + * if m == 1: + * x = alpha * x + * elif m > 0: + * x[:n*m:m] = alpha * x[:n*m:m] + * # else nothing to do for negative increases + * + * return x + * + * Authors: + * - Fetch.AI Limited (C++ version) + * - Univ. of Tennessee (Fortran version) + * - Univ. of California Berkeley (Fortran version) + * - Univ. of Colorado Denver (Fortran version) + * - NAG Ltd. (Fortran version) + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(int const &n, Type const &da, Tensor &dx, int const &incx) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/blas/swap_all.hpp b/libs/math/include/math/linalg/blas/swap_all.hpp new file mode 100644 index 0000000000..7b30d44e75 --- /dev/null +++ b/libs/math/include/math/linalg/blas/swap_all.hpp @@ -0,0 +1,72 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +/* The class defined in this file implements the equivalent of + * following Python code: + * + * import numpy as np + * import copy + * + * def swap_all(n, x, m, y, p): + * A = copy.copy(x) + * if p >= 0 and m >= 0: + * x[:(m * n):m] = y[:(p * n):p] + * y[:(p * n):p] = A[:(m * n):m] + * elif p >= 0 and m < 0: + * x[:(m * n + m):m] = y[:(p * n):p] + * y[:(p * n):p] = A[:(m * n + m):m] + * elif p < 0 and m >= 0: + * x[:(m * n):m] = y[:(p * n + p):p] + * y[:(p * n + p):p] = A[:(m * n):m] + * else: + * x[m *(-n + 1)::m] = y[p *(-n + 1)::p] + * y[p *(-n + 1)::p] = A[m *(-n + 1)::m] + * + * return x, y + * + * Authors: + * - Fetch.AI Limited (C++ version) + * - Univ. of Tennessee (Fortran version) + * - Univ. of California Berkeley (Fortran version) + * - Univ. of Colorado Denver (Fortran version) + * - NAG Ltd. 
(Fortran version) + */ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +namespace fetch { +namespace math { +namespace linalg { + +template +class Blas +{ +public: + using Type = S; + using VectorRegisterType = typename Tensor::VectorRegisterType; + + void operator()(int const &n, Tensor &dx, int const &incx, Tensor &dy, + int const &incy) const; +}; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/include/math/linalg/prototype.hpp b/libs/math/include/math/linalg/prototype.hpp new file mode 100644 index 0000000000..8fe50fd791 --- /dev/null +++ b/libs/math/include/math/linalg/prototype.hpp @@ -0,0 +1,176 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include +#include + +namespace fetch { +namespace math { +namespace linalg { + +/* + * This class uses the compiler to compile small 64-bit images + * of expressions such as C = A * B + C into a rom. This number + * can be used as a unique identifier for a given implementation + * of various mathematical functions. + * + * This prototype framework is constant at compile time. + */ +template +struct Prototype +{ + static_assert(P <= 64, "stack overflow for const expression"); + enum + { + OpSize = 4ull, + StackSize = uint64_t(P), + Stack = uint64_t(S1) + }; + + enum + { + RET = 0ull, + MULT = 1ull, + ADD = 2ull, + SUB = 3ull, + EQ = 4ull, + + CONCAT = 12ull, + TRANSPOSE = 13ull, + UPPER = 14ull, + LOWER = 15ull + }; + + template + using one_op_return_type = Prototype

; + + template + using two_op_return_type = + Prototype

; + + template + two_op_return_type constexpr operator+(O const & /*other*/) const + { + return two_op_return_type(); + } + + template + two_op_return_type constexpr operator*(O const & /*other*/) const + { + return two_op_return_type(); + } + + template + two_op_return_type constexpr operator<=(O const & /*other*/) const + { + return two_op_return_type(); + } + + template + two_op_return_type constexpr operator,(O const & /*other*/) const + { + return two_op_return_type(); + } + + template + two_op_return_type constexpr operator=(O const & /*other*/) const + { + return two_op_return_type(); + } +}; + +constexpr Prototype<4, 0> _A{}; //< Represents Matrix 1 +constexpr Prototype<4, 1> _B{}; //< Represents Matrix 2 +constexpr Prototype<4, 2> _C{}; //< Represents Matrix 3 +constexpr Prototype<4, 3> _alpha{}; //< Represents Scalar 1 +constexpr Prototype<4, 4> _beta{}; //< Represents Scalar 2 +constexpr Prototype<4, 5> _gamma{}; //< Represents Scalar 3 +constexpr Prototype<4, 6> _x{}; //< Represents vector 1 +constexpr Prototype<4, 7> _y{}; //< Represents vector 2 +constexpr Prototype<4, 8> _z{}; //< Represents vector 3 +constexpr Prototype<4, 9> _m{}; //< Represents integral 1 +constexpr Prototype<4, 10> _n{}; //< Represents integral 2 +constexpr Prototype<4, 11> _p{}; //< Represents integral 3 + +// Operatation representing the transposed of a matrix. +template +constexpr typename Prototype::template one_op_return_type::TRANSPOSE> T( + Prototype const &) +{ + return typename Prototype::template one_op_return_type::TRANSPOSE>(); +} + +// Operatation defining the property "upper triangular" for a matrix +template +constexpr typename Prototype::template one_op_return_type::UPPER> U( + Prototype const &) +{ + return typename Prototype::template one_op_return_type::UPPER>(); +} + +// Operatation defining the property "lower triangular" for a matrix +template +constexpr typename Prototype::template one_op_return_type::LOWER> L( + Prototype const &) +{ + return typename Prototype::template one_op_return_type::LOWER>(); +} + +// Wrapper function to prettify the representation inside template constants +template +constexpr uint64_t Computes(O const &) +{ + return O::Stack; +} + +template +constexpr uint64_t Computes(A const &a, A const &b) +{ + return Computes((a, b)); +} + +template +constexpr uint64_t Computes(A const &a, B const &b, O const &... objs) +{ + return Computes((a, b), objs...); +} + +// Wrapper function to prettify signature representation. +template +constexpr uint64_t Signature(O const &) +{ + return O::Stack; +} + +template +constexpr uint64_t Signature(A const &a, A const &b) +{ + return Signature((a, b)); +} + +template +constexpr uint64_t Signature(A const &a, B const &b, O const &... 
objs) +{ + return Signature((a, b), objs...); +} + +} // namespace linalg +} // namespace math +} // namespace fetch diff --git a/libs/math/include/math/matrix_operations.hpp b/libs/math/include/math/matrix_operations.hpp index a3cf310102..bc533fbb46 100644 --- a/libs/math/include/math/matrix_operations.hpp +++ b/libs/math/include/math/matrix_operations.hpp @@ -59,10 +59,25 @@ inline void Min(ArrayType const &array, typename ArrayType::Type &ret) template void Product(ArrayType const &obj1, typename ArrayType::Type &ret) { - ret = obj1.data().in_parallel().Reduce(memory::TrivialRange(0, obj1.size()), + // TODO(private issue 994): Create test for this function + if (obj1.padding() == 1) + { + ret = + obj1.data().in_parallel().Reduce(memory::TrivialRange(0, obj1.size()), [](typename ArrayType::VectorRegisterType const &a, typename ArrayType::VectorRegisterType const &b) -> typename ArrayType::VectorRegisterType { return a * b; }); + } + else + { + auto it1 = obj1.cbegin(); + ret = static_cast(1); + while (it1.is_valid()) + { + ret *= (*it1); + ++it1; + } + } } /** @@ -92,7 +107,7 @@ meta::IfIsMathArray BooleanMask(ArrayType const &input_array, ArrayType const &mask, ArrayType &ret) { ASSERT(input_array.size() == mask.size()); - ASSERT(ret.size() == typename ArrayType::SizeType(Sum(mask))); + ASSERT(ret.size() >= typename ArrayType::SizeType(Sum(mask))); auto it1 = input_array.cbegin(); auto it2 = mask.cbegin(); @@ -102,7 +117,7 @@ meta::IfIsMathArray BooleanMask(ArrayType const &input_array, { // TODO(private issue 193): implement boolean only array ASSERT((*it2 == 1) || (*it2 == 0)); - if (std::uint64_t(*it2)) + if (static_cast(*it2)) { *rit = *it1; ++counter; @@ -112,7 +127,7 @@ meta::IfIsMathArray BooleanMask(ArrayType const &input_array, ++rit; } - ret.LazyResize(counter); + ret.Resize({counter}); } template meta::IfIsMathArray BooleanMask(ArrayType &input_array, ArrayType const &mask) @@ -139,9 +154,10 @@ void Scatter(ArrayType &input_array, ArrayType const &updates, auto indices_it = indices.begin(); SizeType update_idx{0}; + while (indices_it != indices.end()) { - input_array[input_array.ComputeIndex(*indices_it)] = updates[update_idx]; + input_array.data()[input_array.ComputeIndex(*indices_it)] = updates[update_idx]; ++indices_it; ++update_idx; } @@ -529,13 +545,16 @@ meta::IfIsMathArray ArgMax(ArrayType const &array, ArrayType &r SizeType position = 0; auto it = array.begin(); Type value = numeric_lowest(); + + SizeType counter = SizeType{0}; while (it.is_valid()) { if (*it > value) { value = *it; - position = it.counter(); + position = counter; } + ++counter; ++it; } @@ -723,7 +742,7 @@ fetch::math::meta::IfIsMathArray DynamicStitch(ArrayType & ASSERT(data.size() <= input_array.size()); ASSERT(input_array.size() >= Max(indices)); ASSERT(Min(indices) >= 0); - input_array.LazyResize(indices.size()); + input_array.Resize({indices.size()}); auto ind_it = indices.cbegin(); auto data_it = data.cbegin(); diff --git a/libs/math/include/math/tensor.hpp b/libs/math/include/math/tensor.hpp index ff85b33349..f01650cdb0 100644 --- a/libs/math/include/math/tensor.hpp +++ b/libs/math/include/math/tensor.hpp @@ -36,6 +36,7 @@ #include "math/standard_functions/remainder.hpp" #include "math/tensor_broadcast.hpp" #include "math/tensor_iterator.hpp" +#include "math/tensor_slice_iterator.hpp" #include #include @@ -52,9 +53,9 @@ static void ArangeImplementation(DataType const &from, DataType const &to, DataT ArrayType &ret) { SizeType N = SizeType((to - from) / delta); - ret.LazyResize(N); - 
ret.SetPaddedZero(); - ret.FillArange(from, to); + ret.Resize({N}); + ret.FillArange(static_cast(from), + static_cast(to)); } } // namespace details @@ -67,14 +68,23 @@ class Tensor using VectorSliceType = typename ContainerType::VectorSliceType; using VectorRegisterType = typename ContainerType::VectorRegisterType; using VectorRegisterIteratorType = typename ContainerType::VectorRegisterIteratorType; - using SelfType = Tensor; - using IteratorType = TensorIterator; - using ConstIteratorType = ConstTensorIterator; - using SizeType = fetch::math::SizeType; - using SizeVector = fetch::math::SizeVector; + + using IteratorType = TensorIterator; + using ConstIteratorType = ConstTensorIterator; + + using SliceIteratorType = TensorSliceIterator; + using ConstSliceIteratorType = ConstTensorSliceIterator; + using SizeType = fetch::math::SizeType; + using SizeVector = fetch::math::SizeVector; static constexpr char const *LOGGING_NAME = "Tensor"; + enum + { + LOG_PADDING = 3, + PADDING = static_cast(1) << LOG_PADDING + }; + private: template class TensorSliceImplementation; @@ -84,7 +94,7 @@ class Tensor struct TensorSetter; public: - using ConstSliceType = TensorSliceImplementation; + using ConstSliceType = TensorSliceImplementation; class TensorSlice; @@ -97,9 +107,9 @@ class Tensor }; Tensor() - : data_() - , size_(0) - {} + { + Resize({0}); + } static Tensor FromString(byte_array::ConstByteArray const &c); explicit Tensor(SizeType const &n); @@ -123,12 +133,12 @@ class Tensor /// ASSIGNMENT AND ACCESSING /// //////////////////////////////// - void Copy(SelfType const &x); - SelfType Copy() const; + void Copy(Tensor const &x); + Tensor Copy() const; template void Assign(TensorSliceImplementation const &other); void Assign(TensorSlice const &other); - void Assign(SelfType const &other); + void Assign(Tensor const &other); template Type &At(Indices... indices); @@ -140,6 +150,9 @@ class Tensor template Type &operator()(Indices... indices); + template + void Set(Args... args); + Type operator()(SizeType const &index) const; template typename std::enable_if::value, Type>::type &operator[](S const &i); @@ -150,9 +163,6 @@ class Tensor Tensor &operator=(ConstSliceType const &slice); Tensor &operator=(TensorSlice const &slice); - template - void Set(Args... 
args); - void Fill(Type const &value, memory::Range const &range); void Fill(Type const &value, memory::TrivialRange const &range); void Fill(Type const &value); @@ -163,90 +173,142 @@ class Tensor ContainerType const &data() const; ContainerType & data(); - template - fetch::meta::IfIsInteger FillArange(DataType const &from, DataType const &to); + Tensor FillArange(Type const &from, Type const &to); - static SelfType UniformRandom(SizeType const &N); - static SelfType UniformRandomIntegers(SizeType const &N, int64_t const &min, int64_t const &max); - SelfType & FillUniformRandom(); - SelfType & FillUniformRandomIntegers(int64_t const &min, int64_t const &max); - static SelfType Zeroes(SizeVector const &shape); - static SelfType Ones(SizeVector const &shape); - SizeType ComputeIndex(SizeVector const &indices) const; + static Tensor UniformRandom(SizeType const &N); + static Tensor UniformRandomIntegers(SizeType const &N, int64_t const &min, int64_t const &max); + Tensor & FillUniformRandom(); + Tensor & FillUniformRandomIntegers(int64_t const &min, int64_t const &max); + static Tensor Zeroes(SizeVector const &shape); + static Tensor Ones(SizeVector const &shape); + SizeType ComputeIndex(SizeVector const &indices) const; //////////////////// /// SHAPE & SIZE /// //////////////////// static SizeType SizeFromShape(SizeVector const &shape); + static SizeType PaddedSizeFromShape(SizeVector const &shape); + + void Flatten(); + Tensor Transpose() const; // TODO (private 867) + Tensor Transpose(SizeVector &new_axes) const; + Tensor &Squeeze(); + Tensor &Unsqueeze(); - void Flatten(); - SelfType Transpose() const; // TODO (private 867) - SelfType Transpose(SizeVector &new_axes) const; - SelfType & Squeeze(); - SelfType & Unsqueeze(); - void ResizeFromShape(SizeVector const &shape); - void LazyReshape(SizeVector const &shape); - bool CanReshape(SizeVector const &shape); - void Reshape(SizeVector const &shape); + ///////////////////////// + /// memory management /// + ///////////////////////// + + /** + * Resizes and reshapes tensor according to newly specified shape + * @param shape the new shape to set + * @param copy whether to copy old data to new container or not + */ + bool Resize(SizeVector const &shape, bool copy = false); + + /** + * Resizes and reshapes tensor according to newly specified shape + * @param shape the new shape to set + */ + bool Reshape(SizeVector const &shape) + { + return Resize(shape, true); + } + + /** + * Resizes and reshapes tensor according to newly specified shape + * @param shape the new shape to set + */ + bool ResizeFromShape(SizeVector const &shape) + { + // TODO(private issue 995): Get rid of this function + return Resize(shape, true); + } + + SizeVector const &stride() const; SizeVector const &shape() const; SizeType const & shape(SizeType const &n) const; SizeType size() const; + /** + * Sets a single value in the array using an n-dimensional index + * @param indices index position in array + * @param val value to write + */ + // TODO(private issue 123) + template + fetch::meta::IfIsUnsignedInteger Set(std::vector const &indices, Type const &val) + { + assert(indices.size() == shape_.size()); + this->operator[](ComputeColIndex(indices)) = val; + } + + /** + * Gets a value from the array by N-dim index + * @param indices index to access + */ + template + fetch::meta::IfIsUnsignedInteger Get(std::vector const &indices) const + { + assert(indices.size() == shape_.size()); + return this->operator[](ComputeColIndex(indices)); + } + /////////////////////// /// MATH 
OPERATIONS /// /////////////////////// - SelfType InlineAdd(Tensor const &other); - SelfType InlineAdd(Type const &scalar); - SelfType InlineSubtract(Tensor const &other); - SelfType InlineSubtract(Type const &scalar); - SelfType InlineReverseSubtract(Tensor const &other); - SelfType InlineReverseSubtract(Type const &scalar); - SelfType InlineMultiply(Tensor const &other); - SelfType InlineMultiply(Type const &scalar); - SelfType InlineDivide(Tensor const &other); - SelfType InlineDivide(Type const &scalar); - SelfType InlineReverseDivide(Tensor const &other); - SelfType InlineReverseDivide(Type const &scalar); + Tensor InlineAdd(Tensor const &other); + Tensor InlineAdd(Type const &scalar); + Tensor InlineSubtract(Tensor const &other); + Tensor InlineSubtract(Type const &scalar); + Tensor InlineReverseSubtract(Tensor const &other); + Tensor InlineReverseSubtract(Type const &scalar); + Tensor InlineMultiply(Tensor const &other); + Tensor InlineMultiply(Type const &scalar); + Tensor InlineDivide(Tensor const &other); + Tensor InlineDivide(Type const &scalar); + Tensor InlineReverseDivide(Tensor const &other); + Tensor InlineReverseDivide(Type const &scalar); template - SelfType operator+(OtherType const &other); + Tensor operator+(OtherType const &other); template - SelfType operator+=(OtherType const &other); + Tensor operator+=(OtherType const &other); template - SelfType operator-(OtherType const &other); + Tensor operator-(OtherType const &other); template - SelfType operator-=(OtherType const &other); + Tensor operator-=(OtherType const &other); template - SelfType operator*(OtherType const &other); + Tensor operator*(OtherType const &other); template - SelfType operator*=(OtherType const &other); + Tensor operator*=(OtherType const &other); template - SelfType operator/(OtherType const &other); + Tensor operator/(OtherType const &other); template - SelfType operator/=(OtherType const &other); + Tensor operator/=(OtherType const &other); - SelfType &DotTranspose(SelfType const &A, SelfType const &B, Type alpha = 1.0, Type beta = 0.0); - SelfType &TransposeDot(SelfType const &A, SelfType const &B, Type alpha = 1.0, Type beta = 0.0); - Type Sum() const; + Tensor &DotTranspose(Tensor const &A, Tensor const &B, Type alpha = 1.0, Type beta = 0.0); + Tensor &TransposeDot(Tensor const &A, Tensor const &B, Type alpha = 1.0, Type beta = 0.0); + Type Sum() const; - void Exp(SelfType const &x); - void ApproxSoftMax(SelfType const &x); + void Exp(Tensor const &x); + void ApproxSoftMax(Tensor const &x); Type L2Norm() const; Type L2Loss() const; - Type PeakToPeak() const; - void Fmod(SelfType const &x); - void Remainder(SelfType const &x); - SelfType Softmax(SelfType const &x); + Type PeakToPeak() const; + void Fmod(Tensor const &x); + void Remainder(Tensor const &x); + Tensor Softmax(Tensor const &x); ///////////// /// Order /// @@ -277,42 +339,28 @@ class Tensor SizeType Find(Type val) const; template - static SelfType Stack(std::vector const &tensors); - static SelfType Concat(std::vector const &tensors, SizeType axis); - static std::vector Split(SelfType const &tensor, SizeVector const &concat_points, - SizeType const axis); + static Tensor Stack(std::vector const &tensors); + static Tensor Concat(std::vector const &tensors, SizeType axis); + static std::vector Split(Tensor const &tensor, SizeVector const &concat_points, + SizeType const axis); void Sort(); void Sort(memory::TrivialRange const &range); template - static fetch::meta::IfIsUnsignedInteger Arange(Unsigned const &from, - Unsigned 
const &to, - Unsigned const &delta); + static fetch::meta::IfIsUnsignedInteger Arange(Unsigned const &from, + Unsigned const &to, + Unsigned const &delta); template - static fetch::meta::IfIsSignedInteger Arange(Signed const &from, - Signed const &to, - Signed const &delta); - - ///////////////////////// - /// memory management /// - ///////////////////////// - - bool LazyReserve(SizeType const &n); - void Reserve(SizeType const &n); - - template - typename std::enable_if::value, void>::type LazyResize(S const &n); - - template - typename std::enable_if::value, void>::type Resize(S const &n); + static fetch::meta::IfIsSignedInteger Arange(Signed const &from, Signed const &to, + Signed const &delta); //////////////////////////// /// COMPARISON OPERATORS /// //////////////////////////// - bool AllClose(SelfType const &o, Type const &relative_tolerance = Type(1e-5), + bool AllClose(Tensor const &o, Type const &relative_tolerance = Type(1e-5), Type const &absolute_tolerance = Type(1e-8)) const; bool operator==(Tensor const &other) const; bool operator!=(Tensor const &other) const; @@ -332,8 +380,8 @@ class Tensor using TensorSliceImplementation::begin; using TensorSliceImplementation::end; - IteratorType begin(); - IteratorType end(); + SliceIteratorType begin(); + SliceIteratorType end(); template void Assign(TensorSliceImplementation const &other); void Assign(Tensor const &other); @@ -345,29 +393,28 @@ class Tensor //////////////////////////////// template - friend void Serialize(S &serializer, SelfType const &t) + friend void Serialize(S &serializer, Tensor const &t) { serializer << t.size_; serializer << t.shape_; // TODO (private 870) - for (std::size_t i = 0; i < t.size(); ++i) + for (std::size_t i = 0; i < t.data().padded_size(); ++i) { serializer << t.data()[i]; } } template - friend void Deserialize(S &serializer, SelfType &t) + friend void Deserialize(S &serializer, Tensor &t) { SizeType size; SizeVector shape; serializer >> size; serializer >> shape; - t.Resize(size); t.Reshape(shape); - for (std::size_t i = 0; i < t.size(); ++i) + for (std::size_t i = 0; i < t.data().padded_size(); ++i) { serializer >> t.data()[i]; } @@ -377,7 +424,7 @@ class Tensor template > void As(Tensor &ret) const { - ret.LazyResize(size_); + ret.Resize({size_}); auto this_it = cbegin(); auto ret_it = begin(); @@ -389,13 +436,73 @@ class Tensor } } + ///////////////////////////// + /// Convenience functions /// + ///////////////////////////// + + SizeType height() const + { + return shape_[0]; + } + + SizeType width() const + { + return (shape_.size() > 1 ? shape_[1] : 1); + } + + SizeType depth() const + { + return (shape_.size() > 2 ? shape_[2] : 1); + } + + SizeType padded_size() const + { + return data_.padded_size(); + } + + SizeType padded_height() const + { + return padded_height_; + } + + constexpr SizeType padding() + { + return PADDING; + } + + /* @breif returns the smallest number which is a multiple of PADDING and greater than or equal to + a desired size. + * @param size is the size to be padded. 
+ & @returns the padded size + */ + static SizeType PadValue(SizeType size) + { + SizeType ret = SizeType(size / PADDING) * PADDING; + if (ret < size) + { + ret += PADDING; + } + return ret; + } + + bool IsVector() const + { + return shape_.size() == 1; + } + + bool IsMatrix() const + { + return shape_.size() == 2; + } + private: - ContainerType data_; - SizeType size_ = 0; - SizeVector shape_; - SizeVector stride_; + ContainerType data_{}; + SizeType size_{0}; + SizeVector shape_{}; + SizeVector stride_{}; + SizeType padded_height_{}; - MAJOR_ORDER major_order_ = COLUMN; + MAJOR_ORDER major_order_{COLUMN}; /** * Gets a value from the array by N-dim index @@ -419,9 +526,10 @@ class Tensor { stride_.resize(shape_.size()); SizeType n_dims = shape_.size(); - SizeType base = 1; + SizeType base = padded_height_; - for (SizeType i = 0; i < n_dims; ++i) + stride_[0] = 1; + for (SizeType i = 1; i < n_dims; ++i) { stride_[i] = base; base *= shape_[i]; @@ -431,6 +539,7 @@ class Tensor } // TODO(private 871): replace with strides + /* SizeType ComputeRowIndex(SizeVector const &indices) const { SizeType index = 0; @@ -445,15 +554,18 @@ class Tensor } return index; } + */ - SizeType ComputeColIndex(SizeVector const &indices) const + template + fetch::meta::IfIsUnsignedInteger ComputeColIndex(std::vector const &indices) const { - SizeType index = 0; + assert(indices.size() > 0); + SizeType index = indices[0]; SizeType n_dims = indices.size(); - SizeType base = 1; + SizeType base = padded_height_; // loop through all dimensions - for (SizeType i = 0; i < n_dims; ++i) + for (SizeType i = 1; i < n_dims; ++i) { index += indices[i] * base; base *= shape_[i]; @@ -467,7 +579,7 @@ class Tensor */ void FlipMajorOrder(MAJOR_ORDER major_order) { - SelfType new_array{this->shape()}; + Tensor new_array{this->shape()}; SizeVector stride; SizeVector index; @@ -481,8 +593,8 @@ class Tensor index.push_back(0); } - SizeType total_size = SelfType::SizeFromShape(new_array.shape()); - IteratorType it_this(*this); + SizeType total_size = Tensor::SizeFromShape(new_array.shape()); + SliceIteratorType it_this(*this); SizeType cur_dim; SizeType pos; @@ -544,14 +656,14 @@ class Tensor } } - void TransposeImplementation(SizeVector &new_axes, SelfType &ret) const + void TransposeImplementation(SizeVector &new_axes, Tensor &ret) const { - auto it = this->begin(); - auto eit = this->end(); - auto ret_it = ret.end(); + ConstSliceIteratorType it(*this); + SliceIteratorType ret_it(ret); + ret_it.Transpose(new_axes); - while (it != eit) + while (it.is_valid()) { *ret_it = *it; ++it; @@ -561,7 +673,7 @@ class Tensor /** * The TensorSlice is a convenience method for efficiently manipulating - * SubTensors (e.g. such as a 1D Slice). It is built on top of TensorIterator + * SubTensors (e.g. such as a 1D Slice). 
It is built on top of TensorSliceIterator * @tparam STensor */ template @@ -577,12 +689,11 @@ class Tensor , axis_{std::move(axis)} {} - SelfType Copy() const; - ConstIteratorType begin() const; - ConstIteratorType end() const; - STensor & Tensor(); - SizeType size() const; - SizeVector shape() const; + Tensor Copy() const; + ConstSliceIteratorType begin() const; + ConstSliceIteratorType end() const; + SizeType size() const; + SizeVector shape() const; protected: STensor & tensor_; @@ -591,6 +702,10 @@ class Tensor }; }; +///////////////////////////////////////// +/// Tensor methods: memory management /// +///////////////////////////////////////// + /** * This method allows Tensor instantiation from a string which is convenient for quickly writing * tests. @@ -646,15 +761,14 @@ Tensor Tensor::FromString(byte_array::ConstByteArray const &c) if (!failed) { - ret.ResizeFromShape({n, m}); - ret.SetAllZero(); + ret.Resize({n, m}); SizeType k = 0; for (SizeType i = 0; i < n; ++i) { for (SizeType j = 0; j < m; ++j) { - ret.Set(i, j, elems[k++]); + ret(i, j) = elems[k++]; } } } @@ -672,16 +786,8 @@ Tensor Tensor::FromString(byte_array::ConstByteArray const &c) */ template Tensor::Tensor(SizeType const &n) - : data_(n) - , size_(n) { - assert(this->size() == n); - this->LazyReshape({n}); - Type zero{0}; - for (SizeType idx = 0; idx < this->size(); ++idx) - { - operator[](idx) = zero; - } + this->Resize({n}); } /** @@ -692,8 +798,7 @@ Tensor::Tensor(SizeType const &n) template Tensor::Tensor(SizeVector const &dims) { - ResizeFromShape(dims); - this->SetAllZero(); + Resize(dims); } ///////////////////////////////// @@ -747,11 +852,13 @@ typename Tensor::ConstIteratorType Tensor::cend() const * **/ template -void Tensor::Copy(SelfType const &x) +void Tensor::Copy(Tensor const &x) { - this->data_ = x.data_.Copy(); - this->size_ = x.size_; - this->LazyReshape(x.shape()); + this->data_ = x.data_.Copy(); + this->size_ = x.size_; + this->padded_height_ = x.padded_height_; + this->shape_ = x.shape(); + this->stride_ = x.stride(); } /** @@ -763,10 +870,8 @@ void Tensor::Copy(SelfType const &x) template Tensor Tensor::Copy() const { - SelfType copy; - copy.data_ = this->data_.Copy(); - copy.size_ = this->size_; - copy.LazyReshape(this->shape()); + Tensor copy; + copy.Copy(*this); return copy; } @@ -816,7 +921,7 @@ void Tensor::Assign(TensorSlice const &other) * @param other Another Tensor to assign data from into this */ template -void Tensor::Assign(SelfType const &other) +void Tensor::Assign(Tensor const &other) { auto it1 = begin(); auto it2 = other.begin(); @@ -921,9 +1026,19 @@ typename Tensor::Type Tensor::operator()(SizeType const &index) cons template template typename std::enable_if::value, typename Tensor::Type>::type - &Tensor::operator[](S const &i) + &Tensor::operator[](S const &n) { - return data_[i]; + assert(static_cast(n) < size()); + if (shape_.size() == 1) + { + return data_[n]; + } + + SizeType j = static_cast(n) / height(); + SizeType i = static_cast(n) - j * height(); + + assert(i + padded_height_ * j < padded_size()); + return data_[i + padded_height_ * j]; } /** @@ -991,6 +1106,37 @@ Tensor &Tensor::operator=(TensorSlice const &slice) return *this; } +template +bool Tensor::Resize(SizeVector const &shape, bool copy) +{ + Tensor old_tensor = *this; + + SizeType new_size = Tensor::PaddedSizeFromShape(shape); + data_ = ContainerType(new_size); + + data_.SetAllZero(); + shape_ = shape; + size_ = Tensor::SizeFromShape(shape); // Note: differs from new_size + padded_height_ = 
PadValue(shape[0]); + UpdateStrides(); + + // Effectively a reshape + if (copy && (size_ == old_tensor.size())) + { + auto it = begin(); + auto oit = old_tensor.begin(); + assert(it.size() == oit.size()); + while (it.is_valid()) + { + *it = *oit; + ++it; + ++oit; + } + return true; + } + return false; +} + /** * Set operator takes variable number of indices followed by one value. * This is made possible using the TensorSetter class to manage @@ -1005,7 +1151,11 @@ template template void Tensor::Set(Args... args) { - ASSERT(sizeof...(args) == stride_.size() + 1); // Plus one as last arg is value + assert(sizeof...(args) == stride_.size() + 1); // Plus one as last arg is value + if (sizeof...(args) != (stride_.size() + 1)) + { + throw std::runtime_error("too many or not enough indices given to Tensor::Set"); + } uint64_t index = TensorSetter<0, Args...>::IndexOf(stride_, shape_, std::forward(args)...); Type value = TensorSetter<0, Args...>::ValueOf(std::forward(args)...); @@ -1129,18 +1279,16 @@ typename Tensor::ContainerType &Tensor::data() * @return a reference to this */ template -template -fetch::meta::IfIsInteger> Tensor::FillArange(DataType const &from, - DataType const &to) +Tensor Tensor::FillArange(Type const &from, Type const &to) { - SelfType ret; + Tensor ret; SizeType N = this->size(); Type d = static_cast(from); Type delta = static_cast(to - from) / static_cast(N); for (SizeType i = 0; i < N; ++i) { - this->data()[i] = Type(d); + this->operator[](i) = Type(d); d += delta; } return *this; @@ -1156,9 +1304,8 @@ fetch::meta::IfIsInteger> Tensor::FillArange(DataTy template Tensor Tensor::UniformRandom(SizeType const &N) { - SelfType ret; - ret.LazyResize(N); - ret.SetPaddedZero(); + Tensor ret; + ret.Resize({N}); ret.FillUniformRandom(); return ret; @@ -1177,9 +1324,8 @@ template Tensor Tensor::UniformRandomIntegers(SizeType const &N, int64_t const &min, int64_t const &max) { - SelfType ret; - ret.LazyResize(N); - ret.SetPaddedZero(); + Tensor ret; + ret.Resize({N}); ret.FillUniformRandomIntegers(min, max); return ret; @@ -1196,7 +1342,7 @@ Tensor &Tensor::FillUniformRandom() { for (SizeType i = 0; i < this->size(); ++i) { - this->data()[i] = Type(random::Random::generator.AsDouble()); + this->operator[](i) = Type(random::Random::generator.AsDouble()); } return *this; } @@ -1218,7 +1364,7 @@ Tensor &Tensor::FillUniformRandomIntegers(int64_t const &min, int64_ for (SizeType i = 0; i < this->size(); ++i) { - this->data()[i] = Type(int64_t(random::Random::generator() % diff) + min); + this->operator[](i) = Type(int64_t(random::Random::generator() % diff) + min); } return *this; @@ -1233,10 +1379,10 @@ Tensor &Tensor::FillUniformRandomIntegers(int64_t const &min, int64_ template Tensor Tensor::Zeroes(SizeVector const &shape) { - SizeType n = SizeFromShape(shape); - SelfType output{n}; + SizeType n = PaddedSizeFromShape(shape); + Tensor output{n}; output.SetAllZero(); - output.LazyReshape(shape); + output.Reshape(shape); return output; } @@ -1249,11 +1395,9 @@ Tensor Tensor::Zeroes(SizeVector const &shape) template Tensor Tensor::Ones(SizeVector const &shape) { - SizeType n = - std::accumulate(std::begin(shape), std::end(shape), SizeType(1), std::multiplies()); - SelfType output{n}; + + Tensor output{shape}; output.SetAllOne(); - output.LazyReshape(shape); return output; } @@ -1265,20 +1409,7 @@ Tensor Tensor::Ones(SizeVector const &shape) template SizeType Tensor::ComputeIndex(SizeVector const &indices) const { - ASSERT(indices.size() == shape_.size()); - - SizeType index{0}; - auto 
indices_it = indices.begin(); - auto stride_it = stride_.begin(); - - while (indices_it != indices.end()) - { - index += (*indices_it) * (*stride_it); - ++indices_it; - ++stride_it; - } - - return index; + return ComputeColIndex(indices); } //////////////////////////////////// @@ -1302,6 +1433,17 @@ typename Tensor::SizeType Tensor::SizeFromShape(SizeVector const &sh return std::accumulate(std::begin(shape), std::end(shape), SizeType(1), std::multiplies<>()); } +template +typename Tensor::SizeType Tensor::PaddedSizeFromShape(SizeVector const &shape) +{ + if (shape.size() == 0) + { + return SizeType{0}; + } + return PadValue(shape[0]) * + std::accumulate(std::begin(shape) + 1, std::end(shape), SizeType(1), std::multiplies<>()); +} + /** * Flattens the array to 1 dimension * @tparam T @@ -1310,9 +1452,7 @@ typename Tensor::SizeType Tensor::SizeFromShape(SizeVector const &sh template void Tensor::Flatten() { - shape_.clear(); - shape_.push_back(size_); - UpdateStrides(); + Reshape({size()}); } /** @@ -1322,13 +1462,13 @@ void Tensor::Flatten() * @return Returns new transposed Tensor */ template -typename Tensor::SelfType Tensor::Transpose() const +Tensor Tensor::Transpose() const { // TODO (private 867) - ASSERT(shape_.size() == 2); SizeVector new_axes{1, 0}; - SelfType ret({shape().at(1), shape().at(0)}); + Tensor ret({shape().at(1), shape().at(0)}); TransposeImplementation(new_axes, ret); return ret; } @@ -1341,12 +1481,12 @@ typename Tensor::SelfType Tensor::Transpose() const * @return New tensor transposed as determined by new_axes */ template -typename Tensor::SelfType Tensor::Transpose(SizeVector &new_axes) const +Tensor Tensor::Transpose(SizeVector &new_axes) const { ASSERT(shape_.size() > 1); ASSERT(shape_.size() == new_axes.size()); - SelfType ret(shape()); + Tensor ret(shape()); TransposeImplementation(new_axes, ret); return ret; } @@ -1358,11 +1498,13 @@ typename Tensor::SelfType Tensor::Transpose(SizeVector &new_axes) co * @return This tensor after squeezing */ template -typename Tensor::SelfType &Tensor::Squeeze() +Tensor &Tensor::Squeeze() { - ASSERT(shape_.at(0) == 1); - shape_.erase(shape_.begin()); - UpdateStrides(); + // TODO(private issue 998): Make last dimension for efficiency + auto shape = shape_; + shape.erase(shape.begin()); + Reshape(shape); + return *this; } @@ -1373,89 +1515,27 @@ typename Tensor::SelfType &Tensor::Squeeze() * @return This tensor after unsqueeze */ template -typename Tensor::SelfType &Tensor::Unsqueeze() +Tensor &Tensor::Unsqueeze() { - shape_.insert(shape_.begin(), 1); - UpdateStrides(); - return *this; -} + auto shape = shape_; // TODO: Make last dimension for efficiency + shape.insert(shape.begin(), 1); -/** - * Resizes and reshapes tensor according to newly specified shape - * @tparam T Type - * @tparam C Container - * @param shape the new shape to set - */ -template -void Tensor::ResizeFromShape(SizeVector const &shape) -{ - Resize(SelfType::SizeFromShape(shape)); Reshape(shape); -} -/** - * Directly copies shape variable without checking anything - * @tparam T Type - * @tparam C Container - * @param shape the new shape to set - */ -template -void Tensor::LazyReshape(SizeVector const &shape) -{ - shape_ = shape; - UpdateStrides(); -} - -/** - * Tests if it is possible to reshape the array to a newly proposed shape - * @tparam T Type - * @tparam C Container - * @param shape shape specified for the new array as a vector ot size_t. - * @return success is a bool indicating where the proposed shape is acceptable. 
- */ -template -bool Tensor::CanReshape(SizeVector const &shape) -{ - if ((shape.size() == 0) && (size() == 0)) - { - return true; - } - else - { - SizeType total = 1; - for (auto const &s : shape) - { - total *= s; - } - bool success = false; - (total == this->size()) ? success = true : success = false; - return success; - } - return false; + return *this; } /** */ /** - * Reshapes after checking the total size is the same - * @tparam T Type - * @tparam C Container - * @param shape specified for the new array as a vector of size type. + * returns the tensor's current shape + * @return the stride of the tensor as a vector of size_type */ template -void Tensor::Reshape(SizeVector const &shape) +typename Tensor::SizeVector const &Tensor::stride() const { - ASSERT(CanReshape(shape)); - - shape_.clear(); - shape_.reserve(shape.size()); - for (auto const &s : shape) - { - shape_.push_back(s); - } - UpdateStrides(); - size_ = SelfType::SizeFromShape(shape); + return stride_; } /** @@ -1503,7 +1583,7 @@ typename Tensor::SizeType Tensor::size() const * @return */ template -typename Tensor::SelfType Tensor::InlineAdd(Tensor const &other) +Tensor Tensor::InlineAdd(Tensor const &other) { if (other.shape() == shape_) { @@ -1511,8 +1591,8 @@ typename Tensor::SelfType Tensor::InlineAdd(Tensor const &other) } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x + y; }, self_copy, other_copy, *this))) { throw std::runtime_error("arrays not broadcastable for InlineAdd!"); @@ -1527,7 +1607,7 @@ typename Tensor::SelfType Tensor::InlineAdd(Tensor const &other) * @return new array output */ template -typename Tensor::SelfType Tensor::InlineAdd(Type const &scalar) +Tensor Tensor::InlineAdd(Type const &scalar) { Add(*this, scalar, *this); return *this; @@ -1539,7 +1619,7 @@ typename Tensor::SelfType Tensor::InlineAdd(Type const &scalar) * @return */ template -typename Tensor::SelfType Tensor::InlineSubtract(Tensor const &other) +Tensor Tensor::InlineSubtract(Tensor const &other) { if (other.shape() == shape_) { @@ -1547,8 +1627,8 @@ typename Tensor::SelfType Tensor::InlineSubtract(Tensor const &other } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x - y; }, self_copy, other_copy, *this))) { throw std::runtime_error("arrays not broadcastable for InlineSubtract!"); @@ -1563,7 +1643,7 @@ typename Tensor::SelfType Tensor::InlineSubtract(Tensor const &other * @return new array output */ template -typename Tensor::SelfType Tensor::InlineSubtract(Type const &scalar) +Tensor Tensor::InlineSubtract(Type const &scalar) { Subtract(*this, scalar, *this); return *this; @@ -1575,7 +1655,7 @@ typename Tensor::SelfType Tensor::InlineSubtract(Type const &scalar) * @return */ template -typename Tensor::SelfType Tensor::InlineReverseSubtract(Tensor const &other) +Tensor Tensor::InlineReverseSubtract(Tensor const &other) { if (other.shape() == shape_) { @@ -1583,8 +1663,8 @@ typename Tensor::SelfType Tensor::InlineReverseSubtract(Tensor const } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x - y; }, other_copy, self_copy, *this))) { throw std::runtime_error("arrays not broadcastable for 
InlineReverseSubtract!"); @@ -1599,7 +1679,7 @@ typename Tensor::SelfType Tensor::InlineReverseSubtract(Tensor const * @return new array output */ template -typename Tensor::SelfType Tensor::InlineReverseSubtract(Type const &scalar) +Tensor Tensor::InlineReverseSubtract(Type const &scalar) { Subtract(scalar, *this, *this); return *this; @@ -1613,7 +1693,7 @@ typename Tensor::SelfType Tensor::InlineReverseSubtract(Type const & * @return returns this tensor after multiplication */ template -typename Tensor::SelfType Tensor::InlineMultiply(Tensor const &other) +Tensor Tensor::InlineMultiply(Tensor const &other) { if (other.shape() == shape_) { @@ -1621,8 +1701,8 @@ typename Tensor::SelfType Tensor::InlineMultiply(Tensor const &other } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x * y; }, other_copy, self_copy, *this))) { throw std::runtime_error("arrays not broadcastable for InlineMultiply!"); @@ -1637,7 +1717,7 @@ typename Tensor::SelfType Tensor::InlineMultiply(Tensor const &other * @return new array output */ template -typename Tensor::SelfType Tensor::InlineMultiply(Type const &scalar) +Tensor Tensor::InlineMultiply(Type const &scalar) { Multiply(*this, scalar, *this); return *this; @@ -1649,7 +1729,7 @@ typename Tensor::SelfType Tensor::InlineMultiply(Type const &scalar) * @return */ template -typename Tensor::SelfType Tensor::InlineDivide(Tensor const &other) +Tensor Tensor::InlineDivide(Tensor const &other) { if (other.shape() == shape_) { @@ -1657,8 +1737,8 @@ typename Tensor::SelfType Tensor::InlineDivide(Tensor const &other) } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x / y; }, self_copy, other_copy, *this))) { throw std::runtime_error("arrays not broadcastable for InlineDivide!"); @@ -1673,7 +1753,7 @@ typename Tensor::SelfType Tensor::InlineDivide(Tensor const &other) * @return new array output */ template -typename Tensor::SelfType Tensor::InlineDivide(Type const &scalar) +Tensor Tensor::InlineDivide(Type const &scalar) { Divide(*this, scalar, *this); return *this; @@ -1685,7 +1765,7 @@ typename Tensor::SelfType Tensor::InlineDivide(Type const &scalar) * @return this tensor after inline reverse divide */ template -typename Tensor::SelfType Tensor::InlineReverseDivide(Tensor const &other) +Tensor Tensor::InlineReverseDivide(Tensor const &other) { if (other.shape() == shape_) { @@ -1693,8 +1773,8 @@ typename Tensor::SelfType Tensor::InlineReverseDivide(Tensor const & } else { - SelfType self_copy = this->Copy(); - SelfType other_copy = other.Copy(); + Tensor self_copy = this->Copy(); + Tensor other_copy = other.Copy(); if (!(Broadcast([](T x, T y) { return x / y; }, other_copy, self_copy, *this))) { throw std::runtime_error("arrays not broadcastable for InlineReverseDivide!"); @@ -1709,7 +1789,7 @@ typename Tensor::SelfType Tensor::InlineReverseDivide(Tensor const & * @return new array output */ template -typename Tensor::SelfType Tensor::InlineReverseDivide(Type const &scalar) +Tensor Tensor::InlineReverseDivide(Type const &scalar) { Divide(scalar, *this, *this); return *this; @@ -1723,14 +1803,14 @@ typename Tensor::SelfType Tensor::InlineReverseDivide(Type const &sc */ template template -typename Tensor::SelfType Tensor::operator+(OtherType const &other) +Tensor 
Tensor::operator+(OtherType const &other) { return InlineAdd(other); } template template -typename Tensor::SelfType Tensor::operator+=(OtherType const &other) +Tensor Tensor::operator+=(OtherType const &other) { return InlineAdd(other); } @@ -1743,14 +1823,14 @@ typename Tensor::SelfType Tensor::operator+=(OtherType const &other) */ template template -typename Tensor::SelfType Tensor::operator-(OtherType const &other) +Tensor Tensor::operator-(OtherType const &other) { return InlineSubtract(other); } template template -typename Tensor::SelfType Tensor::operator-=(OtherType const &other) +Tensor Tensor::operator-=(OtherType const &other) { return InlineSubtract(other); } @@ -1763,28 +1843,28 @@ typename Tensor::SelfType Tensor::operator-=(OtherType const &other) */ template template -typename Tensor::SelfType Tensor::operator*(OtherType const &other) +Tensor Tensor::operator*(OtherType const &other) { return InlineMultiply(other); } template template -typename Tensor::SelfType Tensor::operator*=(OtherType const &other) +Tensor Tensor::operator*=(OtherType const &other) { return InlineMultiply(other); } template template -typename Tensor::SelfType Tensor::operator/(OtherType const &other) +Tensor Tensor::operator/(OtherType const &other) { return InlineDivide(other); } template template -typename Tensor::SelfType Tensor::operator/=(OtherType const &other) +Tensor Tensor::operator/=(OtherType const &other) { return InlineDivide(other); } @@ -1796,8 +1876,7 @@ typename Tensor::SelfType Tensor::operator/=(OtherType const &other) * @return */ template -typename Tensor::SelfType &Tensor::DotTranspose(SelfType const &A, SelfType const &B, - Type alpha, Type beta) +Tensor &Tensor::DotTranspose(Tensor const &A, Tensor const &B, Type alpha, Type beta) { ASSERT(this->shape().size() == 2); fetch::math::DotTranspose(A, B, *this, alpha, beta); @@ -1812,8 +1891,7 @@ typename Tensor::SelfType &Tensor::DotTranspose(SelfType const &A, S * @return */ template -typename Tensor::SelfType &Tensor::TransposeDot(SelfType const &A, SelfType const &B, - Type alpha, Type beta) +Tensor &Tensor::TransposeDot(Tensor const &A, Tensor const &B, Type alpha, Type beta) { assert(this->shape().size() == 2); fetch::math::TransposeDot(A, B, *this, alpha, beta); @@ -1840,7 +1918,7 @@ typename Tensor::Type Tensor::Sum() const * Calculate the Exponentials of tensor x and stores in this */ template -void Tensor::Exp(SelfType const &x) +void Tensor::Exp(Tensor const &x) { Exp(x, *this); } @@ -1849,7 +1927,7 @@ void Tensor::Exp(SelfType const &x) * Calculate the ApproxSoftMax of X and store in this */ template -void Tensor::ApproxSoftMax(SelfType const &x) +void Tensor::ApproxSoftMax(Tensor const &x) { ApproxSoftMax(x, *this); } @@ -1893,9 +1971,10 @@ typename Tensor::Type Tensor::PeakToPeak() const * @param x */ template -void Tensor::Fmod(SelfType const &x) +void Tensor::Fmod(Tensor const &x) { - LazyResize(x.size()); + Resize({x.size()}); + // TODO: Should use iterators fetch::math::Fmod(data_, x.data(), data_); } @@ -1905,9 +1984,9 @@ void Tensor::Fmod(SelfType const &x) * @param x */ template -void Tensor::Remainder(SelfType const &x) +void Tensor::Remainder(Tensor const &x) { - LazyResize(x.size()); + Resize({x.size()}); fetch::math::Remainder(data_, x.data(), data_); } @@ -1917,9 +1996,9 @@ void Tensor::Remainder(SelfType const &x) * @return */ template -Tensor Tensor::Softmax(SelfType const &x) +Tensor Tensor::Softmax(Tensor const &x) { - LazyResize(x.size()); + Resize({x.size()}); ASSERT(x.size() == this->size()); 
fetch::math::Softmax(x, *this); @@ -1987,15 +2066,13 @@ template void Tensor::CopyFromNumpy(T *ptr, SizeVector &shape, SizeVector & /*stride*/, SizeVector & /*index*/) { - SizeType total_size = SelfType::SizeFromShape(shape); + SizeType total_size = Tensor::SizeFromShape(shape); // get pointer to the data - Resize(total_size); - assert(this->CanReshape(shape)); this->Reshape(shape); // re-allocate all the data - TensorIterator it(*this); + TensorSliceIterator it(*this); // copy all the data initially for (SizeType i = 0; i < total_size; ++i) @@ -2013,7 +2090,7 @@ void Tensor::CopyToNumpy(T *ptr, SizeVector &shape, SizeVector &stride, Si { // copy the data - TensorIterator it(*this); + TensorSliceIterator it(*this); for (SizeType j = 0; j < this->size(); ++j) { @@ -2167,7 +2244,7 @@ SizeType Tensor::Find(Type val) const */ template template -typename Tensor::SelfType Tensor::Stack(std::vector const &tensors) +Tensor Tensor::Stack(std::vector const &tensors) { SizeVector ret_size; ret_size.push_back(tensors.size()); @@ -2189,8 +2266,7 @@ typename Tensor::SelfType Tensor::Stack(std::vector cons * @returnf */ template -typename Tensor::SelfType Tensor::Concat(std::vector const &tensors, - SizeType const axis) +Tensor Tensor::Concat(std::vector const &tensors, SizeType const axis) { // cant concatenate a single tensor ASSERT(tensors.size() > 1); @@ -2219,7 +2295,7 @@ typename Tensor::SelfType Tensor::Concat(std::vector const // set up the return tensor shape SizeVector ret_tensor_shape{tensor0_shape}; ret_tensor_shape[axis] = sum_axis_size; - SelfType ret{ret_tensor_shape}; + Tensor ret{ret_tensor_shape}; // copy the data across for each tensor SizeType cur_from{0}; @@ -2251,8 +2327,8 @@ typename Tensor::SelfType Tensor::Concat(std::vector const } // copy the data across - TensorIterator ret_it{ret, step}; - auto t_it = tensors[i].cbegin(); + TensorSliceIterator ret_it{ret, step}; + auto t_it = tensors[i].cbegin(); while (t_it.is_valid()) { @@ -2274,10 +2350,11 @@ typename Tensor::SelfType Tensor::Concat(std::vector const * @returnf */ template -typename std::vector::SelfType> Tensor::Split( - SelfType const &tensor, SizeVector const &concat_points, SizeType const axis) +typename std::vector> Tensor::Split(Tensor const & tensor, + SizeVector const &concat_points, + SizeType const axis) { - std::vector ret{concat_points.size()}; + std::vector ret{concat_points.size()}; // Move implementation to Tensor::UnConcatenate SizeType cur_from{0}; @@ -2309,13 +2386,13 @@ typename std::vector::SelfType> Tensor::Split( } // copy the data across - ConstIteratorType err_it{tensor, step}; + ConstSliceIteratorType err_it{tensor, step}; SizeVector cur_error_tensor_shape = tensor.shape(); cur_error_tensor_shape[axis] = concat_points[i]; - SelfType cur_error_tensor{cur_error_tensor_shape}; + Tensor cur_error_tensor{cur_error_tensor_shape}; - TensorIterator t_it{cur_error_tensor}; + TensorSliceIterator t_it{cur_error_tensor}; while (t_it.is_valid()) { @@ -2368,7 +2445,7 @@ fetch::meta::IfIsUnsignedInteger> Tensor::Arange(Un { ASSERT(delta != 0); ASSERT(from < to); - SelfType ret; + Tensor ret; details::ArangeImplementation(from, to, delta, ret); return ret; } @@ -2389,92 +2466,17 @@ fetch::meta::IfIsSignedInteger> Tensor::Arange(Signed { ASSERT(delta != 0); ASSERT(((from < to) && delta > 0) || ((from > to) && delta < 0)); - SelfType ret; + Tensor ret; details::ArangeImplementation(from, to, delta, ret); return ret; } -///////////////////////////////////////// -/// Tensor methods: memory management /// 
-///////////////////////////////////////// - -/** - * reserve memory, but throw away exisiting data_. bool return indicates whether any change was made - * @tparam T - * @tparam C - * @param n - * @return - */ -template -bool Tensor::LazyReserve(SizeType const &n) -{ - if (data_.size() < n) - { - data_ = ContainerType(n); - return true; - } - return false; -} - -/** - * reserve memory but don't throw away existing data stored in the tensor - * @tparam T - * @tparam C - * @param n - */ -template -void Tensor::Reserve(SizeType const &n) -{ - ContainerType old_data = data_; - - if (LazyReserve(n)) - { - SizeType ns = std::min(old_data.size(), n); - memcpy(data_.pointer(), old_data.pointer(), ns); - data_.SetZeroAfter(ns); - } -} - -/** - * equivalent to lazyreserve but sets size and zeroes out data after that size - * @tparam T - * @tparam C - * @tparam S - * @param n - * @return - */ -template -template -typename std::enable_if::value, void>::type Tensor::LazyResize(S const &n) -{ - LazyReserve(n); - size_ = n; - data_.SetZeroAfter(n); -} - -/** - * equivalent to lazyresize but sets all value after previous size to 0 - * @tparam T - * @tparam C - * @tparam S - * @param n - * @return - */ -template -template -typename std::enable_if::value, void>::type Tensor::Resize(S const &n) -{ - SizeType oldsize = size_; - LazyResize(n); - data_.SetZeroAfter(oldsize); -} - ////////////////////////////////// /// Tensor methods: comparison /// ////////////////////////////////// template -bool Tensor::AllClose(SelfType const &o, Type const &relative_tolerance, +bool Tensor::AllClose(Tensor const &o, Type const &relative_tolerance, Type const &absolute_tolerance) const { // Only enforcing number of elements @@ -2500,7 +2502,6 @@ bool Tensor::AllClose(SelfType const &o, Type const &relative_tolerance, T tolerance = std::max(absolute_tolerance, std::max(abs_e1, abs_e2) * relative_tolerance); if (abs_diff > tolerance) { - std::cout << "AllClose - " << e1 << " != " << e2 << std::endl; return false; } } @@ -2624,9 +2625,9 @@ struct Tensor::TensorSetter // TensorSlice implementations template -typename Tensor::IteratorType Tensor::TensorSlice::begin() +typename Tensor::SliceIteratorType Tensor::TensorSlice::begin() { - auto ret = IteratorType(this->tensor_, this->range_); + auto ret = SliceIteratorType(this->tensor_, this->range_); if (this->axis_ != 0) { ret.MoveAxesToFront(this->axis_); @@ -2635,9 +2636,9 @@ typename Tensor::IteratorType Tensor::TensorSlice::begin() } template -typename Tensor::IteratorType Tensor::TensorSlice::end() +typename Tensor::SliceIteratorType Tensor::TensorSlice::end() { - return IteratorType::EndIterator(this->tensor_); + return SliceIteratorType::EndIterator(this->tensor_); } template @@ -2684,24 +2685,24 @@ void Tensor::TensorSlice::Fill(Type t) template template -typename Tensor::SelfType Tensor::TensorSliceImplementation::Copy() const +Tensor Tensor::TensorSliceImplementation::Copy() const { SizeVector shape; for (SizeType i{0}; i < this->range_.size(); ++i) { shape.emplace_back(this->range_[i][1] - this->range_[i][0] / this->range_[i][2]); } - SelfType ret{shape}; + ::fetch::math::Tensor ret{shape}; ret.Assign(*this); return ret; } template template -typename Tensor::ConstIteratorType Tensor::TensorSliceImplementation::begin() - const +typename Tensor::ConstSliceIteratorType +Tensor::TensorSliceImplementation::begin() const { - auto ret = ConstIteratorType(tensor_, range_); + auto ret = ConstSliceIteratorType(tensor_, range_); if (axis_ != 0) { ret.MoveAxesToFront(axis_); @@ 
-2711,17 +2712,10 @@ typename Tensor::ConstIteratorType Tensor::TensorSliceImplementation template template -typename Tensor::ConstIteratorType Tensor::TensorSliceImplementation::end() - const -{ - return ConstIteratorType::EndIterator(tensor_); -} - -template -template -STensor &Tensor::TensorSliceImplementation::Tensor() +typename Tensor::ConstSliceIteratorType +Tensor::TensorSliceImplementation::end() const { - return tensor_; + return ConstSliceIteratorType::EndIterator(tensor_); } template diff --git a/libs/math/include/math/tensor_broadcast.hpp b/libs/math/include/math/tensor_broadcast.hpp index 85dbe89419..8d53e1cab8 100644 --- a/libs/math/include/math/tensor_broadcast.hpp +++ b/libs/math/include/math/tensor_broadcast.hpp @@ -17,7 +17,7 @@ // //------------------------------------------------------------------------------ -#include "math/tensor_iterator.hpp" +#include "math/tensor_slice_iterator.hpp" #include #include @@ -87,7 +87,7 @@ inline bool ShapeFromBroadcast(SizeVector const &a, SizeVector const &b, SizeVec } template -inline bool UpgradeIteratorFromBroadcast(SizeVector const &a, TensorIterator &iterator) +inline bool UpgradeIteratorFromBroadcast(SizeVector const &a, TensorSliceIterator &iterator) { assert(iterator.counter() == 0); // Only upgrade untouched iterators. iterator.counter_ = uint64_t(-1); // Invalidating the iterator @@ -150,18 +150,17 @@ inline bool Broadcast(F function, Tensor &a, Tensor &b, Tensor rangeC.push_back({0, i}); } - TensorIterator it_a(a, rangeA); - TensorIterator it_b(b, rangeB); - TensorIterator it_c(c, rangeC); + TensorSliceIterator it_a(a, rangeA); + TensorSliceIterator it_b(b, rangeB); + TensorSliceIterator it_c(c, rangeC); if (!UpgradeIteratorFromBroadcast(cshape, it_a)) { - std::cout << "Could not promote iterator A" << std::endl; return false; } + if (!UpgradeIteratorFromBroadcast(cshape, it_b)) { - std::cout << "Could not promote iterator B" << std::endl; return false; } diff --git a/libs/math/include/math/tensor_iterator.hpp b/libs/math/include/math/tensor_iterator.hpp index 7451834b93..9dff25cae3 100644 --- a/libs/math/include/math/tensor_iterator.hpp +++ b/libs/math/include/math/tensor_iterator.hpp @@ -19,36 +19,12 @@ #include "math/base_types.hpp" -#include -#include -#include - namespace fetch { namespace math { -// need to forward declare template class Tensor; -struct TensorIteratorRange -{ - using SizeType = uint64_t; - SizeType index = 0; - SizeType from = 0; - SizeType to = 0; - SizeType step = 1; - SizeType volume = 1; - SizeType total_steps = 1; - - SizeType step_volume = 1; - SizeType total_volume = 1; - - SizeType repeat_dimension = 1; - SizeType repetition = 0; - - SizeType current_n_dim_position = 0; -}; - template > class TensorIterator { @@ -61,14 +37,7 @@ class TensorIterator */ TensorIterator(TensorType &array) : array_(array) - { - std::vector> step{}; - for (auto i : array.shape()) - { - step.push_back({0, i, 1}); - } - Setup(step, array_.shape()); - } + {} TensorIterator(TensorIterator const &other) = default; TensorIterator &operator=(TensorIterator const &other) = default; @@ -76,42 +45,26 @@ class TensorIterator TensorIterator &operator=(TensorIterator &&other) = default; /** - * Iterator for more interesting ranges - * @param array the Tensor to operate upon - * @param step the from,to,and step range objects + * @brief creates an iterator for a tensor with a given starting position. + * @param array is the tensor that is iterated over + * @param position is the starting position referencing the underlying memory. 
*/ - TensorIterator(TensorType &array, std::vector> const &step) + TensorIterator(TensorType &array, SizeType position) : array_(array) - { - Setup(step, array_.shape()); - } + , position_{std::move(position)} + {} static TensorIterator EndIterator(TensorType &array) { - auto ret = TensorIterator(array); - ret.counter_ = ret.size_; - return ret; - } - - TensorIterator(TensorType &array, std::vector const &shape) - : array_(array) - { - std::vector> step{}; - for (auto i : array.shape()) - { - step.push_back({0, i, 1}); - } - - Setup(step, shape); + return TensorIterator(array, array.data().size()); } - /** * identifies whether the iterator is still valid or has finished iterating * @return boolean indicating validity */ bool is_valid() const { - return counter_ < size_; + return position_ < array_.data().size(); } /** @@ -130,242 +83,55 @@ class TensorIterator */ TensorIterator &operator++() { - bool next; - SizeType i = 0; - ++counter_; - do - { - next = false; - TensorIteratorRange &s = ranges_[i]; - s.index += s.step; - position_ += s.step_volume; - ++s.current_n_dim_position; - - if (s.index >= s.to) - { - ++s.repetition; - s.index = s.from; - s.current_n_dim_position = s.from; - position_ -= s.total_volume; - - if (s.repetition == s.repeat_dimension) - { - s.repetition = 0; - next = true; - ++i; - } - } - } while ((i < ranges_.size()) && (next)); + ++i_; + ++position_; - // check if iteration is complete - if (i == ranges_.size()) + if (i_ >= array_.height()) { - if (counter_ < size_) - { - --total_runs_; - position_ = 0; - for (auto &r : ranges_) - { - r.index = r.from; - position_ += r.volume * r.index; - } - } + i_ = 0; + ++j_; + position_ = j_ * array_.padded_height(); } -#ifndef NDEBUG - SizeType ref = 0; - for (auto &s : ranges_) - { - ref += s.volume * s.index; - } - - assert(ref == position_); -#endif - return *this; } - /** - * transpose axes according to the new order specified in perm - * @param perm - */ - void Transpose(std::vector const &perm) - { - std::vector new_ranges; - new_ranges.reserve(ranges_.size()); - for (SizeType i = 0; i < ranges_.size(); ++i) - { - new_ranges.push_back(ranges_[perm[i]]); - } - std::swap(new_ranges, ranges_); - } - - void PermuteAxes(SizeType const &a, SizeType const &b) - { - std::swap(ranges_[a], ranges_[b]); - } - - // TODO: Name correctly - void MoveAxesToFront(SizeType const &a) - { - std::vector new_ranges; - new_ranges.reserve(ranges_.size()); - new_ranges.push_back(ranges_[a]); - for (SizeType i = 0; i < ranges_.size(); ++i) - { - if (i != a) - { - new_ranges.push_back(ranges_[i]); - } - } - std::swap(new_ranges, ranges_); - } - - void ReverseAxes() - { - std::reverse(ranges_.begin(), ranges_.end()); - } - /** * dereference, i.e. 
give the value at the current position of the iterator * @return */ Type &operator*() { - assert(position_ < array_.size()); - - return array_[position_]; + assert(position_ < array_.data().size()); + return array_.data()[position_]; } Type const &operator*() const { - return array_[position_]; - } - - SizeType position() const - { - return position_; - } - - SizeType PositionAlong(SizeType axis) const - { - return ranges_[axis].current_n_dim_position; - } - - SizeVector PositionVector() const - { - SizeVector ret; - for (auto const &r : ranges_) - { - ret.push_back(r.current_n_dim_position); - } - return ret; - } - - SizeType size() const - { - return size_; - } - - SizeType counter() const - { - return counter_; - } - - template - friend bool UpgradeIteratorFromBroadcast(std::vector const &, TensorIterator &); - - /** - * returns the n-dimensional index of the current position - * @return - */ - std::vector GetNDimIndex() - { - std::vector cur_index; - for (SizeType j = 0; j < ranges_.size(); ++j) - { - cur_index.push_back(ranges_[j].current_n_dim_position); - } - - return cur_index; - } - - TensorIteratorRange const &range(SizeType const &i) - { - return ranges_[i]; + assert(position_ < array_.data().size()); + return array_.data()[position_]; } bool operator==(TensorIterator const &other) const { - if (this->end_of_iterator() && other->end_of_iterator()) - { - return true; - } - return other.counter_ == counter_; + return other.position_ == position_; } bool operator!=(TensorIterator const &other) const { - return other.counter_ != counter_; + return other.position_ != position_; } -protected: - std::vector ranges_; - SizeType total_runs_ = 1; - SizeType size_ = 0; - -private: - void Setup(std::vector> const &step, std::vector const &shape) + SizeType size() const { - ASSERT(array_.shape().size() == step.size()); - SizeType volume = 1; - - if (step.size() == 0) - { - size_ = 0; - position_ = 0; - } - else - { - size_ = 1; - position_ = 0; - - for (SizeType i = 0; i < step.size(); ++i) - { - auto const & a = step[i]; - TensorIteratorRange s; - s.index = s.from = s.current_n_dim_position = a[0]; - s.to = a[1]; - - if (a.size() > 2) - { - s.step = a[2]; - } - s.volume = volume; - SizeType diff = (s.to - s.from); - s.total_steps = diff / s.step; - if (s.total_steps * s.step < diff) - { - ++s.total_steps; - } - - s.total_steps *= s.step; - s.step_volume = s.step * volume; - s.total_volume = (s.total_steps) * volume; - - position_ += volume * s.from; - size_ *= s.total_steps; - - volume *= shape[i]; - ranges_.push_back(s); - } - } + return array_.size(); } +private: TensorType &array_; - SizeType position_ = 0; - - SizeType counter_ = 0; + SizeType position_{0}; + SizeType i_{0}; + SizeType j_{0}; }; template diff --git a/libs/math/include/math/tensor_slice_iterator.hpp b/libs/math/include/math/tensor_slice_iterator.hpp new file mode 100644 index 0000000000..ff08287fad --- /dev/null +++ b/libs/math/include/math/tensor_slice_iterator.hpp @@ -0,0 +1,379 @@ +#pragma once +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "math/base_types.hpp" + +#include +#include +#include + +namespace fetch { +namespace math { + +// need to forward declare +template +class Tensor; + +struct TensorSliceIteratorRange +{ + using SizeType = uint64_t; + SizeType index = 0; + SizeType from = 0; + SizeType to = 0; + SizeType step = 1; + SizeType volume = 1; + SizeType total_steps = 1; + + SizeType step_volume = 1; + SizeType total_volume = 1; + + SizeType repeat_dimension = 1; + SizeType repetition = 0; + + SizeType current_n_dim_position = 0; +}; + +template > +class TensorSliceIterator +{ +public: + using Type = T; + using SizeType = uint64_t; + /** + * default range assumes step 1 over whole array - useful for trivial cases + * @param array + */ + TensorSliceIterator(TensorType &array) + : array_(array) + { + std::vector> step{}; + for (auto i : array.shape()) + { + step.push_back({0, i, 1}); + } + Setup(step, array_.stride()); + } + + TensorSliceIterator(TensorSliceIterator const &other) = default; + TensorSliceIterator &operator=(TensorSliceIterator const &other) = default; + TensorSliceIterator(TensorSliceIterator &&other) = default; + TensorSliceIterator &operator=(TensorSliceIterator &&other) = default; + + /** + * Iterator for more interesting ranges + * @param array the Tensor to operate upon + * @param step the from,to,and step range objects + */ + TensorSliceIterator(TensorType &array, std::vector> const &step) + : array_(array) + { + Setup(step, array_.stride()); + } + + static TensorSliceIterator EndIterator(TensorType &array) + { + auto ret = TensorSliceIterator(array); + ret.counter_ = ret.size_; + return ret; + } + + TensorSliceIterator(TensorType &array, std::vector const &stride) + : array_(array) + { + std::vector> step{}; + for (auto i : array.shape()) + { + step.push_back({0, i, 1}); + } + + Setup(step, stride); + } + + /** + * identifies whether the iterator is still valid or has finished iterating + * @return boolean indicating validity + */ + bool is_valid() const + { + return counter_ < size_; + } + + /** + * same as is_valid + * @return + */ + operator bool() const + { + return is_valid(); + } + + /** + * incrementer, i.e. 
increment through the memory by 1 position making n-dim adjustments as + * necessary + * @return + */ + TensorSliceIterator &operator++() + { + bool next; + SizeType i = 0; + + ++counter_; + do + { + next = false; + assert(i < ranges_.size()); + TensorSliceIteratorRange &s = ranges_[i]; + s.index += s.step; + position_ += s.step_volume; + ++s.current_n_dim_position; + + if (s.index >= s.to) + { + ++s.repetition; + s.index = s.from; + s.current_n_dim_position = s.from; + position_ -= s.total_volume; + + if (s.repetition == s.repeat_dimension) + { + s.repetition = 0; + next = true; + ++i; + } + } + } while ((i < ranges_.size()) && (next)); + + // check if iteration is complete + if (i == ranges_.size()) + { + if (counter_ < size_) + { + --total_runs_; + position_ = 0; + for (auto &r : ranges_) + { + r.index = r.from; + position_ += r.volume * r.index; + } + } + } + +#ifndef NDEBUG + SizeType ref = 0; + for (auto &s : ranges_) + { + ref += s.volume * s.index; + } + + assert(ref == position_); +#endif + + return *this; + } + + /** + * transpose axes according to the new order specified in perm + * @param perm + */ + void Transpose(std::vector const &perm) + { + std::vector new_ranges; + new_ranges.reserve(ranges_.size()); + for (SizeType i = 0; i < ranges_.size(); ++i) + { + new_ranges.push_back(ranges_[perm[i]]); + } + std::swap(new_ranges, ranges_); + } + + void PermuteAxes(SizeType const &a, SizeType const &b) + { + std::swap(ranges_[a], ranges_[b]); + } + + // TODO: Name correctly + void MoveAxesToFront(SizeType const &a) + { + std::vector new_ranges; + new_ranges.reserve(ranges_.size()); + new_ranges.push_back(ranges_[a]); + for (SizeType i = 0; i < ranges_.size(); ++i) + { + if (i != a) + { + new_ranges.push_back(ranges_[i]); + } + } + std::swap(new_ranges, ranges_); + } + + void ReverseAxes() + { + std::reverse(ranges_.begin(), ranges_.end()); + } + + /** + * dereference, i.e. 
give the value at the current position of the iterator + * @return + */ + Type &operator*() + { + assert(position_ < array_.padded_size()); + return array_.data()[position_]; + } + + Type const &operator*() const + { + assert(position_ < array_.padded_size()); + return array_.data()[position_]; + } + + SizeType position() const + { + return position_; + } + + SizeType PositionAlong(SizeType axis) const + { + return ranges_[axis].current_n_dim_position; + } + + SizeVector PositionVector() const + { + SizeVector ret; + for (auto const &r : ranges_) + { + ret.push_back(r.current_n_dim_position); + } + return ret; + } + + SizeType size() const + { + return size_; + } + + SizeType counter() const + { + return counter_; + } + + template + friend bool UpgradeIteratorFromBroadcast(std::vector const &, + TensorSliceIterator &); + + /** + * returns the n-dimensional index of the current position + * @return + */ + std::vector GetIndex() + { + std::vector cur_index; + for (SizeType j = 0; j < ranges_.size(); ++j) + { + cur_index.push_back(ranges_[j].current_n_dim_position); + } + + return cur_index; + } + + TensorSliceIteratorRange const &range(SizeType const &i) + { + return ranges_[i]; + } + + bool operator==(TensorSliceIterator const &other) const + { + if (this->end_of_iterator() && other->end_of_iterator()) + { + return true; + } + return other.counter_ == counter_; + } + + bool operator!=(TensorSliceIterator const &other) const + { + return other.counter_ != counter_; + } + +protected: + std::vector ranges_; + SizeType total_runs_ = 1; + SizeType size_ = 0; + +private: + void Setup(std::vector> const &step, std::vector const &stride) + { + ASSERT(array_.shape().size() == step.size()); + SizeType volume = 1; + + if (step.size() == 0) + { + size_ = 0; + position_ = 0; + } + else + { + size_ = 1; + position_ = 0; + + for (SizeType i = 0; i < step.size(); ++i) + { + auto const & a = step[i]; + TensorSliceIteratorRange s; + s.index = s.from = s.current_n_dim_position = a[0]; + s.to = a[1]; + + if (a.size() > 2) + { + s.step = a[2]; + } + volume = stride[i]; + + s.volume = volume; + SizeType diff = (s.to - s.from); + s.total_steps = diff / s.step; + if (s.total_steps * s.step < diff) + { + ++s.total_steps; + } + + s.total_steps *= s.step; + s.step_volume = s.step * volume; + s.total_volume = (s.total_steps) * volume; + + position_ += volume * s.from; + size_ *= s.total_steps; + + ranges_.push_back(s); + } + } + } + + TensorType &array_; + SizeType position_ = 0; + + SizeType counter_ = 0; +}; + +template +using ConstTensorSliceIterator = TensorSliceIterator const>; + +} // namespace math +} // namespace fetch diff --git a/libs/math/include/math/tensor_squeeze.hpp b/libs/math/include/math/tensor_squeeze.hpp index d679522e9d..3543bf7e3e 100644 --- a/libs/math/include/math/tensor_squeeze.hpp +++ b/libs/math/include/math/tensor_squeeze.hpp @@ -18,7 +18,7 @@ //------------------------------------------------------------------------------ #include "math/base_types.hpp" -#include "math/tensor_iterator.hpp" +#include "math/tensor_slice_iterator.hpp" #include #include @@ -95,7 +95,7 @@ inline void Squeeze(Tensor &arr, SizeType const &axis = SizeType(-1)) { SizeVector newshape; ShapeFromSqueeze(arr.shape(), newshape, axis); - arr.LazyReshape(newshape); + arr.Reshape(newshape); } /* Squeeze an Tensor. 
@@ -112,7 +112,7 @@ void Squeeze(Tensor &arr, SizeSet const &axes) namespace reduce_details { template -inline void Reduce(F fnc, ConstTensorIterator &it_a, TensorIterator &it_b, +inline void Reduce(F fnc, ConstTensorSliceIterator &it_a, TensorSliceIterator &it_b, SizeType const &N) { while (bool(it_a) && bool(it_b)) @@ -143,8 +143,8 @@ inline void Reduce(F fnc, Tensor const &input, Tensor &output, SizeT SizeType N; SizeType k{1}; - SizeVector out_shape{1}; - for (SizeType i{0}; i < input.shape().size(); ++i) + SizeVector out_shape{}; + for (SizeType i = 0; i < input.shape().size(); ++i) { if (i != axis) { @@ -152,11 +152,10 @@ inline void Reduce(F fnc, Tensor const &input, Tensor &output, SizeT k *= input.shape(i); } } - output.Resize(k); output.Reshape(out_shape); - fetch::math::ConstTensorIterator it_a(input); - fetch::math::TensorIterator it_b(output); + fetch::math::ConstTensorSliceIterator it_a(input); + fetch::math::TensorSliceIterator it_b(output); if (axis != 0) { @@ -193,11 +192,10 @@ inline void Reduce(F fnc, Tensor const &input, Tensor &output, SizeV k *= input.shape(i); } } - output.Resize(k); output.Reshape(out_shape); - ConstTensorIterator it_a(input); - TensorIterator it_b(output); + ConstTensorSliceIterator it_a(input); + TensorSliceIterator it_b(output); // Move the axis we want to reduce to the front // to make it iterable in the inner most loop. diff --git a/libs/math/src/math/linalg/blas/gemm_nn_novector.cpp b/libs/math/src/math/linalg/blas/gemm_nn_novector.cpp new file mode 100644 index 0000000000..f2871d04df --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_nn_novector.cpp @@ -0,0 +1,108 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
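// ---------------------------------------------------------------------------
// Editor's note: an illustrative, hedged sketch that is not part of the patch.
// The generated BLAS kernels that follow all realise the GEMM update
// C <- alpha * op(A) * op(B) + beta * C. For reference, the "nn"
// (no-transpose) case can be written with only the Tensor members introduced
// in this patch (height(), width(), operator(i, j)), assuming "math/tensor.hpp"
// is included:
template <typename Type>
void ReferenceGemmNN(Type alpha, fetch::math::Tensor<Type> const &a,
                     fetch::math::Tensor<Type> const &b, Type beta,
                     fetch::math::Tensor<Type> &c)
{
  for (std::size_t j = 0; j < c.width(); ++j)
  {
    for (std::size_t i = 0; i < c.height(); ++i)
    {
      Type sum{0};
      for (std::size_t l = 0; l < a.width(); ++l)
      {
        sum = sum + a(i, l) * b(l, j);  // row i of A dotted with column j of B
      }
      c(i, j) = alpha * sum + beta * c(i, j);
    }
  }
}
// The optimised kernels below reorder these loops and hoist alpha * b(l, j)
// so that the innermost loop walks a contiguous column of A and C.
// ---------------------------------------------------------------------------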
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_nn_novector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas:: + operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.width() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + std::size_t l; + if (beta == 0.0) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + else if (beta != 1.0) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + + for (l = 0; l < a.width(); ++l) + { + Type temp; + temp = alpha * b(l, j); + for (i = 0; i < c.height(); ++i) + { + c(i, j) = c(i, j) + temp * a(i, l); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_nn_vector.cpp b/libs/math/src/math/linalg/blas/gemm_nn_vector.cpp new file mode 100644 index 0000000000..b0389776fe --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_nn_vector.cpp @@ -0,0 +1,137 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
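// ---------------------------------------------------------------------------
// Editor's note: hedged sketch, not part of the patch. The *_vector kernels
// that follow rely on the new padded column-major layout: element (i, j) is
// stored at data()[i + padded_height() * j] (see the reworked operator[] and
// ComputeColIndex earlier in this patch), so each column is a contiguous,
// padding-aligned slice that c.data().slice(c.padded_height() * j, c.height())
// can hand to the SIMD back end via in_parallel().Apply(). A minimal helper
// expressing that address computation:
template <typename TensorType>
std::size_t ColumnMajorOffset(TensorType const &t, std::size_t i, std::size_t j)
{
  // Valid for i < t.height() and j < t.width(); rows between height() and
  // padded_height() are padding introduced by PadValue().
  return i + t.padded_height() * j;
}
// ---------------------------------------------------------------------------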
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_nn_vector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas:: + operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const +{ + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.width() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + std::size_t l; + if (beta == 0.0) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply(range, + [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + else if (beta != 1.0) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + + for (l = 0; l < a.width(); ++l) + { + Type temp; + temp = alpha * b(l, j); + + VectorRegisterType vec_temp(temp); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + auto slice_a_l = a.data().slice(a.padded_height() * std::size_t(l), a.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_temp](VectorRegisterType const &vr_c_j, VectorRegisterType const &vr_a_l, + VectorRegisterType &vw_c_j) { vw_c_j = vr_c_j + vec_temp * vr_a_l; }, + slice_c_j, slice_a_l); + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_nt_novector.cpp b/libs/math/src/math/linalg/blas/gemm_nt_novector.cpp new file mode 100644 index 0000000000..d2eb3d5d4c --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_nt_novector.cpp @@ -0,0 +1,111 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_nt_novector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &b, + Type const & beta, + Tensor & c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.width() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + std::size_t l; + if (beta == 0.0) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + else if (beta != 1.0) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + + for (l = 0; l < a.width(); ++l) + { + Type temp; + temp = alpha * b(j, l); + for (i = 0; i < c.height(); ++i) + { + c(i, j) = c(i, j) + temp * a(i, l); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_nt_vector.cpp b/libs/math/src/math/linalg/blas/gemm_nt_vector.cpp new file mode 100644 index 0000000000..bf2ede03ee --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_nt_vector.cpp @@ -0,0 +1,137 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
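// ---------------------------------------------------------------------------
// Editor's note: a hedged reading of the kernel naming, not stated explicitly
// in the patch. The suffixes appear to encode whether A and B are transposed
// ("nn", "nt", "tn", "tt") and whether the scalar ("novector") or SIMD
// ("vector") code path is used. Consistent with that, the nt kernels read B
// with swapped indices, i.e. they perform
//
//   C(i, j) <- beta * C(i, j) + alpha * sum_l A(i, l) * B(j, l)
//
// which is the update C <- alpha * A * B^T + beta * C.
// ---------------------------------------------------------------------------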
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_nt_vector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas:: + operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const +{ + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.width() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + std::size_t l; + if (beta == 0.0) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply(range, + [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + else if (beta != 1.0) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + + for (l = 0; l < a.width(); ++l) + { + Type temp; + temp = alpha * b(j, l); + + VectorRegisterType vec_temp(temp); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + auto slice_a_l = a.data().slice(a.padded_height() * std::size_t(l), a.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_temp](VectorRegisterType const &vr_c_j, VectorRegisterType const &vr_a_l, + VectorRegisterType &vw_c_j) { vw_c_j = vr_c_j + vec_temp * vr_a_l; }, + slice_c_j, slice_a_l); + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_tn_novector.cpp b/libs/math/src/math/linalg/blas/gemm_tn_novector.cpp new file mode 100644 index 0000000000..86ac0da1f2 --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_tn_novector.cpp @@ -0,0 +1,105 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_tn_novector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &b, + Type const & beta, + Tensor & c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.height() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + Type temp; + std::size_t l; + temp = 0.0; + for (l = 0; l < a.height(); ++l) + { + temp = temp + a(l, i) * b(l, j); + } + + if (beta == 0.0) + { + c(i, j) = alpha * temp; + } + else + { + c(i, j) = alpha * temp + beta * c(i, j); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_tn_vector.cpp b/libs/math/src/math/linalg/blas/gemm_tn_vector.cpp new file mode 100644 index 0000000000..bae66b2cbc --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_tn_vector.cpp @@ -0,0 +1,117 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
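For orientation, the four transposition variants in this patch differ only in which index of A and B runs over the summation; element-wise they compute:

\[
\begin{aligned}
\text{gemm\_nn:}\quad c_{ij} &\leftarrow \alpha \sum_{l} a_{il}\,b_{lj} + \beta\,c_{ij}, &
\text{gemm\_nt:}\quad c_{ij} &\leftarrow \alpha \sum_{l} a_{il}\,b_{jl} + \beta\,c_{ij},\\
\text{gemm\_tn:}\quad c_{ij} &\leftarrow \alpha \sum_{l} a_{li}\,b_{lj} + \beta\,c_{ij}, &
\text{gemm\_tt:}\quad c_{ij} &\leftarrow \alpha \sum_{l} a_{li}\,b_{jl} + \beta\,c_{ij}.
\end{aligned}
\]

The nn/nt kernels update whole columns of C at a time (axpy-style), which their vectorised builds express with Apply over column slices; tn reduces over two contiguous columns per output element, which its vectorised build expresses with SumReduce; tt reads B across columns and, as the vectorised file further down shows, keeps a scalar inner loop.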
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_tn_vector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas:: + operator()(Type const &alpha, Tensor const &a, Tensor const &b, Type const &beta, + Tensor &c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.height() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + Type temp; + temp = 0.0; + + auto slice_a_i = a.data().slice(a.padded_height() * std::size_t(i), a.padded_height()); + auto slice_b_j = b.data().slice(b.padded_height() * std::size_t(j), b.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(a.height())); + temp = slice_a_i.in_parallel().SumReduce( + range, + [](VectorRegisterType const &vr_a_i, VectorRegisterType const &vr_b_j) { + return vr_a_i * vr_b_j; + }, + slice_b_j); + if (beta == 0.0) + { + c(i, j) = alpha * temp; + } + else + { + c(i, j) = alpha * temp + beta * c(i, j); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_tt_novector.cpp b/libs/math/src/math/linalg/blas/gemm_tt_novector.cpp new file mode 100644 index 0000000000..948c3b651a --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_tt_novector.cpp @@ -0,0 +1,105 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
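The slice arithmetic in the vectorised kernels encodes column-major storage with each column padded out to padded_height() elements: data().slice(padded_height() * j, height()) is exactly column j. That is also why the TN inner product vectorises so cleanly; column i of A already holds row i of A^T contiguously. A sketch of the addressing under that layout assumption (helper names are illustrative, not the library API):

```cpp
#include <cstddef>

// Column-major layout with per-column padding: element (i, j) of a matrix whose
// columns are padded to `padded_height` entries lives at offset j * padded_height + i,
// and [j * padded_height, j * padded_height + height) is the contiguous column j.
inline std::size_t ColumnMajorOffset(std::size_t i, std::size_t j, std::size_t padded_height)
{
  return j * padded_height + i;
}

// The TN kernel's temp = sum_l a(l, i) * b(l, j) is then a reduction over two
// contiguous ranges, which is what SumReduce vectorises in the kernel above.
inline double DotOfColumns(double const *a_data, std::size_t padded_height_a, std::size_t i,
                           double const *b_data, std::size_t padded_height_b, std::size_t j,
                           std::size_t common_height)
{
  double const *col_a = a_data + i * padded_height_a;  // column i of A == row i of A^T
  double const *col_b = b_data + j * padded_height_b;  // column j of B
  double        sum   = 0.0;
  for (std::size_t l = 0; l < common_height; ++l)
  {
    sum += col_a[l] * col_b[l];
  }
  return sum;
}
```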
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_tt_novector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &b, + Type const & beta, + Tensor & c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.height() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = 0.0; + } + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + c(i, j) = beta * c(i, j); + } + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + Type temp; + std::size_t l; + temp = 0.0; + for (l = 0; l < a.height(); ++l) + { + temp = temp + a(l, i) * b(j, l); + } + + if (beta == 0.0) + { + c(i, j) = alpha * temp; + } + else + { + c(i, j) = alpha * temp + beta * c(i, j); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemm_tt_vector.cpp b/libs/math/src/math/linalg/blas/gemm_tt_vector.cpp new file mode 100644 index 0000000000..524b16dd42 --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemm_tt_vector.cpp @@ -0,0 +1,115 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/gemm_tt_vector.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &b, Type const &beta, + Tensor &c) const +{ + std::size_t i; + std::size_t j; + if ((c.height() == 0) || + ((c.width() == 0) || (((alpha == 0.0) || (a.height() == 0)) && (beta == 1.0)))) + { + return; + } + + if (alpha == 0.0) + { + if (beta == 0.0) + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_c_j) { vw_c_j = vec_zero; }); + } + } + else + { + for (j = 0; j < c.width(); ++j) + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = c.data().slice(c.padded_height() * j, c.height()); + auto slice_c_j = c.data().slice(c.padded_height() * std::size_t(j), c.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(c.height())); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_c_j, VectorRegisterType &vw_c_j) { + vw_c_j = vec_beta * vr_c_j; + }, + slice_c_j); + } + } + + return; + } + + for (j = 0; j < c.width(); ++j) + { + for (i = 0; i < c.height(); ++i) + { + Type temp; + std::size_t l; + temp = 0.0; + for (l = 0; l < a.height(); ++l) + { + temp = temp + a(l, i) * b(j, l); + } + + if (beta == 0.0) + { + c(i, j) = alpha * temp; + } + else + { + c(i, j) = alpha * temp + beta * c(i, j); + } + } + } + return; +} + +template class Blas; + +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemv_n_novector.cpp b/libs/math/src/math/linalg/blas/gemv_n_novector.cpp new file mode 100644 index 0000000000..04ec496630 --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemv_n_novector.cpp @@ -0,0 +1,165 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
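Note that the "vector" build of the TT kernel above only vectorises the alpha == 0 prepass; the multiply itself remains a scalar triple loop. That is consistent with the storage layout: the inner product needs b(j, l) for fixed j and running l, which in padded column-major storage jumps by a whole column per step and therefore cannot be handed to Apply/SumReduce as a contiguous slice. Illustrative offset arithmetic, assuming the same layout as above:

```cpp
#include <cstddef>

// Walking b(j, 0), b(j, 1), ... in padded column-major storage strides by a whole
// padded column per step, so the TT inner product over l is not a contiguous range.
inline std::size_t RowWalkOffset(std::size_t j, std::size_t l, std::size_t padded_height)
{
  return l * padded_height + j;  // consecutive l differ by padded_height, not by 1
}
```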
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_n.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &x, + int const & incx, + Type const & beta, + Tensor & y, + int const &incy) const +{ + Type temp; + int i; + int iy; + int j; + int jx; + int kx; + int ky; + int lenx; + int leny; + if ((int(a.height()) == 0) || ((int(a.width()) == 0) || ((alpha == 0.0) && (beta == 1.0)))) + { + return; + } + + lenx = int(a.width()); + leny = int(a.height()); + if (incx > 0) + { + kx = 1; + } + else + { + kx = 1 + (-(-1 + lenx) * incx); + } + + if (incy > 0) + { + ky = 1; + } + else + { + ky = 1 + (-(-1 + leny) * incy); + } + + if (beta != 1.0) + { + if (incy == 1) + { + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[i] = 0.0; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[i] = beta * y[i]; + } + } + } + else + { + iy = -1 + ky; + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[iy] = 0.0; + iy = iy + incy; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[iy] = beta * y[iy]; + iy = iy + incy; + } + } + } + } + + if (alpha == 0.0) + { + return; + } + + jx = -1 + kx; + if (incy == 1) + { + for (j = 0; j < int(a.width()); ++j) + { + temp = alpha * x[jx]; + for (i = 0; i < int(a.height()); ++i) + { + y[i] = y[i] + temp * a(i, j); + } + + jx = jx + incx; + } + } + else + { + for (j = 0; j < int(a.width()); ++j) + { + temp = alpha * x[jx]; + iy = -1 + ky; + for (i = 0; i < int(a.height()); ++i) + { + y[iy] = y[iy] + temp * a(i, j); + iy = iy + incy; + } + + jx = jx + incx; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemv_n_vector.cpp b/libs/math/src/math/linalg/blas/gemv_n_vector.cpp new file mode 100644 index 0000000000..5107315ebe --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemv_n_vector.cpp @@ -0,0 +1,183 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
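gemv_n computes y <= alpha * A * x + beta * y with x and y optionally strided through incx/incy, matching reference DGEMV with trans = 'N'. With the increment bookkeeping stripped away it is one axpy per column of A; a minimal unit-stride sketch (illustrative, not the library API):

```cpp
#include <cstddef>
#include <vector>

// y <= alpha * A * x + beta * y for a column-major (height x width) matrix A,
// unit strides only. The kernel above also special-cases beta == 0 (writes zeros
// instead of multiplying) and handles arbitrary incx/incy.
void GemvN(double alpha, std::vector<double> const &a, std::size_t height, std::size_t width,
           std::vector<double> const &x, double beta, std::vector<double> &y)
{
  for (std::size_t i = 0; i < height; ++i)
  {
    y[i] = beta * y[i];
  }
  if (alpha == 0.0)
  {
    return;
  }
  for (std::size_t j = 0; j < width; ++j)
  {
    double const temp = alpha * x[j];
    for (std::size_t i = 0; i < height; ++i)
    {
      y[i] += temp * a[j * height + i];  // column j of A is contiguous
    }
  }
}
```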
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_n.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const & alpha, + Tensor const &a, + Tensor const &x, + int const & incx, + Type const & beta, + Tensor & y, + int const &incy) const +{ + Type temp; + int i; + int iy; + int j; + int jx; + int kx; + int ky; + int lenx; + int leny; + if ((int(a.height()) == 0) || ((int(a.width()) == 0) || ((alpha == 0.0) && (beta == 1.0)))) + { + return; + } + + lenx = int(a.width()); + leny = int(a.height()); + if (incx > 0) + { + kx = 1; + } + else + { + kx = 1 + (-(-1 + lenx) * incx); + } + + if (incy > 0) + { + ky = 1; + } + else + { + ky = 1 + (-(-1 + leny) * incy); + } + + if (beta != 1.0) + { + if (incy == 1) + { + if (beta == 0.0) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = y.data().slice(0, y.padded_size()); + memory::TrivialRange range(std::size_t(0), std::size_t(leny)); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_v_y) { vw_v_y = vec_zero; }); + } + else + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = y.data().slice(0, y.padded_size()); + auto slice_v_y = y.data().slice(0, y.padded_size()); + memory::TrivialRange range(std::size_t(0), std::size_t(leny)); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_v_y, VectorRegisterType &vw_v_y) { + vw_v_y = vec_beta * vr_v_y; + }, + slice_v_y); + } + } + else + { + iy = -1 + ky; + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[iy] = 0.0; + iy = iy + incy; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[iy] = beta * y[iy]; + iy = iy + incy; + } + } + } + } + + if (alpha == 0.0) + { + return; + } + + jx = -1 + kx; + if (incy == 1) + { + for (j = 0; j < int(a.width()); ++j) + { + temp = alpha * x[jx]; + + VectorRegisterType vec_temp(temp); + + auto ret_slice = y.data().slice(0, y.padded_size()); + auto slice_v_y = y.data().slice(0, y.padded_size()); + auto slice_a_j = a.data().slice(a.padded_height() * std::size_t(j), a.padded_height()); + memory::TrivialRange range(std::size_t(0), std::size_t(int(a.height()))); + ret_slice.in_parallel().Apply( + range, + [vec_temp](VectorRegisterType const &vr_v_y, VectorRegisterType const &vr_a_j, + VectorRegisterType &vw_v_y) { vw_v_y = vr_v_y + vec_temp * vr_a_j; }, + slice_v_y, slice_a_j); + jx = jx + incx; + } + } + else + { + for (j = 0; j < int(a.width()); ++j) + { + temp = alpha * x[jx]; + iy = -1 + ky; + for (i = 0; i < int(a.height()); ++i) + { + y[iy] = y[iy] + temp * a(i, j); + iy = iy + incy; + } + + jx = jx + incx; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemv_t_novector.cpp b/libs/math/src/math/linalg/blas/gemv_t_novector.cpp new file mode 100644 index 0000000000..726fa6251d --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemv_t_novector.cpp @@ -0,0 +1,168 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_t.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const &alpha, + Tensor const &a, + Tensor const &x, + int const & incx, + Type const & beta, + Tensor & y, + int const &incy) const +{ + Type temp; + int i; + int j; + int jy; + int kx; + int ky; + int lenx; + int leny; + if ((int(a.height()) == 0) || ((int(a.width()) == 0) || ((alpha == 0.0) && (beta == 1.0)))) + { + return; + } + + lenx = int(a.height()); + leny = int(a.width()); + if (incx > 0) + { + kx = 1; + } + else + { + kx = 1 + (-(-1 + lenx) * incx); + } + + if (incy > 0) + { + ky = 1; + } + else + { + ky = 1 + (-(-1 + leny) * incy); + } + + if (beta != 1.0) + { + if (incy == 1) + { + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[i] = 0.0; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[i] = beta * y[i]; + } + } + } + else + { + int iy; + iy = -1 + ky; + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[iy] = 0.0; + iy = iy + incy; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[iy] = beta * y[iy]; + iy = iy + incy; + } + } + } + } + + if (alpha == 0.0) + { + return; + } + + jy = -1 + ky; + if (incx == 1) + { + for (j = 0; j < int(a.width()); ++j) + { + temp = 0.0; + for (i = 0; i < int(a.height()); ++i) + { + temp = temp + a(i, j) * x[i]; + } + + y[jy] = y[jy] + alpha * temp; + jy = jy + incy; + } + } + else + { + for (j = 0; j < int(a.width()); ++j) + { + int ix; + temp = 0.0; + ix = -1 + kx; + for (i = 0; i < int(a.height()); ++i) + { + temp = temp + a(i, j) * x[ix]; + ix = ix + incx; + } + + y[jy] = y[jy] + alpha * temp; + jy = jy + incy; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/gemv_t_vector.cpp b/libs/math/src/math/linalg/blas/gemv_t_vector.cpp new file mode 100644 index 0000000000..bb6b8b797f --- /dev/null +++ b/libs/math/src/math/linalg/blas/gemv_t_vector.cpp @@ -0,0 +1,184 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
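The kx/ky constants above are carried over from the Fortran reference: for a vector of logical length len and increment inc, the first element sits at Fortran index 1 when inc > 0 and at 1 + (1 - len) * inc when inc < 0, so a negative increment visits the same elements from the far end backwards. The generated code keeps the 1-based constants and subtracts one (jy = -1 + ky, ix = -1 + kx) when indexing the zero-based tensors. A small sketch of that traversal, assuming the same convention:

```cpp
#include <cstddef>
#include <vector>

// Visit the n logical elements of a strided vector the way the BLAS reference does:
// a negative increment starts from the far end and walks backwards.
std::vector<double> GatherStrided(std::vector<double> const &v, int n, int inc)
{
  int start = (inc > 0) ? 0 : (1 - n) * inc;  // zero-based form of kx = 1 + (1 - n) * inc
  std::vector<double> out;
  out.reserve(static_cast<std::size_t>(n));
  int ix = start;
  for (int i = 0; i < n; ++i)
  {
    out.push_back(v[static_cast<std::size_t>(ix)]);
    ix += inc;  // e.g. n = 3, inc = -1 visits indices 2, 1, 0
  }
  return out;
}
```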
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_t.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()(Type const &alpha, + Tensor const &a, + Tensor const &x, + int const & incx, + Type const & beta, + Tensor & y, + int const &incy) const +{ + Type temp; + int i; + int j; + int jy; + int kx; + int ky; + int lenx; + int leny; + if ((int(a.height()) == 0) || ((int(a.width()) == 0) || ((alpha == 0.0) && (beta == 1.0)))) + { + return; + } + + lenx = int(a.height()); + leny = int(a.width()); + if (incx > 0) + { + kx = 1; + } + else + { + kx = 1 + (-(-1 + lenx) * incx); + } + + if (incy > 0) + { + ky = 1; + } + else + { + ky = 1 + (-(-1 + leny) * incy); + } + + if (beta != 1.0) + { + if (incy == 1) + { + if (beta == 0.0) + { + + VectorRegisterType vec_zero(0.0); + + auto ret_slice = y.data().slice(0, y.padded_size()); + memory::TrivialRange range(std::size_t(0), std::size_t(leny)); + ret_slice.in_parallel().Apply( + range, [vec_zero](VectorRegisterType &vw_v_y) { vw_v_y = vec_zero; }); + } + else + { + + VectorRegisterType vec_beta(beta); + + auto ret_slice = y.data().slice(0, y.padded_size()); + auto slice_v_y = y.data().slice(0, y.padded_size()); + memory::TrivialRange range(std::size_t(0), std::size_t(leny)); + ret_slice.in_parallel().Apply( + range, + [vec_beta](VectorRegisterType const &vr_v_y, VectorRegisterType &vw_v_y) { + vw_v_y = vec_beta * vr_v_y; + }, + slice_v_y); + } + } + else + { + int iy; + iy = -1 + ky; + if (beta == 0.0) + { + for (i = 0; i < leny; ++i) + { + y[iy] = 0.0; + iy = iy + incy; + } + } + else + { + for (i = 0; i < leny; ++i) + { + y[iy] = beta * y[iy]; + iy = iy + incy; + } + } + } + } + + if (alpha == 0.0) + { + return; + } + + jy = -1 + ky; + if (incx == 1) + { + for (j = 0; j < int(a.width()); ++j) + { + temp = 0.0; + + auto slice_a_j = a.data().slice(a.padded_height() * std::size_t(j), a.padded_height()); + auto slice_v_x = x.data().slice(0, x.padded_size()); + memory::TrivialRange range(std::size_t(0), std::size_t(int(a.height()))); + temp = slice_a_j.in_parallel().SumReduce( + range, + [](VectorRegisterType const &vr_a_j, VectorRegisterType const &vr_v_x) { + return vr_a_j * vr_v_x; + }, + slice_v_x); + y[jy] = y[jy] + alpha * temp; + jy = jy + incy; + } + } + else + { + for (j = 0; j < int(a.width()); ++j) + { + int ix; + temp = 0.0; + ix = -1 + kx; + for (i = 0; i < int(a.height()); ++i) + { + temp = temp + a(i, j) * x[ix]; + ix = ix + incx; + } + + y[jy] = y[jy] + alpha * temp; + jy = jy + incy; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/scal_all.cpp b/libs/math/src/math/linalg/blas/scal_all.cpp new file mode 100644 index 0000000000..1a8b501c1d --- /dev/null +++ b/libs/math/src/math/linalg/blas/scal_all.cpp @@ -0,0 +1,97 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/scal_all.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()( + int const &n, Type const &da, Tensor &dx, int const &incx) const +{ + int i; + if ((n <= 0) || (incx <= 0)) + { + return; + } + + if (incx == 1) + { + int m; + int mp1; + m = n % 5; + if (m != 0) + { + for (i = 0; i < m; ++i) + { + dx[i] = da * dx[i]; + } + + if (n < 5) + { + return; + } + } + + mp1 = 1 + m; + for (i = mp1 - 1; i < n; i += 5) + { + dx[i] = da * dx[i]; + dx[1 + i] = da * dx[1 + i]; + dx[2 + i] = da * dx[2 + i]; + dx[3 + i] = da * dx[3 + i]; + dx[4 + i] = da * dx[4 + i]; + } + } + else + { + int nincx; + nincx = n * incx; + for (i = 0; i < nincx; i += incx) + { + dx[i] = da * dx[i]; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/src/math/linalg/blas/swap_all.cpp b/libs/math/src/math/linalg/blas/swap_all.cpp new file mode 100644 index 0000000000..939f140476 --- /dev/null +++ b/libs/math/src/math/linalg/blas/swap_all.cpp @@ -0,0 +1,115 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
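scal_all keeps the classic reference-DSCAL structure for its unit-stride path: handle the n mod 5 leftover elements first, then run the remainder of the range in exact blocks of five so the unrolled body needs no tail checks. The same pattern in isolation (illustrative sketch):

```cpp
#include <vector>

// dx <= da * dx for unit stride, unrolled by 5 as in the reference DSCAL:
// the remainder (n % 5) is handled up front so the main loop covers an exact
// multiple of five elements.
void ScalUnrolled(int n, double da, std::vector<double> &dx)
{
  int const m = n % 5;
  for (int i = 0; i < m; ++i)
  {
    dx[i] = da * dx[i];
  }
  for (int i = m; i < n; i += 5)
  {
    dx[i]     = da * dx[i];
    dx[i + 1] = da * dx[i + 1];
    dx[i + 2] = da * dx[i + 2];
    dx[i + 3] = da * dx[i + 3];
    dx[i + 4] = da * dx[i + 4];
  }
}
```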
+// +//------------------------------------------------------------------------------ + +#include "math/linalg/blas/swap_all.hpp" +#include "math/linalg/blas/base.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" +namespace fetch { +namespace math { +namespace linalg { + +template +void Blas::operator()( + int const &n, Tensor &dx, int const &incx, Tensor &dy, int const &incy) const +{ + Type dtemp; + int i; + if ((incx == 1) && (incy == 1)) + { + int m; + int mp1; + m = n % 3; + if (m != 0) + { + for (i = 0; i < m; ++i) + { + dtemp = dx[i]; + dx[i] = dy[i]; + dy[i] = dtemp; + } + + if (n < 3) + { + return; + } + } + + mp1 = 1 + m; + for (i = mp1 - 1; i < n; i += 3) + { + dtemp = dx[i]; + dx[i] = dy[i]; + dy[i] = dtemp; + dtemp = dx[1 + i]; + dx[1 + i] = dy[1 + i]; + dy[1 + i] = dtemp; + dtemp = dx[2 + i]; + dx[2 + i] = dy[2 + i]; + dy[2 + i] = dtemp; + } + } + else + { + int ix; + int iy; + ix = 0; + iy = 0; + if (incx < 0) + { + ix = (1 + (-n)) * incx; + } + + if (incy < 0) + { + iy = (1 + (-n)) * incy; + } + + for (i = 0; i < n; ++i) + { + dtemp = dx[ix]; + dx[ix] = dy[iy]; + dy[iy] = dtemp; + ix = ix + incx; + iy = iy + incy; + } + } + + return; +} + +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; +template class Blas; + +} // namespace linalg +} // namespace math +} // namespace fetch \ No newline at end of file diff --git a/libs/math/tests/CMakeLists.txt b/libs/math/tests/CMakeLists.txt index 3aac4fb170..cacaa25996 100644 --- a/libs/math/tests/CMakeLists.txt +++ b/libs/math/tests/CMakeLists.txt @@ -23,3 +23,5 @@ add_fetch_test(math_loss_gtest fetch-math math/ml_loss_functions) add_fetch_test(math_serializers_gtest fetch-math math/serializers) add_fetch_test(math_tensor_gtest fetch-math math/tensor) add_fetch_test(math_trigonometry_gtest fetch-math math/trigonometry) +add_fetch_test(math_blas_gtest fetch-math math/linalg/blas) +add_fetch_test(math_tensor_iterator fetch-math math/tensor_iterator) diff --git a/libs/math/tests/math/linalg/blas/gemm_nn_novector.cpp b/libs/math/tests/math/linalg/blas/gemm_nn_novector.cpp new file mode 100644 index 0000000000..f02a32bd50 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_nn_novector.cpp @@ -0,0 +1,312 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_nn_novector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_DGEMM, blas_gemm_nn_novector1) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352 0.6011150117432088; + 0.7080725777960455 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.6949293726918103 0.3439876897985273 1.14724886031757; + 0.46641050835051406 0.6463587734018926 1.0206573088309918; + 0.11951756833898089 0.1383506929615121 0.24508576903908222 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nn_novector2) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815; + 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nn_novector3) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982; + 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704 0.9695846277645586; + 0.7751328233611146 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.3215446273993787 1.7825366616659373 0.935519849578753; + 0.8510033072590965 0.7155029064233699 1.169082483203583; + 0.9622147327681025 0.8175735684636156 1.5963388662648617 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + 
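These fixtures can be spot-checked by hand. In blas_gemm_nn_novector1 above, alpha = 1 and beta = 0, so the first output entry is just row 0 of A against column 0 of B:

\[
c_{00} = 1\cdot\bigl(0.3745401\ldots \times 0.0580836\ldots + 0.9507143\ldots \times 0.7080725\ldots\bigr) + 0\cdot c_{00} \approx 0.6949294,
\]

which matches refC(0, 0) = 0.6949293726918103 to the precision shown.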
+TEST(blas_DGEMM, blas_gemm_nn_novector4) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0.5605511018220696); + Type beta = Type(0.11295073736798); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076; + 0.5426960831582485 0.14092422497476265; + 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574 0.1987156815341724; + 0.005522117123602399 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.2805676499561156 0.3699652383624031 0.15941684211800541; + 0.34114496621231255 0.31242878051256023 0.21377760291942613; + 0.5144085679685271 0.4187100166959136 0.12607509666602187 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nn_novector5) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(3.2); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622; + 0.32518332202674705; + 0.7296061783380641; + 0.6375574713552131 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8872127425763265 0.4722149251619493 0.1195942459383017 0.713244787222995 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7607850486168974 0.5612771975694962 0.770967179954561 0.49379559636439074; + 0.5227328293819941 0.42754101835854963 0.02541912674409519 0.10789142699330445; + 0.03142918568673425 0.6364104112637804 0.3143559810763267 0.5085706911647028; + 0.907566473926093 0.24929222914887494 0.41038292303562973 0.7555511385430487 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.8829039313347398 0.4699215800820821 0.11901342805032629 0.7097808636230415; + 0.9232217183213793 0.49138053783927815 0.12444817340640241 0.7421929898477884; + 2.071410875148628 1.1024989660851352 0.2792214423368539 1.6652409709610083; + 1.8100771606754766 0.9634052915918195 0.24399425609298275 1.4551505375973752 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nn_novector6) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1.9); + + Tensor A = Tensor::FromString(R"( + 0.22879816549162246 0.07697990982879299; + 0.289751452913768 0.16122128725400442; + 0.9296976523425731 0.808120379564417; + 0.6334037565104235 0.8714605901877177; + 0.8036720768991145 0.18657005888603584 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8925589984899778 0.5393422419156507 0.8074401551640625 0.8960912999234932 0.3180034749718639; + 0.11005192452767676 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434 + )"); + + Tensor C = Tensor::FromString(R"( + 0.006952130531190703 0.5107473025775657 0.417411003148779 0.22210781047073025 0.1198653673336828; + 0.33761517140362796 0.9429097039125192 0.32320293202075523 0.5187906217433661 0.7030189588951778; + 0.363629602379294 0.9717820827209607 0.9624472949421112 0.25178229582536416 0.49724850589238545; + 0.30087830981676966 0.2848404943774676 0.036886947354532795 0.6095643339798968 0.5026790232288615; + 0.05147875124998935 0.27864646423661144 
0.9082658859666537 0.23956189066697242 0.1448948720912231 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.013209048009262335 0.9704198748973748 0.79308090598268 0.4220048398943875 0.2277441979339973; + 0.6414688256668931 1.7915284374337863 0.6140855708394349 0.9857021813123955 1.3357360219008378; + 0.6908962445206586 1.8463859571698251 1.8286498603900112 0.47838636206819185 0.9447721611955323; + 0.5716687886518623 0.5411969393171885 0.0700851999736123 1.158172234561804 0.9550901441348367; + 0.09780962737497977 0.5294282820495617 1.725705183336642 0.45516759226724757 0.2753002569733239 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nn_novector7) +{ + + Blas + gemm_nn_novector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1.6543015947584463); + Type beta = Type(-4.627066949382162); + + Tensor A = Tensor::FromString(R"( + 0.489452760277563 0.9856504541106007; + 0.2420552715115004 0.6721355474058786; + 0.7616196153287176 0.23763754399239967; + 0.7282163486118596 0.3677831327192532 + )"); + + Tensor B = Tensor::FromString(R"( + 0.6323058305935795 0.6335297107608947 0.5357746840747585 0.0902897700544083 0.835302495589238 0.32078006497173583 0.18651851039985423 0.040775141554763916 0.5908929431882418; + 0.6775643618422824 0.016587828927856152 0.512093058299281 0.22649577519793795 0.6451727904094499 0.17436642900499144 0.690937738102466 0.3867353463005374 0.9367299887367345 + )"); + + Tensor C = Tensor::FromString(R"( + 0.13752094414599325 0.3410663510502585 0.11347352124058907 0.9246936182785628 0.877339353380981 0.2579416277151556 0.659984046034179 0.8172222002012158 0.5552008115994623; + 0.5296505783560065 0.24185229090045168 0.09310276780589921 0.8972157579533268 0.9004180571633305 0.6331014572732679 0.3390297910487007 0.3492095746126609 0.7259556788702394; + 0.8971102599525771 0.8870864242651173 0.7798755458576239 0.6420316461542878 0.08413996499504883 0.16162871409461377 0.8985541885270792 0.6064290596595899 0.009197051616629648; + 0.1014715428660321 0.6635017691080558 0.005061583846218687 0.16080805141749865 0.5487337893665861 0.6918951976926933 0.6519612595026005 0.22426930946055978 0.7121792213475359 + )"); + + gemm_nn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9804724434023767 -1.0381187592800751 0.7437685655674593 -3.8361957710939736 -2.3311664647677253 -0.6494612975514299 -1.7761482730758869 -3.117729707389287 -0.5631064717322287; + -1.444139015804676 -0.8469366512951919 0.35315272019842053 -3.8634783997805653 -3.1144351879110417 -2.607071458451461 -0.7257615107362091 -1.169471595431384 -2.0808687970051003; + -3.0879494969924264 -3.2998723617120542 -2.732171672146779 -2.7679219828578723 0.9167499286116098 -0.2751528478468297 -3.651042338878794 -2.602578267601766 1.0701905274670498; + 0.7044631220126333 -2.296768100287648 0.933591652267415 -0.49749296892703154 -1.1402091090332427 -2.708916277155479 -2.3715882548386444 -0.7532885734530165 -2.013529216904504 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_nn_vector.cpp b/libs/math/tests/math/linalg/blas/gemm_nn_vector.cpp new file mode 100644 index 0000000000..b1459556dc --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_nn_vector.cpp @@ -0,0 +1,310 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 
2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_nn_vector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector1) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352 0.6011150117432088; + 0.7080725777960455 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.6949293726918103 0.3439876897985273 1.14724886031757; + 0.46641050835051406 0.6463587734018926 1.0206573088309918; + 0.11951756833898089 0.1383506929615121 0.24508576903908222 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector2) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815; + 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector3) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982; + 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704 0.9695846277645586; + 0.7751328233611146 0.9394989415641891 0.8948273504276488 + )"); + + 
Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.3215446273993787 1.7825366616659373 0.935519849578753; + 0.8510033072590965 0.7155029064233699 1.169082483203583; + 0.9622147327681025 0.8175735684636156 1.5963388662648617 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector4) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0.593853703092583); + Type beta = Type(0.6813712395017687); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076; + 0.5426960831582485 0.14092422497476265; + 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574 0.1987156815341724; + 0.005522117123602399 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7067269544987002 0.8251753471836429 0.21047949549826864; + 0.5627663480815914 0.39607513424909957 0.7112920949581767; + 0.89508263960017 0.6294545181153819 0.16926664059290447 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector5) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622; + 0.32518332202674705; + 0.7296061783380641; + 0.6375574713552131 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8872127425763265 0.4722149251619493 0.1195942459383017 0.713244787222995 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7607850486168974 0.5612771975694962 0.770967179954561 0.49379559636439074; + 0.5227328293819941 0.42754101835854963 0.02541912674409519 0.10789142699330445; + 0.03142918568673425 0.6364104112637804 0.3143559810763267 0.5085706911647028; + 0.907566473926093 0.24929222914887494 0.41038292303562973 0.7555511385430487 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.2759074785421062 0.14685049377565065 0.037191696265726965 0.22180651988220046; + 0.28850678697543103 0.1535564180747744 0.03889005418950075 0.23193530932743386; + 0.6473158984839462 0.34453092690160475 0.08725670073026684 0.5203878034253151; + 0.5656491127110864 0.3010641536224436 0.0762482050290571 0.4547345429991797 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector6) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.22879816549162246 0.07697990982879299; + 0.289751452913768 0.16122128725400442; + 0.9296976523425731 0.808120379564417; + 0.6334037565104235 0.8714605901877177 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8036720768991145 0.18657005888603584 0.8925589984899778 0.5393422419156507; + 0.8074401551640625 0.8960912999234932 0.3180034749718639 0.11005192452767676 + )"); + + Tensor C = 
Tensor::FromString(R"( + 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434; + 0.006952130531190703 0.5107473025775657 0.417411003148779 0.22210781047073025; + 0.1198653673336828 0.33761517140362796 0.9429097039125192 0.32320293202075523; + 0.5187906217433661 0.7030189588951778 0.363629602379294 0.9717820827209607 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434; + 0.006952130531190703 0.5107473025775657 0.417411003148779 0.22210781047073025; + 0.1198653673336828 0.33761517140362796 0.9429097039125192 0.32320293202075523; + 0.5187906217433661 0.7030189588951778 0.363629602379294 0.9717820827209607 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nn_vector7) +{ + + Blas + gemm_nn_vector; + // Compuing _C <= _alpha * _A * _B + _beta * _C + using Type = double; + Type alpha = Type(0.7823387591340788); + Type beta = Type(0.6340514568893103); + + Tensor A = Tensor::FromString(R"( + 0.9624472949421112 0.25178229582536416 0.49724850589238545; + 0.30087830981676966 0.2848404943774676 0.036886947354532795; + 0.6095643339798968 0.5026790232288615 0.05147875124998935; + 0.27864646423661144 0.9082658859666537 0.23956189066697242 + )"); + + Tensor B = Tensor::FromString(R"( + 0.1448948720912231 0.489452760277563 0.9856504541106007 0.2420552715115004; + 0.6721355474058786 0.7616196153287176 0.23763754399239967 0.7282163486118596; + 0.3677831327192532 0.6323058305935795 0.6335297107608947 0.5357746840747585 + )"); + + Tensor C = Tensor::FromString(R"( + 0.0902897700544083 0.835302495589238 0.32078006497173583 0.18651851039985423; + 0.040775141554763916 0.5908929431882418 0.6775643618422824 0.016587828927856152; + 0.512093058299281 0.22649577519793795 0.6451727904094499 0.17436642900499144; + 0.690937738102466 0.3867353463005374 0.9367299887367345 0.13752094414599325 + )"); + + gemm_nn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.4418188079525274 1.2941637128048789 1.2388095633962077 0.652388933164108; + 0.2203535485500414 0.6778360039580416 0.7328597372771557 0.2452330734857248; + 0.6729312663553488 0.7020069761489867 0.9980847167200914 0.5339499143517745; + 1.0162063895079476 1.0115999357028962 1.0963965130938047 0.7578263949128391 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_nt_novector.cpp b/libs/math/tests/math/linalg/blas/gemm_nt_novector.cpp new file mode 100644 index 0000000000..a0277f8259 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_nt_novector.cpp @@ -0,0 +1,274 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_nt_novector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_DGEMM, blas_gemm_nt_novector1) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352; + 0.6011150117432088 0.7080725777960455; + 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.8452406966637934 0.898316417626484 0.9298168913182975; + 0.5610605507028993 0.8639062030527893 0.5957124870228502; + 0.14418085858929 0.20424058901822734 0.15451218697159372 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nt_novector2) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815; + 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425; + 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nt_novector3) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982; + 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146; + 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7847924646207151 1.6600609070711798 0.9344852473235858; + 0.459930734595923 0.809679149854067 1.1612969098822274; + 0.6552298300637807 0.9767010930495218 1.5869608246444562 + )"); + + 
ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nt_novector4) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.9510566030240352); + Type beta = Type(0.48499806063623685); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076; + 0.5426960831582485 0.14092422497476265; + 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574; + 0.1987156815341724 0.005522117123602399; + 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.8947424298091955 0.44296280877675565 0.5014529575942401; + 0.7867233533282483 0.15950043882515413 0.9342293308910294; + 1.1099823140804912 0.3124835247436316 0.7030870417676118 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nt_novector5) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.5546247678262307); + Type beta = Type(0.2600011614046065); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622; + 0.32518332202674705; + 0.7296061783380641; + 0.6375574713552131 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8872127425763265; + 0.4722149251619493; + 0.1195942459383017; + 0.713244787222995 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7607850486168974 0.5612771975694962 0.770967179954561 0.49379559636439074; + 0.5227328293819941 0.42754101835854963 0.02541912674409519 0.10789142699330445; + 0.03142918568673425 0.6364104112637804 0.3143559810763267 0.5085706911647028; + 0.907566473926093 0.24929222914887494 0.41038292303562973 0.7555511385430487 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.35083011744758974 0.2273796442534794 0.22107979809946263 0.2514068181432315; + 0.2959241524861747 0.19632735404428048 0.028178389750957027 0.15668896341029248; + 0.36718905468743857 0.3565528314003248 0.13012764755813658 0.4208489350135337; + 0.5496913450811673 0.2317939054114992 0.1489891796212886 0.4486512138552526 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_nt_novector6) +{ + + Blas + gemm_nt_novector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.5251502923224306); + Type beta = Type(0.5431595582518957); + + Tensor A = Tensor::FromString(R"( + 0.22879816549162246 0.07697990982879299 0.289751452913768 0.16122128725400442; + 0.9296976523425731 0.808120379564417 0.6334037565104235 0.8714605901877177; + 0.8036720768991145 0.18657005888603584 0.8925589984899778 0.5393422419156507 + )"); + + Tensor B = Tensor::FromString(R"( + 0.8074401551640625 0.8960912999234932 0.3180034749718639 0.11005192452767676; + 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434; + 0.006952130531190703 0.5107473025775657 0.417411003148779 0.22210781047073025; + 0.1198653673336828 0.33761517140362796 0.9429097039125192 0.32320293202075523; + 0.5187906217433661 0.7030189588951778 0.363629602379294 0.9717820827209607 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9624472949421112 0.25178229582536416 0.49724850589238545 0.30087830981676966 
0.2848404943774676; + 0.036886947354532795 0.6095643339798968 0.5026790232288615 0.05147875124998935 0.27864646423661144; + 0.9082658859666537 0.23956189066697242 0.1448948720912231 0.489452760277563 0.9856504541106007 + )"); + + gemm_nt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.713710525998093 0.37875715408302013 0.37388746643042947 0.3623157360281193 0.38307588415979704; + 0.9506834080029599 1.2896431649632367 0.7336739529388108 0.6913170652246228 1.2686784279492263; + 1.1021365799489837 0.8953827035590771 0.39023775906225766 0.8830290294199696 1.2688868997178826 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_nt_vector.cpp b/libs/math/tests/math/linalg/blas/gemm_nt_vector.cpp new file mode 100644 index 0000000000..ff16c16651 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_nt_vector.cpp @@ -0,0 +1,333 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_nt_vector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector1) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352; + 0.6011150117432088 0.7080725777960455; + 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.8452406966637934 0.898316417626484 0.9298168913182975; + 0.5610605507028993 0.8639062030527893 0.5957124870228502; + 0.14418085858929 0.20424058901822734 0.15451218697159372 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector2) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815; + 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425; + 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 
0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector3) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982; + 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146; + 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7847924646207151 1.6600609070711798 0.9344852473235858; + 0.459930734595923 0.809679149854067 1.1612969098822274; + 0.6552298300637807 0.9767010930495218 1.5869608246444562 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector4) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.9631033915945032); + Type beta = Type(0.22793800793408114); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076; + 0.5426960831582485 0.14092422497476265; + 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574; + 0.1987156815341724 0.005522117123602399; + 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7141987374973957 0.2455727283581162 0.4883159288693666; + 0.7023391772178528 0.13102365953662803 0.7188912182468313; + 0.9599879886849823 0.22934818838829452 0.6952640901287127 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector5) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.013902056198525137); + Type beta = Type(0.3005264928764638); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622 0.32518332202674705 0.7296061783380641 0.6375574713552131 0.8872127425763265 0.4722149251619493 0.1195942459383017 0.713244787222995 0.7607850486168974 0.5612771975694962 0.770967179954561 0.49379559636439074; + 0.5227328293819941 0.42754101835854963 0.02541912674409519 0.10789142699330445 0.03142918568673425 0.6364104112637804 0.3143559810763267 0.5085706911647028 0.907566473926093 0.24929222914887494 0.41038292303562973 0.7555511385430487; + 0.22879816549162246 0.07697990982879299 
0.289751452913768 0.16122128725400442 0.9296976523425731 0.808120379564417 0.6334037565104235 0.8714605901877177 0.8036720768991145 0.18657005888603584 0.8925589984899778 0.5393422419156507; + 0.8074401551640625 0.8960912999234932 0.3180034749718639 0.11005192452767676 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434 0.006952130531190703 0.5107473025775657 0.417411003148779 0.22210781047073025; + 0.1198653673336828 0.33761517140362796 0.9429097039125192 0.32320293202075523 0.5187906217433661 0.7030189588951778 0.363629602379294 0.9717820827209607 0.9624472949421112 0.25178229582536416 0.49724850589238545 0.30087830981676966; + 0.2848404943774676 0.036886947354532795 0.6095643339798968 0.5026790232288615 0.05147875124998935 0.27864646423661144 0.9082658859666537 0.23956189066697242 0.1448948720912231 0.489452760277563 0.9856504541106007 0.2420552715115004; + 0.6721355474058786 0.7616196153287176 0.23763754399239967 0.7282163486118596 0.3677831327192532 0.6323058305935795 0.6335297107608947 0.5357746840747585 0.0902897700544083 0.835302495589238 0.32078006497173583 0.18651851039985423; + 0.040775141554763916 0.5908929431882418 0.6775643618422824 0.016587828927856152 0.512093058299281 0.22649577519793795 0.6451727904094499 0.17436642900499144 0.690937738102466 0.3867353463005374 0.9367299887367345 0.13752094414599325; + 0.3410663510502585 0.11347352124058907 0.9246936182785628 0.877339353380981 0.2579416277151556 0.659984046034179 0.8172222002012158 0.5552008115994623 0.5296505783560065 0.24185229090045168 0.09310276780589921 0.8972157579533268 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9004180571633305 0.6331014572732679 0.3390297910487007 0.3492095746126609 0.7259556788702394 0.8971102599525771 0.8870864242651173 0.7798755458576239 0.6420316461542878 0.08413996499504883 0.16162871409461377 0.8985541885270792; + 0.6064290596595899 0.009197051616629648 0.1014715428660321 0.6635017691080558 0.005061583846218687 0.16080805141749865 0.5487337893665861 0.6918951976926933 0.6519612595026005 0.22426930946055978 0.7121792213475359 0.23724908749680007 + )"); + + Tensor C = Tensor::FromString(R"( + 0.3253996981592677 0.7464914051180241; + 0.6496328990472147 0.8492234104941779; + 0.6576128923003434 0.5683086033354716; + 0.09367476782809248 0.3677158030594335; + 0.26520236768172545 0.24398964337908358; + 0.9730105547524456 0.3930977246667604; + 0.8920465551771133 0.6311386259972629; + 0.7948113035416484 0.5026370931051921; + 0.5769038846263591 0.4925176938188639 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.15047880768416744 0.26071205879270626; + 0.24257915690765156 0.2849798135054769; + 0.2561753226794971 0.20819447917180442; + 0.07959797148696499 0.1409047985198379; + 0.13297009457720543 0.10784362760093887; + 0.326516290013295 0.14929490297415324; + 0.3179089084352911 0.2211297319779776; + 0.2761289558999083 0.17692124552668761; + 0.23061982028974742 0.1828053199688841 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector6) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.3253815540007322); + Type beta = Type(0.0943238921266506); + + Tensor A = Tensor::FromString(R"( + 0.1952429877980445 0.7224521152615053 0.2807723624408558 0.02431596643145384 0.6454722959071678 0.17711067940704894 0.9404585843529143 0.9539285770025874; + 0.9148643902204485 0.3701587002554444 0.015456616528867428 0.9283185625877254 
0.42818414831731433 0.9666548190436696 0.9636199770892528 0.8530094554673601; + 0.2944488920695857 0.38509772860192526 0.8511366715168569 0.31692200515627766 0.1694927466860925 0.5568012624583502 0.936154774160781 0.696029796674973; + 0.570061170089365 0.09717649377076854 0.6150072266991697 0.9900538501042633 0.14008401523652403 0.5183296523637367 0.8773730719279554 0.7407686177542044; + 0.697015740995268 0.7024840839871093 0.35949115121975517 0.29359184426449336 0.8093611554785136 0.8101133946791808 0.8670723185801037 0.9132405525564713; + 0.5113423988609378 0.5015162946871996 0.7982951789667752 0.6499639307777652 0.7019668772577033 0.795792669436101 0.8900053418175663 0.3379951568515358; + 0.375582952639944 0.093981939840869 0.578280140996174 0.035942273796742086 0.46559801813246016 0.5426446347075766 0.2865412521282844 0.5908332605690108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.03050024993904943 0.03734818874921442 0.8226005606596583 0.3601906414112629 0.12706051265188478 0.5222432600548044 0.7699935530986108 0.21582102749684318; + 0.6228904758190003 0.085347464993768 0.0516817211686077 0.531354631568148 0.5406351216101065 0.6374299014982066 0.7260913337226615 0.9758520794625346; + 0.5163003483011953 0.32295647294124596 0.7951861947687037 0.2708322512620742 0.4389714207056361 0.07845638134226596 0.02535074341545751 0.9626484146779251; + 0.8359801205122058 0.695974206093698 0.4089529444142699 0.17329432007084578 0.15643704267108605 0.25024289816459533 0.5492266647061205 0.7145959227000623; + 0.6601973767177313 0.27993389694594284 0.9548652806631941 0.7378969166957685 0.5543540525114007 0.6117207462343522 0.4196000624277899 0.24773098950115746; + 0.3559726786512616 0.7578461104643691 0.014393488629755868 0.11607264050691624 0.04600264202175275 0.040728802318970136 0.8554605840110072 0.7036578593800237; + 0.4741738290873252 0.09783416065100148 0.49161587511683236 0.4734717707805657 0.17320186991001518 0.43385164923797304 0.39850473439737344 0.6158500980522165 + )"); + + Tensor C = Tensor::FromString(R"( + 0.6350936508676438 0.04530400977204452 0.3746126146264712 0.6258599157142364 0.5031362585800877 0.8564898411883223 0.658693631618945; + 0.1629344270814297 0.07056874740042984 0.6424192782063156 0.026511310541621813 0.5857755812734633 0.9402302414249576 0.575474177875879; + 0.3881699262065219 0.6432882184423532 0.45825289049151663 0.5456167893159349 0.9414648087765252 0.38610263780077425 0.9611905638239142; + 0.9053506419560637 0.19579113478929644 0.06936130087516545 0.10077800137742665 0.018221825651549728 0.0944429607559284 0.6830067734163568; + 0.07118864846022899 0.3189756302937613 0.8448753109694546 0.023271935735825866 0.8144684825889358 0.28185477477339993 0.11816482762165625; + 0.6967371653641506 0.628942846779884 0.877472013527053 0.7350710438038858 0.8034809303848486 0.2820345725713065 0.17743954377972282; + 0.7506147516408583 0.806834739267264 0.9905051420006733 0.4126176769114265 0.37201808579278317 0.7764129607419968 0.34080354025301784 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.5080176762391384 0.748199053223754 0.6221133638460504 0.7516272700551815 0.6052518234707122 0.7759794409068939 0.5383901157564123; + 0.6251751075152086 1.1374453502458208 0.6999627473177347 0.8606320071820817 0.9831476789259297 0.8038094074873847 0.8091760712274642; + 0.6942208361209999 0.7876365240381099 0.6454696557116189 0.7329955167169262 0.8529864372803924 0.6112971994789692 0.6823433421830908; + 0.7385893066048945 0.8928657977293024 
0.631402323910047 0.7026079935057677 0.8697550352546449 0.5619599756390165 0.7496467820560906; + 0.6052909378637183 1.0528982870080301 0.8189826598536917 0.8897225091629477 0.9719551945964776 0.7666101769883427 0.69918984885943; + 0.7777026899201367 0.9088176463758797 0.7189883500385287 0.8030423197545428 1.069167015788816 0.5839586940501921 0.6744592858925714; + 0.45940599899577955 0.6205290574600508 0.5869759209715997 0.4978117288527927 0.5913698187883578 0.373172869953628 0.44951872011336214 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_nt_vector7) +{ + + Blas + gemm_nt_vector; + // Compuing _C <= _alpha * _A * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.5509327753225545); + Type beta = Type(0.6475798043554243); + + Tensor A = Tensor::FromString(R"( + 0.9307573256035647 0.8584127518430118 + )"); + + Tensor B = Tensor::FromString(R"( + 0.42899402737501835 0.7508710677914974 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7545428740846823 + )"); + + gemm_nt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.0637160494184412 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_tn_novector.cpp b/libs/math/tests/math/linalg/blas/gemm_tn_novector.cpp new file mode 100644 index 0000000000..faa247b3e4 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_tn_novector.cpp @@ -0,0 +1,330 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_tn_novector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_DGEMM, blas_gemm_tn_novector1) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352 0.6011150117432088; + 0.7080725777960455 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.4456482991294304 0.33674079873438223 0.8057864498423065; + 0.1656934419785827 0.8266976184734581 0.7228126579480724; + 0.15297229436215787 0.6372467595628419 0.591313169085288 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tn_novector2) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815 0.3663618432936917; + 0.45606998421703593 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tn_novector3) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704 0.9695846277645586; + 0.7751328233611146 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.1302433056071457 1.5506700912834535 0.7146781438045392; + 0.9347351599342958 0.5061714427949007 1.4859210106475778; + 0.9332767593138604 0.8077890198114085 1.545017690717192 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} 
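
The tests above all exercise the same contract stated in their comments: gemm_tn_novector overwrites C with alpha * T(A) * B + beta * C. A minimal reference sketch of that update, written here with plain nested loops over row-major arrays (the function name, raw-pointer interface, and shape parameters are illustrative only and are not taken from the library), looks like this:

#include <cstddef>

// Reference semantics of C <= alpha * T(A) * B + beta * C for row-major
// storage. A is k x n, B is k x m, C is n x m; shapes are assumptions made
// for this sketch, matching the 2x3 / 2x3 / 3x3 layout of the tests above.
void gemm_tn_reference(double alpha, double const *A, double const *B,
                       double beta, double *C, std::size_t k, std::size_t n,
                       std::size_t m)
{
  for (std::size_t i = 0; i < n; ++i)
  {
    for (std::size_t j = 0; j < m; ++j)
    {
      double sum = 0.0;
      for (std::size_t p = 0; p < k; ++p)
      {
        // T(A)(i, p) is A(p, i); both operands are stored row-major.
        sum += A[p * n + i] * B[p * m + j];
      }
      C[i * m + j] = alpha * sum + beta * C[i * m + j];
    }
  }
}

With alpha = 1 and beta = 0 this reduces to C = T(A) * B, which is what the first test in each of these files checks; the later tests vary alpha and beta to cover the scaled accumulation path.
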
+ +TEST(blas_DGEMM, blas_gemm_tn_novector4) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(0.607788294419729); + Type beta = Type(0.22914210685028136); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076 0.5426960831582485; + 0.14092422497476265 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574 0.1987156815341724; + 0.005522117123602399 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.38150640320989604 0.4140227077201213 0.12059817918140146; + 0.25334165634226297 0.5560014894603751 0.576343344582955; + 0.4685931158755239 0.36749260637166836 0.11213755365181256 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tn_novector5) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(0.13847194714175437); + Type beta = Type(0.1056287791628393); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622 0.32518332202674705 0.7296061783380641 0.6375574713552131 0.8872127425763265; + 0.4722149251619493 0.1195942459383017 0.713244787222995 0.7607850486168974 0.5612771975694962; + 0.770967179954561 0.49379559636439074 0.5227328293819941 0.42754101835854963 0.02541912674409519 + )"); + + Tensor B = Tensor::FromString(R"( + 0.10789142699330445 0.03142918568673425 0.6364104112637804 0.3143559810763267 0.5085706911647028; + 0.907566473926093 0.24929222914887494 0.41038292303562973 0.7555511385430487 0.22879816549162246; + 0.07697990982879299 0.289751452913768 0.16122128725400442 0.9296976523425731 0.808120379564417 + )"); + + Tensor C = Tensor::FromString(R"( + 0.6334037565104235 0.8714605901877177 0.8036720768991145 0.18657005888603584 0.8925589984899778; + 0.5393422419156507 0.8074401551640625 0.8960912999234932 0.3180034749718639 0.11005192452767676; + 0.22793516254194168 0.4271077886262563 0.8180147659224931 0.8607305832563434 0.006952130531190703; + 0.5107473025775657 0.417411003148779 0.22210781047073025 0.1198653673336828 0.33761517140362796; + 0.9429097039125192 0.32320293202075523 0.5187906217433661 0.7030189588951778 0.363629602379294 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.139114319541609 0.14063867256018112 0.15634209960889672 0.18190047454688124 0.2174136998012278; + 0.08212163086848083 0.11064481443847864 0.14112973354457856 0.12382743222252708 0.0935706781801081; + 0.13018413669876633 0.09388468129863171 0.20290347296126202 0.2645939317842899 0.13320719861149904; + 0.16364183671114718 0.09028158055632482 0.13242318367980238 0.17504948585637226 0.15250642900502132; + 0.18366136593025872 0.05839589013218685 0.1654477367862942 0.17487363186395466 0.12151664021519772 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tn_novector6) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(8.60965278921923); + Type beta = Type(0.8743259268504366); + + Tensor A = Tensor::FromString(R"( + 0.9717820827209607 0.9624472949421112 0.25178229582536416; + 0.49724850589238545 
0.30087830981676966 0.2848404943774676 + )"); + + Tensor B = Tensor::FromString(R"( + 0.036886947354532795 0.6095643339798968 0.5026790232288615; + 0.05147875124998935 0.27864646423661144 0.9082658859666537 + )"); + + Tensor C = Tensor::FromString(R"( + 0.23956189066697242 0.1448948720912231 0.489452760277563; + 0.9856504541106007 0.2420552715115004 0.6721355474058786; + 0.7616196153287176 0.23763754399239967 0.7282163486118596 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7384650135692088 6.419654792266364 8.522119675616448; + 1.30079095654607 5.984512617898074 7.105857773549434; + 0.8721111011896757 2.2125071070469877 3.953796037233847 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tn_novector7) +{ + + Blas + gemm_tn_novector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(5.695027257002394); + Type beta = Type(-2.895708179245069); + + Tensor A = Tensor::FromString(R"( + 0.3677831327192532 0.6323058305935795 0.6335297107608947 0.5357746840747585 0.0902897700544083 0.835302495589238 0.32078006497173583 0.18651851039985423 0.040775141554763916 0.5908929431882418; + 0.6775643618422824 0.016587828927856152 0.512093058299281 0.22649577519793795 0.6451727904094499 0.17436642900499144 0.690937738102466 0.3867353463005374 0.9367299887367345 0.13752094414599325; + 0.3410663510502585 0.11347352124058907 0.9246936182785628 0.877339353380981 0.2579416277151556 0.659984046034179 0.8172222002012158 0.5552008115994623 0.5296505783560065 0.24185229090045168; + 0.09310276780589921 0.8972157579533268 0.9004180571633305 0.6331014572732679 0.3390297910487007 0.3492095746126609 0.7259556788702394 0.8971102599525771 0.8870864242651173 0.7798755458576239; + 0.6420316461542878 0.08413996499504883 0.16162871409461377 0.8985541885270792 0.6064290596595899 0.009197051616629648 0.1014715428660321 0.6635017691080558 0.005061583846218687 0.16080805141749865; + 0.5487337893665861 0.6918951976926933 0.6519612595026005 0.22426930946055978 0.7121792213475359 0.23724908749680007 0.3253996981592677 0.7464914051180241 0.6496328990472147 0.8492234104941779; + 0.6576128923003434 0.5683086033354716 0.09367476782809248 0.3677158030594335 0.26520236768172545 0.24398964337908358 0.9730105547524456 0.3930977246667604 0.8920465551771133 0.6311386259972629; + 0.7948113035416484 0.5026370931051921 0.5769038846263591 0.4925176938188639 0.1952429877980445 0.7224521152615053 0.2807723624408558 0.02431596643145384 0.6454722959071678 0.17711067940704894; + 0.9404585843529143 0.9539285770025874 0.9148643902204485 0.3701587002554444 0.015456616528867428 0.9283185625877254 0.42818414831731433 0.9666548190436696 0.9636199770892528 0.8530094554673601; + 0.2944488920695857 0.38509772860192526 0.8511366715168569 0.31692200515627766 0.1694927466860925 0.5568012624583502 0.936154774160781 0.696029796674973 0.570061170089365 0.09717649377076854 + )"); + + Tensor B = Tensor::FromString(R"( + 0.6150072266991697 0.9900538501042633 0.14008401523652403 0.5183296523637367 0.8773730719279554 0.7407686177542044 0.697015740995268 0.7024840839871093 0.35949115121975517 0.29359184426449336; + 0.8093611554785136 0.8101133946791808 0.8670723185801037 0.9132405525564713 0.5113423988609378 0.5015162946871996 0.7982951789667752 0.6499639307777652 0.7019668772577033 0.795792669436101; + 0.8900053418175663 0.3379951568515358 0.375582952639944 0.093981939840869 0.578280140996174 0.035942273796742086 0.46559801813246016 0.5426446347075766 
0.2865412521282844 0.5908332605690108; + 0.03050024993904943 0.03734818874921442 0.8226005606596583 0.3601906414112629 0.12706051265188478 0.5222432600548044 0.7699935530986108 0.21582102749684318 0.6228904758190003 0.085347464993768; + 0.0516817211686077 0.531354631568148 0.5406351216101065 0.6374299014982066 0.7260913337226615 0.9758520794625346 0.5163003483011953 0.32295647294124596 0.7951861947687037 0.2708322512620742; + 0.4389714207056361 0.07845638134226596 0.02535074341545751 0.9626484146779251 0.8359801205122058 0.695974206093698 0.4089529444142699 0.17329432007084578 0.15643704267108605 0.25024289816459533; + 0.5492266647061205 0.7145959227000623 0.6601973767177313 0.27993389694594284 0.9548652806631941 0.7378969166957685 0.5543540525114007 0.6117207462343522 0.4196000624277899 0.24773098950115746; + 0.3559726786512616 0.7578461104643691 0.014393488629755868 0.11607264050691624 0.04600264202175275 0.040728802318970136 0.8554605840110072 0.7036578593800237 0.4741738290873252 0.09783416065100148; + 0.49161587511683236 0.4734717707805657 0.17320186991001518 0.43385164923797304 0.39850473439737344 0.6158500980522165 0.6350936508676438 0.04530400977204452 0.3746126146264712 0.6258599157142364; + 0.5031362585800877 0.8564898411883223 0.658693631618945 0.1629344270814297 0.07056874740042984 0.6424192782063156 0.026511310541621813 0.5857755812734633 0.9402302414249576 0.575474177875879 + )"); + + Tensor C = Tensor::FromString(R"( + 0.3881699262065219 0.6432882184423532 0.45825289049151663 0.5456167893159349 0.9414648087765252 0.38610263780077425 0.9611905638239142 0.9053506419560637 0.19579113478929644 0.06936130087516545; + 0.10077800137742665 0.018221825651549728 0.0944429607559284 0.6830067734163568 0.07118864846022899 0.3189756302937613 0.8448753109694546 0.023271935735825866 0.8144684825889358 0.28185477477339993; + 0.11816482762165625 0.6967371653641506 0.628942846779884 0.877472013527053 0.7350710438038858 0.8034809303848486 0.2820345725713065 0.17743954377972282 0.7506147516408583 0.806834739267264; + 0.9905051420006733 0.4126176769114265 0.37201808579278317 0.7764129607419968 0.34080354025301784 0.9307573256035647 0.8584127518430118 0.42899402737501835 0.7508710677914974 0.7545428740846823; + 0.10312386883593261 0.9025529066795667 0.5052523724478571 0.8264574661077416 0.32004960103061175 0.8955232284962005 0.3892016787341631 0.01083765148029836 0.9053819764192637 0.09128667678613356; + 0.31931363759041487 0.9500619670508049 0.9506071469375561 0.5734378881232861 0.6318372121697993 0.44844552197831977 0.29321077169806453 0.32866454536991596 0.6725184560770384 0.75237452943768; + 0.7915790437258485 0.7896181427945539 0.09120610304869037 0.49442030470258147 0.057558760016644284 0.5495288823237355 0.441530501373377 0.8877041827582998 0.3509150125520787 0.11706701642760586; + 0.14299168205283586 0.7615106317174722 0.6182180633162611 0.10112267612279024 0.08410680611499743 0.70096913145912 0.07276300636419353 0.8218600592903562 0.7062422271564962 0.08134878064189976; + 0.08483771408519192 0.9866395785011755 0.3742707957561203 0.3706421470668909 0.8127995672575026 0.9472485773838587 0.9860010638228709 0.7533781852589416 0.37625958553091576 0.08350071669866876; + 0.7771469159274368 0.558404249735805 0.4242220092469763 0.906354385094736 0.11119748230615134 0.49262510429085915 0.011353644767419069 0.46866064199412627 0.05630327568183735 0.11881791626807192 + )"); + + gemm_tn_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 13.73792001678889 16.28003537315356 
10.103829026430203 12.912919052533724 13.579608845872594 15.782115917219876 15.629768809041057 10.949644116596781 14.478809436913291 12.137721317583875; + 11.05554451989334 13.49475142901783 9.681262981862425 9.927383057718448 13.208358381819526 14.952368951400098 13.984326941323262 10.339610052262412 10.18035431753887 7.9939878315947706; + 17.220693548207613 16.158598735098767 12.495519959552867 12.076437456504888 13.076303187926298 15.008410649243672 18.964078939883482 14.534976409898382 15.477629238918826 12.275153209823232; + 9.527401983760292 13.679918736008279 11.086986930866242 8.891421659117274 13.57673709393861 12.274403369619375 13.884793891253983 11.510542119727267 12.470036726926022 7.853941799703348; + 8.07089932646108 6.384429136658188 7.568978486220537 8.915509052001928 9.992801226822108 8.71867272476307 9.646926176511396 8.025784611362685 7.248235034168124 6.887127901772533; + 13.228674286750563 13.57172395211795 5.865912272798168 7.7926191900132995 10.292344752547624 11.374406619146432 14.852024092912751 11.597481581999508 10.524523842559887 8.631355375934845; + 14.621626107760484 15.782317509066655 16.527484664932132 10.851019608171333 15.34649804259431 14.996333167240067 15.672363240564941 12.944313974138169 16.070639941855887 13.17509734287993; + 13.032854700335582 11.937947998354177 12.853720719567056 14.584699330212077 15.207474160637275 16.884054526237797 16.34438236699445 8.611615295581485 14.435346908139653 12.51514586958608; + 17.109164444397933 15.0034667648216 15.468186092849107 14.360671115496404 13.514045600112217 14.818006256885898 17.919908510688764 12.671179934035845 16.334566020098176 14.139745517563966; + 8.984589079300363 9.953648930856207 8.30403603810394 10.124668234713267 14.553328498134665 14.128367183619806 15.405028619656322 7.808494078649353 10.569123795575319 8.26628090909105 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_tn_vector.cpp b/libs/math/tests/math/linalg/blas/gemm_tn_vector.cpp new file mode 100644 index 0000000000..2e43e0ed1e --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_tn_vector.cpp @@ -0,0 +1,298 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_tn_vector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector1) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352 0.6011150117432088; + 0.7080725777960455 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.4456482991294304 0.33674079873438223 0.8057864498423065; + 0.1656934419785827 0.8266976184734581 0.7228126579480724; + 0.15297229436215787 0.6372467595628419 0.591313169085288 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector2) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815 0.3663618432936917; + 0.45606998421703593 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector3) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704 0.9695846277645586; + 0.7751328233611146 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.1302433056071457 1.5506700912834535 0.7146781438045392; + 0.9347351599342958 0.5061714427949007 1.4859210106475778; + 0.9332767593138604 0.8077890198114085 1.545017690717192 + )"); + + 
ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector4) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(0.10180053730598937); + Type beta = Type(0.8698587835631502); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076 0.5426960831582485; + 0.14092422497476265 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574 0.1987156815341724; + 0.005522117123602399 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.6700539538850498 0.7106411658566169 0.08176597312770231; + 0.3404897810246475 0.18946927699173277 0.814186079607432; + 0.5967455580650104 0.3366873471152119 0.07162973472963013 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector5) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(-1.4106164688738558); + Type beta = Type(-0.3391340332274022); + + Tensor A = Tensor::FromString(R"( + 0.3109823217156622 0.32518332202674705 0.7296061783380641; + 0.6375574713552131 0.8872127425763265 0.4722149251619493 + )"); + + Tensor B = Tensor::FromString(R"( + 0.1195942459383017 0.713244787222995 0.7607850486168974; + 0.5612771975694962 0.770967179954561 0.49379559636439074 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5227328293819941 0.42754101835854963 0.02541912674409519; + 0.10789142699330445 0.03142918568673425 0.6364104112637804; + 0.3143559810763267 0.5085706911647028 0.907566473926093 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + -0.7345238370419157 -1.151246255256524 -0.7864538396683348; + -0.7938965074534533 -1.3027089332941115 -1.1828008769655816; + -0.6035692343326593 -1.4200933833591156 -1.4197066936475742 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector6) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(3); + Type beta = Type(3); + + Tensor A = Tensor::FromString(R"( + 0.24929222914887494 0.41038292303562973 0.7555511385430487; + 0.22879816549162246 0.07697990982879299 0.289751452913768 + )"); + + Tensor B = Tensor::FromString(R"( + 0.16122128725400442 0.9296976523425731 0.808120379564417; + 0.6334037565104235 0.8714605901877177 0.8036720768991145 + )"); + + Tensor C = Tensor::FromString(R"( + 0.18657005888603584 0.8925589984899778 0.5393422419156507; + 0.8074401551640625 0.8960912999234932 0.3180034749718639; + 0.11005192452767676 0.22793516254194168 0.4271077886262563 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 1.1150486714307748 3.9711419490303896 2.7740372088277763; + 2.7670859470329567 4.0341248930495945 2.134526647541353; + 1.2461775311992485 3.54862876330246 3.8116476403012087 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tn_vector7) +{ + + Blas + gemm_tn_vector; + // Compuing _C <= _alpha * T(_A) * _B + _beta * _C + using Type = double; + Type alpha = Type(1.2923016455408614); + Type beta = 
Type(1.7695071276677026); + + Tensor A = Tensor::FromString(R"( + 0.8180147659224931 0.8607305832563434 0.006952130531190703 0.5107473025775657 0.417411003148779; + 0.22210781047073025 0.1198653673336828 0.33761517140362796 0.9429097039125192 0.32320293202075523 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5187906217433661 0.7030189588951778 0.363629602379294 0.9717820827209607 0.9624472949421112 0.25178229582536416 0.49724850589238545; + 0.30087830981676966 0.2848404943774676 0.036886947354532795 0.6095643339798968 0.5026790232288615 0.05147875124998935 0.27864646423661144 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9082658859666537 0.23956189066697242 0.1448948720912231 0.489452760277563 0.9856504541106007 0.2420552715115004 0.6721355474058786; + 0.7616196153287176 0.23763754399239967 0.7282163486118596 0.3677831327192532 0.6323058305935795 0.6335297107608947 0.5357746840747585; + 0.0902897700544083 0.835302495589238 0.32078006497173583 0.18651851039985423 0.040775141554763916 0.5908929431882418 0.6775643618422824; + 0.016587828927856152 0.512093058299281 0.22649577519793795 0.6451727904094499 0.17436642900499144 0.690937738102466 0.3867353463005374; + 0.9367299887367345 0.13752094414599325 0.3410663510502585 0.11347352124058907 0.9246936182785628 0.877339353380981 0.2579416277151556 + )"); + + gemm_tn_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 2.241969037690814 1.2488410094621074 0.6513809700965056 2.0683456266687874 2.905823653086802 0.7092590499163391 1.7949808665989102; + 1.9713610790855722 1.2466082938347656 1.6987716589514612 1.8261538333496201 2.2672885573890618 1.4090728880873125 1.5443212986804262; + 0.2957027357736185 1.6086659080326617 0.5869833493326826 0.6047298764946565 0.3001179507939324 1.0703115618795966 1.3249960036021968; + 0.7384019953549967 1.7172575655213604 0.6857434403331495 2.5258211139253515 1.5563222813129851 1.4515336146625781 1.3520719844332847; + 2.0630664623920816 0.74153837627396 0.8150755369657324 0.979591994041476 2.365372963042061 1.7097763008948856 0.8410396500608759 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_tt_novector.cpp b/libs/math/tests/math/linalg/blas/gemm_tt_novector.cpp new file mode 100644 index 0000000000..fd0d496a0a --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_tt_novector.cpp @@ -0,0 +1,184 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_tt_novector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_DGEMM, blas_gemm_tt_novector1) +{ + + Blas + gemm_tt_novector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352; + 0.6011150117432088 0.7080725777960455; + 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_tt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.5402983414818157 0.649035344064104 0.5883544808430341; + 0.1903605457037474 0.6819611623843438 0.17089398970327166; + 0.17763558461246698 0.5504679890644331 0.16636834727714633 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tt_novector2) +{ + + Blas + gemm_tt_novector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815 0.3663618432936917; + 0.45606998421703593 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425; + 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_tt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tt_novector3) +{ + + Blas + gemm_tt_novector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146; + 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_tt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.7391707329439722 1.4687595852789468 0.7136435415493719; + 0.7507388547039637 1.168507858960929 1.458563482375042; + 0.6262918566095386 0.925379917501852 1.5372321173958363 + )"); + + 
ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_DGEMM, blas_gemm_tt_novector4) +{ + + Blas + gemm_tt_novector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.3164524921858566); + Type beta = Type(0.9199326378866257); + + Tensor A = Tensor::FromString(R"( + 0.3567533266935893 0.28093450968738076 0.5426960831582485; + 0.14092422497476265 0.8021969807540397 0.07455064367977082 + )"); + + Tensor B = Tensor::FromString(R"( + 0.9868869366005173 0.7722447692966574; + 0.1987156815341724 0.005522117123602399; + 0.8154614284548342 0.7068573438476171 + )"); + + Tensor C = Tensor::FromString(R"( + 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935; + 0.6232981268275579 0.3308980248526492 0.06355835028602363 + )"); + + gemm_tt_novector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.8164914513125054 0.7321971300207077 0.19170088505871147; + 0.6135408867757737 0.12565986507531443 1.045934360403664; + 0.7610964084107334 0.33866110913344155 0.2151907232957467 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemm_tt_vector.cpp b/libs/math/tests/math/linalg/blas/gemm_tt_vector.cpp new file mode 100644 index 0000000000..870208e9e7 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemm_tt_vector.cpp @@ -0,0 +1,145 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemm_tt_vector.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemm_vectorised, blas_gemm_tt_vector1) +{ + + Blas + gemm_tt_vector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(1); + Type beta = Type(0); + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor B = Tensor::FromString(R"( + 0.05808361216819946 0.8661761457749352; + 0.6011150117432088 0.7080725777960455; + 0.020584494295802447 0.9699098521619943 + )"); + + Tensor C = Tensor::FromString(R"( + 0.8324426408004217 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 0.5247564316322378; + 0.43194501864211576 0.2912291401980419 0.6118528947223795 + )"); + + gemm_tt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.5402983414818157 0.649035344064104 0.5883544808430341; + 0.1903605457037474 0.6819611623843438 0.17089398970327166; + 0.17763558461246698 0.5504679890644331 0.16636834727714633 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tt_vector2) +{ + + Blas + gemm_tt_vector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + + Tensor A = Tensor::FromString(R"( + 0.13949386065204183 0.29214464853521815 0.3663618432936917; + 0.45606998421703593 0.7851759613930136 0.19967378215835974 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5142344384136116 0.5924145688620425; + 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 0.06505159298527952 + )"); + + Tensor C = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + gemm_tt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.9488855372533332 0.9656320330745594 0.8083973481164611; + 0.3046137691733707 0.09767211400638387 0.6842330265121569; + 0.4401524937396013 0.12203823484477883 0.4951769101112702 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} + +TEST(blas_gemm_vectorised, blas_gemm_tt_vector3) +{ + + Blas + gemm_tt_vector; + // Compuing _C <= _alpha * T(_A) * T(_B) + _beta * _C + using Type = double; + Type alpha = Type(0.21739149379543965); + Type beta = Type(0.17701817626521066); + + Tensor A = Tensor::FromString(R"( + 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 + )"); + + Tensor B = Tensor::FromString(R"( + 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146; + 0.9394989415641891 0.8948273504276488 + )"); + + Tensor C = Tensor::FromString(R"( + 0.5978999788110851 0.9218742350231168 0.0884925020519195; + 0.1959828624191452 0.045227288910538066 0.32533033076326434; + 0.388677289689482 0.2713490317738959 0.8287375091519293 + )"); + + gemm_tt_vector(alpha, A, B, beta, C); + + Tensor refC = Tensor::FromString(R"( + 0.136550224108703 0.28207671905663545 0.15156729965049928; + 0.15529176273944087 0.2521976932750107 0.30394462948286904; + 0.12045833062684462 
0.19021446197270062 0.30072230370480557 + )"); + + ASSERT_TRUE(refC.AllClose(C)); +} diff --git a/libs/math/tests/math/linalg/blas/gemv_n_novector.cpp b/libs/math/tests/math/linalg/blas/gemv_n_novector.cpp new file mode 100644 index 0000000000..d3538d696e --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemv_n_novector.cpp @@ -0,0 +1,244 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_n.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemv, blas_gemv_n_novector1) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor x = Tensor::FromString(R"( + 0.05808361216819946; 0.8661761457749352 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6011150117432088; 0.7080725777960455; 0.020584494295802447 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.4463557084070022; 1.2691331284989449; 0.16476535288509245 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_novector2) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.9699098521619943 0.8324426408004217; + 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 + )"); + + Tensor x = Tensor::FromString(R"( + 0.5247564316322378; 0.43194501864211576 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_novector3) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0.5758929234029931); + Type beta = Type(0.20568904817588307); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.29214464853521815 0.3663618432936917 0.45606998421703593 0.7851759613930136; + 0.19967378215835974 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 0.9488855372533332 + )"); + + Tensor x = Tensor::FromString(R"( + 0.9656320330745594; 
0.8083973481164611; 0.3046137691733707; 0.09767211400638387 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6842330265121569; 0.4401524937396013; 0.12203823484477883 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.5979320892793293; 0.5475123208953713; 0.5071308054777198 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_novector4) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.4951769101112702 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 0.5467102793432796; + 0.18485445552552704 0.9695846277645586 0.7751328233611146 0.9394989415641891 + )"); + + Tensor x = Tensor::FromString(R"( + 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434; 0.388677289689482 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_novector5) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0.19466586480354853); + Type beta = Type(0.9629843888388224); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.7722447692966574 0.1987156815341724 0.005522117123602399 0.8154614284548342; + 0.7068573438476171 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935 0.6232981268275579 + )"); + + Tensor x = Tensor::FromString(R"( + 0.3308980248526492; 0.06355835028602363; 0.3109823217156622; 0.32518332202674705; 0.7296061783380641; 0.6375574713552131; 0.8872127425763265; 0.4722149251619493 + )"); + + Tensor y = Tensor::FromString(R"( + 0.1195942459383017; 0.713244787222995; 0.7607850486168974; 0.5612771975694962; 0.770967179954561; 0.49379559636439074; 0.5227328293819941; 0.42754101835854963; 0.02541912674409519 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.31856364575551; 0.713244787222995; 0.7607850486168974; 0.7524967849563918; 0.770967179954561; 0.49379559636439074; 0.7637243213626628; 0.42754101835854963; 0.02541912674409519 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_novector6) +{ + + Blas + gemv_n_novector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0.12575798950416517); + Type beta = Type(0.1075304955619012); + int n = -2; + int m = -3; + + Tensor A = Tensor::FromString(R"( + 0.10789142699330445 0.03142918568673425 0.6364104112637804 0.3143559810763267; + 0.5085706911647028 0.907566473926093 0.24929222914887494 0.41038292303562973; + 0.7555511385430487 0.22879816549162246 0.07697990982879299 0.289751452913768 + )"); + + Tensor x = Tensor::FromString(R"( + 0.16122128725400442; 0.9296976523425731; 
0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778 + )"); + + Tensor y = Tensor::FromString(R"( + 0.5393422419156507; 0.8074401551640625; 0.8960912999234932; 0.3180034749718639; 0.11005192452767676; 0.22793516254194168; 0.4271077886262563; 0.8180147659224931; 0.8607305832563434 + )"); + + gemv_n_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.11449565185469689; 0.8074401551640625; 0.8960912999234932; 0.1792459803524568; 0.11005192452767676; 0.22793516254194168; 0.1229533406892267; 0.8180147659224931; 0.8607305832563434 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} diff --git a/libs/math/tests/math/linalg/blas/gemv_n_vector.cpp b/libs/math/tests/math/linalg/blas/gemv_n_vector.cpp new file mode 100644 index 0000000000..be1ffdc933 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemv_n_vector.cpp @@ -0,0 +1,244 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_n.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemv, blas_gemv_n_vector1) +{ + + Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162; + 0.7319939418114051 0.5986584841970366; + 0.15601864044243652 0.15599452033620265 + )"); + + Tensor x = Tensor::FromString(R"( + 0.05808361216819946; 0.8661761457749352 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6011150117432088; 0.7080725777960455; 0.020584494295802447 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.4463557084070022; 1.2691331284989449; 0.16476535288509245 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_vector2) +{ + + Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.9699098521619943 0.8324426408004217; + 0.21233911067827616 0.18182496720710062; + 0.18340450985343382 0.3042422429595377 + )"); + + Tensor x = Tensor::FromString(R"( + 0.5247564316322378; 0.43194501864211576 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_vector3) +{ + + 
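+  // A plain strided reference loop that reproduces the expected values in these
+  // gemv_n tests; this is only a sketch assuming positive increments and row-major
+  // double storage, and the helper name ref_gemv_n and raw-vector types are
+  // illustrative rather than part of the library API.
+  //
+  //   void ref_gemv_n(double alpha, std::vector<std::vector<double>> const &A,
+  //                   std::vector<double> const &x, int incx, double beta,
+  //                   std::vector<double> &y, int incy)
+  //   {
+  //     for (std::size_t i = 0; i < A.size(); ++i)
+  //     {
+  //       double acc = 0.0;
+  //       for (std::size_t j = 0; j < A[i].size(); ++j)
+  //       {
+  //         acc += A[i][j] * x[j * static_cast<std::size_t>(incx)];
+  //       }
+  //       std::size_t yi = i * static_cast<std::size_t>(incy);
+  //       y[yi] = alpha * acc + beta * y[yi];
+  //     }
+  //   }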
Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0.6230998535760757); + Type beta = Type(0.26246795255094646); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.29214464853521815 0.3663618432936917 0.45606998421703593 0.7851759613930136; + 0.19967378215835974 0.5142344384136116 0.5924145688620425 0.046450412719997725; + 0.6075448519014384 0.17052412368729153 0.06505159298527952 0.9488855372533332 + )"); + + Tensor x = Tensor::FromString(R"( + 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6842330265121569; 0.4401524937396013; 0.12203823484477883 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.6742589724777102; 0.6099629487218058; 0.553572736133376 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_vector4) +{ + + Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.4951769101112702 0.034388521115218396 0.9093204020787821 0.2587799816000169; + 0.662522284353982 0.31171107608941095 0.5200680211778108 0.5467102793432796; + 0.18485445552552704 0.9695846277645586 0.7751328233611146 0.9394989415641891 + )"); + + Tensor x = Tensor::FromString(R"( + 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434; 0.388677289689482 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_vector5) +{ + + Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A * _x + _beta * _y + using Type = double; + Type alpha = Type(0.9989418228235344); + Type beta = Type(0.34876666289850045); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.7722447692966574 0.1987156815341724 0.005522117123602399 0.8154614284548342; + 0.7068573438476171 0.7290071680409873 0.7712703466859457 0.07404465173409036; + 0.3584657285442726 0.11586905952512971 0.8631034258755935 0.6232981268275579 + )"); + + Tensor x = Tensor::FromString(R"( + 0.3308980248526492; 0.06355835028602363; 0.3109823217156622; 0.32518332202674705; 0.7296061783380641; 0.6375574713552131; 0.8872127425763265; 0.4722149251619493 + )"); + + Tensor y = Tensor::FromString(R"( + 0.1195942459383017; 0.713244787222995; 0.7607850486168974; 0.5612771975694962; 0.770967179954561; 0.49379559636439074; 0.5227328293819941; 0.42754101835854963; 0.02541912674409519 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.085452925716307; 0.713244787222995; 0.7607850486168974; 1.283625405109936; 0.770967179954561; 0.49379559636439074; 1.5182690707816915; 0.42754101835854963; 0.02541912674409519 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_n_vector6) +{ + + Blas + gemv_n_vector; + // Compuing _y <= _alpha * _A 
* _x + _beta * _y + using Type = double; + Type alpha = Type(0.12927713750612646); + Type beta = Type(0.48029120500022227); + int n = -2; + int m = -3; + + Tensor A = Tensor::FromString(R"( + 0.10789142699330445 0.03142918568673425 0.6364104112637804 0.3143559810763267; + 0.5085706911647028 0.907566473926093 0.24929222914887494 0.41038292303562973; + 0.7555511385430487 0.22879816549162246 0.07697990982879299 0.289751452913768 + )"); + + Tensor x = Tensor::FromString(R"( + 0.16122128725400442; 0.9296976523425731; 0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778 + )"); + + Tensor y = Tensor::FromString(R"( + 0.5393422419156507; 0.8074401551640625; 0.8960912999234932; 0.3180034749718639; 0.11005192452767676; 0.22793516254194168; 0.4271077886262563; 0.8180147659224931; 0.8607305832563434 + )"); + + gemv_n_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.3171223135918808; 0.8074401551640625; 0.8960912999234932; 0.30184421259726535; 0.11005192452767676; 0.22793516254194168; 0.2843178060228583; 0.8180147659224931; 0.8607305832563434 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} diff --git a/libs/math/tests/math/linalg/blas/gemv_t_novector.cpp b/libs/math/tests/math/linalg/blas/gemv_t_novector.cpp new file mode 100644 index 0000000000..4a9b52b7ee --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemv_t_novector.cpp @@ -0,0 +1,246 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
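+//
+// The tests below exercise the transposed matrix-vector product
+// _y <= _alpha * T(_A) * _x + _beta * _y with strided access into x and y.
+// As a rough reference sketch (assuming positive increments; the variable
+// names are illustrative only), column j of A is dotted with x and the
+// result accumulated into y:
+//
+//   for (std::size_t j = 0; j < cols; ++j)
+//   {
+//     double acc = 0.0;
+//     for (std::size_t i = 0; i < rows; ++i)
+//     {
+//       acc += A[i][j] * x[i * incx];
+//     }
+//     y[j * incy] = alpha * acc + beta * y[j * incy];
+//   }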
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_t.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemv, blas_gemv_t_novector1) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor x = Tensor::FromString(R"( + 0.05808361216819946; 0.8661761457749352 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6011150117432088; 0.7080725777960455; 0.020584494295802447 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.1414133532250244; 0.8984331234997929; 0.19822007890826943 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_novector2) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.9699098521619943 0.8324426408004217 0.21233911067827616; + 0.18182496720710062 0.18340450985343382 0.3042422429595377 + )"); + + Tensor x = Tensor::FromString(R"( + 0.5247564316322378; 0.43194501864211576 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_novector3) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.9959433156478034); + Type beta = Type(0.3870598118289402); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.29214464853521815 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 0.5142344384136116; + 0.5924145688620425 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 0.06505159298527952 0.9488855372533332 + )"); + + Tensor x = Tensor::FromString(R"( + 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6842330265121569; 0.4401524937396013; 0.12203823484477883 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.3742716392373349; 0.7038818670547331; 1.1764841599475624 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_novector4) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.4951769101112702 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982 0.31171107608941095; + 0.5200680211778108 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146 0.9394989415641891 + )"); + + Tensor x = Tensor::FromString(R"( + 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 
0.32533033076326434; 0.388677289689482 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_novector5) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.9433392779408821); + Type beta = Type(0.7374670463528943); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.7722447692966574 0.1987156815341724 0.005522117123602399; + 0.8154614284548342 0.7068573438476171 0.7290071680409873; + 0.7712703466859457 0.07404465173409036 0.3584657285442726; + 0.11586905952512971 0.8631034258755935 0.6232981268275579 + )"); + + Tensor x = Tensor::FromString(R"( + 0.3308980248526492; 0.06355835028602363; 0.3109823217156622; 0.32518332202674705; 0.7296061783380641; 0.6375574713552131; 0.8872127425763265; 0.4722149251619493 + )"); + + Tensor y = Tensor::FromString(R"( + 0.1195942459383017; 0.713244787222995; 0.7607850486168974; 0.5612771975694962; 0.770967179954561; 0.49379559636439074; 0.5227328293819941; 0.42754101835854963; 0.02541912674409519 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.1962926313501785; 0.713244787222995; 0.7607850486168974; 1.4566478071369535; 0.770967179954561; 0.49379559636439074; 1.369469447659496; 0.42754101835854963; 0.02541912674409519 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_novector6) +{ + + Blas + gemv_t_novector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.17753757946407733); + Type beta = Type(0.8028809727653375); + int n = -2; + int m = -3; + + Tensor A = Tensor::FromString(R"( + 0.10789142699330445 0.03142918568673425 0.6364104112637804; + 0.3143559810763267 0.5085706911647028 0.907566473926093; + 0.24929222914887494 0.41038292303562973 0.7555511385430487; + 0.22879816549162246 0.07697990982879299 0.289751452913768 + )"); + + Tensor x = Tensor::FromString(R"( + 0.16122128725400442; 0.9296976523425731; 0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778 + )"); + + Tensor y = Tensor::FromString(R"( + 0.5393422419156507; 0.8074401551640625; 0.8960912999234932; 0.3180034749718639; 0.11005192452767676; 0.22793516254194168; 0.4271077886262563; 0.8180147659224931; 0.8607305832563434 + )"); + + gemv_t_novector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.7112172788227125; 0.8074401551640625; 0.8960912999234932; 0.3961262418277567; 0.11005192452767676; 0.22793516254194168; 0.43744188280621926; 0.8180147659224931; 0.8607305832563434 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} diff --git a/libs/math/tests/math/linalg/blas/gemv_t_vector.cpp b/libs/math/tests/math/linalg/blas/gemv_t_vector.cpp new file mode 100644 index 0000000000..c0eddbdb5e --- /dev/null +++ b/libs/math/tests/math/linalg/blas/gemv_t_vector.cpp @@ -0,0 +1,246 @@ +//------------------------------------------------------------------------------ +// +// Copyright 
2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/gemv_t.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_gemv, blas_gemv_t_vector1) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(1); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.3745401188473625 0.9507143064099162 0.7319939418114051; + 0.5986584841970366 0.15601864044243652 0.15599452033620265 + )"); + + Tensor x = Tensor::FromString(R"( + 0.05808361216819946; 0.8661761457749352 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6011150117432088; 0.7080725777960455; 0.020584494295802447 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.1414133532250244; 0.8984331234997929; 0.19822007890826943 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_vector2) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.9699098521619943 0.8324426408004217 0.21233911067827616; + 0.18182496720710062 0.18340450985343382 0.3042422429595377 + )"); + + Tensor x = Tensor::FromString(R"( + 0.5247564316322378; 0.43194501864211576 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2912291401980419; 0.6118528947223795; 0.13949386065204183 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_vector3) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.7797736830913918); + Type beta = Type(0.5415935837391784); + int n = 1; + int m = 1; + + Tensor A = Tensor::FromString(R"( + 0.29214464853521815 0.3663618432936917 0.45606998421703593; + 0.7851759613930136 0.19967378215835974 0.5142344384136116; + 0.5924145688620425 0.046450412719997725 0.6075448519014384; + 0.17052412368729153 0.06505159298527952 0.9488855372533332 + )"); + + Tensor x = Tensor::FromString(R"( + 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6842330265121569; 0.4401524937396013; 0.12203823484477883 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.239206267076279; 0.6561004574377036; 0.9502397423779314 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_vector4) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * 
_x + _beta * _y + using Type = double; + Type alpha = Type(0); + Type beta = Type(1); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.4951769101112702 0.034388521115218396 0.9093204020787821; + 0.2587799816000169 0.662522284353982 0.31171107608941095; + 0.5200680211778108 0.5467102793432796 0.18485445552552704; + 0.9695846277645586 0.7751328233611146 0.9394989415641891 + )"); + + Tensor x = Tensor::FromString(R"( + 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434; 0.388677289689482 + )"); + + Tensor y = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_vector5) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.6974096755404444); + Type beta = Type(0.7483972817624156); + int n = 2; + int m = 3; + + Tensor A = Tensor::FromString(R"( + 0.7722447692966574 0.1987156815341724 0.005522117123602399; + 0.8154614284548342 0.7068573438476171 0.7290071680409873; + 0.7712703466859457 0.07404465173409036 0.3584657285442726; + 0.11586905952512971 0.8631034258755935 0.6232981268275579 + )"); + + Tensor x = Tensor::FromString(R"( + 0.3308980248526492; 0.06355835028602363; 0.3109823217156622; 0.32518332202674705; 0.7296061783380641; 0.6375574713552131; 0.8872127425763265; 0.4722149251619493 + )"); + + Tensor y = Tensor::FromString(R"( + 0.1195942459383017; 0.713244787222995; 0.7607850486168974; 0.5612771975694962; 0.770967179954561; 0.49379559636439074; 0.5227328293819941; 0.42754101835854963; 0.02541912674409519 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 0.9087180088833477; 0.713244787222995; 0.7607850486168974; 1.1909432922199015; 0.770967179954561; 0.49379559636439074; 1.1186606476061773; 0.42754101835854963; 0.02541912674409519 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_gemv, blas_gemv_t_vector6) +{ + + Blas + gemv_t_vector; + // Compuing _y <= _alpha * T(_A) * _x + _beta * _y + using Type = double; + Type alpha = Type(0.4629360214092787); + Type beta = Type(0.5903351016572239); + int n = -2; + int m = -3; + + Tensor A = Tensor::FromString(R"( + 0.10789142699330445 0.03142918568673425 0.6364104112637804; + 0.3143559810763267 0.5085706911647028 0.907566473926093; + 0.24929222914887494 0.41038292303562973 0.7555511385430487; + 0.22879816549162246 0.07697990982879299 0.289751452913768 + )"); + + Tensor x = Tensor::FromString(R"( + 0.16122128725400442; 0.9296976523425731; 0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778 + )"); + + Tensor y = Tensor::FromString(R"( + 0.5393422419156507; 0.8074401551640625; 0.8960912999234932; 0.3180034749718639; 0.11005192452767676; 0.22793516254194168; 0.4271077886262563; 0.8180147659224931; 0.8607305832563434 + )"); + + gemv_t_vector(alpha, A, x, n, beta, y, m); + + Tensor refy = Tensor::FromString(R"( + 1.0437828110146694; 0.8074401551640625; 
0.8960912999234932; 0.5548890346413955; 0.11005192452767676; 0.22793516254194168; 0.4986147015629199; 0.8180147659224931; 0.8607305832563434 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} diff --git a/libs/math/tests/math/linalg/blas/scal_all.cpp b/libs/math/tests/math/linalg/blas/scal_all.cpp new file mode 100644 index 0000000000..ed1210df45 --- /dev/null +++ b/libs/math/tests/math/linalg/blas/scal_all.cpp @@ -0,0 +1,132 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/scal_all.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_A_withA, blas_scal_all1) +{ + + Blas + scal_all; + // Compuing _x <= _alpha * _x + using Type = double; + Type alpha = Type(-1.7819375538239353); + + int n = 20; + int m = 1; + + Tensor x = Tensor::FromString(R"( + 0.3745401188473625; 0.9507143064099162; 0.7319939418114051; 0.5986584841970366; 0.15601864044243652; 0.15599452033620265; 0.05808361216819946; 0.8661761457749352; 0.6011150117432088; 0.7080725777960455; 0.020584494295802447; 0.9699098521619943; 0.8324426408004217; 0.21233911067827616; 0.18182496720710062; 0.18340450985343382; 0.3042422429595377; 0.5247564316322378; 0.43194501864211576; 0.2912291401980419 + )"); + + scal_all(n, alpha, x, m); + + Tensor refx = Tensor::FromString(R"( + -0.6674071031877951; -1.6941135255495052; -1.3043674940853551; -1.0667720349060124; -0.27801547450093145; -0.2779724939778311; -0.10350136978425951; -1.5434718023828324; -1.0711494135925397; -1.2617411172076933; -0.03668028341216496; -1.7283187893912788; -1.4833608030466403; -0.3783750354631973; -0.32400073728913814; -0.3268153836485057; -0.5421406781892261; -0.935083192136127; -0.7696990499055659; -0.5189521416867467 + )"); + + ASSERT_TRUE(refx.AllClose(x)); +} + +TEST(blas_A_withA, blas_scal_all2) +{ + + Blas + scal_all; + // Compuing _x <= _alpha * _x + using Type = double; + Type alpha = Type(-3.1216412148558152); + + int n = 10; + int m = 2; + + Tensor x = Tensor::FromString(R"( + 0.6118528947223795; 0.13949386065204183; 0.29214464853521815; 0.3663618432936917; 0.45606998421703593; 0.7851759613930136; 0.19967378215835974; 0.5142344384136116; 0.5924145688620425; 0.046450412719997725; 0.6075448519014384; 0.17052412368729153; 0.06505159298527952; 0.9488855372533332; 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387; 0.6842330265121569; 0.4401524937396013 + )"); + + scal_all(n, alpha, x, m); + + Tensor refx = Tensor::FromString(R"( + -1.909985213594216; 0.13949386065204183; -0.9119707755671036; 0.3663618432936917; -1.4236868595905405; 0.7851759613930136; -0.6233099079116775; 0.5142344384136116; -1.8493057344407902; 
0.046450412719997725; -1.8965370495690024; 0.17052412368729153; -0.20306773375487397; 0.9488855372533332; -3.014356752830558; 0.8083973481164611; -0.9508948964641698; 0.09767211400638387; -2.1359300161258807; 0.4401524937396013 + )"); + + ASSERT_TRUE(refx.AllClose(x)); +} + +TEST(blas_A_withA, blas_scal_all3) +{ + + Blas + scal_all; + // Compuing _x <= _alpha * _x + using Type = double; + Type alpha = Type(-9.830081393267703); + + int n = 6; + int m = 3; + + Tensor x = Tensor::FromString(R"( + 0.12203823484477883; 0.4951769101112702; 0.034388521115218396; 0.9093204020787821; 0.2587799816000169; 0.662522284353982; 0.31171107608941095; 0.5200680211778108; 0.5467102793432796; 0.18485445552552704; 0.9695846277645586; 0.7751328233611146; 0.9394989415641891; 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434 + )"); + + scal_all(n, alpha, x, m); + + Tensor refx = Tensor::FromString(R"( + -1.1996457816148944; 0.4951769101112702; 0.034388521115218396; -8.938693564993342; 0.2587799816000169; 0.662522284353982; -3.064145249141972; 0.5200680211778108; 0.5467102793432796; -1.8171343437241154; 0.9695846277645586; 0.7751328233611146; -9.235351064464837; 0.8948273504276488; 0.5978999788110851; -9.062098764633637; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434 + )"); + + ASSERT_TRUE(refx.AllClose(x)); +} + +TEST(blas_A_withA, blas_scal_all4) +{ + + Blas + scal_all; + // Compuing _x <= _alpha * _x + using Type = double; + Type alpha = Type(7.513264587729719); + + int n = 10; + int m = -2; + + Tensor x = Tensor::FromString(R"( + 0.388677289689482; 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173; 0.7722447692966574; 0.1987156815341724; 0.005522117123602399; 0.8154614284548342; 0.7068573438476171; 0.7290071680409873; 0.7712703466859457; 0.07404465173409036; 0.3584657285442726; 0.11586905952512971 + )"); + + scal_all(n, alpha, x, m); + + Tensor refx = Tensor::FromString(R"( + 0.388677289689482; 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173; 0.7722447692966574; 0.1987156815341724; 0.005522117123602399; 0.8154614284548342; 0.7068573438476171; 0.7290071680409873; 0.7712703466859457; 0.07404465173409036; 0.3584657285442726; 0.11586905952512971 + )"); + + ASSERT_TRUE(refx.AllClose(x)); +} diff --git a/libs/math/tests/math/linalg/blas/swap_all.cpp b/libs/math/tests/math/linalg/blas/swap_all.cpp new file mode 100644 index 0000000000..abf2866bbb --- /dev/null +++ b/libs/math/tests/math/linalg/blas/swap_all.cpp @@ -0,0 +1,280 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
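+//
+// The tests below cover the strided swap routine, _x, _y <= _y, _x: n element
+// pairs are exchanged, stepping through x with increment m and through y with
+// increment p, so entries skipped by the strides are left untouched. A rough
+// sketch assuming positive increments (names are illustrative only):
+//
+//   for (int i = 0; i < n; ++i)
+//   {
+//     std::swap(x[static_cast<std::size_t>(i * m)],
+//               y[static_cast<std::size_t>(i * p)]);
+//   }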
+// +//------------------------------------------------------------------------------ + +#include + +#include "math/linalg/blas/base.hpp" +#include "math/linalg/blas/swap_all.hpp" +#include "math/linalg/prototype.hpp" +#include "math/tensor.hpp" + +using namespace fetch; +using namespace fetch::math; +using namespace fetch::math::linalg; + +TEST(blas_A_withA, blas_swap_all1) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 20; + int m = 1; + int p = 1; + + Tensor x = Tensor::FromString(R"( + 0.3745401188473625; 0.9507143064099162; 0.7319939418114051; 0.5986584841970366; 0.15601864044243652; 0.15599452033620265; 0.05808361216819946; 0.8661761457749352; 0.6011150117432088; 0.7080725777960455; 0.020584494295802447; 0.9699098521619943; 0.8324426408004217; 0.21233911067827616; 0.18182496720710062; 0.18340450985343382; 0.3042422429595377; 0.5247564316322378; 0.43194501864211576; 0.2912291401980419 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6118528947223795; 0.13949386065204183; 0.29214464853521815; 0.3663618432936917; 0.45606998421703593; 0.7851759613930136; 0.19967378215835974; 0.5142344384136116; 0.5924145688620425; 0.046450412719997725; 0.6075448519014384; 0.17052412368729153; 0.06505159298527952; 0.9488855372533332; 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387; 0.6842330265121569; 0.4401524937396013 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.6118528947223795; 0.13949386065204183; 0.29214464853521815; 0.3663618432936917; 0.45606998421703593; 0.7851759613930136; 0.19967378215835974; 0.5142344384136116; 0.5924145688620425; 0.046450412719997725; 0.6075448519014384; 0.17052412368729153; 0.06505159298527952; 0.9488855372533332; 0.9656320330745594; 0.8083973481164611; 0.3046137691733707; 0.09767211400638387; 0.6842330265121569; 0.4401524937396013 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.3745401188473625; 0.9507143064099162; 0.7319939418114051; 0.5986584841970366; 0.15601864044243652; 0.15599452033620265; 0.05808361216819946; 0.8661761457749352; 0.6011150117432088; 0.7080725777960455; 0.020584494295802447; 0.9699098521619943; 0.8324426408004217; 0.21233911067827616; 0.18182496720710062; 0.18340450985343382; 0.3042422429595377; 0.5247564316322378; 0.43194501864211576; 0.2912291401980419 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all2) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 10; + int m = 2; + int p = 2; + + Tensor x = Tensor::FromString(R"( + 0.12203823484477883; 0.4951769101112702; 0.034388521115218396; 0.9093204020787821; 0.2587799816000169; 0.662522284353982; 0.31171107608941095; 0.5200680211778108; 0.5467102793432796; 0.18485445552552704; 0.9695846277645586; 0.7751328233611146; 0.9394989415641891; 0.8948273504276488; 0.5978999788110851; 0.9218742350231168; 0.0884925020519195; 0.1959828624191452; 0.045227288910538066; 0.32533033076326434 + )"); + + Tensor y = Tensor::FromString(R"( + 0.388677289689482; 0.2713490317738959; 0.8287375091519293; 0.3567533266935893; 0.28093450968738076; 0.5426960831582485; 0.14092422497476265; 0.8021969807540397; 0.07455064367977082; 0.9868869366005173; 0.7722447692966574; 0.1987156815341724; 0.005522117123602399; 0.8154614284548342; 0.7068573438476171; 0.7290071680409873; 0.7712703466859457; 0.07404465173409036; 0.3584657285442726; 0.11586905952512971 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = 
Tensor::FromString(R"( + 0.388677289689482; 0.4951769101112702; 0.8287375091519293; 0.9093204020787821; 0.28093450968738076; 0.662522284353982; 0.14092422497476265; 0.5200680211778108; 0.07455064367977082; 0.18485445552552704; 0.7722447692966574; 0.7751328233611146; 0.005522117123602399; 0.8948273504276488; 0.7068573438476171; 0.9218742350231168; 0.7712703466859457; 0.1959828624191452; 0.3584657285442726; 0.32533033076326434 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.12203823484477883; 0.2713490317738959; 0.034388521115218396; 0.3567533266935893; 0.2587799816000169; 0.5426960831582485; 0.31171107608941095; 0.8021969807540397; 0.5467102793432796; 0.9868869366005173; 0.9695846277645586; 0.1987156815341724; 0.9394989415641891; 0.8154614284548342; 0.5978999788110851; 0.7290071680409873; 0.0884925020519195; 0.07404465173409036; 0.045227288910538066; 0.11586905952512971 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all3) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 10; + int m = 3; + int p = 3; + + Tensor x = Tensor::FromString(R"( + 0.8631034258755935; 0.6232981268275579; 0.3308980248526492; 0.06355835028602363; 0.3109823217156622; 0.32518332202674705; 0.7296061783380641; 0.6375574713552131; 0.8872127425763265; 0.4722149251619493; 0.1195942459383017; 0.713244787222995; 0.7607850486168974; 0.5612771975694962; 0.770967179954561; 0.49379559636439074; 0.5227328293819941; 0.42754101835854963; 0.02541912674409519; 0.10789142699330445; 0.03142918568673425; 0.6364104112637804; 0.3143559810763267; 0.5085706911647028; 0.907566473926093; 0.24929222914887494; 0.41038292303562973; 0.7555511385430487; 0.22879816549162246; 0.07697990982879299; 0.289751452913768; 0.16122128725400442; 0.9296976523425731; 0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778; 0.5393422419156507 + )"); + + Tensor y = Tensor::FromString(R"( + 0.8074401551640625; 0.8960912999234932; 0.3180034749718639; 0.11005192452767676; 0.22793516254194168; 0.4271077886262563; 0.8180147659224931; 0.8607305832563434; 0.006952130531190703; 0.5107473025775657; 0.417411003148779; 0.22210781047073025; 0.1198653673336828; 0.33761517140362796; 0.9429097039125192; 0.32320293202075523; 0.5187906217433661; 0.7030189588951778; 0.363629602379294; 0.9717820827209607; 0.9624472949421112; 0.25178229582536416; 0.49724850589238545; 0.30087830981676966; 0.2848404943774676; 0.036886947354532795; 0.6095643339798968; 0.5026790232288615; 0.05147875124998935; 0.27864646423661144; 0.9082658859666537; 0.23956189066697242; 0.1448948720912231; 0.489452760277563; 0.9856504541106007; 0.2420552715115004; 0.6721355474058786; 0.7616196153287176; 0.23763754399239967; 0.7282163486118596 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.8074401551640625; 0.6232981268275579; 0.3308980248526492; 0.11005192452767676; 0.3109823217156622; 0.32518332202674705; 0.8180147659224931; 0.6375574713552131; 0.8872127425763265; 0.5107473025775657; 0.1195942459383017; 0.713244787222995; 0.1198653673336828; 0.5612771975694962; 0.770967179954561; 0.32320293202075523; 0.5227328293819941; 0.42754101835854963; 0.363629602379294; 0.10789142699330445; 0.03142918568673425; 0.25178229582536416; 0.3143559810763267; 0.5085706911647028; 0.2848404943774676; 0.24929222914887494; 0.41038292303562973; 0.5026790232288615; 0.22879816549162246; 0.07697990982879299; 0.289751452913768; 
0.16122128725400442; 0.9296976523425731; 0.808120379564417; 0.6334037565104235; 0.8714605901877177; 0.8036720768991145; 0.18657005888603584; 0.8925589984899778; 0.5393422419156507 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.8631034258755935; 0.8960912999234932; 0.3180034749718639; 0.06355835028602363; 0.22793516254194168; 0.4271077886262563; 0.7296061783380641; 0.8607305832563434; 0.006952130531190703; 0.4722149251619493; 0.417411003148779; 0.22210781047073025; 0.7607850486168974; 0.33761517140362796; 0.9429097039125192; 0.49379559636439074; 0.5187906217433661; 0.7030189588951778; 0.02541912674409519; 0.9717820827209607; 0.9624472949421112; 0.6364104112637804; 0.49724850589238545; 0.30087830981676966; 0.907566473926093; 0.036886947354532795; 0.6095643339798968; 0.7555511385430487; 0.05147875124998935; 0.27864646423661144; 0.9082658859666537; 0.23956189066697242; 0.1448948720912231; 0.489452760277563; 0.9856504541106007; 0.2420552715115004; 0.6721355474058786; 0.7616196153287176; 0.23763754399239967; 0.7282163486118596 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all4) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 10; + int m = 3; + int p = 2; + + Tensor x = Tensor::FromString(R"( + 0.3677831327192532; 0.6323058305935795; 0.6335297107608947; 0.5357746840747585; 0.0902897700544083; 0.835302495589238; 0.32078006497173583; 0.18651851039985423; 0.040775141554763916; 0.5908929431882418; 0.6775643618422824; 0.016587828927856152; 0.512093058299281; 0.22649577519793795; 0.6451727904094499; 0.17436642900499144; 0.690937738102466; 0.3867353463005374; 0.9367299887367345; 0.13752094414599325; 0.3410663510502585; 0.11347352124058907; 0.9246936182785628; 0.877339353380981; 0.2579416277151556; 0.659984046034179; 0.8172222002012158; 0.5552008115994623; 0.5296505783560065; 0.24185229090045168; 0.09310276780589921; 0.8972157579533268; 0.9004180571633305; 0.6331014572732679; 0.3390297910487007; 0.3492095746126609; 0.7259556788702394; 0.8971102599525771; 0.8870864242651173; 0.7798755458576239 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6420316461542878; 0.08413996499504883; 0.16162871409461377; 0.8985541885270792; 0.6064290596595899; 0.009197051616629648; 0.1014715428660321; 0.6635017691080558; 0.005061583846218687; 0.16080805141749865; 0.5487337893665861; 0.6918951976926933; 0.6519612595026005; 0.22426930946055978; 0.7121792213475359; 0.23724908749680007; 0.3253996981592677; 0.7464914051180241; 0.6496328990472147; 0.8492234104941779; 0.6576128923003434; 0.5683086033354716; 0.09367476782809248; 0.3677158030594335; 0.26520236768172545; 0.24398964337908358; 0.9730105547524456; 0.3930977246667604; 0.8920465551771133; 0.6311386259972629; 0.7948113035416484; 0.5026370931051921; 0.5769038846263591; 0.4925176938188639; 0.1952429877980445; 0.7224521152615053; 0.2807723624408558; 0.02431596643145384; 0.6454722959071678; 0.17711067940704894 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.6420316461542878; 0.6323058305935795; 0.6335297107608947; 0.16162871409461377; 0.0902897700544083; 0.835302495589238; 0.6064290596595899; 0.18651851039985423; 0.040775141554763916; 0.1014715428660321; 0.6775643618422824; 0.016587828927856152; 0.005061583846218687; 0.22649577519793795; 0.6451727904094499; 0.5487337893665861; 0.690937738102466; 0.3867353463005374; 0.6519612595026005; 0.13752094414599325; 0.3410663510502585; 0.7121792213475359; 0.9246936182785628; 0.877339353380981; 
0.3253996981592677; 0.659984046034179; 0.8172222002012158; 0.6496328990472147; 0.5296505783560065; 0.24185229090045168; 0.09310276780589921; 0.8972157579533268; 0.9004180571633305; 0.6331014572732679; 0.3390297910487007; 0.3492095746126609; 0.7259556788702394; 0.8971102599525771; 0.8870864242651173; 0.7798755458576239 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.3677831327192532; 0.08413996499504883; 0.5357746840747585; 0.8985541885270792; 0.32078006497173583; 0.009197051616629648; 0.5908929431882418; 0.6635017691080558; 0.512093058299281; 0.16080805141749865; 0.17436642900499144; 0.6918951976926933; 0.9367299887367345; 0.22426930946055978; 0.11347352124058907; 0.23724908749680007; 0.2579416277151556; 0.7464914051180241; 0.5552008115994623; 0.8492234104941779; 0.6576128923003434; 0.5683086033354716; 0.09367476782809248; 0.3677158030594335; 0.26520236768172545; 0.24398964337908358; 0.9730105547524456; 0.3930977246667604; 0.8920465551771133; 0.6311386259972629; 0.7948113035416484; 0.5026370931051921; 0.5769038846263591; 0.4925176938188639; 0.1952429877980445; 0.7224521152615053; 0.2807723624408558; 0.02431596643145384; 0.6454722959071678; 0.17711067940704894 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all5) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 20; + int m = -1; + int p = -1; + + Tensor x = Tensor::FromString(R"( + 0.9404585843529143; 0.9539285770025874; 0.9148643902204485; 0.3701587002554444; 0.015456616528867428; 0.9283185625877254; 0.42818414831731433; 0.9666548190436696; 0.9636199770892528; 0.8530094554673601; 0.2944488920695857; 0.38509772860192526; 0.8511366715168569; 0.31692200515627766; 0.1694927466860925; 0.5568012624583502; 0.936154774160781; 0.696029796674973; 0.570061170089365; 0.09717649377076854 + )"); + + Tensor y = Tensor::FromString(R"( + 0.6150072266991697; 0.9900538501042633; 0.14008401523652403; 0.5183296523637367; 0.8773730719279554; 0.7407686177542044; 0.697015740995268; 0.7024840839871093; 0.35949115121975517; 0.29359184426449336; 0.8093611554785136; 0.8101133946791808; 0.8670723185801037; 0.9132405525564713; 0.5113423988609378; 0.5015162946871996; 0.7982951789667752; 0.6499639307777652; 0.7019668772577033; 0.795792669436101 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.6150072266991697; 0.9900538501042633; 0.14008401523652403; 0.5183296523637367; 0.8773730719279554; 0.7407686177542044; 0.697015740995268; 0.7024840839871093; 0.35949115121975517; 0.29359184426449336; 0.8093611554785136; 0.8101133946791808; 0.8670723185801037; 0.9132405525564713; 0.5113423988609378; 0.5015162946871996; 0.7982951789667752; 0.6499639307777652; 0.7019668772577033; 0.795792669436101 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.9404585843529143; 0.9539285770025874; 0.9148643902204485; 0.3701587002554444; 0.015456616528867428; 0.9283185625877254; 0.42818414831731433; 0.9666548190436696; 0.9636199770892528; 0.8530094554673601; 0.2944488920695857; 0.38509772860192526; 0.8511366715168569; 0.31692200515627766; 0.1694927466860925; 0.5568012624583502; 0.936154774160781; 0.696029796674973; 0.570061170089365; 0.09717649377076854 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all6) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 10; + int m = -2; + int p = -2; + + Tensor x = Tensor::FromString(R"( + 0.8900053418175663; 
0.3379951568515358; 0.375582952639944; 0.093981939840869; 0.578280140996174; 0.035942273796742086; 0.46559801813246016; 0.5426446347075766; 0.2865412521282844; 0.5908332605690108; 0.03050024993904943; 0.03734818874921442; 0.8226005606596583; 0.3601906414112629; 0.12706051265188478; 0.5222432600548044; 0.7699935530986108; 0.21582102749684318; 0.6228904758190003; 0.085347464993768 + )"); + + Tensor y = Tensor::FromString(R"( + 0.0516817211686077; 0.531354631568148; 0.5406351216101065; 0.6374299014982066; 0.7260913337226615; 0.9758520794625346; 0.5163003483011953; 0.32295647294124596; 0.7951861947687037; 0.2708322512620742; 0.4389714207056361; 0.07845638134226596; 0.02535074341545751; 0.9626484146779251; 0.8359801205122058; 0.695974206093698; 0.4089529444142699; 0.17329432007084578; 0.15643704267108605; 0.25024289816459533 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.0516817211686077; 0.3379951568515358; 0.5406351216101065; 0.093981939840869; 0.7260913337226615; 0.035942273796742086; 0.5163003483011953; 0.5426446347075766; 0.7951861947687037; 0.5908332605690108; 0.4389714207056361; 0.03734818874921442; 0.02535074341545751; 0.3601906414112629; 0.8359801205122058; 0.5222432600548044; 0.4089529444142699; 0.21582102749684318; 0.15643704267108605; 0.085347464993768 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.8900053418175663; 0.531354631568148; 0.375582952639944; 0.6374299014982066; 0.578280140996174; 0.9758520794625346; 0.46559801813246016; 0.32295647294124596; 0.2865412521282844; 0.2708322512620742; 0.03050024993904943; 0.07845638134226596; 0.8226005606596583; 0.9626484146779251; 0.12706051265188478; 0.695974206093698; 0.7699935530986108; 0.17329432007084578; 0.6228904758190003; 0.25024289816459533 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} + +TEST(blas_A_withA, blas_swap_all7) +{ + + Blas + swap_all; + // Compuing _x, _y <= _y, _x + using Type = double; + + int n = 5; + int m = -2; + int p = -3; + + Tensor x = Tensor::FromString(R"( + 0.5492266647061205; 0.7145959227000623; 0.6601973767177313; 0.27993389694594284; 0.9548652806631941; 0.7378969166957685; 0.5543540525114007; 0.6117207462343522; 0.4196000624277899; 0.24773098950115746; 0.3559726786512616; 0.7578461104643691; 0.014393488629755868; 0.11607264050691624; 0.04600264202175275; 0.040728802318970136; 0.8554605840110072; 0.7036578593800237; 0.4741738290873252; 0.09783416065100148 + )"); + + Tensor y = Tensor::FromString(R"( + 0.49161587511683236; 0.4734717707805657; 0.17320186991001518; 0.43385164923797304; 0.39850473439737344; 0.6158500980522165; 0.6350936508676438; 0.04530400977204452; 0.3746126146264712; 0.6258599157142364; 0.5031362585800877; 0.8564898411883223; 0.658693631618945; 0.1629344270814297; 0.07056874740042984; 0.6424192782063156; 0.026511310541621813; 0.5857755812734633; 0.9402302414249576; 0.575474177875879; 0.3881699262065219; 0.6432882184423532; 0.45825289049151663; 0.5456167893159349; 0.9414648087765252; 0.38610263780077425; 0.9611905638239142; 0.9053506419560637; 0.19579113478929644; 0.06936130087516545 + )"); + + swap_all(n, x, m, y, p); + + Tensor refx = Tensor::FromString(R"( + 0.49161587511683236; 0.7145959227000623; 0.43385164923797304; 0.27993389694594284; 0.6350936508676438; 0.7378969166957685; 0.6258599157142364; 0.6117207462343522; 0.658693631618945; 0.24773098950115746; 0.3559726786512616; 0.7578461104643691; 0.014393488629755868; 0.11607264050691624; 0.04600264202175275; 0.040728802318970136; 0.8554605840110072; 
0.7036578593800237; 0.4741738290873252; 0.09783416065100148 + )"); + + ASSERT_TRUE(refx.AllClose(x)); + + Tensor refy = Tensor::FromString(R"( + 0.5492266647061205; 0.4734717707805657; 0.17320186991001518; 0.6601973767177313; 0.39850473439737344; 0.6158500980522165; 0.9548652806631941; 0.04530400977204452; 0.3746126146264712; 0.5543540525114007; 0.5031362585800877; 0.8564898411883223; 0.4196000624277899; 0.1629344270814297; 0.07056874740042984; 0.6424192782063156; 0.026511310541621813; 0.5857755812734633; 0.9402302414249576; 0.575474177875879; 0.3881699262065219; 0.6432882184423532; 0.45825289049151663; 0.5456167893159349; 0.9414648087765252; 0.38610263780077425; 0.9611905638239142; 0.9053506419560637; 0.19579113478929644; 0.06936130087516545 + )"); + + ASSERT_TRUE(refy.AllClose(y)); +} diff --git a/libs/math/tests/math/matrix_operations/matrix_operations.cpp b/libs/math/tests/math/matrix_operations/matrix_operations.cpp index 9a23573096..05545ad7ed 100644 --- a/libs/math/tests/math/matrix_operations/matrix_operations.cpp +++ b/libs/math/tests/math/matrix_operations/matrix_operations.cpp @@ -187,12 +187,13 @@ TYPED_TEST(FreeFunctionsTest, Product_TwoDimension) TypeParam array1{{n_data, n_features}}; array1.Set(0, 0, typename TypeParam::Type(-17)); - array1.Set(0, 1, typename TypeParam::Type(21)); array1.Set(1, 0, typename TypeParam::Type(1)); - array1.Set(1, 1, typename TypeParam::Type(1)); array1.Set(2, 0, typename TypeParam::Type(13)); - array1.Set(2, 1, typename TypeParam::Type(10)); array1.Set(3, 0, typename TypeParam::Type(21)); + + array1.Set(0, 1, typename TypeParam::Type(21)); + array1.Set(1, 1, typename TypeParam::Type(1)); + array1.Set(2, 1, typename TypeParam::Type(10)); array1.Set(3, 1, typename TypeParam::Type(-0.5)); Type output = fetch::math::Product(array1); diff --git a/libs/math/tests/math/tensor/tensor_concat.cpp b/libs/math/tests/math/tensor/tensor_concat.cpp index 459848232c..550e8294d7 100644 --- a/libs/math/tests/math/tensor/tensor_concat.cpp +++ b/libs/math/tests/math/tensor/tensor_concat.cpp @@ -32,7 +32,7 @@ template fetch::math::Tensor PrepareTensor(std::vector const &shape) { fetch::math::Tensor t{shape}; - t.FillArange(fetch::math::SizeType(0), t.size()); + t.FillArange(static_cast(0), static_cast(t.size())); return t; } diff --git a/libs/math/tests/math/tensor/tensor_indexing.cpp b/libs/math/tests/math/tensor/tensor_indexing.cpp index 38e53e07dd..d36a3d1374 100644 --- a/libs/math/tests/math/tensor/tensor_indexing.cpp +++ b/libs/math/tests/math/tensor/tensor_indexing.cpp @@ -31,7 +31,7 @@ TYPED_TEST(TensorIndexingTest, empty_tensor_test) { fetch::math::Tensor t; ASSERT_EQ(t.size(), 0); - ASSERT_TRUE(t.shape().empty()); + ASSERT_EQ(t.shape().size(), 1); } TYPED_TEST(TensorIndexingTest, one_dimentional_tensor_test) @@ -53,6 +53,38 @@ TYPED_TEST(TensorIndexingTest, two_dimentional_tensor_test) ASSERT_EQ(t.shape()[1], 5); } +TYPED_TEST(TensorIndexingTest, index_op_vs_iterator) +{ + TypeParam from = TypeParam(20); + TypeParam to = TypeParam(29); + TypeParam step_size = TypeParam(1); + fetch::math::Tensor a = fetch::math::Tensor::Arange(from, to, step_size); + EXPECT_EQ(a.size(), 9); + a.Reshape({3, 3}); + + fetch::math::Tensor b{a.shape()}; + fetch::math::Tensor c; + c.Resize(a.shape()); + + auto it1 = a.begin(); + auto it2 = b.begin(); + while (it1.is_valid()) + { + *it2 = *it1; + ++it1; + ++it2; + } + + for (std::size_t i = 0; i < a.size(); ++i) + { + c[i] = a[i]; + } + + EXPECT_EQ(a, c); + EXPECT_EQ(b, c); + EXPECT_EQ(b, a); +} + TYPED_TEST(TensorIndexingTest, 
three_dimentional_tensor_test) { using SizeType = typename fetch::math::Tensor::SizeType; @@ -310,9 +342,8 @@ TYPED_TEST(TensorIndexingTest, three_dimentional_squeeze_test) TYPED_TEST(TensorIndexingTest, major_order_flip_test) { - using SizeType = typename fetch::math::Tensor::SizeType; fetch::math::Tensor t({3, 3}); - t.FillArange(SizeType{0}, t.size()); + t.FillArange(static_cast(0), static_cast(t.size())); EXPECT_EQ(t[0], 0); EXPECT_EQ(t[1], 1); diff --git a/libs/math/tests/math/tensor_iterator/broadcast.cpp b/libs/math/tests/math/tensor_iterator/broadcast.cpp new file mode 100644 index 0000000000..344c2d2fd4 --- /dev/null +++ b/libs/math/tests/math/tensor_iterator/broadcast.cpp @@ -0,0 +1,108 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include +#include +#include + +#include "math/tensor.hpp" +#include "math/tensor_broadcast.hpp" + +using namespace fetch::math; + +TEST(tensor_iterator, simple_broadcast_test) +{ + Tensor a = Tensor::Arange(0u, 20u, 1u); + a.Reshape({1, a.size()}); + + Tensor b{a}; + b.Reshape({b.size(), 1}); + + Tensor ret; + + ASSERT_TRUE(Broadcast([](double &x, double y) { return x + y; }, a, b, ret)); + + for (SizeType i = 0; i < ret.shape(0); ++i) + { + for (SizeType j = 0; j < ret.shape(1); ++j) + { + std::vector idxs = {i, j}; + ASSERT_TRUE(static_cast(ret.Get(idxs)) == i + j); + } + } +} + +TEST(Tensor, broadcast_3D_test) +{ + Tensor a = Tensor::Arange(0u, 21u, 1u); + ASSERT_TRUE(a.size() == 21); + a.Reshape({1, 3, 7}); + Tensor b = Tensor::Arange(0u, 21u, 1u); + ASSERT_TRUE(b.size() == 21); + b.Reshape({7, 3, 1}); + + Tensor ret; + + ASSERT_TRUE(Broadcast([](double &x, double y) { return x + y; }, a, b, ret)); + + for (SizeType i = 0; i < 7; ++i) + { + for (SizeType j = 0; j < 3; ++j) + { + for (SizeType k = 0; k < 7; ++k) + { + std::vector idxs = {i, j, k}; + std::vector idxs2 = {0, j, k}; + std::vector idxs3 = {i, j, 0}; + if ((i == 0) && (k == 0)) + { + ASSERT_TRUE(ret.Get(idxs) == a.Get(idxs) + b.Get(idxs)); + } + else if ((i > 0) && (k == 0)) + { + ASSERT_TRUE(ret.Get(idxs) == b.Get(idxs) + a.Get(idxs2)); + } + else if ((i == 0) && (k > 0)) + { + ASSERT_TRUE(ret.Get(idxs) == a.Get(idxs) + b.Get(idxs3)); + } + else + { + ASSERT_TRUE(ret.Get(idxs) == a.Get(idxs2) + b.Get(idxs3)); + } + } + } + } +} + +TEST(tensor_iterator, broadcast_shape_size_test) +{ + + Tensor a = Tensor::Arange(0u, 90u, 1u); + a.Reshape({1, 3, 1, 6, 5}); + Tensor b = Tensor::Arange(0u, 42u, 1u); + b.Reshape({7, 3, 2, 1, 1}); + + std::vector ret_shape = {7, 3, 2, 6, 5}; + std::vector ref_shape; + ShapeFromBroadcast(a.shape(), b.shape(), ref_shape); + ASSERT_TRUE(ref_shape == ret_shape); + ASSERT_TRUE(Tensor::SizeFromShape(ref_shape) == + std::accumulate(std::begin(ret_shape), std::end(ret_shape), SizeType(1), + std::multiplies<>())); +} \ No newline at end of 
file diff --git a/libs/math/tests/math/tensor_iterator/iterator.cpp b/libs/math/tests/math/tensor_iterator/iterator.cpp new file mode 100644 index 0000000000..37c4428a59 --- /dev/null +++ b/libs/math/tests/math/tensor_iterator/iterator.cpp @@ -0,0 +1,238 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//------------------------------------------------------------------------------ + +#include +#include + +#include + +#include "math/tensor.hpp" +#include "math/tensor_slice_iterator.hpp" + +using namespace fetch::math; + +TEST(tensor_iterator, reshape_iterator_test) +{ + Tensor a = Tensor::Arange(0u, 20u, 1u); + a.Reshape({1, a.size()}); + + Tensor b{a}; + b.Reshape({b.size(), 1}); + + auto it1 = a.begin(); + auto it2 = b.begin(); + while (it1.is_valid()) + { + EXPECT_EQ((*it1), (*it2)); + ++it1; + ++it2; + } +} + +TEST(tensor_iterator, simple_iterator_permute_test) +{ + + // set up an initial array + Tensor array{Tensor::Arange(0u, 77u, 1u)}; + array.Reshape({7, 11}); + EXPECT_EQ(array.size(), 77); + + Tensor ret; + ret.ResizeFromShape(array.shape()); + + EXPECT_EQ(ret.size(), array.size()); + EXPECT_EQ(ret.shape(), array.shape()); + TensorSliceIterator::ContainerType> it(array); + TensorSliceIterator::ContainerType> it2(ret); + + it2.PermuteAxes(0, 1); + while (it2) + { + ASSERT_TRUE(bool(it)); + ASSERT_TRUE(bool(it2)); + + *it2 = *it; + ++it; + ++it2; + } + + ASSERT_FALSE(bool(it)); + ASSERT_FALSE(bool(it2)); + + SizeType test_val, cur_row; + + for (SizeType i = 0; i < array.size(); ++i) + { + EXPECT_EQ(array[i], double(i)); + + cur_row = i / 7; + test_val = (11 * (i % 7)) + cur_row; + + EXPECT_EQ(double(ret[i]), double(test_val)); + } +} + +TEST(tensor_iterator, iterator_4dim_copy_test) +{ + + // set up an initial array + Tensor array{Tensor::Arange(0u, 1008u, 1u)}; + array.Reshape({4, 6, 7, 6}); + Tensor ret = array.Copy(); + + TensorSliceIterator::ContainerType> it( + array, {{1, 2, 1}, {2, 3, 1}, {1, 4, 1}, {2, 6, 1}}); + TensorSliceIterator::ContainerType> it2( + ret, {{1, 2, 1}, {2, 3, 1}, {1, 4, 1}, {2, 6, 1}}); + + while (it2) + { + + assert(bool(it)); + assert(bool(it2)); + + *it2 = *it; + ++it; + ++it2; + } + + for (SizeType i = 0; i < 4; ++i) + { + for (SizeType j = 0; j < 6; ++j) + { + for (SizeType k = 0; k < 7; ++k) + { + for (SizeType l = 0; l < 6; ++l) + { + std::vector idxs = {i, j, k, l}; + EXPECT_EQ(int(ret.Get(idxs)), int(array.Get(idxs))); + } + } + } + } +} + +TEST(Tensor, iterator_4dim_permute_test) +{ + + // set up an initial array + Tensor array{Tensor::Arange(0u, 1008u, 1u)}; + array.Reshape({4, 6, 7, 6}); + Tensor ret = array.Copy(); + + TensorSliceIterator::ContainerType> it( + array, {{1, 2, 1}, {0, 6, 1}, {1, 4, 1}, {0, 6, 1}}); + TensorSliceIterator::ContainerType> it2( + ret, {{1, 2, 1}, {0, 6, 1}, {1, 4, 1}, {0, 6, 1}}); + + it.PermuteAxes(1, 3); + while (it2) + { + + assert(bool(it)); + assert(bool(it2)); + + *it2 
= *it; + ++it; + ++it2; + } + + for (SizeType i = 1; i < 2; ++i) + { + for (SizeType j = 0; j < 6; ++j) + { + for (SizeType k = 1; k < 4; ++k) + { + for (SizeType l = 0; l < 6; ++l) + { + std::vector idxs = {i, j, k, l}; + std::vector idxs2 = {i, l, k, j}; + EXPECT_EQ(int(ret.Get(idxs)), int(array.Get(idxs2))); + } + } + } + } +} + +TEST(Tensor, simple_iterator_transpose_test) +{ + std::vector perm{2, 1, 0}; + std::vector original_shape{2, 3, 4}; + std::vector new_shape; + for (SizeType i = 0; i < perm.size(); ++i) + { + new_shape.push_back(original_shape[perm[i]]); + } + SizeType arr_size = fetch::math::Product(original_shape); + + // set up an initial array + Tensor array = + Tensor::Arange(static_cast(0u), arr_size, static_cast(1u)); + array.Reshape(original_shape); + + Tensor ret = + Tensor::Arange(static_cast(0u), arr_size, static_cast(1u)); + ret.Reshape(new_shape); + + Tensor test_array{original_shape}; + + ASSERT_TRUE(ret.size() == array.size()); + + TensorSliceIterator::ContainerType> it_arr(array); + TensorSliceIterator::ContainerType> it_ret(ret); + + it_ret.Transpose(perm); + while (it_ret) + { + ASSERT_TRUE(bool(it_arr)); + ASSERT_TRUE(bool(it_ret)); + + *it_arr = *it_ret; + ++it_arr; + ++it_ret; + } + for (SizeType i = 0; i < array.shape()[0]; ++i) + { + for (SizeType j = 0; j < array.shape()[1]; ++j) + { + for (SizeType k = 0; k < array.shape()[2]; ++k) + { + EXPECT_EQ(array(i, j, k), ret(k, j, i)); + } + } + } + + TensorSliceIterator::ContainerType> it_arr2(test_array); + TensorSliceIterator::ContainerType> it_ret2(ret); + it_ret2.Transpose(perm); + + while (it_ret2) + { + ASSERT_TRUE(bool(it_arr2)); + ASSERT_TRUE(bool(it_ret2)); + + *it_arr2 = *it_ret2; + ++it_arr2; + ++it_ret2; + } + + for (SizeType j = 0; j < array.size(); ++j) + { + EXPECT_EQ(array[j], test_array[j]); + } +} \ No newline at end of file diff --git a/libs/math/tests/math/tensor_iterator/squeeze.cpp b/libs/math/tests/math/tensor_iterator/squeeze.cpp new file mode 100644 index 0000000000..715fac1c30 --- /dev/null +++ b/libs/math/tests/math/tensor_iterator/squeeze.cpp @@ -0,0 +1,81 @@ +//------------------------------------------------------------------------------ +// +// Copyright 2018-2019 Fetch.AI Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
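The permute and transpose tests above copy data through a slice iterator whose axis order has been rearranged; for two axes this is simply a transpose. The following standalone sketch shows the equivalent explicit index arithmetic for a 2-D row-major buffer. It is an illustration of the idea only and makes no claim about Tensor's internal storage order.

#include <cstddef>
#include <vector>

int main()
{
  std::size_t const rows = 7, cols = 11;

  // Source buffer, 7 x 11, stored row-major and filled with 0..76.
  std::vector<double> src(rows * cols), dst(rows * cols);
  for (std::size_t i = 0; i < src.size(); ++i)
  {
    src[i] = static_cast<double>(i);
  }

  // Copying through an iterator with the two axes permuted is equivalent to
  // writing the transpose: dst is treated as an 11 x 7 row-major buffer.
  for (std::size_t r = 0; r < rows; ++r)
  {
    for (std::size_t c = 0; c < cols; ++c)
    {
      dst[c * rows + r] = src[r * cols + c];
    }
  }

  return 0;
}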
+// +//------------------------------------------------------------------------------ + +#include +#include + +#include + +#include "math/tensor.hpp" +#include "math/tensor_squeeze.hpp" + +using namespace fetch::math; + +TEST(tensor_iterator, tensor_reduce_test) +{ + Tensor a = Tensor::Arange(0u, 3u * 4u * 5u, 1u); + a.Reshape({3, 4, 5}); + + Tensor ret; + + Reduce([](double const &x, double const &z) { return x + z; }, a, ret); + SizeType m = 0; + + for (SizeType j = 0; j < ret.shape(1); ++j) + { + for (SizeType i = 0; i < ret.shape(0); ++i) + { + double ref = 0.0; + for (SizeType k = 3 * m; k < (m + 1) * 3; ++k) + { + ref += double(k); + } + ++m; + + std::vector idxs = {i, j}; + ASSERT_TRUE(ret.Get(idxs) == ref); + } + } +} + +TEST(tensor_iterator, dimension_reduction) +{ + Tensor a = Tensor::Arange(0u, 3u * 4u * 5u, 1u); + a.Reshape({3, 4, 5}); + + Tensor ret; + + Reduce([](double const &x, double const &z) { return std::max(x, z); }, a, ret, 2); + SizeType m = 0; + for (SizeType j = 0; j < ret.shape(1); ++j) + { + for (SizeType i = 0; i < ret.shape(0); ++i) + { + SizeType offset = i + j * 3; + double ref = 0.0; + for (SizeType k = 0; k < 5; ++k) + { + SizeType v = offset + k * 3 * 4; + ref = std::max(ref, double(v)); + ++m; + } + std::vector idxs = {i, j}; + ASSERT_TRUE(ret.Get(idxs) == ref); + } + } +} \ No newline at end of file diff --git a/libs/ml/include/ml/clustering/tsne.hpp b/libs/ml/include/ml/clustering/tsne.hpp index 9c55806d45..07ed1ffb86 100644 --- a/libs/ml/include/ml/clustering/tsne.hpp +++ b/libs/ml/include/ml/clustering/tsne.hpp @@ -90,6 +90,7 @@ class TSNE output_symmetric_affinities_.Fill(DataType(0)); DataType min_gain{0.01f}; DataType momentum = initial_momentum; + assert(output_matrix_.shape().size() == 2); // i_y is output_matrix value from last iteration ArrayType i_y(output_matrix_.shape()); @@ -121,12 +122,12 @@ class TSNE { if ((gradient.At(i, j) > 0.0) != (i_y.At(i, j) > 0.0)) { - gains.Set(i, j, gains.At(i, j) + DataType(0.2)); + gains(i, j) = gains.At(i, j) + DataType(0.2); } if ((gradient.At(i, j) > 0.0) == (i_y.At(i, j) > 0.0)) { - gains.Set(i, j, gains.At(i, j) * DataType(0.8)); + gains(i, j) = gains.At(i, j) * DataType(0.8); } } } diff --git a/libs/ml/include/ml/layers/skip_gram.hpp b/libs/ml/include/ml/layers/skip_gram.hpp index 742cebbd4b..463008df1a 100644 --- a/libs/ml/include/ml/layers/skip_gram.hpp +++ b/libs/ml/include/ml/layers/skip_gram.hpp @@ -90,8 +90,8 @@ class SkipGram : public Layer std::vector results; for (typename ArrayType::SizeType b(0); b < inputs.front().get().shape()[0]; ++b) { - ArrayType slice_input = inputs.front().get().Slice(b).Tensor(); - ArrayType slice_context = inputs.back().get().Slice(b).Tensor(); + ArrayType slice_input = inputs.front().get().Slice(b).Copy(); + ArrayType slice_context = inputs.back().get().Slice(b).Copy(); results.push_back(this->Ops::Forward({slice_input, slice_context})); } return ArrayType::Stack(results); diff --git a/libs/ml/include/ml/ops/activations/relu.hpp b/libs/ml/include/ml/ops/activations/relu.hpp index c258c497a0..52cb13238b 100644 --- a/libs/ml/include/ml/ops/activations/relu.hpp +++ b/libs/ml/include/ml/ops/activations/relu.hpp @@ -48,22 +48,43 @@ class Relu : public fetch::ml::ElementWiseOps return output; } - // x>0 f'(x)=1, x<=0 f'(x)=0 + /** + * Gradients for backprop with Relu are as follows: + * x>0 f'(x)=1, x<=0 f'(x)=0 + * therefore we should return error_signal but zeroed out at the relevant places + * @param inputs + * @param error_signal + * @return + */ virtual std::vector 
Backward( std::vector> const &inputs, - ArrayType const & errorSignal) + ArrayType const & error_signal) { ASSERT(inputs.size() == 1); - ASSERT(inputs[0].get().shape() == errorSignal.shape()); + ASSERT(inputs[0].get().shape() == error_signal.shape()); - ArrayType return_signal = errorSignal.Copy(); - for (SizeType i{0}; i < inputs.front().get().size(); ++i) + ArrayType const &input = inputs.front().get(); + ArrayType return_signal{error_signal.shape()}; + + auto it1 = input.begin(); + auto it2 = return_signal.begin(); + auto err_it = error_signal.cbegin(); + + while (it1.is_valid()) { - if (inputs.front().get()[i] <= DataType(0)) + if (*it1 <= DataType(0)) { - return_signal.data().Set(i, DataType(0)); + *it2 = static_cast(0); } + else + { + *it2 = *err_it; + } + ++it1; + ++it2; + ++err_it; } + return {return_signal}; } diff --git a/libs/ml/include/ml/ops/convolution_1d.hpp b/libs/ml/include/ml/ops/convolution_1d.hpp index 1b6bf6bb6e..63144eb36a 100644 --- a/libs/ml/include/ml/ops/convolution_1d.hpp +++ b/libs/ml/include/ml/ops/convolution_1d.hpp @@ -265,7 +265,9 @@ void Convolution1D::ReverseFillVerticalStride(ArrayType & input, SizeType const input_channels, SizeType const kernel_height) { - SizeType j_s = 0; // stride height iterator + SizeType j_s = 0; // stride height iterator + assert(input.shape().size() == 3); + assert(vertical_stride.shape().size() == 2); for (SizeType i_ic{0}; i_ic < input_channels; ++i_ic) // Iterate over input channels { @@ -274,7 +276,7 @@ void Convolution1D::ReverseFillVerticalStride(ArrayType & input, for (SizeType i_oc{0}; i_oc < output_channels; ++i_oc) // Iterate over output channels { - input.Set(i_oc, i_ic, i_k, vertical_stride.At(i_oc, j_s)); + input(i_oc, i_ic, i_k) = vertical_stride(i_oc, j_s); } ++j_s; } @@ -300,18 +302,20 @@ void Convolution1D::FillHorizontalStride(ArrayType const &input, { SizeType i_s; // stride width index SizeType j_s; // stride height index + assert(horizontal_stride.shape().size() == 2); + assert(input.shape().size() == 2); j_s = 0; - for (SizeType i_o{0}; i_o < output_height; ++i_o) // Iterate over output height + for (SizeType i_o = 0; i_o < output_height; ++i_o) // Iterate over output height { i_s = 0; - for (SizeType i_ic(0); i_ic < input_channels; ++i_ic) // Iterate over input channels + for (SizeType i_ic = 0; i_ic < input_channels; ++i_ic) // Iterate over input channels { - for (SizeType i_k(0); i_k < kernel_height; i_k++) // Iterate over kernel height + for (SizeType i_k = 0; i_k < kernel_height; i_k++) // Iterate over kernel height { - horizontal_stride.Set(i_s, j_s, input.At(i_ic, i_o * stride_size_ + i_k)); + horizontal_stride(i_s, j_s) = input(i_ic, i_o * stride_size_ + i_k); ++i_s; } } @@ -349,7 +353,7 @@ void Convolution1D::ReverseFillHorizontalStride(ArrayType & inpu for (SizeType i_k(0); i_k < kernel_height; i_k++) // Iterate over kernel height { - input.Set(i_ic, i_o * stride_size_ + i_k, horizontal_stride.At(i_s, j_s)); + input(i_ic, i_o * stride_size_ + i_k) = horizontal_stride(i_s, j_s); ++i_s; } } @@ -372,12 +376,12 @@ void Convolution1D::FillOutput(ArrayType const &gemm_output, ArrayTyp SizeType const output_height) { SizeType i_tmp; - for (SizeType i_oc{0}; i_oc < output_channels; ++i_oc) // Iterate over output channels + for (SizeType i_oc = 0; i_oc < output_channels; ++i_oc) // Iterate over output channels { i_tmp = 0; - for (SizeType i_o{0}; i_o < output_height; ++i_o) // Iterate over output height + for (SizeType i_o = 0; i_o < output_height; ++i_o) // Iterate over output height { - output.Set(i_oc, 
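The ReLU backward pass above implements exactly the rule stated in its comment: the derivative is 1 where the forward input was positive and 0 elsewhere, so the error signal is passed through at positive inputs and zeroed everywhere else. A minimal element-wise sketch of that rule in plain C++, independent of the Ops class hierarchy:

#include <cstddef>
#include <vector>

// Element-wise ReLU gradient: pass the incoming error where input > 0, else 0.
std::vector<double> ReluBackward(std::vector<double> const &input,
                                 std::vector<double> const &error_signal)
{
  std::vector<double> return_signal(input.size(), 0.0);
  for (std::size_t i = 0; i < input.size(); ++i)
  {
    if (input[i] > 0.0)
    {
      return_signal[i] = error_signal[i];
    }
  }
  return return_signal;
}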
i_o, gemm_output.At(i_oc, i_tmp)); + output(i_oc, i_o) = gemm_output(i_oc, i_tmp); ++i_tmp; } } @@ -398,12 +402,12 @@ void Convolution1D::ReverseFillOutput(ArrayType &gemm_output, ArrayTy SizeType const output_height) { SizeType i_tmp; - for (SizeType i_oc{0}; i_oc < output_channels; ++i_oc) // Iterate over output channels + for (SizeType i_oc = 0; i_oc < output_channels; ++i_oc) // Iterate over output channels { i_tmp = 0; - for (SizeType i_o{0}; i_o < output_height; ++i_o) // Iterate over output height + for (SizeType i_o = 0; i_o < output_height; ++i_o) // Iterate over output height { - gemm_output.Set(i_oc, i_tmp, output.At(i_oc, i_o)); + gemm_output(i_oc, i_tmp) = output(i_oc, i_o); ++i_tmp; } } diff --git a/libs/ml/include/ml/ops/convolution_2d.hpp b/libs/ml/include/ml/ops/convolution_2d.hpp index 96bd846fa1..cfced5c19f 100644 --- a/libs/ml/include/ml/ops/convolution_2d.hpp +++ b/libs/ml/include/ml/ops/convolution_2d.hpp @@ -220,7 +220,10 @@ std::vector Convolution2D::ComputeOutpu { // Return pre-computed value if exist if (output_shape_.size() != 0) + { return output_shape_; + } + // output_shape_[0]=number of output channels output_shape_.emplace_back(inputs.at(1).get().shape()[0]); // output_shape_[1]=number of stride_size steps over input height diff --git a/libs/ml/include/ml/ops/embeddings.hpp b/libs/ml/include/ml/ops/embeddings.hpp index 8c1c6cb1a5..29525b6929 100644 --- a/libs/ml/include/ml/ops/embeddings.hpp +++ b/libs/ml/include/ml/ops/embeddings.hpp @@ -97,16 +97,24 @@ class Embeddings : public fetch::ml::ops::Weights return {ArrayType(errorSignal.shape())}; } - virtual void Step(typename T::Type learningRate) + virtual void Step(typename T::Type learning_rate) { + ArrayType embedding_slice; + for (auto const &r : updated_rows_) { - auto gradient_accumulation_slice = this->gradient_accumulation_->Slice(r).Tensor(); - auto output_slice = this->output_->Slice(r).Tensor(); + // get the relevant slice from gradients and embeddings + auto grad_slice = this->gradient_accumulation_->Slice(r); + auto out_slice = this->output_->Slice(r); + + embedding_slice = out_slice.Copy(); + + // multiply accumulated gradients by learning rate, then subtract from current embeddings + embedding_slice.InlineSubtract(grad_slice.Copy().InlineMultiply(learning_rate)); - gradient_accumulation_slice.InlineMultiply(-learningRate); - output_slice.InlineAdd(gradient_accumulation_slice); - gradient_accumulation_slice.Fill(typename T::Type(0)); + // zero out gradients and assign new embeddings values + grad_slice.Assign(ArrayType::Zeroes(embedding_slice.shape())); + out_slice.Assign(embedding_slice); } updated_rows_.clear(); } diff --git a/libs/ml/include/ml/ops/ops.hpp b/libs/ml/include/ml/ops/ops.hpp index 2897b4a200..f1240a1990 100644 --- a/libs/ml/include/ml/ops/ops.hpp +++ b/libs/ml/include/ml/ops/ops.hpp @@ -133,8 +133,9 @@ class BatchOps : public Ops std::vector> results; for (typename ArrayType::SizeType b(0); b < inputs.front().get().shape()[0]; ++b) { - auto ret = - this->Backward({inputs.front().get().Slice(b).Tensor()}, errorSignal.Slice(b).Tensor()); + auto input = inputs.front().get().Slice(b).Copy(); + auto error_signal = errorSignal.Slice(b).Copy(); + auto ret = this->Backward({input}, error_signal); for (std::size_t i(0); i < ret.size(); ++i) { results[i].push_back(ret[i]); diff --git a/libs/ml/include/ml/ops/weights.hpp b/libs/ml/include/ml/ops/weights.hpp index c58bf3a1c7..c7c39075d7 100644 --- a/libs/ml/include/ml/ops/weights.hpp +++ b/libs/ml/include/ml/ops/weights.hpp @@ -169,6 
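The rewritten Embeddings::Step above applies a sparse SGD-style update: for every row touched during the batch, the accumulated gradient is scaled by the learning rate and subtracted from the stored embedding, after which that row's gradient accumulator is zeroed so stale gradients are not reapplied. A simplified sketch of the same update on plain buffers (hypothetical names, not the library API):

#include <cstddef>
#include <set>
#include <vector>

// One embedding row per entry; gradients are accumulated row-wise during backward.
void Step(std::vector<std::vector<double>> &embeddings,
          std::vector<std::vector<double>> &gradient_accumulation,
          std::set<std::size_t> &updated_rows, double learning_rate)
{
  for (std::size_t r : updated_rows)
  {
    for (std::size_t c = 0; c < embeddings[r].size(); ++c)
    {
      // subtract the learning-rate-scaled accumulated gradient ...
      embeddings[r][c] -= learning_rate * gradient_accumulation[r][c];
      // ... and zero the accumulator for the next batch
      gradient_accumulation[r][c] = 0.0;
    }
  }
  updated_rows.clear();
}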
+169,15 @@ class Weights : public fetch::ml::ops::PlaceHolder, public Trainable return *this->output_; } + /** + * Returns a copy of embeddings gradients for enquiry + * @return + */ + ArrayType Gradients() + { + return gradient_accumulation_->Copy(); + } + static constexpr char const *DESCRIPTOR = "Weights"; private: diff --git a/libs/ml/include/ml/subgraph.hpp b/libs/ml/include/ml/subgraph.hpp index 840ab6f108..70f218bd3b 100644 --- a/libs/ml/include/ml/subgraph.hpp +++ b/libs/ml/include/ml/subgraph.hpp @@ -33,9 +33,8 @@ template class SubGraph : public Graph, public BatchOps { public: - using ArrayType = T; - using ArrayPtrType = std::shared_ptr; - using ConstSliceType = typename ArrayType::ConstSliceType; + using ArrayType = T; + using ArrayPtrType = std::shared_ptr; virtual ArrayType Forward(std::vector> const &inputs, ArrayType & output) @@ -54,22 +53,22 @@ class SubGraph : public Graph, public BatchOps ArrayType const & errorSignal) { ASSERT(inputs.size() == this->input_nodes_.size()); - std::vector *, ArrayType>> nonBackpropagatedErrorSignals = + std::vector *, ArrayType>> non_back_prop_err_signal = this->output_node_->BackPropagate(errorSignal); - std::vector backpropagatedErrorSignals; + std::vector back_prop_err_signal; for (std::string const &s : input_nodes_) { std::shared_ptr> node = this->nodes_[s]; - for (auto const &grad : nonBackpropagatedErrorSignals) + for (auto const &grad : non_back_prop_err_signal) { if (grad.first == node.get()) { - backpropagatedErrorSignals.push_back(grad.second); + back_prop_err_signal.push_back(grad.second); } } } - return backpropagatedErrorSignals; + return back_prop_err_signal; } protected: diff --git a/libs/ml/tests/ml/layers/convolution_1d.cpp b/libs/ml/tests/ml/layers/convolution_1d.cpp index a92611d446..18838656f7 100644 --- a/libs/ml/tests/ml/layers/convolution_1d.cpp +++ b/libs/ml/tests/ml/layers/convolution_1d.cpp @@ -169,10 +169,15 @@ TYPED_TEST(Convolution1DTest, ops_backward_test) // Use the class as an Ops ASSERT_EQ(backprop_error[0].shape()[0], input_channels); ASSERT_EQ(backprop_error[0].shape()[1], input_height); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0)), -4.30774927f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0)), 9.1627159f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0)), 0.80360967f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(3, 0)), 1.2491617f); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0)), -4.3077492713928222656); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0)), 9.162715911865234375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0)), 0.80360949039459228516); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 1)), 1.2491617202758789062); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 1)), 2.8053097724914550781); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 1)), -4.166011810302734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 2)), 2.4086174964904785156); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 2)), -0.86411559581756591797); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 2)), -3.5623354911804199219); } TYPED_TEST(Convolution1DTest, node_forward_test) // Use the class as a Node @@ -278,10 +283,16 @@ TYPED_TEST(Convolution1DTest, node_backward_test) // Use the class as a Node ASSERT_EQ(backprop_error[0].second.shape()[0], input_channels); ASSERT_EQ(backprop_error[0].second.shape()[1], input_height); - 
EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0)), -4.30774927f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0)), 9.1627159f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0)), 0.80360967f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(3, 0)), 1.2491617f); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0)), -4.3077492713928222656); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0)), 9.162715911865234375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0)), 0.80360949039459228516); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 1)), 1.2491617202758789062); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 1)), 2.8053097724914550781); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 1)), -4.166011810302734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 2)), 2.4086174964904785156); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 2)), + -0.86411559581756591797); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 2)), -3.5623354911804199219); } TYPED_TEST(Convolution1DTest, graph_forward_test) // Use the class as a Node diff --git a/libs/ml/tests/ml/layers/convolution_2d.cpp b/libs/ml/tests/ml/layers/convolution_2d.cpp index 2dec80dc97..d04822c7fd 100644 --- a/libs/ml/tests/ml/layers/convolution_2d.cpp +++ b/libs/ml/tests/ml/layers/convolution_2d.cpp @@ -195,10 +195,35 @@ TYPED_TEST(Convolution2DTest, ops_backward_test) // Use the class as an Ops ASSERT_EQ(backprop_error[0].shape()[1], input_height); ASSERT_EQ(backprop_error[0].shape()[2], input_width); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0, 0)), -4.30774927f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0, 0)), 9.1627159f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0, 0)), 0.80360967f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(3, 0, 0)), 1.2491617f); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0, 0)), -4.3077492713928222656); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0, 0)), 9.162715911865234375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0, 0)), 0.80360949039459228516); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 1, 0)), 1.2491617202758789062); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 1, 0)), 2.8053097724914550781); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 1, 0)), -4.166011810302734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 2, 0)), 2.4086174964904785156); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 2, 0)), -0.86411559581756591797); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 2, 0)), -3.5623354911804199219); + + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0, 1)), -2.9907839298248291016); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0, 1)), -0.16291338205337524414); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0, 1)), -2.5308477878570556641); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 1, 1)), -1.2312210798263549805); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 1, 1)), -6.6115474700927734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 1, 1)), 3.2868711948394775391); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 2, 1)), -4.994899749755859375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 2, 1)), -2.9489955902099609375); + 
EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 2, 1)), -2.4173920154571533203); + + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 0, 2)), 2.4823324680328369141); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 0, 2)), 2.4479858875274658203); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 0, 2)), -0.3612575531005859375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 1, 2)), -6.4253511428833007812); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 1, 2)), -3.184307098388671875); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 1, 2)), 0.51499307155609130859); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(0, 2, 2)), -1.5936613082885742188); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(1, 2, 2)), -0.41774189472198486328); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).At(2, 2, 2)), 0.98040378093719482422); } TYPED_TEST(Convolution2DTest, node_forward_test) // Use the class as a Node @@ -322,10 +347,62 @@ TYPED_TEST(Convolution2DTest, node_backward_test) // Use the class as a Node ASSERT_EQ(backprop_error[0].second.shape()[1], input_height); ASSERT_EQ(backprop_error[0].second.shape()[2], input_width); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0, 0)), -4.30774927f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0, 0)), 9.1627159f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0, 0)), 0.80360967f); - EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(3, 0, 0)), 1.2491617f); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0, 0)), + -4.3077492713928222656); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0, 0)), + 9.162715911865234375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0, 0)), + 0.80360949039459228516); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 1, 0)), + 1.2491617202758789062); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 1, 0)), + 2.8053097724914550781); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 1, 0)), + -4.166011810302734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 2, 0)), + 2.4086174964904785156); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 2, 0)), + -0.86411559581756591797); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 2, 0)), + -3.5623354911804199219); + + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0, 1)), + -2.9907839298248291016); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0, 1)), + -0.16291338205337524414); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0, 1)), + -2.5308477878570556641); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 1, 1)), + -1.2312210798263549805); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 1, 1)), + -6.6115474700927734375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 1, 1)), + 3.2868711948394775391); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 2, 1)), + -4.994899749755859375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 2, 1)), + -2.9489955902099609375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 2, 1)), + -2.4173920154571533203); + + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 0, 2)), + 2.4823324680328369141); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 0, 2)), + 2.4479858875274658203); + 
EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 0, 2)), + -0.3612575531005859375); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 1, 2)), + -6.4253511428833007812); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 1, 2)), + -3.184307098388671875); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 1, 2)), + 0.51499307155609130859); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(0, 2, 2)), + -1.5936613082885742188); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(1, 2, 2)), + -0.41774189472198486328); + EXPECT_FLOAT_EQ(static_cast(backprop_error.at(0).second.At(2, 2, 2)), + 0.98040378093719482422); } TYPED_TEST(Convolution2DTest, graph_forward_test) // Use the class as a Node diff --git a/libs/ml/tests/ml/ops/embeddings.cpp b/libs/ml/tests/ml/ops/embeddings.cpp index 69b0de24af..3bfd7ec9d3 100644 --- a/libs/ml/tests/ml/ops/embeddings.cpp +++ b/libs/ml/tests/ml/ops/embeddings.cpp @@ -89,33 +89,42 @@ TYPED_TEST(EmbeddingsTest, forward) TYPED_TEST(EmbeddingsTest, backward) { + using Type = typename TypeParam::Type; + using SizeType = typename TypeParam::SizeType; + fetch::ml::ops::Embeddings e(10, 6); TypeParam weights(std::vector({10, 6})); for (unsigned int i(0); i < 10; ++i) { for (unsigned int j(0); j < 6; ++j) { - weights.Set(i, j, typename TypeParam::Type(i * 10 + j)); + weights.Set(i, j, Type(i * 10 + j)); } } e.SetData(weights); TypeParam input(std::vector({2})); - input.At(0) = typename TypeParam::Type(3); - input.At(1) = typename TypeParam::Type(5); + input.At(0) = Type(3); + input.At(1) = Type(5); TypeParam output = e.fetch::ml::template Ops::Forward( std::vector>({input})); - TypeParam errorSignal(std::vector({2, 6})); + TypeParam error_signal(std::vector({2, 6})); for (unsigned int j(0); j < 2; ++j) { for (unsigned int k{0}; k < 6; ++k) { - errorSignal.Set(j, k, typename TypeParam::Type((j * 6) + k)); + error_signal.Set(j, k, Type((j * 6) + k)); } } - e.Backward({input}, errorSignal); - e.Step(typename TypeParam::Type(1)); + + e.Backward({input}, error_signal); + e.Step(Type(1)); + + // Get a copy of the gradients and check that they were zeroed out after Step + TypeParam grads_copy = e.Gradients(); + EXPECT_TRUE(TypeParam::Zeroes({1, 6}).AllClose(grads_copy.Slice(SizeType(input.At(0))).Copy())); + EXPECT_TRUE(TypeParam::Zeroes({1, 6}).AllClose(grads_copy.Slice(SizeType(input.At(1))).Copy())); output = e.fetch::ml::template Ops::Forward( std::vector>({input})); @@ -125,7 +134,7 @@ TYPED_TEST(EmbeddingsTest, backward) { for (unsigned int k{0}; k < 6; ++k) { - EXPECT_EQ(output.At(j, k), typename TypeParam::Type(gt[(j * 6) + k])); + EXPECT_EQ(output.At(j, k), Type(gt[(j * 6) + k])); } } } diff --git a/libs/vectorise/include/vectorise/memory/vector_slice.hpp b/libs/vectorise/include/vectorise/memory/vector_slice.hpp index 22c7a09480..80397ad9fb 100644 --- a/libs/vectorise/include/vectorise/memory/vector_slice.hpp +++ b/libs/vectorise/include/vectorise/memory/vector_slice.hpp @@ -91,7 +91,6 @@ class VectorSlice // TODO(private 860): ensure trivial type void SetAllZero() { - assert(pointer_ != nullptr); if (pointer_) { std::memset(static_cast(pointer_), 0, padded_size() * sizeof(Type)); @@ -100,7 +99,6 @@ class VectorSlice void SetPaddedZero() { - assert(pointer_ != nullptr); if (pointer_) { std::memset(static_cast(pointer_ + size()), 0,