From 5c11158b7df41539aa7320f5777bd05434701533 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicol=C3=B2=20Scipione?=
 <9421873+s-Nick@users.noreply.github.com>
Date: Wed, 13 Sep 2023 16:04:12 +0200
Subject: [PATCH] Added Omatcopy_batch BLAS Extension operator (#445)

Co-authored-by: Ouadie El Farouki <ouadie.elfarouki@codeplay.com>
---
 README.md                                     |   4 +-
 benchmark/cublas/extension/omatadd.cpp        |  22 +-
 benchmark/cublas/extension/omatcopy.cpp       |  23 +-
 benchmark/portblas/CMakeLists.txt             |   1 +
 benchmark/portblas/extension/omatadd.cpp      |  19 +-
 benchmark/portblas/extension/omatcopy.cpp     |  21 +-
 benchmark/portblas/extension/omatcopy2.cpp    |  23 +-
 .../portblas/extension/omatcopy_batched.cpp   | 202 ++++++++++++++++
 benchmark/rocblas/extension/omatadd.cpp       |  18 +-
 benchmark/rocblas/extension/omatcopy.cpp      |  21 +-
 cmake/CmakeFunctionHelper.cmake               |   3 +-
 .../include/common/benchmark_identifier.hpp   |  27 ++-
 common/include/common/benchmark_names.hpp     |  35 +++
 .../common/blas_extension_state_counters.hpp  |  42 ++--
 common/include/common/common_utils.hpp        |  56 +++++
 include/interface/extension_interface.h       | 197 ++++++++++-----
 include/operations/extension/matcopy_batch.h  |  72 ++++++
 include/operations/extension/transpose.h      |  30 ++-
 include/portblas.h                            |   2 +
 src/interface/extension/CMakeLists.txt        |   2 +
 src/interface/extension/backend/amd_gpu.hpp   |  43 +++-
 .../extension/backend/default_cpu.hpp         |  31 ++-
 src/interface/extension/backend/intel_gpu.hpp |  43 +++-
 .../extension/backend/nvidia_gpu.hpp          |  43 +++-
 src/interface/extension/matcopy_batch.cpp.in  |  87 +++++++
 src/interface/extension_interface.hpp         | 227 +++++++++++-------
 src/operations/extension/matcopy_batch.hpp    | 171 +++++++++++++
 src/operations/extension/transpose.hpp        |  35 ++-
 src/portblas.hpp                              |   2 +
 test/unittest/CMakeLists.txt                  |   1 +
 .../extension/extension_reference.hpp         |   8 +-
 test/unittest/extension/omatcopy2_test.cpp    |   4 +-
 .../extension/omatcopy_batched_test.cpp       | 156 ++++++++++++
 test/unittest/extension/omatcopy_test.cpp     |   7 +-
 test/unittest/extension/transpose_test.cpp    |   3 +-
 35 files changed, 1351 insertions(+), 330 deletions(-)
 create mode 100644 benchmark/portblas/extension/omatcopy_batched.cpp
 create mode 100644 include/operations/extension/matcopy_batch.h
 create mode 100644 src/interface/extension/matcopy_batch.cpp.in
 create mode 100644 src/operations/extension/matcopy_batch.hpp
 create mode 100644 test/unittest/extension/omatcopy_batched_test.cpp

diff --git a/README.md b/README.md
index adebe5426..5b8541b18 100644
--- a/README.md
+++ b/README.md
@@ -311,7 +311,7 @@ For all these operations:
 * `A`, `B` and `C` are containers for the column-major matrices A, B and C.
 * `lda`, `ldb` and `ldc` are the leading dimensions of the matrices A, B and C
   (cf BLAS 2). The leading dimension of a matrix must be greater than or equal
-  to its number of rows. In the case of in-place transpose, the same matrix `A`
+  to its number of rows. In the case of in-place copy/transpose, the same matrix `A`
   is used with two different leading dimensions for input & output.
 * `stride_a`, `stride_b` and `stride_c` are the striding size between consecutive 
 matrices in a batched entry for inputs/outputs A, B and C. 
@@ -327,6 +327,8 @@ matrices in a batched entry for inputs/outputs A, B and C.
 | `_omatcopy` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `B`, `ldb`  | Perform an out-of-place scaled matrix transpose or copy operation using a general dense matrix. |
 | `_omatcopy2`| `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `inc_a`, `B`, `ldb`, `inc_b`  | Computes two-strided scaling and out-of-place transposition or copying of general dense matrices. |
 | `_omatadd`| `sb_handle`, `transa`, `transb`, `M`, `N`, `alpha`, `A`, `lda`, `beta`, `B`, `ldb`, `C`,`ldc`  | Computes scaled general dense matrix addition with possibly transposed arguments. |
+| `_omatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `stride_a`, `B`, `ldb`, `stride_b`, `batch_size` | Perform an out-of-place scaled batched-strided matrix transpose or copy operation using a general dense matrix. |
+| `_imatcopy_batch` | `sb_handle`, `transa`, `M`, `N`, `alpha`, `A`, `lda`, `ldb`, `stride`, `batch_size` | Perform an in-place scaled batched-strided matrix transpose* or copy operation using a general dense matrix. (*: Currently the transpose case is not supported). |
 
 Other non-official extension operators : 
 | operation | arguments | description |
diff --git a/benchmark/cublas/extension/omatadd.cpp b/benchmark/cublas/extension/omatadd.cpp
index 67cc544cf..7a4202cd2 100644
--- a/benchmark/cublas/extension/omatadd.cpp
+++ b/benchmark/cublas/extension/omatadd.cpp
@@ -26,17 +26,6 @@
 #include "../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
-                     scalar_t alpha, scalar_t beta, index_t lda_mul,
-                     index_t ldb_mul, index_t ldc_mul) {
-  std::ostringstream str{};
-  str << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha
-      << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul;
-  return str.str();
-}
-
 template <typename scalar_t, typename... args_t>
 static inline void cublas_routine(args_t&&... args) {
   if constexpr (std::is_same_v<scalar_t, float>) {
@@ -71,7 +60,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti_a,
   const auto size_c = ldc * n;
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
       state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);
 
   cublasHandle_t& cuda_handle = *cuda_handle_ptr;
@@ -182,11 +171,12 @@ void register_benchmark(blas_benchmark::Args& args,
                         lda_mul, ldb_mul, ldc_mul, success);
         };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
-                           ldc_mul)
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
+            ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul)
             .c_str(),
-        BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul,
-        ldb_mul, ldc_mul, success)
+        BM_lambda, cublas_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul,
+        ldc_mul, success)
         ->UseRealTime();
   }
 }
diff --git a/benchmark/cublas/extension/omatcopy.cpp b/benchmark/cublas/extension/omatcopy.cpp
index 2704ffe1b..02d3527d1 100644
--- a/benchmark/cublas/extension/omatcopy.cpp
+++ b/benchmark/cublas/extension/omatcopy.cpp
@@ -26,16 +26,6 @@
 #include "../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string ts_a, int m, int n, scalar_t alpha,
-                     index_t lda_mul, index_t ldb_mul) {
-  std::ostringstream str{};
-  str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << ts_a << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
-      << "/" << ldb_mul;
-  return str.str();
-}
-
 template <typename scalar_t, typename... args_t>
 static inline void cublas_routine(args_t&&... args) {
   if constexpr (std::is_same_v<scalar_t, float>) {
@@ -66,7 +56,7 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,
   const auto cuda_size_b = cuda_ldb * n;
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
       state, t_str, m, n, lda_mul, ldb_mul);
 
   cublasHandle_t& cuda_handle = *cuda_handle_ptr;
@@ -89,10 +79,10 @@ void run(benchmark::State& state, cublasHandle_t* cuda_handle_ptr, int ti,
 
 #ifdef BLAS_VERIFY_BENCHMARK
   // Run a first time with a verification of the results
-  std::vector<scalar_t> m_b_ref = m_b;  // m_b;
+  std::vector<scalar_t> m_b_ref = m_b;
 
-  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a, cuda_lda,
-                                      m_b_ref, cuda_ldb);
+  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a.data(), cuda_lda,
+                                      m_b_ref.data(), cuda_ldb);
 
   std::vector<scalar_t> m_b_temp = m_b;
   {
@@ -180,7 +170,10 @@ void register_benchmark(blas_benchmark::Args& args,
                     success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts_a, m, n, alpha, lda_mul, ldb_mul).c_str(),
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
+            ts_a, m, n, alpha, lda_mul, ldb_mul)
+            .c_str(),
         BM_lambda, cublas_handle_ptr, t_a, m, n, alpha, lda_mul, ldb_mul,
         success)
         ->UseRealTime();
diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt
index 00e73379d..03d86997e 100644
--- a/benchmark/portblas/CMakeLists.txt
+++ b/benchmark/portblas/CMakeLists.txt
@@ -67,6 +67,7 @@ set(sources
   extension/omatcopy.cpp
   extension/omatcopy2.cpp
   extension/omatadd.cpp
+  extension/omatcopy_batched.cpp
 )
 
 if(${BLAS_ENABLE_EXTENSIONS})
diff --git a/benchmark/portblas/extension/omatadd.cpp b/benchmark/portblas/extension/omatadd.cpp
index 393304d6b..da2879299 100644
--- a/benchmark/portblas/extension/omatadd.cpp
+++ b/benchmark/portblas/extension/omatadd.cpp
@@ -26,18 +26,6 @@
 #include "../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
-                     scalar_t alpha, scalar_t beta, index_t lda_mul,
-                     index_t ldb_mul, index_t ldc_mul, std::string mem_type) {
-  std::ostringstream str{};
-  str << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha
-      << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul;
-  str << "/" << mem_type;
-  return str.str();
-}
-
 template <typename scalar_t, blas::helper::AllocType mem_alloc>
 void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a,
          int ti_b, index_t m, index_t n, scalar_t alpha, scalar_t beta,
@@ -63,7 +51,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti_a,
   const auto size_c = ldc * n;
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
       state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);
 
   blas::SB_Handle& sb_handle = *sb_handle_ptr;
@@ -179,8 +167,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
                                lda_mul, ldb_mul, ldc_mul, success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
-                           ldc_mul, mem_type)
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
+            ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul, mem_type)
             .c_str(),
         BM_lambda, sb_handle_ptr, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul,
         ldc_mul, success)
diff --git a/benchmark/portblas/extension/omatcopy.cpp b/benchmark/portblas/extension/omatcopy.cpp
index 6f214ff57..17e990d90 100644
--- a/benchmark/portblas/extension/omatcopy.cpp
+++ b/benchmark/portblas/extension/omatcopy.cpp
@@ -26,17 +26,6 @@
 #include "../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string t, int m, int n, scalar_t alpha,
-                     index_t lda_mul, index_t ldb_mul, std::string mem_type) {
-  std::ostringstream str{};
-  str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
-      << "/" << ldb_mul;
-  str << "/" << mem_type;
-  return str.str();
-}
-
 template <typename scalar_t, blas::helper::AllocType mem_alloc>
 void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
          index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
@@ -57,7 +46,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
   const auto size_b = ldb * ((*t_str == 't') ? m : n);
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
       state, t_str, m, n, lda_mul, ldb_mul);
 
   blas::SB_Handle& sb_handle = *sb_handle_ptr;
@@ -83,7 +72,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
   // Run a first time with a verification of the results
   std::vector<scalar_t> m_b_ref = m_b;
 
-  reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a, lda, m_b_ref, ldb);
+  reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data(), lda,
+                               m_b_ref.data(), ldb);
 
   std::vector<scalar_t> m_b_temp = m_b;
   {
@@ -163,7 +153,10 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
                                ldb_mul, success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts, m, n, alpha, lda_mul, ldb_mul, mem_type).c_str(),
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
+            ts, m, n, alpha, lda_mul, ldb_mul, mem_type)
+            .c_str(),
         BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, success)
         ->UseRealTime();
   }
diff --git a/benchmark/portblas/extension/omatcopy2.cpp b/benchmark/portblas/extension/omatcopy2.cpp
index 783438a9b..121eef9f7 100644
--- a/benchmark/portblas/extension/omatcopy2.cpp
+++ b/benchmark/portblas/extension/omatcopy2.cpp
@@ -26,18 +26,6 @@
 #include "../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string t, int m, int n, scalar_t alpha,
-                     index_t lda_mul, index_t ldb_mul, index_t inc_a,
-                     index_t inc_b, std::string mem_type) {
-  std::ostringstream str{};
-  str << "BM_omatcopy2<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
-      << "/" << ldb_mul << "/" << inc_a << "/" << inc_b;
-  str << "/" << mem_type;
-  return str.str();
-}
-
 template <typename scalar_t, blas::helper::AllocType mem_alloc>
 void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
          index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
@@ -59,7 +47,7 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
   const auto size_b = ldb * ((*t_str == 't') ? m : n);
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatcopy2, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>(
       state, t_str, m, n, lda_mul, ldb_mul, inc_a, inc_b);
 
   blas::SB_Handle& sb_handle = *sb_handle_ptr;
@@ -85,8 +73,8 @@ void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
   // Run a first time with a verification of the results
   std::vector<scalar_t> m_b_ref = m_b;
 
-  reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a, lda, inc_a, m_b_ref,
-                                ldb, inc_b);
+  reference_blas::ext_omatcopy2(*t_str, m, n, alpha, m_a.data(), lda, inc_a,
+                                m_b_ref.data(), ldb, inc_b);
 
   std::vector<scalar_t> m_b_temp = m_b;
   {
@@ -167,8 +155,9 @@ void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
                                ldb_mul, inc_a, inc_b, success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b,
-                           mem_type)
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatcopy2, scalar_t>(
+            ts, m, n, alpha, lda_mul, ldb_mul, inc_a, inc_b, mem_type)
             .c_str(),
         BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul, inc_a,
         inc_b, success)
diff --git a/benchmark/portblas/extension/omatcopy_batched.cpp b/benchmark/portblas/extension/omatcopy_batched.cpp
new file mode 100644
index 000000000..c88ce624a
--- /dev/null
+++ b/benchmark/portblas/extension/omatcopy_batched.cpp
@@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCL-BLAS: BLAS implementation using SYCL
+ *
+ *  @filename omatcopy_batched.cpp
+ *
+ **************************************************************************/
+
+#include "../../../test/unittest/extension/extension_reference.hpp"
+#include "../utils.hpp"
+
+template <typename scalar_t, blas::helper::AllocType mem_alloc>
+void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, int ti,
+         index_t m, index_t n, scalar_t alpha, index_t lda_mul, index_t ldb_mul,
+         index_t stride_a_mul, index_t stride_b_mul, index_t batch_size,
+         bool* success) {
+  // initialize the state label
+  blas_benchmark::utils::set_benchmark_label<scalar_t>(
+      state, sb_handle_ptr->get_queue());
+
+  // Standard test setup.
+  std::string ts = blas_benchmark::utils::from_transpose_enum(
+      static_cast<blas_benchmark::utils::Transposition>(ti));
+  const char* t_str = ts.c_str();
+
+  const auto lda = lda_mul * m;
+  const auto ldb = (*t_str == 't') ? ldb_mul * n : ldb_mul * m;
+
+  const auto stride_a = lda * n * stride_a_mul;
+  const auto stride_b = ((*t_str == 't') ? ldb * m : ldb * n) * stride_b_mul;
+
+  const auto size_a = stride_a * batch_size;
+  const auto size_b = stride_b * batch_size;
+
+  blas_benchmark::utils::init_extension_counters<
+      blas_benchmark::utils::ExtensionOp::omatcopy_batch, scalar_t>(
+      state, t_str, m, n, lda_mul, ldb_mul, stride_a_mul, stride_b_mul,
+      batch_size);
+
+  blas::SB_Handle& sb_handle = *sb_handle_ptr;
+  auto q = sb_handle.get_queue();
+
+  // Input matrix/vector, output vector.
+  std::vector<scalar_t> m_a =
+      blas_benchmark::utils::random_data<scalar_t>(size_a);
+  std::vector<scalar_t> m_b =
+      blas_benchmark::utils::random_data<scalar_t>(size_b);
+
+  auto m_a_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_a, q);
+  auto m_b_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_b, q);
+
+  auto copy_a = blas::helper::copy_to_device(q, m_a.data(), m_a_gpu, size_a);
+  auto copy_b = blas::helper::copy_to_device(q, m_b.data(), m_b_gpu, size_b);
+
+  sb_handle.wait({copy_a, copy_b});
+
+#ifdef BLAS_VERIFY_BENCHMARK
+  // Run a first time with a verification of the results
+  std::vector<scalar_t> m_b_ref = m_b;
+
+  for (int i = 0; i < batch_size; ++i) {
+    reference_blas::ext_omatcopy(*t_str, m, n, alpha, m_a.data() + i * stride_a,
+                                 lda, m_b_ref.data() + i * stride_b, ldb);
+  }
+
+  std::vector<scalar_t> m_b_temp = m_b;
+  {
+    auto m_b_temp_gpu = blas::helper::allocate<mem_alloc, scalar_t>(size_b, q);
+    auto copy_tmp = blas::helper::copy_to_device<scalar_t>(
+        q, m_b_temp.data(), m_b_temp_gpu, size_b);
+
+    auto event = blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu,
+                                       lda, stride_a, m_b_temp_gpu, ldb,
+                                       stride_b, batch_size, {copy_tmp});
+    sb_handle.wait(event);
+    auto copy_res = blas::helper::copy_to_host<scalar_t>(
+        q, m_b_temp_gpu, m_b_temp.data(), size_b);
+
+    sb_handle.wait(copy_res);
+    blas::helper::deallocate<mem_alloc>(m_b_temp_gpu, q);
+  }
+
+  std::ostringstream err_stream;
+  if (!utils::compare_vectors(m_b_temp, m_b_ref, err_stream, "")) {
+    const std::string& err_str = err_stream.str();
+    state.SkipWithError(err_str.c_str());
+    *success = false;
+  };
+#endif
+
+  auto blas_warmup_method_def = [&]() -> void {
+    auto event =
+        blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda,
+                              stride_a, m_b_gpu, ldb, stride_b, batch_size);
+    return;
+  };
+
+  auto blas_method_def = [&]() -> std::vector<cl::sycl::event> {
+    auto event =
+        blas::_omatcopy_batch(sb_handle, *t_str, m, n, alpha, m_a_gpu, lda,
+                              stride_a, m_b_gpu, ldb, stride_b, batch_size);
+    sb_handle.wait(event);
+    return event;
+  };
+
+  // Warmup
+  blas_benchmark::utils::warmup(blas_warmup_method_def);
+  sb_handle.wait();
+
+  blas_benchmark::utils::init_counters(state);
+
+  // Measure
+  for (auto _ : state) {
+    // Run
+    std::tuple<double, double> times =
+        blas_benchmark::utils::timef(blas_method_def);
+
+    // Report
+    blas_benchmark::utils::update_counters(state, times);
+  }
+
+  state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]);
+  state.SetBytesProcessed(state.iterations() *
+                          state.counters["bytes_processed"]);
+
+  blas_benchmark::utils::calc_avg_counters(state);
+
+  blas::helper::deallocate<mem_alloc>(m_a_gpu, q);
+  blas::helper::deallocate<mem_alloc>(m_b_gpu, q);
+}
+
+template <typename scalar_t, typename blas::helper::AllocType mem_alloc>
+void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success,
+                        std::string mem_type,
+                        std::vector<matcopy_batch_param_t<scalar_t>> params) {
+  for (auto p : params) {
+    std::string ts;
+    index_t m, n, lda_mul, ldb_mul, stride_a_mul, stride_b_mul, batch_size;
+    scalar_t alpha;
+    std::tie(ts, m, n, alpha, lda_mul, ldb_mul, stride_a_mul, stride_b_mul,
+             batch_size) = p;
+    int t = static_cast<int>(blas_benchmark::utils::to_transpose_enum(ts));
+
+    auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr,
+                         int t, index_t m, index_t n, scalar_t alpha,
+                         index_t lda_mul, index_t ldb_mul, index_t stride_a_mul,
+                         index_t stride_b_mul, index_t batch_size,
+                         bool* success) {
+      run<scalar_t, mem_alloc>(st, sb_handle_ptr, t, m, n, alpha, lda_mul,
+                               ldb_mul, stride_a_mul, stride_b_mul, batch_size,
+                               success);
+    };
+    benchmark::RegisterBenchmark(
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatcopy_batch, scalar_t,
+            index_t>(ts, m, n, alpha, lda_mul, ldb_mul, stride_a_mul,
+                     stride_b_mul, batch_size, mem_type)
+            .c_str(),
+        BM_lambda, sb_handle_ptr, t, m, n, alpha, lda_mul, ldb_mul,
+        stride_a_mul, stride_b_mul, batch_size, success)
+        ->UseRealTime();
+  }
+}
+
+template <typename scalar_t>
+void register_benchmark(blas_benchmark::Args& args,
+                        blas::SB_Handle* sb_handle_ptr, bool* success) {
+  auto omatcopy_batch_params =
+      blas_benchmark::utils::get_matcopy_batch_params<scalar_t>(args);
+  register_benchmark<scalar_t, blas::helper::AllocType::buffer>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER,
+      omatcopy_batch_params);
+#ifdef SB_ENABLE_USM
+  register_benchmark<scalar_t, blas::helper::AllocType::usm>(
+      sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM,
+      omatcopy_batch_params);
+#endif
+}
+
+namespace blas_benchmark {
+void create_benchmark(blas_benchmark::Args& args,
+                      blas::SB_Handle* sb_handle_ptr, bool* success) {
+  BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success);
+}
+}  // namespace blas_benchmark
diff --git a/benchmark/rocblas/extension/omatadd.cpp b/benchmark/rocblas/extension/omatadd.cpp
index f2d1bb39b..71f0ca483 100644
--- a/benchmark/rocblas/extension/omatadd.cpp
+++ b/benchmark/rocblas/extension/omatadd.cpp
@@ -26,17 +26,6 @@
 #include "../../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string ts_a, std::string ts_b, int m, int n,
-                     scalar_t alpha, scalar_t beta, index_t lda_mul,
-                     index_t ldb_mul, index_t ldc_mul) {
-  std::ostringstream str{};
-  str << "BM_omatadd<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << ts_a << "/" << ts_b << "/" << m << "/" << n << "/" << alpha
-      << "/" << beta << "/" << lda_mul << "/" << ldb_mul << "/" << ldc_mul;
-  return str.str();
-}
-
 template <typename scalar_t, typename... args_t>
 static inline void rocblas_geam_f(args_t&&... args) {
   if constexpr (std::is_same_v<scalar_t, float>) {
@@ -71,7 +60,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti_a, int ti_b,
   const auto size_c = ldc * n;
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatadd, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
       state, t_str_a, t_str_b, m, n, lda_mul, ldb_mul, ldc_mul);
 
   // Input matrix/vector, output vector.
@@ -183,8 +172,9 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
                     ldb_mul, ldc_mul, success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul,
-                           ldc_mul)
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatadd, scalar_t>(
+            ts_a, ts_b, m, n, alpha, beta, lda_mul, ldb_mul, ldc_mul)
             .c_str(),
         BM_lambda, rb_handle, t_a, t_b, m, n, alpha, beta, lda_mul, ldb_mul,
         ldc_mul, success)
diff --git a/benchmark/rocblas/extension/omatcopy.cpp b/benchmark/rocblas/extension/omatcopy.cpp
index 5d0e7862d..8d87dd7a4 100644
--- a/benchmark/rocblas/extension/omatcopy.cpp
+++ b/benchmark/rocblas/extension/omatcopy.cpp
@@ -26,16 +26,6 @@
 #include "../../../../test/unittest/extension/extension_reference.hpp"
 #include "../utils.hpp"
 
-template <typename scalar_t>
-std::string get_name(std::string t, int m, int n, scalar_t alpha,
-                     index_t lda_mul, index_t ldb_mul) {
-  std::ostringstream str{};
-  str << "BM_omatcopy<" << blas_benchmark::utils::get_type_name<scalar_t>()
-      << ">/" << t << "/" << m << "/" << n << "/" << alpha << "/" << lda_mul
-      << "/" << ldb_mul;
-  return str.str();
-}
-
 template <typename scalar_t, typename... args_t>
 static inline void rocblas_geam_f(args_t&&... args) {
   if constexpr (std::is_same_v<scalar_t, float>) {
@@ -65,7 +55,7 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti, index_t m,
   const auto size_b = ldb * n;
 
   blas_benchmark::utils::init_extension_counters<
-      blas_benchmark::utils::ExtensionOP::omatcopy, scalar_t>(
+      blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
       state, t_str, m, n, lda_mul, ldb_mul);
 
   // Input matrix/vector, output vector.
@@ -90,8 +80,8 @@ void run(benchmark::State& state, rocblas_handle& rb_handle, int ti, index_t m,
   // Run a first time with a verification of the results
   std::vector<scalar_t> m_b_ref = m_b;
 
-  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a, lda, m_b_ref,
-                                      ldb);
+  reference_blas::ext_omatcopy<false>(*t_str, m, n, alpha, m_a.data(), lda,
+                                      m_b_ref.data(), ldb);
 
   std::vector<scalar_t> m_b_temp = m_b;
   {
@@ -176,7 +166,10 @@ void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle,
       run<scalar_t>(st, rb_handle_, t, m, n, alpha, lda_mul, ldb_mul, success);
     };
     benchmark::RegisterBenchmark(
-        get_name<scalar_t>(ts, m, n, alpha, lda_mul, ldb_mul).c_str(),
+        blas_benchmark::utils::get_name<
+            blas_benchmark::utils::ExtensionOp::omatcopy, scalar_t>(
+            ts, m, n, alpha, lda_mul, ldb_mul)
+            .c_str(),
         BM_lambda, rb_handle, t, m, n, alpha, lda_mul, ldb_mul, success)
         ->UseRealTime();
   }
diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake
index df35c8c37..638ce8a8e 100644
--- a/cmake/CmakeFunctionHelper.cmake
+++ b/cmake/CmakeFunctionHelper.cmake
@@ -562,7 +562,8 @@ function (build_library LIB_NAME ENABLE_EXTENSIONS)
                 $<TARGET_OBJECTS:trsm>
                 $<TARGET_OBJECTS:matcopy>
                 $<TARGET_OBJECTS:transpose>
-                $<TARGET_OBJECTS:omatadd>)
+                $<TARGET_OBJECTS:omatadd>
+                $<TARGET_OBJECTS:matcopy_batch>)
 
    if (${ENABLE_EXTENSIONS})
      list(APPEND LIB_SRCS $<TARGET_OBJECTS:reduction>)
diff --git a/common/include/common/benchmark_identifier.hpp b/common/include/common/benchmark_identifier.hpp
index d0fa906cf..832f00614 100644
--- a/common/include/common/benchmark_identifier.hpp
+++ b/common/include/common/benchmark_identifier.hpp
@@ -74,7 +74,16 @@ enum class Level3Op : int {
   trsm = 8
 };
 
-enum class ExtensionOp : int { reduction = 0 };
+enum class ExtensionOp : int {
+  omatcopy = 0,
+  imatcopy = 1,
+  omatadd = 2,
+  omatcopy_batch = 3,
+  imatcopy_batch = 4,
+  omatadd_batch = 5,
+  omatcopy2 = 6,
+  reduction = 7
+};
 
 template <Level1Op op>
 std::string get_operator_name() {
@@ -170,7 +179,21 @@ std::string get_operator_name() {
 
 template <ExtensionOp op>
 std::string get_operator_name() {
-  if constexpr (op == ExtensionOp::reduction)
+  if constexpr ( op == ExtensionOp::omatcopy)
+    return "Omatcopy";
+  else if constexpr (op == ExtensionOp::imatcopy)
+    return "Imatcopy";
+  else if constexpr (op == ExtensionOp::omatadd)
+    return "Omatadd";
+  else if constexpr (op == ExtensionOp::omatcopy_batch)
+    return "Omatcopy_batch";
+  else if constexpr (op == ExtensionOp::imatcopy_batch)
+    return "Imatcopy_batch";
+  else if constexpr (op == ExtensionOp::omatadd_batch)
+    return "Omatadd_batch";
+  else if constexpr (op == ExtensionOp::omatcopy2)
+    return "Omatcopy2";
+  else if constexpr (op == ExtensionOp::reduction)
     return "Reduction";
   else
     throw std::runtime_error("Unknown BLAS extension operator");
diff --git a/common/include/common/benchmark_names.hpp b/common/include/common/benchmark_names.hpp
index 0e3d06c01..d23f140e1 100644
--- a/common/include/common/benchmark_names.hpp
+++ b/common/include/common/benchmark_names.hpp
@@ -241,6 +241,41 @@ get_name(char side, char uplo, char trans, char diag, index_t m, index_t n,
                                           stride_b_mul, mem_type);
 }
 
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == ExtensionOp::omatcopy, std::string>::type
+get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
+         index_t ldb_mul, std::string mem_type) {
+  return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
+                                          mem_type);
+}
+
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == ExtensionOp::omatcopy2, std::string>::type
+get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
+         index_t ldb_mul, index_t inc_a, index_t inc_b, std::string mem_type) {
+  return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
+                                          inc_a, inc_b, mem_type);
+}
+
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == ExtensionOp::omatadd, std::string>::type
+get_name(std::string trans_a, std::string trans_b, int m, int n, scalar_t alpha,
+         scalar_t beta, index_t lda_mul, index_t ldb_mul, index_t ldc_mul,
+         std::string mem_type) {
+  return internal::get_name<op, scalar_t>(trans_a, trans_b, m, n, alpha, beta,
+                                          lda_mul, ldb_mul, ldc_mul, mem_type);
+}
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == ExtensionOp::omatcopy_batch,
+                               std::string>::type
+get_name(std::string trans, int m, int n, scalar_t alpha, index_t lda_mul,
+         index_t ldb_mul, index_t stride_a_mul, index_t stride_b_mul,
+         index_t batch_size, std::string mem_type) {
+  return internal::get_name<op, scalar_t>(trans, m, n, alpha, lda_mul, ldb_mul,
+                                          stride_a_mul, stride_b_mul,
+                                          batch_size, mem_type);
+}
+
 template <ExtensionOp op, typename scalar_t, typename index_t>
 inline typename std::enable_if<op == ExtensionOp::reduction, std::string>::type
 get_name(index_t rows, index_t cols, std::string reduction_dim,
diff --git a/common/include/common/blas_extension_state_counters.hpp b/common/include/common/blas_extension_state_counters.hpp
index 3fe58134c..65761255b 100644
--- a/common/include/common/blas_extension_state_counters.hpp
+++ b/common/include/common/blas_extension_state_counters.hpp
@@ -26,44 +26,46 @@
 #ifndef COMMON_BLAS_EXTENSION_STATE_COUNTERS
 #define COMMON_BLAS_EXTENSION_STATE_COUNTERS
 
+#include "benchmark_identifier.hpp"
+
 namespace blas_benchmark {
 namespace utils {
 
-enum class ExtensionOP : int {
-  omatcopy = 0,
-  imatcopy = 1,
-  omatadd = 2,
-  omatcopy_batch = 3,
-  imatcopy_batch = 4,
-  omatadd_batch = 5,
-  omatcopy2 = 6
-};
-
-template <ExtensionOP op, typename scalar_t, typename index_t>
-inline typename std::enable_if<op == ExtensionOP::omatcopy ||
-                               op == ExtensionOP::imatcopy ||
-                               op == ExtensionOP::omatcopy2>::type
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<
+    op == ExtensionOp::omatcopy || op == ExtensionOp::imatcopy ||
+    op == ExtensionOp::omatcopy2 || op == ExtensionOp::omatcopy_batch ||
+    op == ExtensionOp::imatcopy_batch>::type
 init_extension_counters(benchmark::State& state, const char* trans, index_t m,
                         index_t n, index_t lda_mul, index_t ldb_mul,
-                        index_t inc_a = 1, index_t inc_b = 1) {
+                        index_t inc_a = 1, index_t inc_b = 1,
+                        index_t stride_a_mul = 1, index_t stride_b_mul = 1,
+                        index_t batch_size = 1) {
   // Google-benchmark counters are double.
   double size_d = static_cast<double>(m * n);
   state.counters["m"] = static_cast<double>(m);
   state.counters["n"] = static_cast<double>(n);
-  state.counters["n_fl_ops"] = size_d;
+  state.counters["n_fl_ops"] = size_d * batch_size;
   state.counters["lda_m"] = static_cast<double>(lda_mul);
   state.counters["ldb_m"] = static_cast<double>(ldb_mul);
   state.counters["trans"] = static_cast<double>((*trans == 't') ? 1 : 0);
-  state.counters["bytes_processed"] = (2 * size_d + 1) * sizeof(scalar_t);
-  if constexpr (op == ExtensionOP::omatcopy2) {
+  state.counters["bytes_processed"] =
+      (2 * size_d + 1) * sizeof(scalar_t) * batch_size;
+  if constexpr (op == ExtensionOp::omatcopy_batch ||
+                op == ExtensionOp::imatcopy_batch) {
+    state.counters["stride_a_mul"] = static_cast<double>(stride_a_mul);
+    state.counters["stride_b_mul"] = static_cast<double>(stride_b_mul);
+    state.counters["batch_size"] = static_cast<double>(batch_size);
+  }
+  if constexpr (op == ExtensionOp::omatcopy2) {
     state.counters["inc_a"] = static_cast<double>(inc_a);
     state.counters["inc_b"] = static_cast<double>(inc_b);
   }
   return;
 }
 
-template <ExtensionOP op, typename scalar_t, typename index_t>
-inline typename std::enable_if<op == ExtensionOP::omatadd>::type
+template <ExtensionOp op, typename scalar_t, typename index_t>
+inline typename std::enable_if<op == ExtensionOp::omatadd>::type
 init_extension_counters(benchmark::State& state, const char* t_a,
                         const char* t_b, index_t m, index_t n, index_t lda_mul,
                         index_t ldb_mul, index_t ldc_mul) {
diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp
index 3c5161c2e..f4b9b88f6 100644
--- a/common/include/common/common_utils.hpp
+++ b/common/include/common/common_utils.hpp
@@ -113,6 +113,11 @@ template <typename scalar_t>
 using omatadd_param_t = std::tuple<char, char, index_t, index_t, scalar_t,
                                    scalar_t, index_t, index_t, index_t>;
 
+template <typename scalar_t>
+using matcopy_batch_param_t =
+    std::tuple<char, index_t, index_t, scalar_t, index_t, index_t, index_t,
+               index_t, index_t>;
+
 namespace blas_benchmark {
 
 namespace utils {
@@ -1193,6 +1198,57 @@ static inline std::vector<omatadd_param_t<scalar_t>> get_omatadd_params(
   }
 }
 
+/**
+ * @fn get_matcopy_batch_params
+ * @brief Returns a vector containing the matcopy_batch benchmark parameters,
+ * either read from a file according to the command - line args, or the default
+ * ones.
+ */
+template <typename scalar_t>
+static inline std::vector<matcopy_batch_param_t<scalar_t>>
+get_matcopy_batch_params(Args& args) {
+  if (args.csv_param.empty()) {
+    warning_no_csv();
+    std::vector<matcopy_batch_param_t<scalar_t>> matcopy_batch_default;
+    constexpr index_t dmin = 256, dmax = 8192;
+    constexpr scalar_t alpha{2};
+    constexpr index_t batch_size{5};
+    constexpr index_t stride_a_mul{1};
+    constexpr index_t stride_b_mul{1};
+    for (char trans : {'n', 't'}) {
+      for (index_t m = dmin; m <= dmax; m *= 2) {
+        for (index_t n = dmin; n <= dmax; n *= 2) {
+          for (index_t lda_mul = 1; lda_mul < 2; ++lda_mul) {
+            for (index_t ldb_mul = 1; ldb_mul < 2; ++ldb_mul) {
+              matcopy_batch_default.push_back(
+                  std::make_tuple(trans, m, n, alpha, lda_mul, ldb_mul,
+                                  stride_a_mul, stride_b_mul, batch_size));
+            }
+          }
+        }
+      }
+    }
+    return matcopy_batch_default;
+  } else {
+    return parse_csv_file<matcopy_batch_param_t<scalar_t>>(
+        args.csv_param, [&](std::vector<std::string>& v) {
+          if (v.size() != 9) {
+            throw std::runtime_error(
+                "invalid number of parameters (9 expected)");
+          }
+          try {
+            return std::make_tuple(
+                v[0][0], str_to_int<index_t>(v[1]), str_to_int<index_t>(v[2]),
+                str_to_scalar<scalar_t>(v[3]), str_to_int<index_t>(v[4]),
+                str_to_int<index_t>(v[5]), str_to_int<index_t>(v[6]),
+                str_to_int<index_t>(v[7]), str_to_int<index_t>(v[8]));
+          } catch (...) {
+            throw std::runtime_error("invalid parameter");
+          }
+        });
+  }
+}
+
 /**
  * @fn get_type_name
  * @brief Returns a string with the given type. The C++ specification
diff --git a/include/interface/extension_interface.h b/include/interface/extension_interface.h
index 600fb728e..646858ade 100644
--- a/include/interface/extension_interface.h
+++ b/include/interface/extension_interface.h
@@ -61,46 +61,58 @@ namespace internal {
  */
 template <bool in_place, typename sb_handle_t, typename element_t,
           typename index_t, typename in_t, typename out_t>
-typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans,
-                                       index_t m, index_t n, element_t alpha,
-                                       in_t in_memory, index_t ld_in,
-                                       index_t inc_in, out_t out_memory,
-                                       index_t ld_out, index_t inc_out,
-                                       const typename sb_handle_t::event_t& _dependencies);
+typename sb_handle_t::event_t _matcopy(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
+    index_t ld_out, index_t inc_out,
+    const typename sb_handle_t::event_t& _dependencies);
 
 template <typename sb_handle_t, typename element_t, typename index_t,
-          typename container_0_t, typename container_1_t, typename container_2_t>
-typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
-                                       char trans_b, index_t m, index_t n,
-                                       element_t alpha, container_0_t a,
-                                       index_t lda, element_t beta,
-                                       container_1_t b, index_t ldb,
-                                       container_2_t c, index_t ldc,
-                                       const typename sb_handle_t::event_t& _dependencies);
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+typename sb_handle_t::event_t _omatadd(
+    sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n,
+    element_t alpha, container_0_t a, index_t lda, element_t beta,
+    container_1_t b, index_t ldb, container_2_t c, index_t ldc,
+    const typename sb_handle_t::event_t& _dependencies);
 
 template <bool in_place, typename element_t, typename sb_handle_t,
           typename index_t, typename in_t, typename out_t>
-typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
-                                         index_t n, in_t A, index_t ld_a,
-                                         out_t B, index_t ld_b,
-                                         const typename sb_handle_t::event_t& _dependencies);
+typename sb_handle_t::event_t _transpose(
+    sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B,
+    index_t ld_b, const typename sb_handle_t::event_t& _dependencies);
+
+template <bool in_place, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename sb_handle_t::event_t _matcopy_batch(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory,
+    index_t ld_out, index_t stride_out, index_t batch_size,
+    const typename sb_handle_t::event_t& _dependencies);
+
+template <uint32_t TileSize, int TilePerWG, typename sb_handle_t,
+          typename element_t, typename index_t, typename in_t, typename out_t>
+typename sb_handle_t::event_t _matcopy_batch_impl(
+    sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha, in_t memory,
+    index_t ld_in, index_t in_stride, out_t out_memory, index_t ld_out,
+    index_t out_stride, index_t batch_size,
+    const typename sb_handle_t::event_t& _dependencies);
 
 template <typename operator_t, typename element_t, typename sb_handle_t,
           typename input_t, typename output_t, typename index_t>
-typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle,
-                                         input_t buffer_in, index_t ld,
-                                         output_t buffer_out, index_t rows,
-                                         index_t cols,
-                                         reduction_dim_t reduction_dim,
-                                         const typename sb_handle_t::event_t& _dependencies);
+typename sb_handle_t::event_t _reduction(
+    sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
+    index_t rows, index_t cols, reduction_dim_t reduction_dim,
+    const typename sb_handle_t::event_t& _dependencies);
 
 template <int Tile_size, int wg_size, int cl_size, bool local_memory,
           typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace_impl(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies);
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies);
 
 template <bool both_trans, int Tile_size, int wg_size, int cl_size,
           bool local_memory, typename sb_handle_t, typename container_0_t,
@@ -135,15 +147,13 @@ typename sb_handle_t::event_t _transpose_add_impl(
  */
 template <typename sb_handle_t, typename element_t, typename index_t,
           typename in_t, typename out_t>
-typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans,
-                                        index_t m, index_t n, element_t alpha,
-                                        in_t in_memory, index_t ld_in,
-                                        out_t out_memory, index_t ld_out,
-                                        const typename sb_handle_t::event_t& _dependencies = {}) {
-  return internal::_matcopy<false>(sb_handle, trans, m, n, alpha, in_memory,
-                                   ld_in, static_cast<index_t>(1), out_memory,
-                                   ld_out, static_cast<index_t>(1),
-                                   _dependencies);
+typename sb_handle_t::event_t _omatcopy(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, out_t out_memory, index_t ld_out,
+    const typename sb_handle_t::event_t& _dependencies = {}) {
+  return internal::_matcopy<false>(
+      sb_handle, trans, m, n, alpha, in_memory, ld_in, static_cast<index_t>(1),
+      out_memory, ld_out, static_cast<index_t>(1), _dependencies);
 }
 
 /**
@@ -172,12 +182,11 @@ typename sb_handle_t::event_t _omatcopy(sb_handle_t& sb_handle, char trans,
  */
 template <typename sb_handle_t, typename element_t, typename index_t,
           typename in_t, typename out_t>
-typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans,
-                                         index_t m, index_t n, element_t alpha,
-                                         in_t in_memory, index_t ld_in,
-                                         index_t inc_in, out_t out_memory,
-                                         index_t ld_out, index_t inc_out,
-                                         const typename sb_handle_t::event_t& _dependencies = {}) {
+typename sb_handle_t::event_t _omatcopy2(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
+    index_t ld_out, index_t inc_out,
+    const typename sb_handle_t::event_t& _dependencies = {}) {
   return internal::_matcopy<false>(sb_handle, trans, m, n, alpha, in_memory,
                                    ld_in, inc_in, out_memory, ld_out, inc_out,
                                    _dependencies);
@@ -205,17 +214,80 @@ typename sb_handle_t::event_t _omatcopy2(sb_handle_t& sb_handle, char trans,
  * @param ldc Matrix C leading dimension
  */
 template <typename sb_handle_t, typename element_t, typename index_t,
-          typename container_0_t, typename container_1_t, typename container_2_t>
-typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
-                                       char trans_b, index_t m, index_t n,
-                                       element_t alpha, container_0_t A,
-                                       index_t lda, element_t beta,
-                                       container_1_t B, index_t ldb,
-                                       container_2_t C, index_t ldc,
-                                       const typename sb_handle_t::event_t& _dependencies = {}) {
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+typename sb_handle_t::event_t _omatadd(
+    sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n,
+    element_t alpha, container_0_t A, index_t lda, element_t beta,
+    container_1_t B, index_t ldb, container_2_t C, index_t ldc,
+    const typename sb_handle_t::event_t& _dependencies = {}) {
   return internal::_omatadd(sb_handle, trans_a, trans_b, m, n, alpha, A, lda,
                             beta, B, ldb, C, ldc, _dependencies);
 }
+/**
+ * \brief COPY batch of matrices inplace with scaling factor of alpha
+ *
+ * @tparam sb_handle_t SB_Handle type
+ * @tparam element_t Scaling factor type
+ * @tparam index_t Index type
+ * @tparam in_out_t input/output type
+ * @param sb_handle SB_Handle
+ * @param trans compute matrix transpose or not
+ * @param m rows of matrix
+ * @param n cols of matrix
+ * @param alpha Scaling factor
+ * @param memory container of input & output matrices
+ * @param ld_in leading dimension at input
+ * @param ld_out leading dimention at output
+ * @param stride stride distance between matrices inside batch
+ * @param batch_size number of matrices to compute
+ */
+template <typename sb_handle_t, typename element_t, typename index_t,
+          typename in_out_t>
+typename sb_handle_t::event_t _imatcopy_batch(sb_handle_t& sb_handle,
+                                              char trans, index_t m, index_t n,
+                                              element_t alpha, in_out_t memory,
+                                              index_t ld_in, index_t ld_out,
+                                              index_t stride,
+                                              index_t batch_size) {
+  return internal::_matcopy_batch<true>(sb_handle, trans, m, n, alpha, memory,
+                                        ld_in, stride, memory, ld_out, stride,
+                                        batch_size);
+}
+
+/**
+ * \brief COPY batch of matrices outplace from in_memory to out_memory with
+ * scaling factor of alpha
+ *
+ * @tparam sb_handle_t SB_Handle type
+ * @tparam element_t Scaling factor type
+ * @tparam index_t Index type
+ * @tparam in_t container input type
+ * @tparam out_t container output type
+ * @param sb_handle SB_Handle
+ * @param trans compute matrix transpose or not
+ * @param m rows of matrix
+ * @param n cols of matrix
+ * @param alpha Scaling factor
+ * @param in_memory input matrix container
+ * @param ld_in leading dimension of input
+ * @param stride_in stride distance between matrices inside batch
+ * @param out_memory output matrix container
+ * @param ld_out leading dimention of output
+ * @param stride_out stride distance between matrices inside batch
+ * @param batch_size number of matrices to compute
+ */
+template <typename sb_handle_t, typename element_t, typename index_t,
+          typename in_t, typename out_t>
+typename sb_handle_t::event_t _omatcopy_batch(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory,
+    index_t ld_out, index_t stride_out, index_t batch_size,
+    const typename sb_handle_t::event_t& _dependencies = {}) {
+  return internal::_matcopy_batch<false>(
+      sb_handle, trans, m, n, alpha, in_memory, ld_in, stride_in, out_memory,
+      ld_out, stride_out, batch_size, _dependencies);
+}
 
 namespace extension {
 /**
@@ -238,10 +310,9 @@ namespace extension {
  */
 template <typename element_t, typename sb_handle_t, typename index_t,
           typename in_t, typename out_t>
-typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
-                                         index_t n, in_t A, index_t ld_in,
-                                         index_t ld_out,
-                                         const typename sb_handle_t::event_t& _dependencies = {}) {
+typename sb_handle_t::event_t _transpose(
+    sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_in,
+    index_t ld_out, const typename sb_handle_t::event_t& _dependencies = {}) {
   return blas::internal::_transpose<true, element_t>(sb_handle, m, n, A, ld_in,
                                                      A, ld_out, _dependencies);
 }
@@ -267,24 +338,22 @@ typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
  */
 template <typename element_t, typename sb_handle_t, typename index_t,
           typename in_t, typename out_t>
-typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
-                                         index_t n, in_t A, index_t ld_a,
-                                         out_t B, index_t ld_b,
-                                         const typename sb_handle_t::event_t& _dependencies = {}) {
+typename sb_handle_t::event_t _transpose(
+    sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B,
+    index_t ld_b, const typename sb_handle_t::event_t& _dependencies = {}) {
   return blas::internal::_transpose<false, element_t>(sb_handle, m, n, A, ld_a,
                                                       B, ld_b, _dependencies);
 }
 
 template <typename operator_t, typename element_t, typename sb_handle_t,
           typename input_t, typename output_t, typename index_t>
-typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle,
-                                         input_t buffer_in, index_t ld,
-                                         output_t buffer_out, index_t rows,
-                                         index_t cols,
-                                         reduction_dim_t reduction_dim,
-                                         const typename sb_handle_t::event_t& _dependencies = {}) {
+typename sb_handle_t::event_t _reduction(
+    sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
+    index_t rows, index_t cols, reduction_dim_t reduction_dim,
+    const typename sb_handle_t::event_t& _dependencies = {}) {
   return blas::internal::_reduction<operator_t, element_t>(
-      sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim, _dependencies);
+      sb_handle, buffer_in, ld, buffer_out, rows, cols, reduction_dim,
+      _dependencies);
 }
 
 }  // namespace extension
diff --git a/include/operations/extension/matcopy_batch.h b/include/operations/extension/matcopy_batch.h
new file mode 100644
index 000000000..b09fcc3ca
--- /dev/null
+++ b/include/operations/extension/matcopy_batch.h
@@ -0,0 +1,72 @@
+/***************************************************************************
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCL-BLAS: BLAS implementation using SYCL
+ *
+ *  @filename Matcopy_batch.h
+ *
+ **************************************************************************/
+
+#ifndef PORTBLAS_EXTENSION_MATCOPY_BATCH_H
+#define PORTBLAS_EXTENSION_MATCOPY_BATCH_H
+
+namespace blas {
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+struct Matcopy_batch {
+ public:
+  using value_t = typename lhs_t::value_t;
+  using index_t = typename rhs_t::index_t;
+
+  lhs_t lhs_;
+  rhs_t rhs_1_;
+  rhs_t rhs_2_;
+  value_t alpha_, beta_;
+  index_t m_, n_, lhs_ld_, rhs_1_ld_, rhs_2_ld_, lhs_stride_, rhs_1_stride_,
+      rhs_2_stride_, batch_size_;
+
+  Matcopy_batch(lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, value_t alpha,
+                value_t beta, index_t m, index_t n, index_t lhs_ld,
+                index_t rhs_ld, index_t rhs_2_ld, index_t lhs_stride,
+                index_t rhs_stride, index_t rhs_2_stride, index_t batch_size);
+  index_t get_size() const;
+  bool valid_thread(cl::sycl::nd_item<1> ndItem) const;
+  value_t eval(index_t i);
+  value_t eval(cl::sycl::nd_item<1> ndItem);
+  template <typename sharedT>
+  value_t eval(sharedT shMem, cl::sycl::nd_item<1> ndItem);
+  void bind(cl::sycl::handler &h);
+  void adjust_access_displacement();
+};
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t> make_matcopy_batch(
+    lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, typename rhs_t::value_t alpha,
+    typename rhs_t::value_t beta, typename rhs_t::index_t m,
+    typename rhs_t::index_t n, typename rhs_t::index_t lhs_ld,
+    typename rhs_t::index_t rhs_ld, typename rhs_t::index_t rhs_2_ld,
+    typename rhs_t::index_t lhs_stride, typename rhs_t::index_t rhs_stride,
+    typename rhs_t::index_t rhs_2_stride, typename rhs_t::index_t batch_size) {
+  return Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>(
+      lhs, rhs_1, rhs_2, alpha, beta, m, n, lhs_ld, rhs_ld, rhs_2_ld,
+      lhs_stride, rhs_stride, rhs_2_stride, batch_size);
+}
+
+}  // namespace blas
+
+#endif  // PORTBLAS_EXTENSION_MATCOPY_BATCH_H
diff --git a/include/operations/extension/transpose.h b/include/operations/extension/transpose.h
index 471eb6814..c29d85f9e 100644
--- a/include/operations/extension/transpose.h
+++ b/include/operations/extension/transpose.h
@@ -67,18 +67,26 @@ class Transpose {
   // Increment value (denoted stride in oneMKL specification)
   index_t inc_a_;
   index_t inc_at_;
+  // Stride values (denoted stride in oneMKL specification for batched
+  // operations)
+  index_t stride_a_;
+  index_t stride_at_;
   // Minimum number of tiles used to cover matrices rows & columns
   index_t tile_count_m_;
   index_t tile_count_n_;
   // Total number of tiles used to cover the matrix
   index_t tile_count_total_;
-  // Inner WG Tiles
+  // Number of Inner WG Tiles
   static constexpr const index_t inner_tile_size_ = wg_size / Tile_size;
   static constexpr const index_t inner_tile_count_ =
       Tile_size / inner_tile_size_;
   // Minimum number of Tile-mutliple rows & columns to cover the matrices
   index_t M_pad_;
   index_t N_pad_;
+  // Total size of Tile-mutliple covering matrix
+  index_t size_pad_;
+  // Batch size when using batched transpose
+  index_t batch_size_;
   // Number of contiguous elements to be used in local memory to avoid bank
   // conflicts
   static constexpr index_t get_non_bank_conflict_line_size() {
@@ -90,7 +98,9 @@ class Transpose {
     return get_non_bank_conflict_line_size() / Tile_size;
   }
 
-  Transpose(in_t &A, index_t &inc_a, out_t &At, index_t &inc_at, value_t &alpha)
+  Transpose(in_t &A, index_t &inc_a, index_t &stride_a, out_t &At,
+            index_t &inc_at, index_t &stride_at, value_t &alpha,
+            index_t &batch_size)
       : A_(A),
         At_(At),
         lda_(A_.getSizeL()),
@@ -102,9 +112,13 @@ class Transpose {
         tile_count_n_((N_ - 1) / Tile_size + 1),
         tile_count_total_(tile_count_m_ * tile_count_n_),
         inc_a_(inc_a),
+        stride_a_(stride_a),
+        stride_at_(stride_at),
         inc_at_(inc_at),
         M_pad_(tile_count_m_ * Tile_size),
-        N_pad_(tile_count_n_ * Tile_size) {}
+        N_pad_(tile_count_n_ * Tile_size),
+        size_pad_(M_pad_ * N_pad_),
+        batch_size_(batch_size) {}
 
   index_t get_size() const;
 
@@ -119,7 +133,7 @@ class Transpose {
                    index_t &out_local_idx, index_t &i_block_start,
                    index_t &j_block_start, index_t &il, index_t &jl);
   void get_indices(cl::sycl::nd_item<1> id, index_t &in_idx, index_t &out_idx,
-                   index_t &il, index_t &jl);
+                   index_t &i, index_t &j);
 };
 
 /*!
@@ -130,10 +144,12 @@ template <bool in_place, int Tile_size, int wg_size, int cl_size,
           typename index_t>
 Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
           element_t>
-make_transpose(in_t &A, index_t inc_a, out_t &At, index_t inc_a_t,
-               element_t &alpha) {
+make_transpose(in_t &A, index_t inc_a, index_t &stride_a, out_t &At,
+               index_t inc_a_t, index_t &stride_at, element_t &alpha,
+               index_t &batch_size) {
   return Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t,
-                   out_t, element_t>(A, inc_a, At, inc_a_t, alpha);
+                   out_t, element_t>(A, inc_a, stride_a, At, inc_a_t, stride_at,
+                                     alpha, batch_size);
 }
 
 /*!
diff --git a/include/portblas.h b/include/portblas.h
index e289c55ce..1c212166a 100644
--- a/include/portblas.h
+++ b/include/portblas.h
@@ -53,6 +53,8 @@
 
 #include "operations/extension/transpose.h"
 
+#include "operations/extension/matcopy_batch.h"
+
 #include "operations/blas_constants.h"
 
 #include "operations/blas_operators.h"
diff --git a/src/interface/extension/CMakeLists.txt b/src/interface/extension/CMakeLists.txt
index a1269f10d..6e9ff9fff 100644
--- a/src/interface/extension/CMakeLists.txt
+++ b/src/interface/extension/CMakeLists.txt
@@ -26,4 +26,6 @@
 generate_blas_objects(extension matcopy)
 generate_blas_objects(extension transpose)
 generate_blas_objects(extension omatadd)
+generate_blas_objects(extension matcopy_batch)
+
 generate_blas_reduction_objects(extension reduction)
diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp
index 8ced784df..fed86006e 100644
--- a/src/interface/extension/backend/amd_gpu.hpp
+++ b/src/interface/extension/backend/amd_gpu.hpp
@@ -34,17 +34,18 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) {
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
   if (_M * _N > (1 << 18)) {
     return blas::internal::_transpose_outplace_impl<16, 256, 64, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
 
   } else {
     return blas::internal::_transpose_outplace_impl<16, 64, 64, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   }
 }
 
@@ -70,6 +71,36 @@ typename sb_handle_t::event_t _transpose_add(
 
 }  // namespace backend
 }  // namespace transpose
+
+namespace matcopy_batch {
+namespace backend {
+template <bool trans, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
+_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
+               in_t in_memory, index_t ld_in, index_t in_stride,
+               out_t out_memory, index_t ld_out, index_t out_stride,
+               index_t batch_size,
+               const typename sb_handle_t::event_t& _dependencies) {
+  if ((m * n) > (1 << 20)) {
+    return blas::internal::_matcopy_batch_impl<32, 4, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else if ((m * n) > (1 << 14)) {
+    return blas::internal::_matcopy_batch_impl<8, 16, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else {
+    return blas::internal::_matcopy_batch_impl<4, 64, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace matcopy_batch
 }  // namespace blas
 
 #endif
diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp
index f73e9ec1f..936a1f651 100644
--- a/src/interface/extension/backend/default_cpu.hpp
+++ b/src/interface/extension/backend/default_cpu.hpp
@@ -34,16 +34,17 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) {
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
   if (_M * _N < (1 << 20)) {
     return blas::internal::_transpose_outplace_impl<16, 64, 64, false>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   } else {
     return blas::internal::_transpose_outplace_impl<32, 128, 64, false>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   }
 }
 
@@ -69,6 +70,24 @@ typename sb_handle_t::event_t _transpose_add(
 
 }  // namespace backend
 }  // namespace transpose
+
+namespace matcopy_batch {
+namespace backend {
+template <bool trans, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
+_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
+               in_t in_memory, index_t ld_in, index_t in_stride,
+               out_t out_memory, index_t ld_out, index_t out_stride,
+               index_t batch_size,
+               const typename sb_handle_t::event_t& _dependencies) {
+  return blas::internal::_matcopy_batch_impl<1, 8, sb_handle_t, element_t,
+                                             index_t, in_t, out_t>(
+      sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+      out_stride, batch_size, _dependencies);
+}
+}  // namespace backend
+}  // namespace matcopy_batch
 }  // namespace blas
 
 #endif
diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp
index f17049614..bdf6ddaf6 100644
--- a/src/interface/extension/backend/intel_gpu.hpp
+++ b/src/interface/extension/backend/intel_gpu.hpp
@@ -34,17 +34,18 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) {
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
   if (_M * _N > (1 << 18)) {
     return blas::internal::_transpose_outplace_impl<32, 256, 128, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
 
   } else {
     return blas::internal::_transpose_outplace_impl<16, 64, 64, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   }
 }
 
@@ -70,6 +71,36 @@ typename sb_handle_t::event_t _transpose_add(
 
 }  // namespace backend
 }  // namespace transpose
+
+namespace matcopy_batch {
+namespace backend {
+template <bool trans, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
+_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
+               in_t in_memory, index_t ld_in, index_t in_stride,
+               out_t out_memory, index_t ld_out, index_t out_stride,
+               index_t batch_size,
+               const typename sb_handle_t::event_t& _dependencies) {
+  if ((m * n) >= (1 << 18)) {
+    return blas::internal::_matcopy_batch_impl<16, 4, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else if ((m * n) >= (1 << 14)) {
+    return blas::internal::_matcopy_batch_impl<4, 16, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else {
+    return blas::internal::_matcopy_batch_impl<1, 256, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace matcopy_batch
 }  // namespace blas
 
 #endif
diff --git a/src/interface/extension/backend/nvidia_gpu.hpp b/src/interface/extension/backend/nvidia_gpu.hpp
index 994c48964..ce4307f3b 100644
--- a/src/interface/extension/backend/nvidia_gpu.hpp
+++ b/src/interface/extension/backend/nvidia_gpu.hpp
@@ -34,17 +34,18 @@ template <typename sb_handle_t, typename container_0_t, typename container_1_t,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) {
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
   if (_M * _N > (1 << 18)) {
     return blas::internal::_transpose_outplace_impl<32, 512, 128, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
 
   } else {
     return blas::internal::_transpose_outplace_impl<32, 128, 128, true>(
-        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, out_, _ld_out,
-        _inc_out, _dependencies);
+        sb_handle, _M, _N, _alpha, in_, _ld_in, _inc_in, _stride_in, out_,
+        _ld_out, _inc_out, _stride_out, _batch_size, _dependencies);
   }
 }
 
@@ -70,6 +71,34 @@ typename sb_handle_t::event_t _transpose_add(
 
 }  // namespace backend
 }  // namespace transpose
-}  // namespace blas
 
+namespace matcopy_batch {
+namespace backend {
+template <bool trans, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
+_matcopy_batch(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
+               in_t in_memory, index_t ld_in, index_t in_stride,
+               out_t out_memory, index_t ld_out, index_t out_stride,
+               index_t batch_size, const typename  sb_handle_t::event_t& _dependencies) {
+  if ((m * n) > (1 << 20)) {
+    return blas::internal::_matcopy_batch_impl<32, 4, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else if ((m * n) > (1 << 14)) {
+    return blas::internal::_matcopy_batch_impl<8, 16, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  } else {
+    return blas::internal::_matcopy_batch_impl<4, 64, sb_handle_t, element_t,
+                                               index_t, in_t, out_t>(
+        sb_handle, m, n, alpha, in_memory, ld_in, in_stride, out_memory, ld_out,
+        out_stride, batch_size, _dependencies);
+  }
+}
+}  // namespace backend
+}  // namespace matcopy_batch
+}  // namespace blas
 #endif
diff --git a/src/interface/extension/matcopy_batch.cpp.in b/src/interface/extension/matcopy_batch.cpp.in
new file mode 100644
index 000000000..7fd441947
--- /dev/null
+++ b/src/interface/extension/matcopy_batch.cpp.in
@@ -0,0 +1,87 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCL-BLAS: BLAS implementation using SYCL
+ *
+ *  @filename matcopy.cpp.in
+ *
+ **************************************************************************/
+
+#include "interface/extension_interface.hpp"
+#include "sb_handle/kernel_constructor.hpp"
+#include "sb_handle/portblas_handle.hpp"
+#include "operations/extension/matcopy_batch.hpp"
+#include "operations/extension/transpose.hpp" 
+
+namespace blas {
+namespace internal {
+
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<true>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, BufferIterator<${DATA_TYPE}>  in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, BufferIterator<${DATA_TYPE}> out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+
+
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<false>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, BufferIterator<${DATA_TYPE}> in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, BufferIterator<${DATA_TYPE}> out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+
+#ifdef SB_ENABLE_USM
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<true>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, const ${DATA_TYPE}*  in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<true>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, ${DATA_TYPE}*  in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<false>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, const ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+
+// This function represents both imatcopy and omatcopy operators
+template typename SB_Handle::event_t _matcopy_batch<false>(
+    SB_Handle& sb_handle, char trans, ${INDEX_TYPE} m, ${INDEX_TYPE} n,
+    ${DATA_TYPE} alpha, ${DATA_TYPE}* in_memory, ${INDEX_TYPE} ld_in,
+    ${INDEX_TYPE} in_stride, ${DATA_TYPE}* out_memory, ${INDEX_TYPE} ld_out,
+    ${INDEX_TYPE} out_stride, ${INDEX_TYPE} batch_size,
+    const typename SB_Handle::event_t& _dependencies);
+#endif
+
+}  // namespace internal
+}  // namespace blas
diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp
index c9c63a0c0..d90966099 100644
--- a/src/interface/extension_interface.hpp
+++ b/src/interface/extension_interface.hpp
@@ -31,10 +31,11 @@
 #include "interface/extension_interface.h"
 #include "operations/blas1_trees.h"
 #include "operations/blas_operators.hpp"
+#include "operations/extension/matcopy_batch.h"
 #include "operations/extension/reduction.h"
 #include "operations/extension/transpose.h"
-#include "sb_handle/portblas_handle.h"
 #include "portblas_helper.h"
+#include "sb_handle/portblas_handle.h"
 #include "views/view.h"
 
 namespace blas {
@@ -58,25 +59,26 @@ template <int Tile_size, int wg_size, int cl_size, bool local_memory,
           typename element_t, typename index_t>
 typename sb_handle_t::event_t _transpose_outplace_impl(
     sb_handle_t& sb_handle, index_t _M, index_t _N, element_t _alpha,
-    container_0_t in_, index_t _ld_in, index_t _inc_in, container_1_t out_,
-    index_t _ld_out, index_t _inc_out, const typename sb_handle_t::event_t& _dependencies) {
+    container_0_t in_, index_t _ld_in, index_t _inc_in, index_t _stride_in,
+    container_1_t out_, index_t _ld_out, index_t _inc_out, index_t _stride_out,
+    index_t _batch_size, const typename sb_handle_t::event_t& _dependencies) {
   constexpr const index_t num_line_elems =
       std::max(Tile_size, static_cast<int>(cl_size / sizeof(element_t)));
   constexpr const index_t num_tiles_per_line = num_line_elems / Tile_size;
 
   // Matrix Views
   auto in_view = make_matrix_view<col_major>(in_, _M, _N, _ld_in);
-  auto out_view =
-      make_matrix_view<col_major>(out_, _M, _N, _ld_out);
+  auto out_view = make_matrix_view<col_major>(out_, _M, _N, _ld_out);
 
   // Work items & groups sizes
   index_t n_wg = ((_M - 1) / Tile_size + 1) * ((_N - 1) / Tile_size + 1);
-  index_t global_size = n_wg * wg_size;
+  index_t global_size = n_wg * wg_size * _batch_size;
 
   // Transpose expression Tree
   auto trans_scale_tree =
       make_transpose<false, Tile_size, wg_size, cl_size, local_memory>(
-          in_view, _inc_in, out_view, _inc_out, _alpha);
+          in_view, _inc_in, _stride_in, out_view, _inc_out, _stride_out, _alpha,
+          _batch_size);
 
   if constexpr (local_memory) {
     index_t local_mem = static_cast<index_t>((num_line_elems + 1) * Tile_size /
@@ -96,19 +98,20 @@ template <bool in_place, bool trans, typename sb_handle_t, typename element_t,
           typename index_t, typename in_t, typename out_t>
 typename std::enable_if<trans, typename sb_handle_t::event_t>::type
 _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
-              in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
-              index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) {
+              in_t in_memory, index_t ld_in, index_t inc_in, index_t stride_in,
+              out_t out_memory, index_t ld_out, index_t inc_out,
+              index_t stride_out, index_t batch_size,
+              const typename sb_handle_t::event_t& _dependencies) {
   if constexpr (!in_place) {
     return blas::transpose::backend::_transpose_outplace<
         sb_handle_t, in_t, out_t, element_t, index_t>(
-        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out,
-        inc_out, _dependencies);
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride_in, out_memory,
+        ld_out, inc_out, stride_out, batch_size, _dependencies);
 
   } else {
     // TODO
     // In-place transpose not implemented.
-    typename sb_handle_t::event_t ret;
-    return ret;
+    throw std::runtime_error("In-place transpose not implemented.");
   }
 }
 
@@ -121,12 +124,15 @@ template <bool in_place, bool trans, bool in_has_inc, bool out_has_inc,
 typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
 _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
               in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
-              index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) {
+              index_t ld_out, index_t inc_out,
+              const typename sb_handle_t::event_t& _dependencies) {
   typename sb_handle_t::event_t ret;
   typename MatrixViewType<in_t, index_t, col_major, in_has_inc>::type in_view =
-      make_matrix_view<col_major, element_t, index_t, in_has_inc>(in_memory, m, n, ld_in, inc_in);
-  typename MatrixViewType<out_t, index_t, col_major, out_has_inc>::type out_view =
-      make_matrix_view<col_major, element_t, index_t, out_has_inc>(out_memory, m, n, ld_out, inc_out);
+      make_matrix_view<col_major, element_t, index_t, in_has_inc>(
+          in_memory, m, n, ld_in, inc_in);
+  typename MatrixViewType<out_t, index_t, col_major, out_has_inc>::type
+      out_view = make_matrix_view<col_major, element_t, index_t, out_has_inc>(
+          out_memory, m, n, ld_out, inc_out);
   // if alpha=1 no need to multiply
   if (alpha == 1) {
     auto copy_op = make_op<Assign>(out_view, in_view);
@@ -143,31 +149,58 @@ template <bool in_place, bool trans, typename sb_handle_t, typename element_t,
           typename index_t, typename in_t, typename out_t>
 typename std::enable_if<!trans, typename sb_handle_t::event_t>::type
 _matcopy_impl(sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
-              in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
-              index_t ld_out, index_t inc_out, const typename sb_handle_t::event_t& _dependencies) {
+              in_t in_memory, index_t ld_in, index_t inc_in, index_t stride_in,
+              out_t out_memory, index_t ld_out, index_t inc_out,
+              index_t stride_out, index_t batch_size,
+              const typename sb_handle_t::event_t& _dependencies) {
+  // if alpha=1 no need to multiply
   if (inc_in == 1 && inc_out == 1) {
-    return _matcopy_impl<in_place, trans, false, false>(sb_handle, m, n, alpha,
-                                                 in_memory, ld_in, inc_in,
-                                                 out_memory, ld_out, inc_out,
-                                                 _dependencies);
+    return _matcopy_impl<in_place, trans, false, false>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out,
+        inc_out, _dependencies);
   } else if (inc_in == 1) {
-    return _matcopy_impl<in_place, trans, false, true>(sb_handle, m, n, alpha,
-                                                        in_memory, ld_in, inc_in,
-                                                        out_memory, ld_out, inc_out,
-                                                        _dependencies);
+    return _matcopy_impl<in_place, trans, false, true>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out,
+        inc_out, _dependencies);
   } else if (inc_out == 1) {
-    return _matcopy_impl<in_place, trans, true, false>(sb_handle, m, n, alpha,
-                                                       in_memory, ld_in, inc_in,
-                                                       out_memory, ld_out, inc_out,
-                                                       _dependencies);
+    return _matcopy_impl<in_place, trans, true, false>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out,
+        inc_out, _dependencies);
   } else {
-    return _matcopy_impl<in_place, trans, true, true>(sb_handle, m, n, alpha,
-                                                       in_memory, ld_in, inc_in,
-                                                       out_memory, ld_out, inc_out,
-                                                       _dependencies);
+    return _matcopy_impl<in_place, trans, true, true>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, out_memory, ld_out,
+        inc_out, _dependencies);
   }
 }
 
+/**
+ * @brief Implementation of matrix copy batch operators for non transpose cases.
+ */
+template <uint32_t TileSize, int TilePerWG, typename sb_handle_t,
+          typename element_t, typename index_t, typename in_t, typename out_t>
+typename sb_handle_t::event_t _matcopy_batch_impl(
+    sb_handle_t& sb_handle, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t in_stride, out_t out_memory,
+    index_t ld_out, index_t out_stride, index_t batch_size,
+    const typename sb_handle_t::event_t& _dependencies) {
+  typename MatrixViewType<in_t, index_t, col_major>::type in_view =
+      make_matrix_view<col_major>(in_memory, m, n, ld_in);
+  auto out_view = make_matrix_view<col_major>(out_memory, m, n, ld_out);
+  const element_t beta = 0;
+  const index_t ld_b = 0;
+  const index_t stride_b = 0;
+  auto copy_batch_tree = make_matcopy_batch<TileSize, TilePerWG>(
+      out_view, in_view, in_view, alpha, beta, m, n, ld_out, ld_in, ld_b,
+      out_stride, in_stride, stride_b, batch_size);
+  constexpr index_t local_size = TileSize * TilePerWG;
+  const index_t tile_per_matrix =
+      (((m - 1) / TileSize) + 1) * (((n - 1) / TileSize) + 1);
+  const index_t wg_size = (tile_per_matrix - 1) / TilePerWG + 1;
+  const index_t global_size = (wg_size)*local_size * batch_size;
+  return sb_handle.execute(copy_batch_tree, local_size, global_size,
+                           _dependencies);
+}
+
 /*!
  * @brief Wrapper around Transpose-Add. Creates the views, then makes and
  * launches Transpose Add kernel
@@ -280,7 +313,8 @@ template <typename operator_t, reduction_dim_t reduction_dim,
           typename output_t, typename index_t>
 typename sb_handle_t::event_t launch_type_based_reduction(
     sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
-    index_t rows, index_t cols, const typename SB_Handle::event_t& dependencies) {
+    index_t rows, index_t cols,
+    const typename SB_Handle::event_t& dependencies) {
 #ifdef POWER_VR
   constexpr int ClSize = 32;
   constexpr int WgSize = 64;
@@ -327,21 +361,21 @@ typename sb_handle_t::event_t launch_type_based_reduction(
     /* 1st step */
     auto reduction =
         blas::make_reduction<operator_t, params_t>(matrix_buffer_in, temp_);
-    reduction_event =
-        concatenate_vectors(reduction_event, sb_handle.execute(reduction, dependencies));
+    reduction_event = concatenate_vectors(
+        reduction_event, sb_handle.execute(reduction, dependencies));
 
     /* 2nd step */
     auto reduction_step_2 =
         blas::make_reduction<typename get_second_step_op<operator_t>::type,
                              params_t>(temp_, matrix_buffer_out);
-    reduction_event = concatenate_vectors(reduction_event,
-                                          sb_handle.execute(reduction_step_2, reduction_event));
+    reduction_event = concatenate_vectors(
+        reduction_event, sb_handle.execute(reduction_step_2, reduction_event));
   } else {
     /* 1-step reduction */
     auto reduction = blas::make_reduction<operator_t, params_t>(
         matrix_buffer_in, matrix_buffer_out);
-    reduction_event =
-        concatenate_vectors(reduction_event, sb_handle.execute(reduction, dependencies));
+    reduction_event = concatenate_vectors(
+        reduction_event, sb_handle.execute(reduction, dependencies));
   }
 
   return reduction_event;
@@ -349,39 +383,68 @@ typename sb_handle_t::event_t launch_type_based_reduction(
 
 template <bool in_place, typename sb_handle_t, typename element_t,
           typename index_t, typename in_t, typename out_t>
-typename sb_handle_t::event_t _matcopy(sb_handle_t& sb_handle, char trans,
-                                       index_t m, index_t n, element_t alpha,
-                                       in_t in_memory, index_t ld_in,
-                                       index_t inc_in, out_t out_memory,
-                                       index_t ld_out, index_t inc_out,
-                                       const typename sb_handle_t::event_t& _dependencies) {
+typename sb_handle_t::event_t _matcopy(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t inc_in, out_t out_memory,
+    index_t ld_out, index_t inc_out,
+    const typename sb_handle_t::event_t& _dependencies) {
   // bail out early if the leading dimensions are not correct
   if (ld_in < (inc_in * (m - 1) + 1) ||
       (ld_out - 1) < (trans == 't' ? inc_out * (n - 1) : inc_out * (m - 1))) {
-    typename sb_handle_t::event_t ret;
-    return ret;
+    throw std::invalid_argument("invalid ld_in and/or ld_out, inc_out, inc_in");
+  }
+
+  const index_t stride = 1;
+  const index_t batch_size = 1;
+
+  if (trans == 't') {
+    return _matcopy_impl<in_place, true>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride, out_memory,
+        ld_out, inc_out, stride, index_t(1), _dependencies);
+  } else {
+    return _matcopy_impl<in_place, false>(
+        sb_handle, m, n, alpha, in_memory, ld_in, inc_in, stride, out_memory,
+        ld_out, inc_out, stride, batch_size, _dependencies);
   }
+}
+
+template <bool in_place, typename sb_handle_t, typename element_t,
+          typename index_t, typename in_t, typename out_t>
+typename sb_handle_t::event_t _matcopy_batch(
+    sb_handle_t& sb_handle, char trans, index_t m, index_t n, element_t alpha,
+    in_t in_memory, index_t ld_in, index_t stride_in, out_t out_memory,
+    index_t ld_out, index_t stride_out, index_t batch_size,
+    const typename sb_handle_t::event_t& _dependencies) {
+  // bail out early if the leading dimensions / strides are not correct
+  if (ld_in < m || (ld_out < (trans == 't' ? n : m))) {
+    throw std::invalid_argument("invalid ld_in and/or ld_out");
+  }
+  if ((stride_in < ld_in * n) ||
+      (stride_out < (ld_out * (trans == 't' ? m : n)))) {
+    throw std::invalid_argument("invalid stride_in and/or stride_out");
+  }
+
+  const index_t increment = 1;
 
   if (trans == 't') {
-    return _matcopy_impl<in_place, true>(sb_handle, m, n, alpha, in_memory,
-                                         ld_in, inc_in, out_memory, ld_out,
-                                         inc_out, _dependencies);
+    return _matcopy_impl<in_place, true>(
+        sb_handle, m, n, alpha, in_memory, ld_in, increment, stride_in,
+        out_memory, ld_out, increment, stride_out, batch_size, _dependencies);
   } else {
-    return _matcopy_impl<in_place, false>(sb_handle, m, n, alpha, in_memory,
-                                          ld_in, inc_in, out_memory, ld_out,
-                                          inc_out, _dependencies);
+    return blas::matcopy_batch::backend::_matcopy_batch<false>(
+        sb_handle, m, n, alpha, in_memory, ld_in, stride_in, out_memory, ld_out,
+        stride_out, batch_size, _dependencies);
   }
 }
 
 template <typename sb_handle_t, typename element_t, typename index_t,
-          typename container_0_t, typename container_1_t, typename container_2_t>
-typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
-                                       char trans_b, index_t m, index_t n,
-                                       element_t alpha, container_0_t a,
-                                       index_t lda, element_t beta,
-                                       container_1_t b, index_t ldb,
-                                       container_2_t c, index_t ldc,
-                                       const typename sb_handle_t::event_t& _dependencies) {
+          typename container_0_t, typename container_1_t,
+          typename container_2_t>
+typename sb_handle_t::event_t _omatadd(
+    sb_handle_t& sb_handle, char trans_a, char trans_b, index_t m, index_t n,
+    element_t alpha, container_0_t a, index_t lda, element_t beta,
+    container_1_t b, index_t ldb, container_2_t c, index_t ldc,
+    const typename sb_handle_t::event_t& _dependencies) {
   if (trans_a == 't') {
     if (trans_b == 't') {
       return _omatadd_impl<true, true>(sb_handle, m, n, alpha, a, lda, beta, b,
@@ -405,41 +468,39 @@ typename sb_handle_t::event_t _omatadd(sb_handle_t& sb_handle, char trans_a,
 
 template <bool in_place, typename element_t, typename sb_handle_t,
           typename index_t, typename in_t, typename out_t>
-typename sb_handle_t::event_t _transpose(sb_handle_t& sb_handle, index_t m,
-                                         index_t n, in_t A, index_t ld_a,
-                                         out_t B, index_t ld_b,
-                                         const typename sb_handle_t::event_t& _dependencies) {
+typename sb_handle_t::event_t _transpose(
+    sb_handle_t& sb_handle, index_t m, index_t n, in_t A, index_t ld_a, out_t B,
+    index_t ld_b, const typename sb_handle_t::event_t& _dependencies) {
   // bail out early if the leading dimensions are not correct
   if (ld_a < m || ld_b < n) {
     typename sb_handle_t::event_t ret;
     return ret;
   }
 
+  const element_t alpha = 1;
   const index_t inc = 1;
-  const element_t alpha = element_t(1);
+  const index_t stride = 1;
+  const index_t batch_size = 1;
 
-  return _matcopy_impl<in_place, true>(sb_handle, m, n, alpha, A, ld_a, inc, B,
-                                       ld_b, inc, _dependencies);
+  return _matcopy_impl<in_place, true>(sb_handle, m, n, (float)1.0, A, ld_a,
+                                       inc, stride, B, ld_b, inc, stride,
+                                       batch_size, _dependencies);
 }
 
 template <typename operator_t, typename element_t, typename sb_handle_t,
           typename input_t, typename output_t, typename index_t>
-typename sb_handle_t::event_t _reduction(sb_handle_t& sb_handle,
-                                         input_t buffer_in, index_t ld,
-                                         output_t buffer_out, index_t rows,
-                                         index_t cols,
-                                         reduction_dim_t reduction_dim,
-                                         const typename sb_handle_t::event_t& dependencies) {
+typename sb_handle_t::event_t _reduction(
+    sb_handle_t& sb_handle, input_t buffer_in, index_t ld, output_t buffer_out,
+    index_t rows, index_t cols, reduction_dim_t reduction_dim,
+    const typename sb_handle_t::event_t& dependencies) {
   if (reduction_dim == reduction_dim_t::inner) {
     return launch_type_based_reduction<operator_t, reduction_dim_t::inner,
-                                       element_t>(sb_handle, buffer_in, ld,
-                                                  buffer_out, rows, cols,
-                                                  dependencies);
+                                       element_t>(
+        sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies);
   } else {  // reduction_dim_t::outer
     return launch_type_based_reduction<operator_t, reduction_dim_t::outer,
-                                       element_t>(sb_handle, buffer_in, ld,
-                                                  buffer_out, rows, cols,
-                                                  dependencies);
+                                       element_t>(
+        sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies);
   }
 }
 
diff --git a/src/operations/extension/matcopy_batch.hpp b/src/operations/extension/matcopy_batch.hpp
new file mode 100644
index 000000000..2d13d332f
--- /dev/null
+++ b/src/operations/extension/matcopy_batch.hpp
@@ -0,0 +1,171 @@
+/***************************************************************************
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCL-BLAS: BLAS implementation using SYCL
+ *
+ *  @filename matcopy_batch.hpp
+ *
+ **************************************************************************/
+
+#ifndef PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP
+#define PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP
+
+#include "operations/extension/matcopy_batch.h"
+
+namespace blas {
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::Matcopy_batch(
+    lhs_t lhs, rhs_t rhs_1, rhs_t rhs_2, typename lhs_t::value_t alpha,
+    typename lhs_t::value_t beta, typename rhs_t::index_t m,
+    typename rhs_t::index_t n, typename rhs_t::index_t lhs_ld,
+    typename rhs_t::index_t rhs_ld, typename rhs_t::index_t rhs_2_ld,
+    typename rhs_t::index_t lhs_stride, typename rhs_t::index_t rhs_stride,
+    typename rhs_t::index_t rhs_2_stride, typename rhs_t::index_t batch_size)
+    : lhs_(lhs),
+      rhs_1_(rhs_1),
+      rhs_2_(rhs_2),
+      alpha_(alpha),
+      beta_(beta),
+      m_(m),
+      n_(n),
+      lhs_ld_(lhs_ld),
+      rhs_1_ld_(rhs_ld),
+      rhs_2_ld_(rhs_2_ld),
+      lhs_stride_(lhs_stride),
+      rhs_1_stride_(rhs_stride),
+      rhs_2_stride_(rhs_2_stride),
+      batch_size_(batch_size) {}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+typename lhs_t::value_t Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::eval(
+    index_t i) {}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+typename lhs_t::value_t Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::eval(
+    cl::sycl::nd_item<1> ndItem) {
+  const index_t m{m_};
+  const index_t n{n_};
+
+  const index_t required_tile =
+      (((m - 1) / TileSize) + 1) * (((n - 1) / TileSize) + 1);
+
+  const index_t tile_for_matrix = ((required_tile - 1) / TilePerWG) + 1;
+
+  const index_t wg_batch_id =
+      (ndItem.get_group(0)) / ((required_tile - 1) / TilePerWG + 1);
+
+  const index_t l_lhs_stride = lhs_stride_;
+  const index_t l_rhs_stride = rhs_1_stride_;
+
+  const index_t number_of_block_per_row = ((m_ - 1) / TileSize) + 1;
+
+  const index_t wg_id = ndItem.get_local_id(0) / TileSize +
+                        ((ndItem.get_group(0) % tile_for_matrix) * TilePerWG);
+
+  /* row tile id  per work group */
+  const index_t tile_id_row = wg_id % number_of_block_per_row;
+  /* column tile id per work group */
+  const index_t tile_id_col = wg_id / number_of_block_per_row;
+  /* the start position of the tile-row per work group */
+  const index_t wg_row = tile_id_row * TileSize;
+  /* the start position of the tile-column per work group */
+  const index_t wg_col = tile_id_col * TileSize;
+
+  const index_t item_id = ndItem.get_local_id(0) % TileSize;
+
+  auto orig_lhs = lhs_.get_pointer() + (wg_batch_id * l_lhs_stride);
+  auto orig_rhs = rhs_1_.get_pointer() + (wg_batch_id * l_rhs_stride);
+
+  orig_lhs = orig_lhs + wg_row + wg_col * lhs_ld_ + item_id;
+  orig_rhs = orig_rhs + wg_row + wg_col * rhs_1_ld_ + item_id;
+
+  value_t reg_rhs[TileSize];
+  const index_t alpha = alpha_;
+
+  const bool is_internal_block =
+      (m - wg_row >= TileSize) && (n - wg_col >= TileSize);
+
+  // check for short&large
+  const bool valid_index =
+      (item_id > m || (item_id >= (m - wg_row))) ? false : true;
+  if (!valid_index) return 0;
+
+  if (is_internal_block) {
+    auto A = orig_rhs;
+    auto B = orig_lhs;
+
+#pragma unroll
+    for (int i = 0; i < TileSize; ++i) {
+      reg_rhs[i] = A[i * rhs_1_ld_];
+    }
+#pragma unroll
+    for (int i = 0; i < TileSize; ++i) {
+      B[i * lhs_ld_] = alpha * reg_rhs[i];
+    }
+
+  } else {
+    const auto limit_m = m - wg_row;
+    const auto limit_n = n - wg_col;
+    auto A = orig_rhs;
+    auto B = orig_lhs;
+
+    for (int i = 0; i < TileSize; ++i) {
+      if (i >= limit_n) break;
+      reg_rhs[i] = A[i * rhs_1_ld_];
+    }
+    for (int i = 0; i < TileSize; ++i) {
+      if (i >= limit_n) break;
+      B[i * lhs_ld_] = alpha * reg_rhs[i];
+    }
+  }
+
+  return 0;
+}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE void Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::bind(
+    cl::sycl::handler& h) {
+  lhs_.bind(h);
+  rhs_1_.bind(h);
+  rhs_2_.bind(h);
+}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE void
+Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::adjust_access_displacement() {
+  lhs_.adjust_access_displacement();
+  rhs_1_.adjust_access_displacement();
+  rhs_2_.adjust_access_displacement();
+}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE typename rhs_t::index_t
+Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::get_size() const {
+  return m_ * n_ * batch_size_;
+}
+
+template <int TileSize, int TilePerWG, typename lhs_t, typename rhs_t>
+PORTBLAS_INLINE bool
+Matcopy_batch<TileSize, TilePerWG, lhs_t, rhs_t>::valid_thread(
+    cl::sycl::nd_item<1> ndItem) const {
+  return true;
+}
+}  // namespace blas
+
+#endif  // PORTBLAS_EXTENSION_MATCOPY_BATCH_HPP
diff --git a/src/operations/extension/transpose.hpp b/src/operations/extension/transpose.hpp
index 0d3adba54..57be3db27 100644
--- a/src/operations/extension/transpose.hpp
+++ b/src/operations/extension/transpose.hpp
@@ -54,8 +54,9 @@ template <bool in_place, int Tile_size, int wg_size, int cl_size,
 PORTBLAS_INLINE typename in_t::index_t
 Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
           element_t>::get_size() const {
-  // Smallest TileSize square-multiple containing input/output matrices
-  return (M_pad_ * N_pad_);
+  // Smallest TileSize square-multiple containing input/output matrices times
+  // batch_size
+  return (size_pad_ * batch_size_);
 }
 
 template <bool in_place, int Tile_size, int wg_size, int cl_size,
@@ -86,8 +87,13 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
   index_t idg = id.get_group(0);
   index_t idc = id.get_local_id(0);
 
-  const index_t jg = idg / tile_count_m_;
-  const index_t ig = idg - jg * tile_count_m_;
+  const index_t ibatch =
+      (batch_size_ == index_t(1)) ? 0 : idg / tile_count_total_;
+
+  const index_t relative_idg = idg - ibatch * tile_count_total_;
+
+  const index_t jg = relative_idg / tile_count_m_;
+  const index_t ig = relative_idg - jg * tile_count_m_;
 
   const index_t jl = idc / Tile_size;
   const index_t il = idc - jl * Tile_size;
@@ -98,11 +104,11 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
   i = (i_block_start + il) * inc_a_;
   j = (j_block_start + jl) * inc_at_;
 
-  in_idx =
-      i_block_start * inc_a_ + j_block_start * lda_ + il * inc_a_ + jl * lda_;
+  in_idx = i_block_start * inc_a_ + j_block_start * lda_ + il * inc_a_ +
+           jl * lda_ + ibatch * stride_a_;
 
   out_idx = i_block_start * ldat_ + j_block_start * inc_at_ + jl * inc_at_ +
-            il * ldat_;
+            il * ldat_ + ibatch * stride_at_;
 }
 
 template <bool in_place, int Tile_size, int wg_size, int cl_size,
@@ -156,8 +162,13 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
   index_t idg = id.get_group(0);
   index_t idc = id.get_local_id(0);
 
-  const index_t jg = idg / tile_count_m_;
-  const index_t ig = idg - jg * tile_count_m_;
+  const index_t ibatch =
+      (batch_size_ == index_t(1)) ? 0 : idg / tile_count_total_;
+
+  const index_t relative_idg = idg - ibatch * tile_count_total_;
+
+  const index_t jg = relative_idg / tile_count_m_;
+  const index_t ig = relative_idg - jg * tile_count_m_;
 
   jl = idc / Tile_size;
   il = idc - jl * Tile_size;
@@ -165,8 +176,8 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
   i_block_start = ig * Tile_size;
   j_block_start = jg * Tile_size;
 
-  in_idx =
-      i_block_start * inc_a_ + j_block_start * lda_ + il * inc_a_ + jl * lda_;
+  in_idx = i_block_start * inc_a_ + j_block_start * lda_ + il * inc_a_ +
+           jl * lda_ + ibatch * stride_a_;
 
   index_t jl_cl = idc / get_non_bank_conflict_line_size();
   index_t il_cl = idc - jl_cl * get_non_bank_conflict_line_size();
@@ -174,7 +185,7 @@ Transpose<in_place, Tile_size, wg_size, cl_size, local_memory, in_t, out_t,
   in_local_idx = jl_cl * (get_non_bank_conflict_line_size() + 1) + il_cl;
 
   out_idx = i_block_start * ldat_ + j_block_start * inc_at_ + il * inc_at_ +
-            jl * ldat_;
+            jl * ldat_ + ibatch * stride_at_;
   out_local_idx = il * Tile_size + jl + il / get_num_tiles_per_line();
 }
 
diff --git a/src/portblas.hpp b/src/portblas.hpp
index 6cdafb7d1..623a0e302 100644
--- a/src/portblas.hpp
+++ b/src/portblas.hpp
@@ -50,6 +50,8 @@
 
 #include "operations/extension/transpose.hpp"
 
+#include "operations/extension/matcopy_batch.hpp"
+
 #include "operations/blas_constants.hpp"
 
 #include "operations/blas_operators.hpp"
diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt
index b3a23497a..97696a5ab 100644
--- a/test/unittest/CMakeLists.txt
+++ b/test/unittest/CMakeLists.txt
@@ -61,6 +61,7 @@ set(SYCL_UNITTEST_SRCS
   ${PORTBLAS_UNITTEST}/extension/omatcopy_test.cpp
   ${PORTBLAS_UNITTEST}/extension/omatcopy2_test.cpp
   ${PORTBLAS_UNITTEST}/extension/omatadd_test.cpp
+  ${PORTBLAS_UNITTEST}/extension/omatcopy_batched_test.cpp
 )
 
 if(${BLAS_ENABLE_EXTENSIONS})
diff --git a/test/unittest/extension/extension_reference.hpp b/test/unittest/extension/extension_reference.hpp
index ffa9bad0d..672ee6f07 100644
--- a/test/unittest/extension/extension_reference.hpp
+++ b/test/unittest/extension/extension_reference.hpp
@@ -41,8 +41,8 @@ namespace reference_blas {
  */
 template <bool onemkl_index = true, typename index_t, typename scalar_t>
 void ext_omatcopy(char trans, const index_t m, const index_t n,
-                  const scalar_t alpha, std::vector<scalar_t>& A,
-                  const index_t lda, std::vector<scalar_t>& B, index_t ldb) {
+                  const scalar_t alpha, const scalar_t* A, const index_t lda,
+                  scalar_t* B, index_t ldb) {
   if (trans != 't') {
     for (index_t j = 0; j < n; j++) {
       for (index_t i = 0; i < m; i++) {
@@ -70,9 +70,9 @@ void ext_omatcopy(char trans, const index_t m, const index_t n,
  */
 template <typename scalar_t, typename index_t>
 void ext_omatcopy2(const char& t, const index_t& m, const index_t& n,
-                   const scalar_t& alpha, std::vector<scalar_t>& in_matrix,
+                   const scalar_t& alpha, const scalar_t* in_matrix,
                    const index_t& ld_in, const index_t& inc_in,
-                   std::vector<scalar_t>& out_matrix, const index_t& ld_out,
+                   scalar_t* out_matrix, const index_t& ld_out,
                    const index_t inc_out) {
   if (t == 't') {
     for (int i = 0; i < m; ++i) {
diff --git a/test/unittest/extension/omatcopy2_test.cpp b/test/unittest/extension/omatcopy2_test.cpp
index 8b1058f3f..bae7ba0e4 100644
--- a/test/unittest/extension/omatcopy2_test.cpp
+++ b/test/unittest/extension/omatcopy2_test.cpp
@@ -66,8 +66,8 @@ void run_test(const combination_t<scalar_t> combi) {
   // TODO: There isn't a reference implementation from any library. So we
   // compare the results with a basic host implementation. Working on a
   // better comparison.
-  reference_blas::ext_omatcopy2(trans, m, n, alpha, A_ref, ld_in, inc_in, B_ref,
-                                ld_out, inc_out);
+  reference_blas::ext_omatcopy2(trans, m, n, alpha, A_ref.data(), ld_in, inc_in,
+                                B_ref.data(), ld_out, inc_out);
 
   auto matrix_in = helper::allocate<mem_alloc, scalar_t>(m_a_size, q);
   auto matrix_out = helper::allocate<mem_alloc, scalar_t>(m_b_size, q);
diff --git a/test/unittest/extension/omatcopy_batched_test.cpp b/test/unittest/extension/omatcopy_batched_test.cpp
new file mode 100644
index 000000000..94fb91d53
--- /dev/null
+++ b/test/unittest/extension/omatcopy_batched_test.cpp
@@ -0,0 +1,156 @@
+/***************************************************************************
+ *
+ *  @license
+ *  Copyright (C) Codeplay Software Limited
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  For your convenience, a copy of the License has been included in this
+ *  repository.
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCL-BLAS: BLAS implementation using SYCL
+ *
+ *  @filename omatcopy_batched_test.cpp
+ *
+ **************************************************************************/
+
+#include "blas_test.hpp"
+#include "extension_reference.hpp"
+
+template <typename scalar_t>
+using combination_t = std::tuple<std::string, char, index_t, index_t, scalar_t,
+                                 index_t, index_t, index_t, index_t, index_t>;
+
+template <typename scalar_t, helper::AllocType mem_alloc>
+void run_test(const combination_t<scalar_t> combi) {
+  std::string alloc;
+  char trans;
+  index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size;
+  scalar_t alpha;
+
+  std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m, stride_in_m,
+           stride_out_m, batch_size) = combi;
+
+  // Compute leading dimensions using second_dim-ld multipliers
+  index_t ld_in = ld_in_m * m;
+  index_t ld_out = ld_out_m * (trans == 't' ? n : m);
+
+  index_t size_a = ld_in * n;
+  index_t size_b = ld_out * (trans == 't' ? m : n);
+
+  // Compute Strides using size-stride multipliers
+  index_t stride_in = stride_in_m * size_a;
+  index_t stride_out = stride_out_m * size_b;
+
+  auto q = make_queue();
+  blas::SB_Handle sb_handle(q);
+
+  std::vector<scalar_t> A(stride_in * batch_size);
+  std::vector<scalar_t> B(stride_out * batch_size, 0);
+
+  fill_random(A);
+
+  std::vector<scalar_t> A_ref = A;
+  std::vector<scalar_t> B_ref = B;
+
+  // Reference implementation
+  for (auto b = 0; b < batch_size; b++) {
+    reference_blas::ext_omatcopy(trans, m, n, alpha,
+                                 A_ref.data() + b * stride_in, ld_in,
+                                 B_ref.data() + b * stride_out, ld_out);
+  }
+
+  auto matrix_in =
+      helper::allocate<mem_alloc, scalar_t>(stride_in * batch_size, q);
+  auto matrix_out =
+      helper::allocate<mem_alloc, scalar_t>(stride_out * batch_size, q);
+
+  auto copy_in = helper::copy_to_device<scalar_t>(q, A.data(), matrix_in,
+                                                  stride_in * batch_size);
+  auto copy_out = helper::copy_to_device<scalar_t>(q, B.data(), matrix_out,
+                                                   stride_out * batch_size);
+
+  auto operator_event = blas::_omatcopy_batch(
+      sb_handle, trans, m, n, alpha, matrix_in, ld_in, stride_in, matrix_out,
+      ld_out, stride_out, batch_size, {copy_in, copy_out});
+  sb_handle.wait(operator_event);
+
+  auto event = blas::helper::copy_to_host<scalar_t>(
+      sb_handle.get_queue(), matrix_out, B.data(), stride_out * batch_size);
+  sb_handle.wait(event);
+
+  // Validate the result
+  const bool isAlmostEqual = utils::compare_vectors(B, B_ref);
+  ASSERT_TRUE(isAlmostEqual);
+}
+
+template <typename scalar_t>
+void run_test(const combination_t<scalar_t> combi) {
+  std::string alloc;
+  char trans;
+  index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size;
+  scalar_t alpha;
+
+  std::tie(alloc, trans, m, n, alpha, ld_in_m, ld_out_m, stride_in_m,
+           stride_out_m, batch_size) = combi;
+
+  if (alloc == "usm") {
+#ifdef SB_ENABLE_USM
+    run_test<scalar_t, helper::AllocType::usm>(combi);
+#else
+    GTEST_SKIP();
+#endif
+  } else {
+    run_test<scalar_t, helper::AllocType::buffer>(combi);
+  }
+}
+
+#ifdef STRESS_TESTING
+template <typename scalar_t>
+const auto combi =
+    ::testing::Combine(::testing::Values("usm", "buf"),
+                       ::testing::Values<char>('n', 't'),              // trans
+                       ::testing::Values<index_t>(1024, 4050, 16380),  // m
+                       ::testing::Values<index_t>(1024, 4050, 16380),  // n
+                       ::testing::Values<scalar_t>(0, 1.05, -20.01),   // alpha
+                       ::testing::Values<index_t>(3, 5),     // ld_in_m
+                       ::testing::Values<index_t>(3, 5),     // ld_out_m
+                       ::testing::Values<index_t>(5, 10),    // stride_in_m
+                       ::testing::Values<index_t>(5, 10),    // stride_out_m
+                       ::testing::Values<index_t>(10, 21));  // batch_size
+#else
+template <typename scalar_t>
+const auto combi =
+    ::testing::Combine(::testing::Values("usm", "buf"),
+                       ::testing::Values<char>('n', 't'),         // trans
+                       ::testing::Values<index_t>(64, 129, 255),  // m
+                       ::testing::Values<index_t>(64, 129, 255),  // n
+                       ::testing::Values<scalar_t>(0, 2),         // alpha
+                       ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
+                       ::testing::Values<index_t>(1, 2, 3),       // ld_out_m
+                       ::testing::Values<index_t>(1, 3),          // stride_in_m
+                       ::testing::Values<index_t>(1, 3),      // stride_out_m
+                       ::testing::Values<index_t>(1, 2, 5));  // batch_size
+#endif
+
+template <class T>
+static std::string generate_name(
+    const ::testing::TestParamInfo<combination_t<T>>& info) {
+  std::string alloc;
+  char trans;
+  index_t m, n, ld_in_m, ld_out_m, stride_in_m, stride_out_m, batch_size;
+  T alpha;
+  BLAS_GENERATE_NAME(info.param, alloc, trans, m, n, alpha, ld_in_m, ld_out_m,
+                     stride_in_m, stride_out_m, batch_size);
+}
+
+BLAS_REGISTER_TEST_ALL(OmatCopyBatched, combination_t, combi, generate_name);
diff --git a/test/unittest/extension/omatcopy_test.cpp b/test/unittest/extension/omatcopy_test.cpp
index 4f0eb1e35..a7a1e49a8 100644
--- a/test/unittest/extension/omatcopy_test.cpp
+++ b/test/unittest/extension/omatcopy_test.cpp
@@ -58,7 +58,8 @@ void run_test(const combination_t<scalar_t> combi) {
   std::vector<scalar_t> B_ref = B;
 
   // Reference implementation
-  reference_blas::ext_omatcopy(trans, m, n, alpha, A_ref, ld_in, B_ref, ld_out);
+  reference_blas::ext_omatcopy(trans, m, n, alpha, A_ref.data(), ld_in,
+                               B_ref.data(), ld_out);
 
   auto matrix_in = helper::allocate<mem_alloc, scalar_t>(size_a, q);
   auto matrix_out = helper::allocate<mem_alloc, scalar_t>(size_b, q);
@@ -115,7 +116,7 @@ const auto combi =
                        ::testing::Values<index_t>(1024, 4050, 16380),  // n
                        ::testing::Values<scalar_t>(0, 1.05, 2.01),     // alpha
                        ::testing::Values<index_t>(1, 3),   // ld_in_m
-                       ::testing::Values<index_t>(1, 3));  // ld_in_n
+                       ::testing::Values<index_t>(1, 3));  // ld_out_m
 #else
 template <typename scalar_t>
 const auto combi =
@@ -125,7 +126,7 @@ const auto combi =
                        ::testing::Values<index_t>(64, 129, 255),  // n
                        ::testing::Values<scalar_t>(0, 1, 2),   // alpha
                        ::testing::Values<index_t>(1, 2, 3),       // ld_in_m
-                       ::testing::Values<index_t>(1, 2, 3));      // ld_in_n
+                       ::testing::Values<index_t>(1, 2, 3));      // ld_out_m
 #endif
 
 template <class T>
diff --git a/test/unittest/extension/transpose_test.cpp b/test/unittest/extension/transpose_test.cpp
index ebbc54858..d3c6a0163 100644
--- a/test/unittest/extension/transpose_test.cpp
+++ b/test/unittest/extension/transpose_test.cpp
@@ -87,7 +87,8 @@ void run_test(const combination_t<scalar_t>& combi) {
     helper::deallocate<mem_alloc>(matrix_out, q);
 
   } else {
-    // Inplace Transpose: TODO
+    // Inplace Transpose currently disabled (TODO)
+    GTEST_SKIP();
   }
 }