diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 4ac3fdeaa..87fc58eaf 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -69,6 +69,7 @@ set(sources extension/omatcopy_batched.cpp extension/omatadd.cpp extension/omatadd_batched.cpp + extension/axpy_batch.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) diff --git a/benchmark/portblas/extension/axpy_batch.cpp b/benchmark/portblas/extension/axpy_batch.cpp new file mode 100644 index 000000000..2dcbe3451 --- /dev/null +++ b/benchmark/portblas/extension/axpy_batch.cpp @@ -0,0 +1,182 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp + * + **************************************************************************/ + +#include "../utils.hpp" + +constexpr blas_benchmark::utils::ExtensionOp benchmark_op = + blas_benchmark::utils::ExtensionOp::axpy_batch; + +template +void run(benchmark::State& state, blas::SB_Handle* sb_handle_ptr, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + // initialize the state label + blas_benchmark::utils::set_benchmark_label( + state, sb_handle_ptr->get_queue()); + + // Google-benchmark counters are double. + blas_benchmark::utils::init_extension_counters( + state, size, batch_size); + + blas::SB_Handle& sb_handle = *sb_handle_ptr; + auto q = sb_handle.get_queue(); + + const auto stride_x{size * std::abs(inc_x) * stride_x_mul}; + const auto stride_y{size * std::abs(inc_y) * stride_y_mul}; + + const index_t size_x{stride_x * batch_size}; + const index_t size_y{stride_y * batch_size}; + // Create data + std::vector vx = + blas_benchmark::utils::random_data(size_x); + std::vector vy = + blas_benchmark::utils::random_data(size_y); + + auto inx = blas::helper::allocate(size_x, q); + auto iny = blas::helper::allocate(size_y, q); + + auto copy_x = + blas::helper::copy_to_device(q, vx.data(), inx, size_x); + auto copy_y = + blas::helper::copy_to_device(q, vy.data(), iny, size_y); + + sb_handle.wait({copy_x, copy_y}); + +#ifdef BLAS_VERIFY_BENCHMARK + // Run a first time with a verification of the results + std::vector y_ref = vy; + for (auto i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, static_cast(alpha), + vx.data() + i * stride_x, inc_x, + y_ref.data() + i * stride_y, inc_y); + } + std::vector y_temp = vy; + { + auto y_temp_gpu = blas::helper::allocate(size_y, q); + auto copy_temp = blas::helper::copy_to_device(q, y_temp.data(), + y_temp_gpu, size_y); + sb_handle.wait(copy_temp); + 
auto axpy_batch_event = + _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, y_temp_gpu, + inc_y, stride_y, batch_size); + sb_handle.wait(axpy_batch_event); + auto copy_output = + blas::helper::copy_to_host(q, y_temp_gpu, y_temp.data(), size_y); + sb_handle.wait(copy_output); + + blas::helper::deallocate(y_temp_gpu, q); + } + + std::ostringstream err_stream; + if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; +#endif + + auto blas_method_def = [&]() -> std::vector { + auto event = _axpy_batch(sb_handle, size, alpha, inx, inc_x, stride_x, iny, + inc_y, stride_y, batch_size); + sb_handle.wait(event); + return event; + }; + + // Warmup + blas_benchmark::utils::warmup(blas_method_def); + sb_handle.wait(); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } + + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + + blas_benchmark::utils::calc_avg_counters(state); + + blas::helper::deallocate(inx, q); + blas::helper::deallocate(iny, q); +} + +template +void register_benchmark(blas::SB_Handle* sb_handle_ptr, bool* success, + std::string mem_type, + std::vector> params) { + for (auto p : params) { + index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size; + scalar_t alpha; + std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) = + p; + auto BM_lambda = [&](benchmark::State& st, blas::SB_Handle* sb_handle_ptr, + index_t size, scalar_t alpha, index_t inc_x, + index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, + bool* success) { + run(st, sb_handle_ptr, size, alpha, inc_x, inc_y, + stride_x_mul, 
stride_y_mul, batch_size, success); + }; + benchmark::RegisterBenchmark( + blas_benchmark::utils::get_name( + n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size, + mem_type) + .c_str(), + BM_lambda, sb_handle_ptr, n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success) + ->UseRealTime(); + } +} + +template +void register_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + auto axpy_batch_params = + blas_benchmark::utils::get_axpy_batch_params(args); + + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_BUFFER, + axpy_batch_params); +#ifdef SB_ENABLE_USM + register_benchmark( + sb_handle_ptr, success, blas_benchmark::utils::MEM_TYPE_USM, + axpy_batch_params); +#endif +} + +namespace blas_benchmark { +void create_benchmark(blas_benchmark::Args& args, + blas::SB_Handle* sb_handle_ptr, bool* success) { + BLAS_REGISTER_BENCHMARK(args, sb_handle_ptr, success); +} +} // namespace blas_benchmark diff --git a/benchmark/rocblas/CMakeLists.txt b/benchmark/rocblas/CMakeLists.txt index 64a559931..8332590c4 100644 --- a/benchmark/rocblas/CMakeLists.txt +++ b/benchmark/rocblas/CMakeLists.txt @@ -74,7 +74,7 @@ set(sources # Extension blas extension/omatcopy.cpp extension/omatadd.cpp - + extension/axpy_batch.cpp ) # Operators supporting COMPLEX types benchmarking diff --git a/benchmark/rocblas/extension/axpy_batch.cpp b/benchmark/rocblas/extension/axpy_batch.cpp new file mode 100644 index 000000000..3ce877721 --- /dev/null +++ b/benchmark/rocblas/extension/axpy_batch.cpp @@ -0,0 +1,175 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp + * + **************************************************************************/ + +#include "../utils.hpp" +#include "common/common_utils.hpp" + +constexpr blas_benchmark::utils::ExtensionOp benchmark_op = + blas_benchmark::utils::ExtensionOp::axpy_batch; + +template +static inline void rocblas_axpy_strided_batched_f(args_t&&... args) { + if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS( + rocblas_saxpy_strided_batched(std::forward(args)...)); + } else if constexpr (std::is_same_v) { + CHECK_ROCBLAS_STATUS( + rocblas_daxpy_strided_batched(std::forward(args)...)); + } + return; +} + +template +void run(benchmark::State& state, rocblas_handle& rb_handle, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + // initialize the state label + blas_benchmark::utils::set_benchmark_label(state); + + // Google-benchmark counters are double. 
+ blas_benchmark::utils::init_extension_counters( + state, size, batch_size); + + const auto stride_x{size * std::abs(inc_x) * stride_x_mul}; + const auto stride_y{size * std::abs(inc_y) * stride_y_mul}; + + const index_t size_x{stride_x * batch_size}; + const index_t size_y{stride_y * batch_size}; + // Create data + std::vector vx = + blas_benchmark::utils::random_data(size_x); + std::vector vy = + blas_benchmark::utils::random_data(size_y); + + blas_benchmark::utils::HIPVector inx(size_x, vx.data()); + blas_benchmark::utils::HIPVector iny(size_y, vy.data()); + +#ifdef BLAS_VERIFY_BENCHMARK + // Run a first time with a verification of the results + std::vector y_ref = vy; + for (auto i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, static_cast(alpha), + vx.data() + i * stride_x, inc_x, + y_ref.data() + i * stride_y, inc_y); + } + std::vector y_temp = vy; + { + blas_benchmark::utils::HIPVector y_temp_gpu(size_y, + y_temp.data()); + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, y_temp_gpu, inc_y, + stride_y, batch_size); + } + + std::ostringstream err_stream; + if (!utils::compare_vectors(y_temp, y_ref, err_stream, "")) { + const std::string& err_str = err_stream.str(); + state.SkipWithError(err_str.c_str()); + *success = false; + }; +#endif + + auto blas_warmup = [&]() -> void { + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, iny, inc_y, + stride_y, batch_size); + return; + }; + + hipEvent_t start, stop; + CHECK_HIP_ERROR(hipEventCreate(&start)); + CHECK_HIP_ERROR(hipEventCreate(&stop)); + + auto blas_method_def = [&]() -> std::vector { + CHECK_HIP_ERROR(hipEventRecord(start, NULL)); + rocblas_axpy_strided_batched_f(rb_handle, size, &alpha, inx, + inc_x, stride_x, iny, inc_y, + stride_y, batch_size); + CHECK_HIP_ERROR(hipEventRecord(stop, NULL)); + CHECK_HIP_ERROR(hipEventSynchronize(stop)); + return std::vector{start, stop}; + }; + + // Warmup + 
blas_benchmark::utils::warmup(blas_method_def); + CHECK_HIP_ERROR(hipStreamSynchronize(NULL)); + + blas_benchmark::utils::init_counters(state); + + // Measure + for (auto _ : state) { + // Run + std::tuple times = + blas_benchmark::utils::timef_hip(blas_method_def); + + // Report + blas_benchmark::utils::update_counters(state, times); + } + + state.SetItemsProcessed(state.iterations() * state.counters["n_fl_ops"]); + state.SetBytesProcessed(state.iterations() * + state.counters["bytes_processed"]); + + blas_benchmark::utils::calc_avg_counters(state); + + CHECK_HIP_ERROR(hipEventDestroy(start)); + CHECK_HIP_ERROR(hipEventDestroy(stop)); +} + +template +void register_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, + bool* success) { + auto axpy_batch_params = + blas_benchmark::utils::get_axpy_batch_params(args); + + for (auto p : axpy_batch_params) { + index_t n, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size; + scalar_t alpha; + std::tie(n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size) = + p; + auto BM_lambda = + [&](benchmark::State& st, rocblas_handle rb_handle, index_t size, + scalar_t alpha, index_t inc_x, index_t inc_y, index_t stride_x_mul, + index_t stride_y_mul, index_t batch_size, bool* success) { + run(st, rb_handle, size, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success); + }; + benchmark::RegisterBenchmark( + blas_benchmark::utils::get_name( + n, alpha, inc_x, inc_y, stride_x_mul, stride_y_mul, batch_size, + blas_benchmark::utils::MEM_TYPE_USM) + .c_str(), + BM_lambda, rb_handle, n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, success) + ->UseRealTime(); + } +} + +namespace blas_benchmark { +void create_benchmark(blas_benchmark::Args& args, rocblas_handle& rb_handle, + bool* success) { + BLAS_REGISTER_BENCHMARK(args, rb_handle, success); +} +} // namespace blas_benchmark diff --git a/cmake/CmakeFunctionHelper.cmake b/cmake/CmakeFunctionHelper.cmake index 3bed39572..f2d3244dc 
100644 --- a/cmake/CmakeFunctionHelper.cmake +++ b/cmake/CmakeFunctionHelper.cmake @@ -701,7 +701,8 @@ function (build_library LIB_NAME ENABLE_EXTENSIONS) $ $ $ - $) + $ + $) if (${ENABLE_EXTENSIONS}) list(APPEND LIB_SRCS $) diff --git a/common/include/common/benchmark_identifier.hpp b/common/include/common/benchmark_identifier.hpp index 8d85dcb6f..46e75e3fd 100644 --- a/common/include/common/benchmark_identifier.hpp +++ b/common/include/common/benchmark_identifier.hpp @@ -82,7 +82,8 @@ enum class ExtensionOp : int { imatcopy_batch = 4, omatadd_batch = 5, omatcopy2 = 6, - reduction = 7 + reduction = 7, + axpy_batch = 8 }; template @@ -195,6 +196,8 @@ std::string get_operator_name() { return "Omatcopy2"; else if constexpr (op == ExtensionOp::reduction) return "Reduction"; + else if constexpr (op == ExtensionOp::axpy_batch) + return "Axpy_batch"; else throw std::runtime_error("Unknown BLAS extension operator"); } diff --git a/common/include/common/benchmark_names.hpp b/common/include/common/benchmark_names.hpp index 87d6424d9..f06a6dee6 100644 --- a/common/include/common/benchmark_names.hpp +++ b/common/include/common/benchmark_names.hpp @@ -296,6 +296,15 @@ get_name(index_t rows, index_t cols, std::string reduction_dim, return internal::get_name(rows, cols, reduction_dim, mem_type); } +template +inline typename std::enable_if::type +get_name(index_t n, scalar_t alpha, index_t inc_x, index_t inc_y, + index_t stride_x_mul, index_t stride_y_mul, index_t batch_size, + std::string mem_type) { + return internal::get_name(n, alpha, inc_x, inc_y, stride_x_mul, + stride_y_mul, batch_size, mem_type); +} + } // namespace utils } // namespace blas_benchmark diff --git a/common/include/common/blas_extension_state_counters.hpp b/common/include/common/blas_extension_state_counters.hpp index 7b02654f2..f6720a1f5 100644 --- a/common/include/common/blas_extension_state_counters.hpp +++ b/common/include/common/blas_extension_state_counters.hpp @@ -91,6 +91,21 @@ 
init_extension_counters(benchmark::State& state, const char* t_a, } return; } + +template +inline typename std::enable_if::type +init_extension_counters(benchmark::State& state, index_t n, + index_t batch_size) { + // The way counters are computed are the same as axpy but multiplied + // by the batch_size + // Google-benchmark counters are double. + double size_d = static_cast(n); + state.counters["size"] = size_d * batch_size; + state.counters["n_fl_ops"] = 2.0 * size_d * batch_size; + state.counters["bytes_processed"] = + 3 * size_d * sizeof(scalar_t) * batch_size; + return; +} } // namespace utils } // namespace blas_benchmark diff --git a/common/include/common/common_utils.hpp b/common/include/common/common_utils.hpp index 251ee9b7f..fc4562aa8 100644 --- a/common/include/common/common_utils.hpp +++ b/common/include/common/common_utils.hpp @@ -141,6 +141,10 @@ using omatadd_batch_param_t = std::tuple; +template +using axpy_batch_param_t = + std::tuple; + namespace blas_benchmark { namespace utils { @@ -1483,6 +1487,49 @@ get_omatadd_batch_params(Args& args) { } } +/** + * @fn get_axpy_batch_params + * @brief Returns a vector containing the axpy_batch benchmark parameters, + * either read from a file according to the command-line args, or the default + * ones. 
+ */ +template +static inline std::vector> get_axpy_batch_params( + Args& args) { + if (args.csv_param.empty()) { + warning_no_csv(); + std::vector> axpy_batch_default; + constexpr index_t dmin = 1 << 10, dmax = 1 << 22; + constexpr index_t batch_size{5}; + constexpr index_t incX{1}; + constexpr index_t incY{1}; + constexpr index_t stride_x_mul{1}; + constexpr index_t stride_y_mul{1}; + constexpr scalar_t alpha{1}; + for (auto n = dmin; n <= dmax; n *= 2) { + axpy_batch_default.push_back(std::make_tuple( + n, alpha, incX, incY, stride_x_mul, stride_y_mul, batch_size)); + } + return axpy_batch_default; + } else { + return parse_csv_file>( + args.csv_param, [&](std::vector& v) { + if (v.size() != 7) { + throw std::runtime_error( + "invalid number of parameters (7 expected)"); + } + try { + return std::make_tuple( + str_to_int(v[0]), str_to_scalar(v[1]), + str_to_int(v[2]), str_to_int(v[3]), + str_to_int(v[4]), str_to_int(v[5]), + str_to_int(v[6])); + } catch (...) { + throw std::runtime_error("invalid parameter"); + } + }); + } +} /** * @fn get_type_name * @brief Returns a string with the given type. 
The C++ specification diff --git a/include/interface/extension_interface.h b/include/interface/extension_interface.h index f59d3f289..2e78d7935 100644 --- a/include/interface/extension_interface.h +++ b/include/interface/extension_interface.h @@ -146,6 +146,23 @@ typename sb_handle_t::event_t _transpose_add_impl( index_t _ldc, index_t _stride_c, index_t _batch_size, const typename sb_handle_t::event_t& _dependencies); +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies); + +template +typename sb_handle_t::event_t _axpy_batch_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies, index_t global_size); + } // namespace internal /** @@ -349,6 +366,34 @@ typename sb_handle_t::event_t _omatadd_batch( stride_c, batch_size, _dependencies); } +/** + * \brief Compute a batch of AXPY operation all together + * + * Implements AXPY \f$y = ax + y\f$ + * + * @param sb_handle SB_Handle + * @param _alpha scalar + * @param _vx BufferIterator or USM pointer + * @param _incx Increment for the vector X + * @param _stride_x Stride distance of two consecutive vectors in X + * @param _vy BufferIterator or USM pointer + * @param _incy Increment for the vector Y + * @param _stride_y Stride distance of two consecutive vectors in Y + * @param _batch_size number of axpy operations to compute + * @param _dependencies Vector of events + */ +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t 
_batch_size, + const typename sb_handle_t::event_t& _dependencies = {}) { + return internal::_axpy_batch(sb_handle, _N, _alpha, _vx, _incx, _stride_x, + _vy, _incy, _stride_y, _batch_size, + _dependencies); +} + namespace extension { /** * \brief Transpose a Matrix in-place @@ -417,7 +462,6 @@ typename sb_handle_t::event_t _reduction( } } // namespace extension - } // namespace blas #endif // PORTBLAS_EXTENSION_INTERFACE_H diff --git a/include/operations/extension/axpy_batch.h b/include/operations/extension/axpy_batch.h new file mode 100644 index 000000000..a034201ab --- /dev/null +++ b/include/operations/extension/axpy_batch.h @@ -0,0 +1,90 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.h + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_AXPY_BATCH_H +#define PORTBLAS_EXTENSION_AXPY_BATCH_H + +namespace blas { + +/*! + * This class holds the kernel implementation to perform axpy_batch + * operator. + * + * It has three additional template parameters to keep the operation simple and + * to avoid some computation or code divergence inside the kernel code. 
 + * + * If sameSign is false the kernel always assumes that inc_r is negative. This + * is true by construction. When the increments have different signs the result + * positions are swapped and indexes must be computed accordingly. Keeping + * inc_r always negative and inc_l always positive keeps the index + * computation consistent, obtaining the correct result. + * + * sameSign indicates whether inc_r and inc_l have the same sign. The code + * implementation needs to follow a different index computation; this template + * allows resolving the condition at compile time, avoiding code divergence. + * + * localSize is the local size of the work-group, allowing some device tailoring + * at compile time. + * + * maxBlockPerBatch sets the number of device work-groups to use for each + * batch. If possible, multiple batches are computed concurrently. + */ + +template +struct Axpy_batch { + using value_t = typename lhs_t::value_t; + using index_t = typename rhs_t::index_t; + + lhs_t lhs_; + rhs_t rhs_; + value_t alpha_; + index_t n_, inc_r, inc_l, lhs_stride_, rhs_stride_, batch_size_, + n_block_per_loop; + + Axpy_batch(lhs_t _lhs, rhs_t _rhs_1, value_t _alpha, index_t _N, + index_t _inc_l, index_t _lhs_stride, index_t _inc_r, + index_t _rhs_stride, index_t _batch_size); + index_t get_size() const; + bool valid_thread(cl::sycl::nd_item<1> ndItem) const; + value_t eval(cl::sycl::nd_item<1> ndItem); + void bind(cl::sycl::handler &h); + void adjust_access_displacement(); +}; + +template +Axpy_batch make_axpy_batch( + lhs_t _lhs, rhs_t _rhs_1, typename rhs_t::value_t _alpha, + typename rhs_t::index_t _N, typename rhs_t::index_t _inc_l, + typename rhs_t::index_t _lhs_stride, typename rhs_t::index_t _inc_r, + typename rhs_t::index_t _rhs_stride, typename rhs_t::index_t _batch_size) { + return Axpy_batch( + _lhs, _rhs_1, _alpha, _N, _inc_l, _lhs_stride, _inc_r, _rhs_stride, + _batch_size); +} + +} // namespace blas + +#endif // PORTBLAS_EXTENSION_AXPY_BATCH_H diff --git a/include/portblas.h b/include/portblas.h index 
1c212166a..93719cfd6 100644 --- a/include/portblas.h +++ b/include/portblas.h @@ -55,6 +55,8 @@ #include "operations/extension/matcopy_batch.h" +#include "operations/extension/axpy_batch.h" + #include "operations/blas_constants.h" #include "operations/blas_operators.h" diff --git a/src/interface/extension/CMakeLists.txt b/src/interface/extension/CMakeLists.txt index 836b68cbd..132628f70 100644 --- a/src/interface/extension/CMakeLists.txt +++ b/src/interface/extension/CMakeLists.txt @@ -28,5 +28,6 @@ generate_blas_objects(extension transpose) generate_blas_objects(extension omatadd) generate_blas_objects(extension matcopy_batch) generate_blas_objects(extension omatadd_batch) +generate_blas_objects(extension axpy_batch) generate_blas_reduction_objects(extension reduction) diff --git a/src/interface/extension/axpy_batch.cpp.in b/src/interface/extension/axpy_batch.cpp.in new file mode 100644 index 000000000..89b0940d3 --- /dev/null +++ b/src/interface/extension/axpy_batch.cpp.in @@ -0,0 +1,76 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.cpp.in + * + **************************************************************************/ + +#include "interface/extension_interface.hpp" +#include "operations/extension/axpy_batch.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/portblas_handle.hpp" +#include "sb_handle/kernel_constructor.hpp" +#include "sb_handle/portblas_handle.hpp" + +namespace blas { +namespace internal { + +/** + * \brief AXPY_BATCH constant times a vector plus a vector. + * + * Implements AXPY_BATCH \f$y = ax + y\f$ + * + * @param SB_Handle + * @param _vx ${DATA_TYPE} + * @param _incx Increment in X axis + * @param _stridex Stride distance of vector in X + * @param _vy ${DATA_TYPE} + * @param _incy Increment in Y axis + * @param _stridey Stride distance of vector in Y + * @param _batch_size number of batches + */ + +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + BufferIterator<${DATA_TYPE}> _vx, ${INDEX_TYPE} _incx, + ${INDEX_TYPE} _stridex, BufferIterator<${DATA_TYPE}> _vy, + ${INDEX_TYPE} _incy, ${INDEX_TYPE} _stridey, + ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); + +#ifdef SB_ENABLE_USM +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + ${DATA_TYPE} * _vx, ${INDEX_TYPE} _incx, ${INDEX_TYPE} _stridex, + ${DATA_TYPE} * _vy, ${INDEX_TYPE} _incy, ${INDEX_TYPE} _stridey, + ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); + +template typename SB_Handle::event_t _axpy_batch( + SB_Handle& sb_handle, ${INDEX_TYPE} _N, ${DATA_TYPE} _alpha, + const ${DATA_TYPE} * _vx, ${INDEX_TYPE} _incx, + ${INDEX_TYPE} _stridex, ${DATA_TYPE} * _vy, ${INDEX_TYPE} _incy, + ${INDEX_TYPE} _stridey, ${INDEX_TYPE} _batch_size, + const typename SB_Handle::event_t& dependencies); +#endif + +} // namespace internal +} // end 
namespace blas diff --git a/src/interface/extension/backend/amd_gpu.hpp b/src/interface/extension/backend/amd_gpu.hpp index 3ee9db746..f969f77a1 100644 --- a/src/interface/extension/backend/amd_gpu.hpp +++ b/src/interface/extension/backend/amd_gpu.hpp @@ -138,6 +138,31 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on AMDW6800 + const index_t global_size = + (_N * _batch_size >= 163840) + ? (_N > (1 << 19)) ? 
(local_size * nWG) / 4 : local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch + } // namespace blas #endif diff --git a/src/interface/extension/backend/default_cpu.hpp b/src/interface/extension/backend/default_cpu.hpp index b168bb6c5..d8a2f6c24 100644 --- a/src/interface/extension/backend/default_cpu.hpp +++ b/src/interface/extension/backend/default_cpu.hpp @@ -111,6 +111,29 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on i9 CPU + const index_t global_size = (_N * _batch_size >= 163840) + ? 
local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/intel_gpu.hpp b/src/interface/extension/backend/intel_gpu.hpp index 9e2566aa7..90ec53746 100644 --- a/src/interface/extension/backend/intel_gpu.hpp +++ b/src/interface/extension/backend/intel_gpu.hpp @@ -132,6 +132,29 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on intelGPU + const index_t global_size = (_N * _batch_size > 327680) + ? 
local_size * nWG + : local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 128>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); +} +} // namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension/backend/nvidia_gpu.hpp b/src/interface/extension/backend/nvidia_gpu.hpp index e3aac7028..b21228f5d 100644 --- a/src/interface/extension/backend/nvidia_gpu.hpp +++ b/src/interface/extension/backend/nvidia_gpu.hpp @@ -138,6 +138,39 @@ typename sb_handle_t::event_t _omatadd_batch( } } // namespace backend } // namespace omatadd_batch + +namespace axpy_batch { +namespace backend { +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + // local_size taken empirically + constexpr index_t local_size = static_cast(256); + const auto nWG = (_N + local_size - 1) / local_size; + // the limit for _N*batch_size is taken empirically from test on A100 + if (_N * _batch_size <= 81920 || _N <= 16384) { + const index_t global_size = local_size * nWG * _batch_size; + return blas::internal::_axpy_batch_impl<256, 32>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } else if (_N <= (1 << 19)) { + const index_t global_size = local_size * nWG; + return blas::internal::_axpy_batch_impl<256, 64>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } else { + const index_t global_size = (local_size * nWG); + return blas::internal::_axpy_batch_impl<256, 128>( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies, global_size); + } +} +} // 
namespace backend +} // namespace axpy_batch } // namespace blas #endif diff --git a/src/interface/extension_interface.hpp b/src/interface/extension_interface.hpp index 4ebeafc01..9613a4aeb 100644 --- a/src/interface/extension_interface.hpp +++ b/src/interface/extension_interface.hpp @@ -31,6 +31,7 @@ #include "interface/extension_interface.h" #include "operations/blas1_trees.h" #include "operations/blas_operators.hpp" +#include "operations/extension/axpy_batch.h" #include "operations/extension/matcopy_batch.h" #include "operations/extension/reduction.h" #include "operations/extension/transpose.h" @@ -596,6 +597,55 @@ typename sb_handle_t::event_t _reduction( sb_handle, buffer_in, ld, buffer_out, rows, cols, dependencies); } } +template +typename sb_handle_t::event_t _axpy_batch( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies) { + return blas::axpy_batch::backend::_axpy_batch( + sb_handle, _N, _alpha, _vx, _incx, _stride_x, _vy, _incy, _stride_y, + _batch_size, _dependencies); +} + +template +typename sb_handle_t::event_t _axpy_batch_impl( + sb_handle_t& sb_handle, index_t _N, element_t _alpha, container_0_t _vx, + index_t _incx, index_t _stride_x, container_1_t _vy, index_t _incy, + index_t _stride_y, index_t _batch_size, + const typename sb_handle_t::event_t& _dependencies, index_t global_size) { + // if inc are of opposite sign the values are exchanged. It doesn't matter + // which one is positive or negative, so to simplify index computation in + // kernel we always set incx to be negative and incy to be positive. + if (_incx > 0 && _incy < 0) { + _incx = -_incx; + _incy = -_incy; + } + // if _stride_x is zero use _N as vx size + const index_t overall_vx_size = (_stride_x) ? 
_stride_x * _batch_size : _N; + typename VectorViewType::type vx = + make_vector_view(_vx, static_cast(_incx), overall_vx_size); + auto vy = make_vector_view(_vy, _incy, _stride_y * _batch_size); + // If both vectors are read from the same side it doesn't matter the sign of + // the increment + if (_incx * _incy > 0) { + auto op = make_axpy_batch( + vy, vx, _alpha, _N, std::abs(_incy), _stride_y, std::abs(_incx), + _stride_x, _batch_size); + typename sb_handle_t::event_t ret = sb_handle.execute( + op, static_cast(localSize), global_size, _dependencies); + return ret; + } else { + auto op = make_axpy_batch( + vy, vx, _alpha, _N, _incy, _stride_y, _incx, _stride_x, _batch_size); + typename sb_handle_t::event_t ret = sb_handle.execute( + op, static_cast(localSize), global_size, _dependencies); + return ret; + } +} } // namespace internal } // namespace blas diff --git a/src/operations/extension/axpy_batch.hpp b/src/operations/extension/axpy_batch.hpp new file mode 100644 index 000000000..d5a70a7d6 --- /dev/null +++ b/src/operations/extension/axpy_batch.hpp @@ -0,0 +1,141 @@ +/*************************************************************************** + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SYCL-BLAS: BLAS implementation using SYCL + * + * @filename axpy_batch.hpp + * + **************************************************************************/ + +#ifndef PORTBLAS_EXTENSION_AXPY_BATCH_HPP +#define PORTBLAS_EXTENSION_AXPY_BATCH_HPP + +#include "blas_meta.h" +#include "operations/extension/axpy_batch.h" + +namespace blas { + +template +Axpy_batch::Axpy_batch( + lhs_t _lhs, rhs_t _rhs, typename lhs_t::value_t _alpha, + typename rhs_t::index_t _N, typename rhs_t::index_t _inc_l, + typename rhs_t::index_t _lhs_stride, typename rhs_t::index_t _inc_r, + typename rhs_t::index_t _rhs_stride, typename rhs_t::index_t _batch_size) + : lhs_(_lhs), + rhs_(_rhs), + alpha_(_alpha), + n_(_N), + inc_l(_inc_l), + lhs_stride_(_lhs_stride), + inc_r(_inc_r), + rhs_stride_(_rhs_stride), + batch_size_(_batch_size), + n_block_per_loop(std::min((n_ + localSize - 1) / localSize, + static_cast(maxBlockPerBatch))){}; + +template +PORTBLAS_INLINE typename lhs_t::value_t +Axpy_batch::eval( + cl::sycl::nd_item<1> ndItem) { + const index_t n{n_}; + const value_t alpha{alpha_}; + const auto vx = rhs_.get_data(); + const auto vy = lhs_.get_data(); + const auto nbl{n_block_per_loop}; + + const index_t block_id = ndItem.get_group(0) % nbl; + const index_t l_id = + static_cast(ndItem.get_local_range(0)) * block_id + + ndItem.get_local_id(0); + const index_t group_id = static_cast(ndItem.get_group(0) / nbl); + + const index_t size_compute_rateo = + (n > nbl * localSize) ? 
n / (nbl * localSize) : batch_size_; + const index_t jump_value{sycl::min(batch_size_, size_compute_rateo)}; + + if (group_id >= jump_value || l_id > n) return {}; + + const index_t stride_x = ndItem.get_local_range(0) * nbl * inc_r; + const index_t stride_y = ndItem.get_local_range(0) * nbl * inc_l; + index_t x_index{}; + index_t y_index{}; + int j{}; + + if constexpr (sameSign) { + for (auto out_loop = group_id; out_loop < batch_size_; + out_loop += jump_value) { + x_index = out_loop * rhs_stride_ + l_id * inc_r; + y_index = out_loop * lhs_stride_ + l_id * inc_l; + j = y_index; + for (auto i = x_index; i < (out_loop * rhs_stride_) + n * inc_r; + i += stride_x, j += stride_y) { + vy[j] += alpha * vx[i]; + } + } + + } else { + for (auto out_loop = group_id; out_loop < batch_size_; + out_loop += jump_value) { + x_index = out_loop * rhs_stride_ + inc_r + n * (-inc_r) + l_id * inc_r; + y_index = out_loop * lhs_stride_ + l_id * inc_l; + j = y_index; + for (auto i = x_index; i >= (out_loop * rhs_stride_); + i += stride_x, j += stride_y) { + vy[j] += alpha * vx[i]; + } + } + } + + return {}; +} + +template +PORTBLAS_INLINE void Axpy_batch::bind(cl::sycl::handler& h) { + lhs_.bind(h); + rhs_.bind(h); +} + +template +PORTBLAS_INLINE void Axpy_batch::adjust_access_displacement() { + lhs_.adjust_access_displacement(); + rhs_.adjust_access_displacement(); +} + +template +PORTBLAS_INLINE typename rhs_t::index_t Axpy_batch< + sameSign, localSize, maxBlockPerBatch, lhs_t, rhs_t>::get_size() const { + return n_ * batch_size_; +} + +template +PORTBLAS_INLINE bool +Axpy_batch::valid_thread( + cl::sycl::nd_item<1> ndItem) const { + return true; +} +} // namespace blas + +#endif diff --git a/src/portblas.hpp b/src/portblas.hpp index 623a0e302..5ed50d98d 100644 --- a/src/portblas.hpp +++ b/src/portblas.hpp @@ -52,6 +52,8 @@ #include "operations/extension/matcopy_batch.hpp" +#include "operations/extension/axpy_batch.hpp" + #include "operations/blas_constants.hpp" #include 
"operations/blas_operators.hpp" diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index 54b386df8..b5629e059 100644 --- a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -64,6 +64,7 @@ set(SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/extension/omatcopy2_test.cpp ${PORTBLAS_UNITTEST}/extension/omatcopy_batched_test.cpp ${PORTBLAS_UNITTEST}/extension/omatadd_batched_test.cpp + ${PORTBLAS_UNITTEST}/extension/axpy_batch_test.cpp ) if(${BLAS_ENABLE_EXTENSIONS}) diff --git a/test/unittest/extension/axpy_batch_test.cpp b/test/unittest/extension/axpy_batch_test.cpp new file mode 100644 index 000000000..7c7bce441 --- /dev/null +++ b/test/unittest/extension/axpy_batch_test.cpp @@ -0,0 +1,151 @@ +/*************************************************************************** + * + * @license + * Copyright (C) Codeplay Software Limited + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * For your convenience, a copy of the License has been included in this + * repository. + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * portBLAS: BLAS implementation using SYCL + * + * @filename axpy_batch_test.cpp + * + **************************************************************************/ + +#include "blas_test.hpp" + +template +using combination_t = std::tuple; + +template +void run_test(const combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + index_t incY; + index_t stride_mul_x; + index_t stride_mul_y; + index_t batch_size; + std::tie(alloc, size, alpha, incX, incY, stride_mul_x, stride_mul_y, + batch_size) = combi; + + const index_t stride_x{size * std::abs(incX) * stride_mul_x}; + const index_t stride_y{size * std::abs(incY) * stride_mul_y}; + + auto x_size = (stride_x) ? stride_x * batch_size : size * std::abs(incX); + auto y_size = stride_y * batch_size; + // Input vector + std::vector x_v(x_size); + fill_random(x_v); + + // Output vector + std::vector y_v(y_size, 10.0); + std::vector y_cpu_v(y_size, 10.0); + + // Reference implementation + for (index_t i = 0; i < batch_size; ++i) { + reference_blas::axpy(size, alpha, x_v.data() + i * stride_x, incX, + y_cpu_v.data() + i * stride_y, incY); + } + + // SYCL implementation + auto q = make_queue(); + blas::SB_Handle sb_handle(q); + + // Iterators + auto gpu_x_v = helper::allocate(x_size, q); + auto gpu_y_v = helper::allocate(y_size, q); + + auto copy_x = helper::copy_to_device(q, x_v.data(), gpu_x_v, x_size); + auto copy_y = helper::copy_to_device(q, y_v.data(), gpu_y_v, y_size); + + auto axpy_batch_event = + _axpy_batch(sb_handle, size, alpha, gpu_x_v, incX, stride_x, gpu_y_v, + incY, stride_y, batch_size, {copy_x, copy_y}); + sb_handle.wait(axpy_batch_event); + + auto event = helper::copy_to_host(q, gpu_y_v, y_v.data(), y_size); + sb_handle.wait(event); + + // Validate the result + const bool isAlmostEqual = utils::compare_vectors(y_v, y_cpu_v); + ASSERT_TRUE(isAlmostEqual); + + helper::deallocate(gpu_x_v, q); + helper::deallocate(gpu_y_v, q); +} + +template +void run_test(const 
combination_t combi) { + std::string alloc; + index_t size; + scalar_t alpha; + index_t incX; + index_t incY; + index_t stride_mul_x; + index_t stride_mul_y; + index_t batch_size; + std::tie(alloc, size, alpha, incX, incY, stride_mul_x, stride_mul_y, + batch_size) = combi; + + if (alloc == "usm") { // usm alloc +#ifdef SB_ENABLE_USM + run_test(combi); +#else + GTEST_SKIP(); +#endif + } else { // buffer alloc + run_test(combi); + } +} + +#ifdef STRESS_TESTING +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 10020, 1000240), // size + ::testing::Values(0.0, 1.3, 2.5), // alpha + ::testing::Values(1, -1, 2, -7), // incX + ::testing::Values(1, -1, 3, -5), // incY + ::testing::Values(1, 2, 3), // stride_mul_x + ::testing::Values(1, 2, 3), // stride_mul_y + ::testing::Values(5) // batch_size + ); +#else +template +const auto combi = + ::testing::Combine(::testing::Values("usm", "buf"), // allocation type + ::testing::Values(11, 65, 1002, 10240), // size + ::testing::Values(0.0, 1.3), // alpha + ::testing::Values(1, -1, 2, -4), // incX + ::testing::Values(1, -1, 3, -5), // incY + ::testing::Values(0, 1, 2, 3), // stride_mul_x + ::testing::Values(1, 2, 3), // stride_mul_y + ::testing::Values(5) // batch_size + ); +#endif + +template +static std::string generate_name( + const ::testing::TestParamInfo>& info) { + std::string alloc; + index_t size, incX, incY, stride_mul_x, stride_mul_y, batch_size; + T alpha; + BLAS_GENERATE_NAME(info.param, alloc, size, alpha, incX, incY, stride_mul_x, + stride_mul_y, batch_size); +} + +BLAS_REGISTER_TEST_ALL(Axpy_batch, combination_t, combi, generate_name);