diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index a6d25b50bd..6c1eed33f3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -229,3 +229,6 @@ In chronological order:
 
 * Christopher Daley <https://github.com/cdaley>
   * [2024-01-24] Optimize GEMV forwarding on ARM64 systems
+
+* Aymen Qader <aymen.qader@arm.com>
+  * [2024-12-09] Add Arm®v9-A architecture SME2 SGEMM kernels
diff --git a/driver/level3/level3.c b/driver/level3/level3.c
index b7328876b4..e22bcb1079 100644
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@@ -345,6 +345,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
 	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#elif defined(ARMV9SME) && !defined(DOUBLE) && !defined(COMPLEX)
+	/* the current SME SGEMM kernel requires n>=8*GEMM_UNROLL_N to achieve best performance */
+	if (min_jj >= 8*GEMM_UNROLL_N) min_jj = 8*GEMM_UNROLL_N;
 #else
         if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
         else
diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME
index bc59990979..2e00ae4a70 100644
--- a/kernel/arm64/KERNEL.ARMV9SME
+++ b/kernel/arm64/KERNEL.ARMV9SME
@@ -1 +1,8 @@
 include $(KERNELDIR)/KERNEL.ARMV8SVE
+
+SGEMMKERNEL = sgemm_kernel_sme.c
+
+SGEMMINCOPY = sgemm_ncopy_sme.c
+SGEMMITCOPY = sgemm_tcopy_sme.c
+SGEMMONCOPY = sgemm_ncopy_sme.c
+SGEMMOTCOPY = sgemm_tcopy_sme.c
diff --git a/kernel/arm64/sgemm_kernel_sme.c b/kernel/arm64/sgemm_kernel_sme.c
new file mode 100644
index 0000000000..073c97f5e0
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_sme.c
@@ -0,0 +1,188 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <arm_sme.h>
+
+#include "common.h"
+#include "sme_abi.h"
+
+// Outer product kernel.
+// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA.
+// This kernel is unpredicated, and assumes a full 2SVL x 2SVL block.
+__attribute__((always_inline)) inline void
+kernel_2x2(const float *A, const float *B, float *C, float alpha,
+           size_t shared_dim, size_t a_step, size_t b_step, size_t c_step)
+    __arm_out("za") __arm_streaming {
+  const size_t svl = svcntw();
+
+  // Predicate set-up
+  svbool_t ptrue = svptrue_b32();
+
+  // Load from C into ZA
+  for (size_t i = 0; i < (svl >> 1); i++) {
+    svld1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]);
+    svld1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]);
+    svld1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]);
+    svld1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]);
+  }
+
+  svfloat32_t alpha_vec = svdup_f32(alpha);
+
+  // Iterate through shared dimension (K)
+  for (size_t k = 0; k < shared_dim; k++) {
+    // Load column of A
+    svfloat32x2_t cols_a = svld1_x2(svptrue_c32(), &A[k * a_step]);
+
+    // Load row of B
+    svfloat32x2_t rows_b = svld1_x2(svptrue_c32(), &B[k * b_step]);
+
+    // Multiply B through by alpha
+    svfloat32_t row_b_0 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 0));
+    svfloat32_t row_b_1 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 1));
+
+    // Perform outer products
+    svmopa_za32_m(0, ptrue, ptrue, svget2(cols_a, 0), row_b_0);
+    svmopa_za32_m(1, ptrue, ptrue, svget2(cols_a, 1), row_b_0);
+    svmopa_za32_m(2, ptrue, ptrue, svget2(cols_a, 0), row_b_1);
+    svmopa_za32_m(3, ptrue, ptrue, svget2(cols_a, 1), row_b_1);
+  }
+
+  // Store out to C from ZA
+  for (size_t i = 0; i < (svl >> 1); i++) {
+    // Store out one row of C per tile
+    svst1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]);
+    svst1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]);
+    svst1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]);
+    svst1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]);
+  }
+}
+
+// Outer product kernel.
+// Computes an SVL x SVL block of C, utilizing a single FP32 tile of ZA (ZA0).
+// This kernel is predicated, and can handle under-filled blocks.
+__attribute__((always_inline)) inline void
+kernel_1x1(const float *A, const float *B, float *C, float alpha,
+           size_t shared_dim, size_t a_len, size_t a_step, size_t b_len,
+           size_t b_step, size_t c_step, size_t c_rows, size_t c_cols)
+    __arm_out("za") __arm_streaming {
+
+  // Predicate set-up
+  svbool_t pg = svptrue_b32();
+  svbool_t pg_a = svwhilelt_b32_u64(0, a_len);
+  svbool_t pg_b = svwhilelt_b32_u64(0, b_len);
+  svbool_t pg_c = svwhilelt_b32_u64(0, c_rows);
+
+  // Load from C into ZA
+  for (size_t i = 0; i < c_cols; i++) {
+    svld1_ver_za32(0, i, pg_c, &C[i * c_step]);
+  }
+
+  svfloat32_t alpha_vec = svdup_f32_z(pg_b, alpha);
+
+  // Iterate through shared dimension (K)
+  for (size_t k = 0; k < shared_dim; k++) {
+    // Load column of A
+    svfloat32_t col_a = svld1(pg_a, &A[k * a_step]);
+    // Load row of B
+    svfloat32_t row_b = svld1(pg_b, &B[k * b_step]);
+    // Multiply B through by alpha
+    row_b = svmul_x(pg_b, alpha_vec, row_b);
+    // Perform outer product
+    svmopa_za32_m(0, pg, pg, col_a, row_b);
+  }
+
+  // Store out to C from ZA
+  for (size_t i = 0; i < c_cols; i++) {
+    svst1_ver_za32(0, i, pg_c, &C[i * c_step]);
+  }
+}
+
+__arm_new("za") __arm_locally_streaming
+    int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT *ba,
+              FLOAT *bb, FLOAT *C, BLASLONG ldc) {
+
+  const BLASLONG num_rows = bm;
+  const BLASLONG num_cols = bn;
+
+  const FLOAT *a_ptr = ba;
+  const FLOAT *b_ptr = bb;
+  FLOAT *c_ptr = C;
+
+  const BLASLONG svl = svcntw();
+
+  const BLASLONG a_step = bm;
+  const BLASLONG b_step = bn;
+  const BLASLONG c_step = ldc;
+
+  // Block over rows of C (panels of A)
+  BLASLONG row_idx = 0;
+
+  // 2x2 loop
+  BLASLONG row_batch = 2 * svl;
+
+  // Block over row dimension of C
+  for (; row_idx + row_batch <= num_rows; row_idx += row_batch) {
+    BLASLONG col_idx = 0;
+    BLASLONG col_batch = 2 * svl;
+
+    // Block over column dimension of C
+    for (; col_idx + col_batch <= num_cols; col_idx += col_batch) {
+      kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx],
+                 &c_ptr[row_idx + col_idx * c_step], alpha0, bk, a_step, b_step,
+                 c_step);
+    }
+
+    // Handle under-filled blocks w/ 2x(1x1) kernels
+    col_batch = 1 * svl;
+    for (; col_idx < num_cols; col_idx += col_batch) {
+      col_batch = MIN(col_batch, num_cols - col_idx);
+
+      kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx],
+                 &c_ptr[row_idx + col_idx * c_step], alpha0, bk, svl, a_step,
+                 col_batch, b_step, c_step, svl, col_batch);
+
+      kernel_1x1(&a_ptr[row_idx + svl], &b_ptr[col_idx],
+                 &c_ptr[(row_idx + svl) + col_idx * c_step], alpha0, bk, svl,
+                 a_step, col_batch, b_step, c_step, svl, col_batch);
+    }
+  }
+
+  // Handle under-filled blocks w/ 1x1 kernels
+  row_batch = 1 * svl;
+  for (; row_idx < num_rows; row_idx += row_batch) {
+    row_batch = MIN(row_batch, num_rows - row_idx);
+    // Block over column dimension of C
+    BLASLONG col_batch = svl;
+    for (BLASLONG col_idx = 0; col_idx < num_cols; col_idx += col_batch) {
+      col_batch = MIN(col_batch, num_cols - col_idx);
+      kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx],
+                 &c_ptr[row_idx + col_idx * c_step], alpha0, bk, row_batch,
+                 a_step, col_batch, b_step, c_step, row_batch, col_batch);
+    }
+  }
+  return 0;
+}
diff --git a/kernel/arm64/sgemm_ncopy_sme.c b/kernel/arm64/sgemm_ncopy_sme.c
new file mode 100644
index 0000000000..8a2271b886
--- /dev/null
+++ b/kernel/arm64/sgemm_ncopy_sme.c
@@ -0,0 +1,64 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <arm_sme.h>
+
+#include "common.h"
+#include "sme_abi.h"
+
+// Transpose 1SVL x N panel of A into B
+__attribute__((always_inline)) inline static void
+transpose_panel(const FLOAT *a, FLOAT *b, BLASLONG rows, BLASLONG cols,
+                BLASLONG a_step, BLASLONG b_step)
+    __arm_out("za") __arm_streaming {
+  BLASLONG col_batch = svcntsw();
+  const svbool_t pg_a = svwhilelt_b32_u64(0, rows);
+
+  for (BLASLONG k = 0; k < cols; k += col_batch) {
+    col_batch = MIN(col_batch, cols - k);
+    for (BLASLONG col = 0; col < col_batch; col++) {
+      svld1_ver_za32(0, col, pg_a, &a[(col + k) * a_step]);
+    }
+
+    const svbool_t pg_b = svwhilelt_b32_u64(k, cols);
+    for (BLASLONG row = 0; row < rows; row++) {
+      svst1_hor_za32(0, row, pg_b, &b[row * b_step + k]);
+    }
+  }
+}
+
+__arm_new("za") __arm_locally_streaming
+    int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) {
+  const BLASLONG num_rows = m;
+  BLASLONG row_batch = svcntsw();
+  for (BLASLONG row_idx = 0; row_idx < num_rows; row_idx += row_batch) {
+    // Transpose 1xSVL panel
+    row_batch = MIN(row_batch, num_rows - row_idx);
+    transpose_panel(&a[row_idx], &b[row_idx * n], row_batch, n, lda, n);
+  }
+  return 0;
+}
diff --git a/kernel/arm64/sgemm_tcopy_sme.c b/kernel/arm64/sgemm_tcopy_sme.c
new file mode 100644
index 0000000000..4687a73fa8
--- /dev/null
+++ b/kernel/arm64/sgemm_tcopy_sme.c
@@ -0,0 +1,42 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <arm_sve.h>
+
+#include "common.h"
+#include "sme_abi.h"
+
+__arm_locally_streaming int CNAME(BLASLONG m, BLASLONG n, FLOAT *restrict a,
+                                  BLASLONG lda, FLOAT *restrict b) {
+  for (BLASLONG i = 0; i < m; i++) {
+    for (BLASLONG j = 0; j < n; j += svcntw()) {
+      svbool_t pg = svwhilelt_b32_u64(j, n);
+      svst1(pg, &b[i * n + j], svld1(pg, &a[i * lda + j]));
+    }
+  }
+  return 0;
+}
diff --git a/kernel/arm64/sme_abi.h b/kernel/arm64/sme_abi.h
new file mode 100644
index 0000000000..3e737ad6c9
--- /dev/null
+++ b/kernel/arm64/sme_abi.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
+GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#pragma once
+
+#include <stdlib.h>
+
+/**
+ * These are SME ABI routines for saving & restoring SME state.
+ * They are typically provided by a compiler runtime library such
+ * as libgcc or compiler-rt, but support for these routines is not
+ * yet available on all platforms.
+ *
+ * Define these as aborting stubs so that we loudly fail on nested
+ * usage of SME state.
+ *
+ * These are defined as weak symbols so that a compiler runtime can
+ * override them if supported.
+ */
+__attribute__((weak)) void __arm_tpidr2_save() { abort(); }
+__attribute__((weak)) void __arm_tpidr2_restore() { abort(); }