diff --git a/configs/a64fx_SME.yaml b/configs/a64fx_SME.yaml index 7fe7086d5e..4d1090ae82 100644 --- a/configs/a64fx_SME.yaml +++ b/configs/a64fx_SME.yaml @@ -80,7 +80,7 @@ Ports: - INT_DIV_OR_SQRT 5: Portname: EAGA - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -88,7 +88,7 @@ Ports: - INT_SIMPLE_CMP 6: Portname: EAGB - Instruction-Support: + Instruction-Group-Support: - LOAD - STORE_ADDRESS - INT_SIMPLE_ARTH_NOSHIFT @@ -98,6 +98,7 @@ Ports: Portname: BR Instruction-Group-Support: - BRANCH +# Define example SME unit 8: Portname: SME Instruction-Group-Support: diff --git a/docs/sphinx/assets/instruction_groups.png b/docs/sphinx/assets/instruction_groups.png deleted file mode 100644 index bf5bf5c73a..0000000000 Binary files a/docs/sphinx/assets/instruction_groups.png and /dev/null differ diff --git a/src/include/simeng/Register.hh b/src/include/simeng/Register.hh index 5758d8e67b..0152813268 100644 --- a/src/include/simeng/Register.hh +++ b/src/include/simeng/Register.hh @@ -1,6 +1,5 @@ #pragma once #include -#include namespace simeng { diff --git a/src/include/simeng/arch/aarch64/Architecture.hh b/src/include/simeng/arch/aarch64/Architecture.hh index 8d4939c991..a654fc897a 100644 --- a/src/include/simeng/arch/aarch64/Architecture.hh +++ b/src/include/simeng/arch/aarch64/Architecture.hh @@ -70,6 +70,12 @@ class Architecture : public arch::Architecture { /** Returns the current value of SVCRval_. */ uint64_t getSVCRval() const; + /** Returns if SVE Streaming Mode is enabled. */ + bool isStreamingModeEnabled() const; + + /** Returns if the SME ZA Register is enabled. */ + bool isZARegisterEnabled() const; + /** Update the value of SVCRval_. */ void setSVCRval(const uint64_t newVal) const; diff --git a/src/include/simeng/arch/aarch64/InstructionGroups.hh b/src/include/simeng/arch/aarch64/InstructionGroups.hh index b50005571c..fc15e95230 100644 --- a/src/include/simeng/arch/aarch64/InstructionGroups.hh +++ b/src/include/simeng/arch/aarch64/InstructionGroups.hh @@ -4,7 +4,33 @@ namespace simeng { namespace arch { namespace aarch64 { -/** The IDs of the instruction groups for AArch64 instructions. */ +/** The IDs of the instruction groups for AArch64 instructions. + * Each new group must contain 14 entries to ensure correct group assignment and + * general functionality. + * Their order must be as follows: + * - BASE + * - BASE_SIMPLE + * - BASE_SIMPLE_ARTH + * - BASE_SIMPLE_ARTH_NOSHIFT + * - BASE_SIMPLE_LOGICAL + * - BASE_SIMPLE_LOGICAL_NOSHIFT + * - BASE_SIMPLE_CMP + * - BASE_SIMPLE_CVT + * - BASE_MUL + * - BASE_DIV_OR_SQRT + * - LOAD_BASE + * - STORE_ADDRESS_BASE + * - STORE_DATA_BASE + * - STORE_BASE + * + * An exception to the above is "Parent" groups which do not require the LOAD_* + * or STORE_* groups. + * "Parent" groups allow for easier grouping of similar groups that may have + * identical execution latencies, ports, etc. For example, FP is the parent + * group of SCALAR and VECTOR. + * In simulation, an instruction's allocated group will never be a "Parent" + * group; they are only used to simplify config file creation and management. 
+ */ namespace InstructionGroups { const uint16_t INT = 0; const uint16_t INT_SIMPLE = 1; @@ -102,7 +128,7 @@ static constexpr uint8_t NUM_GROUPS = 88; const std::unordered_map> groupInheritance_ = { {InstructionGroups::ALL, {InstructionGroups::INT, InstructionGroups::FP, InstructionGroups::SVE, - InstructionGroups::PREDICATE, InstructionGroups::SME, + InstructionGroups::SME, InstructionGroups::PREDICATE, InstructionGroups::LOAD, InstructionGroups::STORE, InstructionGroups::BRANCH}}, {InstructionGroups::INT, diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh index c2626b7e91..cc9aa03461 100644 --- a/src/include/simeng/arch/aarch64/helpers/neon.hh +++ b/src/include/simeng/arch/aarch64/helpers/neon.hh @@ -568,9 +568,14 @@ RegisterValue vecUMaxP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + // Compare each adjacent pair of elements T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::max(n[i], m[i]); + out[i] = std::max(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } @@ -585,9 +590,14 @@ RegisterValue vecUMinP(srcValContainer& sourceValues) { const T* n = sourceValues[0].getAsVector(); const T* m = sourceValues[1].getAsVector(); + // Concatenate the vectors + T temp[2 * I]; + memcpy(temp, n, sizeof(T) * I); + memcpy(temp + (sizeof(T) * I), m, sizeof(T) * I); + T out[I]; for (int i = 0; i < I; i++) { - out[i] = std::min(n[i], m[i]); + out[i] = std::min(temp[2 * i], temp[2 * i + 1]); } return {out, 256}; } diff --git a/src/lib/arch/aarch64/Architecture.cc b/src/lib/arch/aarch64/Architecture.cc index 015dba62b1..3ff09c5b5c 100644 --- a/src/lib/arch/aarch64/Architecture.cc +++ b/src/lib/arch/aarch64/Architecture.cc @@ -284,6 +284,12 @@ void Architecture::setSVCRval(const uint64_t newVal) const { SVCRval_ = newVal; } +// 0th bit of SVCR register determines if streaming-mode is enabled. +bool Architecture::isStreamingModeEnabled() const { return SVCRval_ & 1; } + +// 1st bit of SVCR register determines if ZA register is enabled. 
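// A minimal usage sketch, mirroring how Instruction_execute.cc in this patch
// consumes the two new helpers (names taken from this diff):
//   const bool SMenabled = architecture_.isStreamingModeEnabled();  // SVCR.SM, bit 0
//   const bool ZAenabled = architecture_.isZARegisterEnabled();     // SVCR.ZA, bit 1
// SME instructions can then bail out early via SMdisabled()/ZAdisabled() when
// the required bit is clear.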
+bool Architecture::isZARegisterEnabled() const { return SVCRval_ & 2; } + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc index 34ddca07d7..56e438a3d8 100644 --- a/src/lib/arch/aarch64/InstructionMetadata.cc +++ b/src/lib/arch/aarch64/InstructionMetadata.cc @@ -232,6 +232,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn) operands[2].access = CS_AC_READ; operands[3].access = CS_AC_READ; break; + + case Opcode::AArch64_INSERT_MXIPZ_H_B: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_D: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_H: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_Q: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_H_S: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_B: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_D: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_H: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_Q: + [[fallthrough]]; + case Opcode::AArch64_INSERT_MXIPZ_V_S: + // Need to add access specifiers + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. + operands[0].access = CS_AC_WRITE; + operands[1].access = CS_AC_READ; + operands[2].access = CS_AC_READ; + break; + case Opcode::AArch64_LDR_ZA: + // Need to add access specifier + // although operands[0] should be READ | WRITE, due to the implemented + // decode logic for SME tile destinations, the register will be added as + // both source and destination with just WRITE access. + operands[0].access = CS_AC_WRITE; + break; case Opcode::AArch64_ZERO_M: { // Incorrect access type: All are READ but should all be WRITE for (int i = 0; i < operandCount; i++) { diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc index 357077e7b3..ec4f269a8f 100644 --- a/src/lib/arch/aarch64/Instruction_address.cc +++ b/src/lib/arch/aarch64/Instruction_address.cc @@ -91,8 +91,25 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[2].get(), 8}}); break; } - case Opcode::AArch64_LD1_MXIPXX_V_D: // ld1d {zatv.d[ws, #imm]}, pg/z, - // [{, xm, lsl #3}] + case Opcode::AArch64_LD1_MXIPXX_V_B: // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME @@ -104,8 +121,40 @@ span Instruction::generateAddresses() { setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); break; } - case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_LD1_MXIPXX_V_H: // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME + const 
uint16_t partition_num = VL_bits / 16; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + setMemoryAddresses({(n + m), static_cast(VL_bits / 8)}); + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_S: // ld1w {zatv.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME @@ -459,6 +508,17 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get(), 8}}); break; } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = + static_cast(metadata_.operands[1].mem.disp); + setMemoryAddresses({xn + (imm * zaRowCount), zaRowCount}); + break; + } case Opcode::AArch64_LDRBBpost: { // ldrb wt, [xn], #imm setMemoryAddresses({{sourceValues_[0].get(), 1}}); break; @@ -501,19 +561,32 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[0].get() + offset, 8}}); break; } - case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] - case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! - case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] - case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! - case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] - case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! - case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] - case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! - case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] - case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! - case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] - case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! - case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + case Opcode::AArch64_LDRBui: // ldr bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRBpre: // ldr bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRDui: // ldr dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRDpre: // ldr dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRHui: // ldr ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRHpre: // ldr ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRQui: // ldr qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRQpre: // ldr qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRSui: // ldr st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRSpre: // ldr st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRWui: // ldr wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDRWpre: // ldr wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_LDRXui: // ldr xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDRXpre: { // ldr xt, [xn, #imm]! 
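// Worked example for the SME ld1{b,h,w,d,q} / ldr za slice address generation
// above, assuming VL_bits (the streaming vector length, SVL) is 512:
//   - partition_num = SVL / element width in bits, e.g. 512 / 16 = 32 for ld1h;
//   - the optional index register xm is scaled by the element size
//     (<<0 for .b, <<1 for .h, <<2 for .s, <<3 for .d, <<4 for .q);
//   - every slice access covers SVL / 8 = 64 bytes starting at xn + xm, and
//     ldr za covers one full ZA row (also SVL / 8 bytes), with its immediate
//     scaled by that row size.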
std::vector addresses; generateContiguousAddresses( @@ -522,12 +595,18 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm - case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm - case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm - case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm - case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm - case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + case Opcode::AArch64_LDRBpost: // ldr bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRDpost: // ldr dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRHpost: // ldr ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRQpost: // ldr qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRSpost: // ldr st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDRWpost: // ldr wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDRXpost: { // ldr xt, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[0].get(), 1, @@ -645,15 +724,24 @@ span Instruction::generateAddresses() { setMemoryAddresses({{base, 4}, {base + 4, 4}}); break; } - case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] - case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] - case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] - case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] - case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] - case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] - case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] - case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] - case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + case Opcode::AArch64_LDPDi: // ldp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPDpre: // ldp dt1, dt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPQi: // ldp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPQpre: // ldp qt1, qt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPSi: // ldp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPSpre: // ldp st1, st2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPWi: // ldp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_LDPWpre: // ldp wt1, wt2, [xn, #imm!] + [[fallthrough]]; + case Opcode::AArch64_LDPXi: // ldp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_LDPXpre: { // ldp xt1, xt2, [xn, #imm!] 
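// Note on the predicate layout assumed by the predicated SME handlers below:
// the predicate registers are read as packed uint64_t words with one bit per
// byte of the vector, so an element of width W bytes is governed by bit
// (elem * W) of the packed predicate. This is why the handlers test
//   1ull << (elem % 64)         for   8-bit elements,
//   1ull << ((elem % 32) * 2)   for  16-bit elements,
//   1ull << ((elem % 16) * 4)   for  32-bit elements,
//   1ull << ((elem % 8) * 8)    for  64-bit elements, and
//   1ull << ((elem % 4) * 16)   for 128-bit elements,
// against pg[elem / (64 / W)].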
std::vector addresses; generateContiguousAddresses( @@ -662,10 +750,14 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm - case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm - case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm - case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + case Opcode::AArch64_LDPDpost: // ldp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPQpost: // ldp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPSpost: // ldp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_LDPWpost: // ldp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_LDPXpost: { // ldp xt1, xt2, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[0].get(), 2, @@ -958,8 +1050,33 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, - // [{, xm, lsl #3}] + case Opcode::AArch64_ST1_MXIPXX_H_B: // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME + const uint16_t partition_num = VL_bits / 8; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get(); + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 1, 1, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME + [[fallthrough]]; case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, // [{, xm, lsl #3}] // SME @@ -979,8 +1096,56 @@ span Instruction::generateAddresses() { setMemoryAddresses(std::move(addresses)); break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_H_H: // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME + const uint16_t partition_num = VL_bits / 16; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 1; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), partition_num, 2, 2, + pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: // st1q {zath.q[ws]}, pg, [{, + // xm, lsl #4}] + // SME + [[fallthrough]]; + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1q {zatv.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME + const uint16_t partition_num = VL_bits / 128; + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + const uint64_t n = sourceValues_[partition_num + 2].get(); + uint64_t m = 0; + if (metadata_.operands[2].mem.index) + m = sourceValues_[partition_num + 3].get() << 4; + + std::vector addresses; + addresses.reserve(partition_num); + + generatePredicatedContiguousAddressBlocks((n + m), 
partition_num, 16, + 16, pg, addresses); + setMemoryAddresses(std::move(addresses)); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: // st1w {zath.s[ws, #imm]}, pg/z, + // [{, xm, LSL #2}] + // SME + [[fallthrough]]; case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME @@ -1358,15 +1523,24 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get(), 8}}); break; } - case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] - case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! - case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] - case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! - case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] - case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! - case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] - case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! - case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + case Opcode::AArch64_STPDi: // stp dt1, dt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPDpre: // stp dt1, dt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPQi: // stp qt1, qt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPQpre: // stp qt1, qt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPSi: // stp st1, st2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPSpre: // stp st1, st2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPWi: // stp wt1, wt2, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STPWpre: // stp wt1, wt2, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STPXi: // stp xt1, xt2, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STPXpre: { // stp xt1, xt2, [xn, #imm]! std::vector addresses; generateContiguousAddresses( @@ -1375,10 +1549,14 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm - case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm - case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm - case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + case Opcode::AArch64_STPDpost: // stp dt1, dt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPQpost: // stp qt1, qt2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPSpost: // stp st1, st2, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STPWpost: // stp wt1, wt2, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STPXpost: { // stp xt1, xt2, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[2].get(), 2, @@ -1428,19 +1606,32 @@ span Instruction::generateAddresses() { setMemoryAddresses({{sourceValues_[1].get() + offset, 8}}); break; } - case Opcode::AArch64_STRBui: // str bt, [xn, #imm] - case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! - case Opcode::AArch64_STRDui: // str dt, [xn, #imm] - case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! - case Opcode::AArch64_STRHui: // str ht, [xn, #imm] - case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! - case Opcode::AArch64_STRQui: // str qt, [xn, #imm] - case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! - case Opcode::AArch64_STRSui: // str st, [xn, #imm] - case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! - case Opcode::AArch64_STRWui: // str wt, [xn, #imm] - case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! 
- case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + case Opcode::AArch64_STRBui: // str bt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRBpre: // str bt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRDui: // str dt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRDpre: // str dt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRHui: // str ht, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRHpre: // str ht, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRQui: // str qt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRQpre: // str qt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRSui: // str st, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRSpre: // str st, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRWui: // str wt, [xn, #imm] + [[fallthrough]]; + case Opcode::AArch64_STRWpre: // str wt, [xn, #imm]! + [[fallthrough]]; + case Opcode::AArch64_STRXui: // str xt, [xn, #imm] + [[fallthrough]]; case Opcode::AArch64_STRXpre: { // str xt, [xn, #imm]! std::vector addresses; generateContiguousAddresses( @@ -1449,12 +1640,18 @@ span Instruction::generateAddresses() { setMemoryAddresses(addresses); break; } - case Opcode::AArch64_STRBpost: // str bt, [xn], #imm - case Opcode::AArch64_STRDpost: // str dt, [xn], #imm - case Opcode::AArch64_STRHpost: // str ht, [xn], #imm - case Opcode::AArch64_STRQpost: // str qt, [xn], #imm - case Opcode::AArch64_STRSpost: // str st, [xn], #imm - case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + case Opcode::AArch64_STRBpost: // str bt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRDpost: // str dt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRHpost: // str ht, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRQpost: // str qt, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRSpost: // str st, [xn], #imm + [[fallthrough]]; + case Opcode::AArch64_STRWpost: // str wt, [xn], #imm + [[fallthrough]]; case Opcode::AArch64_STRXpost: { // str xt, [xn], #imm std::vector addresses; generateContiguousAddresses(sourceValues_[1].get(), 1, @@ -1545,6 +1742,16 @@ span Instruction::generateAddresses() { setMemoryAddresses({base + (offset * partition_num), partition_num}); break; } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME + // ZA Row count === current VL in bytes + const uint16_t zaRowCount = VL_bits / 8; + const uint64_t xn = sourceValues_[zaRowCount + 1].get(); + const uint64_t imm = metadata_.operands[1].mem.disp; + setMemoryAddresses({{xn + (imm * zaRowCount), zaRowCount}}); + break; + } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] const uint16_t partition_num = VL_bits / 8; diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc index 6d2007cb55..3535ce590f 100644 --- a/src/lib/arch/aarch64/Instruction_decode.cc +++ b/src/lib/arch/aarch64/Instruction_decode.cc @@ -2,11 +2,6 @@ #include "InstructionMetadata.hh" -#define NOT(bits, length) (~bits & (1 << length - 1)) -#define CONCAT(hi, lo, lowLen) ((hi << lowLen) & lo) -#define ONES(n) ((1 << (n)) - 1) -#define ROR(x, shift, size) ((x >> shift) | (x << (size - shift))) - namespace simeng { namespace arch { namespace aarch64 { diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc index 20b62904b9..8f4bc38142 100644 --- a/src/lib/arch/aarch64/Instruction_execute.cc +++ b/src/lib/arch/aarch64/Instruction_execute.cc @@ -67,9 +67,9 @@ void 
Instruction::execute() { canExecute() && "Attempted to execute an instruction before all operands were provided"); // 0th bit of SVCR register determines if streaming-mode is enabled. - const bool SMenabled = architecture_.getSVCRval() & 1; + const bool SMenabled = architecture_.isStreamingModeEnabled(); // 1st bit of SVCR register determines if ZA register is enabled. - const bool ZAenabled = architecture_.getSVCRval() & 2; + const bool ZAenabled = architecture_.isZARegisterEnabled(); // When streaming mode is enabled, the architectural vector length goes from // SVE's VL to SME's SVL. const uint16_t VL_bits = SMenabled ? architecture_.getStreamingVectorLength() @@ -108,6 +108,148 @@ void Instruction::execute() { } } else { switch (metadata_.opcode) { + case Opcode::AArch64_ADDHA_MPPZ_D: { // addha zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDHA_MPPZ_S: { // addha zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Element in 1st source pred corresponding to horizontal + // slice is TRUE + // - Corresponding element in 2nd source pred is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[elem]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_D: { // addva zada.d, pn/m, pm/m, zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + 
sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint64_t* zaRow = sourceValues_[row].getAsVector(); + uint64_t out[32] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint64_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 8) * 8); + if (pn[row / 8] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. each row element) is + // active + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 8) * 8); + if (pm[elem / 8] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } + case Opcode::AArch64_ADDVA_MPPZ_S: { // addva zada.s, pn/m, pm/m, zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t row = 0; row < rowCount; row++) { + const uint32_t* zaRow = sourceValues_[row].getAsVector(); + uint32_t out[64] = {0}; + std::memcpy(out, zaRow, rowCount * sizeof(uint32_t)); + // Slice element is active IFF all of the following conditions hold: + // - Corresponding element in 1st source pred is TRUE + // - Element in 2nd source pred corresponding to vertical + // slice is TRUE + const uint64_t shifted_active_pn = 1ull << ((row % 16) * 4); + if (pn[row / 16] & shifted_active_pn) { + // Corresponding slice element is active (i.e. all elements in row). + // Now check if each vertical slice (i.e. 
each row element) is + // active in 2nd pred + for (uint16_t elem = 0; elem < rowCount; elem++) { + const uint64_t shifted_active_pm = 1ull << ((elem % 16) * 4); + if (pm[elem / 16] & shifted_active_pm) { + out[elem] = zn[row]; + } + } + } + results_[row] = {out, 256}; + } + break; + } case Opcode::AArch64_ADCXr: { // adc xd, xn, xm auto [result, nzcv] = addCarry_3ops(sourceValues_); (void)nzcv; // Prevent unused variable warnings in GCC7 @@ -700,9 +842,9 @@ void Instruction::execute() { break; } case Opcode::AArch64_CMHSv16i8: { // cmhs vd.16b, vn.16b, vm.16b - results_[0] = vecCompare( + results_[0] = vecCompare( sourceValues_, false, - [](int8_t x, int8_t y) -> bool { return (x >= y); }); + [](uint8_t x, uint8_t y) -> bool { return (x >= y); }); break; } case Opcode::AArch64_CMPEQ_PPzZI_B: { // cmpeq pd.b, pg/z, zn.b, #imm @@ -1227,7 +1369,7 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x ^ y; }); break; } - case Opcode::AArch64_EXTRACT_ZPMXI_H_B: { // MOVA zd.b, pg/m, zanh.b[ws, + case Opcode::AArch64_EXTRACT_ZPMXI_H_B: { // mova zd.b, pg/m, zanh.b[ws, // #imm] // SME // Check core is in correct context mode (check SM first) @@ -1237,23 +1379,288 @@ void Instruction::execute() { const uint16_t rowCount = VL_bits / 8; const uint8_t* zd = sourceValues_[0].getAsVector(); const uint64_t* pg = sourceValues_[1].getAsVector(); - const uint64_t sliceNum = + const uint32_t sliceNum = (sourceValues_[2 + rowCount].get() + static_cast( metadata_.operands[2].sme.slice_offset.imm)) % rowCount; - const uint8_t* zanRow = + const uint8_t* zaRow = sourceValues_[2 + sliceNum].getAsVector(); + uint8_t out[256] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_D: { // mova zd.d, pg/m, zanh.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_H: { // mova zd.h, pg/m, zanh.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint16_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + uint16_t out[128] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = 
zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_Q: { // mova zd.q, pg/m, zanh.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + // Use uint64_t as no 128-bit + const uint64_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (int elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zaRow[2 * elem]; + out[2 * elem + 1] = zaRow[2 * elem + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_H_S: { // mova zd.s, pg/m, zanh.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = + sourceValues_[2 + sliceNum].getAsVector(); + uint32_t out[64] = {0}; for (int elem = 0; elem < rowCount; elem++) { - uint64_t shifted_active = 1ull << ((elem % 64)); + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zaRow[elem]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_B: { // mova zd.b, pg/m, zanv.b[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint8_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << (elem % 64); if (pg[elem / 64] & shifted_active) - out[elem] = zanRow[elem]; + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_D: { // mova zd.d, pg/m, zanv.d[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + 
rowCount; + + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_H: { // mova zd.h, pg/m, zanv.h[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint16_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + + uint16_t out[128] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; else out[elem] = zd[elem]; } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_Q: { // mova zd.q, pg/m, zanv.q[ws] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + // Use uint64_t as no 128-bit + const uint64_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + sourceValues_[2 + rowCount].get() % rowCount; + + // Use uint64_t as no 128-bit + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + const uint64_t* zaRow = + sourceValues_[2 + elem].getAsVector(); + out[2 * elem] = zaRow[2 * sliceNum]; + out[2 * elem + 1] = zaRow[2 * sliceNum + 1]; + } else { + // Need to move two consecutive 64-bit elements + out[2 * elem] = zd[2 * elem]; + out[2 * elem + 1] = zd[2 * elem + 1]; + } + } + results_[0] = {out, 256}; + break; + } + case Opcode::AArch64_EXTRACT_ZPMXI_V_S: { // mova zd.s, pg/m, zanv.s[ws, + // #imm] + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t* zd = sourceValues_[0].getAsVector(); + const uint64_t* pg = sourceValues_[1].getAsVector(); + const uint32_t sliceNum = + (sourceValues_[2 + rowCount].get() + + static_cast( + metadata_.operands[2].sme.slice_offset.imm)) % + rowCount; + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = + sourceValues_[2 + elem].getAsVector()[sliceNum]; + else + out[elem] = zd[elem]; + } results_[0] = {out, 256}; break; } @@ -1948,20 +2355,84 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_FMOVDXHighr: { // fmov xd, vn.d[1] - results_[0] = sourceValues_[0].getAsVector()[1]; - break; - } - case Opcode::AArch64_FMOVDXr: { // fmov xd, dn - results_[0] = sourceValues_[0].get(); - break; - } - case Opcode::AArch64_FMOVDi: { // fmov dn, #imm - results_[0] = {metadata_.operands[1].fp, 256}; - break; - } - case 
Opcode::AArch64_FMOVDr: { // fmov dd, dn - results_[0] = {sourceValues_[0].get(), 256}; + case Opcode::AArch64_FMOPS_MPPZZ_D: { // fmops zada.d, pn/m, pm/m, zn.d, + // zm.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const double* zn = sourceValues_[rowCount + 2].getAsVector(); + const double* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + double outRow[32] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 8) * 8); + const double* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + double zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 8) * 8); + if ((pm[col / 8] & shifted_active_col) && + (pn[row / 8] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOPS_MPPZZ_S: { // fmops zada.s, pn/m, pm/m, zn.s, + // zm.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint64_t* pn = sourceValues_[rowCount].getAsVector(); + const uint64_t* pm = + sourceValues_[rowCount + 1].getAsVector(); + const float* zn = sourceValues_[rowCount + 2].getAsVector(); + const float* zm = sourceValues_[rowCount + 3].getAsVector(); + + // zn is row, zm is col + for (int row = 0; row < rowCount; row++) { + float outRow[64] = {0}; + uint64_t shifted_active_row = 1ull << ((row % 16) * 4); + const float* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < rowCount; col++) { + float zadaElem = zadaRow[col]; + uint64_t shifted_active_col = 1ull << ((col % 16) * 4); + if ((pm[col / 16] & shifted_active_col) && + (pn[row / 16] & shifted_active_row)) + outRow[col] = zadaElem - (zn[row] * zm[col]); + else + outRow[col] = zadaElem; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_FMOVDXHighr: { // fmov xd, vn.d[1] + results_[0] = sourceValues_[0].getAsVector()[1]; + break; + } + case Opcode::AArch64_FMOVDXr: { // fmov xd, dn + results_[0] = sourceValues_[0].get(); + break; + } + case Opcode::AArch64_FMOVDi: { // fmov dn, #imm + results_[0] = {metadata_.operands[1].fp, 256}; + break; + } + case Opcode::AArch64_FMOVDr: { // fmov dd, dn + results_[0] = {sourceValues_[0].get(), 256}; break; } case Opcode::AArch64_FMOVSWr: { // fmov wd, sn @@ -2529,6 +3000,325 @@ void Instruction::execute() { VL_bits, false, false); break; } + + case Opcode::AArch64_INSERT_MXIPZ_H_B: { // mova zadh.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint8_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + uint8_t out[256] = {0}; + for (uint16_t elem = 0; elem < rowCount; 
elem++) { + uint64_t shifted_active = 1ull << (elem % 64); + if (pg[elem / 64] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_D: { // mova zadh.d[ws, #imm], pg/m, + // zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 8) * 8); + if (pg[elem / 8] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_H: { // mova zadh.h[ws, #imm], pg/m, + // zn.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint16_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint16_t out[128] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 32) * 2); + if (pg[elem / 32] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? 
RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_Q: { // mova zadh.q[ws], pg/m, zn.q + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + const uint32_t sliceNum = + sourceValues_[rowCount].get() % rowCount; + // Use uint64_t in place of 128-bit + const uint64_t* zaRow = sourceValues_[sliceNum].getAsVector(); + + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint64_t in place of 128-bit + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + // Use uint64_t in place of 128-bit + uint64_t out[32] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((elem % 4) * 16); + if (pg[elem / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[(2 * elem)] = zn[(2 * elem)]; + out[(2 * elem + 1)] = zn[(2 * elem + 1)]; + } else { + // Need to move two consecutive 64-bit elements + out[(2 * elem)] = zaRow[(2 * elem)]; + out[(2 * elem + 1)] = zaRow[(2 * elem + 1)]; + } + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_H_S: { // mova zadh.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint32_t* zaRow = sourceValues_[sliceNum].getAsVector(); + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + uint32_t out[64] = {0}; + for (uint16_t elem = 0; elem < rowCount; elem++) { + uint64_t shifted_active = 1ull << ((elem % 16) * 4); + if (pg[elem / 16] & shifted_active) + out[elem] = zn[elem]; + else + out[elem] = zaRow[elem]; + } + // Need to update whole za tile + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = + (row == sliceNum) ? 
RegisterValue(out, 256) : sourceValues_[row]; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_B: { // mova zadv.b[ws, #imm], pg/m, + // zn.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint8_t* zn = sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, rowCount * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_D: { // mova zadv.d[ws, #imm], pg/m, + // zn.d + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 64; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, rowCount * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_H: { // mova zadv.h[ws, #imm], pg/m, + // zn.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 16; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint16_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, rowCount * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_Q: { // mova zadv.q[ws], pg/m, zn.q + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 128; + const uint32_t sliceNum = + sourceValues_[rowCount].get() % rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + // Use uint64_t in place of 128-bit + const uint64_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + // Use uint64_t in place of 128-bit + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy as need 128-bit elements but using uint64_t + memcpy(out, row, rowCount * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + 
uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // Need to move two consecutive 64-bit elements + out[2 * sliceNum] = zn[2 * i]; + out[2 * sliceNum + 1] = zn[2 * i + 1]; + } + results_[i] = {out, 256}; + } + break; + } + case Opcode::AArch64_INSERT_MXIPZ_V_S: { // mova zadv.s[ws, #imm], pg/m, + // zn.s + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 32; + const uint32_t sliceNum = + (sourceValues_[rowCount].get() + + static_cast( + metadata_.operands[0].sme.slice_offset.imm)) % + rowCount; + const uint64_t* pg = + sourceValues_[rowCount + 1].getAsVector(); + const uint32_t* zn = + sourceValues_[rowCount + 2].getAsVector(); + + for (uint16_t i = 0; i < rowCount; i++) { + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, rowCount * sizeof(uint32_t)); + uint64_t shifted_active = 1ull << ((i % 16) * 4); + if (pg[i / 16] & shifted_active) out[sliceNum] = zn[i]; + results_[i] = {out, 256}; + } + break; + } case Opcode::AArch64_INSvi16gpr: { // ins vd.h[index], wn results_[0] = vecInsIndex_gpr(sourceValues_, metadata_); @@ -2557,25 +3347,91 @@ void Instruction::execute() { vecInsIndex_gpr(sourceValues_, metadata_); break; } + case Opcode::AArch64_LD1_MXIPXX_H_B: { // ld1b {zath.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + uint8_t out[256] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } case Opcode::AArch64_LD1_MXIPXX_H_D: { // ld1d {zath.d[ws, #imm]}, pg/z, // [{, xm, lsl #3}] // SME, LOAD - // Not in right context mode. 
Raise exception + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint16_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + uint64_t out[32] = {0}; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[i] = data[i]; + } else { + out[i] = 0; + } + } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_H_H: { // ld1h {zath.h[ws, #imm]}, pg/z, + // [{, xm, LSL #1}] + // SME, LOAD + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 16; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint16_t sliceNum = + const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - const uint64_t* data = memoryData_[0].getAsVector(); + const uint16_t* data = memoryData_[0].getAsVector(); - uint64_t out[32] = {0}; + uint16_t out[128] = {0}; for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (pg[i / 8] & shifted_active) { + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { out[i] = data[i]; } else { out[i] = 0; @@ -2585,46 +3441,53 @@ void Instruction::execute() { // All Slice vectors are added to results[] so need to update the // correct one for (uint16_t i = 0; i < partition_num; i++) { - if (i == sliceNum) - results_[i] = {out, 256}; - else - // Maintain un-updated rows. - results_[i] = sourceValues_[i]; + results_[i] = sourceValues_[i]; } + results_[sliceNum] = {out, 256}; break; } - case Opcode::AArch64_LD1_MXIPXX_V_D: { // ld1d {zatv.d[ws, #imm]}, pg/z, - // [{, xm, lsl #3}] + case Opcode::AArch64_LD1_MXIPXX_H_Q: { // ld1q {zath.q[ws]}, pg/z, + // [{, xm, LSL #4}] // SME, LOAD - // Not in right context mode. 
Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 128; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint32_t sliceNum = - (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t sliceNum = ws % partition_num; + // Use uint64_t as no 128-bit type const uint64_t* data = memoryData_[0].getAsVector(); + // Use uint64_t as no 128-bit type + uint64_t out[32] = {0}; for (int i = 0; i < partition_num; i++) { - uint64_t* row = - const_cast(sourceValues_[i].getAsVector()); - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (pg[i / 8] & shifted_active) { - row[sliceNum] = data[i]; + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * i] = data[2 * i]; + out[2 * i + 1] = data[2 * i + 1]; } else { - row[sliceNum] = 0; + out[2 * i] = 0; + out[2 * i + 1] = 0; } - results_[i] = RegisterValue(reinterpret_cast(row), 256); } + + // All Slice vectors are added to results[] so need to update the + // correct one + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; break; } case Opcode::AArch64_LD1_MXIPXX_H_S: { // ld1w {zath.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME, LOAD - // Not in right context mode. Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); const uint16_t partition_num = VL_bits / 32; @@ -2648,19 +3511,129 @@ void Instruction::execute() { // All Slice vectors are added to results[] so need to update the // correct one - for (uint32_t i = 0; i < partition_num; i++) { - if (i == sliceNum) - results_[i] = {out, 256}; - else - // Maintain un-updated rows. 
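// A minimal standalone sketch (not part of the patch) of the merge pattern the
// ld1 horizontal tile-slice handlers around here now follow: build the loaded
// row under a zeroing predicate, forward every other tile row unchanged, and
// overwrite only the addressed slice. std::vector containers stand in for the
// simulator's RegisterValue rows; the predicate encoding (one bit per byte, so
// a 32-bit element tests bit ((i % 16) * 4) of pg[i / 16]) mirrors the code above.
#include <cstdint>
#include <vector>

std::vector<std::vector<uint32_t>> loadHorizontalSlice(
    std::vector<std::vector<uint32_t>> tile, uint16_t sliceNum,
    const std::vector<uint32_t>& data, const std::vector<uint64_t>& pg) {
  std::vector<uint32_t> row(tile[sliceNum].size(), 0);
  for (size_t i = 0; i < row.size(); i++) {
    const uint64_t active = 1ull << ((i % 16) * 4);
    if (pg[i / 16] & active) row[i] = data[i];  // inactive lanes stay zero
  }
  tile[sliceNum] = row;  // all other rows pass through untouched
  return tile;
}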
- results_[i] = sourceValues_[i]; + for (uint16_t i = 0; i < partition_num; i++) { + results_[i] = sourceValues_[i]; + } + results_[sliceNum] = {out, 256}; + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_B: { // ld1b {zatv.b[ws, #imm]}, pg/z, + // [{, xm}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint8_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint8_t* row = sourceValues_[i].getAsVector(); + uint8_t out[256] = {0}; + memcpy(out, row, partition_num * sizeof(uint8_t)); + uint64_t shifted_active = 1ull << (i % 64); + if (pg[i / 64] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_D: { // ld1d {zatv.d[ws, #imm]}, pg/z, + // [{, xm, lsl #3}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint64_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + memcpy(out, row, partition_num * sizeof(uint64_t)); + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (pg[i / 8] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_H: { // ld1h {zatv.h[ws, #imm]}, pg/z, + // [{, xm, lsl #1}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint16_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + const uint16_t* row = sourceValues_[i].getAsVector(); + uint16_t out[128] = {0}; + memcpy(out, row, partition_num * sizeof(uint16_t)); + uint64_t shifted_active = 1ull << ((i % 32) * 2); + if (pg[i / 32] & shifted_active) { + out[sliceNum] = data[i]; + } + results_[i] = RegisterValue(out, 256); + } + break; + } + case Opcode::AArch64_LD1_MXIPXX_V_Q: { // ld1q {zatv.q[ws]}, pg/z, + // [{, xm, lsl #4}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + // Using uint64_t as no 128-bit data type + const uint64_t* data = memoryData_[0].getAsVector(); + + for (int i = 0; i < partition_num; i++) { + // Using uint64_t as no 128-bit data type + const uint64_t* row = sourceValues_[i].getAsVector(); + uint64_t out[32] = {0}; + // *2 in memcpy 
as need 128-bit but using uint64_t + memcpy(out, row, partition_num * sizeof(uint64_t) * 2); + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to modify 2 elements + out[2 * sliceNum] = data[2 * i]; + out[2 * sliceNum + 1] = data[2 * i + 1]; + } + results_[i] = RegisterValue(out, 256); } break; } case Opcode::AArch64_LD1_MXIPXX_V_S: { // ld1w {zatv.s[ws, #imm]}, pg/z, // [{, xm, LSL #2}] // SME, LOAD - // Not in right context mode. Raise exception + // If not in right context mode, raise exception if (!ZAenabled) return ZAdisabled(); const uint16_t partition_num = VL_bits / 32; @@ -2673,15 +3646,14 @@ void Instruction::execute() { const uint32_t* data = memoryData_[0].getAsVector(); for (int i = 0; i < partition_num; i++) { - uint32_t* row = - const_cast(sourceValues_[i].getAsVector()); + const uint32_t* row = sourceValues_[i].getAsVector(); + uint32_t out[64] = {0}; + memcpy(out, row, partition_num * sizeof(uint32_t)); uint64_t shifted_active = 1ull << ((i % 16) * 4); if (pg[i / 16] & shifted_active) { - row[sliceNum] = data[i]; - } else { - row[sliceNum] = 0; + out[sliceNum] = data[i]; } - results_[i] = RegisterValue(reinterpret_cast(row), 256); + results_[i] = RegisterValue(out, 256); } break; } @@ -3671,6 +4643,31 @@ void Instruction::execute() { results_[0] = {out, 256}; break; } + case Opcode::AArch64_LDR_ZA: { // ldr za[wv, #imm], [{, #imm, mul + // vl}] + // SME, LOAD + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t rowCount = VL_bits / 8; + const uint32_t wn = sourceValues_[rowCount].get(); + const uint32_t sliceNum = + wn + + static_cast(metadata_.operands[0].sme.slice_offset.imm); + + const uint8_t* data = memoryData_[0].getAsVector(); + uint8_t out[256] = {0}; + for (uint16_t i = 0; i < rowCount; i++) { + out[i] = data[i]; + } + + for (uint16_t row = 0; row < rowCount; row++) { + results_[row] = (row == sliceNum) + ? 
RegisterValue(out, 256) + : results_[row] = sourceValues_[row]; + } + break; + } case Opcode::AArch64_LDTRSBXi: { // ldtrsb xt, [xn, #imm] // LOAD // TODO: implement @@ -4334,6 +5331,158 @@ void Instruction::execute() { [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); }); break; } + case Opcode::AArch64_SMOPA_MPPZZ_D: { // smopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPA_MPPZZ_S: { // smopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_D: { // smops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = 
sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SMOPS_MPPZZ_S: { // smops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_SMSUBLrrr: { // smsubl xd, wn, wm, xa results_[0] = msubl_4ops(sourceValues_); break; @@ -4410,40 +5559,195 @@ void Instruction::execute() { } break; } - case Opcode::AArch64_SST1D_IMM: { // st1d {zd.d}, pg, [zn.d{, #imm}] - // STORE - const uint64_t* t = sourceValues_[0].getAsVector(); - const uint64_t* p = sourceValues_[1].getAsVector(); + case Opcode::AArch64_SST1D_IMM: { // st1d {zd.d}, pg, [zn.d{, #imm}] + // STORE + const uint64_t* t = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); + + const uint16_t partition_num = VL_bits / 64; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + memoryData_[index] = t[i]; + index++; + } + } + break; + } + case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, + // zm.d, lsl #3] + // STORE + const uint64_t* d = sourceValues_[0].getAsVector(); + const uint64_t* p = sourceValues_[1].getAsVector(); + + const uint16_t partition_num = VL_bits / 64; + uint16_t index = 0; + for (int i = 0; i < partition_num; i++) { + uint64_t shifted_active = 1ull << ((i % 8) * 8); + if (p[i / 8] & shifted_active) { + memoryData_[index] = d[i]; + index++; + } + } + break; + } + 
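// The smopa/smops handlers above (and the sumopa/umopa/usmopa family further
// on) all compute the same widening sum of outer products: ZA holds the
// accumulators, zn supplies four elements per row, zm four per column, and a
// lane only contributes when both predicates are active. A self-contained
// sketch of that core loop for the signed 8-bit to 32-bit case, using plain
// vectors and bools instead of the simulator's RegisterValue rows and
// bit-packed predicates (the 16-bit to 64-bit variants differ only in types):
#include <cstdint>
#include <vector>

void outerProductAcc(std::vector<std::vector<int32_t>>& zada,
                     const std::vector<int8_t>& zn,
                     const std::vector<int8_t>& zm,
                     const std::vector<bool>& pn,
                     const std::vector<bool>& pm, bool subtract) {
  const size_t dim = zada.size();
  for (size_t row = 0; row < dim; row++) {
    for (size_t col = 0; col < dim; col++) {
      int32_t acc = 0;
      for (int k = 0; k < 4; k++) {
        const size_t znIdx = 4 * row + k;
        const size_t zmIdx = 4 * col + k;
        if (pn[znIdx] && pm[zmIdx])
          acc += static_cast<int32_t>(zn[znIdx]) *
                 static_cast<int32_t>(zm[zmIdx]);
      }
      zada[row][col] += subtract ? -acc : acc;  // the *MOPS forms subtract
    }
  }
}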
case Opcode::AArch64_ST1_MXIPXX_H_B: { // st1b {zath.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint8_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_D: { // st1d {zath.d[ws, #imm]}, pg, + // [{, xm, lsl #3}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 64; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_H: { // st1h {zath.h[ws, #imm]}, pg, + // [{, xm, lsl #1}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 16; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint16_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_Q: { // st1q {zath.q[ws]}, pg, + // [{, xm, lsl #4}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t partition_num = VL_bits / 128; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = ws % partition_num; + + // Using uint64_t as no 128-bit type + const uint64_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + + // Need to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. 
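// A rough standalone equivalent of the chunking performed below: active
// quadwords are appended to the current run, and each gap in the predicate
// flushes that run as one separate memory request. A bool per element and
// uint64_t pairs stand in here for the bit-packed predicate and the 128-bit
// elements used by the handler itself.
#include <cstdint>
#include <vector>

std::vector<std::vector<uint64_t>> chunkActiveQuads(
    const std::vector<uint64_t>& slice, const std::vector<bool>& active) {
  std::vector<std::vector<uint64_t>> requests;
  std::vector<uint64_t> run;
  for (size_t q = 0; q < active.size(); q++) {
    if (active[q]) {
      run.push_back(slice[2 * q]);      // low 64 bits of the quadword
      run.push_back(slice[2 * q + 1]);  // high 64 bits
    } else if (!run.empty()) {
      requests.push_back(run);  // a predicate gap closes the current run
      run.clear();
    }
  }
  if (!run.empty()) requests.push_back(run);  // flush any trailing run
  return requests;
}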
+ int index = 0; + std::vector memData; + for (uint16_t i = 0; i < partition_num; i++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((i % 4) * 16); + if (pg[i / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back(tileSlice[2 * i]); + memData.push_back(tileSlice[2 * i + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } + break; + } + case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg, + // [{, xm, lsl #2}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; - uint16_t index = 0; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - memoryData_[index] = t[i]; - index++; - } - } + const uint16_t partition_num = VL_bits / 32; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + const uint32_t* tileSlice = + sourceValues_[sliceNum].getAsVector(); + memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); break; } - case Opcode::AArch64_SST1D_SCALED: { // st1d {zt.d}, pg, [xn, - // zm.d, lsl #3] - // STORE - const uint64_t* d = sourceValues_[0].getAsVector(); - const uint64_t* p = sourceValues_[1].getAsVector(); + case Opcode::AArch64_ST1_MXIPXX_V_B: { // st1b {zatv.b[ws, #imm]}, pg, + // [{, xm}] + // SME, STORE + // Not in right context mode. Raise exception + if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 8; + const uint32_t ws = sourceValues_[partition_num].get(); + const uint64_t* pg = + sourceValues_[partition_num + 1].getAsVector(); + + const uint32_t sliceNum = + (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + + std::vector memData; uint16_t index = 0; - for (int i = 0; i < partition_num; i++) { - uint64_t shifted_active = 1ull << ((i % 8) * 8); - if (p[i / 8] & shifted_active) { - memoryData_[index] = d[i]; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << (x % 64); + if (pg[x / 64] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); index++; + memData.clear(); } } + + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size()); + } break; } - case Opcode::AArch64_ST1_MXIPXX_H_D: { // st1d {zath.d[ws, #imm]}, pg, + case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, // [{, xm, lsl #3}] // SME, STORE // Not in right context mode. 
Raise exception @@ -4457,19 +5761,35 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - const uint64_t* tileSlice = - sourceValues_[sliceNum].getAsVector(); - memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + std::vector memData; + uint16_t index = 0; + + for (uint16_t x = 0; x < partition_num; x++) { + uint64_t shifted_active = 1ull << ((x % 8) * 8); + if (pg[x / 8] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + index++; + memData.clear(); + } + } + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 8); + } break; } - case Opcode::AArch64_ST1_MXIPXX_V_D: { // st1d {zatv.d[ws, #imm]}, pg, - // [{, xm, lsl #3}] + case Opcode::AArch64_ST1_MXIPXX_V_H: { // st1h {zatv.h[ws, #imm]}, pg, + // [{, xm, LSL #1}] // SME, STORE // Not in right context mode. Raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 64; + const uint16_t partition_num = VL_bits / 16; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); @@ -4477,45 +5797,68 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - std::array mdata; - uint16_t md_size = 0; + std::vector memData; uint16_t index = 0; for (uint16_t x = 0; x < partition_num; x++) { - uint64_t shifted_active = 1ull << ((x % 8) * 8); - if (pg[x / 8] & shifted_active) { - mdata[md_size] = sourceValues_[x].getAsVector()[sliceNum]; - md_size++; - } else if (md_size) { + uint64_t shifted_active = 1ull << ((x % 32) * 2); + if (pg[x / 32] & shifted_active) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { memoryData_[index] = - RegisterValue((char*)mdata.data(), md_size * 8); - md_size = 0; + RegisterValue((char*)memData.data(), memData.size() * 2); + index++; + memData.clear(); } } - if (md_size) { - memoryData_[index] = RegisterValue((char*)mdata.data(), md_size * 8); + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 2); } break; } - case Opcode::AArch64_ST1_MXIPXX_H_S: { // st1w {zath.s[ws, #imm]}, pg, - // [{, xm, LSL #2}] + case Opcode::AArch64_ST1_MXIPXX_V_Q: { // st1h {zatv.q[ws]}, pg, + // [{, xm, LSL #4}] // SME, STORE // Not in right context mode. Raise exception if (!ZAenabled) return ZAdisabled(); - const uint16_t partition_num = VL_bits / 32; + const uint16_t partition_num = VL_bits / 128; const uint32_t ws = sourceValues_[partition_num].get(); const uint64_t* pg = sourceValues_[partition_num + 1].getAsVector(); - const uint32_t sliceNum = - (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; + const uint32_t sliceNum = ws % partition_num; - const uint32_t* tileSlice = - sourceValues_[sliceNum].getAsVector(); - memoryData_ = sve_merge_store_data(tileSlice, pg, VL_bits); + // Need to combine active adjacent elements into RegisterValues and + // place into each memoryData_ index. 
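// The zatv (vertical) store forms differ from the zath ones only in how the
// slice is gathered: element sliceNum is read from every row of the tile
// instead of one whole row being used. A minimal column gather over row
// vectors, with predication omitted for brevity (illustrative only, not the
// simulator's API):
#include <cstdint>
#include <vector>

std::vector<uint64_t> gatherColumn(
    const std::vector<std::vector<uint64_t>>& tileRows, size_t slice) {
  std::vector<uint64_t> column;
  column.reserve(tileRows.size());
  for (const auto& row : tileRows) column.push_back(row[slice]);
  return column;
}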
+ std::vector memData; + uint16_t index = 0; + for (uint16_t x = 0; x < partition_num; x++) { + // For 128-bit there are 16-bit for each active element + uint64_t shifted_active = 1ull << ((x % 4) * 16); + if (pg[x / 4] & shifted_active) { + // As using uint64_t need to push_back 2 elements + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum]); + memData.push_back( + sourceValues_[x].getAsVector()[2 * sliceNum + 1]); + } else if (memData.size() > 0) { + // Predicate false, save current data + memoryData_[index] = RegisterValue( + (char*)memData.data(), memData.size() * sizeof(uint64_t)); + index++; + memData.clear(); + } + } + // Check if final data needs putting into memoryData_ + if (memData.size() > 0) { + memoryData_[index] = RegisterValue((char*)memData.data(), + memData.size() * sizeof(uint64_t)); + } break; } case Opcode::AArch64_ST1_MXIPXX_V_S: { // st1w {zatv.s[ws, #imm]}, pg, @@ -4532,26 +5875,26 @@ void Instruction::execute() { const uint32_t sliceNum = (ws + metadata_.operands[0].sme.slice_offset.imm) % partition_num; - std::array mdata; - uint16_t md_size = 0; + std::vector memData; uint16_t index = 0; for (uint16_t x = 0; x < partition_num; x++) { uint64_t shifted_active = 1ull << ((x % 16) * 4); if (pg[x / 16] & shifted_active) { - mdata[md_size] = sourceValues_[x].getAsVector()[sliceNum]; - md_size++; - } else if (md_size) { + memData.push_back( + sourceValues_[x].getAsVector()[sliceNum]); + } else if (memData.size() > 0) { memoryData_[index] = - RegisterValue((char*)mdata.data(), md_size * 4); - md_size = 0; + RegisterValue((char*)memData.data(), memData.size() * 4); + index++; + memData.clear(); } } - if (md_size) { - memoryData_[index] = RegisterValue((char*)mdata.data(), md_size * 4); + if (memData.size() > 0) { + memoryData_[index] = + RegisterValue((char*)memData.data(), memData.size() * 4); } - break; } case Opcode::AArch64_SST1W_D_IMM: { // st1w {zt.d}, pg, [zn.d{, #imm}] @@ -5133,6 +6476,21 @@ void Instruction::execute() { memoryData_[0] = RegisterValue((char*)p, partition_num); break; } + case Opcode::AArch64_STR_ZA: { // str za[wv, #imm], [xn|sp{, #imm, mul + // vl}] + // SME, STORE + // If not in right context mode, raise exception + if (!ZAenabled) return ZAdisabled(); + + const uint16_t zaRowCount = VL_bits / 8; + const uint32_t wv = sourceValues_[zaRowCount].get(); + const uint32_t imm = metadata_.operands[0].sme.slice_offset.imm; + + const uint8_t* zaRow = + sourceValues_[(wv + imm) % zaRowCount].getAsVector(); + memoryData_[0] = RegisterValue((char*)zaRow, zaRowCount); + break; + } case Opcode::AArch64_STR_ZXI: { // str zt, [xn{, #imm, mul vl}] // STORE const uint16_t partition_num = VL_bits / 8; @@ -5316,6 +6674,158 @@ void Instruction::execute() { [](uint8_t x, uint8_t y) -> uint8_t { return x - y; }); break; } + case Opcode::AArch64_SUMOPA_MPPZZ_D: { // sumopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const 
int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPA_MPPZZ_S: { // sumopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_D: { // sumops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_SUMOPS_MPPZZ_S: { // sumops zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode 
(check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const int8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_SVC: { // svc #imm exceptionEncountered_ = true; exception_ = InstructionException::SupervisorCall; @@ -5568,6 +7078,158 @@ void Instruction::execute() { results_[0] = vecUMinP(sourceValues_); break; } + case Opcode::AArch64_UMOPA_MPPZZ_D: { // umopa zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPA_MPPZZ_S: { // umopa zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) 
{ + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_D: { // umops zada.d, pn/m, pm/m, zn.h, + // zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + uint64_t outRow[32] = {0}; + const uint64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_UMOPS_MPPZZ_S: { // umops zada.s, pn/m, pm/m, zn.b, + // zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const uint8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + uint32_t outRow[64] = {0}; + const uint32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + uint32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UMOVvi32_idx0: // umov wd, vn.s[0] case Opcode::AArch64_UMOVvi32: { // umov wd, 
vn.s[index] const uint32_t* vec = sourceValues_[0].getAsVector(); @@ -5635,6 +7297,158 @@ void Instruction::execute() { sourceValues_, metadata_, false); break; } + case Opcode::AArch64_USMOPA_MPPZZ_D: { // usmopa zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPA_MPPZZ_S: { // usmopa zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum += (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_D: { // usmops zada.d, pn/m, pm/m, + // zn.h, zm.h + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 64; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint16_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int16_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLd x 4 sub matrix + // zm is a 4 x SVLd sub matrix + // Resulting SVLd x 
SVLd matrix has results widened to 64-bit + for (int row = 0; row < tileDim; row++) { + int64_t outRow[32] = {0}; + const int64_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int64_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << ((znIndex % 32) * 2); + const uint64_t shifted_active_zm = 1ull << ((zmIndex % 32) * 2); + if ((pn[znIndex / 32] & shifted_active_zn) && + (pm[zmIndex / 32] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } + case Opcode::AArch64_USMOPS_MPPZZ_S: { // usmops zada.s, pn/m, pm/m, + // zn.b, zm.b + // SME + // Check core is in correct context mode (check SM first) + if (!SMenabled) return SMdisabled(); + if (!ZAenabled) return ZAdisabled(); + + const uint16_t tileDim = VL_bits / 32; + const uint64_t* pn = sourceValues_[tileDim].getAsVector(); + const uint64_t* pm = sourceValues_[tileDim + 1].getAsVector(); + const uint8_t* zn = sourceValues_[tileDim + 2].getAsVector(); + const int8_t* zm = sourceValues_[tileDim + 3].getAsVector(); + + // zn is a SVLs x 4 sub matrix + // zm is a 4 x SVLs sub matrix + // Resulting SVLs x SVLs matrix has results widened to 32-bit + for (int row = 0; row < tileDim; row++) { + int32_t outRow[64] = {0}; + const int32_t* zadaRow = sourceValues_[row].getAsVector(); + for (int col = 0; col < tileDim; col++) { + // Get corresponding output element + int32_t sum = zadaRow[col]; + for (int k = 0; k < 4; k++) { + const uint16_t znIndex = 4 * row + k; + const uint16_t zmIndex = 4 * col + k; + const uint64_t shifted_active_zn = 1ull << (znIndex % 64); + const uint64_t shifted_active_zm = 1ull << (zmIndex % 64); + if ((pn[znIndex / 64] & shifted_active_zn) && + (pm[zmIndex / 64] & shifted_active_zm)) + sum -= (static_cast(zn[znIndex]) * + static_cast(zm[zmIndex])); + } + outRow[col] = sum; + } + results_[row] = {outRow, 256}; + } + break; + } case Opcode::AArch64_UUNPKHI_ZZ_D: { // uunpkhi zd.d, zn.s results_[0] = sveUnpk_vecs(sourceValues_, VL_bits, true); diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc index a4731f388f..96d23590a6 100644 --- a/test/regression/aarch64/instructions/neon.cc +++ b/test/regression/aarch64/instructions/neon.cc @@ -727,8 +727,8 @@ TEST_P(InstNeon, cmhs) { heap[1] = 0x7F; heap[2] = INT8_MAX; heap[3] = 1; - heap[4] = -128; - heap[5] = -1; + heap[4] = 128; + heap[5] = 1; heap[6] = 0xAA; heap[7] = 0xBB; heap[8] = 0xCC; @@ -744,7 +744,7 @@ TEST_P(InstNeon, cmhs) { heap[16] = INT8_MAX; heap[17] = 0x7F; heap[18] = 0; - heap[19] = -128; + heap[19] = 128; heap[20] = 1; heap[21] = 0; heap[22] = 0xAA; @@ -772,10 +772,10 @@ TEST_P(InstNeon, cmhs) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + {0x00, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}); CHECK_NEON(3, uint8_t, - {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, + {0xFF, 0xFF, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF}); } @@ -2684,8 +2684,8 @@ TEST_P(InstNeon, uminp) { )"); CHECK_NEON(2, uint8_t, - {0x00, 0x00, 0xEE, 0x11, 0x22, 0x33, 0x44, 0x55, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, 0x08}); + {0x00, 0xAA, 0xBB, 0xDD, 0x01, 0x03, 0x05, 
0x07, 0x00, 0x11, 0x22, + 0x44, 0xEE, 0xCC, 0xAA, 0x88}); } TEST_P(InstNeon, umaxp) { // umaxp vd.16b vn.16b vm.16b @@ -2738,12 +2738,12 @@ TEST_P(InstNeon, umaxp) { ldr q0, [x0] ldr q1, [x0, #16] - umaxp v2.16b, v0.16b, v1.16b + umaxp v2.16b, v1.16b, v0.16b )"); CHECK_NEON(2, uint8_t, - {0x01, 0x00, 0xFF, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF, 0xEE, 0xDD, - 0xCC, 0xBB, 0xAA, 0x99, 0x88}); + {0x00, 0xEE, 0x33, 0x55, 0xFF, 0xDD, 0xBB, 0x99, 0x01, 0xFF, 0xCC, + 0xEE, 0x02, 0x04, 0x06, 0x08}); } TEST_P(InstNeon, smax) { diff --git a/test/regression/aarch64/instructions/sme.cc b/test/regression/aarch64/instructions/sme.cc index 55c7b945f3..a54c0c981a 100644 --- a/test/regression/aarch64/instructions/sme.cc +++ b/test/regression/aarch64/instructions/sme.cc @@ -8,110 +8,269 @@ namespace { using InstSme = AArch64RegressionTest; #if SIMENG_LLVM_VERSION >= 14 -TEST_P(InstSme, mova) { - // 8-bit +TEST_P(InstSme, addha) { + // 32-bit RUN_AARCH64(R"( smstart + zero {za} + ptrue p0.s - ptrue p1.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s - fdup z1.s, #1.0 - mov w0, #1 - index z2.s, #1, w0 - scvtf z2.s, p0/m, z2.s + dup z0.s, #65 + index z1.s, #0, #1 - fdup z4.s, #5.0 - fdup z5.s, #10.0 - fdup z6.s, #5.0 - fdup z7.s, #10.0 - fmopa za0.s, p0/m, p1/m, z2.s, z1.s + # Add to all rows and elems + addha za0.s, p0/m, p0/m, z1.s - ptrue p2.b - mov x2, #0 - mov x3, #2 - addvl x2, x2, #1 - sdiv x2, x2, x3 - whilelo p3.b, xzr, x2 + # Add to all rows, even numbered elements + addha za1.s, p0/m, p0/m, z0.s + addha za1.s, p0/m, p1/m, z1.s - mov w12, #0 - mov w15, #2 + # Add to even rows, all elements + addha za2.s, p0/m, p0/m, z0.s + addha za2.s, p1/m, p0/m, z1.s + + # Even numbered rows, even numbered elements + addha za3.s, p0/m, p0/m, z0.s + addha za3.s, p1/m, p1/m, z1.s + )"); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + dup z0.d, #65 + index z1.d, #0, #1 + + # Add to all rows and elems + addha za0.d, p0/m, p0/m, z1.d - mova z4.b, p2/m, za0h.b[w12, #0] - mova z5.b, p2/m, za0h.b[w12, #4] - mova z6.b, p3/m, za0h.b[w15, #6] - mova z7.b, p3/m, za0h.b[w15, #10] + # Add to all rows, even numbered elements + addha za1.d, p0/m, p0/m, z0.d + addha za1.d, p0/m, p1/m, z1.d + + # Add to even rows, all elements + addha za2.d, p0/m, p0/m, z0.d + addha za2.d, p1/m, p0/m, z1.d + + # Even numbered rows, even numbered elements + addha za3.d, p0/m, p0/m, z0.d + addha za3.d, p1/m, p1/m, z1.d )"); - CHECK_NEON(4, float, fillNeon({1}, SVL / 8)); - CHECK_NEON(5, float, fillNeon({2}, SVL / 8)); - CHECK_NEON(6, float, fillNeonCombined({3}, {5}, SVL / 8)); - CHECK_NEON(7, float, fillNeonCombined({4}, {10}, SVL / 8)); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint64_t i = 0; i < (SVL / 64); i++) { + // All rows, all elems + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even rows, all elements + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even rows, even elements + CHECK_MAT_ROW(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } + } } -TEST_P(InstSme, fmopa) { +TEST_P(InstSme, addva) { // 32-bit RUN_AARCH64(R"( smstart - fdup z1.s, #2.0 - fdup z2.s, #5.0 + zero {za} + ptrue p0.s - ptrue p1.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s - fmopa za0.s, p0/m, p1/m, z1.s, z2.s + dup z0.s, #65 + index z1.s, #0, #1 - fdup z3.s, #3.0 - fdup z4.s, #8.0 - mov x0, #0 - mov x1, #8 - addvl x0, x0, #1 - udiv x0, x0, x1 - whilelo p2.s, xzr, x0 + # Add to all cols and elems + addva za0.s, p0/m, p0/m, z1.s - fmopa za2.s, p0/m, p2/m, z3.s, z4.s + # All cols, even elements + addva za1.s, p0/m, p0/m, z0.s + addva za1.s, p1/m, p0/m, z1.s + + # Add to even numbered cols, all elements + addva za2.s, p0/m, p0/m, z0.s + addva za2.s, p0/m, p1/m, z1.s + + # Even numbered cols, even numbered elements + addva za3.s, p0/m, p0/m, z0.s + addva za3.s, p1/m, p1/m, z1.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, - fillNeon({10.0f}, (SVL / 8))); - CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, - fillNeon({24.0f}, (SVL / 16))); + std::vector full32(64, 0); + std::vector index32(64, 0); + std::vector inter32(64, 0); + for (uint16_t i = 0; i < 64; i++) { + full32[i] = 65; + index32[i] = i; + inter32[i] = (i % 2 == 0) ? 
i : 65; + } + + for (uint32_t i = 0; i < (SVL / 32); i++) { + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon(index32, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(index32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(inter32, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon(full32, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon(full32, (SVL / 8))); + } } // 64-bit RUN_AARCH64(R"( smstart - fdup z1.d, #2.0 - fdup z2.d, #5.0 + zero {za} + ptrue p0.d - ptrue p1.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d - fmopa za0.d, p0/m, p1/m, z1.d, z2.d + dup z0.d, #65 + index z1.d, #0, #1 - fdup z3.d, #3.0 - fdup z4.d, #8.0 - mov x0, #0 - mov x1, #16 - addvl x0, x0, #1 - udiv x0, x0, x1 - whilelo p2.d, xzr, x0 + # Add to all cols and elems + addva za0.d, p0/m, p0/m, z1.d - fmopa za2.d, p0/m, p2/m, z3.d, z4.d + # All cols, even elements + addva za1.d, p0/m, p0/m, z0.d + addva za1.d, p1/m, p0/m, z1.d + + # Add to even numbered cols, all elements + addva za2.d, p0/m, p0/m, z0.d + addva za2.d, p0/m, p1/m, z1.d + + # Even numbered cols, even numbered elements + addva za3.d, p0/m, p0/m, z0.d + addva za3.d, p1/m, p1/m, z1.d )"); + std::vector full64(32, 0); + std::vector index64(32, 0); + std::vector inter64(32, 0); + for (uint16_t i = 0; i < 32; i++) { + full64[i] = 65; + index64[i] = i; + inter64[i] = (i % 2 == 0) ? 
i : 65; + } + for (uint64_t i = 0; i < (SVL / 64); i++) { - CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, - fillNeon({10.0}, (SVL / 8))); - CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, - fillNeon({24.0}, (SVL / 16))); + // All cols, all elems + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon(index64, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({i}, (SVL / 8))); + // All cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD1, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + if (i % 2 == 0) { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(index64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(inter64, (SVL / 8))); + } else { + // Even cols, all elements + CHECK_MAT_COL(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon(full64, (SVL / 8))); + // Even cols, even elements + CHECK_MAT_COL(AARCH64_REG_ZAD3, i, uint64_t, + fillNeon(full64, (SVL / 8))); + } } } -TEST_P(InstSme, ld1d) { - // Horizontal +TEST_P(InstSme, mova_tileToVec) { + // 8-bit initialHeapData_.resize(SVL / 4); - uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64, src, SVL / 32); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src8 = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src8, SVL / 4); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -120,38 +279,77 @@ TEST_P(InstSme, ld1d) { smstart - mov x1, #1 - ptrue p0.d - mov w12, #0 - # Load and broadcast values from heap - ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za0h.d[w12, 1]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #16 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.d, xzr, x1 - ld1d {za1h.d[w12, 1]}, p1/z, [x0, x2, lsl #3] - )"); - CHECK_MAT_ROW( - AARCH64_REG_ZAD0, 0, uint64_t, - fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAD0, 1, uint64_t, - fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAD1, 1, uint64_t, - fillNeonCombined( - {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b - // Vertical + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + dup z2.b, #3 + dup z3.b, #4 + dup z4.b, #5 + dup z5.b, #6 + dup z6.b, #7 + dup z7.b, #8 + + # Horizontal + ld1b {za0h.b[w12, #0]}, p0/z, [x0] + mova z0.b, p0/m, za0h.b[w12, #0] + mova z1.b, p1/m, za0h.b[w12, #0] + #Alias + mov z4.b, p0/m, za0h.b[w12, #0] + mov z5.b, p1/m, za0h.b[w12, #0] + + # Vertical + ld1b {za0v.b[w12, #3]}, p0/z, [x0] + mova z2.b, p0/m, za0v.b[w12, #3] + mova z3.b, p1/m, za0v.b[w12, #3] + #Alias + mov z6.b, p0/m, za0v.b[w12, #3] + mov z7.b, p1/m, za0v.b[w12, #3] + )"); + CHECK_NEON(0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(1, uint8_t, + fillNeon({0xDE, 2, 0xBE, 2, 0x12, 2, 0x56, 2, 0x98, 2, + 0x54, 2, 0xAB, 2, 0xEF, 2}, + SVL / 8)); + CHECK_NEON(2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(3, uint8_t, + fillNeon({0xDE, 4, 0xBE, 4, 0x12, 4, 0x56, 4, 0x98, 4, + 0x54, 4, 0xAB, 4, 0xEF, 4}, + SVL / 8)); + CHECK_NEON(4, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 
0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(5, uint8_t, + fillNeon({0xDE, 6, 0xBE, 6, 0x12, 6, 0x56, 6, 0x98, 6, + 0x54, 6, 0xAB, 6, 0xEF, 6}, + SVL / 8)); + CHECK_NEON(6, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_NEON(7, uint8_t, + fillNeon({0xDE, 8, 0xBE, 8, 0x12, 8, 0x56, 8, 0x98, 8, + 0x54, 8, 0xAB, 8, 0xEF, 8}, + SVL / 8)); + + // 16-bit initialHeapData_.resize(SVL / 4); - uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64_vert, src_vert, SVL / 32); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src16 = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src16, SVL / 8); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -160,40 +358,73 @@ TEST_P(InstSme, ld1d) { smstart - mov x1, #1 - ptrue p0.d - mov w12, #0 - # Load and broadcast values from heap - ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za0v.d[w12, 1]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #16 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.d, xzr, x1 - ld1d {za1v.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + dup z2.h, #3 + dup z3.h, #4 + dup z4.h, #5 + dup z5.h, #6 + dup z6.h, #7 + dup z7.h, #8 + + # Horizontal + ld1h {za0h.h[w12, #0]}, p0/z, [x0] + mova z0.h, p0/m, za0h.h[w12, #0] + mova z1.h, p1/m, za0h.h[w12, #0] + #Alias + mov z4.h, p0/m, za0h.h[w12, #0] + mov z5.h, p1/m, za0h.h[w12, #0] + + # Vertical + ld1h {za0v.h[w12, #3]}, p0/z, [x0] + mova z2.h, p0/m, za0v.h[w12, #3] + mova z3.h, p1/m, za0v.h[w12, #3] + #Alias + mov z6.h, p0/m, za0v.h[w12, #3] + mov z7.h, p1/m, za0v.h[w12, #3] )"); - CHECK_MAT_COL( - AARCH64_REG_ZAD0, 0, uint64_t, - fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); - CHECK_MAT_COL( - AARCH64_REG_ZAD0, 1, uint64_t, - fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); - CHECK_MAT_COL(AARCH64_REG_ZAD1, 1, uint64_t, - fillNeonCombined( - {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); -} + CHECK_NEON(0, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(1, uint16_t, + fillNeon({0xDEAD, 2, 0x1234, 2, 0x9876, 2, 0xABCD, 2}, + SVL / 8)); + CHECK_NEON(2, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(3, uint16_t, + fillNeon({0xDEAD, 4, 0x1234, 4, 0x9876, 4, 0xABCD, 4}, + SVL / 8)); + CHECK_NEON(4, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(5, uint16_t, + fillNeon({0xDEAD, 6, 0x1234, 6, 0x9876, 6, 0xABCD, 6}, + SVL / 8)); + CHECK_NEON(6, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_NEON(7, uint16_t, + fillNeon({0xDEAD, 8, 0x1234, 8, 0x9876, 8, 0xABCD, 8}, + SVL / 8)); -TEST_P(InstSme, ld1w) { - // Horizontal + // 32-bit initialHeapData_.resize(SVL / 4); uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); + std::vector src32 = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32, src32, 
SVL / 16); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -202,39 +433,64 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 + zero {za} + ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0h.s[w12, 2]}, p0/z, [x0] + pfalse p1.b + zip1 p1.s, p0.s, p1.s - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + dup z2.s, #3 + dup z3.s, #4 + dup z4.s, #5 + dup z5.s, #6 + dup z6.s, #7 + dup z7.s, #8 + + # Horizontal + ld1w {za0h.s[w12, #0]}, p0/z, [x0] + mova z0.s, p0/m, za0h.s[w12, #0] + mova z1.s, p1/m, za0h.s[w12, #0] + #Alias + mov z4.s, p0/m, za0h.s[w12, #0] + mov z5.s, p1/m, za0h.s[w12, #0] + + # Vertical + ld1w {za0v.s[w12, #3]}, p0/z, [x0] + mova z2.s, p0/m, za0v.s[w12, #3] + mova z3.s, p1/m, za0v.s[w12, #3] + #Alias + mov z6.s, p0/m, za0v.s[w12, #3] + mov z7.s, p1/m, za0v.s[w12, #3] )"); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 1, uint64_t, - fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); - CHECK_MAT_ROW( - AARCH64_REG_ZAS0, 3, uint64_t, - fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); - CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, - fillNeonCombined( - {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + CHECK_NEON(0, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint32_t, + fillNeon({0xDEADBEEF, 2, 0x98765432, 2}, SVL / 8)); + CHECK_NEON(2, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint32_t, + fillNeon({0xDEADBEEF, 4, 0x98765432, 4}, SVL / 8)); + CHECK_NEON(4, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint32_t, + fillNeon({0xDEADBEEF, 6, 0x98765432, 6}, SVL / 8)); + CHECK_NEON(6, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint32_t, + fillNeon({0xDEADBEEF, 8, 0x98765432, 8}, SVL / 8)); - // Vertical + // 64-bit initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src64 = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -243,42 +499,60 @@ TEST_P(InstSme, ld1w) { smstart - mov x1, #1 - ptrue p0.s - mov w12, #1 - # Load and broadcast values from heap - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za0v.s[w12, 2]}, p0/z, [x0] + zero {za} - # Test for inactive lanes - mov x1, #0 - mov x3, #8 - # TODO change to addsvl when implemented - addvl x1, x1, #1 - udiv x1, x1, x3 - mov x2, #0 - whilelo p1.s, xzr, x1 - ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.d, p0/m, za0h.d[w12, #0] + mova z1.d, p1/m, za0h.d[w12, #0] + #Alias + mov z4.d, p0/m, za0h.d[w12, #0] + mov z5.d, p1/m, za0h.d[w12, #0] + + # Vertical + ld1d {za0v.d[w12, #1]}, p0/z, [x0] + mova z2.d, p0/m, za0v.d[w12, #1] + mova 
z3.d, p1/m, za0v.d[w12, #1] + #Alias + mov z6.d, p0/m, za0v.d[w12, #1] + mov z7.d, p1/m, za0v.d[w12, #1] )"); - CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, - fillNeon( - {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); - CHECK_MAT_COL(AARCH64_REG_ZAS0, 3, uint32_t, - fillNeon( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); - CHECK_MAT_COL( - AARCH64_REG_ZAS1, 1, uint32_t, - fillNeonCombined( - {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); -} + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, fillNeon({0xDEADBEEF12345678, 2}, SVL / 8)); + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, fillNeon({0xDEADBEEF12345678, 4}, SVL / 8)); + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, fillNeon({0xDEADBEEF12345678, 6}, SVL / 8)); + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, fillNeon({0xDEADBEEF12345678, 8}, SVL / 8)); -TEST_P(InstSme, st1d) { - // Horizontal + // 128-bit + // Re-use 64-bit heap initialHeapData_.resize(SVL / 4); - uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64, src, SVL / 32); - + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src64, SVL / 32); RUN_AARCH64(R"( # Get heap address mov x0, 0 @@ -287,235 +561,2852 @@ TEST_P(InstSme, st1d) { smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 + zero {za} + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d mov w12, #0 - ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za1h.d[w12, 1]}, p0/z, [x0, x1, lsl #3] - st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] - st1d {za1h.d[w12, 1]}, p0, [x4] + dup z0.d, #1 + dup z1.d, #2 + dup z2.d, #3 + dup z3.d, #4 + dup z4.d, #5 + dup z5.d, #6 + dup z6.d, #7 + dup z7.d, #8 + + # Horizontal + ld1d {za0h.d[w12, #0]}, p0/z, [x0] + mova z0.q, p0/m, za0h.q[w12, #0] + mova z1.q, p1/m, za0h.q[w12, #0] + #Alias + mov z4.q, p0/m, za0h.q[w12, #0] + mov z5.q, p1/m, za0h.q[w12, #0] + + # Vertical + mov w12, #1 + ld1d {z8.d}, p0/z, [x0] + mova za0v.q[w12, #0], p0/m, z8.q + mova z2.q, p0/m, za0v.q[w12, #0] + mova z3.q, p1/m, za0v.q[w12, #0] + #Alias + mov z6.q, p0/m, za0v.q[w12, #0] + mov z7.q, p1/m, za0v.q[w12, #0] )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 8)), - src[i % 2]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src[i % 2]); - } + // Horizontal + CHECK_NEON( + 0, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 2, 2}, + SVL / 8)); + // Vertical + CHECK_NEON( + 2, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(3, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 4, 4}, + SVL / 8)); + // Horizontal + CHECK_NEON( + 4, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(5, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 6, 6}, + SVL / 8)); + // Vertical + CHECK_NEON( + 6, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_NEON(7, uint64_t, + 
fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01, 8, 8}, + SVL / 8)); +} +TEST_P(InstSme, mova_b_vecToTile) { + // 8-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 - smstart - mov x2, #0 - mov x4, #16 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #2 - whilelo p1.d, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b mov w12, #0 - mov w13, #1 - ld1d {za3h.d[w12, 0]}, p1/z, [x0, x3, lsl #3] - st1d {za3h.d[w12, 0]}, p1, [x5] - ld1d {za1h.d[w13, 1]}, p1/z, [x0, x3, lsl #3] - st1d {za1h.d[w13, 1]}, p1, [x5, x3, lsl #3] + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal + mova za0h.b[w12, #0], p0/m, z0.b + mova za0h.b[w12, #1], p1/m, z1.b )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 8)), src[i % 2]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); } - // Vertical - initialHeapData_.resize(SVL / 4); - uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; - fillHeap(heap64_vert, src_vert, SVL / 32); + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Horizontal Alias + mov za0h.b[w12, #0], p0/m, z0.b + mov za0h.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + + mov w12, #0 + dup z0.b, #1 + dup z1.b, #2 + + # Vertical + mova za0v.b[w12, #0], p0/m, z0.b + mova za0v.b[w12, #1], p1/m, z1.b + )"); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 - ptrue p0.d + zero {za} + + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b mov w12, #0 - ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] - ld1d {za1v.d[w12, 1]}, p0/z, [x0, x1, lsl #3] - st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] - st1d {za1v.d[w12, 1]}, p0, [x4] + dup z0.b, #1 + dup z1.b, #2 + + # Vertical Alias + mov za0v.b[w12, #0], p0/m, z0.b + mov za0v.b[w12, #1], p1/m, z1.b )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 8)), - src_vert[i % 2]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src_vert[i % 2]); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 8; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, (SVL / 8))); } +} +TEST_P(InstSme, mova_h_vecToTile) { + // 16-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + 
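+ # Note: mova (vector to ZA slice) is a merging move under the governing predicate:
+ # active elements are copied from the Z register, inactive elements keep the tile's
+ # existing value (zeroes here, courtesy of `zero {za}`).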
smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal + mova za0h.h[w12, #0], p0/m, z0.h + mova za0h.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #16 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #2 - whilelo p1.d, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h mov w12, #0 - mov w13, #1 - ld1d {za3v.d[w12, 0]}, p1/z, [x0, x3, lsl #3] - st1d {za3v.d[w12, 0]}, p1, [x5] - ld1d {za1v.d[w13, 1]}, p1/z, [x0, x3, lsl #3] - st1d {za1v.d[w13, 1]}, p1, [x5, x3, lsl #3] + dup z0.h, #1 + dup z1.h, #2 + + # Horizontal Alias + mov za0h.h[w12, #0], p0/m, z0.h + mov za0h.h[w12, #1], p1/m, z1.h )"); - for (uint64_t i = 0; i < (SVL / 128); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 8)), src_vert[i % 2]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical + mova za0v.h[w12, #0], p0/m, z0.h + mova za0v.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + + mov w12, #0 + dup z0.h, #1 + dup z1.h, #2 + + # Vertical Alias + mov za0v.h[w12, #0], p0/m, z0.h + mov za0v.h[w12, #1], p1/m, z1.h + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 0, uint16_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 16; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAH0, i, uint16_t, + fillNeon({0}, (SVL / 8))); } } -TEST_P(InstSme, st1w) { - // Horizontal - initialHeapData_.resize(SVL / 4); - uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); - std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; - fillHeap(heap32, src, SVL / 16); +TEST_P(InstSme, mova_s_vecToTile) { + // 32-bit + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Horizontal + mova za0h.s[w12, #0], p0/m, z0.s + mova za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + # Horizontal Alias + mov 
za0h.s[w12, #0], p0/m, z0.s + mov za0h.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 + zero {za} + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s mov w12, #0 - ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1h.s[w12, 1]}, p0, [x4] + dup z0.s, #1 + dup z1.s, #2 + + # Vertical + mova za0v.s[w12, #0], p0/m, z0.s + mova za0v.s[w12, #1], p1/m, z1.s )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 4)), - src[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + + mov w12, #0 + dup z0.s, #1 + dup z1.s, #2 + + # Vertical Alias + mov za0v.s[w12, #0], p0/m, z0.s + mov za0v.s[w12, #1], p1/m, z1.s + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 0, uint32_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 32; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } +} +TEST_P(InstSme, mova_d_vecToTile) { + // 64-bit + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d mov w12, #0 - ld1w {za3h.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3h.s[w12, 0]}, p1, [x5] - ld1w {za1h.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1h.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.d[w12, #0], p0/m, z0.d + mova za0h.d[w12, #1], p1/m, z1.d )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); } - // Vertical - initialHeapData_.resize(SVL / 4); - uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); - std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, - 0xABCDEF01}; - fillHeap(heap32_vert, src_vert, SVL / 16); + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.d[w12, #0], p0/m, z0.d + mov za0h.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + 
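+ // Only rows 0 and 1 of the tile were written above, so every remaining row should
+ // still hold the zeroes established by `zero {za}`.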
CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.d[w12, #0], p0/m, z0.d + mova za0v.d[w12, #1], p1/m, z1.d + )"); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - sub sp, sp, #4095 - mov x1, #0 - mov x4, #0 - addvl x4, x4, #1 - ptrue p0.s + zero {za} + + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d mov w12, #0 - ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] - ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] - st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] - st1w {za1v.s[w12, 1]}, p0, [x4] + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.d[w12, #0], p0/m, z0.d + mov za0v.d[w12, #1], p1/m, z1.d )"); - for (uint64_t i = 0; i < (SVL / 32); i++) { - EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - - 4095 + (i * 4)), - src_vert[i % 4]); - EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({1}, (SVL / 8))); + CHECK_MAT_COL(AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({2, 0}, (SVL / 8))); + for (uint16_t i = 2; i < SVL / 64; i++) { + CHECK_MAT_COL(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({0}, (SVL / 8))); } +} +TEST_P(InstSme, mova_q_vecToTile) { + // 128-bit RUN_AARCH64(R"( - # Get heap address - mov x0, 0 - mov x8, 214 - svc #0 + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal + mova za0h.q[w12, #0], p0/m, z0.q + mova za0h.q[w12, #0], p1/m, z1.q + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + RUN_AARCH64(R"( smstart - mov x2, #0 - mov x4, #8 - addvl x2, x2, #1 - udiv x2, x2, x4 - mov x3, #4 - whilelo p1.s, xzr, x2 - mov x5, #800 + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d mov w12, #0 - ld1w {za3v.s[w12, 0]}, p1/z, [x0, x3, lsl #2] - st1w {za3v.s[w12, 0]}, p1, [x5] - ld1w {za1v.s[w12, 2]}, p1/z, [x0, x3, lsl #2] - st1w {za1v.s[w12, 2]}, p1, [x5, x3, lsl #2] + dup z0.d, #1 + dup z1.d, #2 + + # Horizontal Alias + mov za0h.q[w12, #0], p0/m, z0.q + mov za0h.q[w12, #0], p1/m, z1.q )"); - for (uint64_t i = 0; i < (SVL / 64); i++) { - EXPECT_EQ(getMemoryValue(800 + (i * 4)), src_vert[i % 4]); - EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 0, uint64_t, + fillNeon({2, 2, 1, 1}, (SVL / 8))); + for (uint16_t i = 1; i < SVL / 128; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, + fillNeon({0}, (SVL / 8))); + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical + mova za0v.q[w12, #0], p0/m, z0.q + mova za0v.q[w12, #0], p1/m, z1.q + 
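+ # A vertical .q slice places one 128-bit element in each tile row, so p1's
+ # alternating quadword pattern should leave even-numbered rows holding z1's
+ # quadword {2, 2} and odd-numbered rows keeping z0's {1, 1}; the rest of each
+ # row stays zero.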
)"); + auto onRow = fillNeon({0}, (SVL / 8)); + auto offRow = fillNeon({0}, (SVL / 8)); + onRow[0] = 2; + onRow[1] = 2; + offRow[0] = 1; + offRow[1] = 1; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } + + RUN_AARCH64(R"( + smstart + + zero {za} + + ptrue p0.d + pfalse p1.b + # Zip1 twice to get on-off-on-off pattern with quadwords + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + + mov w12, #0 + dup z0.d, #1 + dup z1.d, #2 + + # Vertical Alias + mov za0v.q[w12, #0], p0/m, z0.q + mov za0v.q[w12, #0], p1/m, z1.q + )"); + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, onRow); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, offRow); + } + } +} + +TEST_P(InstSme, fmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmopa za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.s, xzr, x0 + + fmopa za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmopa za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmopa za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, fmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + fdup z1.s, #2.0 + fdup z2.s, #5.0 + ptrue p0.s + ptrue p1.s + + zero {za} + + fmops za0.s, p0/m, p1/m, z1.s, z2.s + + fdup z3.s, #3.0 + fdup z4.s, #8.0 + mov x0, #0 + mov x1, #8 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.s, xzr, x0 + + fmops za2.s, p0/m, p2/m, z3.s, z4.s + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, float, + fillNeon({-10.0f}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, float, + fillNeon({-24.0f}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + fdup z1.d, #2.0 + fdup z2.d, #5.0 + ptrue p0.d + ptrue p1.d + + zero {za} + + fmops za0.d, p0/m, p1/m, z1.d, z2.d + + fdup z3.d, #3.0 + fdup z4.d, #8.0 + mov x0, #0 + mov x1, #16 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.d, xzr, x0 + + fmops za2.d, p0/m, p2/m, z3.d, z4.d + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, double, + fillNeon({-10.0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, double, + fillNeon({-24.0}, (SVL / 16))); + } +} + +TEST_P(InstSme, ld1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from 
heap + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0h.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #2 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.b, xzr, x1 + mov w12, #15 + ld1b {za0v.b[w12, 0]}, p1/z, [x0, x2] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 1, uint8_t, + fillNeon({0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01, 0xDE}, + SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAB0, 3, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAB0, 15, uint8_t, + fillNeonCombined( + {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, 0x76, + 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0h.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1h.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 
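+ # (syscall 214 is brk; brk(0) returns the current program break, which should
+ # point at the start of the pre-filled initialHeapData_ buffer)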
+ svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.d + mov w12, #0 + # Load and broadcast values from heap + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za0v.d[w12, 1]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #16 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.d, xzr, x1 + ld1d {za1v.d[w12, 1]}, p1/z, [x0, x2, lsl #3] + )"); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 0, uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAD0, 1, uint64_t, + fillNeon({0xDEADBEEF12345678, 0x98765432ABCDEF01}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAD1, 1, uint64_t, + fillNeonCombined( + {0xDEADBEEF12345678, 0x98765432ABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0h.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1h.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.h + mov w12, #1 + # Load and broadcast values from heap + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za0v.h[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #4 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.h, xzr, x1 + ld1h {za1v.h[w12, 0]}, p1/z, [x0, x2, lsl #1] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 1, uint16_t, + fillNeon({0xBEEF, 0x1234, 0x5678, 0x9876, 0x5432, + 0xABCD, 0xEF01, 0xDEAD}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH0, 3, uint16_t, + fillNeon({0xDEAD, 0xBEEF, 0x1234, 0x5678, 0x9876, + 0x5432, 0xABCD, 0xEF01}, + SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAH1, 1, uint16_t, + fillNeonCombined({0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}, + {0}, SVL / 8)); +} + +TEST_P(InstSme, ld1q) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01, + 0x98765432ABCDEF01, 0xDEADBEEF12345678}; + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov 
x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15h.q[w12, 0]}, p0/z, [x0] + )"); + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, 1 % (SVL / 128), uint64_t, + fillNeon({0x98765432ABCDEF01, 0xDEADBEEF12345678, + 0xDEADBEEF12345678, 0x98765432ABCDEF01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, 1 % (SVL / 128), uint64_t, + fillNeon( + {0xDEADBEEF12345678, 0x98765432ABCDEF01, 0, 0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + heap64 = reinterpret_cast(initialHeapData_.data()); + fillHeap(heap64, src, SVL / 32); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.b + mov w12, #1 + # Load and broadcast values from heap + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + + # Test for inactive lanes - zip twice to get on-off for 128-bits + pfalse p1.b + zip1 p0.d, p0.d, p1.d + zip1 p0.d, p0.d, p0.d + ld1q {za15v.q[w12, 0]}, p0/z, [x0] + )"); + // Can't check Q columns as CHECK_MAT_COL isn't set up for doing this with + // uint64_t. + // Instead, manually place values into 1st column of Q tile (as per + // asm above) and check each Q row. + auto row0 = fillNeon({0}, (SVL / 8)); + auto row1 = fillNeon({0}, (SVL / 8)); + auto zeroRow = fillNeon({0}, (SVL / 8)); + // MOD SVL / 64 as dealing with uint64_t even though its a 128-bit tile + row0[2 % (SVL / 64)] = 0x98765432ABCDEF01; + row0[3 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[2 % (SVL / 64)] = 0xDEADBEEF12345678; + row1[3 % (SVL / 64)] = 0x98765432ABCDEF01; + for (uint16_t i = 0; i < SVL / 128; i++) { + if (i % 2 == 0) { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row0); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, row1); + } else { + CHECK_MAT_ROW(AARCH64_REG_ZAQ0, i, uint64_t, row1); + CHECK_MAT_ROW(AARCH64_REG_ZAQ15, i, uint64_t, zeroRow); + } + } +} + +TEST_P(InstSme, ld1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0h.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1h.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 1, uint64_t, + fillNeon({0x9876543212345678, 0xDEADBEEFABCDEF01}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAS0, 3, uint64_t, + fillNeon({0x12345678DEADBEEF, 0xABCDEF0198765432}, SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, 1, uint64_t, + fillNeonCombined( + {0x12345678DEADBEEF, 0xABCDEF0198765432}, {0}, SVL / 8)); + + // Vertical + initialHeapData_.resize(SVL / 4); + uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x1, #1 + ptrue p0.s + mov w12, #1 + # Load and broadcast values from heap 
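+ # (vertical form: consecutive 32-bit heap elements go down a tile column, one
+ # element per row, rather than across a single row)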
+ ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za0v.s[w12, 2]}, p0/z, [x0] + + # Test for inactive lanes + mov x1, #0 + mov x3, #8 + # TODO change to addsvl when implemented + addvl x1, x1, #1 + udiv x1, x1, x3 + mov x2, #0 + whilelo p1.s, xzr, x1 + ld1w {za1v.s[w12, 0]}, p1/z, [x0, x2, lsl #2] + )"); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 1, uint32_t, + fillNeon( + {0x12345678, 0x98765432, 0xABCDEF01, 0xDEADBEEF}, SVL / 8)); + CHECK_MAT_COL(AARCH64_REG_ZAS0, 3, uint32_t, + fillNeon( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, SVL / 8)); + CHECK_MAT_COL( + AARCH64_REG_ZAS1, 1, uint32_t, + fillNeonCombined( + {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}, {0}, SVL / 8)); +} + +TEST_P(InstSme, ldr) { + // Horizontal + initialHeapData_.resize(SVL); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + ptrue p0.b + mov w12, #0 + # Load and broadcast values from heap + ldr za[w12, 0], [x0] + ldr za[w12, 2], [x0, #2, mul vl] + )"); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 0, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + CHECK_MAT_ROW(AARCH64_REG_ZAB0, 1, uint8_t, fillNeon({0}, SVL / 8)); + CHECK_MAT_ROW( + AARCH64_REG_ZAB0, 2, uint8_t, + fillNeon({0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, 0x98, + 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}, + SVL / 8)); + + for (uint16_t i = 3; i < SVL / 8; i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAB0, i, uint8_t, + fillNeon({0}, SVL / 8)); + } +} + +TEST_P(InstSme, smopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, smops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + smops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + smops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + smops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, 
#7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + smops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } +} + +TEST_P(InstSme, st1b) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint8_t* heap8 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, 0x56, 0x78, + 0x98, 0x76, 0x54, 0x32, 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8, src, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0h.b[w12, 3]}, p0/z, [x0, x1] + st1b {za0h.b[w12, 0]}, p0, [sp, x1] + st1b {za0h.b[w12, 3]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src[i % 16]); + EXPECT_EQ(getMemoryValue((SVL / 8) + i), src[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0h.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0h.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0h.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0h.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to memory + st1b {za0h.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint8_t* heap8_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDE, 0xAD, 0xBE, 0xEF, 0x12, 0x34, + 0x56, 0x78, 0x98, 0x76, 0x54, 0x32, + 0xAB, 0xCD, 0xEF, 0x01}; + fillHeap(heap8_vert, src_vert, SVL / 4); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x1] + ld1b {za0v.b[w12, 1]}, p0/z, [x0, x1] + st1b {za0v.b[w12, 0]}, p0, [sp, x1] + st1b {za0v.b[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 8); i++) { + EXPECT_EQ( + getMemoryValue(process_->getInitialStackPointer() - 4095 + i), + src_vert[i % 16]); + EXPECT_EQ(getMemoryValue((SVL / 8) + i), src_vert[i % 16]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #16 + ptrue p0.b + pfalse p1.b + zip1 p1.b, p0.b, p1.b + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1b {za0v.b[w12, 0]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x5] + # Store odd indexed elements to memory + st1b {za0v.b[w12, 0]}, p1, [x5] + + # Load entire row + ld1b {za0v.b[w13, 1]}, p0/z, [x0, x3] + # Store all 0s to memory + st1b {za0v.b[w12, 5]}, p0, [x6, x3] + # Store odd indexed elements to 
memory + st1b {za0v.b[w13, 1]}, p1, [x6, x3] + )"); + for (uint64_t i = 0; i < (SVL / 8); i += 2) { + EXPECT_EQ(getMemoryValue(400 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(400 + (i + 1)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + i), src[i % 16]); + EXPECT_EQ(getMemoryValue(800 + 16 + (i + 1)), 0); + } +} + +TEST_P(InstSme, st1d) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0h.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1h.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0h.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1h.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src[i % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3h.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3h.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1h.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0h.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d {za1h.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 32); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.d + + mov w12, #0 + ld1d {za0v.d[w12, 0]}, p0/z, [x0, x1, lsl #3] + ld1d {za1v.d[w12, 1]}, p0/z, [x0, x1, lsl #3] + st1d {za0v.d[w12, 0]}, p0, [sp, x1, lsl #3] + st1d {za1v.d[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 8)), + src_vert[i % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 8)), src_vert[i % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + zip1 p1.d, p0.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1d {za3v.d[w12, 0]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1d {za3v.d[w12, 0]}, p1, [x5] + + # Load entire row + ld1d {za1v.d[w13, 1]}, p0/z, [x0, x3, lsl #3] + # Store all 0s to memory + st1d {za0v.d[w12, 0]}, p0, [x6, x3, lsl #3] + # Store odd indexed elements to memory + st1d 
{za1v.d[w13, 1]}, p1, [x6, x3, lsl #3] + )"); + for (uint64_t i = 0; i < (SVL / 64); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 8)), src_vert[i % 2]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 8)), 0); + } +} + +TEST_P(InstSme, st1h) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint16_t* heap16 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16, src, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1h.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0h.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1h.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src[i % 8]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0h.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0h.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0h.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1h.h[w12, 0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0h.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + uint16_t* heap16_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEAD, 0xBEEF, 0x1234, 0x5678, + 0x9876, 0x5432, 0xABCD, 0xEF01}; + fillHeap(heap16_vert, src_vert, SVL / 8); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.h + + mov w12, #0 + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x1, lsl #1] + ld1h {za1v.h[w12, 1]}, p0/z, [x0, x1, lsl #1] + st1h {za0v.h[w12, 0]}, p0, [sp, x1, lsl #1] + st1h {za1v.h[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 16); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 2)), + src_vert[i % 8]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 2)), src_vert[i % 8]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #8 + ptrue p0.h + pfalse p1.b + zip1 p1.h, p0.h, p1.h + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1h {za0v.h[w12, 0]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1h {za0v.h[w12, 0]}, p1, [x5] + + # Load entire row + ld1h {za0v.h[w13, 1]}, p0/z, [x0, x3, lsl #1] + # Store all 0s to memory + st1h {za1v.h[w12, 
0]}, p0, [x6, x3, lsl #1] + # Store odd indexed elements to memory + st1h {za0v.h[w13, 1]}, p1, [x6, x3, lsl #1] + )"); + for (uint64_t i = 0; i < (SVL / 16); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 2)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 2)), src[i % 8]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 2)), 0); + } +} + +TEST_P(InstSme, st1q) { + // Horizontal + initialHeapData_.resize(SVL); + uint64_t* heap64 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64, src, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0h.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1h.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3h.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3h.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1h.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0h.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1h.q[w13, 0]}, p1, [x6, x3, lsl #4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } + + // Vertical + initialHeapData_.resize(SVL); + uint64_t* heap64_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF12345678, 0x98765432ABCDEF01}; + fillHeap(heap64_vert, src_vert, SVL / 8); + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.b + + mov w12, #0 + mov w13, #1 + ld1q {za0v.q[w12, 0]}, p0/z, [x0, x1, lsl #4] + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x1, lsl #4] + st1q {za0v.q[w12, 0]}, p0, [sp, x1, lsl #4] + st1q {za1v.q[w13, 0]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() 
- + 4095 + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i) * 8)), + src_vert[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue((SVL / 8) + ((2 * i + 1) * 8)), + src_vert[(2 * i + 1) % 2]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #2 + ptrue p0.d + pfalse p1.b + # Do zip1 twice to get on-off for 128-bit + zip1 p1.d, p0.d, p1.d + zip1 p1.d, p1.d, p1.d + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #2 + # Load entire row + ld1q {za3v.q[w12, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1q {za3v.q[w12, 0]}, p1, [x5] + + # Load entire row + ld1q {za1v.q[w13, 0]}, p0/z, [x0, x3, lsl #4] + # Store all 0s to memory + st1q {za0v.q[w12, 0]}, p0, [x6, x3, lsl #4] + # Store odd indexed elements to memory + st1q {za1v.q[w13, 0]}, p1, [x6, x3, lsl #4] + )"); + for (uint64_t i = 0; i < (SVL / 128); i += 2) { + EXPECT_EQ(getMemoryValue(400 + ((2 * i) * 8)), src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(400 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(400 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(400 + (((2 * i + 1) + 2) * 8)), 0); + + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i) * 8)), + src[(2 * i) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + ((2 * i + 1) * 8)), + src[(2 * i + 1) % 2]); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i) + 2) * 8)), 0); + EXPECT_EQ(getMemoryValue(800 + 32 + (((2 * i + 1) + 2) * 8)), 0); + } +} + +TEST_P(InstSme, st1w) { + // Horizontal + initialHeapData_.resize(SVL / 4); + uint32_t* heap32 = reinterpret_cast(initialHeapData_.data()); + std::vector src = {0xDEADBEEF, 0x12345678, 0x98765432, 0xABCDEF01}; + fillHeap(heap32, src, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0h.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1h.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0h.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1h.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3h.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3h.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1h.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0h.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1h.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } + + // Vertical + initialHeapData_.resize(SVL / 4); + 
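+ // fillHeap appears to tile the 4-word source pattern across SVL/16 uint32_t
+ // entries (the whole SVL/4-byte buffer), so heap entry i holds src_vert[i % 4].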
uint32_t* heap32_vert = reinterpret_cast(initialHeapData_.data()); + std::vector src_vert = {0xDEADBEEF, 0x12345678, 0x98765432, + 0xABCDEF01}; + fillHeap(heap32_vert, src_vert, SVL / 16); + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + sub sp, sp, #4095 + mov x1, #0 + mov x4, #0 + addvl x4, x4, #1 + ptrue p0.s + + mov w12, #0 + ld1w {za0v.s[w12, 0]}, p0/z, [x0, x1, lsl #2] + ld1w {za1v.s[w12, 1]}, p0/z, [x0, x1, lsl #2] + st1w {za0v.s[w12, 0]}, p0, [sp, x1, lsl #2] + st1w {za1v.s[w12, 1]}, p0, [x4] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + EXPECT_EQ(getMemoryValue(process_->getInitialStackPointer() - + 4095 + (i * 4)), + src_vert[i % 4]); + EXPECT_EQ(getMemoryValue((SVL / 8) + (i * 4)), src_vert[i % 4]); + } + + RUN_AARCH64(R"( + # Get heap address + mov x0, 0 + mov x8, 214 + svc #0 + + smstart + + zero {za} + + mov x3, #4 + ptrue p0.s + pfalse p1.b + zip1 p1.s, p0.s, p1.s + mov x5, #400 + mov x6, #800 + + mov w12, #0 + mov w13, #1 + # Load entire row + ld1w {za3v.s[w12, 0]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x5] + # Store odd indexed elements to memory + st1w {za3v.s[w12, 0]}, p1, [x5] + + # Load entire row + ld1w {za1v.s[w13, 1]}, p0/z, [x0, x3, lsl #2] + # Store all 0s to memory + st1w {za0v.s[w12, 0]}, p0, [x6, x3, lsl #2] + # Store odd indexed elements to memory + st1w {za1v.s[w13, 1]}, p1, [x6, x3, lsl #2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i += 2) { + EXPECT_EQ(getMemoryValue(400 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(400 + ((i + 1) * 4)), 0); + EXPECT_EQ(getMemoryValue(800 + 16 + (i * 4)), src_vert[i % 4]); + EXPECT_EQ(getMemoryValue(800 + 16 + ((i + 1) * 4)), 0); + } +} + +TEST_P(InstSme, str) { + RUN_AARCH64(R"( + smstart + + zero {za} + + dup z0.b, #2 + dup z1.b, #5 + ptrue p0.b + ptrue p1.b + + # Fill first 32-bit ZA tile with 40 in every element + umopa za0.s, p0/m, p1/m, z0.b, z1.b + + dup z0.b, #1 + dup z1.b, #5 + + # Fill third 32-bit ZA tile with 20 in every element + umopa za2.s, p0/m, p1/m, z0.b, z1.b + + mov x2, #600 + mov w12, #0 + + # ZA sub tiles are interleaved, so 0th, 4th, 8th... rows will be for za0.s + # 2nd, 6th, 10th ... 
rows will be for za2.s + str za[w12, #0], [x2] + str za[w12, #1], [x2, #1, mul vl] + str za[w12, #2], [x2, #2, mul vl] + str za[w12, #3], [x2, #3, mul vl] + + # Store 8th row (3rd row of za0.s) + add w12, w12, #8 + mov x3, #0 + addvl x3, x3, #4 + add x2, x2, x3 + str za[w12, #0], [x2] + + # Store 10th row (3rd row of za2.s) + add w12, w12, #2 + mov x3, #0 + addvl x3, x3, #1 + add x2, x2, x3 + str za[w12, #0], [x2] + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({40}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS1, i, uint32_t, + fillNeon({0}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({20}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS3, i, uint32_t, + fillNeon({0}, (SVL / 8))); + } + const uint64_t SVL_bytes = SVL / 8; + for (uint64_t i = 0; i < (SVL / 32); i++) { + const uint64_t off = i * sizeof(uint32_t); + EXPECT_EQ(getMemoryValue(600 + off), 40); + EXPECT_EQ(getMemoryValue(600 + SVL_bytes + off), 0); + EXPECT_EQ(getMemoryValue(600 + (2 * SVL_bytes) + off), 20); + EXPECT_EQ(getMemoryValue(600 + (3 * SVL_bytes) + off), 0); + EXPECT_EQ(getMemoryValue(600 + (4 * SVL_bytes) + off), 40); + EXPECT_EQ(getMemoryValue(600 + (5 * SVL_bytes) + off), 20); + } +} + +TEST_P(InstSme, sumopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 65535 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 65534 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, 
i, int64_t, + fillNeon({1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, sumops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #-8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #-7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 255 + dup z1.b, #3 + dup z2.b, #-1 + ptrue p0.b + ptrue p1.b + + zero {za} + + sumops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is signed, z4 is unsigned so will become 254 + dup z3.b, #7 + dup z4.b, #-2 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + sumops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-3060}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #-8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #-7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is signed, z2 is unsigned so will become 65535 + dup z1.h, #3 + dup z2.h, #-1 + ptrue p0.h + ptrue p1.h + + zero {za} + + sumops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is signed, z4 is unsigned so will become 65534 + dup z3.h, #7 + dup z4.h, #-2 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + sumops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-786420}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); + } +} + +TEST_P(InstSme, umopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #3 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({112}, (SVL / 16))); + } +} + +TEST_P(InstSme, umops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup
z1.b, #8 + dup z2.b, #3 + dup z3.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + umopa za0.s, p0/m, p1/m, z1.b, z2.b + umops za0.s, p0/m, p1/m, z1.b, z3.b + + dup z3.b, #7 + dup z4.b, #4 + dup z5.b, #3 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + umopa za2.s, p0/m, p2/m, z3.b, z4.b + umops za2.s, p0/m, p2/m, z3.b, z5.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, uint32_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, uint32_t, + fillNeon({28}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #3 + dup z3.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + umopa za0.d, p0/m, p1/m, z1.h, z2.h + umops za0.d, p0/m, p1/m, z1.h, z3.h + + dup z3.h, #7 + dup z4.h, #4 + dup z5.h, #3 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + umopa za2.d, p0/m, p2/m, z3.h, z4.h + umops za2.d, p0/m, p2/m, z3.h, z5.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, uint64_t, + fillNeon({32}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, uint64_t, + fillNeon({28}, (SVL / 16))); + } +} + +TEST_P(InstSme, usmopa) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmopa za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is signed + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmopa za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is signed + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmopa za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmopa za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({1834952}, (SVL /
16))); + } +} + +TEST_P(InstSme, usmops) { + // 32-bit + RUN_AARCH64(R"( + smstart + + dup z1.b, #8 + dup z2.b, #-3 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + dup z3.b, #7 + dup z4.b, #-4 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 253, z2 is signed + dup z1.b, #-3 + dup z2.b, #2 + ptrue p0.b + ptrue p1.b + + zero {za} + + usmops za0.s, p0/m, p1/m, z1.b, z2.b + + # z3 is unsigned so will become 254, z4 is signed + dup z3.b, #-2 + dup z4.b, #7 + mov x0, #0 + mov x1, #2 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.b, xzr, x0 + + usmops za2.s, p0/m, p2/m, z3.b, z4.b + )"); + for (uint64_t i = 0; i < (SVL / 32); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAS0, i, int32_t, + fillNeon({-2024}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAS2, i, int32_t, + fillNeon({-7112}, (SVL / 16))); + } + + // 64-bit + RUN_AARCH64(R"( + smstart + + dup z1.h, #8 + dup z2.h, #-3 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + dup z3.h, #7 + dup z4.h, #-4 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({96}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({112}, (SVL / 16))); + } + + RUN_AARCH64(R"( + smstart + + # z1 is unsigned so will become 65533, z2 is signed + dup z1.h, #-3 + dup z2.h, #2 + ptrue p0.h + ptrue p1.h + + zero {za} + + usmops za0.d, p0/m, p1/m, z1.h, z2.h + + # z3 is unsigned so will become 65534, z4 is signed + dup z3.h, #-2 + dup z4.h, #7 + mov x0, #0 + mov x1, #4 + addvl x0, x0, #1 + udiv x0, x0, x1 + whilelo p2.h, xzr, x0 + + usmops za2.d, p0/m, p2/m, z3.h, z4.h + )"); + for (uint64_t i = 0; i < (SVL / 64); i++) { + CHECK_MAT_ROW(AARCH64_REG_ZAD0, i, int64_t, + fillNeon({-524264}, (SVL / 8))); + CHECK_MAT_ROW(AARCH64_REG_ZAD2, i, int64_t, + fillNeon({-1834952}, (SVL / 16))); } } diff --git a/test/unit/aarch64/ArchitectureTest.cc b/test/unit/aarch64/ArchitectureTest.cc index dbc1fa65ac..8f2619a283 100644 --- a/test/unit/aarch64/ArchitectureTest.cc +++ b/test/unit/aarch64/ArchitectureTest.cc @@ -117,6 +117,7 @@ TEST_F(AArch64ArchitectureTest, predecode) { EXPECT_EQ(result, 4); EXPECT_EQ(output[0]->getInstructionAddress(), 0x4); EXPECT_EQ(output[0]->exceptionEncountered(), false); + EXPECT_EQ(output[0]->getGroup(), InstructionGroups::SVE_DIV_OR_SQRT); } TEST_F(AArch64ArchitectureTest, getSystemRegisterTag) { @@ -239,6 +240,23 @@ TEST_F(AArch64ArchitectureTest, get_set_SVCRVal) { EXPECT_EQ(arch->getSVCRval(), 3); } +TEST_F(AArch64ArchitectureTest, isSM_ZA_enabled) { + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(1); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); + arch->setSVCRval(2); + EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(3); + EXPECT_TRUE(arch->isStreamingModeEnabled()); + EXPECT_TRUE(arch->isZARegisterEnabled()); + arch->setSVCRval(0); + 
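// Illustrative sketch only (not from the patch): the SVCR bit layout that the
// isSM_ZA_enabled checks here pin down. Bit 0 of SVCR enables SVE streaming
// mode and bit 1 enables the ZA register; the constexpr helpers below are
// hypothetical and are not the Architecture class API.
#include <cstdint>

constexpr bool streamingModeBit(uint64_t svcr) { return svcr & 0b01; }
constexpr bool zaRegisterBit(uint64_t svcr) { return svcr & 0b10; }

static_assert(!streamingModeBit(0) && !zaRegisterBit(0), "SVCR = 0: neither");
static_assert(streamingModeBit(1) && !zaRegisterBit(1), "SVCR = 1: SM only");
static_assert(!streamingModeBit(2) && zaRegisterBit(2), "SVCR = 2: ZA only");
static_assert(streamingModeBit(3) && zaRegisterBit(3), "SVCR = 3: SM and ZA");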
EXPECT_FALSE(arch->isStreamingModeEnabled()); + EXPECT_FALSE(arch->isZARegisterEnabled()); +} + } // namespace aarch64 } // namespace arch } // namespace simeng diff --git a/test/unit/aarch64/InstructionTest.cc b/test/unit/aarch64/InstructionTest.cc index 1ecf14a1a6..06ab76c1e3 100644 --- a/test/unit/aarch64/InstructionTest.cc +++ b/test/unit/aarch64/InstructionTest.cc @@ -58,6 +58,18 @@ class AArch64InstructionTest : public testing::Test { &rawInsn_cbz); cbzMetadata = std::make_unique(rawInsn_cbz); + // psel + cs_insn rawInsn_psel; + cs_detail rawDetail_psel; + rawInsn_psel.detail = &rawDetail_psel; + size_t size_psel = 4; + uint64_t address_psel = 0; + const uint8_t* encoding_psel = + reinterpret_cast(pselInstrBytes.data()); + cs_disasm_iter(capstoneHandle, &encoding_psel, &size_psel, &address_psel, + &rawInsn_psel); + pselMetadata = std::make_unique(rawInsn_psel); + const uint8_t* badEncoding = reinterpret_cast(invalidInstrBytes.data()); invalidMetadata = std::make_unique(badEncoding); @@ -74,6 +86,8 @@ class AArch64InstructionTest : public testing::Test { std::array ldpInstrBytes = {0x61, 0x08, 0x40, 0xA9}; // cbz x2, #0x28 std::array cbzInstrBytes = {0x42, 0x01, 0x00, 0xB4}; + // psel p4, p0, p2.s[w13, 0] + std::array pselInstrBytes = {0x44, 0x40, 0x31, 0x25}; std::array invalidInstrBytes = {0x20, 0x00, 0x02, 0x8c}; // A Capstone decoding library handle, for decoding instructions. @@ -85,6 +99,7 @@ class AArch64InstructionTest : public testing::Test { std::unique_ptr fdivMetadata; std::unique_ptr ldpMetadata; std::unique_ptr cbzMetadata; + std::unique_ptr pselMetadata; std::unique_ptr invalidMetadata; std::unique_ptr uopInfo; InstructionException exception; @@ -182,7 +197,7 @@ TEST_F(AArch64InstructionTest, invalidInsn_1) { } EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); EXPECT_EQ(insn.getInstructionAddress(), 0x44); EXPECT_EQ(insn.getInstructionId(), 13); @@ -248,7 +263,7 @@ TEST_F(AArch64InstructionTest, invalidInsn_2) { } EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH_NOSHIFT); EXPECT_EQ(insn.getInstructionAddress(), 0x43); EXPECT_EQ(insn.getInstructionId(), 15); diff --git a/test/unit/riscv/InstructionTest.cc b/test/unit/riscv/InstructionTest.cc index 6103cd4f5c..64eff7071c 100644 --- a/test/unit/riscv/InstructionTest.cc +++ b/test/unit/riscv/InstructionTest.cc @@ -178,7 +178,7 @@ TEST_F(RiscVInstructionTest, invalidInsn_1) { } EXPECT_EQ(insn.getException(), InstructionException::EncodingUnallocated); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); EXPECT_EQ(insn.getInstructionAddress(), 0x44); EXPECT_EQ(insn.getInstructionId(), 13); @@ -242,7 +242,7 @@ TEST_F(RiscVInstructionTest, invalidInsn_2) { } EXPECT_EQ(insn.getException(), InstructionException::HypervisorCall); EXPECT_EQ(insn.getGeneratedAddresses().size(), 0); - // Default Group + // Default Group for instruction that is not decoded EXPECT_EQ(insn.getGroup(), InstructionGroups::INT_SIMPLE_ARTH); EXPECT_EQ(insn.getInstructionAddress(), 0x43); 
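// Illustrative sketch only (not from the patch): the fallback behaviour that
// the "Default Group for instruction that is not decoded" comments above
// describe. The names below (kDefaultGroup, SketchInsn, sketchDecode) are
// hypothetical, not SimEng APIs; the point is that a group assigned up front
// survives a failed decode, so getGroup() stays well defined even when an
// exception is flagged.
#include <cstdint>

constexpr uint16_t kDefaultGroup = 1;  // stands in for an INT_SIMPLE_* group

struct SketchInsn {
  bool exceptionEncountered = false;
  uint16_t group = kDefaultGroup;  // set before decoding is attempted
};

inline SketchInsn sketchDecode(uint32_t encoding) {
  SketchInsn insn;
  if (encoding == 0x8c020020) {        // the unallocated encoding used above
    insn.exceptionEncountered = true;  // flag the failure ...
    return insn;                       // ... but keep the default group
  }
  insn.group = 2;  // a real decoder would derive the group from the encoding
  return insn;
}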
EXPECT_EQ(insn.getInstructionId(), 15);
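// Illustrative sketch only (not from the patch): the expected-value arithmetic
// behind the SME *mopa/*mops regression checks earlier in this diff.
// expectedOuterProductAcc() is a hypothetical helper, not part of the test
// suite. With byte sources feeding a 32-bit ZA tile (or halfword sources
// feeding a 64-bit tile), each destination element accumulates four widened
// products, and operands declared unsigned are reinterpreted first, so an
// int8_t of -1 contributes 255 while an int16_t of -1 contributes 65535.
#include <cstdint>

// subtract = false models *mopa (accumulate), true models *mops (subtract);
// ZA is zeroed beforehand in these tests, so the result is +/- the summed
// products.
constexpr int64_t expectedOuterProductAcc(int64_t zn, int64_t zm,
                                          bool subtract) {
  const int64_t sum = 4 * zn * zm;  // four widened products per element
  return subtract ? -sum : sum;
}

// Examples matching the checks above (operands already reinterpreted):
static_assert(expectedOuterProductAcc(8, 3, false) == 96);          // umopa
static_assert(expectedOuterProductAcc(3, 255, true) == -3060);      // sumops .s
static_assert(expectedOuterProductAcc(3, 65535, true) == -786420);  // sumops .d
static_assert(expectedOuterProductAcc(65533, 2, false) == 524264);  // usmopa .d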