diff --git a/src/include/simeng/arch/aarch64/helpers/neon.hh b/src/include/simeng/arch/aarch64/helpers/neon.hh
index c2626b7e91..e5cf3dd3aa 100644
--- a/src/include/simeng/arch/aarch64/helpers/neon.hh
+++ b/src/include/simeng/arch/aarch64/helpers/neon.hh
@@ -558,6 +558,39 @@ RegisterValue vecLogicOp_3vecs(srcValContainer& sourceValues,
   return {out, 256};
 }
 
+/** Helper function for NEON instructions with the format `uaddlv rd, Vn.T`.
+ * T represents the type used to accumulate and return the sum (e.g. for
+ * `uaddlv hd, vn.8b`, T = uint32_t).
+ * U represents the element type of sourceValues[0] (e.g. for vn.8b, U =
+ * uint8_t).
+ * I represents the number of source elements to be summed (e.g. for vn.8b,
+ * I = 8).
+ * Returns correctly formatted RegisterValue. */
+template <typename T, typename U, int I>
+RegisterValue vecAddlv(srcValContainer& sourceValues) {
+  const U* src = sourceValues[0].getAsVector<U>();
+  T sum = 0;
+  for (int lane = 0; lane < I; lane++) {
+    sum += src[lane];
+  }
+  return {sum, 256};
+}
+
+/** Helper function for NEON instructions with the format `umaxv rd, Vn.T`.
+ * T represents the type of sourceValues (e.g. for vn.4s, T = uint32_t).
+ * I represents the number of source elements to be reduced (e.g. for vn.8b,
+ * I = 8).
+ * Returns correctly formatted RegisterValue. */
+template <typename T, int I>
+RegisterValue vecUMaxV(srcValContainer& sourceValues) {
+  const T* src = sourceValues[0].getAsVector<T>();
+  T maxElem = src[0];
+  for (int lane = 1; lane < I; lane++) {
+    maxElem = (src[lane] < maxElem) ? maxElem : src[lane];
+  }
+  return {maxElem, 256};
+}
+
 /** Helper function for NEON instructions with the format `umaxp vd, vn, vm`.
  * T represents the type of sourceValues (e.g. for vn.2d, T = uint64_t).
  * I represents the number of elements in the output array to be updated (e.g.
diff --git a/src/include/simeng/arch/aarch64/helpers/sve.hh b/src/include/simeng/arch/aarch64/helpers/sve.hh
index 2c33ccfbe6..6d4c0df66a 100644
--- a/src/include/simeng/arch/aarch64/helpers/sve.hh
+++ b/src/include/simeng/arch/aarch64/helpers/sve.hh
@@ -257,6 +257,32 @@ RegisterValue sveCpy_imm(
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `cpy zd, pg/m, rn`
+ * or `cpy zd, pg/m, vn`.
+ * T represents the type of sourceValues (e.g. for zd.d, T = int64_t).
+ * Returns correctly formatted RegisterValue. */
+template <typename T>
+RegisterValue sveCpy_Scalar(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata,
+    const uint16_t VL_bits) {
+  const T* merge = sourceValues[0].getAsVector<T>();
+  const uint64_t* pred = sourceValues[1].getAsVector<uint64_t>();
+  const T scalar = sourceValues[2].get<T>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  T out[256 / sizeof(T)] = {0};
+
+  for (int lane = 0; lane < partition_num; lane++) {
+    // Each element owns sizeof(T) predicate bits; only the lowest is tested
+    const uint64_t laneBit = 1ull << ((lane % (64 / sizeof(T))) * sizeof(T));
+    const bool active = (pred[lane / (64 / sizeof(T))] & laneBit) != 0;
+    // Active lanes take the scalar, inactive lanes keep the merge value
+    out[lane] = active ? scalar : merge[lane];
+  }
+  return {out, 256};
+}
+
 /** Helper function for SVE instructions with the format `dec<b,d,h,s> xdn{,
  * pattern{, MUL #imm}}`.
  * T represents the type of operation (e.g. for DECD, T = uint64_t).
@@ -849,6 +875,132 @@ RegisterValue sveFsqrtPredicated_2vecs(srcValContainer& sourceValues,
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `ftsmul zd, zn, zm`.
+ * T represents the type of sourceValues (e.g. for zn.d, T = double).
+ * U represents the same precision as T, but as an integer type for the second
+ * source register.
+ * Returns correctly formatted RegisterValue. */
+template <typename T, typename U>
+RegisterValue sveFTrigSMul(srcValContainer& sourceValues,
+                           const uint16_t VL_bits) {
+  const T* n = sourceValues[0].getAsVector<T>();
+  const U* m = sourceValues[1].getAsVector<U>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  T out[256 / sizeof(T)] = {0};
+
+  const U bit_0_mask = static_cast<U>(1);  // Bit 0 is the least significant bit
+  // Square each element in the first source vector and then set the sign bit
+  // to a copy of bit 0 of the corresponding element in the second source
+  // register
+  for (int i = 0; i < partition_num; i++) {
+    out[i] = n[i] * n[i];
+    T sign_bit = m[i] & bit_0_mask ? -1.0 : 1.0;
+    out[i] = std::abs(out[i]) * sign_bit;
+  }
+
+  return {out, 256};
+}
+
+/** Helper function for SVE instructions with the format `ftssel zd, zn, zm`.
+ * T represents the type of sourceValues (e.g. for zn.d, T = double).
+ * U represents the same precision as T, but as an integer type for the second
+ * source register.
+ * Returns correctly formatted RegisterValue. */
+template <typename T, typename U>
+RegisterValue sveFTrigSSel(srcValContainer& sourceValues,
+                           const uint16_t VL_bits) {
+  const T* n = sourceValues[0].getAsVector<T>();
+  const U* m = sourceValues[1].getAsVector<U>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  T out[256 / sizeof(T)] = {0};
+
+  const U bit_0_mask = static_cast<U>(1);       // Least significant bit
+  const U bit_1_mask = static_cast<U>(1) << 1;  // Second least significant bit
+
+  // Place the value 1.0 or a copy of the first source vector element in the
+  // destination element, depending on bit 0 of the corresponding element of
+  // the second source vector. The destination element is then negated when
+  // bit 1 of the second source vector element is set
+  for (int i = 0; i < partition_num; i++) {
+    out[i] = m[i] & bit_0_mask ? static_cast<T>(1.0) : n[i];
+    out[i] = m[i] & bit_1_mask ? -out[i] : out[i];
+  }
+  return {out, 256};
+}
+
+/** Helper function for SVE instructions with the format `ftmad zd, zn, zm,
+ * #imm`. Each result is zn * |zm| + coeff, computed as one fused operation.
+ * T represents the type of sourceValues (e.g. for zn.d, T = double).
+ * Returns correctly formatted RegisterValue. **/
+template <typename T>
+RegisterValue sveFTrigMad(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata,
+    const uint16_t VL_bits) {
+  const T* n = sourceValues[1].getAsVector<T>();
+  const T* m = sourceValues[2].getAsVector<T>();
+  const uint8_t imm = static_cast<uint8_t>(metadata.operands[3].imm);
+
+  static constexpr std::array<double, 8> sin64 = {1.0,
+                                                  -0.1666666666666661,
+                                                  0.8333333333320002e-02,
+                                                  -0.1984126982840213e-03,
+                                                  0.2755731329901505e-05,
+                                                  -0.2505070584637887e-07,
+                                                  0.1589413637195215e-09,
+                                                  0.0};
+
+  static constexpr std::array<double, 8> cos64 = {1.0,
+                                                  -0.5000000000000000,
+                                                  0.4166666666666645e-01,
+                                                  -0.1388888888886111e-02,
+                                                  0.2480158728388683e-04,
+                                                  -0.2755731309913950e-06,
+                                                  0.2087558253975872e-08,
+                                                  -0.1135338700720054e-10};
+
+  static constexpr std::array<float, 8> sin32 = {1.0f,
+                                                 -1.666666716337e-01f,
+                                                 8.333330973983e-03f,
+                                                 -1.983967522392e-04f,
+                                                 2.721174723774e-06f,
+                                                 0.0f,
+                                                 0.0f,
+                                                 0.0f};
+
+  static constexpr std::array<float, 8> cos32 = {1.0f,
+                                                 -5.000000000000e-01f,
+                                                 4.166664928198e-02f,
+                                                 -1.388759003021e-03f,
+                                                 2.446388680255e-05f,
+                                                 0.0f,
+                                                 0.0f,
+                                                 0.0f};
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  T out[256 / sizeof(T)] = {0};
+
+  for (int i = 0; i < partition_num; i++) {
+    T coeff;
+    const bool sign_bit = std::signbit(m[i]);
+    // Single precision: a negative zm element selects the cosine coefficients
+    if (sizeof(T) == 4) {
+      coeff = sign_bit ? cos32[imm] : sin32[imm];
+    }
+    // Double precision: same selection from the 64-bit coefficient tables
+    else {
+      coeff = sign_bit ? cos64[imm] : sin64[imm];
+    }
+    // TODO: Add FP16 support if/when we eventually support these (may require
+    // C++23)
+    out[i] = std::fma(n[i], std::abs(m[i]), coeff);
+  }
+
+  return {out, 256};
+}
+
 /** Helper function for SVE instructions with the format `inc<b, d, h, w>
  * xdn{, pattern{, MUL #imm}}`.
  * T represents the type of operation (e.g. for INCB, T = int8_t).
@@ -936,6 +1088,63 @@ RegisterValue sveIndex(
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `lastb vd, pg, zn`.
+ * T represents the vector register type (e.g. zn.d would be uint64_t).
+ * Returns correctly formatted RegisterValue. */
+template <typename T>
+RegisterValue sveLastBSimdScalar(srcValContainer& sourceValues,
+                                 const uint16_t VL_bits) {
+  const uint64_t* pred = sourceValues[0].getAsVector<uint64_t>();
+  const T* src = sourceValues[1].getAsVector<T>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+
+  // Scan from the highest element downwards, stopping at the first (i.e.
+  // last-numbered) active element
+  int selected = 0;
+  bool anyActive = false;
+  for (int lane = partition_num - 1; lane >= 0 && !anyActive; lane--) {
+    const uint64_t laneBit = 1ull << ((lane % (64 / sizeof(T))) * sizeof(T));
+    anyActive = (pred[lane / (64 / sizeof(T))] & laneBit) != 0;
+    if (anyActive) selected = lane;
+  }
+  // If no active lane has been found, select highest element instead
+  if (!anyActive && partition_num > 0) selected = partition_num - 1;
+  return {src[selected], 256};
+}
+
+/** Helper function for SVE instructions with the format `clastb vdn, pg, vdn,
+ * zn`.
+ * T represents the vector register type (e.g. zn.d would be uint64_t).
+ * Returns correctly formatted RegisterValue. */
+template <typename T>
+RegisterValue sveCLastBSimdScalar(srcValContainer& sourceValues,
+                                  const uint16_t VL_bits) {
+  const uint64_t* pred = sourceValues[0].getAsVector<uint64_t>();
+  const T* scalarSrc = sourceValues[1].getAsVector<T>();
+  const T* vecSrc = sourceValues[2].getAsVector<T>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+
+  // Walk down from the highest element; the first active element met this
+  // way is the last active element of the vector
+  int lane = partition_num;
+  while (--lane >= 0) {
+    const uint64_t laneBit = 1ull << ((lane % (64 / sizeof(T))) * sizeof(T));
+    if (pred[lane / (64 / sizeof(T))] & laneBit) {
+      break;
+    }
+  }
+
+  // `lane` is -1 when no element is active. In that case the result is
+  // element 0 of the first non-predicate source operand, otherwise it is the
+  // last active element of the second
+  if (lane < 0) {
+    return {scalarSrc[0], 256};
+  }
+  return {vecSrc[lane], 256};
+}
+
 /** Helper function for SVE instructions with the format `<AND, EOR, ...>
  * pd, pg/z, pn, pm`.
  * T represents the type of sourceValues (e.g. for pn.d, T = uint64_t).
@@ -1268,7 +1477,8 @@ RegisterValue sveOrr_3vecs(srcValContainer& sourceValues,
 /** Helper function for SVE2 instructions with the format `psel pd, pn,
  * pm.t[wa, #imm]`.
  * T represents the type of sourceValues (e.g. for pm.d, T =
- * uint64_t). Returns an array of 4 uint64_t elements. */
+ * uint64_t).
+ * Returns an array of 4 uint64_t elements. */
 template <typename T>
 std::array<uint64_t, 4> svePsel(
     srcValContainer& sourceValues,
@@ -1293,6 +1503,63 @@ std::array<uint64_t, 4> svePsel(
   return out;
 }
 
+/** Helper function for SVE instructions with the format `pfirst pdn.b, pg,
+ * pdn.b`. Marked inline as this non-template function lives in a header.
+ * Returns an array of 4 uint64_t elements, and updates the NZCV flags.
+ */
+inline std::tuple<std::array<uint64_t, 4>, uint8_t> svePfirst(
+    srcValContainer& sourceValues, const uint16_t VL_bits) {
+  const uint16_t partition_num = VL_bits / 8;
+  const uint64_t* p = sourceValues[0].getAsVector<uint64_t>();
+  const uint64_t* dn = sourceValues[1].getAsVector<uint64_t>();
+  // Set destination d as source n to copy all false lanes and the active lanes
+  // beyond the first
+  std::array<uint64_t, 4> out = {dn[0], dn[1], dn[2], dn[3]};
+  // Get the first active lane and set same lane in destination predicate
+  for (int i = 0; i < partition_num; i++) {
+    uint64_t shifted_active = 1ull << ((i % (64)));
+    if (p[i / 64] & shifted_active) {
+      out[i / 64] |= shifted_active;
+      break;
+    }
+  }
+  return {out, getNZCVfromPred(out, VL_bits, 1)};
+}
+
+/** Helper function for SVE instructions with the format `pnext pdn, pv, pdn`.
+ * T represents the type of sourceValues (e.g. for pdn.d, T = uint64_t).
+ * Returns an array of 4 uint64_t elements, and updates the NZCV flags. */
+template <typename T>
+std::tuple<std::array<uint64_t, 4>, uint8_t> svePnext(
+    srcValContainer& sourceValues,
+    const simeng::arch::aarch64::InstructionMetadata& metadata,
+    const uint16_t VL_bits) {
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  const uint64_t* pv = sourceValues[0].getAsVector<uint64_t>();
+  const uint64_t* pdn = sourceValues[1].getAsVector<uint64_t>();
+  // Every destination element starts out as false
+  std::array<uint64_t, 4> out = {0, 0, 0, 0};
+
+  // Mask selecting the predicate bit of element `lane` within its 64-bit word
+  const auto laneBit = [](int lane) -> uint64_t {
+    return 1ull << ((lane % (64 / sizeof(T))) * sizeof(T));
+  };
+  // Step to the element after the last true element of pdn (0 if none is set)
+  int next = partition_num;
+  while (--next >= 0) {
+    if (pdn[next / (64 / sizeof(T))] & laneBit(next)) break;
+  }
+  next++;
+  // Set only the first element at or after `next` that is true in pv
+  for (; next < partition_num; next++) {
+    if (pv[next / (64 / sizeof(T))] & laneBit(next)) {
+      out[next / (64 / sizeof(T))] |= laneBit(next);
+      break;
+    }
+  }
+  return {out, getNZCVfromPred(out, VL_bits, sizeof(T))};
+}
+
 /** Helper function for SVE instructions with the format `ptrue pd{,
  * pattern}.
  * T represents the type of sourceValues (e.g. for pd.d, T = uint64_t).
@@ -1425,6 +1692,51 @@ RegisterValue sveSminv(srcValContainer& sourceValues, const uint16_t VL_bits) {
   return {out, 256};
 }
 
+/** Helper function for SVE instructions with the format `splice zd, pg, zn,
+ * zm`.
+ * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t).
+ * Returns correctly formatted RegisterValue. */
+template <typename T>
+RegisterValue sveSplice(srcValContainer& sourceValues, const uint16_t VL_bits) {
+  const uint64_t* pred = sourceValues[0].getAsVector<uint64_t>();
+  const T* n = sourceValues[1].getAsVector<T>();
+  const T* m = sourceValues[2].getAsVector<T>();
+
+  const uint16_t partition_num = VL_bits / (sizeof(T) * 8);
+  T out[256 / sizeof(T)] = {0};
+
+  // True when the predicate bit of element `lane` is set
+  const auto isActive = [pred](int lane) -> bool {
+    const uint64_t laneBit = 1ull << ((lane % (64 / sizeof(T))) * sizeof(T));
+    return (pred[lane / (64 / sizeof(T))] & laneBit) != 0;
+  };
+
+  // Locate the last active element. The index stays at 0 when no element is
+  // active, in which case the copy loop below selects nothing from n
+  int last = 0;
+  for (int lane = partition_num - 1; lane >= 0; lane--) {
+    if (isActive(lane)) {
+      last = lane;
+      break;
+    }
+  }
+
+  // Copy the region of n running from the first active element up to `last`
+  // into the lowest elements of the destination operand
+  bool started = false;
+  int filled = 0;
+  for (int lane = 0; lane <= last; lane++) {
+    started = started || isActive(lane);
+    if (started) out[filled++] = n[lane];
+  }
+
+  // Set any unassigned elements to the lowest elements in m
+  for (int src = 0; filled < partition_num; src++, filled++) {
+    out[filled] = m[src];
+  }
+  return {out, 256};
+}
+
 /** Helper function for SVE instructions with the format `Sub zd, zn,
  * zm`.
  * T represents the type of sourceValues (e.g. for zn.d, T = uint64_t).
@@ -1634,33 +1946,31 @@ RegisterValue sveUzp_vecs(srcValContainer& sourceValues, const uint16_t VL_bits,
   return {out, 256};
 }
 
-/** Helper function for SVE instructions with the format `whilelo pd,
- * <w,x>n, <w,x>m`.
+/** Helper function for SVE instructions with the format `while<ge, gt, hi, hs,
+ * le, lo, ls, lt> pd, <w,x>n, <w,x>m`. Lane i is active when func(n + i, m).
  * T represents the type of sourceValues n and m (e.g. for wn, T = uint32_t).
  * P represents the type of operand p (e.g. for pd.b, P = uint8_t).
  * Returns tuple of type [pred results (array of 4 uint64_t), nzcv]. */
 template <typename T, typename P>
-std::tuple<std::array<uint64_t, 4>, uint8_t> sveWhilelo(
-    srcValContainer& sourceValues, const uint16_t VL_bits, bool calcNZCV) {
+std::tuple<std::array<uint64_t, 4>, uint8_t> sveWhile(
+    srcValContainer& sourceValues, const uint16_t VL_bits,
+    std::function<bool(T, T)> func) {
   const T n = sourceValues[0].get<T>();
   const T m = sourceValues[1].get<T>();
 
   const uint16_t partition_num = VL_bits / (sizeof(P) * 8);
   std::array<uint64_t, 4> out = {0, 0, 0, 0};
-  uint16_t index = 0;
 
   for (int i = 0; i < partition_num; i++) {
     // Determine whether lane should be active and shift to align with
     // element in predicate register.
     uint64_t shifted_active =
-        (n + i) < m ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0;
-    out[index / (64 / (sizeof(P)))] =
-        out[index / (64 / (sizeof(P)))] | shifted_active;
-    index++;
+        func((n + i), m) ? 1ull << ((i % (64 / (sizeof(P))) * (sizeof(P)))) : 0;
+    out[i / (64 / (sizeof(P)))] |= shifted_active;
   }
   // Byte count = sizeof(P) as destination predicate is predicate of P
   // bytes.
-  uint8_t nzcv = calcNZCV ? getNZCVfromPred(out, VL_bits, sizeof(P)) : 0;
+  uint8_t nzcv = getNZCVfromPred(out, VL_bits, sizeof(P));
   return {out, nzcv};
 }
 
diff --git a/src/include/simeng/pipeline/ReorderBuffer.hh b/src/include/simeng/pipeline/ReorderBuffer.hh
index 2e6e68e37b..4c31eeb38a 100644
--- a/src/include/simeng/pipeline/ReorderBuffer.hh
+++ b/src/include/simeng/pipeline/ReorderBuffer.hh
@@ -125,6 +125,14 @@ class ReorderBuffer {
    */
   uint64_t pc_;
 
+  /** A counter for how many cycles the same instruction has been at the head of
+   * the ROB. */
+  uint64_t robHeadRepeatCounter_ = 0;
+
+  /** The number of cycles (see robHeadRepeatCounter_) an instruction can be
+   * stuck at the head of the ROB before SimEng exits with an exception. */
+  uint64_t robHeadRepeatLimit_ = 10000000;
+
   /** The sequence ID of the youngest instruction that should remain after the
    * current flush. */
   uint64_t flushAfter_;
diff --git a/src/lib/arch/aarch64/ExceptionHandler.cc b/src/lib/arch/aarch64/ExceptionHandler.cc
index ae98dddb1a..639f8e0655 100644
--- a/src/lib/arch/aarch64/ExceptionHandler.cc
+++ b/src/lib/arch/aarch64/ExceptionHandler.cc
@@ -429,9 +429,9 @@ bool ExceptionHandler::init() {
                       << std::endl;
             return fatal();
           }
-          uint64_t retval = (pid == 0) ? 1 : 0;
-          stateChange = {ChangeType::REPLACEMENT, {R0}, {retval}};
-          stateChange.memoryAddresses.push_back({mask, 1});
+          uint64_t retval = static_cast<uint64_t>(bitmask);
+          stateChange = {ChangeType::REPLACEMENT, {R0}, {sizeof(retval)}};
+          stateChange.memoryAddresses.push_back({mask, sizeof(bitmask)});
           stateChange.memoryAddressValues.push_back(bitmask);
         } else {
           stateChange = {ChangeType::REPLACEMENT, {R0}, {-1ll}};
diff --git a/src/lib/arch/aarch64/InstructionMetadata.cc b/src/lib/arch/aarch64/InstructionMetadata.cc
index 34ddca07d7..219023d93a 100644
--- a/src/lib/arch/aarch64/InstructionMetadata.cc
+++ b/src/lib/arch/aarch64/InstructionMetadata.cc
@@ -100,6 +100,14 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn)
       operands[0].access = CS_AC_WRITE;
       operands[1].access = CS_AC_READ;
       break;
+    case Opcode::AArch64_FTSMUL_ZZZ_D:
+      [[fallthrough]];
+    case Opcode::AArch64_FTSMUL_ZZZ_S:
+      [[fallthrough]];
+    case Opcode::AArch64_FTSSEL_ZZZ_D:
+      [[fallthrough]];
+    case Opcode::AArch64_FTSSEL_ZZZ_S:
+      [[fallthrough]];
     case Opcode::AArch64_FSUB_ZPmI_D:
       [[fallthrough]];
     case Opcode::AArch64_FSUB_ZPmI_H:
@@ -123,6 +131,41 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn)
       operands[2].access = CS_AC_READ;
       break;
     }
+    case Opcode::AArch64_FTMAD_ZZI_D:
+      [[fallthrough]];
+    case Opcode::AArch64_FTMAD_ZZI_S: {
+      // Incorrect access types
+      operands[0].access = CS_AC_READ | CS_AC_WRITE;
+      operands[1].access = CS_AC_READ;
+      operands[2].access = CS_AC_READ;
+      break;
+    }
+    case Opcode::AArch64_PFIRST_B:
+      [[fallthrough]];
+    case Opcode::AArch64_PNEXT_D:
+      [[fallthrough]];
+    case Opcode::AArch64_PNEXT_S:
+      [[fallthrough]];
+    case Opcode::AArch64_PNEXT_H:
+      [[fallthrough]];
+    case Opcode::AArch64_PNEXT_B: {
+      // Incorrect access types
+      operands[0].access = CS_AC_WRITE;
+      operands[1].access = CS_AC_READ;
+      operands[2].access = CS_AC_READ;
+      // Doesn't identify implicit NZCV destination
+      implicitDestinationCount = 1;
+      implicitDestinations[0] = AARCH64_REG_NZCV;
+      break;
+    }
+    case Opcode::AArch64_CLASTB_VPZ_D:
+      [[fallthrough]];
+    case Opcode::AArch64_CLASTB_VPZ_S:
+      [[fallthrough]];
+    case Opcode::AArch64_CLASTB_VPZ_H:
+      [[fallthrough]];
+    case Opcode::AArch64_CLASTB_VPZ_B:
+      [[fallthrough]];
     case Opcode::AArch64_AND_ZPmZ_D:  // Example bytecode - 4901da04
       [[fallthrough]];
     case Opcode::AArch64_AND_ZPmZ_H:
@@ -155,6 +198,10 @@ InstructionMetadata::InstructionMetadata(const cs_insn& insn)
       [[fallthrough]];
     case Opcode::AArch64_SMAX_ZPmZ_S:  // Example bytecode - 01008804
       [[fallthrough]];
+    case Opcode::AArch64_SPLICE_ZPZ_D:
+      [[fallthrough]];
+    case Opcode::AArch64_SPLICE_ZPZ_S:
+      [[fallthrough]];
     case Opcode::AArch64_MUL_ZPmZ_B:  // Example bytecode - 40001004
       [[fallthrough]];
     case Opcode::AArch64_MUL_ZPmZ_D:
diff --git a/src/lib/arch/aarch64/Instruction_address.cc b/src/lib/arch/aarch64/Instruction_address.cc
index 357077e7b3..06eb7e2004 100644
--- a/src/lib/arch/aarch64/Instruction_address.cc
+++ b/src/lib/arch/aarch64/Instruction_address.cc
@@ -451,6 +451,10 @@ span<const memory::MemoryAccessTarget> Instruction::generateAddresses() {
         setMemoryAddresses({{sourceValues_[0].get<uint64_t>(), 8}});
         break;
       }
+      case Opcode::AArch64_LDAXRB: {  // ldaxrb wt, [xn]
+        setMemoryAddresses({{sourceValues_[0].get<uint64_t>(), 1}});
+        break;
+      }
       case Opcode::AArch64_LDAXRW: {  // ldaxr wd, [xn]
         setMemoryAddresses({{sourceValues_[0].get<uint64_t>(), 4}});
         break;
@@ -749,6 +753,13 @@ span<const memory::MemoryAccessTarget> Instruction::generateAddresses() {
         setMemoryAddresses({{sourceValues_[0].get<uint64_t>() + offset, 4}});
         break;
       }
+      case Opcode::AArch64_LDRSWroW: {  // ldrsw xt, [xn, wm{, extend
+                                        // {#amount}}]
+        uint64_t offset = extendOffset(sourceValues_[1].get<uint32_t>(),
+                                       metadata_.operands[1]);
+        setMemoryAddresses({{sourceValues_[0].get<uint64_t>() + offset, 4}});
+        break;
+      }
       case Opcode::AArch64_LDRSWui: {  // ldrsw xt, [xn{, #pimm}]
         uint64_t base =
             sourceValues_[0].get<uint64_t>() + metadata_.operands[1].mem.disp;
@@ -1350,11 +1361,19 @@ span<const memory::MemoryAccessTarget> Instruction::generateAddresses() {
         setMemoryAddresses({{sourceValues_[1].get<uint64_t>(), 8}});
         break;
       }
-      case Opcode::AArch64_STLXRW: {  // stlxr ws, wt, [xn]
+      case Opcode::AArch64_STLXRB: {  // stlxrb ws, wt, [xn]
+        setMemoryAddresses({{sourceValues_[1].get<uint64_t>(), 1}});
+        break;
+      }
+      case Opcode::AArch64_STLXRH: {  // stlxrh ws, wt, [xn]
+        setMemoryAddresses({{sourceValues_[1].get<uint64_t>(), 2}});
+        break;
+      }
+      case Opcode::AArch64_STLXRW: {  // stlxr ws, wt, [xn]
         setMemoryAddresses({{sourceValues_[1].get<uint64_t>(), 4}});
         break;
       }
-      case Opcode::AArch64_STLXRX: {  // stlxr ws, xt, [xn]
+      case Opcode::AArch64_STLXRX: {  // stlxr ws, xt, [xn]
         setMemoryAddresses({{sourceValues_[1].get<uint64_t>(), 8}});
         break;
       }
diff --git a/src/lib/arch/aarch64/Instruction_decode.cc b/src/lib/arch/aarch64/Instruction_decode.cc
index 6d2007cb55..215ade08fa 100644
--- a/src/lib/arch/aarch64/Instruction_decode.cc
+++ b/src/lib/arch/aarch64/Instruction_decode.cc
@@ -500,7 +500,9 @@ void Instruction::decode() {
     // Check first operand access to determine if it's a load or store
     if (metadata_.operands[0].access & CS_AC_WRITE) {
       if (metadata_.id == AARCH64_INS_STXR ||
-          metadata_.id == AARCH64_INS_STLXR) {
+          metadata_.id == AARCH64_INS_STLXR ||
+          metadata_.id == AARCH64_INS_STLXRB ||
+          metadata_.id == AARCH64_INS_STLXRH) {
         // Exceptions to this is load condition are exclusive store with a
         // success flag as first operand
         if (microOpcode_ != MicroOpcode::STR_DATA) {
diff --git a/src/lib/arch/aarch64/Instruction_execute.cc b/src/lib/arch/aarch64/Instruction_execute.cc
index 20b62904b9..3090e3cb42 100644
--- a/src/lib/arch/aarch64/Instruction_execute.cc
+++ b/src/lib/arch/aarch64/Instruction_execute.cc
@@ -205,6 +205,10 @@ void Instruction::execute() {
         results_[0] = vecSumElems_2ops<uint8_t, 8>(sourceValues_);
         break;
       }
+      case Opcode::AArch64_UADDLVv8i8v: {  // uaddlv hd, vn.8b
+        results_[0] = vecAddlv<uint32_t, uint8_t, 8>(sourceValues_);
+        break;
+      }
       case Opcode::AArch64_ADDWri: {  // add wd, wn, #imm{, shift}
         auto [result, nzcv] =
             addShift_imm<uint32_t>(sourceValues_, metadata_, false);
@@ -355,6 +359,30 @@ void Instruction::execute() {
             sveAdr_packedOffsets<uint32_t>(sourceValues_, metadata_, VL_bits);
         break;
       }
+      case Opcode::AArch64_FTSMUL_ZZZ_S: {  // ftsmul zd.s, zn.s, zm.s
+        results_[0] = sveFTrigSMul<float, int32_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_FTSMUL_ZZZ_D: {  // ftsmul zd.d, zn.d, zm.d
+        results_[0] = sveFTrigSMul<double, int64_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_FTSSEL_ZZZ_S: {  // ftssel zd.s, zn.s, zm.s
+        results_[0] = sveFTrigSSel<float, int32_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_FTSSEL_ZZZ_D: {  // ftssel zd.d, zn.d, zm.d
+        results_[0] = sveFTrigSSel<double, int64_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_FTMAD_ZZI_S: {  // ftmad zd.s, zn.s, zm.s, #imm
+        results_[0] = sveFTrigMad<float>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_FTMAD_ZZI_D: {  // ftmad zd.d, zn.d, zm.d, #imm
+        results_[0] = sveFTrigMad<double>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_ANDSWri: {  // ands wd, wn, #imm
         auto [result, nzcv] = logicOp_imm<uint32_t>(
             sourceValues_, metadata_, true,
@@ -675,6 +703,12 @@ void Instruction::execute() {
             [](uint8_t x, uint8_t y) -> bool { return (x == y); });
         break;
       }
+      case Opcode::AArch64_CMEQv2i32rz: {  // cmeq vd.2s, vn.2s, #0
+        results_[0] = vecCompare<uint32_t, 2>(
+            sourceValues_, true,
+            [](uint32_t x, uint32_t y) -> bool { return (x == y); });
+        break;
+      }
       case Opcode::AArch64_CMEQv4i32: {  // cmeq vd.4s, vn.4s, vm.4s
         results_[0] = vecCompare<uint32_t, 4>(
             sourceValues_, false,
@@ -693,6 +727,12 @@ void Instruction::execute() {
             [](int8_t x, int8_t y) -> bool { return (x == y); });
         break;
       }
+      case Opcode::AArch64_CMHIv2i32: {  // cmhi vd.2s, vn.2s, vm.2s
+        results_[0] = vecCompare<uint32_t, 2>(
+            sourceValues_, false,
+            [](uint32_t x, uint32_t y) -> bool { return (x > y); });
+        break;
+      }
       case Opcode::AArch64_CMHIv4i32: {  // cmhi vd.4s, vn.4s, vm.4s
         results_[0] = vecCompare<uint32_t, 4>(
             sourceValues_, false,
@@ -833,6 +873,38 @@ void Instruction::execute() {
         results_[1] = output;
         break;
       }
+      case Opcode::AArch64_CMPHS_PPzZZ_B: {  // cmphs pd.b, pg/z, zn.b, zm.b
+        auto [output, nzcv] = sveCmpPredicated_toPred<uint8_t>(
+            sourceValues_, metadata_, VL_bits, false,
+            [](uint8_t x, uint8_t y) -> bool { return x >= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_CMPHS_PPzZZ_D: {  // cmphs pd.d, pg/z, zn.d, zm.d
+        auto [output, nzcv] = sveCmpPredicated_toPred<uint64_t>(
+            sourceValues_, metadata_, VL_bits, false,
+            [](uint64_t x, uint64_t y) -> bool { return x >= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_CMPHS_PPzZZ_H: {  // cmphs pd.h, pg/z, zn.h, zm.h
+        auto [output, nzcv] = sveCmpPredicated_toPred<uint16_t>(
+            sourceValues_, metadata_, VL_bits, false,
+            [](uint16_t x, uint16_t y) -> bool { return x >= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_CMPHS_PPzZZ_S: {  // cmphs pd.s, pg/z, zn.s, zm.s
+        auto [output, nzcv] = sveCmpPredicated_toPred<uint32_t>(
+            sourceValues_, metadata_, VL_bits, false,
+            [](uint32_t x, uint32_t y) -> bool { return x >= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
       case Opcode::AArch64_CMPNE_PPzZI_B: {  // cmpne pd.b, pg/z. zn.b, #imm
         auto [output, nzcv] = sveCmpPredicated_toPred<int8_t>(
             sourceValues_, metadata_, VL_bits, true,
@@ -949,6 +1021,22 @@ void Instruction::execute() {
         results_[0] = sveCpy_imm<int32_t>(sourceValues_, metadata_, VL_bits);
         break;
       }
+      case Opcode::AArch64_CPY_ZPmV_B: {  // cpy zd.b, pg/m, vn.b
+        results_[0] = sveCpy_Scalar<int8_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CPY_ZPmV_D: {  // cpy zd.d, pg/m, vn.d
+        results_[0] = sveCpy_Scalar<int64_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CPY_ZPmV_H: {  // cpy zd.h, pg/m, vn.h
+        results_[0] = sveCpy_Scalar<int16_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CPY_ZPmV_S: {  // cpy zd.s, pg/m, vn.s
+        results_[0] = sveCpy_Scalar<int32_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_DUPi32: {  // dup vd, vn.s[index]
         results_[0] =
             vecDup_gprOrIndex<uint32_t, 1>(sourceValues_, metadata_, false);
@@ -1752,6 +1840,10 @@ void Instruction::execute() {
         results_[0] = vecFDiv<double, 2>(sourceValues_);
         break;
       }
+      case Opcode::AArch64_FDIVv4f32: {  // fdiv vd.4s, vn.4s, vm.4s
+        results_[0] = vecFDiv<float, 4>(sourceValues_);
+        break;
+      }
       case Opcode::AArch64_FDUP_ZI_D: {  // fdup zd.d, #imm
         results_[0] =
             sveDup_immOrScalar<double>(sourceValues_, metadata_, VL_bits, true);
@@ -2557,6 +2649,38 @@ void Instruction::execute() {
             vecInsIndex_gpr<uint8_t, uint32_t, 16>(sourceValues_, metadata_);
         break;
       }
+      case Opcode::AArch64_LASTB_VPZ_D: {  // lastb dd, pg, zn.d
+        results_[0] = sveLastBSimdScalar<uint64_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_LASTB_VPZ_S: {  // lastb sd, pg, zn.s
+        results_[0] = sveLastBSimdScalar<uint32_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_LASTB_VPZ_H: {  // lastb hd, pg, zn.h
+        results_[0] = sveLastBSimdScalar<uint16_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_LASTB_VPZ_B: {  // lastb bd, pg, zn.b
+        results_[0] = sveLastBSimdScalar<uint8_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CLASTB_VPZ_D: {  // clastb dd, pg, dn, zn.d
+        results_[0] = sveCLastBSimdScalar<uint64_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CLASTB_VPZ_S: {  // clastb sd, pg, sn, zn.s
+        results_[0] = sveCLastBSimdScalar<uint32_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CLASTB_VPZ_H: {  // clastb hd, pg, hn, zn.h
+        results_[0] = sveCLastBSimdScalar<uint16_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_CLASTB_VPZ_B: {  // clastb bd, pg, bn, zn.b
+        results_[0] = sveCLastBSimdScalar<uint8_t>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_LD1_MXIPXX_H_D: {  // ld1d {zath.d[ws, #imm]}, pg/z,
                                               // [<xn|sp>{, xm, lsl #3}]
         // SME, LOAD
@@ -3330,6 +3454,11 @@ void Instruction::execute() {
         results_[0] = memoryData_[0];
         break;
       }
+      case Opcode::AArch64_LDAXRB: {  // ldaxrb wt, [xn]
+        // LOAD
+        results_[0] = memoryData_[0].zeroExtend(1, 8);
+        break;
+      }
       case Opcode::AArch64_LDAXRW: {  // ldaxr wd, [xn]
         // LOAD
         results_[0] = memoryData_[0].zeroExtend(4, 8);
@@ -3603,6 +3732,12 @@ void Instruction::execute() {
         results_[0] = static_cast<int64_t>(memoryData_[0].get<int32_t>());
         break;
       }
+      case Opcode::AArch64_LDRSWroW: {  // ldrsw xt, [xn, wm, {extend
+                                        // {#amount}}]
+        // LOAD
+        results_[0] = static_cast<int64_t>(memoryData_[0].get<int32_t>());
+        break;
+      }
       case Opcode::AArch64_LDRSWui: {  // ldrsw xt, [xn{, #pimm}]
         // LOAD
         results_[0] = static_cast<int64_t>(memoryData_[0].get<int32_t>());
@@ -4007,11 +4142,51 @@ void Instruction::execute() {
             [](uint8_t x, uint8_t y) -> uint8_t { return x | y; });
         break;
       }
+      case Opcode::AArch64_ORNv8i8: {  // orn vd.8b, vn.8b, vm.8b
+        results_[0] = vecLogicOp_3vecs<uint8_t, 8>(
+            sourceValues_,
+            [](uint8_t x, uint8_t y) -> uint8_t { return x | (~y); });
+        break;
+      }
       case Opcode::AArch64_PFALSE: {  // pfalse pd.b
         uint64_t out[4] = {0, 0, 0, 0};
         results_[0] = out;
         break;
       }
+      case Opcode::AArch64_PFIRST_B: {  // pfirst pdn.b, pg, pdn.b
+        auto [result, nzcv] = svePfirst(sourceValues_, VL_bits);
+        results_[0] = nzcv;
+        results_[1] = result;
+        break;
+      }
+      case Opcode::AArch64_PNEXT_B: {  // pnext pdn.b, pv, pdn.b
+        auto [result, nzcv] =
+            svePnext<uint8_t>(sourceValues_, metadata_, VL_bits);
+        results_[0] = nzcv;
+        results_[1] = result;
+        break;
+      }
+      case Opcode::AArch64_PNEXT_H: {  // pnext pdn.h, pv, pdn.h
+        auto [result, nzcv] =
+            svePnext<uint16_t>(sourceValues_, metadata_, VL_bits);
+        results_[0] = nzcv;
+        results_[1] = result;
+        break;
+      }
+      case Opcode::AArch64_PNEXT_S: {  // pnext pdn.s, pv, pdn.s
+        auto [result, nzcv] =
+            svePnext<uint32_t>(sourceValues_, metadata_, VL_bits);
+        results_[0] = nzcv;
+        results_[1] = result;
+        break;
+      }
+      case Opcode::AArch64_PNEXT_D: {  // pnext pdn.d, pv, pdn.d
+        auto [result, nzcv] =
+            svePnext<uint64_t>(sourceValues_, metadata_, VL_bits);
+        results_[0] = nzcv;
+        results_[1] = result;
+        break;
+      }
       case Opcode::AArch64_PRFMui: {  // prfm op, [xn, xm{, extend{, #amount}}]
         break;
       }
@@ -4300,34 +4475,88 @@ void Instruction::execute() {
         results_[0] = maddl_4ops<int64_t, int32_t>(sourceValues_);
         break;
       }
+      case Opcode::AArch64_SMAX_ZI_D: {  // smax zdn.d, zdn.d, #imm
+        results_[0] = sveMax_vecImm<int64_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SMAX_ZI_S: {  // smax zdn.s, zdn.s, #imm
         results_[0] = sveMax_vecImm<int32_t>(sourceValues_, metadata_, VL_bits);
         break;
       }
+      case Opcode::AArch64_SMAX_ZI_H: {  // smax zdn.h, zdn.h, #imm
+        results_[0] = sveMax_vecImm<int16_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_SMAX_ZI_B: {  // smax zdn.b, zdn.b, #imm
+        results_[0] = sveMax_vecImm<int8_t>(sourceValues_, metadata_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_SMAX_ZPmZ_D: {  // smax zd.d, pg/m, zn.d, zm.d
+        results_[0] = sveMaxPredicated_vecs<int64_t>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SMAX_ZPmZ_S: {  // smax zd.s, pg/m, zn.s, zm.s
         results_[0] = sveMaxPredicated_vecs<int32_t>(sourceValues_, VL_bits);
         break;
       }
+      case Opcode::AArch64_SMAX_ZPmZ_H: {  // smax zd.h, pg/m, zn.h, zm.h
+        results_[0] = sveMaxPredicated_vecs<int16_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_SMAX_ZPmZ_B: {  // smax zd.b, pg/m, zn.b, zm.b
+        results_[0] = sveMaxPredicated_vecs<int8_t>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SMAXv4i32: {  // smax vd.4s, vn.4s, vm.4s
         results_[0] = vecLogicOp_3vecs<int32_t, 4>(
             sourceValues_,
             [](int32_t x, int32_t y) -> int32_t { return std::max(x, y); });
         break;
       }
+      case Opcode::AArch64_SMINV_VPZ_D: {  // sminv dd, pg, zn.d
+        results_[0] = sveSminv<int64_t>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SMINV_VPZ_S: {  // sminv sd, pg, zn.s
         results_[0] = sveSminv<int32_t>(sourceValues_, VL_bits);
         break;
       }
+      case Opcode::AArch64_SMINV_VPZ_H: {  // sminv hd, pg, zn.h
+        results_[0] = sveSminv<int16_t>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_SMINV_VPZ_B: {  // sminv bd, pg, zn.b
+        results_[0] = sveSminv<int8_t>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SMINVv4i32v: {  // sminv sd, vn.4s
         results_[0] = vecMinv_2ops<int32_t, 4>(sourceValues_);
         break;
       }
+      case Opcode::AArch64_SMIN_ZPmZ_D: {  // smin zd.d, pg/m, zn.d, zm.d
+        results_[0] = sveLogicOpPredicated_3vecs<int64_t>(
+            sourceValues_, VL_bits,
+            [](int64_t x, int64_t y) -> int64_t { return std::min(x, y); });
+        break;
+      }
       case Opcode::AArch64_SMIN_ZPmZ_S: {  // smin zd.s, pg/m, zn.s, zm.s
         results_[0] = sveLogicOpPredicated_3vecs<int32_t>(
             sourceValues_, VL_bits,
             [](int32_t x, int32_t y) -> int32_t { return std::min(x, y); });
         break;
       }
+      case Opcode::AArch64_SMIN_ZPmZ_H: {  // smin zd.h, pg/m, zn.h, zm.h
+        results_[0] = sveLogicOpPredicated_3vecs<int16_t>(
+            sourceValues_, VL_bits,
+            [](int16_t x, int16_t y) -> int16_t { return std::min(x, y); });
+        break;
+      }
+      case Opcode::AArch64_SMIN_ZPmZ_B: {  // smin zd.b, pg/m, zn.b, zm.b
+        results_[0] = sveLogicOpPredicated_3vecs<int8_t>(
+            sourceValues_, VL_bits,
+            [](int8_t x, int8_t y) -> int8_t { return std::min(x, y); });
+        break;
+      }
       case Opcode::AArch64_SMINv4i32: {  // smin vd.4s, vn.4s, vm.4s
         results_[0] = vecLogicOp_3vecs<int32_t, 4>(
             sourceValues_,
@@ -4359,6 +4588,14 @@ void Instruction::execute() {
                             sourceValues_[1].get<uint64_t>());
         break;
       }
+      case Opcode::AArch64_SPLICE_ZPZ_D: {  // splice zdn.d, pv, zdn.d, zm.d
+        results_[0] = sveSplice<double>(sourceValues_, VL_bits);
+        break;
+      }
+      case Opcode::AArch64_SPLICE_ZPZ_S: {  // splice zdn.s, pv, zdn.s, zm.s
+        results_[0] = sveSplice<float>(sourceValues_, VL_bits);
+        break;
+      }
       case Opcode::AArch64_SSHLLv2i32_shift: {  // sshll vd.2d, vn.2s, #imm
         results_[0] = vecShllShift_vecImm<int64_t, int32_t, 2>(
             sourceValues_, metadata_, false);
@@ -4945,12 +5182,14 @@ void Instruction::execute() {
         memoryData_[0] = sourceValues_[0];
         break;
       }
+      case Opcode::AArch64_STLXRB:    // stlxrb ws, wt, [xn]
+      case Opcode::AArch64_STLXRH:    // stlxrh ws, wt, [xn]
       case Opcode::AArch64_STLXRW:    // stlxr ws, wt, [xn]
       case Opcode::AArch64_STLXRX: {  // stlxr ws, xt, [xn]
         // STORE
         memoryData_[0] = sourceValues_[0];
-        // TODO: Implement atomic memory access
-        results_[0] = static_cast<uint64_t>(0);
+        //  TODO: Implement atomic memory access
+        results_[0] = {0, 8};
         break;
       }
       case Opcode::AArch64_STPDi:    // stp dt1, dt2, [xn, #imm]
@@ -5568,6 +5807,26 @@ void Instruction::execute() {
         results_[0] = vecUMinP<uint8_t, 16>(sourceValues_);
         break;
       }
+      case Opcode::AArch64_UMAXVv16i8v: {  // umaxv bd, vn.16b
+        results_[0] = vecUMaxV<uint8_t, 16>(sourceValues_);
+        break;
+      }
+      case Opcode::AArch64_UMAXVv4i16v: {  // umaxv hd, vn.4h
+        results_[0] = vecUMaxV<uint16_t, 4>(sourceValues_);
+        break;
+      }
+      case Opcode::AArch64_UMAXVv4i32v: {  // umaxv sd, vn.4s
+        results_[0] = vecUMaxV<uint32_t, 4>(sourceValues_);
+        break;
+      }
+      case Opcode::AArch64_UMAXVv8i16v: {  // umaxv hd, vn.8h
+        results_[0] = vecUMaxV<uint16_t, 8>(sourceValues_);
+        break;
+      }
+      case Opcode::AArch64_UMAXVv8i8v: {  // umaxv bd, vn.8b
+        results_[0] = vecUMaxV<uint8_t, 8>(sourceValues_);
+        break;
+      }
       case Opcode::AArch64_UMOVvi32_idx0:  // umov wd, vn.s[0]
       case Opcode::AArch64_UMOVvi32: {     // umov wd, vn.s[index]
         const uint32_t* vec = sourceValues_[0].getAsVector<uint32_t>();
@@ -5726,85 +5985,129 @@ void Instruction::execute() {
         break;
       }
       case Opcode::AArch64_WHILELO_PWW_B: {  // whilelo pd.b, wn, wm
-        auto [output, nzcv] =
-            sveWhilelo<uint32_t, uint8_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint32_t, uint8_t>(
+            sourceValues_, VL_bits,
+            [](uint32_t x, uint32_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PWW_D: {  // whilelo pd.d, wn, wm
-        auto [output, nzcv] =
-            sveWhilelo<uint32_t, uint64_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint32_t, uint64_t>(
+            sourceValues_, VL_bits,
+            [](uint32_t x, uint32_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PWW_H: {  // whilelo pd.h, wn, wm
-        auto [output, nzcv] =
-            sveWhilelo<uint32_t, uint16_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint32_t, uint16_t>(
+            sourceValues_, VL_bits,
+            [](uint32_t x, uint32_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PWW_S: {  // whilelo pd.s, wn, wm
-        auto [output, nzcv] =
-            sveWhilelo<uint32_t, uint32_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint32_t, uint32_t>(
+            sourceValues_, VL_bits,
+            [](uint32_t x, uint32_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PXX_B: {  // whilelo pd.b, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<uint64_t, uint8_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint64_t, uint8_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PXX_D: {  // whilelo pd.d, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<uint64_t, uint64_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint64_t, uint64_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PXX_H: {  // whilelo pd.h, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<uint64_t, uint16_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint64_t, uint16_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELO_PXX_S: {  // whilelo pd.s, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<uint64_t, uint32_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<uint64_t, uint32_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x < y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_WHILELS_PXX_B: {  // whilels pd.b, xn, xm
+        auto [output, nzcv] = sveWhile<uint64_t, uint8_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x <= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_WHILELS_PXX_D: {  // whilels pd.d, xn, xm
+        auto [output, nzcv] = sveWhile<uint64_t, uint64_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x <= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_WHILELS_PXX_H: {  // whilels pd.h, xn, xm
+        auto [output, nzcv] = sveWhile<uint64_t, uint16_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x <= y; });
+        results_[0] = nzcv;
+        results_[1] = output;
+        break;
+      }
+      case Opcode::AArch64_WHILELS_PXX_S: {  // whilels pd.s, xn, xm
+        auto [output, nzcv] = sveWhile<uint64_t, uint32_t>(
+            sourceValues_, VL_bits,
+            [](uint64_t x, uint64_t y) -> bool { return x <= y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELT_PXX_B: {  // whilelt pd.b, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<int64_t, int8_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<int64_t, int8_t>(
+            sourceValues_, VL_bits,
+            [](int64_t x, int64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELT_PXX_D: {  // whilelt pd.d, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<int64_t, int64_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<int64_t, int64_t>(
+            sourceValues_, VL_bits,
+            [](int64_t x, int64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELT_PXX_H: {  // whilelt pd.h, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<int64_t, int16_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<int64_t, int16_t>(
+            sourceValues_, VL_bits,
+            [](int64_t x, int64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
       }
       case Opcode::AArch64_WHILELT_PXX_S: {  // whilelt pd.s, xn, xm
-        auto [output, nzcv] =
-            sveWhilelo<int64_t, int32_t>(sourceValues_, VL_bits, true);
+        auto [output, nzcv] = sveWhile<int64_t, int32_t>(
+            sourceValues_, VL_bits,
+            [](int64_t x, int64_t y) -> bool { return x < y; });
         results_[0] = nzcv;
         results_[1] = output;
         break;
diff --git a/src/lib/pipeline/ReorderBuffer.cc b/src/lib/pipeline/ReorderBuffer.cc
index e72e6e79dc..20a2970995 100644
--- a/src/lib/pipeline/ReorderBuffer.cc
+++ b/src/lib/pipeline/ReorderBuffer.cc
@@ -81,9 +81,31 @@ unsigned int ReorderBuffer::commit(uint64_t maxCommitSize) {
   unsigned int n;
   for (n = 0; n < maxCommits; n++) {
     auto& uop = buffer_[0];
+
     if (!uop->canCommit()) {
+      // If an instruction has been stuck at the head of the ROB for
+      // sufficiently long, assume an error in SimEng has occurred.
+      robHeadRepeatCounter_++;
+      if (robHeadRepeatCounter_ > robHeadRepeatLimit_) {
+        std::cerr << "[SimEng:ReorderBuffer] Instruction stuck unable to "
+                     "commit at the head of ROB for more than "
+                  << robHeadRepeatLimit_ << " cycles at instruction address 0x"
+                  << std::hex << uop->getInstructionAddress() << std::dec
+                  << " (MicroOp Index: " << uop->getMicroOpIndex()
+                  << "). This is unexpected behaviour for most valid core "
+                     "configurations, though may arise in designs with very "
+                     "high latencies or bottlenecks. If this is not the case, "
+                     "please try re-running. If this may be expected, you can "
+                     "increase this limit in "
+                     "`SimEng/src/include/pipeline/ReorderBuffer.hh` under the "
+                     "variable `robHeadRepeatLimit_`. Please raise "
+                     "an issue on GitHub if the problem persists."
+                  << std::endl;
+        exit(1);
+      }
       break;
     }
+    robHeadRepeatCounter_ = 0;
 
     if (uop->isLastMicroOp()) instructionsCommitted_++;
 
diff --git a/test/regression/aarch64/AArch64RegressionTest.hh b/test/regression/aarch64/AArch64RegressionTest.hh
index 32d975b09d..3b2490666d 100644
--- a/test/regression/aarch64/AArch64RegressionTest.hh
+++ b/test/regression/aarch64/AArch64RegressionTest.hh
@@ -503,7 +503,7 @@ class AArch64RegressionTest : public RegressionTest {
     std::array<T, (32 / sizeof(T))> generatedArray;
     generatedArray.fill(0);
     // Fill array by cycling through source elements
-    for (int i = 0; i < (num_bytes / sizeof(T)); i++) {
+    for (size_t i = 0; i < (num_bytes / sizeof(T)); i++) {
       generatedArray[i] = src[i % src.size()];
     }
     return generatedArray;
diff --git a/test/regression/aarch64/Syscall.cc b/test/regression/aarch64/Syscall.cc
index 0866c278e2..c7c19eb9a2 100644
--- a/test/regression/aarch64/Syscall.cc
+++ b/test/regression/aarch64/Syscall.cc
@@ -1080,7 +1080,7 @@ TEST_P(Syscall, sched_getaffinity) {
     )");
   EXPECT_EQ(getGeneralRegister<int64_t>(21), -1);
   EXPECT_EQ(getGeneralRegister<int64_t>(22), -1);
-  EXPECT_EQ(getGeneralRegister<int64_t>(23), 1);
+  EXPECT_EQ(getGeneralRegister<int64_t>(23), 8);
 }
 
 // TODO: write tgkill test
diff --git a/test/regression/aarch64/instructions/bitmanip.cc b/test/regression/aarch64/instructions/bitmanip.cc
index a72dcb64dc..8622169db0 100644
--- a/test/regression/aarch64/instructions/bitmanip.cc
+++ b/test/regression/aarch64/instructions/bitmanip.cc
@@ -71,11 +71,17 @@ TEST_P(InstBitmanip, extr) {
     extr w4, w1, w2, 4
     extr w5, w1, w2, 24
     extr w6, w1, w2, 31
+
+    # Check alias
+    ror w7, w1, 31
+    ror w8, w1, 24
   )");
   EXPECT_EQ(getGeneralRegister<uint32_t>(3), 0x12345678);
   EXPECT_EQ(getGeneralRegister<uint32_t>(4), 0xF1234567);
   EXPECT_EQ(getGeneralRegister<uint32_t>(5), 0xADBEEF12);
   EXPECT_EQ(getGeneralRegister<uint32_t>(6), 0xBD5B7DDE);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(7), 0xBD5B7DDF);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(8), 0xADBEEFDE);
 
   // 64-bit
   initialHeapData_.resize(16);
diff --git a/test/regression/aarch64/instructions/load.cc b/test/regression/aarch64/instructions/load.cc
index 09269eebb8..bf5a3cad47 100644
--- a/test/regression/aarch64/instructions/load.cc
+++ b/test/regression/aarch64/instructions/load.cc
@@ -3,6 +3,7 @@
 namespace {
 
 using InstLoad = AArch64RegressionTest;
+using namespace simeng::arch::aarch64::InstructionGroups;
 
 TEST_P(InstLoad, ld1r) {
   // 8-bit
@@ -695,6 +696,45 @@ TEST_P(InstLoad, ldarb) {
   EXPECT_EQ(getGeneralRegister<uint32_t>(7), 64);
 }
 
+TEST_P(InstLoad, ldaxrb) {  // Reads 8 consecutive heap bytes via ldaxrb
+  initialHeapData_.resize(8);
+  uint32_t* heap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  heap[0] = 0xDEADBEEF;  // expected back byte-wise as EF, BE, AD, DE
+  heap[1] = 0x12345678;  // expected back byte-wise as 78, 56, 34, 12
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+    ldaxrb w1, [x0]
+    add x0, x0, #1
+    ldaxrb w2, [x0]
+    add x0, x0, #1
+    ldaxrb w3, [x0]
+    add x0, x0, #1
+    ldaxrb w4, [x0]
+    add x0, x0, #1
+    ldaxrb w5, [x0]
+    add x0, x0, #1
+    ldaxrb w6, [x0]
+    add x0, x0, #1
+    ldaxrb w7, [x0]
+    add x0, x0, #1
+    ldaxrb w8, [x0]
+  )");
+  EXPECT_EQ(getGeneralRegister<uint32_t>(1), 0xEF);  // w1-w4: heap[0]
+  EXPECT_EQ(getGeneralRegister<uint32_t>(2), 0xBE);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(3), 0xAD);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(4), 0xDE);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(5), 0x78);  // w5-w8: heap[1]
+  EXPECT_EQ(getGeneralRegister<uint32_t>(6), 0x56);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(7), 0x34);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(8), 0x12);
+
+  EXPECT_GROUP(R"(ldaxrb w8, [x0])", LOAD_INT);  // expected group
+}
+
 TEST_P(InstLoad, ldrb) {
   initialHeapData_.resize(8);
   uint32_t* heap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
@@ -1277,17 +1317,21 @@ TEST_P(InstLoad, ldrsw) {
     mov x0, 0
     mov x8, 214
     svc #0
-    mov x5, 1
+    mov x6, 1
     # Load 32-bit values from heap and sign-extend to 64-bits
     ldrsw x1, [x0, #4]
     ldrsw x2, [x0], #4
     ldrsw x3, [x0]
-    ldrsw x4, [x0, x5, lsl #2]
+    ldrsw x4, [x0, x6, lsl #2]
+    ldrsw x5, [x0, w6, uxtw #2]
   )");
   EXPECT_EQ(getGeneralRegister<int64_t>(1), INT32_MAX);
   EXPECT_EQ(getGeneralRegister<int64_t>(2), -2);
   EXPECT_EQ(getGeneralRegister<int64_t>(3), INT32_MAX);
   EXPECT_EQ(getGeneralRegister<int64_t>(4), -5);
+  EXPECT_EQ(getGeneralRegister<int64_t>(5), -5);
+
+  EXPECT_GROUP(R"(ldrsw x4, [x0, x6, lsl #2])", LOAD_INT);
 
   // ldursw
   RUN_AARCH64(R"(
diff --git a/test/regression/aarch64/instructions/neon.cc b/test/regression/aarch64/instructions/neon.cc
index a4731f388f..c66f6f3c6f 100644
--- a/test/regression/aarch64/instructions/neon.cc
+++ b/test/regression/aarch64/instructions/neon.cc
@@ -356,6 +356,28 @@ TEST_P(InstNeon, addv) {
   CHECK_NEON(1, uint8_t, {40});
 }
 
+TEST_P(InstNeon, uaddlv) {
+  // 8-bit source lanes summed into a 16-bit result
+  initialHeapData_.resize(16);
+  uint8_t* heap8 = reinterpret_cast<uint8_t*>(initialHeapData_.data());
+  for (int i = 0; i < 16; i++) {
+    heap8[i] = (i + 1);
+  }
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    uaddlv h1, v0.8b
+  )");
+  CHECK_NEON(1, uint16_t, {36});  // 1 + 2 + ... + 8 (only v0.8b is summed)
+
+  EXPECT_GROUP(R"(uaddlv h1, v0.8b)", SCALAR_SIMPLE_ARTH_NOSHIFT);
+}
+
 TEST_P(InstNeon, and) {
   initialHeapData_.resize(32);
   uint32_t* heap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
@@ -691,18 +713,53 @@ TEST_P(InstNeon, cmeq) {
   CHECK_NEON(2, uint8_t, {0xFF, 0xFF, 0xFF, 0x00, 0xFF, 0xFF, 0xFF, 0xFF});
   CHECK_NEON(3, uint8_t, {0x00, 0x00, 0x00, 0xFF, 0x00, 0x00, 0xFF, 0x00});
 
-  // 32-bit
+  // 32-bit, 2 lane
   initialHeapData_.resize(128);
-  uint32_t* heap32 = reinterpret_cast<uint32_t*>(initialHeapData_.data());
-  heap32[0] = 10;
-  heap32[1] = 11;
-  heap32[2] = 12;
-  heap32[3] = 13;
+  uint32_t* heapv2i32 = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  heapv2i32[0] = 10;
+  heapv2i32[1] = 0;
+
+  heapv2i32[2] = 0;
+  heapv2i32[3] = 12;
+
+  heapv2i32[4] = 15;
+  heapv2i32[5] = 9;
+
+  heapv2i32[6] = 0;
+  heapv2i32[7] = 0;
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    ldr q1, [x0, #8]
+    ldr q2, [x0, #16]
+    ldr q3, [x0, #24]
+    cmeq v4.2s, v0.2s, #0
+    cmeq v5.2s, v1.2s, #0
+    cmeq v6.2s, v2.2s, #0
+    cmeq v7.2s, v3.2s, #0
+  )");
+  CHECK_NEON(4, uint32_t, {0, 0xFFFFFFFFu});
+  CHECK_NEON(5, uint32_t, {0xFFFFFFFFu, 0});
+  CHECK_NEON(6, uint32_t, {0, 0});
+  CHECK_NEON(7, uint32_t, {0xFFFFFFFFu, 0xFFFFFFFFu});
 
-  heap32[4] = 13;
-  heap32[5] = 11;
-  heap32[6] = 12;
-  heap32[7] = 10;
+  // 32-bit, 4 lane
+  initialHeapData_.resize(128);
+  uint32_t* heapv4i32 = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  heapv4i32[0] = 10;
+  heapv4i32[1] = 11;
+  heapv4i32[2] = 12;
+  heapv4i32[3] = 13;
+
+  heapv4i32[4] = 13;
+  heapv4i32[5] = 11;
+  heapv4i32[6] = 12;
+  heapv4i32[7] = 10;
 
   RUN_AARCH64(R"(
     # Get heap address
@@ -715,6 +772,8 @@ TEST_P(InstNeon, cmeq) {
     cmeq v2.4s, v0.4s, v1.4s
   )");
   CHECK_NEON(2, uint32_t, {0, 0xFFFFFFFFu, 0xFFFFFFFFu, 0});
+
+  EXPECT_GROUP(R"(cmeq v2.4s, v0.4s, v1.4s)", VECTOR_SIMPLE_CMP);
 }
 
 TEST_P(InstNeon, cmhs) {
@@ -780,16 +839,40 @@ TEST_P(InstNeon, cmhs) {
 }
 
 TEST_P(InstNeon, cmhi) {
+  // 32-bit, 2 lane
   initialHeapData_.resize(32);
-  uint32_t* heap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
-  heap[0] = 42;
-  heap[1] = 7;
-  heap[2] = UINT32_MAX;
-  heap[3] = 7;
-  heap[4] = 1;
-  heap[5] = (1u << 31) - 1;
-  heap[6] = 0;
-  heap[7] = 7;
+  uint32_t* heapv2i32 = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  heapv2i32[0] = UINT32_MAX;
+  heapv2i32[1] = 7;
+
+  heapv2i32[2] = 1;
+  heapv2i32[3] = 7;
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    ldr q1, [x0, #8]
+    cmhi v2.2s, v0.2s, v1.2s
+    cmhi v3.2s, v1.2s, v0.2s
+  )");
+  CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0});
+  CHECK_NEON(3, uint32_t, {0x0, 0x0});
+
+  // 32-bit, 4 lane
+  initialHeapData_.resize(32);
+  uint32_t* heapv4i32 = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  heapv4i32[0] = 42;
+  heapv4i32[1] = 7;
+  heapv4i32[2] = UINT32_MAX;
+  heapv4i32[3] = 7;
+  heapv4i32[4] = 1;
+  heapv4i32[5] = (1u << 31) - 1;
+  heapv4i32[6] = 0;
+  heapv4i32[7] = 7;
 
   RUN_AARCH64(R"(
     # Get heap address
@@ -804,6 +887,8 @@ TEST_P(InstNeon, cmhi) {
   )");
   CHECK_NEON(2, uint32_t, {0xFFFFFFFF, 0x0, 0xFFFFFFFF, 0x0});
   CHECK_NEON(3, uint32_t, {0x0, 0xFFFFFFFF, 0x0, 0x0});
+
+  EXPECT_GROUP(R"(cmhi v3.4s, v1.4s, v0.4s)", VECTOR_SIMPLE_CMP);
 }
 
 TEST_P(InstNeon, cnt) {
@@ -1011,6 +1096,29 @@ TEST_P(InstNeon, eor) {
   CHECK_NEON(3, uint8_t, {1, 3, 1, 7, 1, 3, 1, 15, 0, 0, 0, 0, 0, 0, 0, 0});
 }
 
+TEST_P(InstNeon, orn) {  // orn vd.8b: per-byte x | ~y over 8 lanes
+  initialHeapData_.resize(24);  // `ldr q1, [x0, #8]` reads bytes 8..23
+  uint8_t* heap = reinterpret_cast<uint8_t*>(initialHeapData_.data());
+  for (int i = 0; i < 8; i++) {
+    heap[i] = i;          // low 8 bytes of v0
+    heap[i + 8] = i + 1;  // low 8 bytes of v1
+  }
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    ldr q1, [x0, #8]
+
+    orn v2.8b, v0.8b, v1.8b
+  )");
+  CHECK_NEON(2, uint8_t, {254, 253, 254, 251, 254, 253, 254, 247});
+
+  EXPECT_GROUP(R"(orn v2.8b, v0.8b, v1.8b)", VECTOR_SIMPLE_LOGICAL_NOSHIFT);
+}
+
 TEST_P(InstNeon, ext) {
   RUN_AARCH64(R"(
     movi v0.16b, #0xAB
@@ -1531,11 +1639,12 @@ TEST_P(InstNeon, fcvtl2) {
 
 TEST_P(InstNeon, fdiv) {
   initialHeapData_.resize(32);
-  double* heap = reinterpret_cast<double*>(initialHeapData_.data());
-  heap[0] = 1.0;
-  heap[1] = -42.5;
-  heap[2] = -0.125;
-  heap[3] = 16.0;
+  // 2 Doubles
+  double* heapv2f64 = reinterpret_cast<double*>(initialHeapData_.data());
+  heapv2f64[0] = 1.0;
+  heapv2f64[1] = -42.5;
+  heapv2f64[2] = -0.125;
+  heapv2f64[3] = 16.0;
 
   RUN_AARCH64(R"(
     # Get heap address
@@ -1548,6 +1657,29 @@ TEST_P(InstNeon, fdiv) {
     fdiv v2.2d, v0.2d, v1.2d
   )");
   CHECK_NEON(2, double, {-8.0, -2.65625});
+
+  // 4 Floats
+  float* heapv4f32 = reinterpret_cast<float*>(initialHeapData_.data());
+  heapv4f32[0] = 1.0f;
+  heapv4f32[1] = -42.5f;
+  heapv4f32[2] = 10.0f;
+  heapv4f32[3] = 0.0f;
+  heapv4f32[4] = -0.125f;
+  heapv4f32[5] = 16.0f;
+  heapv4f32[6] = -2.0f;
+  heapv4f32[7] = 256.0f;
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    ldr q1, [x0, #16]
+    fdiv v2.4s, v0.4s, v1.4s
+  )");
+  CHECK_NEON(2, float, {-8.0f, -2.65625f, -5.0f, 0.0f});
 }
 
 TEST_P(InstNeon, fmla) {
@@ -2746,6 +2878,97 @@ TEST_P(InstNeon, umaxp) {
               0xCC, 0xBB, 0xAA, 0x99, 0x88});
 }
 
+TEST_P(InstNeon, umaxv) {
+  // umaxv vd, vn.t - unsigned maximum across the lanes of one vector
+  initialHeapData_.resize(32);
+  uint8_t* heap = reinterpret_cast<uint8_t*>(initialHeapData_.data());
+
+  // v0 (heap bytes 0-15, loaded by `ldr q0, [x0]`)
+  heap[0] = 0x01;
+  heap[1] = 0x00;
+  heap[2] = 0xFF;
+  heap[3] = 0xAA;
+  heap[4] = 0xBB;
+  heap[5] = 0xCC;
+  heap[6] = 0xDD;
+  heap[7] = 0xEE;
+  heap[8] = 0x07;
+  heap[9] = 0x00;
+  heap[10] = 0xFC;
+  heap[11] = 0xFD;
+  heap[12] = 0xBA;
+  heap[13] = 0xCA;
+  heap[14] = 0x39;
+  heap[15] = 0xEF;
+
+  // v1 (heap bytes 16-31, loaded by `ldr q1, [x0, #16]`)
+  heap[16] = 0x00;
+  heap[17] = 0x00;
+  heap[18] = 0xEE;
+  heap[19] = 0x11;
+  heap[20] = 0x22;
+  heap[21] = 0x33;
+  heap[22] = 0x44;
+  heap[23] = 0x55;
+  heap[24] = 0x26;
+  heap[25] = 0xFF;
+  heap[26] = 0xEA;
+  heap[27] = 0xFA;
+  heap[28] = 0x14;
+  heap[29] = 0x43;
+  heap[30] = 0x21;
+  heap[31] = 0xAE;
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    ldr q0, [x0]
+    ldr q1, [x0, #16]
+    umaxv h2, v0.4h
+    umaxv h3, v1.4h
+
+    umaxv h4, v0.8h
+    umaxv h5, v1.8h
+
+    umaxv s6, v0.4s
+    umaxv s7, v1.4s
+
+    umaxv b8, v0.8b
+    umaxv b9, v1.8b
+    
+    umaxv b10, v0.16b
+    umaxv b11, v1.16b
+
+  )");
+  CHECK_NEON(2, uint16_t,
+             {0xEEDD, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000});
+  CHECK_NEON(3, uint16_t,
+             {0x5544, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000});
+  CHECK_NEON(4, uint16_t,
+             {0xFDFC, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000});
+  CHECK_NEON(5, uint16_t,
+             {0xFF26, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000});
+  CHECK_NEON(6, uint32_t, {0xFDFC0007, 0x00000000, 0x00000000, 0x00000000});
+  CHECK_NEON(7, uint32_t, {0xFAEAFF26, 0x00000000, 0x00000000, 0x00000000});
+  CHECK_NEON(8, uint8_t,
+             {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+              0x00, 0x00, 0x00, 0x00, 0x00});
+  CHECK_NEON(9, uint8_t,
+             {0xEE, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+              0x00, 0x00, 0x00, 0x00, 0x00});
+  CHECK_NEON(10, uint8_t,
+             {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+              0x00, 0x00, 0x00, 0x00, 0x00});
+  CHECK_NEON(11, uint8_t,
+             {0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+              0x00, 0x00, 0x00, 0x00, 0x00});
+
+  EXPECT_GROUP(R"(umaxv b11, v1.16b)", SCALAR_SIMPLE_ARTH_NOSHIFT);
+}
+
 TEST_P(InstNeon, smax) {
   initialHeapData_.resize(32);
   uint32_t* heap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
diff --git a/test/regression/aarch64/instructions/store.cc b/test/regression/aarch64/instructions/store.cc
index 6d6876b494..2b43e510e4 100644
--- a/test/regression/aarch64/instructions/store.cc
+++ b/test/regression/aarch64/instructions/store.cc
@@ -3,6 +3,7 @@
 namespace {
 
 using InstStore = AArch64RegressionTest;
+using namespace simeng::arch::aarch64::InstructionGroups;
 
 TEST_P(InstStore, stlr) {
   // stlrb
@@ -60,6 +61,101 @@ TEST_P(InstStore, stlr) {
             0xBABA);
 }
 
+TEST_P(InstStore, stlxr) {
+  // stlxrb: byte stores at initial sp-4..-1; w4-w7 (1st operand) must read 0
+  RUN_AARCH64(R"(
+    mov w0, 0xAB
+    mov w1, 0x12
+    mov w2, 0xCD
+    mov w3, 0x34
+    sub sp, sp, #4
+    stlxrb w4, w0, [sp]
+    add sp, sp, #1
+    stlxrb w5, w1, [sp]
+    add sp, sp, #1
+    stlxrb w6, w2, [sp]
+    add sp, sp, #1
+    stlxrb w7, w3, [sp]
+    add sp, sp, #1
+  )");
+  EXPECT_EQ(getMemoryValue<uint8_t>(process_->getInitialStackPointer() - 4),
+            0xAB);
+  EXPECT_EQ(getMemoryValue<uint8_t>(process_->getInitialStackPointer() - 3),
+            0x12);
+  EXPECT_EQ(getMemoryValue<uint8_t>(process_->getInitialStackPointer() - 2),
+            0xCD);
+  EXPECT_EQ(getMemoryValue<uint8_t>(process_->getInitialStackPointer() - 1),
+            0x34);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(4), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(5), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(6), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(7), 0);
+
+  EXPECT_GROUP(R"(stlxrb w7, w3, [sp])", STORE_ADDRESS_INT);
+
+  // stlxrh: halfword stores at initial sp-8..-2; w4-w7 must again read 0
+  RUN_AARCH64(R"(
+    mov w0, 0xABCD
+    mov w1, 0x1234
+    mov w2, 0xCDEF
+    mov w3, 0x3456
+    sub sp, sp, #8
+    stlxrh w4, w0, [sp]
+    add sp, sp, #2
+    stlxrh w5, w1, [sp]
+    add sp, sp, #2
+    stlxrh w6, w2, [sp]
+    add sp, sp, #2
+    stlxrh w7, w3, [sp]
+    add sp, sp, #2
+  )");
+  EXPECT_EQ(getMemoryValue<uint16_t>(process_->getInitialStackPointer() - 8),
+            0xABCD);
+  EXPECT_EQ(getMemoryValue<uint16_t>(process_->getInitialStackPointer() - 6),
+            0x1234);
+  EXPECT_EQ(getMemoryValue<uint16_t>(process_->getInitialStackPointer() - 4),
+            0xCDEF);
+  EXPECT_EQ(getMemoryValue<uint16_t>(process_->getInitialStackPointer() - 2),
+            0x3456);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(4), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(5), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(6), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(7), 0);
+
+  EXPECT_GROUP(R"(stlxrh w7, w3, [sp])", STORE_ADDRESS_INT);
+
+  // stlxr: 64-bit stores of x0/x1, then 32-bit stores of w2/w3, from sp-24
+  RUN_AARCH64(R"(
+    mov w0, 0xABCD
+    mov w1, 0x1234
+    mov w2, 0xCDEF
+    mov w3, 0x3456
+    sub sp, sp, #24
+    stlxr w4, x0, [sp]
+    add sp, sp, #8
+    stlxr w5, x1, [sp]
+    add sp, sp, #8
+    stlxr w6, w2, [sp]
+    add sp, sp, #4
+    stlxr w7, w3, [sp]
+    add sp, sp, #4
+  )");
+  EXPECT_EQ(getMemoryValue<uint64_t>(process_->getInitialStackPointer() - 24),
+            0xABCD);
+  EXPECT_EQ(getMemoryValue<uint64_t>(process_->getInitialStackPointer() - 16),
+            0x1234);
+  EXPECT_EQ(getMemoryValue<uint32_t>(process_->getInitialStackPointer() - 8),
+            0xCDEF);
+  EXPECT_EQ(getMemoryValue<uint32_t>(process_->getInitialStackPointer() - 4),
+            0x3456);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(4), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(5), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(6), 0);
+  EXPECT_EQ(getGeneralRegister<uint32_t>(7), 0);
+
+  EXPECT_GROUP(R"(stlxr w7, w3, [sp])", STORE_ADDRESS_INT);
+}
+
 TEST_P(InstStore, strb) {
   RUN_AARCH64(R"(
     mov w0, 0xAB
diff --git a/test/regression/aarch64/instructions/sve.cc b/test/regression/aarch64/instructions/sve.cc
index 6a52d46b95..f94ee28262 100644
--- a/test/regression/aarch64/instructions/sve.cc
+++ b/test/regression/aarch64/instructions/sve.cc
@@ -1322,6 +1322,194 @@ TEST_P(InstSve, cmphi_vec) {
   EXPECT_EQ(getNZCV(), 0b0110);
 }
 
+TEST_P(InstSve, cmphs_vec) {
+  // 8-bit: all lanes, first half of lanes, swapped operands, equal operands
+  RUN_AARCH64(R"(
+    ptrue p0.b
+    dup z0.b, #-5
+    dup z1.b, #4
+
+    cmphs p1.b, p0/z, z0.b, z1.b
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #2
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    whilelo p0.b, xzr, x0
+    dup z0.b, #-5
+    dup z1.b, #4
+
+    cmphs p1.b, p0/z, z0.b, z1.b
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    ptrue p0.b
+    dup z0.b, #-5
+    dup z1.b, #4
+
+    cmphs p1.b, p0/z, z1.b, z0.b
+  )");
+  CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0});
+  EXPECT_EQ(getNZCV(), 0b0110);
+
+  RUN_AARCH64(R"(
+    ptrue p0.b
+    dup z0.b, #6
+    dup z1.b, #6
+
+    cmphs p1.b, p0/z, z1.b, z0.b
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 16-bit: all lanes, first half of lanes, swapped operands, equal operands
+  RUN_AARCH64(R"(
+    ptrue p0.h
+    dup z0.h, #-5
+    dup z1.h, #4
+
+    cmphs p1.h, p0/z, z0.h, z1.h
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #4
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    whilelo p0.h, xzr, x0
+    dup z0.h, #-5
+    dup z1.h, #4
+
+    cmphs p1.h, p0/z, z0.h, z1.h
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    ptrue p0.h
+    dup z0.h, #-5
+    dup z1.h, #4
+
+    cmphs p1.h, p0/z, z1.h, z0.h
+  )");
+  CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0});
+  EXPECT_EQ(getNZCV(), 0b0110);
+
+  RUN_AARCH64(R"(
+    ptrue p0.h
+    dup z0.h, #6
+    dup z1.h, #6
+
+    cmphs p1.h, p0/z, z1.h, z0.h
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 32-bit: all lanes, first half of lanes, swapped operands, equal operands
+  RUN_AARCH64(R"(
+    ptrue p0.s
+    dup z0.s, #-5
+    dup z1.s, #4
+
+    cmphs p1.s, p0/z, z0.s, z1.s
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #8
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    whilelo p0.s, xzr, x0
+    dup z0.s, #-5
+    dup z1.s, #4
+
+    cmphs p1.s, p0/z, z0.s, z1.s
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    ptrue p0.s
+    dup z0.s, #-5
+    dup z1.s, #4
+
+    cmphs p1.s, p0/z, z1.s, z0.s
+  )");
+  CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0});
+  EXPECT_EQ(getNZCV(), 0b0110);
+
+  RUN_AARCH64(R"(
+    ptrue p0.s
+    dup z0.s, #6
+    dup z1.s, #6
+
+    cmphs p1.s, p0/z, z1.s, z0.s
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 64-bit: all lanes, first half of lanes, swapped operands, equal operands
+  RUN_AARCH64(R"(
+    ptrue p0.d
+    dup z0.d, #-5
+    dup z1.d, #4
+
+    cmphs p1.d, p0/z, z0.d, z1.d
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #16
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    whilelo p0.d, xzr, x0
+    dup z0.d, #-5
+    dup z1.d, #4
+
+    cmphs p1.d, p0/z, z0.d, z1.d
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 16, {1}, 8));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    ptrue p0.d
+    dup z0.d, #-5
+    dup z1.d, #4
+
+    cmphs p1.d, p0/z, z1.d, z0.d
+  )");
+  CHECK_PREDICATE(1, uint64_t, {0, 0, 0, 0});
+  EXPECT_EQ(getNZCV(), 0b0110);
+
+  RUN_AARCH64(R"(
+    ptrue p0.d
+    dup z0.d, #6
+    dup z1.d, #6
+
+    cmphs p1.d, p0/z, z1.d, z0.d
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred(VL / 8, {1}, 8));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  EXPECT_GROUP(R"(cmphs p1.d, p0/z, z1.d, z0.d)", PREDICATE);
+}
+
 TEST_P(InstSve, cnt) {
   // pattern = all
   RUN_AARCH64(R"(
@@ -1544,6 +1732,146 @@ TEST_P(InstSve, cpy) {
   CHECK_NEON(4, int64_t, fillNeon<int64_t>({12}, VL / 8));
   CHECK_NEON(5, int64_t,
              fillNeon<int64_t>({static_cast<int16_t>(-2048)}, VL / 16));
+
+  // SIMD & FP scalar
+  // Tests are different for 8/16 bit vs 32/64 bit due to the lack of fmov
+  // support for h and b registers
+  // 8-bit
+  RUN_AARCH64(R"(
+      mov x0, #0
+      mov x1, #2
+      addvl x0, x0, #1
+      sdiv x0, x0, x1
+
+      ptrue p0.b
+      whilelo p1.b, xzr, x0
+
+      cpy z6.b, p0/z, #10
+      cpy z7.b, p0/z, #-8
+      cpy z8.b, p0/z, #12
+      cpy z9.b, p0/z, #-16
+      cpy z10.b, p0/z, #12
+      cpy z11.b, p0/z, #-8
+
+      cpy z0.b, p0/m, b6
+      cpy z1.b, p0/m, b7
+      cpy z2.b, p1/m, b8
+      cpy z3.b, p1/m, b9
+
+      # Test Alias
+      mov z4.b, p0/m, b10
+      mov z5.b, p1/m, b11
+    )");
+  CHECK_NEON(0, int8_t, fillNeon<int8_t>({10}, VL / 8));
+  CHECK_NEON(1, int8_t, fillNeon<int8_t>({-8}, VL / 8));
+  CHECK_NEON(2, int8_t, fillNeon<int8_t>({12}, VL / 16));
+  CHECK_NEON(3, int8_t, fillNeon<int8_t>({-16}, VL / 16));
+  CHECK_NEON(4, int8_t, fillNeon<int8_t>({12}, VL / 8));
+  CHECK_NEON(5, int8_t, fillNeon<int8_t>({-8}, VL / 16));
+
+  // 16-bit
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #4
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    ptrue p0.h
+    whilelo p1.h, xzr, x0
+
+    cpy z6.h, p0/z, #10
+    cpy z7.h, p0/z, #8, lsl #8
+    cpy z8.h, p0/z, #-12
+    cpy z9.h, p0/z, #-16, lsl #8
+    cpy z10.h, p0/z, #12
+    cpy z11.h, p0/z, #-8, lsl #8
+
+    cpy z0.h, p0/m, h6
+    cpy z1.h, p0/m, h7
+    cpy z2.h, p1/m, h8
+    cpy z3.h, p1/m, h9
+
+    # Test Alias
+    mov z4.h, p0/m, h10
+    mov z5.h, p1/m, h11
+  )");
+  CHECK_NEON(0, int16_t, fillNeon<int16_t>({10}, VL / 8));
+  CHECK_NEON(1, int16_t,
+             fillNeon<int16_t>({static_cast<int16_t>(2048)}, VL / 8));
+  CHECK_NEON(2, int16_t, fillNeon<int16_t>({-12}, VL / 16));
+  CHECK_NEON(3, int16_t,
+             fillNeon<int16_t>({static_cast<int16_t>(-4096)}, VL / 16));
+  CHECK_NEON(4, int16_t, fillNeon<int16_t>({12}, VL / 8));
+  CHECK_NEON(5, int16_t,
+             fillNeon<int16_t>({static_cast<int16_t>(-2048)}, VL / 16));
+
+  // 32-bit
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #8
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    ptrue p0.s
+    whilelo p1.s, xzr, x0
+
+    fmov s6, #10
+    fmov s7, #-8
+    fmov s8, #12
+    fmov s9, #-16
+    fmov s10, #12
+    fmov s11, #-8
+
+    cpy z0.s, p0/m, s6
+    cpy z1.s, p0/m, s7
+    cpy z2.s, p1/m, s8
+    cpy z3.s, p1/m, s9
+
+    # Test Alias
+    mov z4.S, p0/m, s10
+    mov z5.S, p1/m, s11
+  )");
+  CHECK_NEON(0, float, fillNeon<float>({10}, VL / 8));
+  CHECK_NEON(1, float, fillNeon<float>({static_cast<int16_t>(-8)}, VL / 8));
+  CHECK_NEON(2, float, fillNeon<float>({12}, VL / 16));
+  CHECK_NEON(3, float, fillNeon<float>({static_cast<int16_t>(-16)}, VL / 16));
+  CHECK_NEON(4, float, fillNeon<float>({12}, VL / 8));
+  CHECK_NEON(5, float, fillNeon<float>({static_cast<int16_t>(-8)}, VL / 16));
+
+  // 64-bit
+  RUN_AARCH64(R"(
+    mov x0, #0
+    mov x1, #16
+    addvl x0, x0, #1
+    sdiv x0, x0, x1
+
+    ptrue p0.d
+    whilelo p1.d, xzr, x0
+
+    fmov d6, #10
+    fmov d7, #-8
+    fmov d8, #12
+    fmov d9, #-16
+    fmov d10, #12
+    fmov d11, #-8
+
+    cpy z0.d, p0/m, d6
+    cpy z1.d, p0/m, d7
+    cpy z2.d, p1/m, d8
+    cpy z3.d, p1/m, d9
+
+    # Test Alias
+    mov z4.d, p0/m, d10
+    mov z5.d, p1/m, d11
+  )");
+  CHECK_NEON(0, double, fillNeon<double>({10}, VL / 8));
+  CHECK_NEON(1, double, fillNeon<double>({static_cast<int16_t>(-8)}, VL / 8));
+  CHECK_NEON(2, double, fillNeon<double>({12}, VL / 16));
+  CHECK_NEON(3, double, fillNeon<double>({static_cast<int16_t>(-16)}, VL / 16));
+  CHECK_NEON(4, double, fillNeon<double>({12}, VL / 8));
+  CHECK_NEON(5, double, fillNeon<double>({static_cast<int16_t>(-8)}, VL / 16));
+
+  EXPECT_GROUP(R"(cpy z3.d, p1/m, d9)", SVE_SIMPLE_ARTH_NOSHIFT);
 }
 
 TEST_P(InstSve, fcpy) {
@@ -3451,6 +3779,8 @@ TEST_P(InstSve, fdiv) {
   CHECK_NEON(1, double, fillNeon<double>(dresults, VL / 8));
   std::rotate(dsrcB.begin(), dsrcB.begin() + ((VL / 128) % 8), dsrcB.end());
   CHECK_NEON(2, double, fillNeonCombined<double>(dresults, dsrcB, VL / 8));
+
+  EXPECT_GROUP(R"(fdiv z2.d, p0/m, z2.d, z0.d)", SVE_DIV_OR_SQRT);
 }
 
 TEST_P(InstSve, fnmls) {
@@ -4610,10 +4940,17 @@ TEST_P(InstSve, index) {
   CHECK_NEON(7, uint64_t, fillNeonBaseAndOffset<uint64_t>(10, 10, VL / 8));
 }
 
-TEST_P(InstSve, ld1rd) {
-  initialHeapData_.resize(16);
-  uint64_t* heap64 = reinterpret_cast<uint64_t*>(initialHeapData_.data());
-  fillHeap<uint64_t>(heap64, {0xDEADBEEF, 0x12345678}, 2);
+TEST_P(InstSve, ftsmul) {
+  initialHeapData_.resize(VL / 4);
+  // 64-bit arrangement
+  double* dheap = reinterpret_cast<double*>(initialHeapData_.data());
+  std::vector<double> srcA64 = {1.0, 2.0, 4.0, 12.34};
+  // Note that "the elements of the second source vector hold the corresponding
+  // value of the quadrant Q number as an integer not a floating-point value".
+  // We use doubles anyway as we only care about the sign bit, and currently
+  // "fillHeapCombined" only takes a single templated type
+  std::vector<double> srcB64 = {1.0, -5.4, 0.0, 78.2};
+  fillHeapCombined<double>(dheap, srcA64, srcB64, VL / 32);
 
   RUN_AARCH64(R"(
     # Get heap address
@@ -4621,24 +4958,251 @@ TEST_P(InstSve, ld1rd) {
     mov x8, 214
     svc #0
 
-    # Load and broadcast values from heap
+    mov x1, #0
+    mov x2, #0
+    mov x3, #8
+    addvl x2, x2, #1
+    udiv x2, x2, x3
     ptrue p0.d
-    ld1rd {z0.d}, p0/z, [x0]
-    ld1rd {z1.d}, p0/z, [x0, #8]
 
-    # Test for inactive lanes
-    mov x1, #0
-    addvl x1, x1, #1
-    mov x2, #16
-    udiv x1, x1, x2
-    whilelo p1.d, xzr, x1
-    ld1rd {z2.d}, p1/z, [x0]
-    ld1rd {z3.d}, p1/z, [x0, #8]
+    ld1d {z0.d}, p0/z, [x0, x1, lsl #3] 
+    ld1d {z1.d}, p0/z, [x0, x2, lsl #3]
+
+    ftsmul z2.d, z0.d, z1.d
+    ftsmul z3.d, z1.d, z0.d
   )");
-  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0xDEADBEEF}, VL / 8));
-  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x12345678}, VL / 8));
-  CHECK_NEON(2, uint64_t, fillNeon<uint64_t>({0xDEADBEEF}, VL / 16));
-  CHECK_NEON(3, uint64_t, fillNeon<uint64_t>({0x12345678}, VL / 16));
+  CHECK_NEON(2, double, fillNeon<double>({1.0, -4.0, 16.0, 152.2756}, VL / 8));
+  CHECK_NEON(3, double, fillNeon<double>({1.0, 29.16, 0.0, 6115.24}, VL / 8));
+
+  // 32-bit arrangement
+  initialHeapData_.resize(VL / 8);
+  float* fheap = reinterpret_cast<float*>(initialHeapData_.data());
+  std::vector<float> fsrcA = {1.0f,  2.0f,   4.0f, 12.34f,
+                              -3.0f, -19.6f, 0.0f, 7.0f};
+  // Note that "the elements of the second source vector hold the corresponding
+  // value of the quadrant Q number as an integer not a floating-point value".
+  // We use floats anyway as we only care about the sign bit, and currently
+  // "fillHeapCombined" only takes a single templated type
+  std::vector<float> fsrcB = {1.0f, -5.4f,   0.0f,  78.2f,
+                              2.1f, -26.42f, 12.0f, 3.5f};
+  fillHeapCombined<float>(fheap, fsrcA, fsrcB, VL / 32);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #8
+    addvl x2, x2, #1
+    sdiv x2, x2, x3
+
+    whilelo p0.s, xzr, x2
+    ptrue p1.s
+
+    ld1w {z0.s}, p0/z, [x0, x1, lsl #2]
+    ld1w {z1.s}, p0/z, [x0, x2, lsl #2]
+
+    ftsmul z2.s, z0.s, z1.s
+    ftsmul z3.s, z1.s, z0.s
+  )");
+  CHECK_NEON(2, float,
+             fillNeon<float>(
+                 {1.0f, -4.0f, 16.0f, 152.2756f, 9.0f, -384.16f, 0.0f, 49.0f},
+                 VL / 16));
+  CHECK_NEON(3, float,
+             fillNeon<float>({1.0f, 29.16f, 0.0f, 6115.24f, -4.41f, -698.0164f,
+                              144.0f, 12.25f},
+                             VL / 16));
+
+  EXPECT_GROUP(R"(ftsmul z3.s, z1.s, z0.s)", SVE_SIMPLE_ARTH_NOSHIFT);
+}
+
+TEST_P(InstSve, ftssel) {
+  initialHeapData_.resize(VL / 4);
+  // 64-bit arrangement
+  // We use uint64_t to model doubles here as we care about the bit patterns
+  // rather than values
+  uint64_t* dheap = reinterpret_cast<uint64_t*>(initialHeapData_.data());
+  std::vector<uint64_t> srcA64 = {0x1234, 0xABCD, 0x00000000F0F0FFFF, 0x9876};
+  // Note that "The use of the second operand is consistent with it holding an
+  // integer corresponding to the desired sine-wave quadrant."
+  std::vector<uint64_t> srcB64 = {0x0, 0x8000000000000000, 0x4000000000000000,
+                                  0xC000000000000000};
+  fillHeapCombined<uint64_t>(dheap, srcA64, srcB64, VL / 32);
+  // Expected below: srcB bit 63 swaps in 1.0, srcB bit 62 sets the sign bit
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #8
+    addvl x2, x2, #1
+    udiv x2, x2, x3
+    ptrue p0.d
+
+    ld1d {z0.d}, p0/z, [x0, x1, lsl #3] 
+    ld1d {z1.d}, p0/z, [x0, x2, lsl #3]
+
+    ftssel z2.d, z0.d, z1.d
+  )");
+  CHECK_NEON(2, uint64_t,
+             fillNeon<uint64_t>({0x1234, 0x3ff0000000000000, 0x80000000F0F0FFFF,
+                                 0xbff0000000000000},
+                                VL / 8));
+  // NOTE(review): srcB uses the top two bits, not quadrant bits 1/0 - confirm
+  // 32-bit arrangement
+  // We use uint32_t to model floats here as we care about the bit patterns
+  // rather than values
+  initialHeapData_.resize(VL / 8);
+  uint32_t* fheap = reinterpret_cast<uint32_t*>(initialHeapData_.data());
+  std::vector<uint32_t> fsrcA = {0x1234, 0xABCD, 0x00F0FFFF, 0x9876};
+  // Note that "the elements of the second source vector hold the corresponding
+  // value of the quadrant Q number as an integer not a floating-point value".
+  std::vector<uint32_t> fsrcB = {0x0, 0x80000000, 0x40000000, 0xC0000000};
+  fillHeapCombined<uint32_t>(fheap, fsrcA, fsrcB, VL / 32);
+  // Expected below: srcB bit 31 swaps in 1.0, srcB bit 30 sets the sign bit
+  RUN_AARCH64(R"(
+     # Get heap address
+     mov x0, 0
+     mov x8, 214
+     svc #0
+
+     mov x1, #0
+     mov x2, #0
+     mov x3, #8
+     addvl x2, x2, #1
+     sdiv x2, x2, x3
+
+     whilelo p0.s, xzr, x2
+     ptrue p1.s
+
+     ld1w {z0.s}, p0/z, [x0, x1, lsl #2]
+     ld1w {z1.s}, p0/z, [x0, x2, lsl #2]
+
+     ftssel z2.s, z0.s, z1.s
+   )");
+  CHECK_NEON(2, uint32_t,
+             fillNeon<uint32_t>({0x1234, 0x3f800000, 0x80F0FFFF, 0xBF800000},
+                                VL / 16));
+
+  EXPECT_GROUP(R"(ftssel z2.s, z0.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT);
+}
+
+TEST_P(InstSve, ftmad) {
+  initialHeapData_.resize(VL / 4);
+  // 64-bit arrangement
+  double* dheap = reinterpret_cast<double*>(initialHeapData_.data());
+  std::vector<double> srcA64 = {0.0, 0.5, -0.5, 0.75};
+  std::vector<double> srcB64 = {0.0, 0.5, -0.4, -0.2};
+  fillHeapCombined<double>(dheap, srcA64, srcB64, VL / 32);
+  // For imm #0 the expected lanes below equal srcA * |srcB| + 1.0
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #8
+    addvl x2, x2, #1
+    udiv x2, x2, x3
+    ptrue p0.d
+
+    ld1d {z0.d}, p0/z, [x0, x1, lsl #3]
+    ld1d {z1.d}, p0/z, [x0, x2, lsl #3]
+    mov z2.d, z0.d
+    mov z3.d, z0.d
+    mov z4.d, z0.d
+
+    ftmad z2.d, z2.d, z1.d, #0
+    ftmad z3.d, z3.d, z1.d, #2
+    ftmad z4.d, z4.d, z1.d, #7
+  )");
+  CHECK_NEON(2, double, fillNeon<double>({1.0, 1.25, 0.8, 1.15}, VL / 8));
+  CHECK_NEON(3, double,
+             fillNeon<double>({0.008333333333320002, 0.258333333333320002,
+                               -0.15833333333333355, 0.19166666666666645},
+                              VL / 8));
+  CHECK_NEON(
+      4, double,
+      fillNeon<double>({0.0, 0.25, -0.20000000001135337, 0.1499999999886466},
+                       VL / 8));
+
+  // 32-bit arrangement: same source values as the 64-bit case, held as floats
+  initialHeapData_.resize(VL / 4);
+  float* fheap = reinterpret_cast<float*>(initialHeapData_.data());
+  std::vector<float> fsrcA = {0.0f, 0.5f, -0.5f, 0.75f};
+  std::vector<float> fsrcB = {0.0f, 0.5f, -0.4f, -0.2f};
+  fillHeapCombined<float>(fheap, fsrcA, fsrcB, VL / 16);
+
+  RUN_AARCH64(R"(
+     # Get heap address
+     mov x0, 0
+     mov x8, 214
+     svc #0
+
+     mov x1, #0
+     mov x2, #0
+     mov x3, #4
+     addvl x2, x2, #1
+     sdiv x2, x2, x3
+
+     whilelo p0.s, xzr, x2
+
+     ld1w {z2.s}, p0/z, [x0]
+     ld1w {z3.s}, p0/z, [x0]
+     ld1w {z4.s}, p0/z, [x0, x1, lsl #2]
+     ld1w {z1.s}, p0/z, [x0, x2, lsl #2]
+
+     ftmad z2.s, z2.s, z1.s, #0
+     ftmad z3.s, z3.s, z1.s, #2
+     ftmad z4.s, z4.s, z1.s, #7
+   )");
+  CHECK_NEON(2, float, fillNeon<float>({1.0f, 1.25f, 0.8f, 1.15f}, VL / 8));
+  CHECK_NEON(3, float,
+             fillNeon<float>(
+                 {0.00833333f, 0.25833333f, -0.1583334f, 0.1916666f}, VL / 8));
+  CHECK_NEON(4, float, fillNeon<float>({0.0f, 0.25f, -0.2f, 0.15f}, VL / 8));
+
+  EXPECT_GROUP(R"(ftmad z4.s, z4.s, z1.s, #7)", SVE_MUL);
+}
+
+TEST_P(InstSve, ld1rd) {
+  initialHeapData_.resize(16);
+  uint64_t* heap64 = reinterpret_cast<uint64_t*>(initialHeapData_.data());
+  fillHeap<uint64_t>(heap64, {0xDEADBEEF, 0x12345678}, 2);
+  // z0/z1 load under an all-true predicate; z2/z3 under a half-length one
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    # Load and broadcast values from heap
+    ptrue p0.d
+    ld1rd {z0.d}, p0/z, [x0]
+    ld1rd {z1.d}, p0/z, [x0, #8]
+
+    # Test for inactive lanes
+    mov x1, #0
+    addvl x1, x1, #1
+    mov x2, #16
+    udiv x1, x1, x2
+    whilelo p1.d, xzr, x1
+    ld1rd {z2.d}, p1/z, [x0]
+    ld1rd {z3.d}, p1/z, [x0, #8]
+  )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0xDEADBEEF}, VL / 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x12345678}, VL / 8));
+  CHECK_NEON(2, uint64_t, fillNeon<uint64_t>({0xDEADBEEF}, VL / 16));
+  CHECK_NEON(3, uint64_t, fillNeon<uint64_t>({0x12345678}, VL / 16));
 }
 
 TEST_P(InstSve, ld1rqd) {
@@ -5647,6 +6211,29 @@ TEST_P(InstSve, pfalse) {
   CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {0}, 1));
 }
 
+TEST_P(InstSve, pfirst) {
+  RUN_AARCH64(R"(
+    ptrue p0.b
+    pfalse p1.b
+    ptrue p2.b
+    ptrue p3.b
+    pfalse p4.b
+    pfalse p5.b
+
+    pfirst p2.b, p0, p2.b
+    pfirst p3.b, p1, p3.b
+    pfirst p4.b, p0, p4.b
+    pfirst p5.b, p1, p5.b
+  )");
+  CHECK_PREDICATE(2, uint64_t, fillPred(VL / 8, {1}, 1));
+  CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1));
+  CHECK_PREDICATE(4, uint64_t, fillPred(1, {1}, 1));
+  CHECK_PREDICATE(5, uint64_t, fillPred(VL / 8, {0}, 1));
+  EXPECT_EQ(getNZCV(), 0b0110);
+  // p2-p5 cover all pairings of all-true/all-false governing and source regs
+  EXPECT_GROUP(R"(pfirst p5.b, p1, p5.b)", PREDICATE);
+}
+
 TEST_P(InstSve, ptrue) {
   RUN_AARCH64(R"(
     ptrue p0.s
@@ -5660,6 +6247,108 @@ TEST_P(InstSve, ptrue) {
   CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2));
 }
 
+TEST_P(InstSve, pnext) {
+  initialHeapData_.resize(1024);
+  uint64_t* heap64 = reinterpret_cast<uint64_t*>(initialHeapData_.data());
+  // Heap holds the source predicates; each is loaded with ldr before pnext
+  // B arrangement: two pnext cases, p2 -> p0 and p3 -> p1
+  // Allow 32 Byte space for each predicate register for when VL=2048
+  std::vector<uint64_t> src = {0xAAAA, 0x0, 0x0,    0x0, 0x0, 0x0,
+                               0x0,    0x0, 0xAA00, 0x0, 0x0, 0x0};
+  fillHeap<uint64_t>(heap64, src, 12);
+  RUN_AARCH64(R"(
+        # Get heap address
+        mov x0, 0
+        mov x8, 214
+        svc #0
+
+        ldr p2, [x0]
+        add x0, x0, #32
+        ldr p0, [x0]
+
+        pnext p0.b, p2, p0.b
+
+        ldr p1, [x0]
+        add x0, x0, #32
+        ldr p3, [x0]
+
+        pnext p1.b, p3, p1.b
+  )");
+  CHECK_PREDICATE(0, uint64_t,
+                  fillPredFromSource<uint64_t>({0x02, 0, 0, 0}, 32));
+  CHECK_PREDICATE(1, uint64_t,
+                  fillPredFromSource<uint64_t>({0x0200, 0, 0, 0}, 32));
+  EXPECT_EQ(getNZCV(), 0b0010);
+
+  // H arrangement: p1 (heap offset 0) is the 2nd operand, p0 (offset 32) the dst
+  src = {0x555, 0x0, 0x0, 0x0, 0x333, 0x0, 0x0, 0x0};
+  fillHeap<uint64_t>(heap64, src, 8);
+  RUN_AARCH64(R"(
+        # Get heap address
+        mov x0, 0
+        mov x8, 214
+        svc #0
+
+        ldr p1, [x0]
+        add x0, x0, #32
+        ldr p0, [x0]
+
+        pnext p0.h, p1, p0.h
+  )");
+  CHECK_PREDICATE(0, uint64_t,
+                  fillPredFromSource<uint64_t>({0x400, 0, 0, 0}, 32));
+  EXPECT_EQ(getNZCV(), 0b0010);
+
+  // S arrangement: same register layout as the H case
+  src = {0x9, 0x0, 0x0, 0x0, 0x6, 0x0, 0x0, 0x0};
+  fillHeap<uint64_t>(heap64, src, 8);
+  RUN_AARCH64(R"(
+        # Get heap address
+        mov x0, 0
+        mov x8, 214
+        svc #0
+
+        ldr p1, [x0]
+        add x0, x0, #32
+        ldr p0, [x0]
+
+        pnext p0.s, p1, p0.s
+  )");
+  CHECK_PREDICATE(0, uint64_t,
+                  fillPredFromSource<uint64_t>({0x1, 0, 0, 0}, 32));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  // D arrangement: four predicates 32 bytes apart, so write all 16 words
+  src = {0x3,  0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0,
+         0xF3, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
+  fillHeap<uint64_t>(heap64, src, 16);
+  RUN_AARCH64(R"(
+        # Get heap address
+        mov x0, 0
+        mov x8, 214
+        svc #0
+
+        ldr p2, [x0]
+        add x0, x0, #32
+        ldr p0, [x0]
+
+        pnext p0.d, p2, p0.d
+
+        add x0, x0, #32
+        ldr p3, [x0]
+        add x0, x0, #32
+        ldr p1, [x0]
+
+        pnext p1.d, p3, p1.d
+  )");
+  CHECK_PREDICATE(0, uint64_t, fillPredFromSource<uint64_t>({0, 0, 0, 0}, 32));
+  CHECK_PREDICATE(1, uint64_t,
+                  fillPredFromSource<uint64_t>({0x1, 0, 0, 0}, 32));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  EXPECT_GROUP(R"(pnext p1.d, p3, p1.d)", PREDICATE);
+}
+
 TEST_P(InstSve, punpk) {
   RUN_AARCH64(R"(
     ptrue p0.b
@@ -5965,6 +6654,64 @@ TEST_P(InstSve, sel) {
 }
 
 TEST_P(InstSve, smax) {
+  // 64-bit
+  initialHeapData_.resize(VL / 4);
+  int64_t* heap64 = reinterpret_cast<int64_t*>(initialHeapData_.data());
+  std::vector<int64_t> srcA64 = {1,  2,   3,   4,   5,  6,  7,   8,
+                                 -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int64_t> srcB64 = {16, 15, 14, 13, -12, -11, -10, -9,
+                                 8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int64_t>(heap64, srcA64, srcB64, VL / 32);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #8
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.d, xzr, x3
+    ptrue p0.d
+
+    ld1d {z0.d}, p0/z, [x0, x1, lsl #3]
+    ld1d {z1.d}, p0/z, [x0, x2, lsl #3]
+    ld1d {z2.d}, p0/z, [x0, x2, lsl #3]
+    ld1d {z3.d}, p0/z, [x0, x1, lsl #3]
+    ld1d {z4.d}, p0/z, [x0, x1, lsl #3]
+    ld1d {z5.d}, p0/z, [x0, x1, lsl #3]
+
+    smax z1.d, p0/m, z1.d, z0.d
+    smax z2.d, p1/m, z2.d, z0.d
+      
+    smax z3.d, z3.d, #0
+    smax z4.d, z4.d, #-128
+    smax z5.d, z5.d, #127
+  )");
+  std::vector<int64_t> results64 = {16, 15, 14, 13, 5,  6,  7,  8,
+                                    8,  7,  6,  5,  13, 14, -2, -1};
+  CHECK_NEON(1, int64_t, fillNeon<int64_t>(results64, VL / 8));
+  std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end());
+  CHECK_NEON(2, int64_t, fillNeonCombined<int64_t>(results64, srcB64, VL / 8));
+
+  CHECK_NEON(3, int64_t,
+             fillNeon<int64_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8));
+  CHECK_NEON(4, int64_t,
+             fillNeon<int64_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1},
+                 VL / 8));
+  CHECK_NEON(5, int64_t,
+             fillNeon<int64_t>({127, 127, 127, 127, 127, 127, 127, 127, 127,
+                                127, 127, 127, 127, 127, 127, 127},
+                               VL / 8));
+
   // 32-bit
   initialHeapData_.resize(VL / 4);
   int32_t* heap32 = reinterpret_cast<int32_t*>(initialHeapData_.data());
@@ -6022,9 +6769,184 @@ TEST_P(InstSve, smax) {
              fillNeon<int32_t>({127, 127, 127, 127, 127, 127, 127, 127, 127,
                                 127, 127, 127, 127, 127, 127, 127},
                                VL / 8));
+
+  // 16-bit
+  initialHeapData_.resize(VL / 4);
+  int16_t* heap16 = reinterpret_cast<int16_t*>(initialHeapData_.data());
+  std::vector<int16_t> srcA16 = {1,  2,   3,   4,   5,  6,  7,   8,
+                                 -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int16_t> srcB16 = {16, 15, 14, 13, -12, -11, -10, -9,
+                                 8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int16_t>(heap16, srcA16, srcB16, VL / 8);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #2
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.h, xzr, x3
+    ptrue p0.h
+
+    ld1h {z0.h}, p0/z, [x0, x1, lsl #1]
+    ld1h {z1.h}, p0/z, [x0, x2, lsl #1]
+    ld1h {z2.h}, p0/z, [x0, x2, lsl #1]
+    ld1h {z3.h}, p0/z, [x0, x1, lsl #1]
+    ld1h {z4.h}, p0/z, [x0, x1, lsl #1]
+    ld1h {z5.h}, p0/z, [x0, x1, lsl #1]
+
+    smax z1.h, p0/m, z1.h, z0.h
+    smax z2.h, p1/m, z2.h, z0.h
+      
+    smax z3.h, z3.h, #0
+    smax z4.h, z4.h, #-128
+    smax z5.h, z5.h, #127
+  )");
+  std::vector<int16_t> results16 = {16, 15, 14, 13, 5,  6,  7,  8,
+                                    8,  7,  6,  5,  13, 14, -2, -1};
+  CHECK_NEON(1, int16_t, fillNeon<int16_t>(results16, VL / 8));
+  std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end());
+  CHECK_NEON(2, int16_t, fillNeonCombined<int16_t>(results16, srcB16, VL / 8));
+
+  CHECK_NEON(3, int16_t,
+             fillNeon<int16_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8));
+  CHECK_NEON(4, int16_t,
+             fillNeon<int16_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1},
+                 VL / 8));
+  CHECK_NEON(5, int16_t,
+             fillNeon<int16_t>({127, 127, 127, 127, 127, 127, 127, 127, 127,
+                                127, 127, 127, 127, 127, 127, 127},
+                               VL / 8));
+
+  // 8-bit
+  initialHeapData_.resize(VL / 4);
+  int8_t* heap8 = reinterpret_cast<int8_t*>(initialHeapData_.data());
+  std::vector<int8_t> srcA8 = {1,  2,   3,   4,   5,  6,  7,   8,
+                               -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int8_t> srcB8 = {16, 15, 14, 13, -12, -11, -10, -9,
+                               8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int8_t>(heap8, srcA8, srcB8, VL / 4);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #1
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.b, xzr, x3
+    ptrue p0.b
+
+    ld1b {z0.b}, p0/z, [x0, x1]
+    ld1b {z1.b}, p0/z, [x0, x2]
+    ld1b {z2.b}, p0/z, [x0, x2]
+    ld1b {z3.b}, p0/z, [x0, x1]
+    ld1b {z4.b}, p0/z, [x0, x1]
+    ld1b {z5.b}, p0/z, [x0, x1]
+
+    smax z1.b, p0/m, z1.b, z0.b
+    smax z2.b, p1/m, z2.b, z0.b
+      
+    smax z3.b, z3.b, #0
+    smax z4.b, z4.b, #-128
+    smax z5.b, z5.b, #127
+  )");
+  std::vector<int8_t> results8 = {16, 15, 14, 13, 5,  6,  7,  8,
+                                  8,  7,  6,  5,  13, 14, -2, -1};
+  CHECK_NEON(1, int8_t, fillNeon<int8_t>(results8, VL / 8));
+  std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end());
+  CHECK_NEON(2, int8_t, fillNeonCombined<int8_t>(results8, srcB8, VL / 8));
+
+  CHECK_NEON(3, int8_t,
+             fillNeon<int8_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 13, 14, 0, 0}, VL / 8));
+  CHECK_NEON(4, int8_t,
+             fillNeon<int8_t>(
+                 {1, 2, 3, 4, 5, 6, 7, 8, -9, -10, -11, -12, 13, 14, -15, -1},
+                 VL / 8));
+  CHECK_NEON(5, int8_t,
+             fillNeon<int8_t>({127, 127, 127, 127, 127, 127, 127, 127, 127, 127,
+                               127, 127, 127, 127, 127, 127},
+                              VL / 8));
+
+  EXPECT_GROUP(R"(smax z5.b, z5.b, #127)", SVE_SIMPLE_ARTH_NOSHIFT);
 }
 
-TEST_P(InstSve, smin) {
+TEST_P(InstSve, smin) {
+  // 64-bit
+  initialHeapData_.resize(VL / 4);
+  int64_t* heap64 = reinterpret_cast<int64_t*>(initialHeapData_.data());
+  std::vector<int64_t> srcA64 = {1,  2,   3,   4,   5,  6,  7,   8,
+                                 -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int64_t> srcB64 = {16, 15, 14, 13, -12, -11, -10, -9,
+                                 8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int64_t>(heap64, srcA64, srcB64, VL / 32);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #8
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.d, xzr, x3
+    ptrue p0.d
+
+    ld1d {z0.d}, p0/z, [x0, x1, lsl #3]
+    ld1d {z1.d}, p0/z, [x0, x2, lsl #3]
+    ld1d {z2.d}, p0/z, [x0, x2, lsl #3]
+
+    smin z1.d, p0/m, z1.d, z0.d
+    smin z2.d, p1/m, z2.d, z0.d
+
+    sminv d3, p1, z1.d
+    sminv d4, p0, z2.d
+  )");
+
+  std::vector<int64_t> results64 = {1,  2,   3,   4,   -12, -11, -10, -9,
+                                    -9, -10, -11, -12, 4,   3,   -15, -1};
+  std::array<int64_t, 32> arrA64 = fillNeon<int64_t>(results64, VL / 8);
+  std::rotate(srcB64.begin(), srcB64.begin() + ((VL / 128) % 16), srcB64.end());
+  std::array<int64_t, 32> arrB64 =
+      fillNeonCombined<int64_t>(results64, srcB64, VL / 8);
+
+  CHECK_NEON(1, int64_t, arrA64);
+  CHECK_NEON(2, int64_t, arrB64);
+  // Find minimum element. Modify search end point to only consider the
+  // elements within the current VL and predication.
+  int64_t minElemA64 = arrA64[std::distance(
+      arrA64.begin(),
+      std::min_element(arrA64.begin(), arrA64.end() - (32 - VL / 128)))];
+  int64_t minElemB64 = arrB64[std::distance(
+      arrB64.begin(),
+      std::min_element(arrB64.begin(), arrB64.end() - (32 - VL / 64)))];
+  CHECK_NEON(3, int64_t, {minElemA64, 0, 0, 0});
+  CHECK_NEON(4, int64_t, {minElemB64, 0, 0, 0});
+
   // 32-bit
   initialHeapData_.resize(VL / 4);
   int32_t* heap32 = reinterpret_cast<int32_t*>(initialHeapData_.data());
@@ -6064,23 +6986,140 @@ TEST_P(InstSve, smin) {
 
   std::vector<int32_t> results32 = {1,  2,   3,   4,   -12, -11, -10, -9,
                                     -9, -10, -11, -12, 4,   3,   -15, -1};
-  std::array<int32_t, 64> arrA = fillNeon<int32_t>(results32, VL / 8);
+  std::array<int32_t, 64> arrA32 = fillNeon<int32_t>(results32, VL / 8);
   std::rotate(srcB32.begin(), srcB32.begin() + ((VL / 64) % 16), srcB32.end());
-  std::array<int32_t, 64> arrB =
+  std::array<int32_t, 64> arrB32 =
       fillNeonCombined<int32_t>(results32, srcB32, VL / 8);
 
-  CHECK_NEON(1, int32_t, arrA);
-  CHECK_NEON(2, int32_t, arrB);
+  CHECK_NEON(1, int32_t, arrA32);
+  CHECK_NEON(2, int32_t, arrB32);
+  // Find minimum element. Modify search end point to only consider the
+  // elements within the current VL and predication.
+  int32_t minElemA32 = arrA32[std::distance(
+      arrA32.begin(),
+      std::min_element(arrA32.begin(), arrA32.end() - (64 - VL / 64)))];
+  int32_t minElemB32 = arrB32[std::distance(
+      arrB32.begin(),
+      std::min_element(arrB32.begin(), arrB32.end() - (64 - VL / 32)))];
+  CHECK_NEON(3, int32_t, {minElemA32, 0, 0, 0});
+  CHECK_NEON(4, int32_t, {minElemB32, 0, 0, 0});
+
+  // 16-bit
+  initialHeapData_.resize(VL / 4);
+  int16_t* heap16 = reinterpret_cast<int16_t*>(initialHeapData_.data());
+  std::vector<int16_t> srcA16 = {1,  2,   3,   4,   5,  6,  7,   8,
+                                 -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int16_t> srcB16 = {16, 15, 14, 13, -12, -11, -10, -9,
+                                 8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int16_t>(heap16, srcA16, srcB16, VL / 8);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #2
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.h, xzr, x3
+    ptrue p0.h
+
+    ld1h {z0.h}, p0/z, [x0, x1, lsl #1]
+    ld1h {z1.h}, p0/z, [x0, x2, lsl #1]
+    ld1h {z2.h}, p0/z, [x0, x2, lsl #1]
+
+    smin z1.h, p0/m, z1.h, z0.h
+    smin z2.h, p1/m, z2.h, z0.h
+
+    sminv h3, p1, z1.h
+    sminv h4, p0, z2.h
+  )");
+
+  std::vector<int16_t> results16 = {1,  2,   3,   4,   -12, -11, -10, -9,
+                                    -9, -10, -11, -12, 4,   3,   -15, -1};
+  std::array<int16_t, 128> arrA16 = fillNeon<int16_t>(results16, VL / 8);
+  std::rotate(srcB16.begin(), srcB16.begin() + ((VL / 32) % 16), srcB16.end());
+  std::array<int16_t, 128> arrB16 =
+      fillNeonCombined<int16_t>(results16, srcB16, VL / 8);
+
+  CHECK_NEON(1, int16_t, arrA16);
+  CHECK_NEON(2, int16_t, arrB16);
+  // Find minimum element. Modify search end point to only consider the
+  // elements within the current VL and predication.
+  int16_t minElemA16 = arrA16[std::distance(
+      arrA16.begin(),
+      std::min_element(arrA16.begin(), arrA16.end() - (128 - VL / 32)))];
+  int16_t minElemB16 = arrB16[std::distance(
+      arrB16.begin(),
+      std::min_element(arrB16.begin(), arrB16.end() - (128 - VL / 16)))];
+  CHECK_NEON(3, int16_t, {minElemA16, 0, 0, 0});
+  CHECK_NEON(4, int16_t, {minElemB16, 0, 0, 0});
+
+  // 8-bit
+  initialHeapData_.resize(VL / 4);
+  int8_t* heap8 = reinterpret_cast<int8_t*>(initialHeapData_.data());
+  std::vector<int8_t> srcA8 = {1,  2,   3,   4,   5,  6,  7,   8,
+                               -9, -10, -11, -12, 13, 14, -15, -1};
+  std::vector<int8_t> srcB8 = {16, 15, 14, 13, -12, -11, -10, -9,
+                               8,  7,  6,  5,  4,   3,   -2,  -1};
+  fillHeapCombined<int8_t>(heap8, srcA8, srcB8, VL / 4);
+
+  RUN_AARCH64(R"(
+    # Get heap address
+    mov x0, 0
+    mov x8, 214
+    svc #0
+
+    mov x1, #0
+    mov x2, #0
+    mov x3, #0
+    mov x4, #1
+    mov x5, #2
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    udiv x3, x2, x5
+    whilelo p1.b, xzr, x3
+    ptrue p0.b
+
+    ld1b {z0.b}, p0/z, [x0, x1]
+    ld1b {z1.b}, p0/z, [x0, x2]
+    ld1b {z2.b}, p0/z, [x0, x2]
+
+    smin z1.b, p0/m, z1.b, z0.b
+    smin z2.b, p1/m, z2.b, z0.b
+
+    sminv b3, p1, z1.b
+    sminv b4, p0, z2.b
+  )");
+
+  std::vector<int8_t> results8 = {1,  2,   3,   4,   -12, -11, -10, -9,
+                                  -9, -10, -11, -12, 4,   3,   -15, -1};
+  std::array<int8_t, 256> arrA8 = fillNeon<int8_t>(results8, VL / 8);
+  std::rotate(srcB8.begin(), srcB8.begin() + ((VL / 16) % 16), srcB8.end());
+  std::array<int8_t, 256> arrB8 =
+      fillNeonCombined<int8_t>(results8, srcB8, VL / 8);
+
+  CHECK_NEON(1, int8_t, arrA8);
+  CHECK_NEON(2, int8_t, arrB8);
   // Find miniumum element. Modify search end point to only consider the
   // elements within the current VL and predication.
-  int32_t minElemA = arrA[std::distance(
-      arrA.begin(),
-      std::min_element(arrA.begin(), arrA.end() - (64 - VL / 64)))];
-  int32_t minElemB = arrB[std::distance(
-      arrB.begin(),
-      std::min_element(arrB.begin(), arrB.end() - (64 - VL / 32)))];
-  CHECK_NEON(3, int32_t, {minElemA, 0, 0, 0});
-  CHECK_NEON(4, int32_t, {minElemB, 0, 0, 0});
+  int8_t minElemA8 = arrA8[std::distance(
+      arrA8.begin(),
+      std::min_element(arrA8.begin(), arrA8.end() - (256 - VL / 16)))];
+  int8_t minElemB8 = arrB8[std::distance(
+      arrB8.begin(),
+      std::min_element(arrB8.begin(), arrB8.end() - (256 - VL / 8)))];
+  CHECK_NEON(3, int8_t, {minElemA8, 0, 0, 0});
+  CHECK_NEON(4, int8_t, {minElemB8, 0, 0, 0});
+
+  EXPECT_GROUP(R"(smin z2.b, p1/m, z2.b, z0.b)", SVE_SIMPLE_ARTH_NOSHIFT);
+  EXPECT_GROUP(R"(sminv b4, p0, z2.b)", SCALAR_SIMPLE_ARTH_NOSHIFT);
 }
 
 TEST_P(InstSve, smulh) {
@@ -6164,6 +7203,260 @@ TEST_P(InstSve, smulh) {
              fillNeonCombined<int32_t>({-12}, {-1076902265}, VL / 8));
 }
 
+TEST_P(InstSve, clastb) {
+  // 64 bit: pfalse expects d2 kept (z0); ptrue expects last z3.d element (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        clastb d2, p0, d2, z3.d
+        mov z0.d, z2.d
+
+        ptrue p0.d
+        clastb d2, p0, d2, z3.d
+        mov z1.d, z2.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x0123456789ABCDEF}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FEDCBA987654321}, 8));
+
+  // 32 bit: pfalse expects s2 kept (z0); ptrue expects last z3.s element (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        clastb s2, p0, s2, z3.s
+        mov z0.d, z2.d
+
+        ptrue p0.s
+        clastb s2, p0, s2, z3.s
+        mov z1.d, z2.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x89ABCDEF}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FEDCBA9}, 8));
+
+  // 16 bit: pfalse expects h2 kept (z0); ptrue expects last z3.h element (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        clastb h2, p0, h2, z3.h
+        mov z0.d, z2.d
+
+        ptrue p0.h
+        clastb h2, p0, h2, z3.h
+        mov z1.d, z2.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0xCDEF}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FED}, 8));
+
+  // 8 bit: pfalse expects b2 kept (z0); ptrue expects last z3.b element (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        clastb b2, p0, b2, z3.b
+        mov z0.d, z2.d
+
+        ptrue p0.b
+        clastb b2, p0, b2, z3.b
+        mov z1.d, z2.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0xEF}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1F}, 8));
+
+  EXPECT_GROUP(R"(clastb b2, p0, b2, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT);
+}
+
+TEST_P(InstSve, lastb) {
+  // 64 bit: pfalse expects last z2.d element (z0); ptrue last z3.d (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        ptrue p0.d
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        lastb d4, p0, z2.d
+        mov z0.d, z4.d
+
+        ptrue p0.d
+        lastb d5, p0, z3.d
+        mov z1.d, z5.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x0123456789ABCDEF}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FEDCBA987654321}, 8));
+
+  // 32 bit: pfalse expects last z2.s element (z0); ptrue last z3.s (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        lastb s4, p0, z2.s
+        mov z0.d, z4.d
+
+        ptrue p0.s
+        lastb s4, p0, z3.s
+        mov z1.d, z4.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x01234567}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FEDCBA9}, 8));
+
+  // 16 bit: pfalse expects last z2.h element (z0); ptrue last z3.h (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        lastb h4, p0, z2.h
+        mov z0.d, z4.d
+
+        ptrue p0.h
+        lastb h4, p0, z3.h
+        mov z1.d, z4.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x0123}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1FED}, 8));
+
+  // 8 bit: pfalse expects last z2.b element (z0); ptrue last z3.b (z1)
+  RUN_AARCH64(R"(
+        movz    x0, #0xCDEF
+        movk    x0, #0x89AB, LSL #16
+        movk x0, #0x4567, LSL #32
+        movk x0, #0x0123, LSL #48
+        movz x1, #0x4321
+        movk x1, #0x8765, LSL #16
+        movk x1, #0xCBA9, LSL #32
+        movk x1, #0x1FED, LSL #48
+
+        dup z2.d, x0
+        dup z3.d, x1
+        
+        pfalse p0.b
+        lastb b4, p0, z2.b
+        mov z0.d, z4.d
+
+        ptrue p0.b
+        lastb b4, p0, z3.b
+        mov z1.d, z4.d
+    )");
+  CHECK_NEON(0, uint64_t, fillNeon<uint64_t>({0x01}, 8));
+  CHECK_NEON(1, uint64_t, fillNeon<uint64_t>({0x1F}, 8));
+
+  EXPECT_GROUP(R"(lastb b4, p0, z3.b)", SCALAR_SIMPLE_ARTH_NOSHIFT);
+}
+
+TEST_P(InstSve, splice) {
+  // 64-bit arrangement: p0 all-true; p1 = first (VL bytes / 16) lanes
+  RUN_AARCH64(R"(
+    fmov z0.d, #1.5
+    fmov z1.d, #-0.5
+    fmov z2.d, #1.5
+
+    ptrue p0.d
+
+    mov x2, #0
+    mov x4, #16
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    whilelo p1.d, xzr, x2
+
+    splice z0.d, p0, z0.d, z1.d
+    splice z2.d, p1, z2.d, z1.d
+  )");
+  CHECK_NEON(0, double, fillNeon<double>({1.5}, VL / 8));
+  CHECK_NEON(2, double, fillNeonCombined<double>({1.5}, {-0.5}, VL / 8));
+
+  // 32-bit arrangement: p0 all-true; p1 = first (VL bytes / 8) lanes
+  RUN_AARCH64(R"(
+    fmov z0.s, #1.5
+    fmov z1.s, #-0.5
+    fmov z2.s, #1.5
+
+    ptrue p0.s
+
+    mov x2, #0
+    mov x4, #8
+    addvl x2, x2, #1
+    udiv x2, x2, x4
+    whilelo p1.s, xzr, x2
+
+    splice z0.s, p0, z0.s, z1.s
+    splice z2.s, p1, z2.s, z1.s
+  )");
+  CHECK_NEON(0, float, fillNeon<float>({1.5}, VL / 8));
+  CHECK_NEON(2, float, fillNeonCombined<float>({1.5}, {-0.5}, VL / 8));
+
+  EXPECT_GROUP(R"(splice z2.s, p1, z2.s, z1.s)", SVE_SIMPLE_ARTH_NOSHIFT);
+}
+
 TEST_P(InstSve, st1b) {
   initialHeapData_.resize(VL / 4);
   uint8_t* heap8 = reinterpret_cast<uint8_t*>(initialHeapData_.data());
@@ -7704,6 +8997,202 @@ TEST_P(InstSve, whilelo) {
   EXPECT_EQ(getNZCV(), 0b0110);
 }
 
+TEST_P(InstSve, whilels) {
+  // 8-bit arrangement, 64-bit source operands; addvl #1 yields VL in bytes
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+
+    whilels p0.b, xzr, x0
+  )");
+  CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    udiv x2, x0, x1
+
+    whilels p1.b, x2, x0
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    udiv x2, x0, x1
+    mov x3, #4
+    udiv x4, x0, x3
+    add x5, x4, x2
+
+    whilels p2.b, x5, x0
+  )");
+  CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    sub x0, x0, #1
+    mov x1, #0
+
+    whilels p3.b, x1, x0
+  )");
+  CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 1));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 16-bit arrangement, 64-bit source operands; VL bytes / 2 = lane count
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    udiv x2, x0, x1
+
+    whilels p0.h, xzr, x2
+  )");
+  CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    udiv x0, x0, x1
+    udiv x2, x0, x1
+
+    whilels p1.h, x2, x0
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #4
+    udiv x2, x0, x1
+    mov x3, #8
+    udiv x4, x0, x3
+    mov x5, #2
+    udiv x0, x0, x5
+    add x6, x4, x2
+
+    whilels p2.h, x6, x0
+  )");
+  CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    udiv x0, x0, x1
+    sub x0, x0, #1
+
+    whilels p3.h, xzr, x0
+  )");
+  CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 2));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 32-bit arrangement, 64-bit source operands; VL bytes / 4 = lane count
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #4
+    udiv x2, x0, x1
+
+    whilels p0.s, xzr, x2
+  )");
+  CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    mov x2, #4
+    udiv x0, x0, x2
+    udiv x3, x0, x1
+
+    whilels p1.s, x3, x0
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #8
+    udiv x2, x0, x1
+    mov x3, #16
+    udiv x4, x0, x3
+    mov x5, #4
+    udiv x0, x0, x5
+    add x6, x4, x2
+
+    whilels p2.s, x6, x0
+  )");
+  CHECK_PREDICATE(2, uint64_t, fillPred((VL / 32) + 1, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1010);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #4
+    udiv x0, x0, x1
+    sub x0, x0, #1
+
+    whilels p3.s, xzr, x0
+  )");
+  CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 4));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  // 64-bit arrangement, 64-bit source operands; VL bytes / 8 = lane count
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #8
+    udiv x2, x0, x1
+
+    whilels p0.d, xzr, x2
+  )");
+  CHECK_PREDICATE(0, uint64_t, fillPred(VL / 8, {1}, 8));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #2
+    mov x2, #8
+    udiv x0, x0, x2
+    udiv x3, x0, x1
+
+    whilels p1.d, x3, x0
+  )");
+  CHECK_PREDICATE(1, uint64_t, fillPred((VL / 16) + 1, {1}, 8));
+  if (VL == 128) {
+    EXPECT_EQ(getNZCV(), 0b1000);
+  } else {
+    EXPECT_EQ(getNZCV(), 0b1010);
+  }
+
+  RUN_AARCH64(R"(
+    mov x0, #0
+    addvl x0, x0, #1
+    mov x1, #8
+    udiv x0, x0, x1
+    sub x0, x0, #1
+
+    whilels p3.d, xzr, x0
+  )");
+  CHECK_PREDICATE(3, uint64_t, fillPred(VL / 8, {1}, 8));
+  EXPECT_EQ(getNZCV(), 0b1000);
+
+  EXPECT_GROUP(R"(whilels p3.d, xzr, x0)", PREDICATE);
+}
+
 TEST_P(InstSve, whilelt) {
   // 8-bit arrangement, 64-bit source operands
   RUN_AARCH64(R"(